diff --git a/.github/actions/allure-report-generate/action.yml b/.github/actions/allure-report-generate/action.yml index 16b6e71498..d1d09223db 100644 --- a/.github/actions/allure-report-generate/action.yml +++ b/.github/actions/allure-report-generate/action.yml @@ -7,6 +7,10 @@ inputs: type: boolean required: false default: false + aws_oicd_role_arn: + description: 'the OIDC role arn to (re-)acquire for allure report upload - if not set call must acquire OIDC role' + required: false + default: '' outputs: base-url: @@ -79,6 +83,14 @@ runs: ALLURE_VERSION: 2.27.0 ALLURE_ZIP_SHA256: b071858fb2fa542c65d8f152c5c40d26267b2dfb74df1f1608a589ecca38e777 + - name: (Re-)configure AWS credentials # necessary to upload reports to S3 after a long-running test + if: ${{ !cancelled() && (inputs.aws_oicd_role_arn != '') }} + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: eu-central-1 + role-to-assume: ${{ inputs.aws_oicd_role_arn }} + role-duration-seconds: 3600 # 1 hour should be more than enough to upload report + # Potentially we could have several running build for the same key (for example, for the main branch), so we use improvised lock for this - name: Acquire lock shell: bash -euxo pipefail {0} diff --git a/.github/actions/allure-report-store/action.yml b/.github/actions/allure-report-store/action.yml index df4a6712ac..9c376f420a 100644 --- a/.github/actions/allure-report-store/action.yml +++ b/.github/actions/allure-report-store/action.yml @@ -8,6 +8,10 @@ inputs: unique-key: description: 'string to distinguish different results in the same run' required: true + aws_oicd_role_arn: + description: 'the OIDC role arn to (re-)acquire for allure report upload - if not set call must acquire OIDC role' + required: false + default: '' runs: using: "composite" @@ -31,6 +35,14 @@ runs: env: REPORT_DIR: ${{ inputs.report-dir }} + - name: (Re-)configure AWS credentials # necessary to upload reports to S3 after a long-running test + if: ${{ !cancelled() && 
(inputs.aws_oicd_role_arn != '') }} + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: eu-central-1 + role-to-assume: ${{ inputs.aws_oicd_role_arn }} + role-duration-seconds: 3600 # 1 hour should be more than enough to upload report + - name: Upload test results shell: bash -euxo pipefail {0} run: | diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 037b9aeb1e..275f161019 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -48,6 +48,10 @@ inputs: description: 'benchmark durations JSON' required: false default: '{}' + aws_oicd_role_arn: + description: 'the OIDC role arn to (re-)acquire for allure report upload - if not set call must acquire OIDC role' + required: false + default: '' runs: using: "composite" @@ -222,6 +226,13 @@ runs: # (for example if we didn't run the test for non build-and-test workflow) skip-if-does-not-exist: true + - name: (Re-)configure AWS credentials # necessary to upload reports to S3 after a long-running test + if: ${{ !cancelled() && (inputs.aws_oicd_role_arn != '') }} + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: eu-central-1 + role-to-assume: ${{ inputs.aws_oicd_role_arn }} + role-duration-seconds: 3600 # 1 hour should be more than enough to upload report - name: Upload test results if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-store diff --git a/.github/workflows/_create-release-pr.yml b/.github/workflows/_create-release-pr.yml new file mode 100644 index 0000000000..cc6994397f --- /dev/null +++ b/.github/workflows/_create-release-pr.yml @@ -0,0 +1,79 @@ +name: Create Release PR + +on: + workflow_call: + inputs: + component-name: + description: 'Component name' + required: true + type: string + release-branch: + description: 'Release branch' + required: true + type: string + secrets: + ci-access-token: + description: 'CI access token' + required: true 
+ +defaults: + run: + shell: bash -euo pipefail {0} + +jobs: + create-storage-release-branch: + runs-on: ubuntu-22.04 + + permissions: + contents: write # for `git push` + + steps: + - uses: actions/checkout@v4 + with: + ref: main + + - name: Set variables + id: vars + env: + COMPONENT_NAME: ${{ inputs.component-name }} + RELEASE_BRANCH: ${{ inputs.release-branch }} + run: | + today=$(date +'%Y-%m-%d') + echo "title=${COMPONENT_NAME} release ${today}" | tee -a ${GITHUB_OUTPUT} + echo "rc-branch=rc/${RELEASE_BRANCH}/${today}" | tee -a ${GITHUB_OUTPUT} + + - name: Configure git + run: | + git config user.name "github-actions[bot]" + git config user.email "41898282+github-actions[bot]@users.noreply.github.com" + + - name: Create RC branch + env: + RC_BRANCH: ${{ steps.vars.outputs.rc-branch }} + TITLE: ${{ steps.vars.outputs.title }} + run: | + git checkout -b "${RC_BRANCH}" + + # create an empty commit to distinguish workflow runs + # from other possible releases from the same commit + git commit --allow-empty -m "${TITLE}" + + git push origin "${RC_BRANCH}" + + - name: Create a PR into ${{ inputs.release-branch }} + env: + GH_TOKEN: ${{ secrets.ci-access-token }} + RC_BRANCH: ${{ steps.vars.outputs.rc-branch }} + RELEASE_BRANCH: ${{ inputs.release-branch }} + TITLE: ${{ steps.vars.outputs.title }} + run: | + cat << EOF > body.md + ## ${TITLE} + + **Please merge this Pull Request using 'Create a merge commit' button** + EOF + + gh pr create --title "${TITLE}" \ + --body-file "body.md" \ + --head "${RC_BRANCH}" \ + --base "${RELEASE_BRANCH}" diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 69b8bc5d70..2ad1ee0a42 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -122,6 +122,7 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} pg_version: ${{ env.DEFAULT_PG_VERSION }} + aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} # Set --sparse-ordering option of 
pytest-order plugin # to ensure tests are running in order of appears in the file. # It's important for test_perf_pgbench.py::test_pgbench_remote_* tests @@ -133,6 +134,7 @@ jobs: --ignore test_runner/performance/test_perf_pgvector_queries.py --ignore test_runner/performance/test_logical_replication.py --ignore test_runner/performance/test_physical_replication.py + --ignore test_runner/performance/test_perf_ingest_using_pgcopydb.py env: BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -149,12 +151,14 @@ jobs: id: create-allure-report if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate + with: + aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} uses: slackapi/slack-github-action@v1 with: - channel-id: "C033QLM5P7D" # dev-staging-stream + channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream slack-message: | Periodic perf testing: ${{ job.status }} <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> @@ -210,6 +214,7 @@ jobs: save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 5400 pg_version: ${{ env.DEFAULT_PG_VERSION }} + aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -226,6 +231,7 @@ jobs: save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 5400 pg_version: ${{ env.DEFAULT_PG_VERSION }} + aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -237,11 +243,13 @@ jobs: uses: ./.github/actions/allure-report-generate with: store-test-results-into-db: true + aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN 
}} env: REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} + # Post both success and failure to the Slack channel - name: Post to a Slack channel - if: ${{ github.event.schedule && failure() }} + if: ${{ github.event.schedule }} uses: slackapi/slack-github-action@v1 with: channel-id: "C06T9AMNDQQ" # on-call-compute-staging-stream @@ -444,6 +452,7 @@ jobs: save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_init pg_version: ${{ env.DEFAULT_PG_VERSION }} + aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -458,6 +467,7 @@ jobs: save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_simple_update pg_version: ${{ env.DEFAULT_PG_VERSION }} + aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -472,6 +482,7 @@ jobs: save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_select_only pg_version: ${{ env.DEFAULT_PG_VERSION }} + aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -488,12 +499,14 @@ jobs: id: create-allure-report if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate + with: + aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} uses: slackapi/slack-github-action@v1 with: - channel-id: "C033QLM5P7D" # dev-staging-stream + channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream slack-message: | Periodic perf testing on ${{ matrix.platform }}: ${{ job.status }} <${{ 
github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> @@ -545,12 +558,12 @@ jobs: arch=$(uname -m | sed 's/x86_64/amd64/g' | sed 's/aarch64/arm64/g') cd /home/nonroot - wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-17/libpq5_17.0-1.pgdg110+1_${arch}.deb" - wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.4-1.pgdg110+2_${arch}.deb" - wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.4-1.pgdg110+2_${arch}.deb" - dpkg -x libpq5_17.0-1.pgdg110+1_${arch}.deb pg - dpkg -x postgresql-16_16.4-1.pgdg110+2_${arch}.deb pg - dpkg -x postgresql-client-16_16.4-1.pgdg110+2_${arch}.deb pg + wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-17/libpq5_17.2-1.pgdg110+1_${arch}.deb" + wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.6-1.pgdg110+1_${arch}.deb" + wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.6-1.pgdg110+1_${arch}.deb" + dpkg -x libpq5_17.2-1.pgdg110+1_${arch}.deb pg + dpkg -x postgresql-16_16.6-1.pgdg110+1_${arch}.deb pg + dpkg -x postgresql-client-16_16.6-1.pgdg110+1_${arch}.deb pg mkdir -p /tmp/neon/pg_install/v16/bin ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/pgbench /tmp/neon/pg_install/v16/bin/pgbench @@ -598,6 +611,7 @@ jobs: save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgvector_indexing pg_version: ${{ env.DEFAULT_PG_VERSION }} + aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -612,6 +626,7 @@ jobs: save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 pg_version: ${{ env.DEFAULT_PG_VERSION }} + aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} 
env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -621,12 +636,14 @@ jobs: id: create-allure-report if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate + with: + aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} uses: slackapi/slack-github-action@v1 with: - channel-id: "C033QLM5P7D" # dev-staging-stream + channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream slack-message: | Periodic perf testing on ${{ env.PLATFORM }}: ${{ job.status }} <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> @@ -722,6 +739,7 @@ jobs: save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 43200 -k test_clickbench pg_version: ${{ env.DEFAULT_PG_VERSION }} + aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -734,12 +752,14 @@ jobs: id: create-allure-report if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate + with: + aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} uses: slackapi/slack-github-action@v1 with: - channel-id: "C033QLM5P7D" # dev-staging-stream + channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream slack-message: | Periodic OLAP perf testing on ${{ matrix.platform }}: ${{ job.status }} <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> @@ -836,6 +856,7 @@ jobs: save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_tpch pg_version: ${{ env.DEFAULT_PG_VERSION }} + aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" 
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -846,12 +867,14 @@ jobs: id: create-allure-report if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate + with: + aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} uses: slackapi/slack-github-action@v1 with: - channel-id: "C033QLM5P7D" # dev-staging-stream + channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream slack-message: | Periodic TPC-H perf testing on ${{ matrix.platform }}: ${{ job.status }} <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> @@ -934,6 +957,7 @@ jobs: save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_user_examples pg_version: ${{ env.DEFAULT_PG_VERSION }} + aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -943,12 +967,14 @@ jobs: id: create-allure-report if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate + with: + aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} uses: slackapi/slack-github-action@v1 with: - channel-id: "C033QLM5P7D" # dev-staging-stream + channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream slack-message: | Periodic TPC-H perf testing on ${{ matrix.platform }}: ${{ job.status }} <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> diff --git a/.github/workflows/build-build-tools-image.yml b/.github/workflows/build-build-tools-image.yml index 82b065c524..9e7be76901 100644 --- a/.github/workflows/build-build-tools-image.yml +++ b/.github/workflows/build-build-tools-image.yml @@ -2,18 +2,13 @@ name: Build build-tools image on: workflow_call: - inputs: - image-tag: - description: 
"build-tools image tag" - required: true - type: string outputs: image-tag: description: "build-tools tag" - value: ${{ inputs.image-tag }} + value: ${{ jobs.check-image.outputs.tag }} image: description: "build-tools image" - value: neondatabase/build-tools:${{ inputs.image-tag }} + value: neondatabase/build-tools:${{ jobs.check-image.outputs.tag }} defaults: run: @@ -35,7 +30,36 @@ permissions: {} jobs: check-image: - uses: ./.github/workflows/check-build-tools-image.yml + runs-on: ubuntu-22.04 + outputs: + tag: ${{ steps.get-build-tools-tag.outputs.image-tag }} + found: ${{ steps.check-image.outputs.found }} + + steps: + - uses: actions/checkout@v4 + + - name: Get build-tools image tag for the current commit + id: get-build-tools-tag + env: + IMAGE_TAG: | + ${{ hashFiles('build-tools.Dockerfile', + '.github/workflows/build-build-tools-image.yml') }} + run: | + echo "image-tag=${IMAGE_TAG}" | tee -a $GITHUB_OUTPUT + + - name: Check if such tag found in the registry + id: check-image + env: + IMAGE_TAG: ${{ steps.get-build-tools-tag.outputs.image-tag }} + run: | + if docker manifest inspect neondatabase/build-tools:${IMAGE_TAG}; then + found=true + else + found=false + fi + + echo "found=${found}" | tee -a $GITHUB_OUTPUT + build-image: needs: [ check-image ] @@ -48,20 +72,7 @@ jobs: runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} - env: - IMAGE_TAG: ${{ inputs.image-tag }} - steps: - - name: Check `input.tag` is correct - env: - INPUTS_IMAGE_TAG: ${{ inputs.image-tag }} - CHECK_IMAGE_TAG : ${{ needs.check-image.outputs.image-tag }} - run: | - if [ "${INPUTS_IMAGE_TAG}" != "${CHECK_IMAGE_TAG}" ]; then - echo "'inputs.image-tag' (${INPUTS_IMAGE_TAG}) does not match the tag of the latest build-tools image 'inputs.image-tag' (${CHECK_IMAGE_TAG})" - exit 1 - fi - - uses: actions/checkout@v4 - uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193 @@ -92,10 +103,10 @@ 
jobs: cache-from: type=registry,ref=cache.neon.build/build-tools:cache-${{ matrix.debian-version }}-${{ matrix.arch }} cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/build-tools:cache-{0}-{1},mode=max', matrix.debian-version, matrix.arch) || '' }} tags: | - neondatabase/build-tools:${{ inputs.image-tag }}-${{ matrix.debian-version }}-${{ matrix.arch }} + neondatabase/build-tools:${{ needs.check-image.outputs.tag }}-${{ matrix.debian-version }}-${{ matrix.arch }} merge-images: - needs: [ build-image ] + needs: [ check-image, build-image ] runs-on: ubuntu-22.04 steps: @@ -107,7 +118,7 @@ jobs: - name: Create multi-arch image env: DEFAULT_DEBIAN_VERSION: bullseye - IMAGE_TAG: ${{ inputs.image-tag }} + IMAGE_TAG: ${{ needs.check-image.outputs.tag }} run: | for debian_version in bullseye bookworm; do tags=("-t" "neondatabase/build-tools:${IMAGE_TAG}-${debian_version}") diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index cc6f91d28e..89fd2d0d17 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -77,15 +77,9 @@ jobs: shell: bash id: build-tag - check-build-tools-image: - needs: [ check-permissions ] - uses: ./.github/workflows/check-build-tools-image.yml - build-build-tools-image: - needs: [ check-build-tools-image ] + needs: [ check-permissions ] uses: ./.github/workflows/build-build-tools-image.yml - with: - image-tag: ${{ needs.check-build-tools-image.outputs.image-tag }} secrets: inherit check-codestyle-python: diff --git a/.github/workflows/check-build-tools-image.yml b/.github/workflows/check-build-tools-image.yml deleted file mode 100644 index a7a15ad58b..0000000000 --- a/.github/workflows/check-build-tools-image.yml +++ /dev/null @@ -1,51 +0,0 @@ -name: Check build-tools image - -on: - workflow_call: - outputs: - image-tag: - description: "build-tools image tag" - value: ${{ jobs.check-image.outputs.tag }} - found: - description: "Whether 
the image is found in the registry" - value: ${{ jobs.check-image.outputs.found }} - -defaults: - run: - shell: bash -euo pipefail {0} - -# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. -permissions: {} - -jobs: - check-image: - runs-on: ubuntu-22.04 - outputs: - tag: ${{ steps.get-build-tools-tag.outputs.image-tag }} - found: ${{ steps.check-image.outputs.found }} - - steps: - - uses: actions/checkout@v4 - - - name: Get build-tools image tag for the current commit - id: get-build-tools-tag - env: - IMAGE_TAG: | - ${{ hashFiles('build-tools.Dockerfile', - '.github/workflows/check-build-tools-image.yml', - '.github/workflows/build-build-tools-image.yml') }} - run: | - echo "image-tag=${IMAGE_TAG}" | tee -a $GITHUB_OUTPUT - - - name: Check if such tag found in the registry - id: check-image - env: - IMAGE_TAG: ${{ steps.get-build-tools-tag.outputs.image-tag }} - run: | - if docker manifest inspect neondatabase/build-tools:${IMAGE_TAG}; then - found=true - else - found=false - fi - - echo "found=${found}" | tee -a $GITHUB_OUTPUT diff --git a/.github/workflows/ingest_benchmark.yml b/.github/workflows/ingest_benchmark.yml index d770bb2bb5..1033dc6489 100644 --- a/.github/workflows/ingest_benchmark.yml +++ b/.github/workflows/ingest_benchmark.yml @@ -1,4 +1,4 @@ -name: Benchmarking +name: benchmarking ingest on: # uncomment to run on push for debugging your PR @@ -74,18 +74,16 @@ jobs: compute_units: '[7, 7]' # we want to test large compute here to avoid compute-side bottleneck api_key: ${{ secrets.NEON_STAGING_API_KEY }} - - name: Initialize Neon project and retrieve current backpressure seconds + - name: Initialize Neon project if: ${{ matrix.target_project == 'new_empty_project' }} env: - NEW_PROJECT_CONNSTR: ${{ steps.create-neon-project-ingest-target.outputs.dsn }} + BENCHMARK_INGEST_TARGET_CONNSTR: ${{ steps.create-neon-project-ingest-target.outputs.dsn }} NEW_PROJECT_ID: ${{ 
steps.create-neon-project-ingest-target.outputs.project_id }} run: | echo "Initializing Neon project with project_id: ${NEW_PROJECT_ID}" export LD_LIBRARY_PATH=${PG_16_LIB_PATH} - ${PSQL} "${NEW_PROJECT_CONNSTR}" -c "CREATE EXTENSION IF NOT EXISTS neon; CREATE EXTENSION IF NOT EXISTS neon_utils;" - BACKPRESSURE_TIME_BEFORE_INGEST=$(${PSQL} "${NEW_PROJECT_CONNSTR}" -t -c "select backpressure_throttling_time()/1000000;") - echo "BACKPRESSURE_TIME_BEFORE_INGEST=${BACKPRESSURE_TIME_BEFORE_INGEST}" >> $GITHUB_ENV - echo "NEW_PROJECT_CONNSTR=${NEW_PROJECT_CONNSTR}" >> $GITHUB_ENV + ${PSQL} "${BENCHMARK_INGEST_TARGET_CONNSTR}" -c "CREATE EXTENSION IF NOT EXISTS neon; CREATE EXTENSION IF NOT EXISTS neon_utils;" + echo "BENCHMARK_INGEST_TARGET_CONNSTR=${BENCHMARK_INGEST_TARGET_CONNSTR}" >> $GITHUB_ENV - name: Create Neon Branch for large tenant if: ${{ matrix.target_project == 'large_existing_project' }} @@ -95,266 +93,55 @@ jobs: project_id: ${{ vars.BENCHMARK_INGEST_TARGET_PROJECTID }} api_key: ${{ secrets.NEON_STAGING_API_KEY }} - - name: Initialize Neon project and retrieve current backpressure seconds + - name: Initialize Neon project if: ${{ matrix.target_project == 'large_existing_project' }} env: - NEW_PROJECT_CONNSTR: ${{ steps.create-neon-branch-ingest-target.outputs.dsn }} + BENCHMARK_INGEST_TARGET_CONNSTR: ${{ steps.create-neon-branch-ingest-target.outputs.dsn }} NEW_BRANCH_ID: ${{ steps.create-neon-branch-ingest-target.outputs.branch_id }} run: | echo "Initializing Neon branch with branch_id: ${NEW_BRANCH_ID}" export LD_LIBRARY_PATH=${PG_16_LIB_PATH} # Extract the part before the database name - base_connstr="${NEW_PROJECT_CONNSTR%/*}" + base_connstr="${BENCHMARK_INGEST_TARGET_CONNSTR%/*}" # Extract the query parameters (if any) after the database name - query_params="${NEW_PROJECT_CONNSTR#*\?}" + query_params="${BENCHMARK_INGEST_TARGET_CONNSTR#*\?}" # Reconstruct the new connection string - if [ "$query_params" != "$NEW_PROJECT_CONNSTR" ]; then + if [ 
"$query_params" != "$BENCHMARK_INGEST_TARGET_CONNSTR" ]; then new_connstr="${base_connstr}/neondb?${query_params}" else new_connstr="${base_connstr}/neondb" fi ${PSQL} "${new_connstr}" -c "drop database ludicrous;" ${PSQL} "${new_connstr}" -c "CREATE DATABASE ludicrous;" - if [ "$query_params" != "$NEW_PROJECT_CONNSTR" ]; then - NEW_PROJECT_CONNSTR="${base_connstr}/ludicrous?${query_params}" + if [ "$query_params" != "$BENCHMARK_INGEST_TARGET_CONNSTR" ]; then + BENCHMARK_INGEST_TARGET_CONNSTR="${base_connstr}/ludicrous?${query_params}" else - NEW_PROJECT_CONNSTR="${base_connstr}/ludicrous" + BENCHMARK_INGEST_TARGET_CONNSTR="${base_connstr}/ludicrous" fi - ${PSQL} "${NEW_PROJECT_CONNSTR}" -c "CREATE EXTENSION IF NOT EXISTS neon; CREATE EXTENSION IF NOT EXISTS neon_utils;" - BACKPRESSURE_TIME_BEFORE_INGEST=$(${PSQL} "${NEW_PROJECT_CONNSTR}" -t -c "select backpressure_throttling_time()/1000000;") - echo "BACKPRESSURE_TIME_BEFORE_INGEST=${BACKPRESSURE_TIME_BEFORE_INGEST}" >> $GITHUB_ENV - echo "NEW_PROJECT_CONNSTR=${NEW_PROJECT_CONNSTR}" >> $GITHUB_ENV - - - - name: Create pgcopydb filter file - run: | - cat << EOF > /tmp/pgcopydb_filter.txt - [include-only-table] - public.events - public.emails - public.email_transmissions - public.payments - public.editions - public.edition_modules - public.sp_content - public.email_broadcasts - public.user_collections - public.devices - public.user_accounts - public.lessons - public.lesson_users - public.payment_methods - public.orders - public.course_emails - public.modules - public.users - public.module_users - public.courses - public.payment_gateway_keys - public.accounts - public.roles - public.payment_gateways - public.management - public.event_names - EOF + ${PSQL} "${BENCHMARK_INGEST_TARGET_CONNSTR}" -c "CREATE EXTENSION IF NOT EXISTS neon; CREATE EXTENSION IF NOT EXISTS neon_utils;" + echo "BENCHMARK_INGEST_TARGET_CONNSTR=${BENCHMARK_INGEST_TARGET_CONNSTR}" >> $GITHUB_ENV - - name: Invoke pgcopydb + - name: Invoke pgcopydb + 
uses: ./.github/actions/run-python-test-set + with: + build_type: remote + test_selection: performance/test_perf_ingest_using_pgcopydb.py + run_in_parallel: false + extra_params: -s -m remote_cluster --timeout 86400 -k test_ingest_performance_using_pgcopydb + pg_version: v16 + save_perf_report: true + aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: - BENCHMARK_INGEST_SOURCE_CONNSTR: ${{ secrets.BENCHMARK_INGEST_SOURCE_CONNSTR }} - run: | - export LD_LIBRARY_PATH=${PGCOPYDB_LIB_PATH}:${PG_16_LIB_PATH} - export PGCOPYDB_SOURCE_PGURI="${BENCHMARK_INGEST_SOURCE_CONNSTR}" - export PGCOPYDB_TARGET_PGURI="${NEW_PROJECT_CONNSTR}" - export PGOPTIONS="-c maintenance_work_mem=8388608 -c max_parallel_maintenance_workers=7" - ${PG_CONFIG} --bindir - ${PGCOPYDB} --version - ${PGCOPYDB} clone --skip-vacuum --no-owner --no-acl --skip-db-properties --table-jobs 4 \ - --index-jobs 4 --restore-jobs 4 --split-tables-larger-than 10GB --skip-extensions \ - --use-copy-binary --filters /tmp/pgcopydb_filter.txt 2>&1 | tee /tmp/pgcopydb_${{ matrix.target_project }}.log + BENCHMARK_INGEST_SOURCE_CONNSTR: ${{ secrets.BENCHMARK_INGEST_SOURCE_CONNSTR }} + TARGET_PROJECT_TYPE: ${{ matrix.target_project }} + # we report PLATFORM in zenbenchmark NeonBenchmarker perf database and want to distinguish between new project and large tenant + PLATFORM: "${{ matrix.target_project }}-us-east-2-staging" + PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" - # create dummy pgcopydb log to test parsing - # - name: create dummy log for parser test - # run: | - # cat << EOF > /tmp/pgcopydb_${{ matrix.target_project }}.log - # 2024-11-04 18:00:53.433 500861 INFO main.c:136 Running pgcopydb version 0.17.10.g8361a93 from "/usr/lib/postgresql/17/bin/pgcopydb" - # 2024-11-04 18:00:53.434 500861 INFO cli_common.c:1225 [SOURCE] Copying database from 
"postgres://neondb_owner@ep-bitter-shape-w2c1ir0a.us-east-2.aws.neon.build/neondb?sslmode=require&keepalives=1&keepalives_idle=10&keepalives_interval=10&keepalives_count=60" - # 2024-11-04 18:00:53.434 500861 INFO cli_common.c:1226 [TARGET] Copying database into "postgres://neondb_owner@ep-icy-union-w25qd5pj.us-east-2.aws.neon.build/ludicrous?sslmode=require&keepalives=1&keepalives_idle=10&keepalives_interval=10&keepalives_count=60" - # 2024-11-04 18:00:53.442 500861 INFO copydb.c:105 Using work dir "/tmp/pgcopydb" - # 2024-11-04 18:00:53.541 500861 INFO snapshot.c:107 Exported snapshot "00000008-00000033-1" from the source database - # 2024-11-04 18:00:53.556 500865 INFO cli_clone_follow.c:543 STEP 1: fetch source database tables, indexes, and sequences - # 2024-11-04 18:00:54.570 500865 INFO copydb_schema.c:716 Splitting source candidate tables larger than 10 GB - # 2024-11-04 18:00:54.570 500865 INFO copydb_schema.c:829 Table public.events is 96 GB large which is larger than --split-tables-larger-than 10 GB, and does not have a unique column of type integer: splitting by CTID - # 2024-11-04 18:01:05.538 500865 INFO copydb_schema.c:905 Table public.events is 96 GB large, 10 COPY processes will be used, partitioning on ctid. - # 2024-11-04 18:01:05.564 500865 INFO copydb_schema.c:905 Table public.email_transmissions is 27 GB large, 4 COPY processes will be used, partitioning on id. - # 2024-11-04 18:01:05.584 500865 INFO copydb_schema.c:905 Table public.lessons is 25 GB large, 4 COPY processes will be used, partitioning on id. - # 2024-11-04 18:01:05.605 500865 INFO copydb_schema.c:905 Table public.lesson_users is 16 GB large, 3 COPY processes will be used, partitioning on id. 
- # 2024-11-04 18:01:05.605 500865 INFO copydb_schema.c:761 Fetched information for 26 tables (including 4 tables split in 21 partitions total), with an estimated total of 907 million tuples and 175 GB on-disk - # 2024-11-04 18:01:05.687 500865 INFO copydb_schema.c:968 Fetched information for 57 indexes (supporting 25 constraints) - # 2024-11-04 18:01:05.753 500865 INFO sequences.c:78 Fetching information for 24 sequences - # 2024-11-04 18:01:05.903 500865 INFO copydb_schema.c:1122 Fetched information for 4 extensions - # 2024-11-04 18:01:06.178 500865 INFO copydb_schema.c:1538 Found 0 indexes (supporting 0 constraints) in the target database - # 2024-11-04 18:01:06.184 500865 INFO cli_clone_follow.c:584 STEP 2: dump the source database schema (pre/post data) - # 2024-11-04 18:01:06.186 500865 INFO pgcmd.c:468 /usr/lib/postgresql/16/bin/pg_dump -Fc --snapshot 00000008-00000033-1 --section=pre-data --section=post-data --file /tmp/pgcopydb/schema/schema.dump 'postgres://neondb_owner@ep-bitter-shape-w2c1ir0a.us-east-2.aws.neon.build/neondb?sslmode=require&keepalives=1&keepalives_idle=10&keepalives_interval=10&keepalives_count=60' - # 2024-11-04 18:01:06.952 500865 INFO cli_clone_follow.c:592 STEP 3: restore the pre-data section to the target database - # 2024-11-04 18:01:07.004 500865 INFO pgcmd.c:1001 /usr/lib/postgresql/16/bin/pg_restore --dbname 'postgres://neondb_owner@ep-icy-union-w25qd5pj.us-east-2.aws.neon.build/ludicrous?sslmode=require&keepalives=1&keepalives_idle=10&keepalives_interval=10&keepalives_count=60' --section pre-data --jobs 4 --no-owner --no-acl --use-list /tmp/pgcopydb/schema/pre-filtered.list /tmp/pgcopydb/schema/schema.dump - # 2024-11-04 18:01:07.438 500874 INFO table-data.c:656 STEP 4: starting 4 table-data COPY processes - # 2024-11-04 18:01:07.451 500877 INFO vacuum.c:139 STEP 8: skipping VACUUM jobs per --skip-vacuum - # 2024-11-04 18:01:07.457 500875 INFO indexes.c:182 STEP 6: starting 4 CREATE INDEX processes - # 2024-11-04 18:01:07.457 
500875 INFO indexes.c:183 STEP 7: constraints are built by the CREATE INDEX processes - # 2024-11-04 18:01:07.507 500865 INFO blobs.c:74 Skipping large objects: none found. - # 2024-11-04 18:01:07.509 500865 INFO sequences.c:194 STEP 9: reset sequences values - # 2024-11-04 18:01:07.510 500886 INFO sequences.c:290 Set sequences values on the target database - # 2024-11-04 20:49:00.587 500865 INFO cli_clone_follow.c:608 STEP 10: restore the post-data section to the target database - # 2024-11-04 20:49:00.600 500865 INFO pgcmd.c:1001 /usr/lib/postgresql/16/bin/pg_restore --dbname 'postgres://neondb_owner@ep-icy-union-w25qd5pj.us-east-2.aws.neon.build/ludicrous?sslmode=require&keepalives=1&keepalives_idle=10&keepalives_interval=10&keepalives_count=60' --section post-data --jobs 4 --no-owner --no-acl --use-list /tmp/pgcopydb/schema/post-filtered.list /tmp/pgcopydb/schema/schema.dump - # 2024-11-05 10:50:58.508 500865 INFO cli_clone_follow.c:639 All step are now done, 16h49m elapsed - # 2024-11-05 10:50:58.508 500865 INFO summary.c:3155 Printing summary for 26 tables and 57 indexes - - # OID | Schema | Name | Parts | copy duration | transmitted bytes | indexes | create index duration - # ------+--------+----------------------+-------+---------------+-------------------+---------+---------------------- - # 24654 | public | events | 10 | 1d11h | 878 GB | 1 | 1h41m - # 24623 | public | email_transmissions | 4 | 4h46m | 99 GB | 3 | 2h04m - # 24665 | public | lessons | 4 | 4h42m | 161 GB | 4 | 1m11s - # 24661 | public | lesson_users | 3 | 2h46m | 49 GB | 3 | 39m35s - # 24631 | public | emails | 1 | 34m07s | 10 GB | 2 | 17s - # 24739 | public | payments | 1 | 5m47s | 1848 MB | 4 | 4m40s - # 24681 | public | module_users | 1 | 4m57s | 1610 MB | 3 | 1m50s - # 24694 | public | orders | 1 | 2m50s | 835 MB | 3 | 1m05s - # 24597 | public | devices | 1 | 1m45s | 498 MB | 2 | 40s - # 24723 | public | payment_methods | 1 | 1m24s | 548 MB | 2 | 31s - # 24765 | public | user_collections 
| 1 | 2m17s | 1005 MB | 2 | 968ms - # 24774 | public | users | 1 | 52s | 291 MB | 4 | 27s - # 24760 | public | user_accounts | 1 | 16s | 172 MB | 3 | 16s - # 24606 | public | edition_modules | 1 | 8s983 | 46 MB | 3 | 4s749 - # 24583 | public | course_emails | 1 | 8s526 | 26 MB | 2 | 996ms - # 24685 | public | modules | 1 | 1s592 | 21 MB | 3 | 1s696 - # 24610 | public | editions | 1 | 2s199 | 7483 kB | 2 | 1s032 - # 24755 | public | sp_content | 1 | 1s555 | 4177 kB | 0 | 0ms - # 24619 | public | email_broadcasts | 1 | 744ms | 2645 kB | 2 | 677ms - # 24590 | public | courses | 1 | 387ms | 1540 kB | 2 | 367ms - # 24704 | public | payment_gateway_keys | 1 | 1s972 | 164 kB | 2 | 27ms - # 24576 | public | accounts | 1 | 58ms | 24 kB | 1 | 14ms - # 24647 | public | event_names | 1 | 32ms | 397 B | 1 | 8ms - # 24716 | public | payment_gateways | 1 | 1s675 | 117 B | 1 | 11ms - # 24748 | public | roles | 1 | 71ms | 173 B | 1 | 8ms - # 24676 | public | management | 1 | 33ms | 40 B | 1 | 19ms - - - # Step Connection Duration Transfer Concurrency - # -------------------------------------------------- ---------- ---------- ---------- ------------ - # Catalog Queries (table ordering, filtering, etc) source 12s 1 - # Dump Schema source 765ms 1 - # Prepare Schema target 466ms 1 - # COPY, INDEX, CONSTRAINTS, VACUUM (wall clock) both 2h47m 12 - # COPY (cumulative) both 7h46m 1225 GB 4 - # CREATE INDEX (cumulative) target 4h36m 4 - # CONSTRAINTS (cumulative) target 8s493 4 - # VACUUM (cumulative) target 0ms 4 - # Reset Sequences both 60ms 1 - # Large Objects (cumulative) (null) 0ms 0 - # Finalize Schema both 14h01m 4 - # -------------------------------------------------- ---------- ---------- ---------- ------------ - # Total Wall Clock Duration both 16h49m 20 - - - # EOF - - - - name: show tables sizes and retrieve current backpressure seconds + - name: show tables sizes after ingest run: | export LD_LIBRARY_PATH=${PG_16_LIB_PATH} - ${PSQL} "${NEW_PROJECT_CONNSTR}" -c "\dt+" - 
BACKPRESSURE_TIME_AFTER_INGEST=$(${PSQL} "${NEW_PROJECT_CONNSTR}" -t -c "select backpressure_throttling_time()/1000000;") - echo "BACKPRESSURE_TIME_AFTER_INGEST=${BACKPRESSURE_TIME_AFTER_INGEST}" >> $GITHUB_ENV - - - name: Parse pgcopydb log and report performance metrics - env: - PERF_TEST_RESULT_CONNSTR: ${{ secrets.PERF_TEST_RESULT_CONNSTR }} - run: | - export LD_LIBRARY_PATH=${PG_16_LIB_PATH} - - # Define the log file path - LOG_FILE="/tmp/pgcopydb_${{ matrix.target_project }}.log" - - # Get the current git commit hash - git config --global --add safe.directory /__w/neon/neon - COMMIT_HASH=$(git rev-parse --short HEAD) - - # Define the platform and test suite - PLATFORM="pg16-${{ matrix.target_project }}-us-east-2-staging" - SUIT="pgcopydb_ingest_bench" - - # Function to convert time (e.g., "2h47m", "4h36m", "118ms", "8s493") to seconds - convert_to_seconds() { - local duration=$1 - local total_seconds=0 - - # Check for hours (h) - if [[ "$duration" =~ ([0-9]+)h ]]; then - total_seconds=$((total_seconds + ${BASH_REMATCH[1]#0} * 3600)) - fi - - # Check for seconds (s) - if [[ "$duration" =~ ([0-9]+)s ]]; then - total_seconds=$((total_seconds + ${BASH_REMATCH[1]#0})) - fi - - # Check for milliseconds (ms) (if applicable) - if [[ "$duration" =~ ([0-9]+)ms ]]; then - total_seconds=$((total_seconds + ${BASH_REMATCH[1]#0} / 1000)) - duration=${duration/${BASH_REMATCH[0]}/} # need to remove it to avoid double counting with m - fi - - # Check for minutes (m) - must be checked after ms because m is contained in ms - if [[ "$duration" =~ ([0-9]+)m ]]; then - total_seconds=$((total_seconds + ${BASH_REMATCH[1]#0} * 60)) - fi - - echo $total_seconds - } - - # Calculate the backpressure difference in seconds - BACKPRESSURE_TIME_DIFF=$(awk "BEGIN {print $BACKPRESSURE_TIME_AFTER_INGEST - $BACKPRESSURE_TIME_BEFORE_INGEST}") - - # Insert the backpressure time difference into the performance database - if [ -n "$BACKPRESSURE_TIME_DIFF" ]; then - PSQL_CMD="${PSQL} 
\"${PERF_TEST_RESULT_CONNSTR}\" -c \" - INSERT INTO public.perf_test_results (suit, revision, platform, metric_name, metric_value, metric_unit, metric_report_type, recorded_at_timestamp) - VALUES ('${SUIT}', '${COMMIT_HASH}', '${PLATFORM}', 'backpressure_time', ${BACKPRESSURE_TIME_DIFF}, 'seconds', 'lower_is_better', now()); - \"" - echo "Inserting backpressure time difference: ${BACKPRESSURE_TIME_DIFF} seconds" - eval $PSQL_CMD - fi - - # Extract and process log lines - while IFS= read -r line; do - METRIC_NAME="" - # Match each desired line and extract the relevant information - if [[ "$line" =~ COPY,\ INDEX,\ CONSTRAINTS,\ VACUUM.* ]]; then - METRIC_NAME="COPY, INDEX, CONSTRAINTS, VACUUM (wall clock)" - elif [[ "$line" =~ COPY\ \(cumulative\).* ]]; then - METRIC_NAME="COPY (cumulative)" - elif [[ "$line" =~ CREATE\ INDEX\ \(cumulative\).* ]]; then - METRIC_NAME="CREATE INDEX (cumulative)" - elif [[ "$line" =~ CONSTRAINTS\ \(cumulative\).* ]]; then - METRIC_NAME="CONSTRAINTS (cumulative)" - elif [[ "$line" =~ Finalize\ Schema.* ]]; then - METRIC_NAME="Finalize Schema" - elif [[ "$line" =~ Total\ Wall\ Clock\ Duration.* ]]; then - METRIC_NAME="Total Wall Clock Duration" - fi - - # If a metric was matched, insert it into the performance database - if [ -n "$METRIC_NAME" ]; then - DURATION=$(echo "$line" | grep -oP '\d+h\d+m|\d+s|\d+ms|\d{1,2}h\d{1,2}m|\d+\.\d+s' | head -n 1) - METRIC_VALUE=$(convert_to_seconds "$DURATION") - PSQL_CMD="${PSQL} \"${PERF_TEST_RESULT_CONNSTR}\" -c \" - INSERT INTO public.perf_test_results (suit, revision, platform, metric_name, metric_value, metric_unit, metric_report_type, recorded_at_timestamp) - VALUES ('${SUIT}', '${COMMIT_HASH}', '${PLATFORM}', '${METRIC_NAME}', ${METRIC_VALUE}, 'seconds', 'lower_is_better', now()); - \"" - echo "Inserting ${METRIC_NAME} with value ${METRIC_VALUE} seconds" - eval $PSQL_CMD - fi - done < "$LOG_FILE" + ${PSQL} "${BENCHMARK_INGEST_TARGET_CONNSTR}" -c "\dt+" - name: Delete Neon Project if: ${{ 
always() && matrix.target_project == 'new_empty_project' }} diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index cd5a665402..092831adb9 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -26,15 +26,9 @@ jobs: with: github-event-name: ${{ github.event_name}} - check-build-tools-image: - needs: [ check-permissions ] - uses: ./.github/workflows/check-build-tools-image.yml - build-build-tools-image: - needs: [ check-build-tools-image ] + needs: [ check-permissions ] uses: ./.github/workflows/build-build-tools-image.yml - with: - image-tag: ${{ needs.check-build-tools-image.outputs.image-tag }} secrets: inherit check-macos-build: @@ -44,7 +38,7 @@ jobs: contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') || github.ref_name == 'main' timeout-minutes: 90 - runs-on: macos-14 + runs-on: macos-15 env: # Use release build only, to have less debug info around @@ -58,7 +52,7 @@ jobs: submodules: true - name: Install macOS postgres dependencies - run: brew install flex bison openssl protobuf icu4c pkg-config + run: brew install flex bison openssl protobuf icu4c - name: Set pg 14 revision for caching id: pg_v14_rev diff --git a/.github/workflows/periodic_pagebench.yml b/.github/workflows/periodic_pagebench.yml index 615937b5a1..1cce348ae2 100644 --- a/.github/workflows/periodic_pagebench.yml +++ b/.github/workflows/periodic_pagebench.yml @@ -72,7 +72,7 @@ jobs: echo "COMMIT_HASH=$INPUT_COMMIT_HASH" >> $GITHUB_ENV fi - - name: Start Bench with run_id + - name: Start Bench with run_id run: | curl -k -X 'POST' \ "${EC2_MACHINE_URL_US}/start_test/${GITHUB_RUN_ID}" \ @@ -116,7 +116,7 @@ jobs: -H 'accept: application/gzip' \ -H "Authorization: Bearer $API_KEY" \ --output "test_log_${GITHUB_RUN_ID}.gz" - + - name: Unzip Test Log and Print it into this job's log if: always() && steps.poll_step.outputs.too_many_runs != 'true' run: | @@ -134,13 +134,13 @@ jobs: if: ${{ 
github.event.schedule && failure() }} uses: slackapi/slack-github-action@v1 with: - channel-id: "C033QLM5P7D" # dev-staging-stream + channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream slack-message: "Periodic pagebench testing on dedicated hardware: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} - name: Cleanup Test Resources - if: always() + if: always() run: | curl -k -X 'POST' \ "${EC2_MACHINE_URL_US}/cleanup_test/${GITHUB_RUN_ID}" \ diff --git a/.github/workflows/pg-clients.yml b/.github/workflows/pg-clients.yml index df40b5beda..4f5495cbe2 100644 --- a/.github/workflows/pg-clients.yml +++ b/.github/workflows/pg-clients.yml @@ -39,15 +39,9 @@ jobs: with: github-event-name: ${{ github.event_name }} - check-build-tools-image: - needs: [ check-permissions ] - uses: ./.github/workflows/check-build-tools-image.yml - build-build-tools-image: - needs: [ check-build-tools-image ] + needs: [ check-permissions ] uses: ./.github/workflows/build-build-tools-image.yml - with: - image-tag: ${{ needs.check-build-tools-image.outputs.image-tag }} secrets: inherit test-logical-replication: diff --git a/.github/workflows/pre-merge-checks.yml b/.github/workflows/pre-merge-checks.yml index 137faa7abc..e1cec6d33d 100644 --- a/.github/workflows/pre-merge-checks.yml +++ b/.github/workflows/pre-merge-checks.yml @@ -34,16 +34,10 @@ jobs: run: | echo "${PYTHON_CHANGED_FILES}" - check-build-tools-image: + build-build-tools-image: if: needs.get-changed-files.outputs.python-changed == 'true' needs: [ get-changed-files ] - uses: ./.github/workflows/check-build-tools-image.yml - - build-build-tools-image: - needs: [ check-build-tools-image ] uses: ./.github/workflows/build-build-tools-image.yml - with: - image-tag: ${{ needs.check-build-tools-image.outputs.image-tag }} secrets: inherit check-codestyle-python: diff --git a/.github/workflows/release.yml 
b/.github/workflows/release.yml index 56ef6f4bbb..11f010b6d4 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -26,82 +26,26 @@ defaults: jobs: create-storage-release-branch: if: ${{ github.event.schedule == '0 6 * * MON' || format('{0}', inputs.create-storage-release-branch) == 'true' }} - runs-on: ubuntu-22.04 permissions: - contents: write # for `git push` + contents: write - steps: - - name: Check out code - uses: actions/checkout@v4 - with: - ref: main - - - name: Set environment variables - run: | - echo "RELEASE_DATE=$(date +'%Y-%m-%d')" | tee -a $GITHUB_ENV - echo "RELEASE_BRANCH=rc/$(date +'%Y-%m-%d')" | tee -a $GITHUB_ENV - - - name: Create release branch - run: git checkout -b $RELEASE_BRANCH - - - name: Push new branch - run: git push origin $RELEASE_BRANCH - - - name: Create pull request into release - env: - GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} - run: | - TITLE="Storage & Compute release ${RELEASE_DATE}" - - cat << EOF > body.md - ## ${TITLE} - - **Please merge this Pull Request using 'Create a merge commit' button** - EOF - - gh pr create --title "${TITLE}" \ - --body-file "body.md" \ - --head "${RELEASE_BRANCH}" \ - --base "release" + uses: ./.github/workflows/_create-release-pr.yml + with: + component-name: 'Storage & Compute' + release-branch: 'release' + secrets: + ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }} create-proxy-release-branch: if: ${{ github.event.schedule == '0 6 * * THU' || format('{0}', inputs.create-proxy-release-branch) == 'true' }} - runs-on: ubuntu-22.04 permissions: - contents: write # for `git push` + contents: write - steps: - - name: Check out code - uses: actions/checkout@v4 - with: - ref: main - - - name: Set environment variables - run: | - echo "RELEASE_DATE=$(date +'%Y-%m-%d')" | tee -a $GITHUB_ENV - echo "RELEASE_BRANCH=rc/proxy/$(date +'%Y-%m-%d')" | tee -a $GITHUB_ENV - - - name: Create release branch - run: git checkout -b $RELEASE_BRANCH - - - name: Push new branch - run: git 
push origin $RELEASE_BRANCH - - - name: Create pull request into release - env: - GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} - run: | - TITLE="Proxy release ${RELEASE_DATE}" - - cat << EOF > body.md - ## ${TITLE} - - **Please merge this Pull Request using 'Create a merge commit' button** - EOF - - gh pr create --title "${TITLE}" \ - --body-file "body.md" \ - --head "${RELEASE_BRANCH}" \ - --base "release-proxy" + uses: ./.github/workflows/_create-release-pr.yml + with: + component-name: 'Proxy' + release-branch: 'release-proxy' + secrets: + ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }} diff --git a/.github/workflows/report-workflow-stats-batch.yml b/.github/workflows/report-workflow-stats-batch.yml index 98e394a3c2..2ed044b780 100644 --- a/.github/workflows/report-workflow-stats-batch.yml +++ b/.github/workflows/report-workflow-stats-batch.yml @@ -4,10 +4,12 @@ on: schedule: - cron: '*/15 * * * *' - cron: '25 0 * * *' + - cron: '25 1 * * 6' jobs: - gh-workflow-stats-batch: - name: GitHub Workflow Stats Batch + gh-workflow-stats-batch-2h: + name: GitHub Workflow Stats Batch 2 hours + if: github.event.schedule == '*/15 * * * *' runs-on: ubuntu-22.04 permissions: actions: read @@ -16,14 +18,36 @@ jobs: uses: neondatabase/gh-workflow-stats-action@v0.2.1 with: db_uri: ${{ secrets.GH_REPORT_STATS_DB_RW_CONNSTR }} - db_table: "gh_workflow_stats_batch_neon" + db_table: "gh_workflow_stats_neon" gh_token: ${{ secrets.GITHUB_TOKEN }} duration: '2h' - - name: Export Workflow Run for the past 24 hours - if: github.event.schedule == '25 0 * * *' + + gh-workflow-stats-batch-48h: + name: GitHub Workflow Stats Batch 48 hours + if: github.event.schedule == '25 0 * * *' + runs-on: ubuntu-22.04 + permissions: + actions: read + steps: + - name: Export Workflow Run for the past 48 hours uses: neondatabase/gh-workflow-stats-action@v0.2.1 with: db_uri: ${{ secrets.GH_REPORT_STATS_DB_RW_CONNSTR }} - db_table: "gh_workflow_stats_batch_neon" + db_table: "gh_workflow_stats_neon" gh_token: ${{ 
secrets.GITHUB_TOKEN }} - duration: '24h' + duration: '48h' + + gh-workflow-stats-batch-30d: + name: GitHub Workflow Stats Batch 30 days + if: github.event.schedule == '25 1 * * 6' + runs-on: ubuntu-22.04 + permissions: + actions: read + steps: + - name: Export Workflow Run for the past 30 days + uses: neondatabase/gh-workflow-stats-action@v0.2.1 + with: + db_uri: ${{ secrets.GH_REPORT_STATS_DB_RW_CONNSTR }} + db_table: "gh_workflow_stats_neon" + gh_token: ${{ secrets.GITHUB_TOKEN }} + duration: '720h' diff --git a/.github/workflows/report-workflow-stats.yml b/.github/workflows/report-workflow-stats.yml deleted file mode 100644 index 0d135a257c..0000000000 --- a/.github/workflows/report-workflow-stats.yml +++ /dev/null @@ -1,42 +0,0 @@ -name: Report Workflow Stats - -on: - workflow_run: - workflows: - - Add `external` label to issues and PRs created by external users - - Benchmarking - - Build and Test - - Build and Test Locally - - Build build-tools image - - Check Permissions - - Check build-tools image - - Check neon with extra platform builds - - Cloud Regression Test - - Create Release Branch - - Handle `approved-for-ci-run` label - - Lint GitHub Workflows - - Notify Slack channel about upcoming release - - Periodic pagebench performance test on dedicated EC2 machine in eu-central-1 region - - Pin build-tools image - - Prepare benchmarking databases by restoring dumps - - Push images to ACR - - Test Postgres client libraries - - Trigger E2E Tests - - cleanup caches by a branch - - Pre-merge checks - types: [completed] - -jobs: - gh-workflow-stats: - name: Github Workflow Stats - runs-on: ubuntu-22.04 - permissions: - actions: read - steps: - - name: Export GH Workflow Stats - uses: neondatabase/gh-workflow-stats-action@v0.1.4 - with: - DB_URI: ${{ secrets.GH_REPORT_STATS_DB_RW_CONNSTR }} - DB_TABLE: "gh_workflow_stats_neon" - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - GH_RUN_ID: ${{ github.event.workflow_run.id }} diff --git a/CODEOWNERS b/CODEOWNERS index 
f8ed4be816..21b0e7c51f 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1,6 +1,5 @@ +/.github/ @neondatabase/developer-productivity /compute_tools/ @neondatabase/control-plane @neondatabase/compute -/storage_controller @neondatabase/storage -/storage_scrubber @neondatabase/storage /libs/pageserver_api/ @neondatabase/storage /libs/postgres_ffi/ @neondatabase/compute @neondatabase/storage /libs/remote_storage/ @neondatabase/storage @@ -11,4 +10,6 @@ /pgxn/neon/ @neondatabase/compute @neondatabase/storage /proxy/ @neondatabase/proxy /safekeeper/ @neondatabase/storage +/storage_controller @neondatabase/storage +/storage_scrubber @neondatabase/storage /vendor/ @neondatabase/compute diff --git a/Cargo.lock b/Cargo.lock index f92da5ec51..665aa4aecc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -46,6 +46,15 @@ dependencies = [ "memchr", ] +[[package]] +name = "aligned-vec" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e0966165eaf052580bd70eb1b32cb3d6245774c0104d1b2793e9650bf83b52a" +dependencies = [ + "equator", +] + [[package]] name = "allocator-api2" version = "0.2.16" @@ -146,6 +155,12 @@ dependencies = [ "static_assertions", ] +[[package]] +name = "arrayvec" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" + [[package]] name = "asn1-rs" version = "0.6.2" @@ -359,6 +374,28 @@ dependencies = [ "tracing", ] +[[package]] +name = "aws-sdk-kms" +version = "1.47.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "564a597a3c71a957d60a2e4c62c93d78ee5a0d636531e15b760acad983a5c18e" +dependencies = [ + "aws-credential-types", + "aws-runtime", + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-json", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-types", + "bytes", + "http 0.2.9", + "once_cell", + "regex-lite", + "tracing", +] + [[package]] name = 
"aws-sdk-s3" version = "1.52.0" @@ -575,9 +612,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime" -version = "1.7.1" +version = "1.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d1ce695746394772e7000b39fe073095db6d45a862d0767dd5ad0ac0d7f8eb87" +checksum = "a065c0fe6fdbdf9f11817eb68582b2ab4aff9e9c39e986ae48f7ec576c6322db" dependencies = [ "aws-smithy-async", "aws-smithy-http", @@ -742,7 +779,7 @@ dependencies = [ "once_cell", "paste", "pin-project", - "quick-xml", + "quick-xml 0.31.0", "rand 0.8.5", "reqwest 0.11.19", "rustc_version", @@ -1220,6 +1257,10 @@ name = "compute_tools" version = "0.1.0" dependencies = [ "anyhow", + "aws-config", + "aws-sdk-kms", + "aws-sdk-s3", + "base64 0.13.1", "bytes", "camino", "cfg-if", @@ -1237,13 +1278,16 @@ dependencies = [ "opentelemetry", "opentelemetry_sdk", "postgres", + "postgres_initdb", "prometheus", "regex", "remote_storage", "reqwest 0.12.4", "rlimit", "rust-ini", + "serde", "serde_json", + "serde_with", "signal-hook", "tar", "thiserror", @@ -1381,6 +1425,15 @@ version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa" +[[package]] +name = "cpp_demangle" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96e58d342ad113c2b878f16d5d034c03be492ae460cdbc02b7f0f2284d310c7d" +dependencies = [ + "cfg-if", +] + [[package]] name = "cpufeatures" version = "0.2.9" @@ -1904,6 +1957,26 @@ dependencies = [ "termcolor", ] +[[package]] +name = "equator" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c35da53b5a021d2484a7cc49b2ac7f2d840f8236a286f84202369bd338d761ea" +dependencies = [ + "equator-macro", +] + +[[package]] +name = "equator-macro" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"3bf679796c0322556351f287a51b49e48f7c4986e727b5dd78c972d30e2e16cc" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.52", +] + [[package]] name = "equivalent" version = "1.0.1" @@ -2011,6 +2084,18 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "findshlibs" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40b9e59cd0f7e0806cca4be089683ecb6434e602038df21fe6bf6711b2f07f64" +dependencies = [ + "cc", + "lazy_static", + "libc", + "winapi", +] + [[package]] name = "fixedbitset" version = "0.4.2" @@ -2714,6 +2799,24 @@ version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "64e9829a50b42bb782c1df523f78d332fe371b10c661e78b7a3c34b0198e9fac" +[[package]] +name = "inferno" +version = "0.11.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "232929e1d75fe899576a3d5c7416ad0d88dbfbb3c3d6aa00873a7408a50ddb88" +dependencies = [ + "ahash", + "indexmap 2.0.1", + "is-terminal", + "itoa", + "log", + "num-format", + "once_cell", + "quick-xml 0.26.0", + "rgb", + "str_stack", +] + [[package]] name = "inotify" version = "0.9.6" @@ -2764,9 +2867,9 @@ dependencies = [ [[package]] name = "ipnet" -version = "2.9.0" +version = "2.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f518f335dce6725a761382244631d86cf0ccb2863413590b31338feb467f9c3" +checksum = "ddc24109865250148c2e0f3d25d4f0f479571723792d3802153c60922a4fb708" [[package]] name = "is-terminal" @@ -3053,6 +3156,15 @@ version = "2.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167" +[[package]] +name = "memmap2" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "45fd3a57831bf88bc63f8cebc0cf956116276e97fef3966103e96416209f7c92" +dependencies = [ + "libc", +] + [[package]] name = "memoffset" version = "0.7.1" @@ -3278,6 
+3390,16 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9" +[[package]] +name = "num-format" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a652d9771a63711fd3c3deb670acfbe5c30a4072e664d7a3bf5a9e1056ac72c3" +dependencies = [ + "arrayvec", + "itoa", +] + [[package]] name = "num-integer" version = "0.1.45" @@ -3578,7 +3700,6 @@ dependencies = [ "thiserror", "tokio", "tokio-util", - "toml_edit", "utils", "workspace_hack", ] @@ -3620,6 +3741,7 @@ dependencies = [ "num_cpus", "once_cell", "pageserver_api", + "pageserver_client", "pageserver_compaction", "pin-project-lite", "postgres", @@ -3628,6 +3750,7 @@ dependencies = [ "postgres_backend", "postgres_connection", "postgres_ffi", + "postgres_initdb", "pq_proto", "procfs", "rand 0.8.5", @@ -3642,6 +3765,7 @@ dependencies = [ "serde_json", "serde_path_to_error", "serde_with", + "smallvec", "storage_broker", "strum", "strum_macros", @@ -4102,12 +4226,48 @@ dependencies = [ "utils", ] +[[package]] +name = "postgres_initdb" +version = "0.1.0" +dependencies = [ + "anyhow", + "camino", + "thiserror", + "tokio", + "workspace_hack", +] + [[package]] name = "powerfmt" version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" +[[package]] +name = "pprof" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebbe2f8898beba44815fdc9e5a4ae9c929e21c5dc29b0c774a15555f7f58d6d0" +dependencies = [ + "aligned-vec", + "backtrace", + "cfg-if", + "criterion", + "findshlibs", + "inferno", + "libc", + "log", + "nix 0.26.4", + "once_cell", + "parking_lot 0.12.1", + "protobuf", + "protobuf-codegen-pure", + "smallvec", + "symbolic-demangle", + "tempfile", + "thiserror", +] + [[package]] name = "ppv-lite86" version = "0.2.17" @@ -4260,6 
+4420,31 @@ dependencies = [ "prost", ] +[[package]] +name = "protobuf" +version = "2.28.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "106dd99e98437432fed6519dedecfade6a06a73bb7b2a1e019fdd2bee5778d94" + +[[package]] +name = "protobuf-codegen" +version = "2.28.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "033460afb75cf755fcfc16dfaed20b86468082a2ea24e05ac35ab4a099a017d6" +dependencies = [ + "protobuf", +] + +[[package]] +name = "protobuf-codegen-pure" +version = "2.28.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95a29399fc94bcd3eeaa951c715f7bea69409b2445356b00519740bcd6ddd865" +dependencies = [ + "protobuf", + "protobuf-codegen", +] + [[package]] name = "proxy" version = "0.1.0" @@ -4371,6 +4556,15 @@ dependencies = [ "zerocopy", ] +[[package]] +name = "quick-xml" +version = "0.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f50b1c63b38611e7d4d7f68b82d3ad0cc71a2ad2e7f61fc10f1328d917c93cd" +dependencies = [ + "memchr", +] + [[package]] name = "quick-xml" version = "0.31.0" @@ -4853,6 +5047,15 @@ dependencies = [ "subtle", ] +[[package]] +name = "rgb" +version = "0.8.50" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57397d16646700483b67d2dd6511d79318f9d057fdbd21a4066aeac8b41d310a" +dependencies = [ + "bytemuck", +] + [[package]] name = "ring" version = "0.17.6" @@ -5166,6 +5369,7 @@ dependencies = [ "postgres-protocol", "postgres_backend", "postgres_ffi", + "pprof", "pq_proto", "rand 0.8.5", "regex", @@ -5712,6 +5916,12 @@ dependencies = [ "der 0.7.8", ] +[[package]] +name = "stable_deref_trait" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" + [[package]] name = "static_assertions" version = "1.1.0" @@ -5858,6 +6068,12 @@ dependencies = [ "workspace_hack", ] +[[package]] +name 
= "str_stack" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9091b6114800a5f2141aee1d1b9d6ca3592ac062dc5decb3764ec5895a47b4eb" + [[package]] name = "stringprep" version = "0.1.2" @@ -5905,6 +6121,29 @@ version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "20e16a0f46cf5fd675563ef54f26e83e20f2366bcf027bcb3cc3ed2b98aaf2ca" +[[package]] +name = "symbolic-common" +version = "12.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "366f1b4c6baf6cfefc234bbd4899535fca0b06c74443039a73f6dfb2fad88d77" +dependencies = [ + "debugid", + "memmap2", + "stable_deref_trait", + "uuid", +] + +[[package]] +name = "symbolic-demangle" +version = "12.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aba05ba5b9962ea5617baf556293720a8b2d0a282aa14ee4bf10e22efc7da8c8" +dependencies = [ + "cpp_demangle", + "rustc-demangle", + "symbolic-common", +] + [[package]] name = "syn" version = "1.0.109" @@ -6772,6 +7011,7 @@ dependencies = [ "once_cell", "pin-project-lite", "postgres_connection", + "pprof", "pq_proto", "rand 0.8.5", "regex", @@ -7306,6 +7546,7 @@ dependencies = [ "anyhow", "axum", "axum-core", + "base64 0.13.1", "base64 0.21.1", "base64ct", "bytes", @@ -7340,6 +7581,7 @@ dependencies = [ "libc", "log", "memchr", + "nix 0.26.4", "nom", "num-bigint", "num-integer", diff --git a/Cargo.toml b/Cargo.toml index dbda930535..e3dc5b97f8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -34,6 +34,7 @@ members = [ "libs/vm_monitor", "libs/walproposer", "libs/wal_decoder", + "libs/postgres_initdb", ] [workspace.package] @@ -57,6 +58,7 @@ async-trait = "0.1" aws-config = { version = "1.5", default-features = false, features=["rustls", "sso"] } aws-sdk-s3 = "1.52" aws-sdk-iam = "1.46.0" +aws-sdk-kms = "1.47.0" aws-smithy-async = { version = "1.2.1", default-features = false, features=["rt-tokio"] } aws-smithy-types = "1.2" aws-credential-types = 
"1.2.0" @@ -73,7 +75,7 @@ bytes = "1.0" camino = "1.1.6" cfg-if = "1.0.0" chrono = { version = "0.4", default-features = false, features = ["clock"] } -clap = { version = "4.0", features = ["derive"] } +clap = { version = "4.0", features = ["derive", "env"] } comfy-table = "7.1" const_format = "0.2" crc32c = "0.6" @@ -106,7 +108,7 @@ hyper-util = "0.1" tokio-tungstenite = "0.21.0" indexmap = "2" indoc = "2" -ipnet = "2.9.0" +ipnet = "2.10.0" itertools = "0.10" itoa = "1.0.11" jsonwebtoken = "9" @@ -130,6 +132,7 @@ parquet = { version = "53", default-features = false, features = ["zstd"] } parquet_derive = "53" pbkdf2 = { version = "0.12.1", features = ["simple", "std"] } pin-project-lite = "0.2" +pprof = { version = "0.14", features = ["criterion", "flamegraph", "protobuf", "protobuf-codec"] } procfs = "0.16" prometheus = {version = "0.13", default-features=false, features = ["process"]} # removes protobuf dependency prost = "0.13" @@ -153,7 +156,7 @@ sentry = { version = "0.32", default-features = false, features = ["backtrace", serde = { version = "1.0", features = ["derive"] } serde_json = "1" serde_path_to_error = "0.1" -serde_with = "2.0" +serde_with = { version = "2.0", features = [ "base64" ] } serde_assert = "0.5.0" sha2 = "0.10.2" signal-hook = "0.3" @@ -212,12 +215,14 @@ tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", br compute_api = { version = "0.1", path = "./libs/compute_api/" } consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" } metrics = { version = "0.1", path = "./libs/metrics/" } +pageserver = { path = "./pageserver" } pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" } pageserver_client = { path = "./pageserver/client" } pageserver_compaction = { version = "0.1", path = "./pageserver/compaction/" } postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" } postgres_connection = { version = "0.1", path = "./libs/postgres_connection/" } postgres_ffi = { 
version = "0.1", path = "./libs/postgres_ffi/" } +postgres_initdb = { path = "./libs/postgres_initdb" } pq_proto = { version = "0.1", path = "./libs/pq_proto/" } remote_storage = { version = "0.1", path = "./libs/remote_storage/" } safekeeper_api = { version = "0.1", path = "./libs/safekeeper_api" } diff --git a/README.md b/README.md index e68ef70bdf..1417d6b9e7 100644 --- a/README.md +++ b/README.md @@ -132,7 +132,7 @@ make -j`sysctl -n hw.logicalcpu` -s To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `pg_install/bin` and `pg_install/lib`, respectively. To run the integration tests or Python scripts (not required to use the code), install -Python (3.9 or higher), and install the python3 packages using `./scripts/pysync` (requires [poetry>=1.8](https://python-poetry.org/)) in the project directory. +Python (3.11 or higher), and install the python3 packages using `./scripts/pysync` (requires [poetry>=1.8](https://python-poetry.org/)) in the project directory. 
#### Running neon database diff --git a/build-tools.Dockerfile b/build-tools.Dockerfile index c1190b13f4..24e5bbf46f 100644 --- a/build-tools.Dockerfile +++ b/build-tools.Dockerfile @@ -234,7 +234,7 @@ USER nonroot:nonroot WORKDIR /home/nonroot # Python -ENV PYTHON_VERSION=3.9.19 \ +ENV PYTHON_VERSION=3.11.10 \ PYENV_ROOT=/home/nonroot/.pyenv \ PATH=/home/nonroot/.pyenv/shims:/home/nonroot/.pyenv/bin:/home/nonroot/.poetry/bin:$PATH RUN set -e \ diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 32405ece86..7c21c67a0a 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1243,7 +1243,7 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \ ######################################################################################### # -# Compile and run the Neon-specific `compute_ctl` binary +# Compile and run the Neon-specific `compute_ctl` and `fast_import` binaries # ######################################################################################### FROM $REPOSITORY/$IMAGE:$TAG AS compute-tools @@ -1264,6 +1264,7 @@ RUN cd compute_tools && mold -run cargo build --locked --profile release-line-de FROM debian:$DEBIAN_FLAVOR AS compute-tools-image COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl +COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/fast_import /usr/local/bin/fast_import ######################################################################################### # @@ -1458,6 +1459,7 @@ RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl +COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/fast_import /usr/local/bin/fast_import # pgbouncer and its 
config COPY --from=pgbouncer /usr/local/pgbouncer/bin/pgbouncer /usr/local/bin/pgbouncer @@ -1533,6 +1535,25 @@ RUN apt update && \ rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8 +# s5cmd 2.2.2 from https://github.com/peak/s5cmd/releases/tag/v2.2.2 +# used by fast_import +ARG TARGETARCH +ADD https://github.com/peak/s5cmd/releases/download/v2.2.2/s5cmd_2.2.2_linux_$TARGETARCH.deb /tmp/s5cmd.deb +RUN set -ex; \ + \ + # Determine the expected checksum based on TARGETARCH + if [ "${TARGETARCH}" = "amd64" ]; then \ + CHECKSUM="392c385320cd5ffa435759a95af77c215553d967e4b1c0fffe52e4f14c29cf85"; \ + elif [ "${TARGETARCH}" = "arm64" ]; then \ + CHECKSUM="939bee3cf4b5604ddb00e67f8c157b91d7c7a5b553d1fbb6890fad32894b7b46"; \ + else \ + echo "Unsupported architecture: ${TARGETARCH}"; exit 1; \ + fi; \ + \ + # Compute and validate the checksum + echo "${CHECKSUM} /tmp/s5cmd.deb" | sha256sum -c - +RUN dpkg -i /tmp/s5cmd.deb && rm /tmp/s5cmd.deb + ENV LANG=en_US.utf8 USER postgres ENTRYPOINT ["/usr/local/bin/compute_ctl"] diff --git a/compute/patches/cloud_regress_pg16.patch b/compute/patches/cloud_regress_pg16.patch index d15d0cffeb..a4b93d0260 100644 --- a/compute/patches/cloud_regress_pg16.patch +++ b/compute/patches/cloud_regress_pg16.patch @@ -147,7 +147,7 @@ index 542c2e098c..0062d3024f 100644 ALTER TABLE ptnowner1 OWNER TO regress_ptnowner; ALTER TABLE ptnowner OWNER TO regress_ptnowner; diff --git a/src/test/regress/expected/collate.icu.utf8.out b/src/test/regress/expected/collate.icu.utf8.out -index 97bbe53b64..eac3d42a79 100644 +index 3f9a8f539c..0a51b52940 100644 --- a/src/test/regress/expected/collate.icu.utf8.out +++ b/src/test/regress/expected/collate.icu.utf8.out @@ -1016,7 +1016,7 @@ select * from collate_test1 where b ilike 'ABC'; @@ -309,7 +309,7 @@ index b48365ec98..a6ef910055 100644 -- the wrong partition. 
This test is *not* guaranteed to trigger that bug, but -- does so when shared_buffers is small enough. To test if we encountered the diff --git a/src/test/regress/expected/copy2.out b/src/test/regress/expected/copy2.out -index faf1a4d1b0..a44c97db52 100644 +index 9a74820ee8..22400a5551 100644 --- a/src/test/regress/expected/copy2.out +++ b/src/test/regress/expected/copy2.out @@ -553,8 +553,8 @@ select * from check_con_tbl; @@ -573,7 +573,7 @@ index 93302a07ef..1a73f083ac 100644 -- that does not match with what's expected. -- This checks all the object types that include schema qualifications. diff --git a/src/test/regress/expected/create_view.out b/src/test/regress/expected/create_view.out -index f3f8c7b5a2..3e3e54ff4c 100644 +index f551624afb..57f1e432d4 100644 --- a/src/test/regress/expected/create_view.out +++ b/src/test/regress/expected/create_view.out @@ -18,7 +18,8 @@ CREATE TABLE real_city ( @@ -700,12 +700,12 @@ index 6ed50fdcfa..caa00a345d 100644 COMMENT ON FOREIGN DATA WRAPPER dummy IS 'useless'; CREATE FOREIGN DATA WRAPPER postgresql VALIDATOR postgresql_fdw_validator; diff --git a/src/test/regress/expected/foreign_key.out b/src/test/regress/expected/foreign_key.out -index 12e523c737..8872e23935 100644 +index 6b8c2f2414..8e13b7fa46 100644 --- a/src/test/regress/expected/foreign_key.out +++ b/src/test/regress/expected/foreign_key.out -@@ -1968,7 +1968,7 @@ ALTER TABLE fk_partitioned_fk ATTACH PARTITION fk_partitioned_fk_2 - FOR VALUES IN (1600); - -- leave these tables around intentionally +@@ -1985,7 +1985,7 @@ ALTER TABLE fk_partitioned_fk_6 ATTACH PARTITION fk_partitioned_pk_6 FOR VALUES + ERROR: cannot ALTER TABLE "fk_partitioned_pk_61" because it is being used by active queries in this session + DROP TABLE fk_partitioned_pk_6, fk_partitioned_fk_6; -- test the case when the referenced table is owned by a different user -create role regress_other_partitioned_fk_owner; +create role regress_other_partitioned_fk_owner PASSWORD NEON_PASSWORD_PLACEHOLDER; 
@@ -713,7 +713,7 @@ index 12e523c737..8872e23935 100644 set role regress_other_partitioned_fk_owner; create table other_partitioned_fk(a int, b int) partition by list (a); diff --git a/src/test/regress/expected/generated.out b/src/test/regress/expected/generated.out -index 0f623f7119..b48588a54e 100644 +index 5881420388..4ae21aa43c 100644 --- a/src/test/regress/expected/generated.out +++ b/src/test/regress/expected/generated.out @@ -534,7 +534,7 @@ CREATE TABLE gtest10a (a int PRIMARY KEY, b int GENERATED ALWAYS AS (a * 2) STOR @@ -762,7 +762,7 @@ index a2036a1597..805d73b9d2 100644 -- fields, leading to long bucket chains and lots of table expansion. -- this is therefore a stress test of the bucket overflow code (unlike diff --git a/src/test/regress/expected/identity.out b/src/test/regress/expected/identity.out -index cc7772349f..98a08eb48d 100644 +index 1b74958de9..078187b542 100644 --- a/src/test/regress/expected/identity.out +++ b/src/test/regress/expected/identity.out @@ -520,7 +520,7 @@ ALTER TABLE itest7 ALTER COLUMN a SET GENERATED BY DEFAULT; @@ -775,10 +775,10 @@ index cc7772349f..98a08eb48d 100644 GRANT SELECT, INSERT ON itest8 TO regress_identity_user1; SET ROLE regress_identity_user1; diff --git a/src/test/regress/expected/inherit.out b/src/test/regress/expected/inherit.out -index 4943429e9b..0257f22b15 100644 +index 8f831c95c3..ec681b52af 100644 --- a/src/test/regress/expected/inherit.out +++ b/src/test/regress/expected/inherit.out -@@ -2606,7 +2606,7 @@ create index on permtest_parent (left(c, 3)); +@@ -2636,7 +2636,7 @@ create index on permtest_parent (left(c, 3)); insert into permtest_parent select 1, 'a', left(fipshash(i::text), 5) from generate_series(0, 100) i; analyze permtest_parent; @@ -1133,7 +1133,7 @@ index 8475231735..1afae5395f 100644 SELECT rolname, rolpassword FROM pg_authid diff --git a/src/test/regress/expected/privileges.out b/src/test/regress/expected/privileges.out -index fbb0489a4f..2905194e2c 100644 +index 5b9dba7b32..cc408dad42 
100644 --- a/src/test/regress/expected/privileges.out +++ b/src/test/regress/expected/privileges.out @@ -20,19 +20,19 @@ SELECT lo_unlink(oid) FROM pg_largeobject_metadata WHERE oid >= 1000 AND oid < 3 @@ -1185,7 +1185,7 @@ index fbb0489a4f..2905194e2c 100644 GRANT pg_read_all_data TO regress_priv_user6; GRANT pg_write_all_data TO regress_priv_user7; GRANT pg_read_all_settings TO regress_priv_user8 WITH ADMIN OPTION; -@@ -145,8 +145,8 @@ REVOKE pg_read_all_settings FROM regress_priv_user8; +@@ -212,8 +212,8 @@ REVOKE pg_read_all_settings FROM regress_priv_user8; DROP USER regress_priv_user10; DROP USER regress_priv_user9; DROP USER regress_priv_user8; @@ -1196,7 +1196,7 @@ index fbb0489a4f..2905194e2c 100644 ALTER GROUP regress_priv_group1 ADD USER regress_priv_user4; GRANT regress_priv_group2 TO regress_priv_user2 GRANTED BY regress_priv_user1; SET SESSION AUTHORIZATION regress_priv_user1; -@@ -172,12 +172,16 @@ GRANT regress_priv_role TO regress_priv_user1 WITH ADMIN OPTION GRANTED BY regre +@@ -239,12 +239,16 @@ GRANT regress_priv_role TO regress_priv_user1 WITH ADMIN OPTION GRANTED BY regre ERROR: permission denied to grant privileges as role "regress_priv_role" DETAIL: The grantor must have the ADMIN option on role "regress_priv_role". 
GRANT regress_priv_role TO regress_priv_user1 WITH ADMIN OPTION GRANTED BY CURRENT_ROLE; @@ -1213,7 +1213,7 @@ index fbb0489a4f..2905194e2c 100644 DROP ROLE regress_priv_role; SET SESSION AUTHORIZATION regress_priv_user1; SELECT session_user, current_user; -@@ -1709,7 +1713,7 @@ SELECT has_table_privilege('regress_priv_user1', 'atest4', 'SELECT WITH GRANT OP +@@ -1776,7 +1780,7 @@ SELECT has_table_privilege('regress_priv_user1', 'atest4', 'SELECT WITH GRANT OP -- security-restricted operations \c - @@ -1222,7 +1222,7 @@ index fbb0489a4f..2905194e2c 100644 -- Check that index expressions and predicates are run as the table's owner -- A dummy index function checking current_user CREATE FUNCTION sro_ifun(int) RETURNS int AS $$ -@@ -2601,8 +2605,8 @@ drop cascades to function testns.priv_testagg(integer) +@@ -2668,8 +2672,8 @@ drop cascades to function testns.priv_testagg(integer) drop cascades to function testns.priv_testproc(integer) -- Change owner of the schema & and rename of new schema owner \c - @@ -1233,7 +1233,7 @@ index fbb0489a4f..2905194e2c 100644 SET SESSION ROLE regress_schemauser1; CREATE SCHEMA testns; SELECT nspname, rolname FROM pg_namespace, pg_roles WHERE pg_namespace.nspname = 'testns' AND pg_namespace.nspowner = pg_roles.oid; -@@ -2725,7 +2729,7 @@ DROP USER regress_priv_user7; +@@ -2792,7 +2796,7 @@ DROP USER regress_priv_user7; DROP USER regress_priv_user8; -- does not exist ERROR: role "regress_priv_user8" does not exist -- permissions with LOCK TABLE @@ -1242,7 +1242,7 @@ index fbb0489a4f..2905194e2c 100644 CREATE TABLE lock_table (a int); -- LOCK TABLE and SELECT permission GRANT SELECT ON lock_table TO regress_locktable_user; -@@ -2807,7 +2811,7 @@ DROP USER regress_locktable_user; +@@ -2874,7 +2878,7 @@ DROP USER regress_locktable_user; -- pg_backend_memory_contexts. 
-- switch to superuser \c - @@ -1251,7 +1251,7 @@ index fbb0489a4f..2905194e2c 100644 SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- no has_table_privilege --------------------- -@@ -2851,10 +2855,10 @@ RESET ROLE; +@@ -2918,10 +2922,10 @@ RESET ROLE; -- clean up DROP ROLE regress_readallstats; -- test role grantor machinery @@ -1266,7 +1266,7 @@ index fbb0489a4f..2905194e2c 100644 GRANT regress_group TO regress_group_direct_manager WITH INHERIT FALSE, ADMIN TRUE; GRANT regress_group_direct_manager TO regress_group_indirect_manager; SET SESSION AUTHORIZATION regress_group_direct_manager; -@@ -2883,9 +2887,9 @@ DROP ROLE regress_group_direct_manager; +@@ -2950,9 +2954,9 @@ DROP ROLE regress_group_direct_manager; DROP ROLE regress_group_indirect_manager; DROP ROLE regress_group_member; -- test SET and INHERIT options with object ownership changes @@ -1813,7 +1813,7 @@ index 5e6969b173..2c4d52237f 100644 -- clean up roles diff --git a/src/test/regress/expected/rowsecurity.out b/src/test/regress/expected/rowsecurity.out -index 97ca9bf72c..b2a7a6f710 100644 +index 218c0c2863..f7af0cfb12 100644 --- a/src/test/regress/expected/rowsecurity.out +++ b/src/test/regress/expected/rowsecurity.out @@ -14,13 +14,13 @@ DROP ROLE IF EXISTS regress_rls_group2; @@ -1917,6 +1917,19 @@ index b79fe9a1c0..e29fab88ab 100644 ALTER DEFAULT PRIVILEGES FOR ROLE regress_selinto_user REVOKE INSERT ON TABLES FROM regress_selinto_user; GRANT ALL ON SCHEMA selinto_schema TO public; +diff --git a/src/test/regress/expected/select_parallel.out b/src/test/regress/expected/select_parallel.out +index afc6ab08c2..dfcd891af3 100644 +--- a/src/test/regress/expected/select_parallel.out ++++ b/src/test/regress/expected/select_parallel.out +@@ -1220,7 +1220,7 @@ SELECT 1 FROM tenk1_vw_sec + + rollback; + -- test that function option SET ROLE works in parallel workers. 
+-create role regress_parallel_worker; ++create role regress_parallel_worker PASSWORD NEON_PASSWORD_PLACEHOLDER; + create function set_and_report_role() returns text as + $$ select current_setting('role') $$ language sql parallel safe + set role = regress_parallel_worker; diff --git a/src/test/regress/expected/select_views.out b/src/test/regress/expected/select_views.out index 1aeed8452b..7d9427d070 100644 --- a/src/test/regress/expected/select_views.out @@ -2369,7 +2382,7 @@ index 6cb9c926c0..5e689e4062 100644 ALTER TABLE ptnowner1 OWNER TO regress_ptnowner; ALTER TABLE ptnowner OWNER TO regress_ptnowner; diff --git a/src/test/regress/sql/collate.icu.utf8.sql b/src/test/regress/sql/collate.icu.utf8.sql -index 3db9e25913..c66d5aa2c2 100644 +index 8aa902d5ab..24bb823b86 100644 --- a/src/test/regress/sql/collate.icu.utf8.sql +++ b/src/test/regress/sql/collate.icu.utf8.sql @@ -353,7 +353,7 @@ reset enable_seqscan; @@ -2532,7 +2545,7 @@ index 43d2e906dd..6c993d70f0 100644 -- An earlier bug (see commit b1ecb9b3fcf) could end up using a buffer from -- the wrong partition. This test is *not* guaranteed to trigger that bug, but diff --git a/src/test/regress/sql/copy2.sql b/src/test/regress/sql/copy2.sql -index d759635068..d58e50dcc5 100644 +index cf3828c16e..cf3ca38175 100644 --- a/src/test/regress/sql/copy2.sql +++ b/src/test/regress/sql/copy2.sql @@ -365,8 +365,8 @@ copy check_con_tbl from stdin; @@ -2774,7 +2787,7 @@ index 1b7064247a..be5b662ce1 100644 -- Cases where schema creation fails as objects are qualified with a schema -- that does not match with what's expected. 
diff --git a/src/test/regress/sql/create_view.sql b/src/test/regress/sql/create_view.sql -index 3a78be1b0c..617d2dc8d6 100644 +index ae6841308b..47bc792e30 100644 --- a/src/test/regress/sql/create_view.sql +++ b/src/test/regress/sql/create_view.sql @@ -23,7 +23,8 @@ CREATE TABLE real_city ( @@ -2901,11 +2914,11 @@ index aa147b14a9..370e0dd570 100644 CREATE FOREIGN DATA WRAPPER dummy; COMMENT ON FOREIGN DATA WRAPPER dummy IS 'useless'; diff --git a/src/test/regress/sql/foreign_key.sql b/src/test/regress/sql/foreign_key.sql -index 22e177f89b..7138d5e1d4 100644 +index 45c7a534cb..32dd26b8cd 100644 --- a/src/test/regress/sql/foreign_key.sql +++ b/src/test/regress/sql/foreign_key.sql -@@ -1418,7 +1418,7 @@ ALTER TABLE fk_partitioned_fk ATTACH PARTITION fk_partitioned_fk_2 - -- leave these tables around intentionally +@@ -1435,7 +1435,7 @@ ALTER TABLE fk_partitioned_fk_6 ATTACH PARTITION fk_partitioned_pk_6 FOR VALUES + DROP TABLE fk_partitioned_pk_6, fk_partitioned_fk_6; -- test the case when the referenced table is owned by a different user -create role regress_other_partitioned_fk_owner; @@ -2963,7 +2976,7 @@ index 527024f710..de49c0b85f 100644 -- the data in this file has a lot of duplicates in the index key -- fields, leading to long bucket chains and lots of table expansion. 
diff --git a/src/test/regress/sql/identity.sql b/src/test/regress/sql/identity.sql -index 91d2e443b4..241c93f373 100644 +index 7537258a75..9041e35e34 100644 --- a/src/test/regress/sql/identity.sql +++ b/src/test/regress/sql/identity.sql @@ -287,7 +287,7 @@ ALTER TABLE itest7 ALTER COLUMN a RESTART; @@ -2976,10 +2989,10 @@ index 91d2e443b4..241c93f373 100644 GRANT SELECT, INSERT ON itest8 TO regress_identity_user1; SET ROLE regress_identity_user1; diff --git a/src/test/regress/sql/inherit.sql b/src/test/regress/sql/inherit.sql -index fe699c54d5..bdd5993f45 100644 +index b5b554a125..109889ad24 100644 --- a/src/test/regress/sql/inherit.sql +++ b/src/test/regress/sql/inherit.sql -@@ -950,7 +950,7 @@ create index on permtest_parent (left(c, 3)); +@@ -958,7 +958,7 @@ create index on permtest_parent (left(c, 3)); insert into permtest_parent select 1, 'a', left(fipshash(i::text), 5) from generate_series(0, 100) i; analyze permtest_parent; @@ -3218,7 +3231,7 @@ index 53e86b0b6c..f07cf1ec54 100644 CREATE ROLE regress_passwd5 PASSWORD 'md5e73a4b11df52a6068f8b39f90be36023'; diff --git a/src/test/regress/sql/privileges.sql b/src/test/regress/sql/privileges.sql -index 3f68cafcd1..004b26831d 100644 +index 249df17a58..b258e7f26a 100644 --- a/src/test/regress/sql/privileges.sql +++ b/src/test/regress/sql/privileges.sql @@ -24,18 +24,18 @@ RESET client_min_messages; @@ -3269,7 +3282,7 @@ index 3f68cafcd1..004b26831d 100644 GRANT pg_read_all_data TO regress_priv_user6; GRANT pg_write_all_data TO regress_priv_user7; -@@ -130,8 +130,8 @@ DROP USER regress_priv_user10; +@@ -163,8 +163,8 @@ DROP USER regress_priv_user10; DROP USER regress_priv_user9; DROP USER regress_priv_user8; @@ -3280,7 +3293,7 @@ index 3f68cafcd1..004b26831d 100644 ALTER GROUP regress_priv_group1 ADD USER regress_priv_user4; -@@ -1124,7 +1124,7 @@ SELECT has_table_privilege('regress_priv_user1', 'atest4', 'SELECT WITH GRANT OP +@@ -1157,7 +1157,7 @@ SELECT has_table_privilege('regress_priv_user1', 'atest4', 'SELECT 
WITH GRANT OP -- security-restricted operations \c - @@ -3289,7 +3302,7 @@ index 3f68cafcd1..004b26831d 100644 -- Check that index expressions and predicates are run as the table's owner -@@ -1620,8 +1620,8 @@ DROP SCHEMA testns CASCADE; +@@ -1653,8 +1653,8 @@ DROP SCHEMA testns CASCADE; -- Change owner of the schema & and rename of new schema owner \c - @@ -3300,7 +3313,7 @@ index 3f68cafcd1..004b26831d 100644 SET SESSION ROLE regress_schemauser1; CREATE SCHEMA testns; -@@ -1715,7 +1715,7 @@ DROP USER regress_priv_user8; -- does not exist +@@ -1748,7 +1748,7 @@ DROP USER regress_priv_user8; -- does not exist -- permissions with LOCK TABLE @@ -3309,7 +3322,7 @@ index 3f68cafcd1..004b26831d 100644 CREATE TABLE lock_table (a int); -- LOCK TABLE and SELECT permission -@@ -1803,7 +1803,7 @@ DROP USER regress_locktable_user; +@@ -1836,7 +1836,7 @@ DROP USER regress_locktable_user; -- switch to superuser \c - @@ -3318,7 +3331,7 @@ index 3f68cafcd1..004b26831d 100644 SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- no SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT'); -- no -@@ -1823,10 +1823,10 @@ RESET ROLE; +@@ -1856,10 +1856,10 @@ RESET ROLE; DROP ROLE regress_readallstats; -- test role grantor machinery @@ -3333,7 +3346,7 @@ index 3f68cafcd1..004b26831d 100644 GRANT regress_group TO regress_group_direct_manager WITH INHERIT FALSE, ADMIN TRUE; GRANT regress_group_direct_manager TO regress_group_indirect_manager; -@@ -1848,9 +1848,9 @@ DROP ROLE regress_group_indirect_manager; +@@ -1881,9 +1881,9 @@ DROP ROLE regress_group_indirect_manager; DROP ROLE regress_group_member; -- test SET and INHERIT options with object ownership changes @@ -3625,7 +3638,7 @@ index c961b2d730..0859b89c4f 100644 -- clean up roles DROP ROLE regress_test_def_superuser; diff --git a/src/test/regress/sql/rowsecurity.sql b/src/test/regress/sql/rowsecurity.sql -index dec7340538..cdbc03a5cc 100644 +index 
d3bfd53e23..919ce1d0c6 100644 --- a/src/test/regress/sql/rowsecurity.sql +++ b/src/test/regress/sql/rowsecurity.sql @@ -20,13 +20,13 @@ DROP SCHEMA IF EXISTS regress_rls_schema CASCADE; @@ -3701,6 +3714,19 @@ index 689c448cc2..223ceb1d75 100644 ALTER DEFAULT PRIVILEGES FOR ROLE regress_selinto_user REVOKE INSERT ON TABLES FROM regress_selinto_user; GRANT ALL ON SCHEMA selinto_schema TO public; +diff --git a/src/test/regress/sql/select_parallel.sql b/src/test/regress/sql/select_parallel.sql +index 33d78e16dc..cb193c9b27 100644 +--- a/src/test/regress/sql/select_parallel.sql ++++ b/src/test/regress/sql/select_parallel.sql +@@ -464,7 +464,7 @@ SELECT 1 FROM tenk1_vw_sec + rollback; + + -- test that function option SET ROLE works in parallel workers. +-create role regress_parallel_worker; ++create role regress_parallel_worker PASSWORD NEON_PASSWORD_PLACEHOLDER; + + create function set_and_report_role() returns text as + $$ select current_setting('role') $$ language sql parallel safe diff --git a/src/test/regress/sql/select_views.sql b/src/test/regress/sql/select_views.sql index e742f13699..7bd0255df8 100644 --- a/src/test/regress/sql/select_views.sql diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 0bf4ed53d6..c0c390caef 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -10,6 +10,10 @@ default = [] testing = [] [dependencies] +base64.workspace = true +aws-config.workspace = true +aws-sdk-s3.workspace = true +aws-sdk-kms.workspace = true anyhow.workspace = true camino.workspace = true chrono.workspace = true @@ -27,6 +31,8 @@ opentelemetry.workspace = true opentelemetry_sdk.workspace = true postgres.workspace = true regex.workspace = true +serde.workspace = true +serde_with.workspace = true serde_json.workspace = true signal-hook.workspace = true tar.workspace = true @@ -43,6 +49,7 @@ thiserror.workspace = true url.workspace = true prometheus.workspace = true +postgres_initdb.workspace = true compute_api.workspace = true 
utils.workspace = true workspace_hack.workspace = true diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 284db005c8..4689cc2b83 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -105,6 +105,11 @@ fn main() -> Result<()> { fn init() -> Result<(String, clap::ArgMatches)> { init_tracing_and_logging(DEFAULT_LOG_LEVEL)?; + opentelemetry::global::set_error_handler(|err| { + tracing::info!("OpenTelemetry error: {err}"); + }) + .expect("global error handler lock poisoned"); + let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?; thread::spawn(move || { for sig in signals.forever() { diff --git a/compute_tools/src/bin/fast_import.rs b/compute_tools/src/bin/fast_import.rs new file mode 100644 index 0000000000..3b0b990df2 --- /dev/null +++ b/compute_tools/src/bin/fast_import.rs @@ -0,0 +1,338 @@ +//! This program dumps a remote Postgres database into a local Postgres database +//! and uploads the resulting PGDATA into object storage for import into a Timeline. +//! +//! # Context, Architecture, Design +//! +//! See cloud.git Fast Imports RFC () +//! for the full picture. +//! The RFC describing the storage pieces of importing the PGDATA dump into a Timeline +//! is publicly accessible at . +//! +//! # This is a Prototype! +//! +//! This program is part of a prototype feature and not yet used in production. +//! +//! The cloud.git RFC contains lots of suggestions for improving e2e throughput +//! of this step of the timeline import process. +//! +//! # Local Testing +//! +//! - Comment out most of the pgxns in The Dockerfile.compute-tools to speed up the build. +//! - Build the image with the following command: +//! +//! ```bash +//! docker buildx build --build-arg DEBIAN_FLAVOR=bullseye-slim --build-arg GIT_VERSION=local --build-arg PG_VERSION=v14 --build-arg BUILD_TAG="$(date --iso-8601=s -u)" -t localhost:3030/localregistry/compute-node-v14:latest -f compute/Dockerfile.com +//! 
docker push localhost:3030/localregistry/compute-node-v14:latest +//! ``` + +use anyhow::Context; +use aws_config::BehaviorVersion; +use camino::{Utf8Path, Utf8PathBuf}; +use clap::Parser; +use nix::unistd::Pid; +use tracing::{info, info_span, warn, Instrument}; +use utils::fs_ext::is_directory_empty; + +#[path = "fast_import/child_stdio_to_log.rs"] +mod child_stdio_to_log; +#[path = "fast_import/s3_uri.rs"] +mod s3_uri; +#[path = "fast_import/s5cmd.rs"] +mod s5cmd; + +#[derive(clap::Parser)] +struct Args { + #[clap(long)] + working_directory: Utf8PathBuf, + #[clap(long, env = "NEON_IMPORTER_S3_PREFIX")] + s3_prefix: s3_uri::S3Uri, + #[clap(long)] + pg_bin_dir: Utf8PathBuf, + #[clap(long)] + pg_lib_dir: Utf8PathBuf, +} + +#[serde_with::serde_as] +#[derive(serde::Deserialize)] +struct Spec { + encryption_secret: EncryptionSecret, + #[serde_as(as = "serde_with::base64::Base64")] + source_connstring_ciphertext_base64: Vec, +} + +#[derive(serde::Deserialize)] +enum EncryptionSecret { + #[allow(clippy::upper_case_acronyms)] + KMS { key_id: String }, +} + +#[tokio::main] +pub(crate) async fn main() -> anyhow::Result<()> { + utils::logging::init( + utils::logging::LogFormat::Plain, + utils::logging::TracingErrorLayerEnablement::EnableWithRustLogFilter, + utils::logging::Output::Stdout, + )?; + + info!("starting"); + + let Args { + working_directory, + s3_prefix, + pg_bin_dir, + pg_lib_dir, + } = Args::parse(); + + let aws_config = aws_config::load_defaults(BehaviorVersion::v2024_03_28()).await; + + let spec: Spec = { + let spec_key = s3_prefix.append("/spec.json"); + let s3_client = aws_sdk_s3::Client::new(&aws_config); + let object = s3_client + .get_object() + .bucket(&spec_key.bucket) + .key(spec_key.key) + .send() + .await + .context("get spec from s3")? + .body + .collect() + .await + .context("download spec body")?; + serde_json::from_slice(&object.into_bytes()).context("parse spec as json")? 
+ }; + + match tokio::fs::create_dir(&working_directory).await { + Ok(()) => {} + Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => { + if !is_directory_empty(&working_directory) + .await + .context("check if working directory is empty")? + { + anyhow::bail!("working directory is not empty"); + } else { + // ok + } + } + Err(e) => return Err(anyhow::Error::new(e).context("create working directory")), + } + + let pgdata_dir = working_directory.join("pgdata"); + tokio::fs::create_dir(&pgdata_dir) + .await + .context("create pgdata directory")?; + + // + // Setup clients + // + let aws_config = aws_config::load_defaults(BehaviorVersion::v2024_03_28()).await; + let kms_client = aws_sdk_kms::Client::new(&aws_config); + + // + // Initialize pgdata + // + let superuser = "cloud_admin"; // XXX: this shouldn't be hard-coded + postgres_initdb::do_run_initdb(postgres_initdb::RunInitdbArgs { + superuser, + locale: "en_US.UTF-8", // XXX: this shouldn't be hard-coded, + pg_version: 140000, // XXX: this shouldn't be hard-coded but derived from which compute image we're running in + initdb_bin: pg_bin_dir.join("initdb").as_ref(), + library_search_path: &pg_lib_dir, // TODO: is this right? Prob works in compute image, not sure about neon_local. 
+ pgdata: &pgdata_dir, + }) + .await + .context("initdb")?; + + let nproc = num_cpus::get(); + + // + // Launch postgres process + // + let mut postgres_proc = tokio::process::Command::new(pg_bin_dir.join("postgres")) + .arg("-D") + .arg(&pgdata_dir) + .args(["-c", "wal_level=minimal"]) + .args(["-c", "shared_buffers=10GB"]) + .args(["-c", "max_wal_senders=0"]) + .args(["-c", "fsync=off"]) + .args(["-c", "full_page_writes=off"]) + .args(["-c", "synchronous_commit=off"]) + .args(["-c", "maintenance_work_mem=8388608"]) + .args(["-c", &format!("max_parallel_maintenance_workers={nproc}")]) + .args(["-c", &format!("max_parallel_workers={nproc}")]) + .args(["-c", &format!("max_parallel_workers_per_gather={nproc}")]) + .args(["-c", &format!("max_worker_processes={nproc}")]) + .args(["-c", "effective_io_concurrency=100"]) + .env_clear() + .stdout(std::process::Stdio::piped()) + .stderr(std::process::Stdio::piped()) + .spawn() + .context("spawn postgres")?; + + info!("spawned postgres, waiting for it to become ready"); + tokio::spawn( + child_stdio_to_log::relay_process_output( + postgres_proc.stdout.take(), + postgres_proc.stderr.take(), + ) + .instrument(info_span!("postgres")), + ); + let restore_pg_connstring = + format!("host=localhost port=5432 user={superuser} dbname=postgres"); + loop { + let res = tokio_postgres::connect(&restore_pg_connstring, tokio_postgres::NoTls).await; + if res.is_ok() { + info!("postgres is ready, could connect to it"); + break; + } + } + + // + // Decrypt connection string + // + let source_connection_string = { + match spec.encryption_secret { + EncryptionSecret::KMS { key_id } => { + let mut output = kms_client + .decrypt() + .key_id(key_id) + .ciphertext_blob(aws_sdk_s3::primitives::Blob::new( + spec.source_connstring_ciphertext_base64, + )) + .send() + .await + .context("decrypt source connection string")?; + let plaintext = output + .plaintext + .take() + .context("get plaintext source connection string")?; + 
String::from_utf8(plaintext.into_inner()) + .context("parse source connection string as utf8")? + } + } + }; + + // + // Start the work + // + + let dumpdir = working_directory.join("dumpdir"); + + let common_args = [ + // schema mapping (prob suffices to specify them on one side) + "--no-owner".to_string(), + "--no-privileges".to_string(), + "--no-publications".to_string(), + "--no-security-labels".to_string(), + "--no-subscriptions".to_string(), + "--no-tablespaces".to_string(), + // format + "--format".to_string(), + "directory".to_string(), + // concurrency + "--jobs".to_string(), + num_cpus::get().to_string(), + // progress updates + "--verbose".to_string(), + ]; + + info!("dump into the working directory"); + { + let mut pg_dump = tokio::process::Command::new(pg_bin_dir.join("pg_dump")) + .args(&common_args) + .arg("-f") + .arg(&dumpdir) + .arg("--no-sync") + // POSITIONAL args + // source db (db name included in connection string) + .arg(&source_connection_string) + // how we run it + .env_clear() + .kill_on_drop(true) + .stdout(std::process::Stdio::piped()) + .stderr(std::process::Stdio::piped()) + .spawn() + .context("spawn pg_dump")?; + + info!(pid=%pg_dump.id().unwrap(), "spawned pg_dump"); + + tokio::spawn( + child_stdio_to_log::relay_process_output(pg_dump.stdout.take(), pg_dump.stderr.take()) + .instrument(info_span!("pg_dump")), + ); + + let st = pg_dump.wait().await.context("wait for pg_dump")?; + info!(status=?st, "pg_dump exited"); + if !st.success() { + warn!(status=%st, "pg_dump failed, restore will likely fail as well"); + } + } + + // TODO: do it in a streaming way, plenty of internal research done on this already + // TODO: do the unlogged table trick + + info!("restore from working directory into vanilla postgres"); + { + let mut pg_restore = tokio::process::Command::new(pg_bin_dir.join("pg_restore")) + .args(&common_args) + .arg("-d") + .arg(&restore_pg_connstring) + // POSITIONAL args + .arg(&dumpdir) + // how we run it + .env_clear() + 
.kill_on_drop(true) + .stdout(std::process::Stdio::piped()) + .stderr(std::process::Stdio::piped()) + .spawn() + .context("spawn pg_restore")?; + + info!(pid=%pg_restore.id().unwrap(), "spawned pg_restore"); + tokio::spawn( + child_stdio_to_log::relay_process_output( + pg_restore.stdout.take(), + pg_restore.stderr.take(), + ) + .instrument(info_span!("pg_restore")), + ); + let st = pg_restore.wait().await.context("wait for pg_restore")?; + info!(status=?st, "pg_restore exited"); + if !st.success() { + warn!(status=%st, "pg_restore failed, restore will likely fail as well"); + } + } + + info!("shutdown postgres"); + { + nix::sys::signal::kill( + Pid::from_raw( + i32::try_from(postgres_proc.id().unwrap()).expect("convert child pid to i32"), + ), + nix::sys::signal::SIGTERM, + ) + .context("signal postgres to shut down")?; + postgres_proc + .wait() + .await + .context("wait for postgres to shut down")?; + } + + info!("upload pgdata"); + s5cmd::sync(Utf8Path::new(&pgdata_dir), &s3_prefix.append("/")) + .await + .context("sync dump directory to destination")?; + + info!("write status"); + { + let status_dir = working_directory.join("status"); + std::fs::create_dir(&status_dir).context("create status directory")?; + let status_file = status_dir.join("status"); + std::fs::write(&status_file, serde_json::json!({"done": true}).to_string()) + .context("write status file")?; + s5cmd::sync(&status_file, &s3_prefix.append("/status/pgdata")) + .await + .context("sync status directory to destination")?; + } + + Ok(()) +} diff --git a/compute_tools/src/bin/fast_import/child_stdio_to_log.rs b/compute_tools/src/bin/fast_import/child_stdio_to_log.rs new file mode 100644 index 0000000000..6724ef9bed --- /dev/null +++ b/compute_tools/src/bin/fast_import/child_stdio_to_log.rs @@ -0,0 +1,35 @@ +use tokio::io::{AsyncBufReadExt, BufReader}; +use tokio::process::{ChildStderr, ChildStdout}; +use tracing::info; + +/// Asynchronously relays the output from a child process's `stdout` and 
`stderr` to the tracing log. +/// Each line is read and logged individually, with lossy UTF-8 conversion. +/// +/// # Arguments +/// +/// * `stdout`: An `Option` from the child process. +/// * `stderr`: An `Option` from the child process. +/// +pub(crate) async fn relay_process_output(stdout: Option, stderr: Option) { + let stdout_fut = async { + if let Some(stdout) = stdout { + let reader = BufReader::new(stdout); + let mut lines = reader.lines(); + while let Ok(Some(line)) = lines.next_line().await { + info!(fd = "stdout", "{}", line); + } + } + }; + + let stderr_fut = async { + if let Some(stderr) = stderr { + let reader = BufReader::new(stderr); + let mut lines = reader.lines(); + while let Ok(Some(line)) = lines.next_line().await { + info!(fd = "stderr", "{}", line); + } + } + }; + + tokio::join!(stdout_fut, stderr_fut); +} diff --git a/compute_tools/src/bin/fast_import/s3_uri.rs b/compute_tools/src/bin/fast_import/s3_uri.rs new file mode 100644 index 0000000000..52bbef420f --- /dev/null +++ b/compute_tools/src/bin/fast_import/s3_uri.rs @@ -0,0 +1,75 @@ +use anyhow::Result; +use std::str::FromStr; + +/// Struct to hold parsed S3 components +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct S3Uri { + pub bucket: String, + pub key: String, +} + +impl FromStr for S3Uri { + type Err = anyhow::Error; + + /// Parse an S3 URI into a bucket and key + fn from_str(uri: &str) -> Result { + // Ensure the URI starts with "s3://" + if !uri.starts_with("s3://") { + return Err(anyhow::anyhow!("Invalid S3 URI scheme")); + } + + // Remove the "s3://" prefix + let stripped_uri = &uri[5..]; + + // Split the remaining string into bucket and key parts + if let Some((bucket, key)) = stripped_uri.split_once('/') { + Ok(S3Uri { + bucket: bucket.to_string(), + key: key.to_string(), + }) + } else { + Err(anyhow::anyhow!( + "Invalid S3 URI format, missing bucket or key" + )) + } + } +} + +impl S3Uri { + pub fn append(&self, suffix: &str) -> Self { + Self { + bucket: self.bucket.clone(), 
+ key: format!("{}{}", self.key, suffix), + } + } +} + +impl std::fmt::Display for S3Uri { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "s3://{}/{}", self.bucket, self.key) + } +} + +impl clap::builder::TypedValueParser for S3Uri { + type Value = Self; + + fn parse_ref( + &self, + _cmd: &clap::Command, + _arg: Option<&clap::Arg>, + value: &std::ffi::OsStr, + ) -> Result { + let value_str = value.to_str().ok_or_else(|| { + clap::Error::raw( + clap::error::ErrorKind::InvalidUtf8, + "Invalid UTF-8 sequence", + ) + })?; + S3Uri::from_str(value_str).map_err(|e| { + clap::Error::raw( + clap::error::ErrorKind::InvalidValue, + format!("Failed to parse S3 URI: {}", e), + ) + }) + } +} diff --git a/compute_tools/src/bin/fast_import/s5cmd.rs b/compute_tools/src/bin/fast_import/s5cmd.rs new file mode 100644 index 0000000000..d2d9a79736 --- /dev/null +++ b/compute_tools/src/bin/fast_import/s5cmd.rs @@ -0,0 +1,27 @@ +use anyhow::Context; +use camino::Utf8Path; + +use super::s3_uri::S3Uri; + +pub(crate) async fn sync(local: &Utf8Path, remote: &S3Uri) -> anyhow::Result<()> { + let mut builder = tokio::process::Command::new("s5cmd"); + // s5cmd uses aws-sdk-go v1, hence doesn't support AWS_ENDPOINT_URL + if let Some(val) = std::env::var_os("AWS_ENDPOINT_URL") { + builder.arg("--endpoint-url").arg(val); + } + builder + .arg("sync") + .arg(local.as_str()) + .arg(remote.to_string()); + let st = builder + .spawn() + .context("spawn s5cmd")? 
+ .wait() + .await + .context("wait for s5cmd")?; + if st.success() { + Ok(()) + } else { + Err(anyhow::anyhow!("s5cmd failed")) + } +} diff --git a/compute_tools/src/catalog.rs b/compute_tools/src/catalog.rs index 4fefa831e0..2f6f82dd39 100644 --- a/compute_tools/src/catalog.rs +++ b/compute_tools/src/catalog.rs @@ -1,38 +1,40 @@ -use compute_api::{ - responses::CatalogObjects, - spec::{Database, Role}, -}; +use compute_api::responses::CatalogObjects; use futures::Stream; -use postgres::{Client, NoTls}; +use postgres::NoTls; use std::{path::Path, process::Stdio, result::Result, sync::Arc}; use tokio::{ io::{AsyncBufReadExt, BufReader}, process::Command, - task, + spawn, }; +use tokio_postgres::connect; use tokio_stream::{self as stream, StreamExt}; use tokio_util::codec::{BytesCodec, FramedRead}; use tracing::warn; -use crate::{ - compute::ComputeNode, - pg_helpers::{get_existing_dbs, get_existing_roles}, -}; +use crate::compute::ComputeNode; +use crate::pg_helpers::{get_existing_dbs_async, get_existing_roles_async}; pub async fn get_dbs_and_roles(compute: &Arc) -> anyhow::Result { let connstr = compute.connstr.clone(); - task::spawn_blocking(move || { - let mut client = Client::connect(connstr.as_str(), NoTls)?; - let roles: Vec; - { - let mut xact = client.transaction()?; - roles = get_existing_roles(&mut xact)?; - } - let databases: Vec = get_existing_dbs(&mut client)?.values().cloned().collect(); - Ok(CatalogObjects { roles, databases }) - }) - .await? + let (client, connection): (tokio_postgres::Client, _) = + connect(connstr.as_str(), NoTls).await?; + + spawn(async move { + if let Err(e) = connection.await { + eprintln!("connection error: {}", e); + } + }); + + let roles = get_existing_roles_async(&client).await?; + + let databases = get_existing_dbs_async(&client) + .await? 
+ .into_values() + .collect(); + + Ok(CatalogObjects { roles, databases }) } #[derive(Debug, thiserror::Error)] diff --git a/compute_tools/src/checker.rs b/compute_tools/src/checker.rs index d76eaad0a0..cec2b1bed8 100644 --- a/compute_tools/src/checker.rs +++ b/compute_tools/src/checker.rs @@ -1,37 +1,9 @@ use anyhow::{anyhow, Ok, Result}; -use postgres::Client; use tokio_postgres::NoTls; use tracing::{error, instrument, warn}; use crate::compute::ComputeNode; -/// Create a special service table for availability checks -/// only if it does not exist already. -pub fn create_availability_check_data(client: &mut Client) -> Result<()> { - let query = " - DO $$ - BEGIN - IF NOT EXISTS( - SELECT 1 - FROM pg_catalog.pg_tables - WHERE tablename = 'health_check' - ) - THEN - CREATE TABLE health_check ( - id serial primary key, - updated_at timestamptz default now() - ); - INSERT INTO health_check VALUES (1, now()) - ON CONFLICT (id) DO UPDATE - SET updated_at = now(); - END IF; - END - $$;"; - client.execute(query, &[])?; - - Ok(()) -} - /// Update timestamp in a row in a special service table to check /// that we can actually write some data in this particular timeline. 
#[instrument(skip_all)] diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 0a8cb14058..4f67425ba8 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -1,20 +1,21 @@ -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::env; use std::fs; +use std::iter::once; use std::os::unix::fs::{symlink, PermissionsExt}; use std::path::Path; use std::process::{Command, Stdio}; use std::str::FromStr; use std::sync::atomic::AtomicU32; use std::sync::atomic::Ordering; -use std::sync::{Condvar, Mutex, RwLock}; +use std::sync::{Arc, Condvar, Mutex, RwLock}; use std::thread; use std::time::Duration; use std::time::Instant; use anyhow::{Context, Result}; use chrono::{DateTime, Utc}; -use compute_api::spec::PgIdent; +use compute_api::spec::{PgIdent, Role}; use futures::future::join_all; use futures::stream::FuturesUnordered; use futures::StreamExt; @@ -31,15 +32,23 @@ use compute_api::spec::{ComputeFeature, ComputeMode, ComputeSpec, ExtVersion}; use utils::measured_stream::MeasuredReader; use nix::sys::signal::{kill, Signal}; - use remote_storage::{DownloadError, RemotePath}; +use tokio::spawn; +use url::Url; -use crate::checker::create_availability_check_data; use crate::installed_extensions::get_installed_extensions_sync; use crate::local_proxy; -use crate::logger::inlinify; use crate::pg_helpers::*; use crate::spec::*; +use crate::spec_apply::ApplySpecPhase::{ + CreateAndAlterDatabases, CreateAndAlterRoles, CreateAvailabilityCheck, CreateSuperUser, + DropInvalidDatabases, DropRoles, HandleNeonExtension, HandleOtherExtensions, + RenameAndDeleteDatabases, RenameRoles, RunInEachDatabase, +}; +use crate::spec_apply::PerDatabasePhase::{ + ChangeSchemaPerms, DeleteDBRoleReferences, HandleAnonExtension, +}; +use crate::spec_apply::{apply_operations, MutableApplyContext, DB}; use crate::sync_sk::{check_if_synced, ping_safekeeper}; use crate::{config, extension_server}; @@ -224,10 +233,7 @@ fn 
maybe_cgexec(cmd: &str) -> Command { } } -/// Create special neon_superuser role, that's a slightly nerfed version of a real superuser -/// that we give to customers -#[instrument(skip_all)] -fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()> { +pub(crate) fn construct_superuser_query(spec: &ComputeSpec) -> String { let roles = spec .cluster .roles @@ -296,11 +302,8 @@ fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()> $$;"#, roles_decl, database_decl, ); - info!("Neon superuser created: {}", inlinify(&query)); - client - .simple_query(&query) - .map_err(|e| anyhow::anyhow!(e).context(query))?; - Ok(()) + + query } impl ComputeNode { @@ -813,21 +816,14 @@ impl ComputeNode { Ok(()) } - /// Do initial configuration of the already started Postgres. - #[instrument(skip_all)] - pub fn apply_config(&self, compute_state: &ComputeState) -> Result<()> { - // If connection fails, - // it may be the old node with `zenith_admin` superuser. - // - // In this case we need to connect with old `zenith_admin` name - // and create new user. We cannot simply rename connected user, - // but we can create a new one and grant it all privileges. 
- let mut connstr = self.connstr.clone(); + async fn get_maintenance_client(url: &Url) -> Result { + let mut connstr = url.clone(); + connstr .query_pairs_mut() .append_pair("application_name", "apply_config"); - let mut client = match Client::connect(connstr.as_str(), NoTls) { + let (client, conn) = match tokio_postgres::connect(connstr.as_str(), NoTls).await { Err(e) => match e.code() { Some(&SqlState::INVALID_PASSWORD) | Some(&SqlState::INVALID_AUTHORIZATION_SPECIFICATION) => { @@ -845,8 +841,8 @@ impl ComputeNode { let mut client = Client::connect(zenith_admin_connstr.as_str(), NoTls) .context("broken cloud_admin credential: tried connecting with cloud_admin but could not authenticate, and zenith_admin does not work either")?; - // Disable forwarding so that users don't get a cloud_admin role + // Disable forwarding so that users don't get a cloud_admin role let mut func = || { client.simple_query("SET neon.forward_ddl = false")?; client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?; @@ -858,49 +854,309 @@ impl ComputeNode { drop(client); // reconnect with connstring with expected name - Client::connect(connstr.as_str(), NoTls)? + tokio_postgres::connect(connstr.as_str(), NoTls).await? } _ => return Err(e.into()), }, - Ok(client) => client, + Ok((client, conn)) => (client, conn), }; - // Disable DDL forwarding because control plane already knows about these roles/databases. + spawn(async move { + if let Err(e) = conn.await { + error!("maintenance client connection error: {}", e); + } + }); + + // Disable DDL forwarding because control plane already knows about the roles/databases + // we're about to modify. client .simple_query("SET neon.forward_ddl = false") + .await .context("apply_config SET neon.forward_ddl = false")?; - // Proceed with post-startup configuration. Note, that order of operations is important. 
- let spec = &compute_state.pspec.as_ref().expect("spec must be set").spec; - create_neon_superuser(spec, &mut client).context("apply_config create_neon_superuser")?; - cleanup_instance(&mut client).context("apply_config cleanup_instance")?; - handle_roles(spec, &mut client).context("apply_config handle_roles")?; - handle_databases(spec, &mut client).context("apply_config handle_databases")?; - handle_role_deletions(spec, connstr.as_str(), &mut client) - .context("apply_config handle_role_deletions")?; - handle_grants( - spec, - &mut client, - connstr.as_str(), - self.has_feature(ComputeFeature::AnonExtension), - ) - .context("apply_config handle_grants")?; - handle_extensions(spec, &mut client).context("apply_config handle_extensions")?; - handle_extension_neon(&mut client).context("apply_config handle_extension_neon")?; - create_availability_check_data(&mut client) - .context("apply_config create_availability_check_data")?; + Ok(client) + } - // 'Close' connection - drop(client); + /// Apply the spec to the running PostgreSQL instance. + /// The caller can decide to run with multiple clients in parallel, or + /// single mode. Either way, the commands executed will be the same, and + /// only commands run in different databases are parallelized. + #[instrument(skip_all)] + pub fn apply_spec_sql( + &self, + spec: Arc, + url: Arc, + concurrency: usize, + ) -> Result<()> { + let rt = tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build()?; - if let Some(ref local_proxy) = spec.local_proxy_config { + info!("Applying config with max {} concurrency", concurrency); + debug!("Config: {:?}", spec); + + rt.block_on(async { + // Proceed with post-startup configuration. Note, that order of operations is important. + let client = Self::get_maintenance_client(&url).await?; + let spec = spec.clone(); + + let databases = get_existing_dbs_async(&client).await?; + let roles = get_existing_roles_async(&client) + .await? 
+ .into_iter() + .map(|role| (role.name.clone(), role)) + .collect::>(); + + let jwks_roles = Arc::new( + spec.as_ref() + .local_proxy_config + .iter() + .flat_map(|it| &it.jwks) + .flatten() + .flat_map(|setting| &setting.role_names) + .cloned() + .collect::>(), + ); + + let ctx = Arc::new(tokio::sync::RwLock::new(MutableApplyContext { + roles, + dbs: databases, + })); + + for phase in [ + CreateSuperUser, + DropInvalidDatabases, + RenameRoles, + CreateAndAlterRoles, + RenameAndDeleteDatabases, + CreateAndAlterDatabases, + ] { + debug!("Applying phase {:?}", &phase); + apply_operations( + spec.clone(), + ctx.clone(), + jwks_roles.clone(), + phase, + || async { Ok(&client) }, + ) + .await?; + } + + let concurrency_token = Arc::new(tokio::sync::Semaphore::new(concurrency)); + + let db_processes = spec + .cluster + .databases + .iter() + .map(|db| DB::new(db.clone())) + // include + .chain(once(DB::SystemDB)) + .map(|db| { + let spec = spec.clone(); + let ctx = ctx.clone(); + let jwks_roles = jwks_roles.clone(); + let mut url = url.as_ref().clone(); + let concurrency_token = concurrency_token.clone(); + let db = db.clone(); + + debug!("Applying per-database phases for Database {:?}", &db); + + match &db { + DB::SystemDB => {} + DB::UserDB(db) => { + url.set_path(db.name.as_str()); + } + } + + let url = Arc::new(url); + let fut = Self::apply_spec_sql_db( + spec.clone(), + url, + ctx.clone(), + jwks_roles.clone(), + concurrency_token.clone(), + db, + ); + + Ok(spawn(fut)) + }) + .collect::>>(); + + for process in db_processes.into_iter() { + let handle = process?; + handle.await??; + } + + for phase in vec![ + HandleOtherExtensions, + HandleNeonExtension, + CreateAvailabilityCheck, + DropRoles, + ] { + debug!("Applying phase {:?}", &phase); + apply_operations( + spec.clone(), + ctx.clone(), + jwks_roles.clone(), + phase, + || async { Ok(&client) }, + ) + .await?; + } + + Ok::<(), anyhow::Error>(()) + })?; + + Ok(()) + } + + /// Apply SQL migrations of the 
RunInEachDatabase phase. + /// + /// May opt to not connect to databases that don't have any scheduled + /// operations. The function is concurrency-controlled with the provided + /// semaphore. The caller has to make sure the semaphore isn't exhausted. + async fn apply_spec_sql_db( + spec: Arc, + url: Arc, + ctx: Arc>, + jwks_roles: Arc>, + concurrency_token: Arc, + db: DB, + ) -> Result<()> { + let _permit = concurrency_token.acquire().await?; + + let mut client_conn = None; + + for subphase in [ + DeleteDBRoleReferences, + ChangeSchemaPerms, + HandleAnonExtension, + ] { + apply_operations( + spec.clone(), + ctx.clone(), + jwks_roles.clone(), + RunInEachDatabase { + db: db.clone(), + subphase, + }, + // Only connect if apply_operation actually wants a connection. + // It's quite possible this database doesn't need any queries, + // so by not connecting we save time and effort connecting to + // that database. + || async { + if client_conn.is_none() { + let db_client = Self::get_maintenance_client(&url).await?; + client_conn.replace(db_client); + } + let client = client_conn.as_ref().unwrap(); + Ok(client) + }, + ) + .await?; + } + + drop(client_conn); + + Ok::<(), anyhow::Error>(()) + } + + /// Do initial configuration of the already started Postgres. + #[instrument(skip_all)] + pub fn apply_config(&self, compute_state: &ComputeState) -> Result<()> { + // If connection fails, + // it may be the old node with `zenith_admin` superuser. + // + // In this case we need to connect with old `zenith_admin` name + // and create new user. We cannot simply rename connected user, + // but we can create a new one and grant it all privileges. 
+ let mut url = self.connstr.clone(); + url.query_pairs_mut() + .append_pair("application_name", "apply_config"); + + let url = Arc::new(url); + let spec = Arc::new( + compute_state + .pspec + .as_ref() + .expect("spec must be set") + .spec + .clone(), + ); + + // Choose how many concurrent connections to use for applying the spec changes. + // If the cluster is not currently Running we don't have to deal with user connections, + // and can thus use all `max_connections` connection slots. However, that's generally not + // very efficient, so we generally still limit it to a smaller number. + let max_concurrent_connections = if compute_state.status != ComputeStatus::Running { + // If the settings contain 'max_connections', use that as template + if let Some(config) = spec.cluster.settings.find("max_connections") { + config.parse::().ok() + } else { + // Otherwise, try to find the setting in the postgresql_conf string + spec.cluster + .postgresql_conf + .iter() + .flat_map(|conf| conf.split("\n")) + .filter_map(|line| { + if !line.contains("max_connections") { + return None; + } + + let (key, value) = line.split_once("=")?; + let key = key + .trim_start_matches(char::is_whitespace) + .trim_end_matches(char::is_whitespace); + + let value = value + .trim_start_matches(char::is_whitespace) + .trim_end_matches(char::is_whitespace); + + if key != "max_connections" { + return None; + } + + value.parse::().ok() + }) + .next() + } + // If max_connections is present, use at most 1/3rd of that. + // When max_connections is lower than 30, try to use at least 10 connections, but + // never more than max_connections. + .map(|limit| match limit { + 0..10 => limit, + 10..30 => 10, + 30.. => limit / 3, + }) + // If we didn't find max_connections, default to 10 concurrent connections. 
+ .unwrap_or(10) + } else { + // state == Running + // Because the cluster is already in the Running state, we should assume users are + // already connected to the cluster, and high concurrency could negatively + // impact user connectivity. Therefore, we can limit concurrency to the number of + // reserved superuser connections, which users wouldn't be able to use anyway. + spec.cluster + .settings + .find("superuser_reserved_connections") + .iter() + .filter_map(|val| val.parse::().ok()) + .map(|val| if val > 1 { val - 1 } else { 1 }) + .last() + .unwrap_or(3) + }; + + // Merge-apply spec & changes to PostgreSQL state. + self.apply_spec_sql(spec.clone(), url.clone(), max_concurrent_connections)?; + + if let Some(ref local_proxy) = &spec.clone().local_proxy_config { info!("configuring local_proxy"); local_proxy::configure(local_proxy).context("apply_config local_proxy")?; } // Run migrations separately to not hold up cold starts thread::spawn(move || { - let mut connstr = connstr.clone(); + let mut connstr = url.as_ref().clone(); connstr .query_pairs_mut() .append_pair("application_name", "migrations"); @@ -908,7 +1164,8 @@ impl ComputeNode { let mut client = Client::connect(connstr.as_str(), NoTls)?; handle_migrations(&mut client).context("apply_config handle_migrations") }); - Ok(()) + + Ok::<(), anyhow::Error>(()) } // Wrapped this around `pg_ctl reload`, but right now we don't use @@ -971,32 +1228,16 @@ impl ComputeNode { config::with_compute_ctl_tmp_override(pgdata_path, "neon.max_cluster_size=-1", || { self.pg_reload_conf()?; - let mut client = Client::connect(self.connstr.as_str(), NoTls)?; - - // Proceed with post-startup configuration. Note, that order of operations is important. - // Disable DDL forwarding because control plane already knows about these roles/databases. 
if spec.mode == ComputeMode::Primary { - client.simple_query("SET neon.forward_ddl = false")?; - cleanup_instance(&mut client)?; - handle_roles(&spec, &mut client)?; - handle_databases(&spec, &mut client)?; - handle_role_deletions(&spec, self.connstr.as_str(), &mut client)?; - handle_grants( - &spec, - &mut client, - self.connstr.as_str(), - self.has_feature(ComputeFeature::AnonExtension), - )?; - handle_extensions(&spec, &mut client)?; - handle_extension_neon(&mut client)?; - // We can skip handle_migrations here because a new migration can only appear - // if we have a new version of the compute_ctl binary, which can only happen - // if compute got restarted, in which case we'll end up inside of apply_config - // instead of reconfigure. - } + let mut url = self.connstr.clone(); + url.query_pairs_mut() + .append_pair("application_name", "apply_config"); + let url = Arc::new(url); - // 'Close' connection - drop(client); + let spec = Arc::new(spec.clone()); + + self.apply_spec_sql(spec, url, 1)?; + } Ok(()) })?; diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index d4e413034e..d65fe73194 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -116,7 +116,7 @@ pub fn write_postgres_conf( vartype: "enum".to_owned(), }; - write!(file, "{}", opt.to_pg_setting())?; + writeln!(file, "{}", opt.to_pg_setting())?; } } diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs index 3677582c11..8a047634df 100644 --- a/compute_tools/src/http/api.rs +++ b/compute_tools/src/http/api.rs @@ -20,6 +20,7 @@ use anyhow::Result; use hyper::header::CONTENT_TYPE; use hyper::service::{make_service_fn, service_fn}; use hyper::{Body, Method, Request, Response, Server, StatusCode}; +use metrics::proto::MetricFamily; use metrics::Encoder; use metrics::TextEncoder; use tokio::task; @@ -72,10 +73,22 @@ async fn routes(req: Request, compute: &Arc) -> Response { debug!("serving /metrics GET request"); - let mut buffer = vec![]; - let 
metrics = installed_extensions::collect(); + // When we call TextEncoder::encode() below, it will immediately + // return an error if a metric family has no metrics, so we need to + // preemptively filter out metric families with no metrics. + let metrics = installed_extensions::collect() + .into_iter() + .filter(|m| !m.get_metric().is_empty()) + .collect::>(); + let encoder = TextEncoder::new(); - encoder.encode(&metrics, &mut buffer).unwrap(); + let mut buffer = vec![]; + + if let Err(err) = encoder.encode(&metrics, &mut buffer) { + let msg = format!("error handling /metrics request: {err}"); + error!(msg); + return render_json_error(&msg, StatusCode::INTERNAL_SERVER_ERROR); + } match Response::builder() .status(StatusCode::OK) diff --git a/compute_tools/src/installed_extensions.rs b/compute_tools/src/installed_extensions.rs index 6dd55855db..79d8b2ca04 100644 --- a/compute_tools/src/installed_extensions.rs +++ b/compute_tools/src/installed_extensions.rs @@ -115,7 +115,7 @@ pub fn get_installed_extensions_sync(connstr: Url) -> Result<()> { static INSTALLED_EXTENSIONS: Lazy = Lazy::new(|| { register_uint_gauge_vec!( - "installed_extensions", + "compute_installed_extensions", "Number of databases where the version of extension is installed", &["extension_name", "version"] ) diff --git a/compute_tools/src/lib.rs b/compute_tools/src/lib.rs index d27ae58fa2..ee4cf2dfa5 100644 --- a/compute_tools/src/lib.rs +++ b/compute_tools/src/lib.rs @@ -23,5 +23,6 @@ pub mod monitor; pub mod params; pub mod pg_helpers; pub mod spec; +mod spec_apply; pub mod swap; pub mod sync_sk; diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index b2dc265864..4a1e5ee0e8 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -10,9 +10,9 @@ use std::thread::JoinHandle; use std::time::{Duration, Instant}; use anyhow::{bail, Result}; +use futures::StreamExt; use ini::Ini; use notify::{RecursiveMode, Watcher}; -use postgres::{Client, 
Transaction}; use tokio::io::AsyncBufReadExt; use tokio::time::timeout; use tokio_postgres::NoTls; @@ -197,27 +197,34 @@ impl Escaping for PgIdent { } /// Build a list of existing Postgres roles -pub fn get_existing_roles(xact: &mut Transaction<'_>) -> Result> { - let postgres_roles = xact - .query("SELECT rolname, rolpassword FROM pg_catalog.pg_authid", &[])? - .iter() +pub async fn get_existing_roles_async(client: &tokio_postgres::Client) -> Result> { + let postgres_roles = client + .query_raw::( + "SELECT rolname, rolpassword FROM pg_catalog.pg_authid", + &[], + ) + .await? + .filter_map(|row| async { row.ok() }) .map(|row| Role { name: row.get("rolname"), encrypted_password: row.get("rolpassword"), options: None, }) - .collect(); + .collect() + .await; Ok(postgres_roles) } /// Build a list of existing Postgres databases -pub fn get_existing_dbs(client: &mut Client) -> Result> { +pub async fn get_existing_dbs_async( + client: &tokio_postgres::Client, +) -> Result> { // `pg_database.datconnlimit = -2` means that the database is in the // invalid state. See: // https://github.com/postgres/postgres/commit/a4b4cc1d60f7e8ccfcc8ff8cb80c28ee411ad9a9 - let postgres_dbs: Vec = client - .query( + let rowstream = client + .query_raw::( "SELECT datname AS name, datdba::regrole::text AS owner, @@ -226,8 +233,11 @@ pub fn get_existing_dbs(client: &mut Client) -> Result FROM pg_catalog.pg_database;", &[], - )? 
- .iter() + ) + .await?; + + let dbs_map = rowstream + .filter_map(|r| async { r.ok() }) .map(|row| Database { name: row.get("name"), owner: row.get("owner"), @@ -235,12 +245,9 @@ pub fn get_existing_dbs(client: &mut Client) -> Result invalid: row.get("invalid"), options: None, }) - .collect(); - - let dbs_map = postgres_dbs - .iter() .map(|db| (db.name.clone(), db.clone())) - .collect::>(); + .collect::>() + .await; Ok(dbs_map) } diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 73f3d1006a..c7d2deb090 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -1,22 +1,17 @@ -use std::collections::HashSet; +use anyhow::{anyhow, bail, Result}; +use postgres::Client; +use reqwest::StatusCode; use std::fs::File; use std::path::Path; -use std::str::FromStr; - -use anyhow::{anyhow, bail, Context, Result}; -use postgres::config::Config; -use postgres::{Client, NoTls}; -use reqwest::StatusCode; -use tracing::{error, info, info_span, instrument, span_enabled, warn, Level}; +use tracing::{error, info, instrument, warn}; use crate::config; -use crate::logger::inlinify; use crate::migration::MigrationRunner; use crate::params::PG_HBA_ALL_MD5; use crate::pg_helpers::*; use compute_api::responses::{ControlPlaneComputeStatus, ControlPlaneSpecResponse}; -use compute_api::spec::{ComputeSpec, PgIdent, Role}; +use compute_api::spec::ComputeSpec; // Do control plane request and return response if any. In case of error it // returns a bool flag indicating whether it makes sense to retry the request @@ -151,625 +146,6 @@ pub fn add_standby_signal(pgdata_path: &Path) -> Result<()> { Ok(()) } -/// Compute could be unexpectedly shut down, for example, during the -/// database dropping. This leaves the database in the invalid state, -/// which prevents new db creation with the same name. This function -/// will clean it up before proceeding with catalog updates. All -/// possible future cleanup operations may go here too. 
-#[instrument(skip_all)] -pub fn cleanup_instance(client: &mut Client) -> Result<()> { - let existing_dbs = get_existing_dbs(client)?; - - for (_, db) in existing_dbs { - if db.invalid { - // After recent commit in Postgres, interrupted DROP DATABASE - // leaves the database in the invalid state. According to the - // commit message, the only option for user is to drop it again. - // See: - // https://github.com/postgres/postgres/commit/a4b4cc1d60f7e8ccfcc8ff8cb80c28ee411ad9a9 - // - // Postgres Neon extension is done the way, that db is de-registered - // in the control plane metadata only after it is dropped. So there is - // a chance that it still thinks that db should exist. This means - // that it will be re-created by `handle_databases()`. Yet, it's fine - // as user can just repeat drop (in vanilla Postgres they would need - // to do the same, btw). - let query = format!("DROP DATABASE IF EXISTS {}", db.name.pg_quote()); - info!("dropping invalid database {}", db.name); - client.execute(query.as_str(), &[])?; - } - } - - Ok(()) -} - -/// Given a cluster spec json and open transaction it handles roles creation, -/// deletion and update. 
-#[instrument(skip_all)] -pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> { - let mut xact = client.transaction()?; - let existing_roles: Vec = get_existing_roles(&mut xact)?; - - let mut jwks_roles = HashSet::new(); - if let Some(local_proxy) = &spec.local_proxy_config { - for jwks_setting in local_proxy.jwks.iter().flatten() { - for role_name in &jwks_setting.role_names { - jwks_roles.insert(role_name.clone()); - } - } - } - - // Print a list of existing Postgres roles (only in debug mode) - if span_enabled!(Level::INFO) { - let mut vec = Vec::new(); - for r in &existing_roles { - vec.push(format!( - "{}:{}", - r.name, - if r.encrypted_password.is_some() { - "[FILTERED]" - } else { - "(null)" - } - )); - } - - info!("postgres roles (total {}): {:?}", vec.len(), vec); - } - - // Process delta operations first - if let Some(ops) = &spec.delta_operations { - info!("processing role renames"); - for op in ops { - match op.action.as_ref() { - "delete_role" => { - // no-op now, roles will be deleted at the end of configuration - } - // Renaming role drops its password, since role name is - // used as a salt there. It is important that this role - // is recorded with a new `name` in the `roles` list. - // Follow up roles update will set the new password. 
- "rename_role" => { - let new_name = op.new_name.as_ref().unwrap(); - - // XXX: with a limited number of roles it is fine, but consider making it a HashMap - if existing_roles.iter().any(|r| r.name == op.name) { - let query: String = format!( - "ALTER ROLE {} RENAME TO {}", - op.name.pg_quote(), - new_name.pg_quote() - ); - - warn!("renaming role '{}' to '{}'", op.name, new_name); - xact.execute(query.as_str(), &[])?; - } - } - _ => {} - } - } - } - - // Refresh Postgres roles info to handle possible roles renaming - let existing_roles: Vec = get_existing_roles(&mut xact)?; - - info!( - "handling cluster spec roles (total {})", - spec.cluster.roles.len() - ); - for role in &spec.cluster.roles { - let name = &role.name; - // XXX: with a limited number of roles it is fine, but consider making it a HashMap - let pg_role = existing_roles.iter().find(|r| r.name == *name); - - enum RoleAction { - None, - Update, - Create, - } - let action = if let Some(r) = pg_role { - if (r.encrypted_password.is_none() && role.encrypted_password.is_some()) - || (r.encrypted_password.is_some() && role.encrypted_password.is_none()) - { - RoleAction::Update - } else if let Some(pg_pwd) = &r.encrypted_password { - // Check whether password changed or not (trim 'md5' prefix first if any) - // - // This is a backward compatibility hack, which comes from the times when we were using - // md5 for everyone and hashes were stored in the console db without md5 prefix. So when - // role comes from the control-plane (json spec) `Role.encrypted_password` doesn't have md5 prefix, - // but when role comes from Postgres (`get_existing_roles` / `existing_roles`) it has this prefix. - // Here is the only place so far where we compare hashes, so it seems to be the best candidate - // to place this compatibility layer. 
- let pg_pwd = if let Some(stripped) = pg_pwd.strip_prefix("md5") { - stripped - } else { - pg_pwd - }; - if pg_pwd != *role.encrypted_password.as_ref().unwrap() { - RoleAction::Update - } else { - RoleAction::None - } - } else { - RoleAction::None - } - } else { - RoleAction::Create - }; - - match action { - RoleAction::None => {} - RoleAction::Update => { - // This can be run on /every/ role! Not just ones created through the console. - // This means that if you add some funny ALTER here that adds a permission, - // this will get run even on user-created roles! This will result in different - // behavior before and after a spec gets reapplied. The below ALTER as it stands - // now only grants LOGIN and changes the password. Please do not allow this branch - // to do anything silly. - let mut query: String = format!("ALTER ROLE {} ", name.pg_quote()); - query.push_str(&role.to_pg_options()); - xact.execute(query.as_str(), &[])?; - } - RoleAction::Create => { - // This branch only runs when roles are created through the console, so it is - // safe to add more permissions here. BYPASSRLS and REPLICATION are inherited - // from neon_superuser. - let mut query: String = format!( - "CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser", - name.pg_quote() - ); - if jwks_roles.contains(name.as_str()) { - query = format!("CREATE ROLE {}", name.pg_quote()); - } - info!("running role create query: '{}'", &query); - query.push_str(&role.to_pg_options()); - xact.execute(query.as_str(), &[])?; - } - } - - if span_enabled!(Level::INFO) { - let pwd = if role.encrypted_password.is_some() { - "[FILTERED]" - } else { - "(null)" - }; - let action_str = match action { - RoleAction::None => "", - RoleAction::Create => " -> create", - RoleAction::Update => " -> update", - }; - info!(" - {}:{}{}", name, pwd, action_str); - } - } - - xact.commit()?; - - Ok(()) -} - -/// Reassign all dependent objects and delete requested roles. 
-#[instrument(skip_all)] -pub fn handle_role_deletions(spec: &ComputeSpec, connstr: &str, client: &mut Client) -> Result<()> { - if let Some(ops) = &spec.delta_operations { - // First, reassign all dependent objects to db owners. - info!("reassigning dependent objects of to-be-deleted roles"); - - // Fetch existing roles. We could've exported and used `existing_roles` from - // `handle_roles()`, but we only make this list there before creating new roles. - // Which is probably fine as we never create to-be-deleted roles, but that'd - // just look a bit untidy. Anyway, the entire `pg_roles` should be in shared - // buffers already, so this shouldn't be a big deal. - let mut xact = client.transaction()?; - let existing_roles: Vec = get_existing_roles(&mut xact)?; - xact.commit()?; - - for op in ops { - // Check that role is still present in Postgres, as this could be a - // restart with the same spec after role deletion. - if op.action == "delete_role" && existing_roles.iter().any(|r| r.name == op.name) { - reassign_owned_objects(spec, connstr, &op.name)?; - } - } - - // Second, proceed with role deletions. 
- info!("processing role deletions"); - let mut xact = client.transaction()?; - for op in ops { - // We do not check either role exists or not, - // Postgres will take care of it for us - if op.action == "delete_role" { - let query: String = format!("DROP ROLE IF EXISTS {}", &op.name.pg_quote()); - - warn!("deleting role '{}'", &op.name); - xact.execute(query.as_str(), &[])?; - } - } - xact.commit()?; - } - - Ok(()) -} - -fn reassign_owned_objects_in_one_db( - conf: Config, - role_name: &PgIdent, - db_owner: &PgIdent, -) -> Result<()> { - let mut client = conf.connect(NoTls)?; - - // This will reassign all dependent objects to the db owner - let reassign_query = format!( - "REASSIGN OWNED BY {} TO {}", - role_name.pg_quote(), - db_owner.pg_quote() - ); - info!( - "reassigning objects owned by '{}' in db '{}' to '{}'", - role_name, - conf.get_dbname().unwrap_or(""), - db_owner - ); - client.simple_query(&reassign_query)?; - - // This now will only drop privileges of the role - let drop_query = format!("DROP OWNED BY {}", role_name.pg_quote()); - client.simple_query(&drop_query)?; - Ok(()) -} - -// Reassign all owned objects in all databases to the owner of the database. -fn reassign_owned_objects(spec: &ComputeSpec, connstr: &str, role_name: &PgIdent) -> Result<()> { - for db in &spec.cluster.databases { - if db.owner != *role_name { - let mut conf = Config::from_str(connstr)?; - conf.dbname(&db.name); - reassign_owned_objects_in_one_db(conf, role_name, &db.owner)?; - } - } - - // Also handle case when there are no databases in the spec. - // In this case we need to reassign objects in the default database. 
- let conf = Config::from_str(connstr)?; - let db_owner = PgIdent::from_str("cloud_admin")?; - reassign_owned_objects_in_one_db(conf, role_name, &db_owner)?; - - Ok(()) -} - -/// It follows mostly the same logic as `handle_roles()` excepting that we -/// does not use an explicit transactions block, since major database operations -/// like `CREATE DATABASE` and `DROP DATABASE` do not support it. Statement-level -/// atomicity should be enough here due to the order of operations and various checks, -/// which together provide us idempotency. -#[instrument(skip_all)] -pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> { - let existing_dbs = get_existing_dbs(client)?; - - // Print a list of existing Postgres databases (only in debug mode) - if span_enabled!(Level::INFO) { - let mut vec = Vec::new(); - for (dbname, db) in &existing_dbs { - vec.push(format!("{}:{}", dbname, db.owner)); - } - info!("postgres databases (total {}): {:?}", vec.len(), vec); - } - - // Process delta operations first - if let Some(ops) = &spec.delta_operations { - info!("processing delta operations on databases"); - for op in ops { - match op.action.as_ref() { - // We do not check either DB exists or not, - // Postgres will take care of it for us - "delete_db" => { - // In Postgres we can't drop a database if it is a template. - // So we need to unset the template flag first, but it could - // be a retry, so we could've already dropped the database. - // Check that database exists first to make it idempotent. - let unset_template_query: String = format!( - " - DO $$ - BEGIN - IF EXISTS( - SELECT 1 - FROM pg_catalog.pg_database - WHERE datname = {} - ) - THEN - ALTER DATABASE {} is_template false; - END IF; - END - $$;", - escape_literal(&op.name), - &op.name.pg_quote() - ); - // Use FORCE to drop database even if there are active connections. - // We run this from `cloud_admin`, so it should have enough privileges. 
- // NB: there could be other db states, which prevent us from dropping - // the database. For example, if db is used by any active subscription - // or replication slot. - // TODO: deal with it once we allow logical replication. Proper fix should - // involve returning an error code to the control plane, so it could - // figure out that this is a non-retryable error, return it to the user - // and fail operation permanently. - let drop_db_query: String = format!( - "DROP DATABASE IF EXISTS {} WITH (FORCE)", - &op.name.pg_quote() - ); - - warn!("deleting database '{}'", &op.name); - client.execute(unset_template_query.as_str(), &[])?; - client.execute(drop_db_query.as_str(), &[])?; - } - "rename_db" => { - let new_name = op.new_name.as_ref().unwrap(); - - if existing_dbs.contains_key(&op.name) { - let query: String = format!( - "ALTER DATABASE {} RENAME TO {}", - op.name.pg_quote(), - new_name.pg_quote() - ); - - warn!("renaming database '{}' to '{}'", op.name, new_name); - client.execute(query.as_str(), &[])?; - } - } - _ => {} - } - } - } - - // Refresh Postgres databases info to handle possible renames - let existing_dbs = get_existing_dbs(client)?; - - info!( - "handling cluster spec databases (total {})", - spec.cluster.databases.len() - ); - for db in &spec.cluster.databases { - let name = &db.name; - let pg_db = existing_dbs.get(name); - - enum DatabaseAction { - None, - Update, - Create, - } - let action = if let Some(r) = pg_db { - // XXX: db owner name is returned as quoted string from Postgres, - // when quoting is needed. 
- let new_owner = if r.owner.starts_with('"') { - db.owner.pg_quote() - } else { - db.owner.clone() - }; - - if new_owner != r.owner { - // Update the owner - DatabaseAction::Update - } else { - DatabaseAction::None - } - } else { - DatabaseAction::Create - }; - - match action { - DatabaseAction::None => {} - DatabaseAction::Update => { - let query: String = format!( - "ALTER DATABASE {} OWNER TO {}", - name.pg_quote(), - db.owner.pg_quote() - ); - let _guard = info_span!("executing", query).entered(); - client.execute(query.as_str(), &[])?; - } - DatabaseAction::Create => { - let mut query: String = format!("CREATE DATABASE {} ", name.pg_quote()); - query.push_str(&db.to_pg_options()); - let _guard = info_span!("executing", query).entered(); - client.execute(query.as_str(), &[])?; - let grant_query: String = format!( - "GRANT ALL PRIVILEGES ON DATABASE {} TO neon_superuser", - name.pg_quote() - ); - client.execute(grant_query.as_str(), &[])?; - } - }; - - if span_enabled!(Level::INFO) { - let action_str = match action { - DatabaseAction::None => "", - DatabaseAction::Create => " -> create", - DatabaseAction::Update => " -> update", - }; - info!(" - {}:{}{}", db.name, db.owner, action_str); - } - } - - Ok(()) -} - -/// Grant CREATE ON DATABASE to the database owner and do some other alters and grants -/// to allow users creating trusted extensions and re-creating `public` schema, for example. -#[instrument(skip_all)] -pub fn handle_grants( - spec: &ComputeSpec, - client: &mut Client, - connstr: &str, - enable_anon_extension: bool, -) -> Result<()> { - info!("modifying database permissions"); - let existing_dbs = get_existing_dbs(client)?; - - // Do some per-database access adjustments. We'd better do this at db creation time, - // but CREATE DATABASE isn't transactional. So we cannot create db + do some grants - // atomically. 
- for db in &spec.cluster.databases { - match existing_dbs.get(&db.name) { - Some(pg_db) => { - if pg_db.restrict_conn || pg_db.invalid { - info!( - "skipping grants for db {} (invalid: {}, connections not allowed: {})", - db.name, pg_db.invalid, pg_db.restrict_conn - ); - continue; - } - } - None => { - bail!( - "database {} doesn't exist in Postgres after handle_databases()", - db.name - ); - } - } - - let mut conf = Config::from_str(connstr)?; - conf.dbname(&db.name); - - let mut db_client = conf.connect(NoTls)?; - - // This will only change ownership on the schema itself, not the objects - // inside it. Without it owner of the `public` schema will be `cloud_admin` - // and database owner cannot do anything with it. SQL procedure ensures - // that it won't error out if schema `public` doesn't exist. - let alter_query = format!( - "DO $$\n\ - DECLARE\n\ - schema_owner TEXT;\n\ - BEGIN\n\ - IF EXISTS(\n\ - SELECT nspname\n\ - FROM pg_catalog.pg_namespace\n\ - WHERE nspname = 'public'\n\ - )\n\ - THEN\n\ - SELECT nspowner::regrole::text\n\ - FROM pg_catalog.pg_namespace\n\ - WHERE nspname = 'public'\n\ - INTO schema_owner;\n\ - \n\ - IF schema_owner = 'cloud_admin' OR schema_owner = 'zenith_admin'\n\ - THEN\n\ - ALTER SCHEMA public OWNER TO {};\n\ - END IF;\n\ - END IF;\n\ - END\n\ - $$;", - db.owner.pg_quote() - ); - db_client.simple_query(&alter_query)?; - - // Explicitly grant CREATE ON SCHEMA PUBLIC to the web_access user. - // This is needed because since postgres 15 this privilege is removed by default. - // TODO: web_access isn't created for almost 1 year. It could be that we have - // active users of 1 year old projects, but hopefully not, so check it and - // remove this code if possible. The worst thing that could happen is that - // user won't be able to use public schema in NEW databases created in the - // very OLD project. 
- // - // Also, alter default permissions so that relations created by extensions can be - // used by neon_superuser without permission issues. - let grant_query = "DO $$\n\ - BEGIN\n\ - IF EXISTS(\n\ - SELECT nspname\n\ - FROM pg_catalog.pg_namespace\n\ - WHERE nspname = 'public'\n\ - ) AND\n\ - current_setting('server_version_num')::int/10000 >= 15\n\ - THEN\n\ - IF EXISTS(\n\ - SELECT rolname\n\ - FROM pg_catalog.pg_roles\n\ - WHERE rolname = 'web_access'\n\ - )\n\ - THEN\n\ - GRANT CREATE ON SCHEMA public TO web_access;\n\ - END IF;\n\ - END IF;\n\ - IF EXISTS(\n\ - SELECT nspname\n\ - FROM pg_catalog.pg_namespace\n\ - WHERE nspname = 'public'\n\ - )\n\ - THEN\n\ - ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser WITH GRANT OPTION;\n\ - ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO neon_superuser WITH GRANT OPTION;\n\ - END IF;\n\ - END\n\ - $$;" - .to_string(); - - info!( - "grant query for db {} : {}", - &db.name, - inlinify(&grant_query) - ); - db_client.simple_query(&grant_query)?; - - // it is important to run this after all grants - if enable_anon_extension { - handle_extension_anon(spec, &db.owner, &mut db_client, false) - .context("handle_grants handle_extension_anon")?; - } - } - - Ok(()) -} - -/// Create required system extensions -#[instrument(skip_all)] -pub fn handle_extensions(spec: &ComputeSpec, client: &mut Client) -> Result<()> { - if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") { - if libs.contains("pg_stat_statements") { - // Create extension only if this compute really needs it - let query = "CREATE EXTENSION IF NOT EXISTS pg_stat_statements"; - info!("creating system extensions with query: {}", query); - client.simple_query(query)?; - } - } - - Ok(()) -} - -/// Run CREATE and ALTER EXTENSION neon UPDATE for postgres database -#[instrument(skip_all)] -pub fn handle_extension_neon(client: &mut Client) -> Result<()> { - info!("handle extension neon"); - - let mut 
query = "CREATE SCHEMA IF NOT EXISTS neon"; - client.simple_query(query)?; - - query = "CREATE EXTENSION IF NOT EXISTS neon WITH SCHEMA neon"; - info!("create neon extension with query: {}", query); - client.simple_query(query)?; - - query = "UPDATE pg_extension SET extrelocatable = true WHERE extname = 'neon'"; - client.simple_query(query)?; - - query = "ALTER EXTENSION neon SET SCHEMA neon"; - info!("alter neon extension schema with query: {}", query); - client.simple_query(query)?; - - // this will be a no-op if extension is already up to date, - // which may happen in two cases: - // - extension was just installed - // - extension was already installed and is up to date - let query = "ALTER EXTENSION neon UPDATE"; - info!("update neon extension version with query: {}", query); - if let Err(e) = client.simple_query(query) { - error!( - "failed to upgrade neon extension during `handle_extension_neon`: {}", - e - ); - } - - Ok(()) -} - #[instrument(skip_all)] pub fn handle_neon_extension_upgrade(client: &mut Client) -> Result<()> { info!("handle neon extension upgrade"); diff --git a/compute_tools/src/spec_apply.rs b/compute_tools/src/spec_apply.rs new file mode 100644 index 0000000000..7308d5d36e --- /dev/null +++ b/compute_tools/src/spec_apply.rs @@ -0,0 +1,680 @@ +use std::collections::{HashMap, HashSet}; +use std::fmt::{Debug, Formatter}; +use std::future::Future; +use std::iter::empty; +use std::iter::once; +use std::sync::Arc; + +use crate::compute::construct_superuser_query; +use crate::pg_helpers::{escape_literal, DatabaseExt, Escaping, GenericOptionsSearch, RoleExt}; +use anyhow::{bail, Result}; +use compute_api::spec::{ComputeFeature, ComputeSpec, Database, PgIdent, Role}; +use futures::future::join_all; +use tokio::sync::RwLock; +use tokio_postgres::Client; +use tracing::{debug, info_span, Instrument}; + +#[derive(Clone)] +pub enum DB { + SystemDB, + UserDB(Database), +} + +impl DB { + pub fn new(db: Database) -> DB { + Self::UserDB(db) + } + + pub fn 
is_owned_by(&self, role: &PgIdent) -> bool { + match self { + DB::SystemDB => false, + DB::UserDB(db) => &db.owner == role, + } + } +} + +impl Debug for DB { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + DB::SystemDB => f.debug_tuple("SystemDB").finish(), + DB::UserDB(db) => f.debug_tuple("UserDB").field(&db.name).finish(), + } + } +} + +#[derive(Copy, Clone, Debug)] +pub enum PerDatabasePhase { + DeleteDBRoleReferences, + ChangeSchemaPerms, + HandleAnonExtension, +} + +#[derive(Clone, Debug)] +pub enum ApplySpecPhase { + CreateSuperUser, + DropInvalidDatabases, + RenameRoles, + CreateAndAlterRoles, + RenameAndDeleteDatabases, + CreateAndAlterDatabases, + RunInEachDatabase { db: DB, subphase: PerDatabasePhase }, + HandleOtherExtensions, + HandleNeonExtension, + CreateAvailabilityCheck, + DropRoles, +} + +pub struct Operation { + pub query: String, + pub comment: Option, +} + +pub struct MutableApplyContext { + pub roles: HashMap, + pub dbs: HashMap, +} + +/// Appply the operations that belong to the given spec apply phase. +/// +/// Commands within a single phase are executed in order of Iterator yield. +/// Commands of ApplySpecPhase::RunInEachDatabase will execute in the database +/// indicated by its `db` field, and can share a single client for all changes +/// to that database. +/// +/// Notes: +/// - Commands are pipelined, and thus may cause incomplete apply if one +/// command of many fails. +/// - Failing commands will fail the phase's apply step once the return value +/// is processed. +/// - No timeouts have (yet) been implemented. +/// - The caller is responsible for limiting and/or applying concurrency. 
+pub async fn apply_operations<'a, Fut, F>( + spec: Arc, + ctx: Arc>, + jwks_roles: Arc>, + apply_spec_phase: ApplySpecPhase, + client: F, +) -> Result<()> +where + F: FnOnce() -> Fut, + Fut: Future>, +{ + debug!("Starting phase {:?}", &apply_spec_phase); + let span = info_span!("db_apply_changes", phase=?apply_spec_phase); + let span2 = span.clone(); + async move { + debug!("Processing phase {:?}", &apply_spec_phase); + let ctx = ctx; + + let mut ops = get_operations(&spec, &ctx, &jwks_roles, &apply_spec_phase) + .await? + .peekable(); + + // Return (and by doing so, skip requesting the PostgreSQL client) if + // we don't have any operations scheduled. + if ops.peek().is_none() { + return Ok(()); + } + + let client = client().await?; + + debug!("Applying phase {:?}", &apply_spec_phase); + + let active_queries = ops + .map(|op| { + let Operation { comment, query } = op; + let inspan = match comment { + None => span.clone(), + Some(comment) => info_span!("phase {}: {}", comment), + }; + + async { + let query = query; + let res = client.simple_query(&query).await; + debug!( + "{} {}", + if res.is_ok() { + "successfully executed" + } else { + "failed to execute" + }, + query + ); + res + } + .instrument(inspan) + }) + .collect::>(); + + drop(ctx); + + for it in join_all(active_queries).await { + drop(it?); + } + + debug!("Completed phase {:?}", &apply_spec_phase); + + Ok(()) + } + .instrument(span2) + .await +} + +/// Create a stream of operations to be executed for that phase of applying +/// changes. +/// +/// In the future we may generate a single stream of changes and then +/// sort/merge/batch execution, but for now this is a nice way to improve +/// batching behaviour of the commands. 
+async fn get_operations<'a>( + spec: &'a ComputeSpec, + ctx: &'a RwLock, + jwks_roles: &'a HashSet, + apply_spec_phase: &'a ApplySpecPhase, +) -> Result + 'a + Send>> { + match apply_spec_phase { + ApplySpecPhase::CreateSuperUser => { + let query = construct_superuser_query(spec); + + Ok(Box::new(once(Operation { + query, + comment: None, + }))) + } + ApplySpecPhase::DropInvalidDatabases => { + let mut ctx = ctx.write().await; + let databases = &mut ctx.dbs; + + let keys: Vec<_> = databases + .iter() + .filter(|(_, db)| db.invalid) + .map(|(dbname, _)| dbname.clone()) + .collect(); + + // After recent commit in Postgres, interrupted DROP DATABASE + // leaves the database in the invalid state. According to the + // commit message, the only option for user is to drop it again. + // See: + // https://github.com/postgres/postgres/commit/a4b4cc1d60f7e8ccfcc8ff8cb80c28ee411ad9a9 + // + // Postgres Neon extension is done the way, that db is de-registered + // in the control plane metadata only after it is dropped. So there is + // a chance that it still thinks that the db should exist. This means + // that it will be re-created by the `CreateDatabases` phase. This + // is fine, as user can just drop the table again (in vanilla + // Postgres they would need to do the same). 
+ let operations = keys + .into_iter() + .filter_map(move |dbname| ctx.dbs.remove(&dbname)) + .map(|db| Operation { + query: format!("DROP DATABASE IF EXISTS {}", db.name.pg_quote()), + comment: Some(format!("Dropping invalid database {}", db.name)), + }); + + Ok(Box::new(operations)) + } + ApplySpecPhase::RenameRoles => { + let mut ctx = ctx.write().await; + + let operations = spec + .delta_operations + .iter() + .flatten() + .filter(|op| op.action == "rename_role") + .filter_map(move |op| { + let roles = &mut ctx.roles; + + if roles.contains_key(op.name.as_str()) { + None + } else { + let new_name = op.new_name.as_ref().unwrap(); + let mut role = roles.remove(op.name.as_str()).unwrap(); + + role.name = new_name.clone(); + role.encrypted_password = None; + roles.insert(role.name.clone(), role); + + Some(Operation { + query: format!( + "ALTER ROLE {} RENAME TO {}", + op.name.pg_quote(), + new_name.pg_quote() + ), + comment: Some(format!("renaming role '{}' to '{}'", op.name, new_name)), + }) + } + }); + + Ok(Box::new(operations)) + } + ApplySpecPhase::CreateAndAlterRoles => { + let mut ctx = ctx.write().await; + + let operations = spec.cluster.roles + .iter() + .filter_map(move |role| { + let roles = &mut ctx.roles; + let db_role = roles.get(&role.name); + + match db_role { + Some(db_role) => { + if db_role.encrypted_password != role.encrypted_password { + // This can be run on /every/ role! Not just ones created through the console. + // This means that if you add some funny ALTER here that adds a permission, + // this will get run even on user-created roles! This will result in different + // behavior before and after a spec gets reapplied. The below ALTER as it stands + // now only grants LOGIN and changes the password. Please do not allow this branch + // to do anything silly. 
+ Some(Operation { + query: format!( + "ALTER ROLE {} {}", + role.name.pg_quote(), + role.to_pg_options(), + ), + comment: None, + }) + } else { + None + } + } + None => { + let query = if !jwks_roles.contains(role.name.as_str()) { + format!( + "CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser {}", + role.name.pg_quote(), + role.to_pg_options(), + ) + } else { + format!( + "CREATE ROLE {} {}", + role.name.pg_quote(), + role.to_pg_options(), + ) + }; + Some(Operation { + query, + comment: Some(format!("creating role {}", role.name)), + }) + } + } + }); + + Ok(Box::new(operations)) + } + ApplySpecPhase::RenameAndDeleteDatabases => { + let mut ctx = ctx.write().await; + + let operations = spec + .delta_operations + .iter() + .flatten() + .filter_map(move |op| { + let databases = &mut ctx.dbs; + match op.action.as_str() { + // We do not check whether the DB exists or not, + // Postgres will take care of it for us + "delete_db" => { + // In Postgres we can't drop a database if it is a template. + // So we need to unset the template flag first, but it could + // be a retry, so we could've already dropped the database. + // Check that database exists first to make it idempotent. + let unset_template_query: String = format!( + include_str!("sql/unset_template_for_drop_dbs.sql"), + datname_str = escape_literal(&op.name), + datname = &op.name.pg_quote() + ); + + // Use FORCE to drop database even if there are active connections. + // We run this from `cloud_admin`, so it should have enough privileges. + // NB: there could be other db states, which prevent us from dropping + // the database. For example, if db is used by any active subscription + // or replication slot. + // TODO: deal with it once we allow logical replication. Proper fix should + // involve returning an error code to the control plane, so it could + // figure out that this is a non-retryable error, return it to the user + // and fail operation permanently. 
+ let drop_db_query: String = format!( + "DROP DATABASE IF EXISTS {} WITH (FORCE)", + &op.name.pg_quote() + ); + + databases.remove(&op.name); + + Some(vec![ + Operation { + query: unset_template_query, + comment: Some(format!( + "optionally clearing template flags for DB {}", + op.name, + )), + }, + Operation { + query: drop_db_query, + comment: Some(format!("deleting database {}", op.name,)), + }, + ]) + } + "rename_db" => { + if let Some(mut db) = databases.remove(&op.name) { + // update state of known databases + let new_name = op.new_name.as_ref().unwrap(); + db.name = new_name.clone(); + databases.insert(db.name.clone(), db); + + Some(vec![Operation { + query: format!( + "ALTER DATABASE {} RENAME TO {}", + op.name.pg_quote(), + new_name.pg_quote(), + ), + comment: Some(format!( + "renaming database '{}' to '{}'", + op.name, new_name + )), + }]) + } else { + None + } + } + _ => None, + } + }) + .flatten(); + + Ok(Box::new(operations)) + } + ApplySpecPhase::CreateAndAlterDatabases => { + let mut ctx = ctx.write().await; + + let operations = spec + .cluster + .databases + .iter() + .filter_map(move |db| { + let databases = &mut ctx.dbs; + if let Some(edb) = databases.get_mut(&db.name) { + let change_owner = if edb.owner.starts_with('"') { + db.owner.pg_quote() != edb.owner + } else { + db.owner != edb.owner + }; + + edb.owner = db.owner.clone(); + + if change_owner { + Some(vec![Operation { + query: format!( + "ALTER DATABASE {} OWNER TO {}", + db.name.pg_quote(), + db.owner.pg_quote() + ), + comment: Some(format!( + "changing database owner of database {} to {}", + db.name, db.owner + )), + }]) + } else { + None + } + } else { + databases.insert(db.name.clone(), db.clone()); + + Some(vec![ + Operation { + query: format!( + "CREATE DATABASE {} {}", + db.name.pg_quote(), + db.to_pg_options(), + ), + comment: None, + }, + Operation { + query: format!( + "GRANT ALL PRIVILEGES ON DATABASE {} TO neon_superuser", + db.name.pg_quote() + ), + comment: None, + }, + ]) + 
} + }) + .flatten(); + + Ok(Box::new(operations)) + } + ApplySpecPhase::RunInEachDatabase { db, subphase } => { + match subphase { + PerDatabasePhase::DeleteDBRoleReferences => { + let ctx = ctx.read().await; + + let operations = + spec.delta_operations + .iter() + .flatten() + .filter(|op| op.action == "delete_role") + .filter_map(move |op| { + if db.is_owned_by(&op.name) { + return None; + } + if !ctx.roles.contains_key(&op.name) { + return None; + } + let quoted = op.name.pg_quote(); + let new_owner = match &db { + DB::SystemDB => PgIdent::from("cloud_admin").pg_quote(), + DB::UserDB(db) => db.owner.pg_quote(), + }; + + Some(vec![ + // This will reassign all dependent objects to the db owner + Operation { + query: format!( + "REASSIGN OWNED BY {} TO {}", + quoted, new_owner, + ), + comment: None, + }, + // This now will only drop privileges of the role + Operation { + query: format!("DROP OWNED BY {}", quoted), + comment: None, + }, + ]) + }) + .flatten(); + + Ok(Box::new(operations)) + } + PerDatabasePhase::ChangeSchemaPerms => { + let ctx = ctx.read().await; + let databases = &ctx.dbs; + + let db = match &db { + // ignore schema permissions on the system database + DB::SystemDB => return Ok(Box::new(empty())), + DB::UserDB(db) => db, + }; + + if databases.get(&db.name).is_none() { + bail!("database {} doesn't exist in PostgreSQL", db.name); + } + + let edb = databases.get(&db.name).unwrap(); + + if edb.restrict_conn || edb.invalid { + return Ok(Box::new(empty())); + } + + let operations = vec![ + Operation { + query: format!( + include_str!("sql/set_public_schema_owner.sql"), + db_owner = db.owner.pg_quote() + ), + comment: None, + }, + Operation { + query: String::from(include_str!("sql/default_grants.sql")), + comment: None, + }, + ] + .into_iter(); + + Ok(Box::new(operations)) + } + PerDatabasePhase::HandleAnonExtension => { + // Only install Anon into user databases + let db = match &db { + DB::SystemDB => return Ok(Box::new(empty())), + DB::UserDB(db) => 
db, + }; + // Never install Anon when it's not enabled as feature + if !spec.features.contains(&ComputeFeature::AnonExtension) { + return Ok(Box::new(empty())); + } + + // Only install Anon when it's added in preload libraries + let opt_libs = spec.cluster.settings.find("shared_preload_libraries"); + + let libs = match opt_libs { + Some(libs) => libs, + None => return Ok(Box::new(empty())), + }; + + if !libs.contains("anon") { + return Ok(Box::new(empty())); + } + + let db_owner = db.owner.pg_quote(); + + let operations = vec![ + // Create anon extension if this compute needs it + // Users cannot create it themselves, because superuser is required. + Operation { + query: String::from("CREATE EXTENSION IF NOT EXISTS anon CASCADE"), + comment: Some(String::from("creating anon extension")), + }, + // Initialize anon extension + // This also requires superuser privileges, so users cannot do it themselves. + Operation { + query: String::from("SELECT anon.init()"), + comment: Some(String::from("initializing anon extension data")), + }, + Operation { + query: format!("GRANT ALL ON SCHEMA anon TO {}", db_owner), + comment: Some(String::from( + "granting anon extension schema permissions", + )), + }, + Operation { + query: format!( + "GRANT ALL ON ALL FUNCTIONS IN SCHEMA anon TO {}", + db_owner + ), + comment: Some(String::from( + "granting anon extension schema functions permissions", + )), + }, + // We need this, because some functions are defined as SECURITY DEFINER. + // In Postgres SECURITY DEFINER functions are executed with the privileges + // of the owner. + // In anon extension this it is needed to access some GUCs, which are only accessible to + // superuser. But we've patched postgres to allow db_owner to access them as well. + // So we need to change owner of these functions to db_owner. 
+ Operation { + query: format!( + include_str!("sql/anon_ext_fn_reassign.sql"), + db_owner = db_owner, + ), + comment: Some(String::from( + "change anon extension functions owner to database_owner", + )), + }, + Operation { + query: format!( + "GRANT ALL ON ALL TABLES IN SCHEMA anon TO {}", + db_owner, + ), + comment: Some(String::from( + "granting anon extension tables permissions", + )), + }, + Operation { + query: format!( + "GRANT ALL ON ALL SEQUENCES IN SCHEMA anon TO {}", + db_owner, + ), + comment: Some(String::from( + "granting anon extension sequences permissions", + )), + }, + ] + .into_iter(); + + Ok(Box::new(operations)) + } + } + } + // Interestingly, we only install p_s_s in the main database, even when + // it's preloaded. + ApplySpecPhase::HandleOtherExtensions => { + if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") { + if libs.contains("pg_stat_statements") { + return Ok(Box::new(once(Operation { + query: String::from("CREATE EXTENSION IF NOT EXISTS pg_stat_statements"), + comment: Some(String::from("create system extensions")), + }))); + } + } + Ok(Box::new(empty())) + } + ApplySpecPhase::HandleNeonExtension => { + let operations = vec![ + Operation { + query: String::from("CREATE SCHEMA IF NOT EXISTS neon"), + comment: Some(String::from("init: add schema for extension")), + }, + Operation { + query: String::from("CREATE EXTENSION IF NOT EXISTS neon WITH SCHEMA neon"), + comment: Some(String::from( + "init: install the extension if not already installed", + )), + }, + Operation { + query: String::from( + "UPDATE pg_extension SET extrelocatable = true WHERE extname = 'neon'", + ), + comment: Some(String::from("compat/fix: make neon relocatable")), + }, + Operation { + query: String::from("ALTER EXTENSION neon SET SCHEMA neon"), + comment: Some(String::from("compat/fix: alter neon extension schema")), + }, + Operation { + query: String::from("ALTER EXTENSION neon UPDATE"), + comment: Some(String::from("compat/update: update 
neon extension version")), + }, + ] + .into_iter(); + + Ok(Box::new(operations)) + } + ApplySpecPhase::CreateAvailabilityCheck => Ok(Box::new(once(Operation { + query: String::from(include_str!("sql/add_availabilitycheck_tables.sql")), + comment: None, + }))), + ApplySpecPhase::DropRoles => { + let operations = spec + .delta_operations + .iter() + .flatten() + .filter(|op| op.action == "delete_role") + .map(|op| Operation { + query: format!("DROP ROLE IF EXISTS {}", op.name.pg_quote()), + comment: None, + }); + + Ok(Box::new(operations)) + } + } +} diff --git a/compute_tools/src/sql/add_availabilitycheck_tables.sql b/compute_tools/src/sql/add_availabilitycheck_tables.sql new file mode 100644 index 0000000000..7c60690c78 --- /dev/null +++ b/compute_tools/src/sql/add_availabilitycheck_tables.sql @@ -0,0 +1,18 @@ +DO $$ +BEGIN + IF NOT EXISTS( + SELECT 1 + FROM pg_catalog.pg_tables + WHERE tablename = 'health_check' + ) + THEN + CREATE TABLE health_check ( + id serial primary key, + updated_at timestamptz default now() + ); + INSERT INTO health_check VALUES (1, now()) + ON CONFLICT (id) DO UPDATE + SET updated_at = now(); + END IF; +END +$$ \ No newline at end of file diff --git a/compute_tools/src/sql/anon_ext_fn_reassign.sql b/compute_tools/src/sql/anon_ext_fn_reassign.sql new file mode 100644 index 0000000000..3d7b15c590 --- /dev/null +++ b/compute_tools/src/sql/anon_ext_fn_reassign.sql @@ -0,0 +1,12 @@ +DO $$ +DECLARE + query varchar; +BEGIN + FOR query IN SELECT 'ALTER FUNCTION '||nsp.nspname||'.'||p.proname||'('||pg_get_function_identity_arguments(p.oid)||') OWNER TO {db_owner};' + FROM pg_proc p + JOIN pg_namespace nsp ON p.pronamespace = nsp.oid + WHERE nsp.nspname = 'anon' LOOP + EXECUTE query; + END LOOP; +END +$$; diff --git a/compute_tools/src/sql/default_grants.sql b/compute_tools/src/sql/default_grants.sql new file mode 100644 index 0000000000..58ebb0690b --- /dev/null +++ b/compute_tools/src/sql/default_grants.sql @@ -0,0 +1,30 @@ +DO +$$ + BEGIN + IF 
EXISTS( + SELECT nspname + FROM pg_catalog.pg_namespace + WHERE nspname = 'public' + ) AND + current_setting('server_version_num')::int / 10000 >= 15 + THEN + IF EXISTS( + SELECT rolname + FROM pg_catalog.pg_roles + WHERE rolname = 'web_access' + ) + THEN + GRANT CREATE ON SCHEMA public TO web_access; + END IF; + END IF; + IF EXISTS( + SELECT nspname + FROM pg_catalog.pg_namespace + WHERE nspname = 'public' + ) + THEN + ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser WITH GRANT OPTION; + ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO neon_superuser WITH GRANT OPTION; + END IF; + END +$$; \ No newline at end of file diff --git a/compute_tools/src/sql/set_public_schema_owner.sql b/compute_tools/src/sql/set_public_schema_owner.sql new file mode 100644 index 0000000000..fd061a713e --- /dev/null +++ b/compute_tools/src/sql/set_public_schema_owner.sql @@ -0,0 +1,23 @@ +DO +$$ + DECLARE + schema_owner TEXT; + BEGIN + IF EXISTS( + SELECT nspname + FROM pg_catalog.pg_namespace + WHERE nspname = 'public' + ) + THEN + SELECT nspowner::regrole::text + FROM pg_catalog.pg_namespace + WHERE nspname = 'public' + INTO schema_owner; + + IF schema_owner = 'cloud_admin' OR schema_owner = 'zenith_admin' + THEN + ALTER SCHEMA public OWNER TO {db_owner}; + END IF; + END IF; + END +$$; \ No newline at end of file diff --git a/compute_tools/src/sql/unset_template_for_drop_dbs.sql b/compute_tools/src/sql/unset_template_for_drop_dbs.sql new file mode 100644 index 0000000000..6c4343a589 --- /dev/null +++ b/compute_tools/src/sql/unset_template_for_drop_dbs.sql @@ -0,0 +1,12 @@ +DO $$ + BEGIN + IF EXISTS( + SELECT 1 + FROM pg_catalog.pg_database + WHERE datname = {datname_str} + ) + THEN + ALTER DATABASE {datname} is_template false; + END IF; + END +$$; \ No newline at end of file diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index c4063bbd1a..1ea443b026 100644 --- a/control_plane/src/bin/neon_local.rs 
+++ b/control_plane/src/bin/neon_local.rs @@ -1153,6 +1153,7 @@ async fn handle_timeline(cmd: &TimelineCmd, env: &mut local_env::LocalEnv) -> Re timeline_info.timeline_id ); } + // TODO: rename to import-basebackup-plus-wal TimelineCmd::Import(args) => { let tenant_id = get_tenant_id(args.tenant_id, env)?; let timeline_id = args.timeline_id; diff --git a/docs/sourcetree.md b/docs/sourcetree.md index 3732bfdab2..1f7e913c07 100644 --- a/docs/sourcetree.md +++ b/docs/sourcetree.md @@ -113,21 +113,21 @@ so manual installation of dependencies is not recommended. A single virtual environment with all dependencies is described in the single `Pipfile`. ### Prerequisites -- Install Python 3.9 (the minimal supported version) or greater. +- Install Python 3.11 (the minimal supported version) or greater. - Our setup with poetry should work with newer python versions too. So feel free to open an issue with a `c/test-runner` label if something doesn't work as expected. - - If you have some trouble with other version you can resolve it by installing Python 3.9 separately, via [pyenv](https://github.com/pyenv/pyenv) or via system package manager e.g.: + - If you have some trouble with other version you can resolve it by installing Python 3.11 separately, via [pyenv](https://github.com/pyenv/pyenv) or via system package manager e.g.: ```bash # In Ubuntu sudo add-apt-repository ppa:deadsnakes/ppa sudo apt update - sudo apt install python3.9 + sudo apt install python3.11 ``` - Install `poetry` - Exact version of `poetry` is not important, see installation instructions available at poetry's [website](https://python-poetry.org/docs/#installation). - Install dependencies via `./scripts/pysync`. - Note that CI uses specific Python version (look for `PYTHON_VERSION` [here](https://github.com/neondatabase/docker-images/blob/main/rust/Dockerfile)) so if you have different version some linting tools can yield different result locally vs in the CI. 
- - You can explicitly specify which Python to use by running `poetry env use /path/to/python`, e.g. `poetry env use python3.9`. + - You can explicitly specify which Python to use by running `poetry env use /path/to/python`, e.g. `poetry env use python3.11`. This may also disable the `The currently activated Python version X.Y.Z is not supported by the project` warning. Run `poetry shell` to activate the virtual environment. diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml index 8710904cec..79da05da6c 100644 --- a/libs/pageserver_api/Cargo.toml +++ b/libs/pageserver_api/Cargo.toml @@ -33,6 +33,7 @@ remote_storage.workspace = true postgres_backend.workspace = true nix = {workspace = true, optional = true} reqwest.workspace = true +rand.workspace = true [dev-dependencies] bincode.workspace = true diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index f48c1febb5..7666728427 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -97,6 +97,15 @@ pub struct ConfigToml { pub control_plane_api: Option, pub control_plane_api_token: Option, pub control_plane_emergency_mode: bool, + /// Unstable feature: subject to change or removal without notice. + /// See . + pub import_pgdata_upcall_api: Option, + /// Unstable feature: subject to change or removal without notice. + /// See . + pub import_pgdata_upcall_api_token: Option, + /// Unstable feature: subject to change or removal without notice. + /// See . 
+ pub import_pgdata_aws_endpoint_url: Option, pub heatmap_upload_concurrency: usize, pub secondary_download_concurrency: usize, pub virtual_file_io_engine: Option, @@ -109,6 +118,8 @@ pub struct ConfigToml { pub virtual_file_io_mode: Option, #[serde(skip_serializing_if = "Option::is_none")] pub no_sync: Option, + #[serde(with = "humantime_serde")] + pub server_side_batch_timeout: Option, } #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] @@ -317,6 +328,8 @@ pub mod defaults { pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0; pub const DEFAULT_IO_BUFFER_ALIGNMENT: usize = 512; + + pub const DEFAULT_SERVER_SIDE_BATCH_TIMEOUT: Option<&str> = None; } impl Default for ConfigToml { @@ -382,6 +395,10 @@ impl Default for ConfigToml { control_plane_api_token: (None), control_plane_emergency_mode: (false), + import_pgdata_upcall_api: (None), + import_pgdata_upcall_api_token: (None), + import_pgdata_aws_endpoint_url: (None), + heatmap_upload_concurrency: (DEFAULT_HEATMAP_UPLOAD_CONCURRENCY), secondary_download_concurrency: (DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY), @@ -397,6 +414,8 @@ impl Default for ConfigToml { ephemeral_bytes_per_memory_kb: (DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB), l0_flush: None, virtual_file_io_mode: None, + server_side_batch_timeout: DEFAULT_SERVER_SIDE_BATCH_TIMEOUT + .map(|duration| humantime::parse_duration(duration).unwrap()), tenant_config: TenantConfigToml::default(), no_sync: None, } diff --git a/libs/pageserver_api/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs index 401887d362..c55b9e9484 100644 --- a/libs/pageserver_api/src/keyspace.rs +++ b/libs/pageserver_api/src/keyspace.rs @@ -48,7 +48,7 @@ pub struct ShardedRange<'a> { // Calculate the size of a range within the blocks of the same relation, or spanning only the // top page in the previous relation's space. 
-fn contiguous_range_len(range: &Range) -> u32 { +pub fn contiguous_range_len(range: &Range) -> u32 { debug_assert!(is_contiguous_range(range)); if range.start.field6 == 0xffffffff { range.end.field6 + 1 @@ -67,7 +67,7 @@ fn contiguous_range_len(range: &Range) -> u32 { /// This matters, because: /// - Within such ranges, keys are used contiguously. Outside such ranges it is sparse. /// - Within such ranges, we may calculate distances using simple subtraction of field6. -fn is_contiguous_range(range: &Range) -> bool { +pub fn is_contiguous_range(range: &Range) -> bool { range.start.field1 == range.end.field1 && range.start.field2 == range.end.field2 && range.start.field3 == range.end.field3 diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 0dfa1ba817..1b86bfd91a 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -2,6 +2,8 @@ pub mod detach_ancestor; pub mod partitioning; pub mod utilization; +#[cfg(feature = "testing")] +use camino::Utf8PathBuf; pub use utilization::PageserverUtilization; use std::{ @@ -227,6 +229,9 @@ pub enum TimelineCreateRequestMode { // we continue to accept it by having it here. pg_version: Option, }, + ImportPgdata { + import_pgdata: TimelineCreateRequestModeImportPgdata, + }, // NB: Bootstrap is all-optional, and thus the serde(untagged) will cause serde to stop at Bootstrap. // (serde picks the first matching enum variant, in declaration order). 
Bootstrap { @@ -236,6 +241,42 @@ pub enum TimelineCreateRequestMode { }, } +#[derive(Serialize, Deserialize, Clone)] +pub struct TimelineCreateRequestModeImportPgdata { + pub location: ImportPgdataLocation, + pub idempotency_key: ImportPgdataIdempotencyKey, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +pub enum ImportPgdataLocation { + #[cfg(feature = "testing")] + LocalFs { path: Utf8PathBuf }, + AwsS3 { + region: String, + bucket: String, + /// A better name for this would be `prefix`; changing requires coordination with cplane. + /// See . + key: String, + }, +} + +#[derive(Serialize, Deserialize, Clone)] +#[serde(transparent)] +pub struct ImportPgdataIdempotencyKey(pub String); + +impl ImportPgdataIdempotencyKey { + pub fn random() -> Self { + use rand::{distributions::Alphanumeric, Rng}; + Self( + rand::thread_rng() + .sample_iter(&Alphanumeric) + .take(20) + .map(char::from) + .collect(), + ) + } +} + #[derive(Serialize, Deserialize, Clone)] pub struct LsnLeaseRequest { pub lsn: Lsn, diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index 7419798a60..8c024375c1 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -716,6 +716,9 @@ impl PostgresBackend { Ok(()) } + // Proto looks like this: + // FeMessage::Query("pagestream_v2{FeMessage::CopyData(PagestreamFeMessage::GetPage(..))}") + async fn process_message( &mut self, handler: &mut impl Handler, @@ -831,7 +834,7 @@ impl PostgresBackend { use CopyStreamHandlerEnd::*; let expected_end = match &end { - ServerInitiated(_) | CopyDone | CopyFail | Terminate | EOF => true, + ServerInitiated(_) | CopyDone | CopyFail | Terminate | EOF | Cancelled => true, CopyStreamHandlerEnd::Disconnected(ConnectionError::Io(io_error)) if is_expected_io_error(io_error) => { @@ -871,6 +874,9 @@ impl PostgresBackend { // message from server' when it receives ErrorResponse (anything but // CopyData/CopyDone) back.
CopyFail => Some((end.to_string(), SQLSTATE_SUCCESSFUL_COMPLETION)), + + // When cancelled, send no response: we must not risk blocking on sending that response + Cancelled => None, _ => None, }; if let Some((err, errcode)) = err_to_send_and_errcode { @@ -1048,6 +1054,8 @@ pub enum CopyStreamHandlerEnd { /// The connection was lost #[error("connection error: {0}")] Disconnected(#[from] ConnectionError), + #[error("Shutdown")] + Cancelled, /// Some other error #[error(transparent)] Other(#[from] anyhow::Error), diff --git a/libs/postgres_initdb/Cargo.toml b/libs/postgres_initdb/Cargo.toml new file mode 100644 index 0000000000..1605279bce --- /dev/null +++ b/libs/postgres_initdb/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "postgres_initdb" +version = "0.1.0" +edition.workspace = true +license.workspace = true + +[dependencies] +anyhow.workspace = true +tokio.workspace = true +camino.workspace = true +thiserror.workspace = true +workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/libs/postgres_initdb/src/lib.rs b/libs/postgres_initdb/src/lib.rs new file mode 100644 index 0000000000..2f072354fb --- /dev/null +++ b/libs/postgres_initdb/src/lib.rs @@ -0,0 +1,103 @@ +//! The canonical way we run `initdb` in Neon. +//! +//! initdb has implicit defaults that are dependent on the environment, e.g., locales & collations. +//! +//! This module's job is to eliminate the environment-dependence as much as possible. 
+ +use std::fmt; + +use camino::Utf8Path; + +pub struct RunInitdbArgs<'a> { + pub superuser: &'a str, + pub locale: &'a str, + pub initdb_bin: &'a Utf8Path, + pub pg_version: u32, + pub library_search_path: &'a Utf8Path, + pub pgdata: &'a Utf8Path, +} + +#[derive(thiserror::Error, Debug)] +pub enum Error { + Spawn(std::io::Error), + Failed { + status: std::process::ExitStatus, + stderr: Vec, + }, + WaitOutput(std::io::Error), + Other(anyhow::Error), +} + +impl fmt::Display for Error { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Error::Spawn(e) => write!(f, "Error spawning command: {:?}", e), + Error::Failed { status, stderr } => write!( + f, + "Command failed with status {:?}: {}", + status, + String::from_utf8_lossy(stderr) + ), + Error::WaitOutput(e) => write!(f, "Error waiting for command output: {:?}", e), + Error::Other(e) => write!(f, "Error: {:?}", e), + } + } +} + +pub async fn do_run_initdb(args: RunInitdbArgs<'_>) -> Result<(), Error> { + let RunInitdbArgs { + superuser, + locale, + initdb_bin: initdb_bin_path, + pg_version, + library_search_path, + pgdata, + } = args; + let mut initdb_command = tokio::process::Command::new(initdb_bin_path); + initdb_command + .args(["--pgdata", pgdata.as_ref()]) + .args(["--username", superuser]) + .args(["--encoding", "utf8"]) + .args(["--locale", locale]) + .arg("--no-instructions") + .arg("--no-sync") + .env_clear() + .env("LD_LIBRARY_PATH", library_search_path) + .env("DYLD_LIBRARY_PATH", library_search_path) + .stdin(std::process::Stdio::null()) + // stdout invocation produces the same output every time, we don't need it + .stdout(std::process::Stdio::null()) + // we would be interested in the stderr output, if there was any + .stderr(std::process::Stdio::piped()); + + // Up to version 14, only the libc provider was available. + if pg_version > 14 { + // Version 17 brought with it a builtin locale provider which only provides + // C and C.UTF-8.
While being safer for collation purposes since it is + // guaranteed to be consistent throughout a major release, it is also more + // performant. + let locale_provider = if pg_version >= 17 { "builtin" } else { "libc" }; + + initdb_command.args(["--locale-provider", locale_provider]); + } + + let initdb_proc = initdb_command.spawn().map_err(Error::Spawn)?; + + // Ideally we'd select here with the cancellation token, but the problem is that + // we can't safely terminate initdb: it launches processes of its own, and killing + // initdb doesn't kill them. After we return from this function, we want the target + // directory to be able to be cleaned up. + // See https://github.com/neondatabase/neon/issues/6385 + let initdb_output = initdb_proc + .wait_with_output() + .await + .map_err(Error::WaitOutput)?; + if !initdb_output.status.success() { + return Err(Error::Failed { + status: initdb_output.status, + stderr: initdb_output.stderr, + }); + } + + Ok(()) +} diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index 9ffaaba584..6c40968496 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -185,7 +185,7 @@ pub struct CancelKeyData { impl fmt::Display for CancelKeyData { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let hi = (self.backend_pid as u64) << 32; - let lo = self.cancel_key as u64; + let lo = (self.cancel_key as u64) & 0xffffffff; let id = hi | lo; // This format is more compact and might work better for logs. 
@@ -1046,4 +1046,13 @@ mod tests { let data = [0, 0, 0, 7, 0, 0, 0, 0]; FeStartupPacket::parse(&mut BytesMut::from_iter(data)).unwrap_err(); } + + #[test] + fn cancel_key_data() { + let key = CancelKeyData { + backend_pid: -1817212860, + cancel_key: -1183897012, + }; + assert_eq!(format!("{key}"), "CancelKeyData(93af8844b96f2a4c)"); + } } diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index f98d16789c..ae0a94295c 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -24,6 +24,7 @@ use azure_storage_blobs::{blob::operations::GetBlobBuilder, prelude::ContainerCl use bytes::Bytes; use futures::future::Either; use futures::stream::Stream; +use futures::FutureExt; use futures_util::StreamExt; use futures_util::TryStreamExt; use http_types::{StatusCode, Url}; @@ -31,6 +32,7 @@ use scopeguard::ScopeGuard; use tokio_util::sync::CancellationToken; use tracing::debug; use utils::backoff; +use utils::backoff::exponential_backoff_duration_seconds; use crate::metrics::{start_measuring_requests, AttemptOutcome, RequestKind}; use crate::{ @@ -97,10 +99,7 @@ impl AzureBlobStorage { pub fn relative_path_to_name(&self, path: &RemotePath) -> String { assert_eq!(std::path::MAIN_SEPARATOR, REMOTE_STORAGE_PREFIX_SEPARATOR); - let path_string = path - .get_path() - .as_str() - .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR); + let path_string = path.get_path().as_str(); match &self.prefix_in_container { Some(prefix) => { if prefix.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) { @@ -277,19 +276,14 @@ impl RemoteStorage for AzureBlobStorage { cancel: &CancellationToken, ) -> impl Stream> { // get the passed prefix or if it is not set use prefix_in_bucket value - let list_prefix = prefix - .map(|p| self.relative_path_to_name(p)) - .or_else(|| self.prefix_in_container.clone()) - .map(|mut p| { - // required to end with a separator - // otherwise request will return only the entry of a prefix - if 
matches!(mode, ListingMode::WithDelimiter) - && !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) - { - p.push(REMOTE_STORAGE_PREFIX_SEPARATOR); + let list_prefix = prefix.map(|p| self.relative_path_to_name(p)).or_else(|| { + self.prefix_in_container.clone().map(|mut s| { + if !s.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) { + s.push(REMOTE_STORAGE_PREFIX_SEPARATOR); } - p - }); + s + }) + }); async_stream::stream! { let _permit = self.permit(RequestKind::List, cancel).await?; @@ -310,40 +304,59 @@ impl RemoteStorage for AzureBlobStorage { let mut next_marker = None; + let mut timeout_try_cnt = 1; + 'outer: loop { let mut builder = builder.clone(); if let Some(marker) = next_marker.clone() { builder = builder.marker(marker); } - let response = builder.into_stream(); - let response = response.into_stream().map_err(to_download_error); - let response = tokio_stream::StreamExt::timeout(response, self.timeout); - let response = response.map(|res| match res { - Ok(res) => res, - Err(_elapsed) => Err(DownloadError::Timeout), + // Azure Blob Rust SDK does not expose the list blob API directly. Users have to use + // their pageable iterator wrapper that returns all keys as a stream. We want to have + // full control of paging, and therefore we only take the first item from the stream. + let mut response_stream = builder.into_stream(); + let response = response_stream.next(); + // Timeout mechanism: Azure client will sometimes stuck on a request, but retrying that request + // would immediately succeed. Therefore, we use exponential backoff timeout to retry the request. + // (Usually, exponential backoff is used to determine the sleep time between two retries.) We + // start with 10.0 second timeout, and double the timeout for each failure, up to 5 failures. + // timeout = min(5 * (1.0+1.0)^n, self.timeout). 
+ let this_timeout = (5.0 * exponential_backoff_duration_seconds(timeout_try_cnt, 1.0, self.timeout.as_secs_f64())).min(self.timeout.as_secs_f64()); + let response = tokio::time::timeout(Duration::from_secs_f64(this_timeout), response); + let response = response.map(|res| { + match res { + Ok(Some(Ok(res))) => Ok(Some(res)), + Ok(Some(Err(e))) => Err(to_download_error(e)), + Ok(None) => Ok(None), + Err(_elasped) => Err(DownloadError::Timeout), + } }); - - let mut response = std::pin::pin!(response); - let mut max_keys = max_keys.map(|mk| mk.get()); let next_item = tokio::select! { - op = response.next() => Ok(op), + op = response => op, _ = cancel.cancelled() => Err(DownloadError::Cancelled), - }?; + }; + + if let Err(DownloadError::Timeout) = &next_item { + timeout_try_cnt += 1; + if timeout_try_cnt <= 5 { + continue; + } + } + + let next_item = next_item?; + + if timeout_try_cnt >= 2 { + tracing::warn!("Azure Blob Storage list timed out and succeeded after {} tries", timeout_try_cnt); + } + timeout_try_cnt = 1; + let Some(entry) = next_item else { // The list is complete, so yield it. break; }; let mut res = Listing::default(); - let entry = match entry { - Ok(entry) => entry, - Err(e) => { - // The error is potentially retryable, so we must rewind the loop after yielding. - yield Err(e); - continue; - } - }; next_marker = entry.continuation(); let prefix_iter = entry .blobs @@ -359,7 +372,7 @@ impl RemoteStorage for AzureBlobStorage { last_modified: k.properties.last_modified.into(), size: k.properties.content_length, } - ); + ); for key in blob_iter { res.keys.push(key); diff --git a/libs/remote_storage/src/config.rs b/libs/remote_storage/src/config.rs index d0e92411da..e99ae4f747 100644 --- a/libs/remote_storage/src/config.rs +++ b/libs/remote_storage/src/config.rs @@ -26,6 +26,16 @@ pub struct RemoteStorageConfig { pub timeout: Duration, } +impl RemoteStorageKind { + pub fn bucket_name(&self) -> Option<&str> { + match self { + RemoteStorageKind::LocalFs { .. 
} => None, + RemoteStorageKind::AwsS3(config) => Some(&config.bucket_name), + RemoteStorageKind::AzureContainer(config) => Some(&config.container_name), + } + } +} + fn default_timeout() -> Duration { RemoteStorageConfig::DEFAULT_TIMEOUT } @@ -178,6 +188,14 @@ impl RemoteStorageConfig { pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result { Ok(utils::toml_edit_ext::deserialize_item(toml)?) } + + pub fn from_toml_str(input: &str) -> anyhow::Result { + let toml_document = toml_edit::DocumentMut::from_str(input)?; + if let Some(item) = toml_document.get("remote_storage") { + return Self::from_toml(item); + } + Self::from_toml(toml_document.as_item()) + } } #[cfg(test)] @@ -185,8 +203,7 @@ mod tests { use super::*; fn parse(input: &str) -> anyhow::Result { - let toml = input.parse::().unwrap(); - RemoteStorageConfig::from_toml(toml.as_item()) + RemoteStorageConfig::from_toml_str(input) } #[test] diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index 553153826e..ee2fc9d6e2 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -360,7 +360,12 @@ impl RemoteStorage for LocalFs { let mut objects = Vec::with_capacity(keys.len()); for key in keys { let path = key.with_base(&self.storage_root); - let metadata = file_metadata(&path).await?; + let metadata = file_metadata(&path).await; + if let Err(DownloadError::NotFound) = metadata { + // Race: if the file is deleted between listing and metadata check, ignore it. 
+ continue; + } + let metadata = metadata?; if metadata.is_dir() { continue; } diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 545317f958..4aad0aee2c 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -29,6 +29,7 @@ jsonwebtoken.workspace = true nix.workspace = true once_cell.workspace = true pin-project-lite.workspace = true +pprof.workspace = true regex.workspace = true routerify.workspace = true serde.workspace = true diff --git a/libs/utils/scripts/restore_from_wal.sh b/libs/utils/scripts/restore_from_wal.sh index 93448369a0..a8615c2337 100755 --- a/libs/utils/scripts/restore_from_wal.sh +++ b/libs/utils/scripts/restore_from_wal.sh @@ -50,8 +50,8 @@ REDO_POS=0x$("$PG_BIN"/pg_controldata -D "$DATA_DIR" | grep -F "REDO location"| declare -i WAL_SIZE=$REDO_POS+114 "$PG_BIN"/pg_ctl -D "$DATA_DIR" -l "$DATA_DIR/logfile.log" start "$PG_BIN"/pg_ctl -D "$DATA_DIR" -l "$DATA_DIR/logfile.log" stop -m immediate -cp "$DATA_DIR"/pg_wal/000000010000000000000001 . +cp "$DATA_DIR"/pg_wal/000000010000000000000001 "$DATA_DIR" cp "$WAL_PATH"/* "$DATA_DIR"/pg_wal/ for partial in "$DATA_DIR"/pg_wal/*.partial ; do mv "$partial" "${partial%.partial}" ; done -dd if=000000010000000000000001 of="$DATA_DIR"/pg_wal/000000010000000000000001 bs=$WAL_SIZE count=1 conv=notrunc -rm -f 000000010000000000000001 +dd if="$DATA_DIR"/000000010000000000000001 of="$DATA_DIR"/pg_wal/000000010000000000000001 bs=$WAL_SIZE count=1 conv=notrunc +rm -f "$DATA_DIR"/000000010000000000000001 diff --git a/libs/utils/scripts/restore_from_wal_initdb.sh b/libs/utils/scripts/restore_from_wal_initdb.sh index c6277ebc60..e7b0432505 100755 --- a/libs/utils/scripts/restore_from_wal_initdb.sh +++ b/libs/utils/scripts/restore_from_wal_initdb.sh @@ -14,8 +14,8 @@ REDO_POS=0x$("$PG_BIN"/pg_controldata -D "$DATA_DIR" | grep -F "REDO location"| declare -i WAL_SIZE=$REDO_POS+114 "$PG_BIN"/pg_ctl -D "$DATA_DIR" -l "$DATA_DIR/logfile.log" start "$PG_BIN"/pg_ctl -D "$DATA_DIR" -l 
"$DATA_DIR/logfile.log" stop -m immediate -cp "$DATA_DIR"/pg_wal/000000010000000000000001 . +cp "$DATA_DIR"/pg_wal/000000010000000000000001 "$DATA_DIR" cp "$WAL_PATH"/* "$DATA_DIR"/pg_wal/ for partial in "$DATA_DIR"/pg_wal/*.partial ; do mv "$partial" "${partial%.partial}" ; done -dd if=000000010000000000000001 of="$DATA_DIR"/pg_wal/000000010000000000000001 bs=$WAL_SIZE count=1 conv=notrunc -rm -f 000000010000000000000001 +dd if="$DATA_DIR"/000000010000000000000001 of="$DATA_DIR"/pg_wal/000000010000000000000001 bs=$WAL_SIZE count=1 conv=notrunc +rm -f "$DATA_DIR"/000000010000000000000001 diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs index 8ee5abd434..6a85f0ddeb 100644 --- a/libs/utils/src/http/endpoint.rs +++ b/libs/utils/src/http/endpoint.rs @@ -1,7 +1,8 @@ use crate::auth::{AuthError, Claims, SwappableJwtAuth}; use crate::http::error::{api_error_handler, route_error_handler, ApiError}; -use anyhow::Context; -use hyper::header::{HeaderName, AUTHORIZATION}; +use crate::http::request::{get_query_param, parse_query_param}; +use anyhow::{anyhow, Context}; +use hyper::header::{HeaderName, AUTHORIZATION, CONTENT_DISPOSITION}; use hyper::http::HeaderValue; use hyper::Method; use hyper::{header::CONTENT_TYPE, Body, Request, Response}; @@ -12,11 +13,13 @@ use routerify::{Middleware, RequestInfo, Router, RouterBuilder}; use tracing::{debug, info, info_span, warn, Instrument}; use std::future::Future; +use std::io::Write as _; use std::str::FromStr; +use std::time::Duration; use bytes::{Bytes, BytesMut}; -use std::io::Write as _; -use tokio::sync::mpsc; +use pprof::protos::Message as _; +use tokio::sync::{mpsc, Mutex}; use tokio_stream::wrappers::ReceiverStream; static SERVE_METRICS_COUNT: Lazy = Lazy::new(|| { @@ -328,6 +331,82 @@ pub async fn prometheus_metrics_handler(_req: Request) -> Result) -> Result, ApiError> { + enum Format { + Pprof, + Svg, + } + + // Parameters. 
+ let format = match get_query_param(&req, "format")?.as_deref() { + None => Format::Pprof, + Some("pprof") => Format::Pprof, + Some("svg") => Format::Svg, + Some(format) => return Err(ApiError::BadRequest(anyhow!("invalid format {format}"))), + }; + let seconds = match parse_query_param(&req, "seconds")? { + None => 5, + Some(seconds @ 1..=30) => seconds, + Some(_) => return Err(ApiError::BadRequest(anyhow!("duration must be 1-30 secs"))), + }; + let frequency_hz = match parse_query_param(&req, "frequency")? { + None => 99, + Some(1001..) => return Err(ApiError::BadRequest(anyhow!("frequency must be <=1000 Hz"))), + Some(frequency) => frequency, + }; + + // Only allow one profiler at a time. + static PROFILE_LOCK: Lazy> = Lazy::new(|| Mutex::new(())); + let _lock = PROFILE_LOCK + .try_lock() + .map_err(|_| ApiError::Conflict("profiler already running".into()))?; + + // Take the profile. + let report = tokio::task::spawn_blocking(move || { + let guard = pprof::ProfilerGuardBuilder::default() + .frequency(frequency_hz) + .blocklist(&["libc", "libgcc", "pthread", "vdso"]) + .build()?; + std::thread::sleep(Duration::from_secs(seconds)); + guard.report().build() + }) + .await + .map_err(|join_err| ApiError::InternalServerError(join_err.into()))? + .map_err(|pprof_err| ApiError::InternalServerError(pprof_err.into()))?; + + // Return the report in the requested format. + match format { + Format::Pprof => { + let mut body = Vec::new(); + report + .pprof() + .map_err(|err| ApiError::InternalServerError(err.into()))? 
+ .write_to_vec(&mut body) + .map_err(|err| ApiError::InternalServerError(err.into()))?; + + Response::builder() + .status(200) + .header(CONTENT_TYPE, "application/octet-stream") + .header(CONTENT_DISPOSITION, "attachment; filename=\"profile.pb\"") + .body(Body::from(body)) + .map_err(|err| ApiError::InternalServerError(err.into())) + } + + Format::Svg => { + let mut body = Vec::new(); + report + .flamegraph(&mut body) + .map_err(|err| ApiError::InternalServerError(err.into()))?; + Response::builder() + .status(200) + .header(CONTENT_TYPE, "image/svg+xml") + .body(Body::from(body)) + .map_err(|err| ApiError::InternalServerError(err.into())) + } + } +} + pub fn add_request_id_middleware( ) -> Middleware { Middleware::pre(move |req| async move { diff --git a/libs/utils/src/http/json.rs b/libs/utils/src/http/json.rs index 6c25440b42..e53231f313 100644 --- a/libs/utils/src/http/json.rs +++ b/libs/utils/src/http/json.rs @@ -5,6 +5,7 @@ use serde::{Deserialize, Serialize}; use super::error::ApiError; +/// Parse a json request body and deserialize it to the type `T`. pub async fn json_request Deserialize<'de>>( request: &mut Request, ) -> Result { @@ -27,6 +28,27 @@ pub async fn json_request Deserialize<'de>>( .map_err(ApiError::BadRequest) } +/// Parse a json request body and deserialize it to the type `T`. If the body is empty, return `T::default`. 
+pub async fn json_request_maybe Deserialize<'de> + Default>( + request: &mut Request, +) -> Result { + let body = hyper::body::aggregate(request.body_mut()) + .await + .context("Failed to read request body") + .map_err(ApiError::BadRequest)?; + + if body.remaining() == 0 { + return Ok(T::default()); + } + + let mut deser = serde_json::de::Deserializer::from_reader(body.reader()); + + serde_path_to_error::deserialize(&mut deser) + // intentionally stringify because the debug version is not helpful in python logs + .map_err(|e| anyhow::anyhow!("Failed to parse json request: {e}")) + .map_err(ApiError::BadRequest) +} + pub fn json_response( status: StatusCode, data: T, diff --git a/libs/utils/src/http/request.rs b/libs/utils/src/http/request.rs index 8b8ed5a67f..7ea71685ec 100644 --- a/libs/utils/src/http/request.rs +++ b/libs/utils/src/http/request.rs @@ -30,7 +30,7 @@ pub fn parse_request_param( } } -fn get_query_param<'a>( +pub fn get_query_param<'a>( request: &'a Request, param_name: &str, ) -> Result>, ApiError> { diff --git a/libs/utils/src/seqwait.rs b/libs/utils/src/seqwait.rs index 375b227b99..d99dc25769 100644 --- a/libs/utils/src/seqwait.rs +++ b/libs/utils/src/seqwait.rs @@ -83,7 +83,9 @@ where } wake_these.push(self.heap.pop().unwrap().wake_channel); } - self.update_status(); + if !wake_these.is_empty() { + self.update_status(); + } wake_these } diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index ecb8fa7491..140b287ccc 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -43,6 +43,7 @@ postgres.workspace = true postgres_backend.workspace = true postgres-protocol.workspace = true postgres-types.workspace = true +postgres_initdb.workspace = true rand.workspace = true range-set-blaze = { version = "0.1.16", features = ["alloc"] } regex.workspace = true @@ -68,6 +69,7 @@ url.workspace = true walkdir.workspace = true metrics.workspace = true pageserver_api.workspace = true +pageserver_client.workspace = true # for 
ResponseErrorMessageExt TODO refactor that pageserver_compaction.workspace = true postgres_connection.workspace = true postgres_ffi.workspace = true @@ -84,6 +86,7 @@ enumset = { workspace = true, features = ["serde"]} strum.workspace = true strum_macros.workspace = true wal_decoder.workspace = true +smallvec.workspace = true [target.'cfg(target_os = "linux")'.dependencies] procfs.workspace = true diff --git a/pageserver/benches/bench_ingest.rs b/pageserver/benches/bench_ingest.rs index f6b2a8e031..caacd365b3 100644 --- a/pageserver/benches/bench_ingest.rs +++ b/pageserver/benches/bench_ingest.rs @@ -167,6 +167,7 @@ fn criterion_benchmark(c: &mut Criterion) { 16384, virtual_file::io_engine_for_bench(), conf.virtual_file_io_mode, + virtual_file::SyncMode::Sync, ); page_cache::init(conf.page_cache_size); diff --git a/pageserver/ctl/Cargo.toml b/pageserver/ctl/Cargo.toml index a753f806a0..39ca47568c 100644 --- a/pageserver/ctl/Cargo.toml +++ b/pageserver/ctl/Cargo.toml @@ -18,7 +18,6 @@ postgres_ffi.workspace = true thiserror.workspace = true tokio.workspace = true tokio-util.workspace = true -toml_edit.workspace = true utils.workspace = true svg_fmt.workspace = true workspace_hack.workspace = true diff --git a/pageserver/ctl/src/layer_map_analyzer.rs b/pageserver/ctl/src/layer_map_analyzer.rs index 11b8e98f57..2c350d6d86 100644 --- a/pageserver/ctl/src/layer_map_analyzer.rs +++ b/pageserver/ctl/src/layer_map_analyzer.rs @@ -138,6 +138,7 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> { 10, virtual_file::api::IoEngineKind::StdFs, IoMode::preferred(), + virtual_file::SyncMode::Sync, ); pageserver::page_cache::init(100); diff --git a/pageserver/ctl/src/layers.rs b/pageserver/ctl/src/layers.rs index 6f543dcaa9..4c2c3ab30e 100644 --- a/pageserver/ctl/src/layers.rs +++ b/pageserver/ctl/src/layers.rs @@ -51,6 +51,7 @@ async fn read_delta_file(path: impl AsRef, ctx: &RequestContext) -> Result 10, virtual_file::api::IoEngineKind::StdFs,
IoMode::preferred(), + virtual_file::SyncMode::Sync, ); page_cache::init(100); let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path"); @@ -65,6 +66,7 @@ async fn read_image_file(path: impl AsRef, ctx: &RequestContext) -> Result 10, virtual_file::api::IoEngineKind::StdFs, IoMode::preferred(), + virtual_file::SyncMode::Sync, ); page_cache::init(100); let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path"); @@ -171,6 +173,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> { 10, virtual_file::api::IoEngineKind::StdFs, IoMode::preferred(), + virtual_file::SyncMode::Sync, ); pageserver::page_cache::init(100); diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs index f506caec5b..a0aac89dc8 100644 --- a/pageserver/ctl/src/main.rs +++ b/pageserver/ctl/src/main.rs @@ -174,11 +174,7 @@ async fn main() -> anyhow::Result<()> { println!("specified prefix '{}' failed validation", cmd.prefix); return Ok(()); }; - let toml_document = toml_edit::DocumentMut::from_str(&cmd.config_toml_str)?; - let toml_item = toml_document - .get("remote_storage") - .expect("need remote_storage"); - let config = RemoteStorageConfig::from_toml(toml_item)?; + let config = RemoteStorageConfig::from_toml_str(&cmd.config_toml_str)?; let storage = remote_storage::GenericRemoteStorage::from_config(&config).await; let cancel = CancellationToken::new(); storage @@ -209,6 +205,7 @@ async fn print_layerfile(path: &Utf8Path) -> anyhow::Result<()> { 10, virtual_file::api::IoEngineKind::StdFs, IoMode::preferred(), + virtual_file::SyncMode::Sync, ); page_cache::init(100); let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index fe2a31167d..033a9a4619 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -171,11 +171,18 @@ fn main() -> anyhow::Result<()> { let scenario = failpoint_support::init(); // Basic 
initialization of things that don't change after startup + tracing::info!("Initializing virtual_file..."); virtual_file::init( conf.max_file_descriptors, conf.virtual_file_io_engine, conf.virtual_file_io_mode, + if conf.no_sync { + virtual_file::SyncMode::UnsafeNoSync + } else { + virtual_file::SyncMode::Sync + }, ); + tracing::info!("Initializing page_cache..."); page_cache::init(conf.page_cache_size); start_pageserver(launch_ts, conf).context("Failed to start pageserver")?; diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index b694a43599..59ea6fb941 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -144,6 +144,10 @@ pub struct PageServerConf { /// JWT token for use with the control plane API. pub control_plane_api_token: Option, + pub import_pgdata_upcall_api: Option, + pub import_pgdata_upcall_api_token: Option, + pub import_pgdata_aws_endpoint_url: Option, + /// If true, pageserver will make best-effort to operate without a control plane: only /// for use in major incidents. pub control_plane_emergency_mode: bool, @@ -182,6 +186,10 @@ pub struct PageServerConf { /// Optionally disable disk syncs (unsafe!) pub no_sync: bool, + + /// Maximum amount of time for which a get page request + /// might be held up for request merging.
+ pub server_side_batch_timeout: Option, } /// Token for authentication to safekeepers @@ -324,6 +332,9 @@ impl PageServerConf { control_plane_api, control_plane_api_token, control_plane_emergency_mode, + import_pgdata_upcall_api, + import_pgdata_upcall_api_token, + import_pgdata_aws_endpoint_url, heatmap_upload_concurrency, secondary_download_concurrency, ingest_batch_size, @@ -336,6 +347,7 @@ impl PageServerConf { concurrent_tenant_warmup, concurrent_tenant_size_logical_size_queries, virtual_file_io_engine, + server_side_batch_timeout, tenant_config, no_sync, } = config_toml; @@ -377,6 +389,10 @@ impl PageServerConf { image_compression, timeline_offloading, ephemeral_bytes_per_memory_kb, + server_side_batch_timeout, + import_pgdata_upcall_api, + import_pgdata_upcall_api_token: import_pgdata_upcall_api_token.map(SecretString::from), + import_pgdata_aws_endpoint_url, // ------------------------------------------------------------ // fields that require additional validation or custom handling diff --git a/pageserver/src/deletion_queue/deleter.rs b/pageserver/src/deletion_queue/deleter.rs index 1f04bc0410..3d02387c98 100644 --- a/pageserver/src/deletion_queue/deleter.rs +++ b/pageserver/src/deletion_queue/deleter.rs @@ -15,6 +15,7 @@ use tokio_util::sync::CancellationToken; use tracing::info; use tracing::warn; use utils::backoff; +use utils::pausable_failpoint; use crate::metrics; @@ -90,6 +91,7 @@ impl Deleter { /// Block until everything in accumulator has been executed async fn flush(&mut self) -> Result<(), DeletionQueueError> { while !self.accumulator.is_empty() && !self.cancel.is_cancelled() { + pausable_failpoint!("deletion-queue-before-execute-pause"); match self.remote_delete().await { Ok(()) => { // Note: we assume that the remote storage layer returns Ok(()) if some diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 2bc7f5ad39..7fb9247feb 100644 --- a/pageserver/src/http/openapi_spec.yml +++ 
b/pageserver/src/http/openapi_spec.yml @@ -623,6 +623,8 @@ paths: existing_initdb_timeline_id: type: string format: hex + import_pgdata: + $ref: "#/components/schemas/TimelineCreateRequestImportPgdata" responses: "201": description: Timeline was created, or already existed with matching parameters @@ -979,6 +981,34 @@ components: $ref: "#/components/schemas/TenantConfig" effective_config: $ref: "#/components/schemas/TenantConfig" + TimelineCreateRequestImportPgdata: + type: object + required: + - location + - idempotency_key + properties: + idempotency_key: + type: string + location: + $ref: "#/components/schemas/TimelineCreateRequestImportPgdataLocation" + TimelineCreateRequestImportPgdataLocation: + type: object + properties: + AwsS3: + $ref: "#/components/schemas/TimelineCreateRequestImportPgdataLocationAwsS3" + TimelineCreateRequestImportPgdataLocationAwsS3: + type: object + properties: + region: + type: string + bucket: + type: string + key: + type: string + required: + - region + - bucket + - key TimelineInfo: type: object required: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index ab170679ba..ceb1c3b012 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -40,6 +40,7 @@ use pageserver_api::models::TenantSorting; use pageserver_api::models::TenantState; use pageserver_api::models::TimelineArchivalConfigRequest; use pageserver_api::models::TimelineCreateRequestMode; +use pageserver_api::models::TimelineCreateRequestModeImportPgdata; use pageserver_api::models::TimelinesInfoAndOffloaded; use pageserver_api::models::TopTenantShardItem; use pageserver_api::models::TopTenantShardsRequest; @@ -55,6 +56,7 @@ use tokio_util::sync::CancellationToken; use tracing::*; use utils::auth::JwtAuth; use utils::failpoint_support::failpoints_handler; +use utils::http::endpoint::profile_cpu_handler; use utils::http::endpoint::prometheus_metrics_handler; use utils::http::endpoint::request_span; use 
utils::http::request::must_parse_query_param; @@ -80,9 +82,12 @@ use crate::tenant::secondary::SecondaryController; use crate::tenant::size::ModelInputs; use crate::tenant::storage_layer::LayerAccessStatsReset; use crate::tenant::storage_layer::LayerName; +use crate::tenant::timeline::import_pgdata; use crate::tenant::timeline::offload::offload_timeline; use crate::tenant::timeline::offload::OffloadError; use crate::tenant::timeline::CompactFlags; +use crate::tenant::timeline::CompactOptions; +use crate::tenant::timeline::CompactRange; use crate::tenant::timeline::CompactionError; use crate::tenant::timeline::Timeline; use crate::tenant::GetTimelineError; @@ -100,7 +105,7 @@ use utils::{ http::{ endpoint::{self, attach_openapi_ui, auth_middleware, check_permission_with}, error::{ApiError, HttpErrorBody}, - json::{json_request, json_response}, + json::{json_request, json_request_maybe, json_response}, request::parse_request_param, RequestExt, RouterBuilder, }, @@ -123,7 +128,7 @@ pub struct State { conf: &'static PageServerConf, tenant_manager: Arc, auth: Option>, - allowlist_routes: Vec, + allowlist_routes: &'static [&'static str], remote_storage: GenericRemoteStorage, broker_client: storage_broker::BrokerClientChannel, disk_usage_eviction_state: Arc, @@ -144,10 +149,13 @@ impl State { deletion_queue_client: DeletionQueueClient, secondary_controller: SecondaryController, ) -> anyhow::Result { - let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml", "/metrics"] - .iter() - .map(|v| v.parse().unwrap()) - .collect::>(); + let allowlist_routes = &[ + "/v1/status", + "/v1/doc", + "/swagger.yml", + "/metrics", + "/profile/cpu", + ]; Ok(Self { conf, tenant_manager, @@ -574,6 +582,35 @@ async fn timeline_create_handler( ancestor_timeline_id, ancestor_start_lsn, }), + TimelineCreateRequestMode::ImportPgdata { + import_pgdata: + TimelineCreateRequestModeImportPgdata { + location, + idempotency_key, + }, + } => 
tenant::CreateTimelineParams::ImportPgdata(tenant::CreateTimelineParamsImportPgdata { + idempotency_key: import_pgdata::index_part_format::IdempotencyKey::new( + idempotency_key.0, + ), + new_timeline_id, + location: { + use import_pgdata::index_part_format::Location; + use pageserver_api::models::ImportPgdataLocation; + match location { + #[cfg(feature = "testing")] + ImportPgdataLocation::LocalFs { path } => Location::LocalFs { path }, + ImportPgdataLocation::AwsS3 { + region, + bucket, + key, + } => Location::AwsS3 { + region, + bucket, + key, + }, + } + }, + }), }; let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Error); @@ -1927,13 +1964,15 @@ async fn timeline_gc_handler( // Run compaction immediately on given timeline. async fn timeline_compact_handler( - request: Request, + mut request: Request, cancel: CancellationToken, ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let compact_range = json_request_maybe::>(&mut request).await?; + let state = get_state(&request); let mut flags = EnumSet::empty(); @@ -1957,11 +1996,16 @@ async fn timeline_compact_handler( let wait_until_uploaded = parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false); + let options = CompactOptions { + compact_range, + flags, + }; + async { let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?; timeline - .compact(&cancel, flags, &ctx) + .compact_with_options(&cancel, options, &ctx) .await .map_err(|e| ApiError::InternalServerError(e.into()))?; if wait_until_uploaded { @@ -3139,7 +3183,7 @@ pub fn make_router( if auth.is_some() { router = router.middleware(auth_middleware(|request| { let state = 
get_state(request); - if state.allowlist_routes.contains(request.uri()) { + if state.allowlist_routes.contains(&request.uri().path()) { None } else { state.auth.as_deref() @@ -3158,6 +3202,7 @@ pub fn make_router( Ok(router .data(state) .get("/metrics", |r| request_span(r, prometheus_metrics_handler)) + .get("/profile/cpu", |r| request_span(r, profile_cpu_handler)) .get("/v1/status", |r| api_handler(r, status_handler)) .put("/v1/failpoints", |r| { testing_api_handler("manage failpoints", r, failpoints_handler) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 1473729186..3cdc2a761e 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1187,6 +1187,7 @@ struct GlobalAndPerTimelineHistogramTimer<'a, 'c> { ctx: &'c RequestContext, start: std::time::Instant, op: SmgrQueryType, + count: usize, } impl Drop for GlobalAndPerTimelineHistogramTimer<'_, '_> { @@ -1214,10 +1215,13 @@ impl Drop for GlobalAndPerTimelineHistogramTimer<'_, '_> { elapsed } }; - self.global_latency_histo - .observe(ex_throttled.as_secs_f64()); - if let Some(per_timeline_getpage_histo) = self.per_timeline_latency_histo { - per_timeline_getpage_histo.observe(ex_throttled.as_secs_f64()); + + for _ in 0..self.count { + self.global_latency_histo + .observe(ex_throttled.as_secs_f64()); + if let Some(per_timeline_getpage_histo) = self.per_timeline_latency_histo { + per_timeline_getpage_histo.observe(ex_throttled.as_secs_f64()); + } } } } @@ -1385,6 +1389,14 @@ impl SmgrQueryTimePerTimeline { &'a self, op: SmgrQueryType, ctx: &'c RequestContext, + ) -> Option { + self.start_timer_many(op, 1, ctx) + } + pub(crate) fn start_timer_many<'c: 'a, 'a>( + &'a self, + op: SmgrQueryType, + count: usize, + ctx: &'c RequestContext, ) -> Option { let start = Instant::now(); @@ -1422,6 +1434,7 @@ impl SmgrQueryTimePerTimeline { ctx, start, op, + count, }) } } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index f07474df6a..5fd02d8749 100644 --- 
a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -7,13 +7,13 @@ use bytes::Buf; use futures::FutureExt; use itertools::Itertools; use once_cell::sync::OnceCell; -use pageserver_api::models::TenantState; +use pageserver_api::models::{self, TenantState}; use pageserver_api::models::{ PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse, PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse, - PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse, - PagestreamGetSlruSegmentRequest, PagestreamGetSlruSegmentResponse, PagestreamNblocksRequest, - PagestreamNblocksResponse, PagestreamProtocolVersion, + PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetSlruSegmentRequest, + PagestreamGetSlruSegmentResponse, PagestreamNblocksRequest, PagestreamNblocksResponse, + PagestreamProtocolVersion, }; use pageserver_api::shard::TenantShardId; use postgres_backend::{is_expected_io_error, AuthType, PostgresBackend, QueryError}; @@ -44,7 +44,7 @@ use crate::basebackup; use crate::basebackup::BasebackupError; use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; -use crate::metrics; +use crate::metrics::{self}; use crate::metrics::{ComputeCommandKind, COMPUTE_COMMANDS_COUNTERS, LIVE_CONNECTIONS}; use crate::pgdatadir_mapping::Version; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; @@ -59,7 +59,7 @@ use crate::tenant::GetTimelineError; use crate::tenant::PageReconstructError; use crate::tenant::Timeline; use pageserver_api::key::rel_block_to_key; -use pageserver_api::reltag::SlruKind; +use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID; use postgres_ffi::BLCKSZ; @@ -105,6 +105,7 @@ pub fn spawn( pg_auth, tcp_listener, conf.pg_auth_type, + conf.server_side_batch_timeout, libpq_ctx, cancel.clone(), ) @@ -153,6 +154,7 @@ pub async fn libpq_listener_main( auth: Option>, listener: 
tokio::net::TcpListener, auth_type: AuthType, + server_side_batch_timeout: Option, listener_ctx: RequestContext, listener_cancel: CancellationToken, ) -> Connections { @@ -183,6 +185,7 @@ pub async fn libpq_listener_main( local_auth, socket, auth_type, + server_side_batch_timeout, connection_ctx, connections_cancel.child_token(), )); @@ -210,6 +213,7 @@ async fn page_service_conn_main( auth: Option>, socket: tokio::net::TcpStream, auth_type: AuthType, + server_side_batch_timeout: Option, connection_ctx: RequestContext, cancel: CancellationToken, ) -> ConnectionHandlerResult { @@ -260,8 +264,13 @@ async fn page_service_conn_main( // and create a child per-query context when it invokes process_query. // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler // and create the per-query context in process_query ourselves. - let mut conn_handler = - PageServerHandler::new(tenant_manager, auth, connection_ctx, cancel.clone()); + let mut conn_handler = PageServerHandler::new( + tenant_manager, + auth, + server_side_batch_timeout, + connection_ctx, + cancel.clone(), + ); let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?; match pgbackend.run(&mut conn_handler, &cancel).await { @@ -304,6 +313,12 @@ struct PageServerHandler { cancel: CancellationToken, timeline_handles: TimelineHandles, + + /// Messages queued up for the next processing batch + next_batch: Option, + + /// See [`PageServerConf::server_side_batch_timeout`] + server_side_batch_timeout: Option, } struct TimelineHandles { @@ -517,10 +532,47 @@ impl From for QueryError { } } +enum BatchedFeMessage { + Exists { + span: Span, + req: models::PagestreamExistsRequest, + }, + Nblocks { + span: Span, + req: models::PagestreamNblocksRequest, + }, + GetPage { + span: Span, + shard: timeline::handle::Handle, + effective_request_lsn: Lsn, + pages: smallvec::SmallVec<[(RelTag, BlockNumber); 1]>, + }, + DbSize { + span: Span, + req: models::PagestreamDbSizeRequest, + }, 
+ GetSlruSegment { + span: Span, + req: models::PagestreamGetSlruSegmentRequest, + }, + RespondError { + span: Span, + error: PageStreamError, + }, +} + +enum BatchOrEof { + /// In the common case, this has one entry. + /// At most, it has two entries: the first is the leftover batch, the second is an error. + Batch(smallvec::SmallVec<[BatchedFeMessage; 1]>), + Eof, +} + impl PageServerHandler { pub fn new( tenant_manager: Arc, auth: Option>, + server_side_batch_timeout: Option, connection_ctx: RequestContext, cancel: CancellationToken, ) -> Self { @@ -530,6 +582,8 @@ impl PageServerHandler { connection_ctx, timeline_handles: TimelineHandles::new(tenant_manager), cancel, + next_batch: None, + server_side_batch_timeout, } } @@ -557,6 +611,221 @@ impl PageServerHandler { ) } + async fn read_batch_from_connection( + &mut self, + pgb: &mut PostgresBackend, + tenant_id: &TenantId, + timeline_id: &TimelineId, + ctx: &RequestContext, + ) -> Result, QueryError> + where + IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, + { + let mut batch = self.next_batch.take(); + let mut batch_started_at: Option = None; + + let next_batch: Option = loop { + let sleep_fut = match (self.server_side_batch_timeout, batch_started_at) { + (Some(batch_timeout), Some(started_at)) => futures::future::Either::Left( + tokio::time::sleep_until((started_at + batch_timeout).into()), + ), + _ => futures::future::Either::Right(futures::future::pending()), + }; + + let msg = tokio::select! { + biased; + _ = self.cancel.cancelled() => { + return Err(QueryError::Shutdown) + } + msg = pgb.read_message() => { + msg + } + _ = sleep_fut => { + assert!(batch.is_some()); + break None; + } + }; + let copy_data_bytes = match msg? 
{ + Some(FeMessage::CopyData(bytes)) => bytes, + Some(FeMessage::Terminate) => { + return Ok(Some(BatchOrEof::Eof)); + } + Some(m) => { + return Err(QueryError::Other(anyhow::anyhow!( + "unexpected message: {m:?} during COPY" + ))); + } + None => { + return Ok(Some(BatchOrEof::Eof)); + } // client disconnected + }; + trace!("query: {copy_data_bytes:?}"); + fail::fail_point!("ps::handle-pagerequest-message"); + + // parse request + let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader())?; + + let this_msg = match neon_fe_msg { + PagestreamFeMessage::Exists(req) => BatchedFeMessage::Exists { + span: tracing::info_span!("handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.request_lsn), + req, + }, + PagestreamFeMessage::Nblocks(req) => BatchedFeMessage::Nblocks { + span: tracing::info_span!("handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.request_lsn), + req, + }, + PagestreamFeMessage::DbSize(req) => BatchedFeMessage::DbSize { + span: tracing::info_span!("handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.request_lsn), + req, + }, + PagestreamFeMessage::GetSlruSegment(req) => BatchedFeMessage::GetSlruSegment { + span: tracing::info_span!("handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.request_lsn), + req, + }, + PagestreamFeMessage::GetPage(PagestreamGetPageRequest { + request_lsn, + not_modified_since, + rel, + blkno, + }) => { + // shard_id is filled in by the handler + let span = tracing::info_span!( + "handle_get_page_at_lsn_request_batched", + %tenant_id, %timeline_id, shard_id = tracing::field::Empty, req_lsn = %request_lsn, + batch_size = tracing::field::Empty, batch_id = tracing::field::Empty + ); + + macro_rules! 
current_batch_and_error { + ($error:expr) => {{ + let error = BatchedFeMessage::RespondError { + span, + error: $error, + }; + let batch_and_error = match batch { + Some(b) => smallvec::smallvec![b, error], + None => smallvec::smallvec![error], + }; + Ok(Some(BatchOrEof::Batch(batch_and_error))) + }}; + } + + let key = rel_block_to_key(rel, blkno); + let shard = match self + .timeline_handles + .get(*tenant_id, *timeline_id, ShardSelector::Page(key)) + .instrument(span.clone()) + .await + { + Ok(tl) => tl, + Err(GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_))) => { + // We already know this tenant exists in general, because we resolved it at + // start of connection. Getting a NotFound here indicates that the shard containing + // the requested page is not present on this node: the client's knowledge of shard->pageserver + // mapping is out of date. + // + // Closing the connection by returning ``::Reconnect` has the side effect of rate-limiting above message, via + // client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration + // and talk to a different pageserver. + return current_batch_and_error!(PageStreamError::Reconnect( + "getpage@lsn request routed to wrong shard".into() + )); + } + Err(e) => { + return current_batch_and_error!(e.into()); + } + }; + let effective_request_lsn = match Self::wait_or_get_last_lsn( + &shard, + request_lsn, + not_modified_since, + &shard.get_latest_gc_cutoff_lsn(), + ctx, + ) + // TODO: if we actually need to wait for lsn here, it delays the entire batch which doesn't need to wait + .await + { + Ok(lsn) => lsn, + Err(e) => { + return current_batch_and_error!(e); + } + }; + BatchedFeMessage::GetPage { + span, + shard, + effective_request_lsn, + pages: smallvec::smallvec![(rel, blkno)], + } + } + }; + + let batch_timeout = match self.server_side_batch_timeout { + Some(value) => value, + None => { + // Batching is not enabled - stop on the first message. 
+ return Ok(Some(BatchOrEof::Batch(smallvec::smallvec![this_msg]))); + } + }; + + // check if we can batch + match (&mut batch, this_msg) { + (None, this_msg) => { + batch = Some(this_msg); + } + ( + Some(BatchedFeMessage::GetPage { + span: _, + shard: accum_shard, + pages: accum_pages, + effective_request_lsn: accum_lsn, + }), + BatchedFeMessage::GetPage { + span: _, + shard: this_shard, + pages: this_pages, + effective_request_lsn: this_lsn, + }, + ) if async { + assert_eq!(this_pages.len(), 1); + if accum_pages.len() >= Timeline::MAX_GET_VECTORED_KEYS as usize { + assert_eq!(accum_pages.len(), Timeline::MAX_GET_VECTORED_KEYS as usize); + return false; + } + if (accum_shard.tenant_shard_id, accum_shard.timeline_id) + != (this_shard.tenant_shard_id, this_shard.timeline_id) + { + // TODO: we _could_ batch & execute each shard separately (and in parallel). + // But the current logic for keeping responses in order does not support that. + return false; + } + // the vectored get currently only supports a single LSN, so, bounce as soon + // as the effective request_lsn changes + if *accum_lsn != this_lsn { + return false; + } + true + } + .await => + { + // ok to batch + accum_pages.extend(this_pages); + } + (Some(_), this_msg) => { + // by default, don't continue batching + break Some(this_msg); + } + } + + // batching impl piece + let started_at = batch_started_at.get_or_insert_with(Instant::now); + if started_at.elapsed() > batch_timeout { + break None; + } + }; + + self.next_batch = next_batch; + Ok(batch.map(|b| BatchOrEof::Batch(smallvec::smallvec![b]))) + } + /// Pagestream sub-protocol handler. /// /// It is a simple request-response protocol inside a COPYBOTH session. @@ -592,133 +861,165 @@ impl PageServerHandler { } } + // If [`PageServerHandler`] is reused for multiple pagestreams, + // then make sure to not process requests from the previous ones.
+ self.next_batch = None; + loop { - // read request bytes (it's exactly 1 PagestreamFeMessage per CopyData) - let msg = tokio::select! { - biased; - _ = self.cancel.cancelled() => { - return Err(QueryError::Shutdown) + let maybe_batched = self + .read_batch_from_connection(pgb, &tenant_id, &timeline_id, &ctx) + .await?; + let batched = match maybe_batched { + Some(BatchOrEof::Batch(b)) => b, + Some(BatchOrEof::Eof) => { + break; } - msg = pgb.read_message() => { msg } - }; - let copy_data_bytes = match msg? { - Some(FeMessage::CopyData(bytes)) => bytes, - Some(FeMessage::Terminate) => break, - Some(m) => { - return Err(QueryError::Other(anyhow::anyhow!( - "unexpected message: {m:?} during COPY" - ))); - } - None => break, // client disconnected - }; - - trace!("query: {copy_data_bytes:?}"); - fail::fail_point!("ps::handle-pagerequest-message"); - - // parse request - let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader())?; - - // invoke handler function - let (handler_result, span) = match neon_fe_msg { - PagestreamFeMessage::Exists(req) => { - fail::fail_point!("ps::handle-pagerequest-message::exists"); - let span = tracing::info_span!("handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.request_lsn); - ( - self.handle_get_rel_exists_request(tenant_id, timeline_id, &req, &ctx) - .instrument(span.clone()) - .await, - span, - ) - } - PagestreamFeMessage::Nblocks(req) => { - fail::fail_point!("ps::handle-pagerequest-message::nblocks"); - let span = tracing::info_span!("handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.request_lsn); - ( - self.handle_get_nblocks_request(tenant_id, timeline_id, &req, &ctx) - .instrument(span.clone()) - .await, - span, - ) - } - PagestreamFeMessage::GetPage(req) => { - fail::fail_point!("ps::handle-pagerequest-message::getpage"); - // shard_id is filled in by the handler - let span = tracing::info_span!("handle_get_page_at_lsn_request", rel = %req.rel, blkno = %req.blkno, req_lsn = 
%req.request_lsn); - ( - self.handle_get_page_at_lsn_request(tenant_id, timeline_id, &req, &ctx) - .instrument(span.clone()) - .await, - span, - ) - } - PagestreamFeMessage::DbSize(req) => { - fail::fail_point!("ps::handle-pagerequest-message::dbsize"); - let span = tracing::info_span!("handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.request_lsn); - ( - self.handle_db_size_request(tenant_id, timeline_id, &req, &ctx) - .instrument(span.clone()) - .await, - span, - ) - } - PagestreamFeMessage::GetSlruSegment(req) => { - fail::fail_point!("ps::handle-pagerequest-message::slrusegment"); - let span = tracing::info_span!("handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.request_lsn); - ( - self.handle_get_slru_segment_request(tenant_id, timeline_id, &req, &ctx) - .instrument(span.clone()) - .await, - span, - ) + None => { + continue; } }; - // Map handler result to protocol behavior. - // Some handler errors cause exit from pagestream protocol. - // Other handler errors are sent back as an error message and we stay in pagestream protocol. - let response_msg = match handler_result { - Err(e) => match &e { - PageStreamError::Shutdown => { - // If we fail to fulfil a request during shutdown, which may be _because_ of - // shutdown, then do not send the error to the client. Instead just drop the - // connection. 
- span.in_scope(|| info!("dropping connection due to shutdown")); - return Err(QueryError::Shutdown); + for batch in batched { + // invoke handler function + let (handler_results, span): ( + Vec>, + _, + ) = match batch { + BatchedFeMessage::Exists { span, req } => { + fail::fail_point!("ps::handle-pagerequest-message::exists"); + ( + vec![ + self.handle_get_rel_exists_request( + tenant_id, + timeline_id, + &req, + &ctx, + ) + .instrument(span.clone()) + .await, + ], + span, + ) } - PageStreamError::Reconnect(reason) => { - span.in_scope(|| info!("handler requested reconnect: {reason}")); - return Err(QueryError::Reconnect); + BatchedFeMessage::Nblocks { span, req } => { + fail::fail_point!("ps::handle-pagerequest-message::nblocks"); + ( + vec![ + self.handle_get_nblocks_request(tenant_id, timeline_id, &req, &ctx) + .instrument(span.clone()) + .await, + ], + span, + ) } - PageStreamError::Read(_) - | PageStreamError::LsnTimeout(_) - | PageStreamError::NotFound(_) - | PageStreamError::BadRequest(_) => { - // print the all details to the log with {:#}, but for the client the - // error message is enough. Do not log if shutting down, as the anyhow::Error - // here includes cancellation which is not an error. 
- let full = utils::error::report_compact_sources(&e); - span.in_scope(|| { - error!("error reading relation or page version: {full:#}") - }); - PagestreamBeMessage::Error(PagestreamErrorResponse { - message: e.to_string(), - }) + BatchedFeMessage::GetPage { + span, + shard, + effective_request_lsn, + pages, + } => { + fail::fail_point!("ps::handle-pagerequest-message::getpage"); + ( + { + let npages = pages.len(); + let res = self + .handle_get_page_at_lsn_request_batched( + &shard, + effective_request_lsn, + pages, + &ctx, + ) + .instrument(span.clone()) + .await; + assert_eq!(res.len(), npages); + res + }, + span, + ) } - }, - Ok(response_msg) => response_msg, - }; + BatchedFeMessage::DbSize { span, req } => { + fail::fail_point!("ps::handle-pagerequest-message::dbsize"); + ( + vec![ + self.handle_db_size_request(tenant_id, timeline_id, &req, &ctx) + .instrument(span.clone()) + .await, + ], + span, + ) + } + BatchedFeMessage::GetSlruSegment { span, req } => { + fail::fail_point!("ps::handle-pagerequest-message::slrusegment"); + ( + vec![ + self.handle_get_slru_segment_request( + tenant_id, + timeline_id, + &req, + &ctx, + ) + .instrument(span.clone()) + .await, + ], + span, + ) + } + BatchedFeMessage::RespondError { span, error } => { + // We've already decided to respond with an error, so we don't need to + // call the handler. + (vec![Err(error)], span) + } + }; - // marshal & transmit response message - pgb.write_message_noflush(&BeMessage::CopyData(&response_msg.serialize()))?; - tokio::select! { - biased; - _ = self.cancel.cancelled() => { - // We were requested to shut down. - info!("shutdown request received in page handler"); - return Err(QueryError::Shutdown) + // Map handler result to protocol behavior. + // Some handler errors cause exit from pagestream protocol. + // Other handler errors are sent back as an error message and we stay in pagestream protocol. 
+ for handler_result in handler_results { + let response_msg = match handler_result { + Err(e) => match &e { + PageStreamError::Shutdown => { + // If we fail to fulfil a request during shutdown, which may be _because_ of + // shutdown, then do not send the error to the client. Instead just drop the + // connection. + span.in_scope(|| info!("dropping connection due to shutdown")); + return Err(QueryError::Shutdown); + } + PageStreamError::Reconnect(reason) => { + span.in_scope(|| info!("handler requested reconnect: {reason}")); + return Err(QueryError::Reconnect); + } + PageStreamError::Read(_) + | PageStreamError::LsnTimeout(_) + | PageStreamError::NotFound(_) + | PageStreamError::BadRequest(_) => { + // print the all details to the log with {:#}, but for the client the + // error message is enough. Do not log if shutting down, as the anyhow::Error + // here includes cancellation which is not an error. + let full = utils::error::report_compact_sources(&e); + span.in_scope(|| { + error!("error reading relation or page version: {full:#}") + }); + PagestreamBeMessage::Error(PagestreamErrorResponse { + message: e.to_string(), + }) + } + }, + Ok(response_msg) => response_msg, + }; + + // marshal & transmit response message + pgb.write_message_noflush(&BeMessage::CopyData(&response_msg.serialize()))?; } - res = pgb.flush() => { - res?; + tokio::select! { + biased; + _ = self.cancel.cancelled() => { + // We were requested to shut down. 
+ info!("shutdown request received in page handler"); + return Err(QueryError::Shutdown) + } + res = pgb.flush() => { + res?; + } } } } @@ -767,21 +1068,26 @@ impl PageServerHandler { )); } - if request_lsn < **latest_gc_cutoff_lsn { + // Check explicitly for INVALID just to get a less scary error message if the request is obviously bogus + if request_lsn == Lsn::INVALID { + return Err(PageStreamError::BadRequest( + "invalid LSN(0) in request".into(), + )); + } + + // Clients should only read from recent LSNs on their timeline, or from locations holding an LSN lease. + // + // We may have older data available, but we make a best effort to detect this case and return an error, + // to distinguish a misbehaving client (asking for old LSN) from a storage issue (data missing at a legitimate LSN). + if request_lsn < **latest_gc_cutoff_lsn && !timeline.is_gc_blocked_by_lsn_lease_deadline() { let gc_info = &timeline.gc_info.read().unwrap(); if !gc_info.leases.contains_key(&request_lsn) { - // The requested LSN is below gc cutoff and is not guarded by a lease. - - // Check explicitly for INVALID just to get a less scary error message if the - // request is obviously bogus - return Err(if request_lsn == Lsn::INVALID { - PageStreamError::BadRequest("invalid LSN(0) in request".into()) - } else { + return Err( PageStreamError::BadRequest(format!( "tried to request a page version that was garbage collected. 
requested at {} gc cutoff {}", request_lsn, **latest_gc_cutoff_lsn ).into()) - }); + ); } } @@ -964,60 +1270,30 @@ impl PageServerHandler { })) } - #[instrument(skip_all, fields(shard_id))] - async fn handle_get_page_at_lsn_request( + #[instrument(skip_all)] + async fn handle_get_page_at_lsn_request_batched( &mut self, - tenant_id: TenantId, - timeline_id: TimelineId, - req: &PagestreamGetPageRequest, + timeline: &Timeline, + effective_lsn: Lsn, + pages: smallvec::SmallVec<[(RelTag, BlockNumber); 1]>, ctx: &RequestContext, - ) -> Result { - let timeline = match self - .timeline_handles - .get( - tenant_id, - timeline_id, - ShardSelector::Page(rel_block_to_key(req.rel, req.blkno)), - ) - .await - { - Ok(tl) => tl, - Err(GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_))) => { - // We already know this tenant exists in general, because we resolved it at - // start of connection. Getting a NotFound here indicates that the shard containing - // the requested page is not present on this node: the client's knowledge of shard->pageserver - // mapping is out of date. - // - // Closing the connection by returning ``::Reconnect` has the side effect of rate-limiting above message, via - // client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration - // and talk to a different pageserver. 
- return Err(PageStreamError::Reconnect( - "getpage@lsn request routed to wrong shard".into(), - )); - } - Err(e) => return Err(e.into()), - }; - - let _timer = timeline - .query_metrics - .start_timer(metrics::SmgrQueryType::GetPageAtLsn, ctx); - - let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = Self::wait_or_get_last_lsn( - &timeline, - req.request_lsn, - req.not_modified_since, - &latest_gc_cutoff_lsn, + ) -> Vec> { + debug_assert_current_span_has_tenant_and_timeline_id(); + let _timer = timeline.query_metrics.start_timer_many( + metrics::SmgrQueryType::GetPageAtLsn, + pages.len(), ctx, - ) - .await?; + ); - let page = timeline - .get_rel_page_at_lsn(req.rel, req.blkno, Version::Lsn(lsn), ctx) - .await?; + let pages = timeline + .get_rel_page_at_lsn_batched(pages, effective_lsn, ctx) + .await; - Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse { - page, + Vec::from_iter(pages.into_iter().map(|page| { + page.map(|page| { + PagestreamBeMessage::GetPage(models::PagestreamGetPageResponse { page }) + }) + .map_err(PageStreamError::from) })) } @@ -1674,6 +1950,13 @@ fn set_tracing_field_shard_id(timeline: &Timeline) { debug_assert_current_span_has_tenant_and_timeline_id(); } +struct WaitedForLsn(Lsn); +impl From for Lsn { + fn from(WaitedForLsn(lsn): WaitedForLsn) -> Self { + lsn + } +} + #[cfg(test)] mod tests { use utils::shard::ShardCount; diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 7c1abbf3e2..f4f184be5a 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -10,10 +10,15 @@ use super::tenant::{PageReconstructError, Timeline}; use crate::aux_file; use crate::context::RequestContext; use crate::keyspace::{KeySpace, KeySpaceAccum}; -use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id; +use crate::span::{ + debug_assert_current_span_has_tenant_and_timeline_id, + 
debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id, +}; +use crate::tenant::timeline::GetVectoredError; use anyhow::{ensure, Context}; use bytes::{Buf, Bytes, BytesMut}; use enum_map::Enum; +use itertools::Itertools; use pageserver_api::key::Key; use pageserver_api::key::{ dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key, @@ -30,7 +35,7 @@ use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::BLCKSZ; use postgres_ffi::{Oid, RepOriginId, TimestampTz, TransactionId}; use serde::{Deserialize, Serialize}; -use std::collections::{hash_map, HashMap, HashSet}; +use std::collections::{hash_map, BTreeMap, HashMap, HashSet}; use std::ops::ControlFlow; use std::ops::Range; use strum::IntoEnumIterator; @@ -193,26 +198,195 @@ impl Timeline { version: Version<'_>, ctx: &RequestContext, ) -> Result { - if tag.relnode == 0 { - return Err(PageReconstructError::Other( - RelationError::InvalidRelnode.into(), - )); + match version { + Version::Lsn(effective_lsn) => { + let pages = smallvec::smallvec![(tag, blknum)]; + let res = self + .get_rel_page_at_lsn_batched(pages, effective_lsn, ctx) + .await; + assert_eq!(res.len(), 1); + res.into_iter().next().unwrap() + } + Version::Modified(modification) => { + if tag.relnode == 0 { + return Err(PageReconstructError::Other( + RelationError::InvalidRelnode.into(), + )); + } + + let nblocks = self.get_rel_size(tag, version, ctx).await?; + if blknum >= nblocks { + debug!( + "read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page", + tag, + blknum, + version.get_lsn(), + nblocks + ); + return Ok(ZERO_PAGE.clone()); + } + + let key = rel_block_to_key(tag, blknum); + modification.get(key, ctx).await + } + } + } + + /// Like [`Self::get_rel_page_at_lsn`], but returns a batch of pages. + /// + /// The ordering of the returned vec corresponds to the ordering of `pages`. 
+ pub(crate) async fn get_rel_page_at_lsn_batched( + &self, + pages: smallvec::SmallVec<[(RelTag, BlockNumber); 1]>, + effective_lsn: Lsn, + ctx: &RequestContext, + ) -> Vec> { + debug_assert_current_span_has_tenant_and_timeline_id(); + + let mut slots_filled = 0; + let page_count = pages.len(); + + // Would be nice to use smallvec here but it doesn't provide the spare_capacity_mut() API. + let mut result = Vec::with_capacity(pages.len()); + let result_slots = result.spare_capacity_mut(); + + let mut keys_slots: BTreeMap> = BTreeMap::default(); + for (response_slot_idx, (tag, blknum)) in pages.into_iter().enumerate() { + if tag.relnode == 0 { + result_slots[response_slot_idx].write(Err(PageReconstructError::Other( + RelationError::InvalidRelnode.into(), + ))); + + slots_filled += 1; + continue; + } + + let nblocks = match self + .get_rel_size(tag, Version::Lsn(effective_lsn), ctx) + .await + { + Ok(nblocks) => nblocks, + Err(err) => { + result_slots[response_slot_idx].write(Err(err)); + slots_filled += 1; + continue; + } + }; + + if blknum >= nblocks { + debug!( + "read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page", + tag, blknum, effective_lsn, nblocks + ); + result_slots[response_slot_idx].write(Ok(ZERO_PAGE.clone())); + slots_filled += 1; + continue; + } + + let key = rel_block_to_key(tag, blknum); + + let key_slots = keys_slots.entry(key).or_default(); + key_slots.push(response_slot_idx); } - let nblocks = self.get_rel_size(tag, version, ctx).await?; - if blknum >= nblocks { - debug!( - "read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page", - tag, - blknum, - version.get_lsn(), - nblocks - ); - return Ok(ZERO_PAGE.clone()); + let keyspace = { + // add_key requires monotonicity + let mut acc = KeySpaceAccum::new(); + for key in keys_slots + .keys() + // in fact it requires strong monotonicity + .dedup() + { + acc.add_key(*key); + } + acc.to_keyspace() + }; + + match self.get_vectored(keyspace, effective_lsn, ctx).await 
{ + Ok(results) => { + for (key, res) in results { + let mut key_slots = keys_slots.remove(&key).unwrap().into_iter(); + let first_slot = key_slots.next().unwrap(); + + for slot in key_slots { + let clone = match &res { + Ok(buf) => Ok(buf.clone()), + Err(err) => Err(match err { + PageReconstructError::Cancelled => { + PageReconstructError::Cancelled + } + + x @ PageReconstructError::Other(_) | + x @ PageReconstructError::AncestorLsnTimeout(_) | + x @ PageReconstructError::WalRedo(_) | + x @ PageReconstructError::MissingKey(_) => { + PageReconstructError::Other(anyhow::anyhow!("there was more than one request for this key in the batch, error logged once: {x:?}")) + }, + }), + }; + + result_slots[slot].write(clone); + slots_filled += 1; + } + + result_slots[first_slot].write(res); + slots_filled += 1; + } + } + Err(err) => { + // this cannot really happen because get_vectored only errors globally on invalid LSN or too large batch size + // (We enforce the max batch size outside of this function, in the code that constructs the batch request.) 
+ for slot in keys_slots.values().flatten() { + // this whole `match` is a lot like `From for PageReconstructError` + // but without taking ownership of the GetVectoredError + let err = match &err { + GetVectoredError::Cancelled => { + Err(PageReconstructError::Cancelled) + } + // TODO: restructure get_vectored API to make this error per-key + GetVectoredError::MissingKey(err) => { + Err(PageReconstructError::Other(anyhow::anyhow!("whole vectored get request failed because one or more of the requested keys were missing: {err:?}"))) + } + // TODO: restructure get_vectored API to make this error per-key + GetVectoredError::GetReadyAncestorError(err) => { + Err(PageReconstructError::Other(anyhow::anyhow!("whole vectored get request failed because one or more key required ancestor that wasn't ready: {err:?}"))) + } + // TODO: restructure get_vectored API to make this error per-key + GetVectoredError::Other(err) => { + Err(PageReconstructError::Other( + anyhow::anyhow!("whole vectored get request failed: {err:?}"), + )) + } + // TODO: we can prevent this error class by moving this check into the type system + GetVectoredError::InvalidLsn(e) => { + Err(anyhow::anyhow!("invalid LSN: {e:?}").into()) + } + // NB: this should never happen in practice because we limit MAX_GET_VECTORED_KEYS + // TODO: we can prevent this error class by moving this check into the type system + GetVectoredError::Oversized(err) => { + Err(anyhow::anyhow!( + "batching oversized: {err:?}" + ) + .into()) + } + }; + + result_slots[*slot].write(err); + } + + slots_filled += keys_slots.values().map(|slots| slots.len()).sum::(); + } + }; + + assert_eq!(slots_filled, page_count); + // SAFETY: + // 1. `result` and any of its uninint members are not read from until this point + // 2. The length below is tracked at run-time and matches the number of requested pages. 
+ unsafe { + result.set_len(page_count); } - let key = rel_block_to_key(tag, blknum); - version.get(self, key, ctx).await + result } // Get size of a database in blocks @@ -2102,9 +2276,9 @@ impl<'a> Version<'a> { //--- Metadata structs stored in key-value pairs in the repository. #[derive(Debug, Serialize, Deserialize)] -struct DbDirectory { +pub(crate) struct DbDirectory { // (spcnode, dbnode) -> (do relmapper and PG_VERSION files exist) - dbdirs: HashMap<(Oid, Oid), bool>, + pub(crate) dbdirs: HashMap<(Oid, Oid), bool>, } // The format of TwoPhaseDirectory changed in PostgreSQL v17, because the filenames of @@ -2113,8 +2287,8 @@ struct DbDirectory { // "pg_twophsae/0000000A000002E4". #[derive(Debug, Serialize, Deserialize)] -struct TwoPhaseDirectory { - xids: HashSet, +pub(crate) struct TwoPhaseDirectory { + pub(crate) xids: HashSet, } #[derive(Debug, Serialize, Deserialize)] @@ -2123,12 +2297,12 @@ struct TwoPhaseDirectoryV17 { } #[derive(Debug, Serialize, Deserialize, Default)] -struct RelDirectory { +pub(crate) struct RelDirectory { // Set of relations that exist. (relfilenode, forknum) // // TODO: Store it as a btree or radix tree or something else that spans multiple // key-value pairs, if you have a lot of relations - rels: HashSet<(Oid, u8)>, + pub(crate) rels: HashSet<(Oid, u8)>, } #[derive(Debug, Serialize, Deserialize)] @@ -2137,9 +2311,9 @@ struct RelSizeEntry { } #[derive(Debug, Serialize, Deserialize, Default)] -struct SlruSegmentDirectory { +pub(crate) struct SlruSegmentDirectory { // Set of SLRU segments that exist. 
- segments: HashSet, + pub(crate) segments: HashSet, } #[derive(Copy, Clone, PartialEq, Eq, Debug, enum_map::Enum)] diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 6a4e90dd55..622738022a 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -381,6 +381,8 @@ pub enum TaskKind { UnitTest, DetachAncestor, + + ImportPgdata, } #[derive(Default)] diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 909f99ea9d..0214ee68fa 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -43,7 +43,9 @@ use std::sync::atomic::AtomicBool; use std::sync::Weak; use std::time::SystemTime; use storage_broker::BrokerClientChannel; +use timeline::import_pgdata; use timeline::offload::offload_timeline; +use timeline::ShutdownMode; use tokio::io::BufReader; use tokio::sync::watch; use tokio::task::JoinSet; @@ -189,6 +191,7 @@ pub struct TenantSharedResources { /// A [`Tenant`] is really an _attached_ tenant. The configuration /// for an attached tenant is a subset of the [`LocationConf`], represented /// in this struct. +#[derive(Clone)] pub(super) struct AttachedTenantConf { tenant_conf: TenantConfOpt, location: AttachedLocationConfig, @@ -249,7 +252,8 @@ struct TimelinePreload { pub(crate) struct TenantPreload { tenant_manifest: TenantManifest, - timelines: HashMap, + /// Map from timeline ID to a possible timeline preload. It is None iff the timeline is offloaded according to the manifest. 
+ timelines: HashMap>, } /// When we spawn a tenant, there is a special mode for tenant creation that @@ -371,7 +375,6 @@ pub struct Tenant { l0_flush_global_state: L0FlushGlobalState, } - impl std::fmt::Debug for Tenant { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{} ({})", self.tenant_shard_id, self.current_state()) @@ -608,11 +611,15 @@ impl OffloadedTimeline { .iter() .find(|(tid, _tl)| **tid == ancestor_timeline_id) { - ancestor_timeline + let removal_happened = ancestor_timeline .gc_info .write() .unwrap() .remove_child_offloaded(self.timeline_id); + if !removal_happened { + tracing::error!(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id, + "Couldn't remove retain_lsn entry from offloaded timeline's parent: already removed"); + } } } self.deleted_from_ancestor.store(true, Ordering::Release); @@ -854,6 +861,7 @@ impl Debug for SetStoppingError { pub(crate) enum CreateTimelineParams { Bootstrap(CreateTimelineParamsBootstrap), Branch(CreateTimelineParamsBranch), + ImportPgdata(CreateTimelineParamsImportPgdata), } #[derive(Debug)] @@ -871,7 +879,14 @@ pub(crate) struct CreateTimelineParamsBranch { pub(crate) ancestor_start_lsn: Option, } -/// What is used to determine idempotency of a [`Tenant::create_timeline`] call in [`Tenant::start_creating_timeline`]. +#[derive(Debug)] +pub(crate) struct CreateTimelineParamsImportPgdata { + pub(crate) new_timeline_id: TimelineId, + pub(crate) location: import_pgdata::index_part_format::Location, + pub(crate) idempotency_key: import_pgdata::index_part_format::IdempotencyKey, +} + +/// What is used to determine idempotency of a [`Tenant::create_timeline`] call in [`Tenant::start_creating_timeline`] in [`Tenant::start_creating_timeline`]. /// /// Each [`Timeline`] object holds [`Self`] as an immutable property in [`Timeline::create_idempotency`]. 
/// @@ -901,19 +916,50 @@ pub(crate) enum CreateTimelineIdempotency { ancestor_timeline_id: TimelineId, ancestor_start_lsn: Lsn, }, + ImportPgdata(CreatingTimelineIdempotencyImportPgdata), +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub(crate) struct CreatingTimelineIdempotencyImportPgdata { + idempotency_key: import_pgdata::index_part_format::IdempotencyKey, } /// What is returned by [`Tenant::start_creating_timeline`]. #[must_use] -enum StartCreatingTimelineResult<'t> { - CreateGuard(TimelineCreateGuard<'t>), +enum StartCreatingTimelineResult { + CreateGuard(TimelineCreateGuard), Idempotent(Arc), } +enum TimelineInitAndSyncResult { + ReadyToActivate(Arc), + NeedsSpawnImportPgdata(TimelineInitAndSyncNeedsSpawnImportPgdata), +} + +impl TimelineInitAndSyncResult { + fn ready_to_activate(self) -> Option> { + match self { + Self::ReadyToActivate(timeline) => Some(timeline), + _ => None, + } + } +} + +#[must_use] +struct TimelineInitAndSyncNeedsSpawnImportPgdata { + timeline: Arc, + import_pgdata: import_pgdata::index_part_format::Root, + guard: TimelineCreateGuard, +} + /// What is returned by [`Tenant::create_timeline`]. enum CreateTimelineResult { Created(Arc), Idempotent(Arc), + /// IMPORTANT: This [`Arc`] object is not in [`Tenant::timelines`] when + /// we return this result, nor will this concrete object ever be added there. + /// Cf method comment on [`Tenant::create_timeline_import_pgdata`]. + ImportSpawned(Arc), } impl CreateTimelineResult { @@ -921,18 +967,19 @@ impl CreateTimelineResult { match self { Self::Created(_) => "Created", Self::Idempotent(_) => "Idempotent", + Self::ImportSpawned(_) => "ImportSpawned", } } fn timeline(&self) -> &Arc { match self { - Self::Created(t) | Self::Idempotent(t) => t, + Self::Created(t) | Self::Idempotent(t) | Self::ImportSpawned(t) => t, } } /// Unit test timelines aren't activated, test has to do it if it needs to. 
#[cfg(test)] fn into_timeline_for_test(self) -> Arc { match self { - Self::Created(t) | Self::Idempotent(t) => t, + Self::Created(t) | Self::Idempotent(t) | Self::ImportSpawned(t) => t, } } } @@ -956,33 +1003,13 @@ pub enum CreateTimelineError { } #[derive(thiserror::Error, Debug)] -enum InitdbError { - Other(anyhow::Error), +pub enum InitdbError { + #[error("Operation was cancelled")] Cancelled, - Spawn(std::io::Result<()>), - Failed(std::process::ExitStatus, Vec), -} - -impl fmt::Display for InitdbError { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - match self { - InitdbError::Cancelled => write!(f, "Operation was cancelled"), - InitdbError::Spawn(e) => write!(f, "Spawn error: {:?}", e), - InitdbError::Failed(status, stderr) => write!( - f, - "Command failed with status {:?}: {}", - status, - String::from_utf8_lossy(stderr) - ), - InitdbError::Other(e) => write!(f, "Error: {:?}", e), - } - } -} - -impl From for InitdbError { - fn from(error: std::io::Error) -> Self { - InitdbError::Spawn(Err(error)) - } + #[error(transparent)] + Other(anyhow::Error), + #[error(transparent)] + Inner(postgres_initdb::Error), } enum CreateTimelineCause { @@ -990,6 +1017,15 @@ enum CreateTimelineCause { Delete, } +enum LoadTimelineCause { + Attach, + Unoffload, + ImportPgdata { + create_guard: TimelineCreateGuard, + activate: ActivateTimelineArgs, + }, +} + #[derive(thiserror::Error, Debug)] pub(crate) enum GcError { // The tenant is shutting down @@ -1066,24 +1102,35 @@ impl Tenant { /// it is marked as Active. 
#[allow(clippy::too_many_arguments)] async fn timeline_init_and_sync( - &self, + self: &Arc, timeline_id: TimelineId, resources: TimelineResources, - index_part: IndexPart, + mut index_part: IndexPart, metadata: TimelineMetadata, ancestor: Option>, - _ctx: &RequestContext, - ) -> anyhow::Result<()> { + cause: LoadTimelineCause, + ctx: &RequestContext, + ) -> anyhow::Result { let tenant_id = self.tenant_shard_id; - let idempotency = if metadata.ancestor_timeline().is_none() { - CreateTimelineIdempotency::Bootstrap { - pg_version: metadata.pg_version(), + let import_pgdata = index_part.import_pgdata.take(); + let idempotency = match &import_pgdata { + Some(import_pgdata) => { + CreateTimelineIdempotency::ImportPgdata(CreatingTimelineIdempotencyImportPgdata { + idempotency_key: import_pgdata.idempotency_key().clone(), + }) } - } else { - CreateTimelineIdempotency::Branch { - ancestor_timeline_id: metadata.ancestor_timeline().unwrap(), - ancestor_start_lsn: metadata.ancestor_lsn(), + None => { + if metadata.ancestor_timeline().is_none() { + CreateTimelineIdempotency::Bootstrap { + pg_version: metadata.pg_version(), + } + } else { + CreateTimelineIdempotency::Branch { + ancestor_timeline_id: metadata.ancestor_timeline().unwrap(), + ancestor_start_lsn: metadata.ancestor_lsn(), + } + } } }; @@ -1115,39 +1162,91 @@ impl Tenant { format!("Failed to load layermap for timeline {tenant_id}/{timeline_id}") })?; - { - // avoiding holding it across awaits - let mut timelines_accessor = self.timelines.lock().unwrap(); - match timelines_accessor.entry(timeline_id) { - // We should never try and load the same timeline twice during startup - Entry::Occupied(_) => { - unreachable!( - "Timeline {tenant_id}/{timeline_id} already exists in the tenant map" - ); + match import_pgdata { + Some(import_pgdata) if !import_pgdata.is_done() => { + match cause { + LoadTimelineCause::Attach | LoadTimelineCause::Unoffload => (), + LoadTimelineCause::ImportPgdata { .. 
} => { + unreachable!("ImportPgdata should not be reloading timeline import is done and persisted as such in s3") + } } - Entry::Vacant(v) => { - v.insert(Arc::clone(&timeline)); - timeline.maybe_spawn_flush_loop(); + let mut guard = self.timelines_creating.lock().unwrap(); + if !guard.insert(timeline_id) { + // We should never try and load the same timeline twice during startup + unreachable!("Timeline {tenant_id}/{timeline_id} is already being created") } + let timeline_create_guard = TimelineCreateGuard { + _tenant_gate_guard: self.gate.enter()?, + owning_tenant: self.clone(), + timeline_id, + idempotency, + // The users of this specific return value don't need the timline_path in there. + timeline_path: timeline + .conf + .timeline_path(&timeline.tenant_shard_id, &timeline.timeline_id), + }; + Ok(TimelineInitAndSyncResult::NeedsSpawnImportPgdata( + TimelineInitAndSyncNeedsSpawnImportPgdata { + timeline, + import_pgdata, + guard: timeline_create_guard, + }, + )) } - }; + Some(_) | None => { + { + let mut timelines_accessor = self.timelines.lock().unwrap(); + match timelines_accessor.entry(timeline_id) { + // We should never try and load the same timeline twice during startup + Entry::Occupied(_) => { + unreachable!( + "Timeline {tenant_id}/{timeline_id} already exists in the tenant map" + ); + } + Entry::Vacant(v) => { + v.insert(Arc::clone(&timeline)); + timeline.maybe_spawn_flush_loop(); + } + } + } - // Sanity check: a timeline should have some content. - anyhow::ensure!( - ancestor.is_some() - || timeline - .layers - .read() - .await - .layer_map() - .expect("currently loading, layer manager cannot be shutdown already") - .iter_historic_layers() - .next() - .is_some(), - "Timeline has no ancestor and no layer files" - ); + // Sanity check: a timeline should have some content. 
+ anyhow::ensure!( + ancestor.is_some() + || timeline + .layers + .read() + .await + .layer_map() + .expect("currently loading, layer manager cannot be shutdown already") + .iter_historic_layers() + .next() + .is_some(), + "Timeline has no ancestor and no layer files" + ); - Ok(()) + match cause { + LoadTimelineCause::Attach | LoadTimelineCause::Unoffload => (), + LoadTimelineCause::ImportPgdata { + create_guard, + activate, + } => { + // TODO: see the comment in the task code above how I'm not so certain + // it is safe to activate here because of concurrent shutdowns. + match activate { + ActivateTimelineArgs::Yes { broker_client } => { + info!("activating timeline after reload from pgdata import task"); + timeline.activate(self.clone(), broker_client, None, ctx); + } + ActivateTimelineArgs::No => (), + } + drop(create_guard); + } + } + + Ok(TimelineInitAndSyncResult::ReadyToActivate(timeline)) + } + } } /// Attach a tenant that's available in cloud storage. @@ -1393,7 +1492,7 @@ impl Tenant { // Get list of remote timelines // download index files for every tenant timeline info!("listing remote timelines"); - let (remote_timeline_ids, other_keys) = remote_timeline_client::list_remote_timelines( + let (mut remote_timeline_ids, other_keys) = remote_timeline_client::list_remote_timelines( remote_storage, self.tenant_shard_id, cancel.clone(), @@ -1427,11 +1526,27 @@ impl Tenant { warn!("Unexpected non timeline key {k}"); } + // Avoid downloading IndexPart of offloaded timelines. 
+ let mut offloaded_with_prefix = HashSet::new(); + for offloaded in tenant_manifest.offloaded_timelines.iter() { + if remote_timeline_ids.remove(&offloaded.timeline_id) { + offloaded_with_prefix.insert(offloaded.timeline_id); + } else { + // We'll take care later of timelines in the manifest without a prefix + } + } + + let timelines = self + .load_timelines_metadata(remote_timeline_ids, remote_storage, cancel) + .await?; + Ok(TenantPreload { tenant_manifest, - timelines: self - .load_timelines_metadata(remote_timeline_ids, remote_storage, cancel) - .await?, + timelines: timelines + .into_iter() + .map(|(id, tl)| (id, Some(tl))) + .chain(offloaded_with_prefix.into_iter().map(|id| (id, None))) + .collect(), }) } @@ -1462,6 +1577,19 @@ impl Tenant { offloaded_timelines_list.push((timeline_id, Arc::new(offloaded_timeline))); offloaded_timeline_ids.insert(timeline_id); } + // Complete deletions for offloaded timeline id's from manifest. + // The manifest will be uploaded later in this function. + offloaded_timelines_list + .retain(|(offloaded_id, offloaded)| { + // Existence of a timeline is finally determined by the existence of an index-part.json in remote storage. + // If there is dangling references in another location, they need to be cleaned up. + let delete = !preload.timelines.contains_key(offloaded_id); + if delete { + tracing::info!("Removing offloaded timeline {offloaded_id} from manifest as no remote prefix was found"); + offloaded.defuse_for_tenant_drop(); + } + !delete + }); let mut timelines_to_resume_deletions = vec![]; @@ -1469,10 +1597,9 @@ impl Tenant { let mut timeline_ancestors = HashMap::new(); let mut existent_timelines = HashSet::new(); for (timeline_id, preload) in preload.timelines { - if offloaded_timeline_ids.remove(&timeline_id) { - // The timeline is offloaded, skip loading it. 
- continue; - } + let Some(preload) = preload else { continue }; + // This is an invariant of the `preload` function's API + assert!(!offloaded_timeline_ids.contains(&timeline_id)); let index_part = match preload.index_part { Ok(i) => { debug!("remote index part exists for timeline {timeline_id}"); @@ -1544,24 +1671,46 @@ impl Tenant { } // TODO again handle early failure - self.load_remote_timeline( - timeline_id, - index_part, - remote_metadata, - TimelineResources { - remote_client, - timeline_get_throttle: self.timeline_get_throttle.clone(), - l0_flush_global_state: self.l0_flush_global_state.clone(), - }, - ctx, - ) - .await - .with_context(|| { - format!( - "failed to load remote timeline {} for tenant {}", - timeline_id, self.tenant_shard_id + let effect = self + .load_remote_timeline( + timeline_id, + index_part, + remote_metadata, + TimelineResources { + remote_client, + timeline_get_throttle: self.timeline_get_throttle.clone(), + l0_flush_global_state: self.l0_flush_global_state.clone(), + }, + LoadTimelineCause::Attach, + ctx, ) - })?; + .await + .with_context(|| { + format!( + "failed to load remote timeline {} for tenant {}", + timeline_id, self.tenant_shard_id + ) + })?; + + match effect { + TimelineInitAndSyncResult::ReadyToActivate(_) => { + // activation happens later, on Tenant::activate + } + TimelineInitAndSyncResult::NeedsSpawnImportPgdata( + TimelineInitAndSyncNeedsSpawnImportPgdata { + timeline, + import_pgdata, + guard, + }, + ) => { + tokio::task::spawn(self.clone().create_timeline_import_pgdata_task( + timeline, + import_pgdata, + ActivateTimelineArgs::No, + guard, + )); + } + } } // Walk through deleted timelines, resume deletion @@ -1582,31 +1731,13 @@ impl Tenant { .context("resume_deletion") .map_err(LoadLocalTimelineError::ResumeDeletion)?; } - // Complete deletions for offloaded timeline id's. 
- offloaded_timelines_list - .retain(|(offloaded_id, offloaded)| { - // At this point, offloaded_timeline_ids has the list of all offloaded timelines - // without a prefix in S3, so they are inexistent. - // In the end, existence of a timeline is finally determined by the existence of an index-part.json in remote storage. - // If there is a dangling reference in another location, they need to be cleaned up. - let delete = offloaded_timeline_ids.contains(offloaded_id); - if delete { - tracing::info!("Removing offloaded timeline {offloaded_id} from manifest as no remote prefix was found"); - offloaded.defuse_for_tenant_drop(); - } - !delete - }); - if !offloaded_timelines_list.is_empty() { - tracing::info!( - "Tenant has {} offloaded timelines", - offloaded_timelines_list.len() - ); - } + let needs_manifest_upload = + offloaded_timelines_list.len() != preload.tenant_manifest.offloaded_timelines.len(); { let mut offloaded_timelines_accessor = self.timelines_offloaded.lock().unwrap(); offloaded_timelines_accessor.extend(offloaded_timelines_list.into_iter()); } - if !offloaded_timeline_ids.is_empty() { + if needs_manifest_upload { self.store_tenant_manifest().await?; } @@ -1703,13 +1834,14 @@ impl Tenant { #[instrument(skip_all, fields(timeline_id=%timeline_id))] async fn load_remote_timeline( - &self, + self: &Arc, timeline_id: TimelineId, index_part: IndexPart, remote_metadata: TimelineMetadata, resources: TimelineResources, + cause: LoadTimelineCause, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> anyhow::Result { span::debug_assert_current_span_has_tenant_id(); info!("downloading index file for timeline {}", timeline_id); @@ -1736,6 +1868,7 @@ impl Tenant { index_part, remote_metadata, ancestor, + cause, ctx, ) .await @@ -1792,6 +1925,7 @@ impl Tenant { self.tenant_shard_id, timeline_id, self.generation, + &self.tenant_conf.load().location, ) } @@ -1921,6 +2055,7 @@ impl Tenant { TimelineArchivalError::Other(anyhow::anyhow!("Timeline already exists")) } 
TimelineExclusionError::Other(e) => TimelineArchivalError::Other(e), + TimelineExclusionError::ShuttingDown => TimelineArchivalError::Cancelled, })?; let timeline_preload = self @@ -1959,6 +2094,7 @@ impl Tenant { index_part, remote_metadata, timeline_resources, + LoadTimelineCause::Unoffload, &ctx, ) .await @@ -2196,7 +2332,7 @@ impl Tenant { /// /// Tests should use `Tenant::create_test_timeline` to set up the minimum required metadata keys. pub(crate) async fn create_empty_timeline( - &self, + self: &Arc, new_timeline_id: TimelineId, initdb_lsn: Lsn, pg_version: u32, @@ -2246,7 +2382,7 @@ impl Tenant { // Our current tests don't need the background loops. #[cfg(test)] pub async fn create_test_timeline( - &self, + self: &Arc, new_timeline_id: TimelineId, initdb_lsn: Lsn, pg_version: u32, @@ -2285,7 +2421,7 @@ impl Tenant { #[cfg(test)] #[allow(clippy::too_many_arguments)] pub async fn create_test_timeline_with_layers( - &self, + self: &Arc, new_timeline_id: TimelineId, initdb_lsn: Lsn, pg_version: u32, @@ -2422,6 +2558,16 @@ impl Tenant { self.branch_timeline(&ancestor_timeline, new_timeline_id, ancestor_start_lsn, ctx) .await? } + CreateTimelineParams::ImportPgdata(params) => { + self.create_timeline_import_pgdata( + params, + ActivateTimelineArgs::Yes { + broker_client: broker_client.clone(), + }, + ctx, + ) + .await? + } }; // At this point we have dropped our guard on [`Self::timelines_creating`], and @@ -2442,6 +2588,12 @@ impl Tenant { .remote_client .wait_completion() .await + .map_err(|e| match e { + WaitCompletionError::NotInitialized( + e, // If the queue is already stopped, it's a shutdown error. + ) if e.is_stopping() => CreateTimelineError::ShuttingDown, + e => CreateTimelineError::Other(e.into()), + }) .context("wait for timeline initial uploads to complete")?; // The creating task is responsible for activating the timeline. 
@@ -2458,11 +2610,202 @@ impl Tenant { ); timeline } + CreateTimelineResult::ImportSpawned(timeline) => { + info!("import task spawned, timeline will become visible and activated once the import is done"); + timeline + } }; Ok(activated_timeline) } + /// The returned [`Arc`] is NOT in the [`Tenant::timelines`] map until the import + /// completes in the background. A DIFFERENT [`Arc`] will be inserted into the + /// [`Tenant::timelines`] map when the import completes. + /// We only return an [`Arc`] here so the API handler can create a [`pageserver_api::models::TimelineInfo`] + /// for the response. + async fn create_timeline_import_pgdata( + self: &Arc, + params: CreateTimelineParamsImportPgdata, + activate: ActivateTimelineArgs, + ctx: &RequestContext, + ) -> Result { + let CreateTimelineParamsImportPgdata { + new_timeline_id, + location, + idempotency_key, + } = params; + + let started_at = chrono::Utc::now().naive_utc(); + + // + // There's probably a simpler way to upload an index part, but, remote_timeline_client + // is the canonical way we do it. + // - create an empty timeline in-memory + // - use its remote_timeline_client to do the upload + // - dispose of the uninit timeline + // - keep the creation guard alive + + let timeline_create_guard = match self + .start_creating_timeline( + new_timeline_id, + CreateTimelineIdempotency::ImportPgdata(CreatingTimelineIdempotencyImportPgdata { + idempotency_key: idempotency_key.clone(), + }), + ) + .await? 
+ { + StartCreatingTimelineResult::CreateGuard(guard) => guard, + StartCreatingTimelineResult::Idempotent(timeline) => { + return Ok(CreateTimelineResult::Idempotent(timeline)) + } + }; + + let mut uninit_timeline = { + let this = &self; + let initdb_lsn = Lsn(0); + let _ctx = ctx; + async move { + let new_metadata = TimelineMetadata::new( + // Initialize disk_consistent LSN to 0, The caller must import some data to + // make it valid, before calling finish_creation() + Lsn(0), + None, + None, + Lsn(0), + initdb_lsn, + initdb_lsn, + 15, + ); + this.prepare_new_timeline( + new_timeline_id, + &new_metadata, + timeline_create_guard, + initdb_lsn, + None, + ) + .await + } + } + .await?; + + let in_progress = import_pgdata::index_part_format::InProgress { + idempotency_key, + location, + started_at, + }; + let index_part = import_pgdata::index_part_format::Root::V1( + import_pgdata::index_part_format::V1::InProgress(in_progress), + ); + uninit_timeline + .raw_timeline() + .unwrap() + .remote_client + .schedule_index_upload_for_import_pgdata_state_update(Some(index_part.clone()))?; + + // wait_completion happens in caller + + let (timeline, timeline_create_guard) = uninit_timeline.finish_creation_myself(); + + tokio::spawn(self.clone().create_timeline_import_pgdata_task( + timeline.clone(), + index_part, + activate, + timeline_create_guard, + )); + + // NB: the timeline doesn't exist in self.timelines at this point + Ok(CreateTimelineResult::ImportSpawned(timeline)) + } + + #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline.timeline_id))] + async fn create_timeline_import_pgdata_task( + self: Arc, + timeline: Arc, + index_part: import_pgdata::index_part_format::Root, + activate: ActivateTimelineArgs, + timeline_create_guard: TimelineCreateGuard, + ) { + debug_assert_current_span_has_tenant_and_timeline_id(); + info!("starting"); + scopeguard::defer! 
{info!("exiting")}; + + let res = self + .create_timeline_import_pgdata_task_impl( + timeline, + index_part, + activate, + timeline_create_guard, + ) + .await; + if let Err(err) = &res { + error!(?err, "task failed"); + // TODO sleep & retry, sensitive to tenant shutdown + // TODO: allow timeline deletion requests => should cancel the task + } + } + + async fn create_timeline_import_pgdata_task_impl( + self: Arc, + timeline: Arc, + index_part: import_pgdata::index_part_format::Root, + activate: ActivateTimelineArgs, + timeline_create_guard: TimelineCreateGuard, + ) -> Result<(), anyhow::Error> { + let ctx = RequestContext::new(TaskKind::ImportPgdata, DownloadBehavior::Warn); + + info!("importing pgdata"); + import_pgdata::doit(&timeline, index_part, &ctx, self.cancel.clone()) + .await + .context("import")?; + info!("import done"); + + // + // Reload timeline from remote. + // This proves that the remote state is attachable, and it reuses the code. + // + // TODO: think about whether this is safe to do with concurrent Tenant::shutdown. + // timeline_create_guard hols the tenant gate open, so, shutdown cannot _complete_ until we exit. + // But our activate() call might launch new background tasks after Tenant::shutdown + // already went past shutting down the Tenant::timelines, which this timeline here is no part of. + // I think the same problem exists with the bootstrap & branch mgmt API tasks (tenant shutting + // down while bootstrapping/branching + activating), but, the race condition is much more likely + // to manifest because of the long runtime of this import task. 
+ + // in theory this shouldn't even .await anything except for coop yield + info!("shutting down timeline"); + timeline.shutdown(ShutdownMode::Hard).await; + info!("timeline shut down, reloading from remote"); + // TODO: we can't do the following check because create_timeline_import_pgdata must return an Arc + // let Some(timeline) = Arc::into_inner(timeline) else { + // anyhow::bail!("implementation error: timeline that we shut down was still referenced from somewhere"); + // }; + let timeline_id = timeline.timeline_id; + + // load from object storage like Tenant::attach does + let resources = self.build_timeline_resources(timeline_id); + let index_part = resources + .remote_client + .download_index_file(&self.cancel) + .await?; + let index_part = match index_part { + MaybeDeletedIndexPart::Deleted(_) => { + // likely concurrent delete call, cplane should prevent this + anyhow::bail!("index part says deleted but we are not done creating yet, this should not happen but") + } + MaybeDeletedIndexPart::IndexPart(p) => p, + }; + let metadata = index_part.metadata.clone(); + self + .load_remote_timeline(timeline_id, index_part, metadata, resources, LoadTimelineCause::ImportPgdata{ + create_guard: timeline_create_guard, activate, }, &ctx) + .await? + .ready_to_activate() + .context("implementation error: reloaded timeline still needs import after import reported success")?; + + anyhow::Ok(()) + } + pub(crate) async fn delete_timeline( self: Arc, timeline_id: TimelineId, @@ -2506,6 +2849,10 @@ impl Tenant { { let conf = self.tenant_conf.load(); + // If we may not delete layers, then simply skip GC. Even though a tenant + // in AttachedMulti state could do GC and just enqueue the blocked deletions, + // the only advantage to doing it is to perhaps shrink the LayerMap metadata + // a bit sooner than we would achieve by waiting for AttachedSingle status. 
if !conf.location.may_delete_layers_hint() { info!("Skipping GC in location state {:?}", conf.location); return Ok(GcResult::default()); @@ -2547,7 +2894,14 @@ impl Tenant { { let conf = self.tenant_conf.load(); - if !conf.location.may_delete_layers_hint() || !conf.location.may_upload_layers_hint() { + + // Note that compaction usually requires deletions, but we don't respect + // may_delete_layers_hint here: that is because tenants in AttachedMulti + // should proceed with compaction even if they can't do deletion, to avoid + // accumulating dangerously deep stacks of L0 layers. Deletions will be + // enqueued inside RemoteTimelineClient, and executed layer if/when we transition + // to AttachedSingle state. + if !conf.location.may_upload_layers_hint() { info!("Skipping compaction in location state {:?}", conf.location); return Ok(false); } @@ -3303,6 +3657,13 @@ where Ok(result) } +enum ActivateTimelineArgs { + Yes { + broker_client: storage_broker::BrokerClientChannel, + }, + No, +} + impl Tenant { pub fn tenant_specific_overrides(&self) -> TenantConfOpt { self.tenant_conf.load().tenant_conf.clone() @@ -3425,6 +3786,7 @@ impl Tenant { // this race is not possible if both request types come from the storage // controller (as they should!) because an exclusive op lock is required // on the storage controller side. + self.tenant_conf.rcu(|inner| { Arc::new(AttachedTenantConf { tenant_conf: new_tenant_conf.clone(), @@ -3434,20 +3796,22 @@ impl Tenant { }) }); + let updated = self.tenant_conf.load().clone(); + self.tenant_conf_updated(&new_tenant_conf); // Don't hold self.timelines.lock() during the notifies. // There's no risk of deadlock right now, but there could be if we consolidate // mutexes in struct Timeline in the future. 
let timelines = self.list_timelines(); for timeline in timelines { - timeline.tenant_conf_updated(&new_tenant_conf); + timeline.tenant_conf_updated(&updated); } } pub(crate) fn set_new_location_config(&self, new_conf: AttachedTenantConf) { let new_tenant_conf = new_conf.tenant_conf.clone(); - self.tenant_conf.store(Arc::new(new_conf)); + self.tenant_conf.store(Arc::new(new_conf.clone())); self.tenant_conf_updated(&new_tenant_conf); // Don't hold self.timelines.lock() during the notifies. @@ -3455,7 +3819,7 @@ impl Tenant { // mutexes in struct Timeline in the future. let timelines = self.list_timelines(); for timeline in timelines { - timeline.tenant_conf_updated(&new_tenant_conf); + timeline.tenant_conf_updated(&new_conf); } } @@ -3483,6 +3847,7 @@ impl Tenant { /// `validate_ancestor == false` is used when a timeline is created for deletion /// and we might not have the ancestor present anymore which is fine for to be /// deleted timelines. + #[allow(clippy::too_many_arguments)] fn create_timeline_struct( &self, new_timeline_id: TimelineId, @@ -4246,16 +4611,17 @@ impl Tenant { /// If the timeline was already created in the meantime, we check whether this /// request conflicts or is idempotent , based on `state`. 
async fn start_creating_timeline( - &self, + self: &Arc, new_timeline_id: TimelineId, idempotency: CreateTimelineIdempotency, - ) -> Result, CreateTimelineError> { + ) -> Result { let allow_offloaded = false; match self.create_timeline_create_guard(new_timeline_id, idempotency, allow_offloaded) { Ok(create_guard) => { pausable_failpoint!("timeline-creation-after-uninit"); Ok(StartCreatingTimelineResult::CreateGuard(create_guard)) } + Err(TimelineExclusionError::ShuttingDown) => Err(CreateTimelineError::ShuttingDown), Err(TimelineExclusionError::AlreadyCreating) => { // Creation is in progress, we cannot create it again, and we cannot // check if this request matches the existing one, so caller must try @@ -4523,6 +4889,7 @@ impl Tenant { self.tenant_shard_id, timeline_id, self.generation, + &self.tenant_conf.load().location, ) } @@ -4544,7 +4911,7 @@ impl Tenant { &'a self, new_timeline_id: TimelineId, new_metadata: &TimelineMetadata, - create_guard: TimelineCreateGuard<'a>, + create_guard: TimelineCreateGuard, start_lsn: Lsn, ancestor: Option>, ) -> anyhow::Result> { @@ -4604,7 +4971,7 @@ impl Tenant { /// The `allow_offloaded` parameter controls whether to tolerate the existence of /// offloaded timelines or not. 
fn create_timeline_create_guard( - &self, + self: &Arc, timeline_id: TimelineId, idempotency: CreateTimelineIdempotency, allow_offloaded: bool, @@ -4864,48 +5231,16 @@ async fn run_initdb( let _permit = INIT_DB_SEMAPHORE.acquire().await; - let mut initdb_command = tokio::process::Command::new(&initdb_bin_path); - initdb_command - .args(["--pgdata", initdb_target_dir.as_ref()]) - .args(["--username", &conf.superuser]) - .args(["--encoding", "utf8"]) - .args(["--locale", &conf.locale]) - .arg("--no-instructions") - .arg("--no-sync") - .env_clear() - .env("LD_LIBRARY_PATH", &initdb_lib_dir) - .env("DYLD_LIBRARY_PATH", &initdb_lib_dir) - .stdin(std::process::Stdio::null()) - // stdout invocation produces the same output every time, we don't need it - .stdout(std::process::Stdio::null()) - // we would be interested in the stderr output, if there was any - .stderr(std::process::Stdio::piped()); - - // Before version 14, only the libc provide was available. - if pg_version > 14 { - // Version 17 brought with it a builtin locale provider which only provides - // C and C.UTF-8. While being safer for collation purposes since it is - // guaranteed to be consistent throughout a major release, it is also more - // performant. - let locale_provider = if pg_version >= 17 { "builtin" } else { "libc" }; - - initdb_command.args(["--locale-provider", locale_provider]); - } - - let initdb_proc = initdb_command.spawn()?; - - // Ideally we'd select here with the cancellation token, but the problem is that - // we can't safely terminate initdb: it launches processes of its own, and killing - // initdb doesn't kill them. After we return from this function, we want the target - // directory to be able to be cleaned up. 
- // See https://github.com/neondatabase/neon/issues/6385 - let initdb_output = initdb_proc.wait_with_output().await?; - if !initdb_output.status.success() { - return Err(InitdbError::Failed( - initdb_output.status, - initdb_output.stderr, - )); - } + let res = postgres_initdb::do_run_initdb(postgres_initdb::RunInitdbArgs { + superuser: &conf.superuser, + locale: &conf.locale, + initdb_bin: &initdb_bin_path, + pg_version, + library_search_path: &initdb_lib_dir, + pgdata: initdb_target_dir, + }) + .await + .map_err(InitdbError::Inner); // This isn't true cancellation support, see above. Still return an error to // excercise the cancellation code path. @@ -4913,7 +5248,7 @@ async fn run_initdb( return Err(InitdbError::Cancelled); } - Ok(()) + res } /// Dump contents of a layer file to stdout. @@ -5244,7 +5579,7 @@ mod tests { use storage_layer::PersistentLayerKey; use tests::storage_layer::ValuesReconstructState; use tests::timeline::{GetVectoredError, ShutdownMode}; - use timeline::DeltaLayerTestDesc; + use timeline::{CompactOptions, DeltaLayerTestDesc}; use utils::id::TenantId; #[cfg(feature = "testing")] @@ -7718,7 +8053,7 @@ mod tests { let cancel = CancellationToken::new(); tline - .compact_with_gc(&cancel, EnumSet::new(), &ctx) + .compact_with_gc(&cancel, CompactOptions::default(), &ctx) .await .unwrap(); @@ -7795,7 +8130,7 @@ mod tests { guard.cutoffs.space = Lsn(0x40); } tline - .compact_with_gc(&cancel, EnumSet::new(), &ctx) + .compact_with_gc(&cancel, CompactOptions::default(), &ctx) .await .unwrap(); @@ -8227,7 +8562,7 @@ mod tests { let cancel = CancellationToken::new(); tline - .compact_with_gc(&cancel, EnumSet::new(), &ctx) + .compact_with_gc(&cancel, CompactOptions::default(), &ctx) .await .unwrap(); @@ -8256,7 +8591,7 @@ mod tests { guard.cutoffs.space = Lsn(0x40); } tline - .compact_with_gc(&cancel, EnumSet::new(), &ctx) + .compact_with_gc(&cancel, CompactOptions::default(), &ctx) .await .unwrap(); @@ -8809,7 +9144,14 @@ mod tests { 
dryrun_flags.insert(CompactFlags::DryRun); tline - .compact_with_gc(&cancel, dryrun_flags, &ctx) + .compact_with_gc( + &cancel, + CompactOptions { + flags: dryrun_flags, + compact_range: None, + }, + &ctx, + ) .await .unwrap(); // We expect layer map to be the same b/c the dry run flag, but we don't know whether there will be other background jobs @@ -8817,14 +9159,14 @@ mod tests { verify_result().await; tline - .compact_with_gc(&cancel, EnumSet::new(), &ctx) + .compact_with_gc(&cancel, CompactOptions::default(), &ctx) .await .unwrap(); verify_result().await; // compact again tline - .compact_with_gc(&cancel, EnumSet::new(), &ctx) + .compact_with_gc(&cancel, CompactOptions::default(), &ctx) .await .unwrap(); verify_result().await; @@ -8837,14 +9179,14 @@ mod tests { guard.cutoffs.space = Lsn(0x38); } tline - .compact_with_gc(&cancel, EnumSet::new(), &ctx) + .compact_with_gc(&cancel, CompactOptions::default(), &ctx) .await .unwrap(); verify_result().await; // no wals between 0x30 and 0x38, so we should obtain the same result // not increasing the GC horizon and compact again tline - .compact_with_gc(&cancel, EnumSet::new(), &ctx) + .compact_with_gc(&cancel, CompactOptions::default(), &ctx) .await .unwrap(); verify_result().await; @@ -9038,7 +9380,14 @@ mod tests { dryrun_flags.insert(CompactFlags::DryRun); tline - .compact_with_gc(&cancel, dryrun_flags, &ctx) + .compact_with_gc( + &cancel, + CompactOptions { + flags: dryrun_flags, + compact_range: None, + }, + &ctx, + ) .await .unwrap(); // We expect layer map to be the same b/c the dry run flag, but we don't know whether there will be other background jobs @@ -9046,14 +9395,14 @@ mod tests { verify_result().await; tline - .compact_with_gc(&cancel, EnumSet::new(), &ctx) + .compact_with_gc(&cancel, CompactOptions::default(), &ctx) .await .unwrap(); verify_result().await; // compact again tline - .compact_with_gc(&cancel, EnumSet::new(), &ctx) + .compact_with_gc(&cancel, CompactOptions::default(), &ctx) .await 
.unwrap(); verify_result().await; @@ -9238,7 +9587,7 @@ mod tests { let cancel = CancellationToken::new(); branch_tline - .compact_with_gc(&cancel, EnumSet::new(), &ctx) + .compact_with_gc(&cancel, CompactOptions::default(), &ctx) .await .unwrap(); diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 4fc9d740c8..92b2200542 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -1719,10 +1719,11 @@ impl TenantManager { parent_layers.push(relative_path.to_owned()); } } - debug_assert!( - !parent_layers.is_empty(), - "shutdown cannot empty the layermap" - ); + + if parent_layers.is_empty() { + tracing::info!("Ancestor shard has no resident layer to hard link"); + } + (parent_timelines, parent_layers) }; diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 94f42c7827..007bd3eef0 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -197,8 +197,9 @@ use utils::backoff::{ self, exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, }; use utils::pausable_failpoint; +use utils::shard::ShardNumber; -use std::collections::{HashMap, VecDeque}; +use std::collections::{HashMap, HashSet, VecDeque}; use std::sync::atomic::{AtomicU32, Ordering}; use std::sync::{Arc, Mutex, OnceLock}; use std::time::Duration; @@ -222,7 +223,7 @@ use crate::task_mgr::shutdown_token; use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::remote_timeline_client::download::download_retry; use crate::tenant::storage_layer::AsLayerDesc; -use crate::tenant::upload_queue::{Delete, UploadQueueStoppedDeletable}; +use crate::tenant::upload_queue::{Delete, OpType, UploadQueueStoppedDeletable}; use crate::tenant::TIMELINES_SEGMENT_NAME; use crate::{ config::PageServerConf, @@ -240,8 +241,10 @@ use utils::id::{TenantId, TimelineId}; use self::index::IndexPart; +use 
super::config::AttachedLocationConfig; use super::metadata::MetadataUpdate; use super::storage_layer::{Layer, LayerName, ResidentLayer}; +use super::timeline::import_pgdata; use super::upload_queue::{NotInitialized, SetDeletedFlagProgress}; use super::{DeleteTimelineError, Generation}; @@ -301,6 +304,36 @@ pub enum WaitCompletionError { #[derive(Debug, thiserror::Error)] #[error("Upload queue either in unexpected state or hasn't downloaded manifest yet")] pub struct UploadQueueNotReadyError; +/// Behavioral modes that enable seamless live migration. +/// +/// See docs/rfcs/028-pageserver-migration.md to understand how these fit in. +struct RemoteTimelineClientConfig { + /// If this is false, then update to remote_consistent_lsn are dropped rather + /// than being submitted to DeletionQueue for validation. This behavior is + /// used when a tenant attachment is known to have a stale generation number, + /// such that validation attempts will always fail. This is not necessary + /// for correctness, but avoids spamming error statistics with failed validations + /// when doing migrations of tenants. + process_remote_consistent_lsn_updates: bool, + + /// If this is true, then object deletions are held in a buffer in RemoteTimelineClient + /// rather than being submitted to the DeletionQueue. This behavior is used when a tenant + /// is known to be multi-attached, in order to avoid disrupting other attached tenants + /// whose generations' metadata refers to the deleted objects. + block_deletions: bool, +} + +/// RemoteTimelineClientConfig's state is entirely driven by LocationConf, but we do +/// not carry the entire LocationConf structure: it's much more than we need. The From +/// impl extracts the subset of the LocationConf that is interesting to RemoteTimelineClient. 
+impl From<&AttachedLocationConfig> for RemoteTimelineClientConfig { + fn from(lc: &AttachedLocationConfig) -> Self { + Self { + block_deletions: !lc.may_delete_layers_hint(), + process_remote_consistent_lsn_updates: lc.may_upload_layers_hint(), + } + } +} /// A client for accessing a timeline's data in remote storage. /// @@ -321,7 +354,7 @@ pub struct UploadQueueNotReadyError; /// in the index part file, whenever timeline metadata is uploaded. /// /// Downloads are not queued, they are performed immediately. -pub struct RemoteTimelineClient { +pub(crate) struct RemoteTimelineClient { conf: &'static PageServerConf, runtime: tokio::runtime::Handle, @@ -338,6 +371,9 @@ pub struct RemoteTimelineClient { deletion_queue_client: DeletionQueueClient, + /// Subset of tenant configuration used to control upload behaviors during migrations + config: std::sync::RwLock, + cancel: CancellationToken, } @@ -348,13 +384,14 @@ impl RemoteTimelineClient { /// Note: the caller must initialize the upload queue before any uploads can be scheduled, /// by calling init_upload_queue. 
/// - pub fn new( + pub(crate) fn new( remote_storage: GenericRemoteStorage, deletion_queue_client: DeletionQueueClient, conf: &'static PageServerConf, tenant_shard_id: TenantShardId, timeline_id: TimelineId, generation: Generation, + location_conf: &AttachedLocationConfig, ) -> RemoteTimelineClient { RemoteTimelineClient { conf, @@ -374,6 +411,7 @@ impl RemoteTimelineClient { &tenant_shard_id, &timeline_id, )), + config: std::sync::RwLock::new(RemoteTimelineClientConfig::from(location_conf)), cancel: CancellationToken::new(), } } @@ -429,6 +467,43 @@ impl RemoteTimelineClient { Ok(()) } + /// Notify this client of a change to its parent tenant's config, as this may cause us to + /// take action (unblocking deletions when transitioning from AttachedMulti to AttachedSingle) + pub(super) fn update_config(&self, location_conf: &AttachedLocationConfig) { + let new_conf = RemoteTimelineClientConfig::from(location_conf); + let unblocked = !new_conf.block_deletions; + + // Update config before draining deletions, so that we don't race with more being + // inserted. This can result in deletions happening our of order, but that does not + // violate any invariants: deletions only need to be ordered relative to upload of the index + // that dereferences the deleted objects, and we are not changing that order. 
+ *self.config.write().unwrap() = new_conf; + + if unblocked { + // If we may now delete layers, drain any that were blocked in our old + // configuration state + let mut queue_locked = self.upload_queue.lock().unwrap(); + + if let Ok(queue) = queue_locked.initialized_mut() { + let blocked_deletions = std::mem::take(&mut queue.blocked_deletions); + for d in blocked_deletions { + if let Err(e) = self.deletion_queue_client.push_layers_sync( + self.tenant_shard_id, + self.timeline_id, + self.generation, + d.layers, + ) { + // This could happen if the pageserver is shut down while a tenant + // is transitioning from a deletion-blocked state: we will leak some + // S3 objects in this case. + warn!("Failed to drain blocked deletions: {}", e); + break; + } + } + } + } + } + /// Returns `None` if nothing is yet uplodaded, `Some(disk_consistent_lsn)` otherwise. pub fn remote_consistent_lsn_projected(&self) -> Option { match &mut *self.upload_queue.lock().unwrap() { @@ -739,6 +814,18 @@ impl RemoteTimelineClient { Ok(need_wait) } + /// Launch an index-file upload operation in the background, setting `import_pgdata` field. + pub(crate) fn schedule_index_upload_for_import_pgdata_state_update( + self: &Arc, + state: Option, + ) -> anyhow::Result<()> { + let mut guard = self.upload_queue.lock().unwrap(); + let upload_queue = guard.initialized_mut()?; + upload_queue.dirty.import_pgdata = state; + self.schedule_index_upload(upload_queue)?; + Ok(()) + } + /// /// Launch an index-file upload operation in the background, if necessary. /// @@ -1016,7 +1103,7 @@ impl RemoteTimelineClient { "scheduled layer file upload {layer}", ); - let op = UploadOp::UploadLayer(layer, metadata); + let op = UploadOp::UploadLayer(layer, metadata, None); self.metric_begin(&op); upload_queue.queued_operations.push_back(op); } @@ -1731,7 +1818,7 @@ impl RemoteTimelineClient { // have finished. upload_queue.inprogress_tasks.is_empty() } - UploadOp::Delete(_) => { + UploadOp::Delete(..) 
=> { // Wait for preceding uploads to finish. Concurrent deletions are OK, though. upload_queue.num_inprogress_deletions == upload_queue.inprogress_tasks.len() } @@ -1759,19 +1846,32 @@ impl RemoteTimelineClient { } // We can launch this task. Remove it from the queue first. - let next_op = upload_queue.queued_operations.pop_front().unwrap(); + let mut next_op = upload_queue.queued_operations.pop_front().unwrap(); debug!("starting op: {}", next_op); - // Update the counters - match next_op { - UploadOp::UploadLayer(_, _) => { + // Update the counters and prepare + match &mut next_op { + UploadOp::UploadLayer(layer, meta, mode) => { + if upload_queue + .recently_deleted + .remove(&(layer.layer_desc().layer_name().clone(), meta.generation)) + { + *mode = Some(OpType::FlushDeletion); + } else { + *mode = Some(OpType::MayReorder) + } upload_queue.num_inprogress_layer_uploads += 1; } UploadOp::UploadMetadata { .. } => { upload_queue.num_inprogress_metadata_uploads += 1; } - UploadOp::Delete(_) => { + UploadOp::Delete(Delete { layers }) => { + for (name, meta) in layers { + upload_queue + .recently_deleted + .insert((name.clone(), meta.generation)); + } upload_queue.num_inprogress_deletions += 1; } UploadOp::Barrier(sender) => { @@ -1847,7 +1947,66 @@ impl RemoteTimelineClient { } let upload_result: anyhow::Result<()> = match &task.op { - UploadOp::UploadLayer(ref layer, ref layer_metadata) => { + UploadOp::UploadLayer(ref layer, ref layer_metadata, mode) => { + if let Some(OpType::FlushDeletion) = mode { + if self.config.read().unwrap().block_deletions { + // Of course, this is not efficient... but usually the queue should be empty. 
+ let mut queue_locked = self.upload_queue.lock().unwrap(); + let mut detected = false; + if let Ok(queue) = queue_locked.initialized_mut() { + for list in queue.blocked_deletions.iter_mut() { + list.layers.retain(|(name, meta)| { + if name == &layer.layer_desc().layer_name() + && meta.generation == layer_metadata.generation + { + detected = true; + // remove the layer from deletion queue + false + } else { + // keep the layer + true + } + }); + } + } + if detected { + info!( + "cancelled blocked deletion of layer {} at gen {:?}", + layer.layer_desc().layer_name(), + layer_metadata.generation + ); + } + } else { + // TODO: we did not guarantee that upload task starts after deletion task, so there could be possibly race conditions + // that we still get the layer deleted. But this only happens if someone creates a layer immediately after it's deleted, + // which is not possible in the current system. + info!( + "waiting for deletion queue flush to complete before uploading layer {} at gen {:?}", + layer.layer_desc().layer_name(), + layer_metadata.generation + ); + { + // We are going to flush, we can clean up the recently deleted list. 
+ let mut queue_locked = self.upload_queue.lock().unwrap(); + if let Ok(queue) = queue_locked.initialized_mut() { + queue.recently_deleted.clear(); + } + } + if let Err(e) = self.deletion_queue_client.flush_execute().await { + warn!( + "failed to flush the deletion queue before uploading layer {} at gen {:?}, still proceeding to upload: {e:#} ", + layer.layer_desc().layer_name(), + layer_metadata.generation + ); + } else { + info!( + "done flushing deletion queue before uploading layer {} at gen {:?}", + layer.layer_desc().layer_name(), + layer_metadata.generation + ); + } + } + } let local_path = layer.local_path(); // We should only be uploading layers created by this `Tenant`'s lifetime, so @@ -1912,16 +2071,24 @@ impl RemoteTimelineClient { res } UploadOp::Delete(delete) => { - pausable_failpoint!("before-delete-layer-pausable"); - self.deletion_queue_client - .push_layers( - self.tenant_shard_id, - self.timeline_id, - self.generation, - delete.layers.clone(), - ) - .await - .map_err(|e| anyhow::anyhow!(e)) + if self.config.read().unwrap().block_deletions { + let mut queue_locked = self.upload_queue.lock().unwrap(); + if let Ok(queue) = queue_locked.initialized_mut() { + queue.blocked_deletions.push(delete.clone()); + } + Ok(()) + } else { + pausable_failpoint!("before-delete-layer-pausable"); + self.deletion_queue_client + .push_layers( + self.tenant_shard_id, + self.timeline_id, + self.generation, + delete.layers.clone(), + ) + .await + .map_err(|e| anyhow::anyhow!(e)) + } } unexpected @ UploadOp::Barrier(_) | unexpected @ UploadOp::Shutdown => { // unreachable. 
Barrier operations are handled synchronously in @@ -2003,7 +2170,7 @@ impl RemoteTimelineClient { upload_queue.inprogress_tasks.remove(&task.task_id); let lsn_update = match task.op { - UploadOp::UploadLayer(_, _) => { + UploadOp::UploadLayer(_, _, _) => { upload_queue.num_inprogress_layer_uploads -= 1; None } @@ -2028,8 +2195,16 @@ impl RemoteTimelineClient { // Legacy mode: skip validating generation upload_queue.visible_remote_consistent_lsn.store(lsn); None - } else { + } else if self + .config + .read() + .unwrap() + .process_remote_consistent_lsn_updates + { Some((lsn, upload_queue.visible_remote_consistent_lsn.clone())) + } else { + // Our config disables remote_consistent_lsn updates: drop it. + None } } UploadOp::Delete(_) => { @@ -2072,7 +2247,7 @@ impl RemoteTimelineClient { )> { use RemoteTimelineClientMetricsCallTrackSize::DontTrackSize; let res = match op { - UploadOp::UploadLayer(_, m) => ( + UploadOp::UploadLayer(_, m, _) => ( RemoteOpFileKind::Layer, RemoteOpKind::Upload, RemoteTimelineClientMetricsCallTrackSize::Bytes(m.file_size), @@ -2166,8 +2341,10 @@ impl RemoteTimelineClient { queued_operations: VecDeque::default(), #[cfg(feature = "testing")] dangling_files: HashMap::default(), + blocked_deletions: Vec::new(), shutting_down: false, shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)), + recently_deleted: HashSet::new(), }; let upload_queue = std::mem::replace( @@ -2231,6 +2408,28 @@ impl RemoteTimelineClient { UploadQueue::Initialized(x) => x.no_pending_work(), } } + + /// 'foreign' in the sense that it does not belong to this tenant shard. This method + /// is used during GC for other shards to get the index of shard zero. 
+ pub(crate) async fn download_foreign_index( + &self, + shard_number: ShardNumber, + cancel: &CancellationToken, + ) -> Result<(IndexPart, Generation, std::time::SystemTime), DownloadError> { + let foreign_shard_id = TenantShardId { + shard_number, + shard_count: self.tenant_shard_id.shard_count, + tenant_id: self.tenant_shard_id.tenant_id, + }; + download_index_part( + &self.storage_impl, + &foreign_shard_id, + &self.timeline_id, + Generation::MAX, + cancel, + ) + .await + } } pub(crate) struct UploadQueueAccessor<'a> { @@ -2379,6 +2578,7 @@ mod tests { use crate::{ context::RequestContext, tenant::{ + config::AttachmentMode, harness::{TenantHarness, TIMELINE_ID}, storage_layer::layer::local_layer_path, Tenant, Timeline, @@ -2464,6 +2664,10 @@ mod tests { /// Construct a RemoteTimelineClient in an arbitrary generation fn build_client(&self, generation: Generation) -> Arc { + let location_conf = AttachedLocationConfig { + generation, + attach_mode: AttachmentMode::Single, + }; Arc::new(RemoteTimelineClient { conf: self.harness.conf, runtime: tokio::runtime::Handle::current(), @@ -2477,6 +2681,7 @@ mod tests { &self.harness.tenant_shard_id, &TIMELINE_ID, )), + config: std::sync::RwLock::new(RemoteTimelineClientConfig::from(&location_conf)), cancel: CancellationToken::new(), }) } diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index efcd20d1bf..d632e595ad 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -706,7 +706,7 @@ where .and_then(|x| x) } -async fn download_retry_forever( +pub(crate) async fn download_retry_forever( op: O, description: &str, cancel: &CancellationToken, diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index d8a881a2c4..506990fb2f 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ 
b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -12,6 +12,7 @@ use utils::id::TimelineId; use crate::tenant::metadata::TimelineMetadata; use crate::tenant::storage_layer::LayerName; +use crate::tenant::timeline::import_pgdata; use crate::tenant::Generation; use pageserver_api::shard::ShardIndex; @@ -37,6 +38,13 @@ pub struct IndexPart { #[serde(skip_serializing_if = "Option::is_none")] pub archived_at: Option, + /// This field supports import-from-pgdata ("fast imports" platform feature). + /// We don't currently use fast imports, so, this field is None for all production timelines. + /// See for more information. + #[serde(default)] + #[serde(skip_serializing_if = "Option::is_none")] + pub import_pgdata: Option, + /// Per layer file name metadata, which can be present for a present or missing layer file. /// /// Older versions of `IndexPart` will not have this property or have only a part of metadata @@ -90,10 +98,11 @@ impl IndexPart { /// - 7: metadata_bytes is no longer written, but still read /// - 8: added `archived_at` /// - 9: +gc_blocking - const LATEST_VERSION: usize = 9; + /// - 10: +import_pgdata + const LATEST_VERSION: usize = 10; // Versions we may see when reading from a bucket. 
- pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8, 9]; + pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]; pub const FILE_NAME: &'static str = "index_part.json"; @@ -108,6 +117,7 @@ impl IndexPart { lineage: Default::default(), gc_blocking: None, last_aux_file_policy: None, + import_pgdata: None, } } @@ -381,6 +391,7 @@ mod tests { lineage: Lineage::default(), gc_blocking: None, last_aux_file_policy: None, + import_pgdata: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -425,6 +436,7 @@ mod tests { lineage: Lineage::default(), gc_blocking: None, last_aux_file_policy: None, + import_pgdata: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -470,6 +482,7 @@ mod tests { lineage: Lineage::default(), gc_blocking: None, last_aux_file_policy: None, + import_pgdata: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -518,6 +531,7 @@ mod tests { lineage: Lineage::default(), gc_blocking: None, last_aux_file_policy: None, + import_pgdata: None, }; let empty_layers_parsed = IndexPart::from_json_bytes(empty_layers_json.as_bytes()).unwrap(); @@ -561,6 +575,7 @@ mod tests { lineage: Lineage::default(), gc_blocking: None, last_aux_file_policy: None, + import_pgdata: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -607,6 +622,7 @@ mod tests { }, gc_blocking: None, last_aux_file_policy: None, + import_pgdata: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -658,6 +674,7 @@ mod tests { }, gc_blocking: None, last_aux_file_policy: Some(AuxFilePolicy::V2), + import_pgdata: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -714,6 +731,7 @@ mod tests { lineage: Default::default(), gc_blocking: None, last_aux_file_policy: Default::default(), + import_pgdata: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -771,6 +789,7 @@ mod 
tests { lineage: Default::default(), gc_blocking: None, last_aux_file_policy: Default::default(), + import_pgdata: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -833,6 +852,83 @@ mod tests { }), last_aux_file_policy: Default::default(), archived_at: None, + import_pgdata: None, + }; + + let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); + assert_eq!(part, expected); + } + + #[test] + fn v10_importpgdata_is_parsed() { + let example = r#"{ + "version": 10, + "layer_metadata":{ + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 }, + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 } + }, + "disk_consistent_lsn":"0/16960E8", + "metadata": { + "disk_consistent_lsn": "0/16960E8", + "prev_record_lsn": "0/1696070", + "ancestor_timeline": "e45a7f37d3ee2ff17dc14bf4f4e3f52e", + "ancestor_lsn": "0/0", + "latest_gc_cutoff_lsn": "0/1696070", + "initdb_lsn": "0/1696070", + "pg_version": 14 + }, + "gc_blocking": { + "started_at": "2024-07-19T09:00:00.123", + "reasons": ["DetachAncestor"] + }, + "import_pgdata": { + "V1": { + "Done": { + "idempotency_key": "specified-by-client-218a5213-5044-4562-a28d-d024c5f057f5", + "started_at": "2024-11-13T09:23:42.123", + "finished_at": "2024-11-13T09:42:23.123" + } + } + } + }"#; + + let expected = IndexPart { + version: 10, + layer_metadata: HashMap::from([ + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata { + file_size: 25600000, + generation: Generation::none(), + shard: ShardIndex::unsharded() + }), + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata { + file_size: 9007199254741001, + generation: Generation::none(), 
+ shard: ShardIndex::unsharded() + }) + ]), + disk_consistent_lsn: "0/16960E8".parse::().unwrap(), + metadata: TimelineMetadata::new( + Lsn::from_str("0/16960E8").unwrap(), + Some(Lsn::from_str("0/1696070").unwrap()), + Some(TimelineId::from_str("e45a7f37d3ee2ff17dc14bf4f4e3f52e").unwrap()), + Lsn::INVALID, + Lsn::from_str("0/1696070").unwrap(), + Lsn::from_str("0/1696070").unwrap(), + 14, + ).with_recalculated_checksum().unwrap(), + deleted_at: None, + lineage: Default::default(), + gc_blocking: Some(GcBlocking { + started_at: parse_naive_datetime("2024-07-19T09:00:00.123000000"), + reasons: enumset::EnumSet::from_iter([GcBlockingReason::DetachAncestor]), + }), + last_aux_file_policy: Default::default(), + archived_at: None, + import_pgdata: Some(import_pgdata::index_part_format::Root::V1(import_pgdata::index_part_format::V1::Done(import_pgdata::index_part_format::Done{ + started_at: parse_naive_datetime("2024-11-13T09:23:42.123000000"), + finished_at: parse_naive_datetime("2024-11-13T09:42:23.123000000"), + idempotency_key: import_pgdata::index_part_format::IdempotencyKey::new("specified-by-client-218a5213-5044-4562-a28d-d024c5f057f5".to_string()), + }))) }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs index 1331c07d05..3df89a928c 100644 --- a/pageserver/src/tenant/secondary.rs +++ b/pageserver/src/tenant/secondary.rs @@ -111,15 +111,6 @@ pub(crate) struct SecondaryTenant { pub(super) heatmap_total_size_metric: UIntGauge, } -impl Drop for SecondaryTenant { - fn drop(&mut self) { - let tenant_id = self.tenant_shard_id.tenant_id.to_string(); - let shard_id = format!("{}", self.tenant_shard_id.shard_slug()); - let _ = SECONDARY_RESIDENT_PHYSICAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]); - let _ = SECONDARY_HEATMAP_TOTAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]); - } -} - impl SecondaryTenant { pub(crate) fn new( tenant_shard_id: 
TenantShardId, @@ -167,6 +158,13 @@ impl SecondaryTenant { // Wait for any secondary downloader work to complete self.gate.close().await; + + self.validate_metrics(); + + let tenant_id = self.tenant_shard_id.tenant_id.to_string(); + let shard_id = format!("{}", self.tenant_shard_id.shard_slug()); + let _ = SECONDARY_RESIDENT_PHYSICAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]); + let _ = SECONDARY_HEATMAP_TOTAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]); } pub(crate) fn set_config(&self, config: &SecondaryLocationConfig) { @@ -254,6 +252,20 @@ impl SecondaryTenant { .await .expect("secondary eviction should not have panicked"); } + + /// Exhaustive check that incrementally updated metrics match the actual state. + #[cfg(feature = "testing")] + fn validate_metrics(&self) { + let detail = self.detail.lock().unwrap(); + let resident_size = detail.total_resident_size(); + + assert_eq!(resident_size, self.resident_size_metric.get()); + } + + #[cfg(not(feature = "testing"))] + fn validate_metrics(&self) { + // No-op in non-testing builds + } } /// The SecondaryController is a pseudo-rpc client for administrative control of secondary mode downloads, diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 82c5702686..7443261a9c 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -242,6 +242,19 @@ impl SecondaryDetail { } } + #[cfg(feature = "testing")] + pub(crate) fn total_resident_size(&self) -> u64 { + self.timelines + .values() + .map(|tl| { + tl.on_disk_layers + .values() + .map(|v| v.metadata.file_size) + .sum::() + }) + .sum::() + } + pub(super) fn evict_layer( &mut self, name: LayerName, @@ -763,24 +776,7 @@ impl<'a> TenantDownloader<'a> { } // Metrics consistency check in testing builds - if cfg!(feature = "testing") { - let detail = self.secondary_state.detail.lock().unwrap(); - let resident_size = detail - .timelines - .values() - 
.map(|tl| { - tl.on_disk_layers - .values() - .map(|v| v.metadata.file_size) - .sum::() - }) - .sum::(); - assert_eq!( - resident_size, - self.secondary_state.resident_size_metric.get() - ); - } - + self.secondary_state.validate_metrics(); // Only update last_etag after a full successful download: this way will not skip // the next download, even if the heatmap's actual etag is unchanged. self.secondary_state.detail.lock().unwrap().last_download = Some(DownloadSummary { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 2bc14ec317..4881be33a6 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -4,6 +4,7 @@ pub mod delete; pub(crate) mod detach_ancestor; mod eviction_task; pub(crate) mod handle; +pub(crate) mod import_pgdata; mod init; pub mod layer_manager; pub(crate) mod logical_size; @@ -38,6 +39,7 @@ use pageserver_api::{ shard::{ShardIdentity, ShardNumber, TenantShardId}, }; use rand::Rng; +use remote_storage::DownloadError; use serde_with::serde_as; use storage_broker::BrokerClientChannel; use tokio::{ @@ -272,7 +274,7 @@ pub struct Timeline { /// Remote storage client. /// See [`remote_timeline_client`](super::remote_timeline_client) module comment for details. - pub remote_client: Arc, + pub(crate) remote_client: Arc, // What page versions do we hold in the repository? If we get a // request > last_record_lsn, we need to wait until we receive all @@ -481,17 +483,27 @@ impl GcInfo { &mut self, child_id: TimelineId, maybe_offloaded: MaybeOffloaded, - ) { - self.retain_lsns - .retain(|i| !(i.1 == child_id && i.2 == maybe_offloaded)); + ) -> bool { + // Remove at most one element. Needed for correctness if there is two live `Timeline` objects referencing + // the same timeline. Shouldn't but maybe can occur when Arc's live longer than intended. 
+ let mut removed = false; + self.retain_lsns.retain(|i| { + if removed { + return true; + } + let remove = i.1 == child_id && i.2 == maybe_offloaded; + removed |= remove; + !remove + }); + removed } - pub(super) fn remove_child_not_offloaded(&mut self, child_id: TimelineId) { - self.remove_child_maybe_offloaded(child_id, MaybeOffloaded::No); + pub(super) fn remove_child_not_offloaded(&mut self, child_id: TimelineId) -> bool { + self.remove_child_maybe_offloaded(child_id, MaybeOffloaded::No) } - pub(super) fn remove_child_offloaded(&mut self, child_id: TimelineId) { - self.remove_child_maybe_offloaded(child_id, MaybeOffloaded::Yes); + pub(super) fn remove_child_offloaded(&mut self, child_id: TimelineId) -> bool { + self.remove_child_maybe_offloaded(child_id, MaybeOffloaded::Yes) } } @@ -764,6 +776,21 @@ pub(crate) enum CompactFlags { DryRun, } +#[serde_with::serde_as] +#[derive(Debug, Clone, serde::Deserialize)] +pub(crate) struct CompactRange { + #[serde_as(as = "serde_with::DisplayFromStr")] + pub start: Key, + #[serde_as(as = "serde_with::DisplayFromStr")] + pub end: Key, +} + +#[derive(Clone, Default)] +pub(crate) struct CompactOptions { + pub flags: EnumSet, + pub compact_range: Option, +} + impl std::fmt::Debug for Timeline { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { write!(f, "Timeline<{}>", self.timeline_id) @@ -1602,6 +1629,25 @@ impl Timeline { cancel: &CancellationToken, flags: EnumSet, ctx: &RequestContext, + ) -> Result { + self.compact_with_options( + cancel, + CompactOptions { + flags, + compact_range: None, + }, + ctx, + ) + .await + } + + /// Outermost timeline compaction operation; downloads needed layers. Returns whether we have pending + /// compaction tasks. 
+ pub(crate) async fn compact_with_options( + self: &Arc, + cancel: &CancellationToken, + options: CompactOptions, + ctx: &RequestContext, ) -> Result { // most likely the cancellation token is from background task, but in tests it could be the // request task as well. @@ -1639,7 +1685,7 @@ impl Timeline { self.compact_tiered(cancel, ctx).await?; Ok(false) } - CompactionAlgorithm::Legacy => self.compact_legacy(cancel, flags, ctx).await, + CompactionAlgorithm::Legacy => self.compact_legacy(cancel, options, ctx).await, } } @@ -2040,6 +2086,11 @@ impl Timeline { .unwrap_or(self.conf.default_tenant_conf.lsn_lease_length_for_ts) } + pub(crate) fn is_gc_blocked_by_lsn_lease_deadline(&self) -> bool { + let tenant_conf = self.tenant_conf.load(); + tenant_conf.is_gc_blocked_by_lsn_lease_deadline() + } + pub(crate) fn get_lazy_slru_download(&self) -> bool { let tenant_conf = self.tenant_conf.load(); tenant_conf @@ -2127,14 +2178,14 @@ impl Timeline { ) } - pub(super) fn tenant_conf_updated(&self, new_conf: &TenantConfOpt) { + pub(super) fn tenant_conf_updated(&self, new_conf: &AttachedTenantConf) { // NB: Most tenant conf options are read by background loops, so, // changes will automatically be picked up. // The threshold is embedded in the metric. So, we need to update it. { let new_threshold = Self::get_evictions_low_residence_duration_metric_threshold( - new_conf, + &new_conf.tenant_conf, &self.conf.default_tenant_conf, ); @@ -2142,6 +2193,9 @@ impl Timeline { let shard_id_str = format!("{}", self.tenant_shard_id.shard_slug()); let timeline_id_str = self.timeline_id.to_string(); + + self.remote_client.update_config(&new_conf.location); + self.metrics .evictions_with_low_residence_duration .write() @@ -2599,6 +2653,7 @@ impl Timeline { // // NB: generation numbers naturally protect against this because they disambiguate // (1) and (4) + // TODO: this is basically a no-op now, should we remove it? 
self.remote_client.schedule_barrier()?; // Tenant::create_timeline will wait for these uploads to happen before returning, or // on retry. @@ -2654,20 +2709,23 @@ impl Timeline { { Some(cancel) => cancel.cancel(), None => { - let state = self.current_state(); - if matches!( - state, - TimelineState::Broken { .. } | TimelineState::Stopping - ) { - - // Can happen when timeline detail endpoint is used when deletion is ongoing (or its broken). - // Don't make noise. - } else { - warn!("unexpected: cancel_wait_for_background_loop_concurrency_limit_semaphore not set, priority-boosting of logical size calculation will not work"); - debug_assert!(false); + match self.current_state() { + TimelineState::Broken { .. } | TimelineState::Stopping => { + // Can happen when timeline detail endpoint is used when deletion is ongoing (or its broken). + // Don't make noise. + } + TimelineState::Loading => { + // Import does not return an activated timeline. + info!("discarding priority boost for logical size calculation because timeline is not yet active"); + } + TimelineState::Active => { + // activation should be setting the once cell + warn!("unexpected: cancel_wait_for_background_loop_concurrency_limit_semaphore not set, priority-boosting of logical size calculation will not work"); + debug_assert!(false); + } } } - }; + } } } @@ -4514,7 +4572,10 @@ impl Drop for Timeline { // This lock should never be poisoned, but in case it is we do a .map() instead of // an unwrap(), to avoid panicking in a destructor and thereby aborting the process. 
if let Ok(mut gc_info) = ancestor.gc_info.write() { - gc_info.remove_child_not_offloaded(self.timeline_id) + if !gc_info.remove_child_not_offloaded(self.timeline_id) { + tracing::error!(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id, + "Couldn't remove retain_lsn entry from offloaded timeline's parent: already removed"); + } } } } @@ -4774,6 +4835,86 @@ impl Timeline { Ok(()) } + async fn find_gc_time_cutoff( + &self, + pitr: Duration, + cancel: &CancellationToken, + ctx: &RequestContext, + ) -> Result, PageReconstructError> { + debug_assert_current_span_has_tenant_and_timeline_id(); + if self.shard_identity.is_shard_zero() { + // Shard Zero has SLRU data and can calculate the PITR time -> LSN mapping itself + let now = SystemTime::now(); + let time_range = if pitr == Duration::ZERO { + humantime::parse_duration(DEFAULT_PITR_INTERVAL).expect("constant is invalid") + } else { + pitr + }; + + // If PITR is so large or `now` is so small that this underflows, we will retain no history (highly unexpected case) + let time_cutoff = now.checked_sub(time_range).unwrap_or(now); + let timestamp = to_pg_timestamp(time_cutoff); + + let time_cutoff = match self.find_lsn_for_timestamp(timestamp, cancel, ctx).await? { + LsnForTimestamp::Present(lsn) => Some(lsn), + LsnForTimestamp::Future(lsn) => { + // The timestamp is in the future. That sounds impossible, + // but what it really means is that there hasn't been + // any commits since the cutoff timestamp. + // + // In this case we should use the LSN of the most recent commit, + // which is implicitly the last LSN in the log. 
+ debug!("future({})", lsn); + Some(self.get_last_record_lsn()) + } + LsnForTimestamp::Past(lsn) => { + debug!("past({})", lsn); + None + } + LsnForTimestamp::NoData(lsn) => { + debug!("nodata({})", lsn); + None + } + }; + Ok(time_cutoff) + } else { + // Shards other than shard zero cannot do timestamp->lsn lookups, and must instead learn their GC cutoff + // from shard zero's index. The index doesn't explicitly tell us the time cutoff, but we may assume that + // the point up to which shard zero's last_gc_cutoff has advanced will either be the time cutoff, or a + // space cutoff that we would also have respected ourselves. + match self + .remote_client + .download_foreign_index(ShardNumber(0), cancel) + .await + { + Ok((index_part, index_generation, _index_mtime)) => { + tracing::info!("GC loaded shard zero metadata (gen {index_generation:?}): latest_gc_cutoff_lsn: {}", + index_part.metadata.latest_gc_cutoff_lsn()); + Ok(Some(index_part.metadata.latest_gc_cutoff_lsn())) + } + Err(DownloadError::NotFound) => { + // This is unexpected, because during timeline creations shard zero persists to remote + // storage before other shards are called, and during timeline deletion non-zeroth shards are + // deleted before the zeroth one. However, it should be harmless: if we somehow end up in this + // state, then shard zero should _eventually_ write an index when it GCs. + tracing::warn!("GC couldn't find shard zero's index for timeline"); + Ok(None) + } + Err(e) => { + // TODO: this function should return a different error type than page reconstruct error + Err(PageReconstructError::Other(anyhow::anyhow!(e))) + } + } + + // TODO: after reading shard zero's GC cutoff, we should validate its generation with the storage + // controller. Otherwise, it is possible that we see the GC cutoff go backwards while shard zero + // is going through a migration if we read the old location's index and it has GC'd ahead of the + // new location. 
This is legal in principle, but problematic in practice because it might result + // in a timeline creation succeeding on shard zero ('s new location) but then failing on other shards + // because they have GC'd past the branch point. + } + } + /// Find the Lsns above which layer files need to be retained on /// garbage collection. /// @@ -4816,40 +4957,7 @@ impl Timeline { // - if PITR interval is set, then this is our cutoff. // - if PITR interval is not set, then we do a lookup // based on DEFAULT_PITR_INTERVAL, so that size-based retention does not result in keeping history around permanently on idle databases. - let time_cutoff = { - let now = SystemTime::now(); - let time_range = if pitr == Duration::ZERO { - humantime::parse_duration(DEFAULT_PITR_INTERVAL).expect("constant is invalid") - } else { - pitr - }; - - // If PITR is so large or `now` is so small that this underflows, we will retain no history (highly unexpected case) - let time_cutoff = now.checked_sub(time_range).unwrap_or(now); - let timestamp = to_pg_timestamp(time_cutoff); - - match self.find_lsn_for_timestamp(timestamp, cancel, ctx).await? { - LsnForTimestamp::Present(lsn) => Some(lsn), - LsnForTimestamp::Future(lsn) => { - // The timestamp is in the future. That sounds impossible, - // but what it really means is that there hasn't been - // any commits since the cutoff timestamp. - // - // In this case we should use the LSN of the most recent commit, - // which is implicitly the last LSN in the log. 
- debug!("future({})", lsn); - Some(self.get_last_record_lsn()) - } - LsnForTimestamp::Past(lsn) => { - debug!("past({})", lsn); - None - } - LsnForTimestamp::NoData(lsn) => { - debug!("nodata({})", lsn); - None - } - } - }; + let time_cutoff = self.find_gc_time_cutoff(pitr, cancel, ctx).await?; Ok(match (pitr, time_cutoff) { (Duration::ZERO, Some(time_cutoff)) => { diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index e6ef1aae2b..ecd68ba55e 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -10,7 +10,7 @@ use std::sync::Arc; use super::layer_manager::LayerManager; use super::{ - CompactFlags, CreateImageLayersError, DurationRecorder, ImageLayerCreationMode, + CompactFlags, CompactOptions, CreateImageLayersError, DurationRecorder, ImageLayerCreationMode, RecordedDuration, Timeline, }; @@ -273,22 +273,32 @@ impl Timeline { pub(crate) async fn compact_legacy( self: &Arc, cancel: &CancellationToken, - flags: EnumSet, + options: CompactOptions, ctx: &RequestContext, ) -> Result { - if flags.contains(CompactFlags::EnhancedGcBottomMostCompaction) { - self.compact_with_gc(cancel, flags, ctx) + if options + .flags + .contains(CompactFlags::EnhancedGcBottomMostCompaction) + { + self.compact_with_gc(cancel, options, ctx) .await .map_err(CompactionError::Other)?; return Ok(false); } - if flags.contains(CompactFlags::DryRun) { + if options.flags.contains(CompactFlags::DryRun) { return Err(CompactionError::Other(anyhow!( "dry-run mode is not supported for legacy compaction for now" ))); } + if options.compact_range.is_some() { + // maybe useful in the future? could implement this at some point + return Err(CompactionError::Other(anyhow!( + "compaction range is not supported for legacy compaction for now" + ))); + } + // High level strategy for compaction / image creation: // // 1. 
First, calculate the desired "partitioning" of the @@ -338,7 +348,7 @@ impl Timeline { .repartition( self.get_last_record_lsn(), self.get_compaction_target_size(), - flags, + options.flags, ctx, ) .await @@ -354,7 +364,7 @@ impl Timeline { let fully_compacted = self .compact_level0( target_file_size, - flags.contains(CompactFlags::ForceL0Compaction), + options.flags.contains(CompactFlags::ForceL0Compaction), ctx, ) .await?; @@ -372,7 +382,10 @@ impl Timeline { .create_image_layers( &partitioning, lsn, - if flags.contains(CompactFlags::ForceImageLayerCreation) { + if options + .flags + .contains(CompactFlags::ForceImageLayerCreation) + { ImageLayerCreationMode::Force } else { ImageLayerCreationMode::Try @@ -1736,11 +1749,19 @@ impl Timeline { pub(crate) async fn compact_with_gc( self: &Arc, cancel: &CancellationToken, - flags: EnumSet, + options: CompactOptions, ctx: &RequestContext, ) -> anyhow::Result<()> { - self.partial_compact_with_gc(Key::MIN..Key::MAX, cancel, flags, ctx) - .await + self.partial_compact_with_gc( + options + .compact_range + .map(|range| range.start..range.end) + .unwrap_or_else(|| Key::MIN..Key::MAX), + cancel, + options.flags, + ctx, + ) + .await } /// An experimental compaction building block that combines compaction with garbage collection. @@ -2021,6 +2042,14 @@ impl Timeline { if cancel.is_cancelled() { return Err(anyhow!("cancelled")); // TODO: refactor to CompactionError and pass cancel error } + if self.shard_identity.is_key_disposable(&key) { + // If this shard does not need to store this key, simply skip it. + // + // This is not handled in the filter iterator because shard is determined by hash. + // Therefore, it does not give us any performance benefit to do things like skip + // a whole layer file as handling key spaces (ranges). 
+ continue; + } if !job_desc.compaction_key_range.contains(&key) { if !desc.is_delta { continue; diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 13a8dfa51a..67fc710c44 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -283,7 +283,7 @@ impl DeleteTimelineFlow { /// Shortcut to create Timeline in stopping state and spawn deletion task. #[instrument(skip_all, fields(%timeline_id))] - pub async fn resume_deletion( + pub(crate) async fn resume_deletion( tenant: Arc, timeline_id: TimelineId, local_metadata: &TimelineMetadata, diff --git a/pageserver/src/tenant/timeline/import_pgdata.rs b/pageserver/src/tenant/timeline/import_pgdata.rs new file mode 100644 index 0000000000..de56468580 --- /dev/null +++ b/pageserver/src/tenant/timeline/import_pgdata.rs @@ -0,0 +1,218 @@ +use std::sync::Arc; + +use anyhow::{bail, Context}; +use remote_storage::RemotePath; +use tokio_util::sync::CancellationToken; +use tracing::{info, info_span, Instrument}; +use utils::lsn::Lsn; + +use crate::{context::RequestContext, tenant::metadata::TimelineMetadata}; + +use super::Timeline; + +mod flow; +mod importbucket_client; +mod importbucket_format; +pub(crate) mod index_part_format; +pub(crate) mod upcall_api; + +pub async fn doit( + timeline: &Arc, + index_part: index_part_format::Root, + ctx: &RequestContext, + cancel: CancellationToken, +) -> anyhow::Result<()> { + let index_part_format::Root::V1(v1) = index_part; + let index_part_format::InProgress { + location, + idempotency_key, + started_at, + } = match v1 { + index_part_format::V1::Done(_) => return Ok(()), + index_part_format::V1::InProgress(in_progress) => in_progress, + }; + + let storage = importbucket_client::new(timeline.conf, &location, cancel.clone()).await?; + + info!("get spec early so we know we'll be able to upcall when done"); + let Some(spec) = storage.get_spec().await? 
else { + bail!("spec not found") + }; + + let upcall_client = + upcall_api::Client::new(timeline.conf, cancel.clone()).context("create upcall client")?; + + // + // send an early progress update to clean up k8s job early and generate potentially useful logs + // + info!("send early progress update"); + upcall_client + .send_progress_until_success(&spec) + .instrument(info_span!("early_progress_update")) + .await?; + + let status_prefix = RemotePath::from_string("status").unwrap(); + + // + // See if shard is done. + // TODO: incorporate generations into status key for split brain safety. Figure out together with checkpointing. + // + let shard_status_key = + status_prefix.join(format!("shard-{}", timeline.tenant_shard_id.shard_slug())); + let shard_status: Option = + storage.get_json(&shard_status_key).await?; + info!(?shard_status, "peeking shard status"); + if shard_status.map(|st| st.done).unwrap_or(false) { + info!("shard status indicates that the shard is done, skipping import"); + } else { + // TODO: checkpoint the progress into the IndexPart instead of restarting + // from the beginning. + + // + // Wipe the slate clean - the flow does not allow resuming. + // We can implement resuming in the future by checkpointing the progress into the IndexPart. + // + info!("wipe the slate clean"); + { + // TODO: do we need to hold GC lock for this? 
+ let mut guard = timeline.layers.write().await; + assert!( + guard.layer_map()?.open_layer.is_none(), + "while importing, there should be no in-memory layer" // this just seems like a good place to assert it + ); + let all_layers_keys = guard.all_persistent_layers(); + let all_layers: Vec<_> = all_layers_keys + .iter() + .map(|key| guard.get_from_key(key)) + .collect(); + let open = guard.open_mut().context("open_mut")?; + + timeline.remote_client.schedule_gc_update(&all_layers)?; + open.finish_gc_timeline(&all_layers); + } + + // + // Wait for pgdata to finish uploading + // + info!("wait for pgdata to reach status 'done'"); + let pgdata_status_key = status_prefix.join("pgdata"); + loop { + let res = async { + let pgdata_status: Option = storage + .get_json(&pgdata_status_key) + .await + .context("get pgdata status")?; + info!(?pgdata_status, "peeking pgdata status"); + if pgdata_status.map(|st| st.done).unwrap_or(false) { + Ok(()) + } else { + Err(anyhow::anyhow!("pgdata not done yet")) + } + } + .await; + match res { + Ok(_) => break, + Err(err) => { + info!(?err, "indefintely waiting for pgdata to finish"); + if tokio::time::timeout(std::time::Duration::from_secs(10), cancel.cancelled()) + .await + .is_ok() + { + bail!("cancelled while waiting for pgdata"); + } + } + } + } + + // + // Do the import + // + info!("do the import"); + let control_file = storage.get_control_file().await?; + let base_lsn = control_file.base_lsn(); + + info!("update TimelineMetadata based on LSNs from control file"); + { + let pg_version = control_file.pg_version(); + let _ctx: &RequestContext = ctx; + async move { + // FIXME: The 'disk_consistent_lsn' should be the LSN at the *end* of the + // checkpoint record, and prev_record_lsn should point to its beginning. + // We should read the real end of the record from the WAL, but here we + // just fake it. 
+ let disk_consistent_lsn = Lsn(base_lsn.0 + 8); + let prev_record_lsn = base_lsn; + let metadata = TimelineMetadata::new( + disk_consistent_lsn, + Some(prev_record_lsn), + None, // no ancestor + Lsn(0), // no ancestor lsn + base_lsn, // latest_gc_cutoff_lsn + base_lsn, // initdb_lsn + pg_version, + ); + + let _start_lsn = disk_consistent_lsn + 1; + + timeline + .remote_client + .schedule_index_upload_for_full_metadata_update(&metadata)?; + + timeline.remote_client.wait_completion().await?; + + anyhow::Ok(()) + } + } + .await?; + + flow::run( + timeline.clone(), + base_lsn, + control_file, + storage.clone(), + ctx, + ) + .await?; + + // + // Communicate that shard is done. + // + storage + .put_json( + &shard_status_key, + &importbucket_format::ShardStatus { done: true }, + ) + .await + .context("put shard status")?; + } + + // + // Ensure at-least-once deliver of the upcall to cplane + // before we mark the task as done and never come here again. + // + info!("send final progress update"); + upcall_client + .send_progress_until_success(&spec) + .instrument(info_span!("final_progress_update")) + .await?; + + // + // Mark as done in index_part. + // This makes subsequent timeline loads enter the normal load code path + // instead of spawning the import task and calling this here function. + // + info!("mark import as complete in index part"); + timeline + .remote_client + .schedule_index_upload_for_import_pgdata_state_update(Some(index_part_format::Root::V1( + index_part_format::V1::Done(index_part_format::Done { + idempotency_key, + started_at, + finished_at: chrono::Utc::now().naive_utc(), + }), + )))?; + + timeline.remote_client.wait_completion().await?; + + Ok(()) +} diff --git a/pageserver/src/tenant/timeline/import_pgdata/flow.rs b/pageserver/src/tenant/timeline/import_pgdata/flow.rs new file mode 100644 index 0000000000..cbd4168c06 --- /dev/null +++ b/pageserver/src/tenant/timeline/import_pgdata/flow.rs @@ -0,0 +1,798 @@ +//! 
Import a PGDATA directory into an empty root timeline. +//! +//! This module is adapted hackathon code by Heikki and Stas. +//! Other code in the parent module was written by Christian as part of a customer PoC. +//! +//! The hackathon code was producing image layer files as a free-standing program. +//! +//! It has been modified to +//! - run inside a running Pageserver, within the proper lifecycles of Timeline -> Tenant(Shard) +//! - => sharding-awareness: produce image layers with only the data relevant for this shard +//! - => S3 as the source for the PGDATA instead of local filesystem +//! +//! TODOs before productionization: +//! - ChunkProcessingJob size / ImportTask::total_size does not account for sharding. +//! => produced image layers likely too small. +//! - ChunkProcessingJob should cut up an ImportTask to hit exactly target image layer size. +//! - asserts / unwraps need to be replaced with errors +//! - don't trust remote objects will be small (=prevent OOMs in those cases) +//! - limit all in-memory buffers in size, or download to disk and read from there +//! - limit task concurrency +//! - generally play nice with other tenants in the system +//! - importbucket is a different bucket than main pageserver storage, so, should be fine wrt S3 rate limits +//! - but concerns like network bandwidth, local disk write bandwidth, local disk capacity, etc +//! - integrate with layer eviction system +//! - audit for Tenant::cancel and Timeline::cancel responsiveness +//! - audit for Tenant/Timeline gate holding (we spawn tokio tasks during this flow!) +//! +//! An incomplete set of TODOs from the Hackathon: +//!
- version-specific CheckPointData (=> pgv abstraction, already exists for regular walingest) + +use std::sync::Arc; + +use anyhow::{bail, ensure}; +use bytes::Bytes; + +use itertools::Itertools; +use pageserver_api::{ + key::{rel_block_to_key, rel_dir_to_key, rel_size_to_key, relmap_file_key, DBDIR_KEY}, + reltag::RelTag, + shard::ShardIdentity, +}; +use postgres_ffi::{pg_constants, relfile_utils::parse_relfilename, BLCKSZ}; +use tokio::task::JoinSet; +use tracing::{debug, info_span, instrument, Instrument}; + +use crate::{ + assert_u64_eq_usize::UsizeIsU64, + pgdatadir_mapping::{SlruSegmentDirectory, TwoPhaseDirectory}, +}; +use crate::{ + context::{DownloadBehavior, RequestContext}, + pgdatadir_mapping::{DbDirectory, RelDirectory}, + task_mgr::TaskKind, + tenant::storage_layer::{ImageLayerWriter, Layer}, +}; + +use pageserver_api::key::Key; +use pageserver_api::key::{ + slru_block_to_key, slru_dir_to_key, slru_segment_size_to_key, CHECKPOINT_KEY, CONTROLFILE_KEY, + TWOPHASEDIR_KEY, +}; +use pageserver_api::keyspace::singleton_range; +use pageserver_api::keyspace::{contiguous_range_len, is_contiguous_range}; +use pageserver_api::reltag::SlruKind; +use utils::bin_ser::BeSer; +use utils::lsn::Lsn; + +use std::collections::HashSet; +use std::ops::Range; + +use super::{ + importbucket_client::{ControlFile, RemoteStorageWrapper}, + Timeline, +}; + +use remote_storage::RemotePath; + +pub async fn run( + timeline: Arc, + pgdata_lsn: Lsn, + control_file: ControlFile, + storage: RemoteStorageWrapper, + ctx: &RequestContext, +) -> anyhow::Result<()> { + Flow { + timeline, + pgdata_lsn, + control_file, + tasks: Vec::new(), + storage, + } + .run(ctx) + .await +} + +struct Flow { + timeline: Arc, + pgdata_lsn: Lsn, + control_file: ControlFile, + tasks: Vec, + storage: RemoteStorageWrapper, +} + +impl Flow { + /// Perform the ingestion into [`Self::timeline`]. + /// Assumes the timeline is empty (= no layers). 
+ pub async fn run(mut self, ctx: &RequestContext) -> anyhow::Result<()> { + let pgdata_lsn = Lsn(self.control_file.control_file_data().checkPoint).align(); + + self.pgdata_lsn = pgdata_lsn; + + let datadir = PgDataDir::new(&self.storage).await?; + + // Import dbdir (00:00:00 keyspace) + // This is just constructed here, but will be written to the image layer in the first call to import_db() + let dbdir_buf = Bytes::from(DbDirectory::ser(&DbDirectory { + dbdirs: datadir + .dbs + .iter() + .map(|db| ((db.spcnode, db.dboid), true)) + .collect(), + })?); + self.tasks + .push(ImportSingleKeyTask::new(DBDIR_KEY, dbdir_buf).into()); + + // Import databases (00:spcnode:dbnode keyspace for each db) + for db in datadir.dbs { + self.import_db(&db).await?; + } + + // Import SLRUs + + // pg_xact (01:00 keyspace) + self.import_slru(SlruKind::Clog, &self.storage.pgdata().join("pg_xact")) + .await?; + // pg_multixact/members (01:01 keyspace) + self.import_slru( + SlruKind::MultiXactMembers, + &self.storage.pgdata().join("pg_multixact/members"), + ) + .await?; + // pg_multixact/offsets (01:02 keyspace) + self.import_slru( + SlruKind::MultiXactOffsets, + &self.storage.pgdata().join("pg_multixact/offsets"), + ) + .await?; + + // Import pg_twophase. 
+ // TODO: currently always written as an empty directory + let twophasedir_buf = TwoPhaseDirectory::ser(&TwoPhaseDirectory { + xids: HashSet::new(), + })?; + self.tasks + .push(AnyImportTask::SingleKey(ImportSingleKeyTask::new( + TWOPHASEDIR_KEY, + Bytes::from(twophasedir_buf), + ))); + + // Controlfile, checkpoint + self.tasks + .push(AnyImportTask::SingleKey(ImportSingleKeyTask::new( + CONTROLFILE_KEY, + self.control_file.control_file_buf().clone(), + ))); + + let checkpoint_buf = self + .control_file + .control_file_data() + .checkPointCopy + .encode()?; + self.tasks + .push(AnyImportTask::SingleKey(ImportSingleKeyTask::new( + CHECKPOINT_KEY, + checkpoint_buf, + ))); + + // Assign parts of the key space to parallel jobs, starting a new job once a chunk would exceed 1 GiB + let mut last_end_key = Key::MIN; + let mut current_chunk = Vec::new(); + let mut current_chunk_size: usize = 0; + let mut parallel_jobs = Vec::new(); + for task in std::mem::take(&mut self.tasks).into_iter() { + if current_chunk_size + task.total_size() > 1024 * 1024 * 1024 { + let key_range = last_end_key..task.key_range().start; + parallel_jobs.push(ChunkProcessingJob::new( + key_range.clone(), + std::mem::take(&mut current_chunk), + &self, + )); + last_end_key = key_range.end; + current_chunk_size = 0; + } + current_chunk_size += task.total_size(); + current_chunk.push(task); + } + parallel_jobs.push(ChunkProcessingJob::new( + last_end_key..Key::MAX, + current_chunk, + &self, + )); + + // Start all jobs simultaneously + let mut work = JoinSet::new(); + // TODO: semaphore?
+ for job in parallel_jobs { + let ctx: RequestContext = + ctx.detached_child(TaskKind::ImportPgdata, DownloadBehavior::Error); + work.spawn(async move { job.run(&ctx).await }.instrument(info_span!("parallel_job"))); + } + let mut results = Vec::new(); + while let Some(result) = work.join_next().await { + match result { + Ok(res) => { + results.push(res); + } + Err(_joinset_err) => { + results.push(Err(anyhow::anyhow!( + "parallel job panicked or cancelled, check pageserver logs" + ))); + } + } + } + + if results.iter().all(|r| r.is_ok()) { + Ok(()) + } else { + let mut msg = String::new(); + for result in results { + if let Err(err) = result { + msg.push_str(&format!("{err:?}\n\n")); + } + } + bail!("Some parallel jobs failed:\n\n{msg}"); + } + } + + #[instrument(level = tracing::Level::DEBUG, skip_all, fields(dboid=%db.dboid, tablespace=%db.spcnode, path=%db.path))] + async fn import_db(&mut self, db: &PgDataDirDb) -> anyhow::Result<()> { + debug!("start"); + scopeguard::defer! { + debug!("return"); + } + + // Import relmap (00:spcnode:dbnode:00:*:00) + let relmap_key = relmap_file_key(db.spcnode, db.dboid); + debug!("Constructing relmap entry, key {relmap_key}"); + let relmap_path = db.path.join("pg_filenode.map"); + let relmap_buf = self.storage.get(&relmap_path).await?; + self.tasks + .push(AnyImportTask::SingleKey(ImportSingleKeyTask::new( + relmap_key, relmap_buf, + ))); + + // Import reldir (00:spcnode:dbnode:00:*:01) + let reldir_key = rel_dir_to_key(db.spcnode, db.dboid); + debug!("Constructing reldirs entry, key {reldir_key}"); + let reldir_buf = RelDirectory::ser(&RelDirectory { + rels: db + .files + .iter() + .map(|f| (f.rel_tag.relnode, f.rel_tag.forknum)) + .collect(), + })?; + self.tasks + .push(AnyImportTask::SingleKey(ImportSingleKeyTask::new( + reldir_key, + Bytes::from(reldir_buf), + ))); + + // Import data (00:spcnode:dbnode:reloid:fork:blk) and set sizes for each last + // segment in a given relation (00:spcnode:dbnode:reloid:fork:ff) + for 
file in &db.files { + debug!(%file.path, %file.filesize, "importing file"); + let len = file.filesize; + ensure!(len % 8192 == 0); + let start_blk: u32 = file.segno * (1024 * 1024 * 1024 / 8192); + let start_key = rel_block_to_key(file.rel_tag, start_blk); + let end_key = rel_block_to_key(file.rel_tag, start_blk + (len / 8192) as u32); + self.tasks + .push(AnyImportTask::RelBlocks(ImportRelBlocksTask::new( + *self.timeline.get_shard_identity(), + start_key..end_key, + &file.path, + self.storage.clone(), + ))); + + // Set relsize for the last segment (00:spcnode:dbnode:reloid:fork:ff) + if let Some(nblocks) = file.nblocks { + let size_key = rel_size_to_key(file.rel_tag); + //debug!("Setting relation size (path={path}, rel_tag={rel_tag}, segno={segno}) to {nblocks}, key {size_key}"); + let buf = nblocks.to_le_bytes(); + self.tasks + .push(AnyImportTask::SingleKey(ImportSingleKeyTask::new( + size_key, + Bytes::from(buf.to_vec()), + ))); + } + } + + Ok(()) + } + + async fn import_slru(&mut self, kind: SlruKind, path: &RemotePath) -> anyhow::Result<()> { + let segments = self.storage.listfilesindir(path).await?; + let segments: Vec<(String, u32, usize)> = segments + .into_iter() + .filter_map(|(path, size)| { + let filename = path.object_name()?; + let segno = u32::from_str_radix(filename, 16).ok()?; + Some((filename.to_string(), segno, size)) + }) + .collect(); + + // Write SlruDir + let slrudir_key = slru_dir_to_key(kind); + let segnos: HashSet = segments + .iter() + .map(|(_path, segno, _size)| *segno) + .collect(); + let slrudir = SlruSegmentDirectory { segments: segnos }; + let slrudir_buf = SlruSegmentDirectory::ser(&slrudir)?; + self.tasks + .push(AnyImportTask::SingleKey(ImportSingleKeyTask::new( + slrudir_key, + Bytes::from(slrudir_buf), + ))); + + for (segpath, segno, size) in segments { + // SlruSegBlocks for each segment + let p = path.join(&segpath); + let file_size = size; + ensure!(file_size % 8192 == 0); + let nblocks = u32::try_from(file_size / 8192)?; 
+ let start_key = slru_block_to_key(kind, segno, 0); + let end_key = slru_block_to_key(kind, segno, nblocks); + debug!(%p, segno=%segno, %size, %start_key, %end_key, "scheduling SLRU segment"); + self.tasks + .push(AnyImportTask::SlruBlocks(ImportSlruBlocksTask::new( + *self.timeline.get_shard_identity(), + start_key..end_key, + &p, + self.storage.clone(), + ))); + + // Followed by SlruSegSize + let segsize_key = slru_segment_size_to_key(kind, segno); + let segsize_buf = nblocks.to_le_bytes(); + self.tasks + .push(AnyImportTask::SingleKey(ImportSingleKeyTask::new( + segsize_key, + Bytes::copy_from_slice(&segsize_buf), + ))); + } + Ok(()) + } +} + +// +// dbdir iteration tools +// + +struct PgDataDir { + pub dbs: Vec, // spcnode, dboid, path +} + +struct PgDataDirDb { + pub spcnode: u32, + pub dboid: u32, + pub path: RemotePath, + pub files: Vec, +} + +struct PgDataDirDbFile { + pub path: RemotePath, + pub rel_tag: RelTag, + pub segno: u32, + pub filesize: usize, + // Cumulative size (in blocks) of the given fork, set only for the last segment of that fork + pub nblocks: Option, +} + +impl PgDataDir { + async fn new(storage: &RemoteStorageWrapper) -> anyhow::Result { + let datadir_path = storage.pgdata(); + // Import ordinary databases, DEFAULTTABLESPACE_OID is smaller than GLOBALTABLESPACE_OID, so import them first + // Traverse databases in increasing oid order + + let basedir = &datadir_path.join("base"); + let db_oids: Vec<_> = storage + .listdir(basedir) + .await?
+ .into_iter() + .filter_map(|path| path.object_name().and_then(|name| name.parse::().ok())) + .sorted() + .collect(); + debug!(?db_oids, "found databases"); + let mut databases = Vec::new(); + for dboid in db_oids { + databases.push( + PgDataDirDb::new( + storage, + &basedir.join(dboid.to_string()), + pg_constants::DEFAULTTABLESPACE_OID, + dboid, + &datadir_path, + ) + .await?, + ); + } + + // special case for global catalogs + databases.push( + PgDataDirDb::new( + storage, + &datadir_path.join("global"), + postgres_ffi::pg_constants::GLOBALTABLESPACE_OID, + 0, + &datadir_path, + ) + .await?, + ); + + databases.sort_by_key(|db| (db.spcnode, db.dboid)); + + Ok(Self { dbs: databases }) + } +} + +impl PgDataDirDb { + #[instrument(level = tracing::Level::DEBUG, skip_all, fields(%dboid, %db_path))] + async fn new( + storage: &RemoteStorageWrapper, + db_path: &RemotePath, + spcnode: u32, + dboid: u32, + datadir_path: &RemotePath, + ) -> anyhow::Result { + let mut files: Vec = storage + .listfilesindir(db_path) + .await? + .into_iter() + .filter_map(|(path, size)| { + debug!(%path, %size, "found file in dbdir"); + path.object_name().and_then(|name| { + // returns (relnode, forknum, segno) + parse_relfilename(name).ok().map(|x| (size, x)) + }) + }) + .sorted_by_key(|(_, relfilename)| *relfilename) + .map(|(filesize, (relnode, forknum, segno))| { + let rel_tag = RelTag { + spcnode, + dbnode: dboid, + relnode, + forknum, + }; + + let path = datadir_path.join(rel_tag.to_segfile_name(segno)); + assert!(filesize % BLCKSZ as usize == 0); // TODO: this should result in an error + let nblocks = filesize / BLCKSZ as usize; + + PgDataDirDbFile { + path, + filesize, + rel_tag, + segno, + nblocks: Some(nblocks), // per-segment (non-cumulative) size for now + } + }) + .collect(); + + // Set cumulative sizes. Do all of that math here, so that later we can more easily + // parallelize over segments and know for which segments we need to write a relsize + // entry.
+ let mut cumulative_nblocks: usize = 0; + let mut prev_rel_tag: Option = None; + for i in 0..files.len() { + if prev_rel_tag == Some(files[i].rel_tag) { + cumulative_nblocks += files[i].nblocks.unwrap(); + } else { + cumulative_nblocks = files[i].nblocks.unwrap(); + } + + files[i].nblocks = if i == files.len() - 1 || files[i + 1].rel_tag != files[i].rel_tag { + Some(cumulative_nblocks) + } else { + None + }; + + prev_rel_tag = Some(files[i].rel_tag); + } + + Ok(PgDataDirDb { + files, + path: db_path.clone(), + spcnode, + dboid, + }) + } +} + +trait ImportTask { + fn key_range(&self) -> Range; + + fn total_size(&self) -> usize { + // TODO: revisit this + if is_contiguous_range(&self.key_range()) { + contiguous_range_len(&self.key_range()) as usize * 8192 + } else { + u32::MAX as usize + } + } + + async fn doit( + self, + layer_writer: &mut ImageLayerWriter, + ctx: &RequestContext, + ) -> anyhow::Result; +} + +struct ImportSingleKeyTask { + key: Key, + buf: Bytes, +} + +impl ImportSingleKeyTask { + fn new(key: Key, buf: Bytes) -> Self { + ImportSingleKeyTask { key, buf } + } +} + +impl ImportTask for ImportSingleKeyTask { + fn key_range(&self) -> Range { + singleton_range(self.key) + } + + async fn doit( + self, + layer_writer: &mut ImageLayerWriter, + ctx: &RequestContext, + ) -> anyhow::Result { + layer_writer.put_image(self.key, self.buf, ctx).await?; + Ok(1) + } +} + +struct ImportRelBlocksTask { + shard_identity: ShardIdentity, + key_range: Range, + path: RemotePath, + storage: RemoteStorageWrapper, +} + +impl ImportRelBlocksTask { + fn new( + shard_identity: ShardIdentity, + key_range: Range, + path: &RemotePath, + storage: RemoteStorageWrapper, + ) -> Self { + ImportRelBlocksTask { + shard_identity, + key_range, + path: path.clone(), + storage, + } + } +} + +impl ImportTask for ImportRelBlocksTask { + fn key_range(&self) -> Range { + self.key_range.clone() + } + + #[instrument(level = tracing::Level::DEBUG, skip_all, fields(%self.path))] + async fn doit( + 
self, + layer_writer: &mut ImageLayerWriter, + ctx: &RequestContext, + ) -> anyhow::Result { + debug!("Importing relation file"); + + let (rel_tag, start_blk) = self.key_range.start.to_rel_block()?; + let (rel_tag_end, end_blk) = self.key_range.end.to_rel_block()?; + assert_eq!(rel_tag, rel_tag_end); + + let ranges = (start_blk..end_blk) + .enumerate() + .filter_map(|(i, blknum)| { + let key = rel_block_to_key(rel_tag, blknum); + if self.shard_identity.is_key_disposable(&key) { + return None; + } + let file_offset = i.checked_mul(8192).unwrap(); + Some(( + vec![key], + file_offset, + file_offset.checked_add(8192).unwrap(), + )) + }) + .coalesce(|(mut acc, acc_start, acc_end), (mut key, start, end)| { + assert_eq!(key.len(), 1); + assert!(!acc.is_empty()); + assert!(acc_end > acc_start); + if acc_end == start /* TODO additional max range check here, to limit memory consumption per task to X */ { + acc.push(key.pop().unwrap()); + Ok((acc, acc_start, end)) + } else { + Err(((acc, acc_start, acc_end), (key, start, end))) + } + }); + + let mut nimages = 0; + for (keys, range_start, range_end) in ranges { + let range_buf = self + .storage + .get_range(&self.path, range_start.into_u64(), range_end.into_u64()) + .await?; + let mut buf = Bytes::from(range_buf); + // TODO: batched writes + for key in keys { + let image = buf.split_to(8192); + layer_writer.put_image(key, image, ctx).await?; + nimages += 1; + } + } + + Ok(nimages) + } +} + +struct ImportSlruBlocksTask { + shard_identity: ShardIdentity, + key_range: Range, + path: RemotePath, + storage: RemoteStorageWrapper, +} + +impl ImportSlruBlocksTask { + fn new( + shard_identity: ShardIdentity, + key_range: Range, + path: &RemotePath, + storage: RemoteStorageWrapper, + ) -> Self { + ImportSlruBlocksTask { + shard_identity, + key_range, + path: path.clone(), + storage, + } + } +} + +impl ImportTask for ImportSlruBlocksTask { + fn key_range(&self) -> Range { + self.key_range.clone() + } + + async fn doit( + self, + 
layer_writer: &mut ImageLayerWriter, + ctx: &RequestContext, + ) -> anyhow::Result { + debug!("Importing SLRU segment file {}", self.path); + let buf = self.storage.get(&self.path).await?; + + let (kind, segno, start_blk) = self.key_range.start.to_slru_block()?; + let (_kind, _segno, end_blk) = self.key_range.end.to_slru_block()?; + let mut blknum = start_blk; + let mut nimages = 0; + let mut file_offset = 0; + while blknum < end_blk { + let key = slru_block_to_key(kind, segno, blknum); + assert!( + !self.shard_identity.is_key_disposable(&key), + "SLRU keys need to go into every shard" + ); + let buf = &buf[file_offset..(file_offset + 8192)]; + file_offset += 8192; + layer_writer + .put_image(key, Bytes::copy_from_slice(buf), ctx) + .await?; + blknum += 1; + nimages += 1; + } + Ok(nimages) + } +} + +enum AnyImportTask { + SingleKey(ImportSingleKeyTask), + RelBlocks(ImportRelBlocksTask), + SlruBlocks(ImportSlruBlocksTask), +} + +impl ImportTask for AnyImportTask { + fn key_range(&self) -> Range { + match self { + Self::SingleKey(t) => t.key_range(), + Self::RelBlocks(t) => t.key_range(), + Self::SlruBlocks(t) => t.key_range(), + } + } + /// returns the number of images put into the `layer_writer` + async fn doit( + self, + layer_writer: &mut ImageLayerWriter, + ctx: &RequestContext, + ) -> anyhow::Result { + match self { + Self::SingleKey(t) => t.doit(layer_writer, ctx).await, + Self::RelBlocks(t) => t.doit(layer_writer, ctx).await, + Self::SlruBlocks(t) => t.doit(layer_writer, ctx).await, + } + } +} + +impl From for AnyImportTask { + fn from(t: ImportSingleKeyTask) -> Self { + Self::SingleKey(t) + } +} + +impl From for AnyImportTask { + fn from(t: ImportRelBlocksTask) -> Self { + Self::RelBlocks(t) + } +} + +impl From for AnyImportTask { + fn from(t: ImportSlruBlocksTask) -> Self { + Self::SlruBlocks(t) + } +} + +struct ChunkProcessingJob { + timeline: Arc, + range: Range, + tasks: Vec, + + pgdata_lsn: Lsn, +} + +impl ChunkProcessingJob { + fn new(range: Range, 
tasks: Vec, env: &Flow) -> Self { + assert!(env.pgdata_lsn.is_valid()); + Self { + timeline: env.timeline.clone(), + range, + tasks, + pgdata_lsn: env.pgdata_lsn, + } + } + + async fn run(self, ctx: &RequestContext) -> anyhow::Result<()> { + let mut writer = ImageLayerWriter::new( + self.timeline.conf, + self.timeline.timeline_id, + self.timeline.tenant_shard_id, + &self.range, + self.pgdata_lsn, + ctx, + ) + .await?; + + let mut nimages = 0; + for task in self.tasks { + nimages += task.doit(&mut writer, ctx).await?; + } + + let resident_layer = if nimages > 0 { + let (desc, path) = writer.finish(ctx).await?; + Layer::finish_creating(self.timeline.conf, &self.timeline, desc, &path)? + } else { + // dropping the writer cleans up + return Ok(()); + }; + + // this is sharing the same code as create_image_layers + let mut guard = self.timeline.layers.write().await; + guard + .open_mut()? + .track_new_image_layers(&[resident_layer.clone()], &self.timeline.metrics); + crate::tenant::timeline::drop_wlock(guard); + + // Schedule the layer for upload but don't add barriers such as + // wait for completion or index upload, so we don't inhibit upload parallelism. + // TODO: limit upload parallelism somehow (e.g. by limiting concurrency of jobs?) + // TODO: or regulate parallelism by upload queue depth? Prob should happen at a higher level. 
+ self.timeline + .remote_client + .schedule_layer_file_upload(resident_layer)?; + + Ok(()) + } +} diff --git a/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs b/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs new file mode 100644 index 0000000000..8d5ab1780f --- /dev/null +++ b/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs @@ -0,0 +1,315 @@ +use std::{ops::Bound, sync::Arc}; + +use anyhow::Context; +use bytes::Bytes; +use postgres_ffi::ControlFileData; +use remote_storage::{ + Download, DownloadError, DownloadOpts, GenericRemoteStorage, Listing, ListingObject, RemotePath, +}; +use serde::de::DeserializeOwned; +use tokio_util::sync::CancellationToken; +use tracing::{debug, info, instrument}; +use utils::lsn::Lsn; + +use crate::{assert_u64_eq_usize::U64IsUsize, config::PageServerConf}; + +use super::{importbucket_format, index_part_format}; + +pub async fn new( + conf: &'static PageServerConf, + location: &index_part_format::Location, + cancel: CancellationToken, +) -> Result { + // FIXME: we probably want some timeout, and we might be able to assume the max file + // size on S3 is 1GiB (postgres segment size). But the problem is that the individual + // downloaders don't know enough about concurrent downloads to make a guess on the + // expected bandwidth and resulting best timeout. + let timeout = std::time::Duration::from_secs(24 * 60 * 60); + let location_storage = match location { + #[cfg(feature = "testing")] + index_part_format::Location::LocalFs { path } => { + GenericRemoteStorage::LocalFs(remote_storage::LocalFs::new(path.clone(), timeout)?) + } + index_part_format::Location::AwsS3 { + region, + bucket, + key, + } => { + // TODO: think about security implications of letting the client specify the bucket & prefix. + // It's the most flexible right now, but, possibly we want to move bucket name into PS conf + // and force the timeline_id into the prefix? 
+ GenericRemoteStorage::AwsS3(Arc::new( + remote_storage::S3Bucket::new( + &remote_storage::S3Config { + bucket_name: bucket.clone(), + prefix_in_bucket: Some(key.clone()), + bucket_region: region.clone(), + endpoint: conf + .import_pgdata_aws_endpoint_url + .clone() + .map(|url| url.to_string()), // by specifying None here, remote_storage/aws-sdk-rust will infer from env + concurrency_limit: 100.try_into().unwrap(), // TODO: think about this + max_keys_per_list_response: Some(1000), // TODO: think about this + upload_storage_class: None, // irrelevant + }, + timeout, + ) + .await + .context("setup s3 bucket")?, + )) + } + }; + let storage_wrapper = RemoteStorageWrapper::new(location_storage, cancel); + Ok(storage_wrapper) +} + +/// Wrap [`remote_storage`] APIs to make it look a bit more like a filesystem API +/// such as [`tokio::fs`], which was used in the original implementation of the import code. +#[derive(Clone)] +pub struct RemoteStorageWrapper { + storage: GenericRemoteStorage, + cancel: CancellationToken, +} + +impl RemoteStorageWrapper { + pub fn new(storage: GenericRemoteStorage, cancel: CancellationToken) -> Self { + Self { storage, cancel } + } + + #[instrument(level = tracing::Level::DEBUG, skip_all, fields(%path))] + pub async fn listfilesindir( + &self, + path: &RemotePath, + ) -> Result, DownloadError> { + assert!( + path.object_name().is_some(), + "must specify dirname, without trailing slash" + ); + let path = path.add_trailing_slash(); + + let res = crate::tenant::remote_timeline_client::download::download_retry_forever( + || async { + let Listing { keys, prefixes: _ } = self + .storage + .list( + Some(&path), + remote_storage::ListingMode::WithDelimiter, + None, + &self.cancel, + ) + .await?; + let res = keys + .into_iter() + .map(|ListingObject { key, size, .. 
}| (key, size.into_usize())) + .collect(); + Ok(res) + }, + &format!("listfilesindir {path:?}"), + &self.cancel, + ) + .await; + debug!(?res, "returning"); + res + } + + #[instrument(level = tracing::Level::DEBUG, skip_all, fields(%path))] + pub async fn listdir(&self, path: &RemotePath) -> Result, DownloadError> { + assert!( + path.object_name().is_some(), + "must specify dirname, without trailing slash" + ); + let path = path.add_trailing_slash(); + + let res = crate::tenant::remote_timeline_client::download::download_retry_forever( + || async { + let Listing { keys, prefixes } = self + .storage + .list( + Some(&path), + remote_storage::ListingMode::WithDelimiter, + None, + &self.cancel, + ) + .await?; + let res = keys + .into_iter() + .map(|ListingObject { key, .. }| key) + .chain(prefixes.into_iter()) + .collect(); + Ok(res) + }, + &format!("listdir {path:?}"), + &self.cancel, + ) + .await; + debug!(?res, "returning"); + res + } + + #[instrument(level = tracing::Level::DEBUG, skip_all, fields(%path))] + pub async fn get(&self, path: &RemotePath) -> Result { + let res = crate::tenant::remote_timeline_client::download::download_retry_forever( + || async { + let Download { + download_stream, .. + } = self + .storage + .download(path, &DownloadOpts::default(), &self.cancel) + .await?; + let mut reader = tokio_util::io::StreamReader::new(download_stream); + + // XXX optimize this, can we get the capacity hint from somewhere? 
+ let mut buf = Vec::new(); + tokio::io::copy_buf(&mut reader, &mut buf).await?; + Ok(Bytes::from(buf)) + }, + &format!("download {path:?}"), + &self.cancel, + ) + .await; + debug!(len = res.as_ref().ok().map(|buf| buf.len()), "done"); + res + } + + pub async fn get_spec(&self) -> Result, anyhow::Error> { + self.get_json(&RemotePath::from_string("spec.json").unwrap()) + .await + .context("get spec") + } + + #[instrument(level = tracing::Level::DEBUG, skip_all, fields(%path))] + pub async fn get_json( + &self, + path: &RemotePath, + ) -> Result, DownloadError> { + let buf = match self.get(path).await { + Ok(buf) => buf, + Err(DownloadError::NotFound) => return Ok(None), + Err(err) => return Err(err), + }; + let res = serde_json::from_slice(&buf) + .context("deserialize") + // TODO: own error type; a JSON parse failure is reported as DownloadError::Other for now + .map_err(DownloadError::Other)?; + Ok(Some(res)) + } + + #[instrument(level = tracing::Level::DEBUG, skip_all, fields(%path))] + pub async fn put_json(&self, path: &RemotePath, value: &T) -> anyhow::Result<()> + where + T: serde::Serialize, + { + let buf = serde_json::to_vec(value)?; + let bytes = Bytes::from(buf); + utils::backoff::retry( + || async { + let size = bytes.len(); + let bytes = futures::stream::once(futures::future::ready(Ok(bytes.clone()))); + self.storage + .upload_storage_object(bytes, size, path, &self.cancel) + .await + }, + remote_storage::TimeoutOrCancel::caused_by_cancel, + 1, + u32::MAX, + &format!("put json {path}"), + &self.cancel, + ) + .await + .expect("practically infinite retries") + } + + #[instrument(level = tracing::Level::DEBUG, skip_all, fields(%path))] + pub async fn get_range( + &self, + path: &RemotePath, + start_inclusive: u64, + end_exclusive: u64, + ) -> Result, DownloadError> { + let len = end_exclusive + .checked_sub(start_inclusive) + .unwrap() + .into_usize(); + let res = crate::tenant::remote_timeline_client::download::download_retry_forever( + || async { + let Download { + download_stream, ..
+ } = self + .storage + .download( + path, + &DownloadOpts { + etag: None, + byte_start: Bound::Included(start_inclusive), + byte_end: Bound::Excluded(end_exclusive) + }, + &self.cancel) + .await?; + let mut reader = tokio_util::io::StreamReader::new(download_stream); + + let mut buf = Vec::with_capacity(len); + tokio::io::copy_buf(&mut reader, &mut buf).await?; + Ok(buf) + }, + &format!("download range len=0x{len:x} [0x{start_inclusive:x},0x{end_exclusive:x}) from {path:?}"), + &self.cancel, + ) + .await; + debug!(len = res.as_ref().ok().map(|buf| buf.len()), "done"); + res + } + + pub fn pgdata(&self) -> RemotePath { + RemotePath::from_string("pgdata").unwrap() + } + + pub async fn get_control_file(&self) -> Result { + let control_file_path = self.pgdata().join("global/pg_control"); + info!("get control file from {control_file_path}"); + let control_file_buf = self.get(&control_file_path).await?; + ControlFile::new(control_file_buf) + } +} + +pub struct ControlFile { + control_file_data: ControlFileData, + control_file_buf: Bytes, +} + +impl ControlFile { + pub(crate) fn new(control_file_buf: Bytes) -> Result { + // XXX ControlFileData is version-specific, we're always using v14 here. v17 had changes. 
+ let control_file_data = ControlFileData::decode(&control_file_buf)?; + let control_file = ControlFile { + control_file_data, + control_file_buf, + }; + control_file.try_pg_version()?; // so that we can offer infallible pg_version() + Ok(control_file) + } + pub(crate) fn base_lsn(&self) -> Lsn { + Lsn(self.control_file_data.checkPoint).align() + } + pub(crate) fn pg_version(&self) -> u32 { + self.try_pg_version() + .expect("new() checks that try_pg_version doesn't error") + } + pub(crate) fn control_file_data(&self) -> &ControlFileData { + &self.control_file_data + } + pub(crate) fn control_file_buf(&self) -> &Bytes { + &self.control_file_buf + } + fn try_pg_version(&self) -> anyhow::Result { + Ok(match self.control_file_data.catalog_version_no { + // these are from catversion.h + 202107181 => 14, + 202209061 => 15, + 202307071 => 16, + /* XXX pg17 */ + catversion => { + anyhow::bail!("unrecognized catalog version {catversion}") + } + }) + } +} diff --git a/pageserver/src/tenant/timeline/import_pgdata/importbucket_format.rs b/pageserver/src/tenant/timeline/import_pgdata/importbucket_format.rs new file mode 100644 index 0000000000..04ba3c6f1f --- /dev/null +++ b/pageserver/src/tenant/timeline/import_pgdata/importbucket_format.rs @@ -0,0 +1,20 @@ +use serde::{Deserialize, Serialize}; + +#[derive(Deserialize, Serialize, Debug, Clone, PartialEq, Eq)] +pub struct PgdataStatus { + pub done: bool, + // TODO: remaining fields +} + +#[derive(Deserialize, Serialize, Debug, Clone, PartialEq, Eq)] +pub struct ShardStatus { + pub done: bool, + // TODO: remaining fields +} + +// TODO: dedupe with fast_import code +#[derive(Deserialize, Serialize, Debug, Clone, PartialEq, Eq)] +pub struct Spec { + pub project_id: String, + pub branch_id: String, +} diff --git a/pageserver/src/tenant/timeline/import_pgdata/index_part_format.rs b/pageserver/src/tenant/timeline/import_pgdata/index_part_format.rs new file mode 100644 index 0000000000..310d97a6a9 --- /dev/null +++
b/pageserver/src/tenant/timeline/import_pgdata/index_part_format.rs @@ -0,0 +1,68 @@ +use serde::{Deserialize, Serialize}; + +#[cfg(feature = "testing")] +use camino::Utf8PathBuf; + +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] +pub enum Root { + V1(V1), +} +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] +pub enum V1 { + InProgress(InProgress), + Done(Done), +} + +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] +#[serde(transparent)] +pub struct IdempotencyKey(String); + +impl IdempotencyKey { + pub fn new(s: String) -> Self { + Self(s) + } +} + +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] +pub struct InProgress { + pub idempotency_key: IdempotencyKey, + pub location: Location, + pub started_at: chrono::NaiveDateTime, +} + +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] +pub struct Done { + pub idempotency_key: IdempotencyKey, + pub started_at: chrono::NaiveDateTime, + pub finished_at: chrono::NaiveDateTime, +} + +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] +pub enum Location { + #[cfg(feature = "testing")] + LocalFs { path: Utf8PathBuf }, + AwsS3 { + region: String, + bucket: String, + key: String, + }, +} + +impl Root { + pub fn is_done(&self) -> bool { + match self { + Root::V1(v1) => match v1 { + V1::Done(_) => true, + V1::InProgress(_) => false, + }, + } + } + pub fn idempotency_key(&self) -> &IdempotencyKey { + match self { + Root::V1(v1) => match v1 { + V1::InProgress(in_progress) => &in_progress.idempotency_key, + V1::Done(done) => &done.idempotency_key, + }, + } + } +} diff --git a/pageserver/src/tenant/timeline/import_pgdata/upcall_api.rs b/pageserver/src/tenant/timeline/import_pgdata/upcall_api.rs new file mode 100644 index 0000000000..c5210f9a30 --- /dev/null +++ b/pageserver/src/tenant/timeline/import_pgdata/upcall_api.rs @@ -0,0 +1,119 @@ +//! FIXME: most of this is copy-paste from mgmt_api.rs ; dedupe into a `reqwest_utils::Client` crate. 
+use pageserver_client::mgmt_api::{Error, ResponseErrorMessageExt}; +use serde::{Deserialize, Serialize}; +use tokio_util::sync::CancellationToken; +use tracing::error; + +use crate::config::PageServerConf; +use reqwest::Method; + +use super::importbucket_format::Spec; + +pub struct Client { + base_url: String, + authorization_header: Option, + client: reqwest::Client, + cancel: CancellationToken, +} + +pub type Result = std::result::Result; + +#[derive(Serialize, Deserialize, Debug)] +struct ImportProgressRequest { + // no fields yet, not sure if there every will be any +} + +#[derive(Serialize, Deserialize, Debug)] +struct ImportProgressResponse { + // we don't care +} + +impl Client { + pub fn new(conf: &PageServerConf, cancel: CancellationToken) -> anyhow::Result { + let Some(ref base_url) = conf.import_pgdata_upcall_api else { + anyhow::bail!("import_pgdata_upcall_api is not configured") + }; + Ok(Self { + base_url: base_url.to_string(), + client: reqwest::Client::new(), + cancel, + authorization_header: conf + .import_pgdata_upcall_api_token + .as_ref() + .map(|secret_string| secret_string.get_contents()) + .map(|jwt| format!("Bearer {jwt}")), + }) + } + + fn start_request( + &self, + method: Method, + uri: U, + ) -> reqwest::RequestBuilder { + let req = self.client.request(method, uri); + if let Some(value) = &self.authorization_header { + req.header(reqwest::header::AUTHORIZATION, value) + } else { + req + } + } + + async fn request_noerror( + &self, + method: Method, + uri: U, + body: B, + ) -> Result { + self.start_request(method, uri) + .json(&body) + .send() + .await + .map_err(Error::ReceiveBody) + } + + async fn request( + &self, + method: Method, + uri: U, + body: B, + ) -> Result { + let res = self.request_noerror(method, uri, body).await?; + let response = res.error_from_body().await?; + Ok(response) + } + + pub async fn send_progress_once(&self, spec: &Spec) -> Result<()> { + let url = format!( + "{}/projects/{}/branches/{}/import_progress", + 
self.base_url, spec.project_id, spec.branch_id + ); + let ImportProgressResponse {} = self + .request(Method::POST, url, &ImportProgressRequest {}) + .await? + .json() + .await + .map_err(Error::ReceiveBody)?; + Ok(()) + } + + pub async fn send_progress_until_success(&self, spec: &Spec) -> anyhow::Result<()> { + loop { + match self.send_progress_once(spec).await { + Ok(()) => return Ok(()), + Err(Error::Cancelled) => return Err(anyhow::anyhow!("cancelled")), + Err(err) => { + error!(?err, "error sending progress, retrying"); + if tokio::time::timeout( + std::time::Duration::from_secs(10), + self.cancel.cancelled(), + ) + .await + .is_ok() + { + anyhow::bail!("cancelled while sending early progress update"); + } + } + } + } + } +} diff --git a/pageserver/src/tenant/timeline/uninit.rs b/pageserver/src/tenant/timeline/uninit.rs index a93bdde3f8..80a09b4840 100644 --- a/pageserver/src/tenant/timeline/uninit.rs +++ b/pageserver/src/tenant/timeline/uninit.rs @@ -3,7 +3,7 @@ use std::{collections::hash_map::Entry, fs, sync::Arc}; use anyhow::Context; use camino::Utf8PathBuf; use tracing::{error, info, info_span}; -use utils::{fs_ext, id::TimelineId, lsn::Lsn}; +use utils::{fs_ext, id::TimelineId, lsn::Lsn, sync::gate::GateGuard}; use crate::{ context::RequestContext, @@ -23,14 +23,14 @@ use super::Timeline; pub struct UninitializedTimeline<'t> { pub(crate) owning_tenant: &'t Tenant, timeline_id: TimelineId, - raw_timeline: Option<(Arc, TimelineCreateGuard<'t>)>, + raw_timeline: Option<(Arc, TimelineCreateGuard)>, } impl<'t> UninitializedTimeline<'t> { pub(crate) fn new( owning_tenant: &'t Tenant, timeline_id: TimelineId, - raw_timeline: Option<(Arc, TimelineCreateGuard<'t>)>, + raw_timeline: Option<(Arc, TimelineCreateGuard)>, ) -> Self { Self { owning_tenant, @@ -87,6 +87,10 @@ impl<'t> UninitializedTimeline<'t> { } } + pub(crate) fn finish_creation_myself(&mut self) -> (Arc, TimelineCreateGuard) { + self.raw_timeline.take().expect("already checked") + } + /// Prepares 
timeline data by loading it from the basebackup archive. pub(crate) async fn import_basebackup_from_tar( self, @@ -167,9 +171,10 @@ pub(crate) fn cleanup_timeline_directory(create_guard: TimelineCreateGuard) { /// A guard for timeline creations in process: as long as this object exists, the timeline ID /// is kept in `[Tenant::timelines_creating]` to exclude concurrent attempts to create the same timeline. #[must_use] -pub(crate) struct TimelineCreateGuard<'t> { - owning_tenant: &'t Tenant, - timeline_id: TimelineId, +pub(crate) struct TimelineCreateGuard { + pub(crate) _tenant_gate_guard: GateGuard, + pub(crate) owning_tenant: Arc, + pub(crate) timeline_id: TimelineId, pub(crate) timeline_path: Utf8PathBuf, pub(crate) idempotency: CreateTimelineIdempotency, } @@ -184,20 +189,27 @@ pub(crate) enum TimelineExclusionError { }, #[error("Already creating")] AlreadyCreating, + #[error("Shutting down")] + ShuttingDown, // e.g. I/O errors, or some failure deep in postgres initdb #[error(transparent)] Other(#[from] anyhow::Error), } -impl<'t> TimelineCreateGuard<'t> { +impl TimelineCreateGuard { pub(crate) fn new( - owning_tenant: &'t Tenant, + owning_tenant: &Arc, timeline_id: TimelineId, timeline_path: Utf8PathBuf, idempotency: CreateTimelineIdempotency, allow_offloaded: bool, ) -> Result { + let _tenant_gate_guard = owning_tenant + .gate + .enter() + .map_err(|_| TimelineExclusionError::ShuttingDown)?; + // Lock order: this is the only place we take both locks. 
During drop() we only // lock creating_timelines let timelines = owning_tenant.timelines.lock().unwrap(); @@ -225,8 +237,12 @@ impl<'t> TimelineCreateGuard<'t> { return Err(TimelineExclusionError::AlreadyCreating); } creating_timelines.insert(timeline_id); + drop(creating_timelines); + drop(timelines_offloaded); + drop(timelines); Ok(Self { - owning_tenant, + _tenant_gate_guard, + owning_tenant: Arc::clone(owning_tenant), timeline_id, timeline_path, idempotency, @@ -234,7 +250,7 @@ impl<'t> TimelineCreateGuard<'t> { } } -impl Drop for TimelineCreateGuard<'_> { +impl Drop for TimelineCreateGuard { fn drop(&mut self) { self.owning_tenant .timelines_creating diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs index 592f41cb21..ef3aa759f3 100644 --- a/pageserver/src/tenant/upload_queue.rs +++ b/pageserver/src/tenant/upload_queue.rs @@ -3,6 +3,7 @@ use super::storage_layer::ResidentLayer; use crate::tenant::metadata::TimelineMetadata; use crate::tenant::remote_timeline_client::index::IndexPart; use crate::tenant::remote_timeline_client::index::LayerFileMetadata; +use std::collections::HashSet; use std::collections::{HashMap, VecDeque}; use std::fmt::Debug; @@ -14,7 +15,6 @@ use utils::lsn::AtomicLsn; use std::sync::atomic::AtomicU32; use utils::lsn::Lsn; -#[cfg(feature = "testing")] use utils::generation::Generation; // clippy warns that Uninitialized is much smaller than Initialized, which wastes @@ -38,6 +38,12 @@ impl UploadQueue { } } +#[derive(Copy, Clone, PartialEq, Eq, Hash, Debug)] +pub(crate) enum OpType { + MayReorder, + FlushDeletion, +} + /// This keeps track of queued and in-progress tasks. pub(crate) struct UploadQueueInitialized { /// Counter to assign task IDs @@ -88,6 +94,12 @@ pub(crate) struct UploadQueueInitialized { #[cfg(feature = "testing")] pub(crate) dangling_files: HashMap, + /// Ensure we order file operations correctly. 
+ pub(crate) recently_deleted: HashSet<(LayerName, Generation)>, + + /// Deletions that are blocked by the tenant configuration + pub(crate) blocked_deletions: Vec, + /// Set to true when we have inserted the `UploadOp::Shutdown` into the `inprogress_tasks`. pub(crate) shutting_down: bool, @@ -180,6 +192,8 @@ impl UploadQueue { queued_operations: VecDeque::new(), #[cfg(feature = "testing")] dangling_files: HashMap::new(), + recently_deleted: HashSet::new(), + blocked_deletions: Vec::new(), shutting_down: false, shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)), }; @@ -220,6 +234,8 @@ impl UploadQueue { queued_operations: VecDeque::new(), #[cfg(feature = "testing")] dangling_files: HashMap::new(), + recently_deleted: HashSet::new(), + blocked_deletions: Vec::new(), shutting_down: false, shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)), }; @@ -270,15 +286,15 @@ pub(crate) struct UploadTask { /// A deletion of some layers within the lifetime of a timeline. This is not used /// for timeline deletion, which skips this queue and goes directly to DeletionQueue. -#[derive(Debug)] +#[derive(Debug, Clone)] pub(crate) struct Delete { pub(crate) layers: Vec<(LayerName, LayerFileMetadata)>, } #[derive(Debug)] pub(crate) enum UploadOp { - /// Upload a layer file - UploadLayer(ResidentLayer, LayerFileMetadata), + /// Upload a layer file. The last field indicates the last operation for thie file. 
+ UploadLayer(ResidentLayer, LayerFileMetadata, Option), /// Upload a index_part.json file UploadMetadata { @@ -300,11 +316,11 @@ pub(crate) enum UploadOp { impl std::fmt::Display for UploadOp { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { match self { - UploadOp::UploadLayer(layer, metadata) => { + UploadOp::UploadLayer(layer, metadata, mode) => { write!( f, - "UploadLayer({}, size={:?}, gen={:?})", - layer, metadata.file_size, metadata.generation + "UploadLayer({}, size={:?}, gen={:?}, mode={:?})", + layer, metadata.file_size, metadata.generation, mode ) } UploadOp::UploadMetadata { uploaded, .. } => { diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index daa8b99ab0..b9f8c7ea20 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -175,10 +175,16 @@ impl VirtualFile { } pub async fn sync_all(&self) -> Result<(), Error> { + if SYNC_MODE.load(std::sync::atomic::Ordering::Relaxed) == SyncMode::UnsafeNoSync as u8 { + return Ok(()); + } self.inner.sync_all().await } pub async fn sync_data(&self) -> Result<(), Error> { + if SYNC_MODE.load(std::sync::atomic::Ordering::Relaxed) == SyncMode::UnsafeNoSync as u8 { + return Ok(()); + } self.inner.sync_data().await } @@ -233,6 +239,27 @@ impl VirtualFile { } } +/// Indicates whether to enable fsync, fdatasync, or O_SYNC/O_DSYNC when writing +/// files. Switching this off is unsafe and only used for testing on machines +/// with slow drives. +#[repr(u8)] +pub enum SyncMode { + Sync, + UnsafeNoSync, +} + +impl TryFrom for SyncMode { + type Error = u8; + + fn try_from(value: u8) -> Result { + Ok(match value { + v if v == (SyncMode::Sync as u8) => SyncMode::Sync, + v if v == (SyncMode::UnsafeNoSync as u8) => SyncMode::UnsafeNoSync, + x => return Err(x), + }) + } +} + /// /// A virtual file descriptor. 
You can use this just like std::fs::File, but internally /// the underlying file is closed if the system is low on file descriptors, @@ -1332,12 +1359,13 @@ impl OpenFiles { /// server startup. /// #[cfg(not(test))] -pub fn init(num_slots: usize, engine: IoEngineKind, mode: IoMode) { +pub fn init(num_slots: usize, engine: IoEngineKind, mode: IoMode, sync_mode: SyncMode) { if OPEN_FILES.set(OpenFiles::new(num_slots)).is_err() { panic!("virtual_file::init called twice"); } set_io_mode(mode); io_engine::init(engine); + SYNC_MODE.store(sync_mode as u8, std::sync::atomic::Ordering::Relaxed); crate::metrics::virtual_file_descriptor_cache::SIZE_MAX.set(num_slots as u64); } @@ -1379,6 +1407,9 @@ pub(crate) fn set_io_mode(mode: IoMode) { pub(crate) fn get_io_mode() -> IoMode { IoMode::try_from(IO_MODE.load(Ordering::Relaxed)).unwrap() } + +static SYNC_MODE: AtomicU8 = AtomicU8::new(SyncMode::Sync as u8); + #[cfg(test)] mod tests { use crate::context::DownloadBehavior; diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 38d69760f2..ad6ccbc854 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -1528,6 +1528,11 @@ mod tests { assert_current_logical_size(&tline, Lsn(0x50)); + let test_span = tracing::info_span!(parent: None, "test", + tenant_id=%tline.tenant_shard_id.tenant_id, + shard_id=%tline.tenant_shard_id.shard_slug(), + timeline_id=%tline.timeline_id); + // The relation was created at LSN 2, not visible at LSN 1 yet. 
assert_eq!( tline @@ -1562,6 +1567,7 @@ mod tests { assert_eq!( tline .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x20)), &ctx) + .instrument(test_span.clone()) .await?, test_img("foo blk 0 at 2") ); @@ -1569,6 +1575,7 @@ mod tests { assert_eq!( tline .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x30)), &ctx) + .instrument(test_span.clone()) .await?, test_img("foo blk 0 at 3") ); @@ -1576,12 +1583,14 @@ mod tests { assert_eq!( tline .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x40)), &ctx) + .instrument(test_span.clone()) .await?, test_img("foo blk 0 at 3") ); assert_eq!( tline .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x40)), &ctx) + .instrument(test_span.clone()) .await?, test_img("foo blk 1 at 4") ); @@ -1589,18 +1598,21 @@ mod tests { assert_eq!( tline .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x50)), &ctx) + .instrument(test_span.clone()) .await?, test_img("foo blk 0 at 3") ); assert_eq!( tline .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x50)), &ctx) + .instrument(test_span.clone()) .await?, test_img("foo blk 1 at 4") ); assert_eq!( tline .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), &ctx) + .instrument(test_span.clone()) .await?, test_img("foo blk 2 at 5") ); @@ -1623,12 +1635,14 @@ mod tests { assert_eq!( tline .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x60)), &ctx) + .instrument(test_span.clone()) .await?, test_img("foo blk 0 at 3") ); assert_eq!( tline .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x60)), &ctx) + .instrument(test_span.clone()) .await?, test_img("foo blk 1 at 4") ); @@ -1643,6 +1657,7 @@ mod tests { assert_eq!( tline .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), &ctx) + .instrument(test_span.clone()) .await?, test_img("foo blk 2 at 5") ); @@ -1675,12 +1690,14 @@ mod tests { assert_eq!( tline .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x70)), &ctx) + .instrument(test_span.clone()) .await?, ZERO_PAGE ); assert_eq!( tline 
.get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x70)), &ctx) + .instrument(test_span.clone()) .await?, test_img("foo blk 1") ); @@ -1701,6 +1718,7 @@ mod tests { assert_eq!( tline .get_rel_page_at_lsn(TESTREL_A, blk, Version::Lsn(Lsn(0x80)), &ctx) + .instrument(test_span.clone()) .await?, ZERO_PAGE ); @@ -1708,6 +1726,7 @@ mod tests { assert_eq!( tline .get_rel_page_at_lsn(TESTREL_A, 1500, Version::Lsn(Lsn(0x80)), &ctx) + .instrument(test_span.clone()) .await?, test_img("foo blk 1500") ); @@ -1815,6 +1834,11 @@ mod tests { } m.commit(&ctx).await?; + let test_span = tracing::info_span!(parent: None, "test", + tenant_id=%tline.tenant_shard_id.tenant_id, + shard_id=%tline.tenant_shard_id.shard_slug(), + timeline_id=%tline.timeline_id); + // The relation was created at LSN 20, not visible at LSN 1 yet. assert_eq!( tline @@ -1847,6 +1871,7 @@ mod tests { assert_eq!( tline .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(lsn), &ctx) + .instrument(test_span.clone()) .await?, test_img(&data) ); @@ -1874,6 +1899,7 @@ mod tests { assert_eq!( tline .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x60)), &ctx) + .instrument(test_span.clone()) .await?, test_img(&data) ); @@ -1892,6 +1918,7 @@ mod tests { assert_eq!( tline .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x50)), &ctx) + .instrument(test_span.clone()) .await?, test_img(&data) ); @@ -1928,6 +1955,7 @@ mod tests { assert_eq!( tline .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x80)), &ctx) + .instrument(test_span.clone()) .await?, test_img(&data) ); diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index f207ed61f9..51b9f58bbc 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -421,9 +421,7 @@ _PG_init(void) pg_init_libpagestore(); pg_init_walproposer(); - WalSender_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines; - LogicalFuncs_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines; - SlotFuncs_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines; + 
Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines; InitUnstableExtensionsSupport(); InitLogicalReplicationMonitor(); diff --git a/pgxn/neon_walredo/walredoproc.c b/pgxn/neon_walredo/walredoproc.c index 37abb3fa03..619b7255ae 100644 --- a/pgxn/neon_walredo/walredoproc.c +++ b/pgxn/neon_walredo/walredoproc.c @@ -453,7 +453,6 @@ WalRedoMain(int argc, char *argv[]) static void CreateFakeSharedMemoryAndSemaphores(void) { - PGShmemHeader *shim = NULL; PGShmemHeader *hdr; Size size; int numSemas; @@ -486,7 +485,6 @@ CreateFakeSharedMemoryAndSemaphores(void) hdr->totalsize = size; hdr->freeoffset = MAXALIGN(sizeof(PGShmemHeader)); - shim = hdr; UsedShmemSegAddr = hdr; UsedShmemSegID = (unsigned long) 42; /* not relevant for non-shared memory */ } @@ -523,8 +521,6 @@ CreateFakeSharedMemoryAndSemaphores(void) */ InitShmemIndex(); - dsm_shmem_init(); - /* * Set up xlog, clog, and buffers */ @@ -599,10 +595,6 @@ CreateFakeSharedMemoryAndSemaphores(void) ShmemBackendArrayAllocation(); #endif - /* Initialize dynamic shared memory facilities. */ - if (!IsUnderPostmaster) - dsm_postmaster_startup(shim); - /* * Now give loadable modules a chance to set up their shmem allocations */ diff --git a/poetry.lock b/poetry.lock index d869761e8e..e2fca7be47 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand. 
[[package]] name = "aiohappyeyeballs" @@ -13,97 +13,111 @@ files = [ [[package]] name = "aiohttp" -version = "3.10.2" +version = "3.10.11" description = "Async http client/server framework (asyncio)" optional = false python-versions = ">=3.8" files = [ - {file = "aiohttp-3.10.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:95213b3d79c7e387144e9cb7b9d2809092d6ff2c044cb59033aedc612f38fb6d"}, - {file = "aiohttp-3.10.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1aa005f060aff7124cfadaa2493f00a4e28ed41b232add5869e129a2e395935a"}, - {file = "aiohttp-3.10.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:eabe6bf4c199687592f5de4ccd383945f485779c7ffb62a9b9f1f8a3f9756df8"}, - {file = "aiohttp-3.10.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:96e010736fc16d21125c7e2dc5c350cd43c528b85085c04bf73a77be328fe944"}, - {file = "aiohttp-3.10.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:99f81f9c1529fd8e03be4a7bd7df32d14b4f856e90ef6e9cbad3415dbfa9166c"}, - {file = "aiohttp-3.10.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d611d1a01c25277bcdea06879afbc11472e33ce842322496b211319aa95441bb"}, - {file = "aiohttp-3.10.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e00191d38156e09e8c81ef3d75c0d70d4f209b8381e71622165f22ef7da6f101"}, - {file = "aiohttp-3.10.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:74c091a5ded6cb81785de2d7a8ab703731f26de910dbe0f3934eabef4ae417cc"}, - {file = "aiohttp-3.10.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:18186a80ec5a701816adbf1d779926e1069392cf18504528d6e52e14b5920525"}, - {file = "aiohttp-3.10.2-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:5a7ceb2a0d2280f23a02c64cd0afdc922079bb950400c3dd13a1ab2988428aac"}, - {file = "aiohttp-3.10.2-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = 
"sha256:8bd7be6ff6c162a60cb8fce65ee879a684fbb63d5466aba3fa5b9288eb04aefa"}, - {file = "aiohttp-3.10.2-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:fae962b62944eaebff4f4fddcf1a69de919e7b967136a318533d82d93c3c6bd1"}, - {file = "aiohttp-3.10.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:a0fde16d284efcacbe15fb0c1013f0967b6c3e379649239d783868230bf1db42"}, - {file = "aiohttp-3.10.2-cp310-cp310-win32.whl", hash = "sha256:f81cd85a0e76ec7b8e2b6636fe02952d35befda4196b8c88f3cec5b4fb512839"}, - {file = "aiohttp-3.10.2-cp310-cp310-win_amd64.whl", hash = "sha256:54ba10eb5a3481c28282eb6afb5f709aedf53cf9c3a31875ffbdc9fc719ffd67"}, - {file = "aiohttp-3.10.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:87fab7f948e407444c2f57088286e00e2ed0003ceaf3d8f8cc0f60544ba61d91"}, - {file = "aiohttp-3.10.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ec6ad66ed660d46503243cbec7b2b3d8ddfa020f984209b3b8ef7d98ce69c3f2"}, - {file = "aiohttp-3.10.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a4be88807283bd96ae7b8e401abde4ca0bab597ba73b5e9a2d98f36d451e9aac"}, - {file = "aiohttp-3.10.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:01c98041f90927c2cbd72c22a164bb816fa3010a047d264969cf82e1d4bcf8d1"}, - {file = "aiohttp-3.10.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:54e36c67e1a9273ecafab18d6693da0fb5ac48fd48417e4548ac24a918c20998"}, - {file = "aiohttp-3.10.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7de3ddb6f424af54535424082a1b5d1ae8caf8256ebd445be68c31c662354720"}, - {file = "aiohttp-3.10.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7dd9c7db94b4692b827ce51dcee597d61a0e4f4661162424faf65106775b40e7"}, - {file = "aiohttp-3.10.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e57e21e1167705f8482ca29cc5d02702208d8bf4aff58f766d94bcd6ead838cd"}, - {file = 
"aiohttp-3.10.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:a1a50e59b720060c29e2951fd9f13c01e1ea9492e5a527b92cfe04dd64453c16"}, - {file = "aiohttp-3.10.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:686c87782481fda5ee6ba572d912a5c26d9f98cc5c243ebd03f95222af3f1b0f"}, - {file = "aiohttp-3.10.2-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:dafb4abb257c0ed56dc36f4e928a7341b34b1379bd87e5a15ce5d883c2c90574"}, - {file = "aiohttp-3.10.2-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:494a6f77560e02bd7d1ab579fdf8192390567fc96a603f21370f6e63690b7f3d"}, - {file = "aiohttp-3.10.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:6fe8503b1b917508cc68bf44dae28823ac05e9f091021e0c41f806ebbb23f92f"}, - {file = "aiohttp-3.10.2-cp311-cp311-win32.whl", hash = "sha256:4ddb43d06ce786221c0dfd3c91b4892c318eaa36b903f7c4278e7e2fa0dd5102"}, - {file = "aiohttp-3.10.2-cp311-cp311-win_amd64.whl", hash = "sha256:ca2f5abcb0a9a47e56bac173c01e9f6c6e7f27534d91451c5f22e6a35a5a2093"}, - {file = "aiohttp-3.10.2-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:14eb6b17f6246959fb0b035d4f4ae52caa870c4edfb6170aad14c0de5bfbf478"}, - {file = "aiohttp-3.10.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:465e445ec348d4e4bd349edd8b22db75f025da9d7b6dc1369c48e7935b85581e"}, - {file = "aiohttp-3.10.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:341f8ece0276a828d95b70cd265d20e257f5132b46bf77d759d7f4e0443f2906"}, - {file = "aiohttp-3.10.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c01fbb87b5426381cd9418b3ddcf4fc107e296fa2d3446c18ce6c76642f340a3"}, - {file = "aiohttp-3.10.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2c474af073e1a6763e1c5522bbb2d85ff8318197e4c6c919b8d7886e16213345"}, - {file = "aiohttp-3.10.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d9076810a5621236e29b2204e67a68e1fe317c8727ee4c9abbfbb1083b442c38"}, - {file = 
"aiohttp-3.10.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e8f515d6859e673940e08de3922b9c4a2249653b0ac181169313bd6e4b1978ac"}, - {file = "aiohttp-3.10.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:655e583afc639bef06f3b2446972c1726007a21003cd0ef57116a123e44601bc"}, - {file = "aiohttp-3.10.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8da9449a575133828cc99985536552ea2dcd690e848f9d41b48d8853a149a959"}, - {file = "aiohttp-3.10.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:19073d57d0feb1865d12361e2a1f5a49cb764bf81a4024a3b608ab521568093a"}, - {file = "aiohttp-3.10.2-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:c8e98e1845805f184d91fda6f9ab93d7c7b0dddf1c07e0255924bfdb151a8d05"}, - {file = "aiohttp-3.10.2-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:377220a5efde6f9497c5b74649b8c261d3cce8a84cb661be2ed8099a2196400a"}, - {file = "aiohttp-3.10.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:92f7f4a4dc9cdb5980973a74d43cdbb16286dacf8d1896b6c3023b8ba8436f8e"}, - {file = "aiohttp-3.10.2-cp312-cp312-win32.whl", hash = "sha256:9bb2834a6f11d65374ce97d366d6311a9155ef92c4f0cee543b2155d06dc921f"}, - {file = "aiohttp-3.10.2-cp312-cp312-win_amd64.whl", hash = "sha256:518dc3cb37365255708283d1c1c54485bbacccd84f0a0fb87ed8917ba45eda5b"}, - {file = "aiohttp-3.10.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:7f98e70bbbf693086efe4b86d381efad8edac040b8ad02821453083d15ec315f"}, - {file = "aiohttp-3.10.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9f6f0b252a009e98fe84028a4ec48396a948e7a65b8be06ccfc6ef68cf1f614d"}, - {file = "aiohttp-3.10.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:9360e3ffc7b23565600e729e8c639c3c50d5520e05fdf94aa2bd859eef12c407"}, - {file = "aiohttp-3.10.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3988044d1635c7821dd44f0edfbe47e9875427464e59d548aece447f8c22800a"}, - {file = 
"aiohttp-3.10.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:30a9d59da1543a6f1478c3436fd49ec59be3868bca561a33778b4391005e499d"}, - {file = "aiohttp-3.10.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f9f49bdb94809ac56e09a310a62f33e5f22973d6fd351aac72a39cd551e98194"}, - {file = "aiohttp-3.10.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ddfd2dca3f11c365d6857a07e7d12985afc59798458a2fdb2ffa4a0332a3fd43"}, - {file = "aiohttp-3.10.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:685c1508ec97b2cd3e120bfe309a4ff8e852e8a7460f1ef1de00c2c0ed01e33c"}, - {file = "aiohttp-3.10.2-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:49904f38667c44c041a0b44c474b3ae36948d16a0398a8f8cd84e2bb3c42a069"}, - {file = "aiohttp-3.10.2-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:352f3a4e5f11f3241a49b6a48bc5b935fabc35d1165fa0d87f3ca99c1fcca98b"}, - {file = "aiohttp-3.10.2-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:fc61f39b534c5d5903490478a0dd349df397d2284a939aa3cbaa2fb7a19b8397"}, - {file = "aiohttp-3.10.2-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:ad2274e707be37420d0b6c3d26a8115295fe9d8e6e530fa6a42487a8ca3ad052"}, - {file = "aiohttp-3.10.2-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:c836bf3c7512100219fe1123743fd8dd9a2b50dd7cfb0c3bb10d041309acab4b"}, - {file = "aiohttp-3.10.2-cp38-cp38-win32.whl", hash = "sha256:53e8898adda402be03ff164b0878abe2d884e3ea03a4701e6ad55399d84b92dc"}, - {file = "aiohttp-3.10.2-cp38-cp38-win_amd64.whl", hash = "sha256:7cc8f65f5b22304693de05a245b6736b14cb5bc9c8a03da6e2ae9ef15f8b458f"}, - {file = "aiohttp-3.10.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:9dfc906d656e14004c5bc672399c1cccc10db38df2b62a13fb2b6e165a81c316"}, - {file = "aiohttp-3.10.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:91b10208b222ddf655c3a3d5b727879d7163db12b634492df41a9182a76edaae"}, - {file = 
"aiohttp-3.10.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9fd16b5e1a7bdd14668cd6bde60a2a29b49147a535c74f50d8177d11b38433a7"}, - {file = "aiohttp-3.10.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b2bfdda4971bd79201f59adbad24ec2728875237e1c83bba5221284dbbf57bda"}, - {file = "aiohttp-3.10.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:69d73f869cf29e8a373127fc378014e2b17bcfbe8d89134bc6fb06a2f67f3cb3"}, - {file = "aiohttp-3.10.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:df59f8486507c421c0620a2c3dce81fbf1d54018dc20ff4fecdb2c106d6e6abc"}, - {file = "aiohttp-3.10.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0df930015db36b460aa9badbf35eccbc383f00d52d4b6f3de2ccb57d064a6ade"}, - {file = "aiohttp-3.10.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:562b1153ab7f766ee6b8b357ec777a302770ad017cf18505d34f1c088fccc448"}, - {file = "aiohttp-3.10.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:d984db6d855de58e0fde1ef908d48fe9a634cadb3cf715962722b4da1c40619d"}, - {file = "aiohttp-3.10.2-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:14dc3fcb0d877911d775d511eb617a486a8c48afca0a887276e63db04d3ee920"}, - {file = "aiohttp-3.10.2-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:b52a27a5c97275e254704e1049f4b96a81e67d6205f52fa37a4777d55b0e98ef"}, - {file = "aiohttp-3.10.2-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:cd33d9de8cfd006a0d0fe85f49b4183c57e91d18ffb7e9004ce855e81928f704"}, - {file = "aiohttp-3.10.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:1238fc979160bc03a92fff9ad021375ff1c8799c6aacb0d8ea1b357ea40932bb"}, - {file = "aiohttp-3.10.2-cp39-cp39-win32.whl", hash = "sha256:e2f43d238eae4f0b04f58d4c0df4615697d4ca3e9f9b1963d49555a94f0f5a04"}, - {file = "aiohttp-3.10.2-cp39-cp39-win_amd64.whl", hash = 
"sha256:947847f07a8f81d7b39b2d0202fd73e61962ebe17ac2d8566f260679e467da7b"}, - {file = "aiohttp-3.10.2.tar.gz", hash = "sha256:4d1f694b5d6e459352e5e925a42e05bac66655bfde44d81c59992463d2897014"}, + {file = "aiohttp-3.10.11-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:5077b1a5f40ffa3ba1f40d537d3bec4383988ee51fbba6b74aa8fb1bc466599e"}, + {file = "aiohttp-3.10.11-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:8d6a14a4d93b5b3c2891fca94fa9d41b2322a68194422bef0dd5ec1e57d7d298"}, + {file = "aiohttp-3.10.11-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ffbfde2443696345e23a3c597049b1dd43049bb65337837574205e7368472177"}, + {file = "aiohttp-3.10.11-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:20b3d9e416774d41813bc02fdc0663379c01817b0874b932b81c7f777f67b217"}, + {file = "aiohttp-3.10.11-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2b943011b45ee6bf74b22245c6faab736363678e910504dd7531a58c76c9015a"}, + {file = "aiohttp-3.10.11-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:48bc1d924490f0d0b3658fe5c4b081a4d56ebb58af80a6729d4bd13ea569797a"}, + {file = "aiohttp-3.10.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e12eb3f4b1f72aaaf6acd27d045753b18101524f72ae071ae1c91c1cd44ef115"}, + {file = "aiohttp-3.10.11-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f14ebc419a568c2eff3c1ed35f634435c24ead2fe19c07426af41e7adb68713a"}, + {file = "aiohttp-3.10.11-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:72b191cdf35a518bfc7ca87d770d30941decc5aaf897ec8b484eb5cc8c7706f3"}, + {file = "aiohttp-3.10.11-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:5ab2328a61fdc86424ee540d0aeb8b73bbcad7351fb7cf7a6546fc0bcffa0038"}, + {file = "aiohttp-3.10.11-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:aa93063d4af05c49276cf14e419550a3f45258b6b9d1f16403e777f1addf4519"}, + {file = 
"aiohttp-3.10.11-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:30283f9d0ce420363c24c5c2421e71a738a2155f10adbb1a11a4d4d6d2715cfc"}, + {file = "aiohttp-3.10.11-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:e5358addc8044ee49143c546d2182c15b4ac3a60be01c3209374ace05af5733d"}, + {file = "aiohttp-3.10.11-cp310-cp310-win32.whl", hash = "sha256:e1ffa713d3ea7cdcd4aea9cddccab41edf6882fa9552940344c44e59652e1120"}, + {file = "aiohttp-3.10.11-cp310-cp310-win_amd64.whl", hash = "sha256:778cbd01f18ff78b5dd23c77eb82987ee4ba23408cbed233009fd570dda7e674"}, + {file = "aiohttp-3.10.11-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:80ff08556c7f59a7972b1e8919f62e9c069c33566a6d28586771711e0eea4f07"}, + {file = "aiohttp-3.10.11-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:2c8f96e9ee19f04c4914e4e7a42a60861066d3e1abf05c726f38d9d0a466e695"}, + {file = "aiohttp-3.10.11-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:fb8601394d537da9221947b5d6e62b064c9a43e88a1ecd7414d21a1a6fba9c24"}, + {file = "aiohttp-3.10.11-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2ea224cf7bc2d8856d6971cea73b1d50c9c51d36971faf1abc169a0d5f85a382"}, + {file = "aiohttp-3.10.11-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:db9503f79e12d5d80b3efd4d01312853565c05367493379df76d2674af881caa"}, + {file = "aiohttp-3.10.11-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0f449a50cc33f0384f633894d8d3cd020e3ccef81879c6e6245c3c375c448625"}, + {file = "aiohttp-3.10.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:82052be3e6d9e0c123499127782a01a2b224b8af8c62ab46b3f6197035ad94e9"}, + {file = "aiohttp-3.10.11-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:20063c7acf1eec550c8eb098deb5ed9e1bb0521613b03bb93644b810986027ac"}, + {file = "aiohttp-3.10.11-cp311-cp311-musllinux_1_2_aarch64.whl", hash = 
"sha256:489cced07a4c11488f47aab1f00d0c572506883f877af100a38f1fedaa884c3a"}, + {file = "aiohttp-3.10.11-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:ea9b3bab329aeaa603ed3bf605f1e2a6f36496ad7e0e1aa42025f368ee2dc07b"}, + {file = "aiohttp-3.10.11-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:ca117819d8ad113413016cb29774b3f6d99ad23c220069789fc050267b786c16"}, + {file = "aiohttp-3.10.11-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:2dfb612dcbe70fb7cdcf3499e8d483079b89749c857a8f6e80263b021745c730"}, + {file = "aiohttp-3.10.11-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:f9b615d3da0d60e7d53c62e22b4fd1c70f4ae5993a44687b011ea3a2e49051b8"}, + {file = "aiohttp-3.10.11-cp311-cp311-win32.whl", hash = "sha256:29103f9099b6068bbdf44d6a3d090e0a0b2be6d3c9f16a070dd9d0d910ec08f9"}, + {file = "aiohttp-3.10.11-cp311-cp311-win_amd64.whl", hash = "sha256:236b28ceb79532da85d59aa9b9bf873b364e27a0acb2ceaba475dc61cffb6f3f"}, + {file = "aiohttp-3.10.11-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:7480519f70e32bfb101d71fb9a1f330fbd291655a4c1c922232a48c458c52710"}, + {file = "aiohttp-3.10.11-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:f65267266c9aeb2287a6622ee2bb39490292552f9fbf851baabc04c9f84e048d"}, + {file = "aiohttp-3.10.11-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:7400a93d629a0608dc1d6c55f1e3d6e07f7375745aaa8bd7f085571e4d1cee97"}, + {file = "aiohttp-3.10.11-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f34b97e4b11b8d4eb2c3a4f975be626cc8af99ff479da7de49ac2c6d02d35725"}, + {file = "aiohttp-3.10.11-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1e7b825da878464a252ccff2958838f9caa82f32a8dbc334eb9b34a026e2c636"}, + {file = "aiohttp-3.10.11-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f9f92a344c50b9667827da308473005f34767b6a2a60d9acff56ae94f895f385"}, + {file = "aiohttp-3.10.11-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:bc6f1ab987a27b83c5268a17218463c2ec08dbb754195113867a27b166cd6087"}, + {file = "aiohttp-3.10.11-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1dc0f4ca54842173d03322793ebcf2c8cc2d34ae91cc762478e295d8e361e03f"}, + {file = "aiohttp-3.10.11-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:7ce6a51469bfaacff146e59e7fb61c9c23006495d11cc24c514a455032bcfa03"}, + {file = "aiohttp-3.10.11-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:aad3cd91d484d065ede16f3cf15408254e2469e3f613b241a1db552c5eb7ab7d"}, + {file = "aiohttp-3.10.11-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:f4df4b8ca97f658c880fb4b90b1d1ec528315d4030af1ec763247ebfd33d8b9a"}, + {file = "aiohttp-3.10.11-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:2e4e18a0a2d03531edbc06c366954e40a3f8d2a88d2b936bbe78a0c75a3aab3e"}, + {file = "aiohttp-3.10.11-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:6ce66780fa1a20e45bc753cda2a149daa6dbf1561fc1289fa0c308391c7bc0a4"}, + {file = "aiohttp-3.10.11-cp312-cp312-win32.whl", hash = "sha256:a919c8957695ea4c0e7a3e8d16494e3477b86f33067478f43106921c2fef15bb"}, + {file = "aiohttp-3.10.11-cp312-cp312-win_amd64.whl", hash = "sha256:b5e29706e6389a2283a91611c91bf24f218962717c8f3b4e528ef529d112ee27"}, + {file = "aiohttp-3.10.11-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:703938e22434d7d14ec22f9f310559331f455018389222eed132808cd8f44127"}, + {file = "aiohttp-3.10.11-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:9bc50b63648840854e00084c2b43035a62e033cb9b06d8c22b409d56eb098413"}, + {file = "aiohttp-3.10.11-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:5f0463bf8b0754bc744e1feb61590706823795041e63edf30118a6f0bf577461"}, + {file = "aiohttp-3.10.11-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f6c6dec398ac5a87cb3a407b068e1106b20ef001c344e34154616183fe684288"}, + {file = 
"aiohttp-3.10.11-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bcaf2d79104d53d4dcf934f7ce76d3d155302d07dae24dff6c9fffd217568067"}, + {file = "aiohttp-3.10.11-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:25fd5470922091b5a9aeeb7e75be609e16b4fba81cdeaf12981393fb240dd10e"}, + {file = "aiohttp-3.10.11-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bbde2ca67230923a42161b1f408c3992ae6e0be782dca0c44cb3206bf330dee1"}, + {file = "aiohttp-3.10.11-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:249c8ff8d26a8b41a0f12f9df804e7c685ca35a207e2410adbd3e924217b9006"}, + {file = "aiohttp-3.10.11-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:878ca6a931ee8c486a8f7b432b65431d095c522cbeb34892bee5be97b3481d0f"}, + {file = "aiohttp-3.10.11-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:8663f7777ce775f0413324be0d96d9730959b2ca73d9b7e2c2c90539139cbdd6"}, + {file = "aiohttp-3.10.11-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:6cd3f10b01f0c31481fba8d302b61603a2acb37b9d30e1d14e0f5a58b7b18a31"}, + {file = "aiohttp-3.10.11-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:4e8d8aad9402d3aa02fdc5ca2fe68bcb9fdfe1f77b40b10410a94c7f408b664d"}, + {file = "aiohttp-3.10.11-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:38e3c4f80196b4f6c3a85d134a534a56f52da9cb8d8e7af1b79a32eefee73a00"}, + {file = "aiohttp-3.10.11-cp313-cp313-win32.whl", hash = "sha256:fc31820cfc3b2863c6e95e14fcf815dc7afe52480b4dc03393c4873bb5599f71"}, + {file = "aiohttp-3.10.11-cp313-cp313-win_amd64.whl", hash = "sha256:4996ff1345704ffdd6d75fb06ed175938c133425af616142e7187f28dc75f14e"}, + {file = "aiohttp-3.10.11-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:74baf1a7d948b3d640badeac333af581a367ab916b37e44cf90a0334157cdfd2"}, + {file = "aiohttp-3.10.11-cp38-cp38-macosx_10_9_x86_64.whl", hash = 
"sha256:473aebc3b871646e1940c05268d451f2543a1d209f47035b594b9d4e91ce8339"}, + {file = "aiohttp-3.10.11-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:c2f746a6968c54ab2186574e15c3f14f3e7f67aef12b761e043b33b89c5b5f95"}, + {file = "aiohttp-3.10.11-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d110cabad8360ffa0dec8f6ec60e43286e9d251e77db4763a87dcfe55b4adb92"}, + {file = "aiohttp-3.10.11-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e0099c7d5d7afff4202a0c670e5b723f7718810000b4abcbc96b064129e64bc7"}, + {file = "aiohttp-3.10.11-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0316e624b754dbbf8c872b62fe6dcb395ef20c70e59890dfa0de9eafccd2849d"}, + {file = "aiohttp-3.10.11-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5a5f7ab8baf13314e6b2485965cbacb94afff1e93466ac4d06a47a81c50f9cca"}, + {file = "aiohttp-3.10.11-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c891011e76041e6508cbfc469dd1a8ea09bc24e87e4c204e05f150c4c455a5fa"}, + {file = "aiohttp-3.10.11-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:9208299251370ee815473270c52cd3f7069ee9ed348d941d574d1457d2c73e8b"}, + {file = "aiohttp-3.10.11-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:459f0f32c8356e8125f45eeff0ecf2b1cb6db1551304972702f34cd9e6c44658"}, + {file = "aiohttp-3.10.11-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:14cdc8c1810bbd4b4b9f142eeee23cda528ae4e57ea0923551a9af4820980e39"}, + {file = "aiohttp-3.10.11-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:971aa438a29701d4b34e4943e91b5e984c3ae6ccbf80dd9efaffb01bd0b243a9"}, + {file = "aiohttp-3.10.11-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:9a309c5de392dfe0f32ee57fa43ed8fc6ddf9985425e84bd51ed66bb16bce3a7"}, + {file = "aiohttp-3.10.11-cp38-cp38-win32.whl", hash = "sha256:9ec1628180241d906a0840b38f162a3215114b14541f1a8711c368a8739a9be4"}, + {file = 
"aiohttp-3.10.11-cp38-cp38-win_amd64.whl", hash = "sha256:9c6e0ffd52c929f985c7258f83185d17c76d4275ad22e90aa29f38e211aacbec"}, + {file = "aiohttp-3.10.11-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:cdc493a2e5d8dc79b2df5bec9558425bcd39aff59fc949810cbd0832e294b106"}, + {file = "aiohttp-3.10.11-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b3e70f24e7d0405be2348da9d5a7836936bf3a9b4fd210f8c37e8d48bc32eca6"}, + {file = "aiohttp-3.10.11-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:968b8fb2a5eee2770eda9c7b5581587ef9b96fbdf8dcabc6b446d35ccc69df01"}, + {file = "aiohttp-3.10.11-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:deef4362af9493d1382ef86732ee2e4cbc0d7c005947bd54ad1a9a16dd59298e"}, + {file = "aiohttp-3.10.11-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:686b03196976e327412a1b094f4120778c7c4b9cff9bce8d2fdfeca386b89829"}, + {file = "aiohttp-3.10.11-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3bf6d027d9d1d34e1c2e1645f18a6498c98d634f8e373395221121f1c258ace8"}, + {file = "aiohttp-3.10.11-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:099fd126bf960f96d34a760e747a629c27fb3634da5d05c7ef4d35ef4ea519fc"}, + {file = "aiohttp-3.10.11-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c73c4d3dae0b4644bc21e3de546530531d6cdc88659cdeb6579cd627d3c206aa"}, + {file = "aiohttp-3.10.11-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:0c5580f3c51eea91559db3facd45d72e7ec970b04528b4709b1f9c2555bd6d0b"}, + {file = "aiohttp-3.10.11-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:fdf6429f0caabfd8a30c4e2eaecb547b3c340e4730ebfe25139779b9815ba138"}, + {file = "aiohttp-3.10.11-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:d97187de3c276263db3564bb9d9fad9e15b51ea10a371ffa5947a5ba93ad6777"}, + {file = "aiohttp-3.10.11-cp39-cp39-musllinux_1_2_s390x.whl", hash = 
"sha256:0acafb350cfb2eba70eb5d271f55e08bd4502ec35e964e18ad3e7d34d71f7261"}, + {file = "aiohttp-3.10.11-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:c13ed0c779911c7998a58e7848954bd4d63df3e3575f591e321b19a2aec8df9f"}, + {file = "aiohttp-3.10.11-cp39-cp39-win32.whl", hash = "sha256:22b7c540c55909140f63ab4f54ec2c20d2635c0289cdd8006da46f3327f971b9"}, + {file = "aiohttp-3.10.11-cp39-cp39-win_amd64.whl", hash = "sha256:7b26b1551e481012575dab8e3727b16fe7dd27eb2711d2e63ced7368756268fb"}, + {file = "aiohttp-3.10.11.tar.gz", hash = "sha256:9dc2b8f3dcab2e39e0fa309c8da50c3b55e6f34ab25f1a71d3288f24924d33a7"}, ] [package.dependencies] aiohappyeyeballs = ">=2.3.0" aiosignal = ">=1.1.2" -async-timeout = {version = ">=4.0,<5.0", markers = "python_version < \"3.11\""} attrs = ">=17.3.0" frozenlist = ">=1.1.1" multidict = ">=4.5,<7.0" -yarl = ">=1.0,<2.0" +yarl = ">=1.12.0,<2.0" [package.extras] speedups = ["Brotli", "aiodns (>=3.2.0)", "brotlicffi"] @@ -204,10 +218,8 @@ files = [ ] [package.dependencies] -exceptiongroup = {version = ">=1.0.2", markers = "python_version < \"3.11\""} idna = ">=2.8" sniffio = ">=1.1" -typing-extensions = {version = ">=4.1", markers = "python_version < \"3.11\""} [package.extras] doc = ["Sphinx (>=7)", "packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx-rtd-theme"] @@ -722,10 +734,7 @@ files = [ [package.dependencies] jmespath = ">=0.7.1,<2.0.0" python-dateutil = ">=2.1,<3.0.0" -urllib3 = [ - {version = ">=1.25.4,<1.27", markers = "python_version < \"3.10\""}, - {version = ">=1.25.4,<2.1", markers = "python_version >= \"3.10\""}, -] +urllib3 = {version = ">=1.25.4,<2.1", markers = "python_version >= \"3.10\""} [package.extras] crt = ["awscrt (==0.19.19)"] @@ -1054,20 +1063,6 @@ docs = ["myst-parser (==0.18.0)", "sphinx (==5.1.1)"] ssh = ["paramiko (>=2.4.3)"] websockets = ["websocket-client (>=1.3.0)"] -[[package]] -name = "exceptiongroup" -version = "1.1.1" -description = "Backport of PEP 654 (exception groups)" -optional = false 
-python-versions = ">=3.7" -files = [ - {file = "exceptiongroup-1.1.1-py3-none-any.whl", hash = "sha256:232c37c63e4f682982c8b6459f33a8981039e5fb8756b2074364e5055c498c9e"}, - {file = "exceptiongroup-1.1.1.tar.gz", hash = "sha256:d484c3090ba2889ae2928419117447a14daf3c1231d5e30d0aae34f354f01785"}, -] - -[package.extras] -test = ["pytest (>=6)"] - [[package]] name = "execnet" version = "1.9.0" @@ -1095,7 +1090,6 @@ files = [ [package.dependencies] click = ">=8.0" -importlib-metadata = {version = ">=3.6.0", markers = "python_version < \"3.10\""} itsdangerous = ">=2.0" Jinja2 = ">=3.0" Werkzeug = ">=2.2.2" @@ -1304,25 +1298,6 @@ files = [ {file = "idna-3.7.tar.gz", hash = "sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc"}, ] -[[package]] -name = "importlib-metadata" -version = "4.12.0" -description = "Read metadata from Python packages" -optional = false -python-versions = ">=3.7" -files = [ - {file = "importlib_metadata-4.12.0-py3-none-any.whl", hash = "sha256:7401a975809ea1fdc658c3aa4f78cc2195a0e019c5cbc4c06122884e9ae80c23"}, - {file = "importlib_metadata-4.12.0.tar.gz", hash = "sha256:637245b8bab2b6502fcbc752cc4b7a6f6243bb02b31c5c26156ad103d3d45670"}, -] - -[package.dependencies] -zipp = ">=0.5" - -[package.extras] -docs = ["jaraco.packaging (>=9)", "rst.linker (>=1.9)", "sphinx"] -perf = ["ipython"] -testing = ["flufl.flake8", "importlib-resources (>=1.3)", "packaging", "pyfakefs", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf (>=0.9.2)"] - [[package]] name = "iniconfig" version = "1.1.1" @@ -1883,48 +1858,54 @@ files = [ [[package]] name = "mypy" -version = "1.3.0" +version = "1.13.0" description = "Optional static typing for Python" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "mypy-1.3.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = 
"sha256:c1eb485cea53f4f5284e5baf92902cd0088b24984f4209e25981cc359d64448d"}, - {file = "mypy-1.3.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:4c99c3ecf223cf2952638da9cd82793d8f3c0c5fa8b6ae2b2d9ed1e1ff51ba85"}, - {file = "mypy-1.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:550a8b3a19bb6589679a7c3c31f64312e7ff482a816c96e0cecec9ad3a7564dd"}, - {file = "mypy-1.3.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:cbc07246253b9e3d7d74c9ff948cd0fd7a71afcc2b77c7f0a59c26e9395cb152"}, - {file = "mypy-1.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:a22435632710a4fcf8acf86cbd0d69f68ac389a3892cb23fbad176d1cddaf228"}, - {file = "mypy-1.3.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6e33bb8b2613614a33dff70565f4c803f889ebd2f859466e42b46e1df76018dd"}, - {file = "mypy-1.3.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7d23370d2a6b7a71dc65d1266f9a34e4cde9e8e21511322415db4b26f46f6b8c"}, - {file = "mypy-1.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:658fe7b674769a0770d4b26cb4d6f005e88a442fe82446f020be8e5f5efb2fae"}, - {file = "mypy-1.3.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:6e42d29e324cdda61daaec2336c42512e59c7c375340bd202efa1fe0f7b8f8ca"}, - {file = "mypy-1.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:d0b6c62206e04061e27009481cb0ec966f7d6172b5b936f3ead3d74f29fe3dcf"}, - {file = "mypy-1.3.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:76ec771e2342f1b558c36d49900dfe81d140361dd0d2df6cd71b3db1be155409"}, - {file = "mypy-1.3.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ebc95f8386314272bbc817026f8ce8f4f0d2ef7ae44f947c4664efac9adec929"}, - {file = "mypy-1.3.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:faff86aa10c1aa4a10e1a301de160f3d8fc8703b88c7e98de46b531ff1276a9a"}, - {file = "mypy-1.3.0-cp37-cp37m-win_amd64.whl", hash = "sha256:8c5979d0deb27e0f4479bee18ea0f83732a893e81b78e62e2dda3e7e518c92ee"}, - {file = 
"mypy-1.3.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:c5d2cc54175bab47011b09688b418db71403aefad07cbcd62d44010543fc143f"}, - {file = "mypy-1.3.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:87df44954c31d86df96c8bd6e80dfcd773473e877ac6176a8e29898bfb3501cb"}, - {file = "mypy-1.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:473117e310febe632ddf10e745a355714e771ffe534f06db40702775056614c4"}, - {file = "mypy-1.3.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:74bc9b6e0e79808bf8678d7678b2ae3736ea72d56eede3820bd3849823e7f305"}, - {file = "mypy-1.3.0-cp38-cp38-win_amd64.whl", hash = "sha256:44797d031a41516fcf5cbfa652265bb994e53e51994c1bd649ffcd0c3a7eccbf"}, - {file = "mypy-1.3.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:ddae0f39ca146972ff6bb4399f3b2943884a774b8771ea0a8f50e971f5ea5ba8"}, - {file = "mypy-1.3.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:1c4c42c60a8103ead4c1c060ac3cdd3ff01e18fddce6f1016e08939647a0e703"}, - {file = "mypy-1.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e86c2c6852f62f8f2b24cb7a613ebe8e0c7dc1402c61d36a609174f63e0ff017"}, - {file = "mypy-1.3.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:f9dca1e257d4cc129517779226753dbefb4f2266c4eaad610fc15c6a7e14283e"}, - {file = "mypy-1.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:95d8d31a7713510685b05fbb18d6ac287a56c8f6554d88c19e73f724a445448a"}, - {file = "mypy-1.3.0-py3-none-any.whl", hash = "sha256:a8763e72d5d9574d45ce5881962bc8e9046bf7b375b0abf031f3e6811732a897"}, - {file = "mypy-1.3.0.tar.gz", hash = "sha256:e1f4d16e296f5135624b34e8fb741eb0eadedca90862405b1f1fde2040b9bd11"}, + {file = "mypy-1.13.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:6607e0f1dd1fb7f0aca14d936d13fd19eba5e17e1cd2a14f808fa5f8f6d8f60a"}, + {file = "mypy-1.13.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8a21be69bd26fa81b1f80a61ee7ab05b076c674d9b18fb56239d72e21d9f4c80"}, + {file = 
"mypy-1.13.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7b2353a44d2179846a096e25691d54d59904559f4232519d420d64da6828a3a7"}, + {file = "mypy-1.13.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:0730d1c6a2739d4511dc4253f8274cdd140c55c32dfb0a4cf8b7a43f40abfa6f"}, + {file = "mypy-1.13.0-cp310-cp310-win_amd64.whl", hash = "sha256:c5fc54dbb712ff5e5a0fca797e6e0aa25726c7e72c6a5850cfd2adbc1eb0a372"}, + {file = "mypy-1.13.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:581665e6f3a8a9078f28d5502f4c334c0c8d802ef55ea0e7276a6e409bc0d82d"}, + {file = "mypy-1.13.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:3ddb5b9bf82e05cc9a627e84707b528e5c7caaa1c55c69e175abb15a761cec2d"}, + {file = "mypy-1.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:20c7ee0bc0d5a9595c46f38beb04201f2620065a93755704e141fcac9f59db2b"}, + {file = "mypy-1.13.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:3790ded76f0b34bc9c8ba4def8f919dd6a46db0f5a6610fb994fe8efdd447f73"}, + {file = "mypy-1.13.0-cp311-cp311-win_amd64.whl", hash = "sha256:51f869f4b6b538229c1d1bcc1dd7d119817206e2bc54e8e374b3dfa202defcca"}, + {file = "mypy-1.13.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:5c7051a3461ae84dfb5dd15eff5094640c61c5f22257c8b766794e6dd85e72d5"}, + {file = "mypy-1.13.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:39bb21c69a5d6342f4ce526e4584bc5c197fd20a60d14a8624d8743fffb9472e"}, + {file = "mypy-1.13.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:164f28cb9d6367439031f4c81e84d3ccaa1e19232d9d05d37cb0bd880d3f93c2"}, + {file = "mypy-1.13.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:a4c1bfcdbce96ff5d96fc9b08e3831acb30dc44ab02671eca5953eadad07d6d0"}, + {file = "mypy-1.13.0-cp312-cp312-win_amd64.whl", hash = "sha256:a0affb3a79a256b4183ba09811e3577c5163ed06685e4d4b46429a271ba174d2"}, + {file = 
"mypy-1.13.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:a7b44178c9760ce1a43f544e595d35ed61ac2c3de306599fa59b38a6048e1aa7"}, + {file = "mypy-1.13.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:5d5092efb8516d08440e36626f0153b5006d4088c1d663d88bf79625af3d1d62"}, + {file = "mypy-1.13.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:de2904956dac40ced10931ac967ae63c5089bd498542194b436eb097a9f77bc8"}, + {file = "mypy-1.13.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:7bfd8836970d33c2105562650656b6846149374dc8ed77d98424b40b09340ba7"}, + {file = "mypy-1.13.0-cp313-cp313-win_amd64.whl", hash = "sha256:9f73dba9ec77acb86457a8fc04b5239822df0c14a082564737833d2963677dbc"}, + {file = "mypy-1.13.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:100fac22ce82925f676a734af0db922ecfea991e1d7ec0ceb1e115ebe501301a"}, + {file = "mypy-1.13.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:7bcb0bb7f42a978bb323a7c88f1081d1b5dee77ca86f4100735a6f541299d8fb"}, + {file = "mypy-1.13.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bde31fc887c213e223bbfc34328070996061b0833b0a4cfec53745ed61f3519b"}, + {file = "mypy-1.13.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:07de989f89786f62b937851295ed62e51774722e5444a27cecca993fc3f9cd74"}, + {file = "mypy-1.13.0-cp38-cp38-win_amd64.whl", hash = "sha256:4bde84334fbe19bad704b3f5b78c4abd35ff1026f8ba72b29de70dda0916beb6"}, + {file = "mypy-1.13.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:0246bcb1b5de7f08f2826451abd947bf656945209b140d16ed317f65a17dc7dc"}, + {file = "mypy-1.13.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:7f5b7deae912cf8b77e990b9280f170381fdfbddf61b4ef80927edd813163732"}, + {file = "mypy-1.13.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7029881ec6ffb8bc233a4fa364736789582c738217b133f1b55967115288a2bc"}, + {file = 
"mypy-1.13.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:3e38b980e5681f28f033f3be86b099a247b13c491f14bb8b1e1e134d23bb599d"}, + {file = "mypy-1.13.0-cp39-cp39-win_amd64.whl", hash = "sha256:a6789be98a2017c912ae6ccb77ea553bbaf13d27605d2ca20a76dfbced631b24"}, + {file = "mypy-1.13.0-py3-none-any.whl", hash = "sha256:9c250883f9fd81d212e0952c92dbfcc96fc237f4b7c92f56ac81fd48460b3e5a"}, + {file = "mypy-1.13.0.tar.gz", hash = "sha256:0291a61b6fbf3e6673e3405cfcc0e7650bebc7939659fdca2702958038bd835e"}, ] [package.dependencies] mypy-extensions = ">=1.0.0" -tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} -typing-extensions = ">=3.10" +typing-extensions = ">=4.6.0" [package.extras] dmypy = ["psutil (>=4.0)"] +faster-cache = ["orjson"] install-types = ["pip"] -python2 = ["typed-ast (>=1.4.0,<2)"] +mypyc = ["setuptools (>=50)"] reports = ["lxml"] [[package]] @@ -2078,6 +2059,113 @@ files = [ [package.extras] twisted = ["twisted"] +[[package]] +name = "propcache" +version = "0.2.0" +description = "Accelerated property cache" +optional = false +python-versions = ">=3.8" +files = [ + {file = "propcache-0.2.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:c5869b8fd70b81835a6f187c5fdbe67917a04d7e52b6e7cc4e5fe39d55c39d58"}, + {file = "propcache-0.2.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:952e0d9d07609d9c5be361f33b0d6d650cd2bae393aabb11d9b719364521984b"}, + {file = "propcache-0.2.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:33ac8f098df0585c0b53009f039dfd913b38c1d2edafed0cedcc0c32a05aa110"}, + {file = "propcache-0.2.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:97e48e8875e6c13909c800fa344cd54cc4b2b0db1d5f911f840458a500fde2c2"}, + {file = "propcache-0.2.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:388f3217649d6d59292b722d940d4d2e1e6a7003259eb835724092a1cca0203a"}, + {file = "propcache-0.2.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:f571aea50ba5623c308aa146eb650eebf7dbe0fd8c5d946e28343cb3b5aad577"}, + {file = "propcache-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3dfafb44f7bb35c0c06eda6b2ab4bfd58f02729e7c4045e179f9a861b07c9850"}, + {file = "propcache-0.2.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a3ebe9a75be7ab0b7da2464a77bb27febcb4fab46a34f9288f39d74833db7f61"}, + {file = "propcache-0.2.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:d2f0d0f976985f85dfb5f3d685697ef769faa6b71993b46b295cdbbd6be8cc37"}, + {file = "propcache-0.2.0-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:a3dc1a4b165283bd865e8f8cb5f0c64c05001e0718ed06250d8cac9bec115b48"}, + {file = "propcache-0.2.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:9e0f07b42d2a50c7dd2d8675d50f7343d998c64008f1da5fef888396b7f84630"}, + {file = "propcache-0.2.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:e63e3e1e0271f374ed489ff5ee73d4b6e7c60710e1f76af5f0e1a6117cd26394"}, + {file = "propcache-0.2.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:56bb5c98f058a41bb58eead194b4db8c05b088c93d94d5161728515bd52b052b"}, + {file = "propcache-0.2.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:7665f04d0c7f26ff8bb534e1c65068409bf4687aa2534faf7104d7182debb336"}, + {file = "propcache-0.2.0-cp310-cp310-win32.whl", hash = "sha256:7cf18abf9764746b9c8704774d8b06714bcb0a63641518a3a89c7f85cc02c2ad"}, + {file = "propcache-0.2.0-cp310-cp310-win_amd64.whl", hash = "sha256:cfac69017ef97db2438efb854edf24f5a29fd09a536ff3a992b75990720cdc99"}, + {file = "propcache-0.2.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:63f13bf09cc3336eb04a837490b8f332e0db41da66995c9fd1ba04552e516354"}, + {file = "propcache-0.2.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:608cce1da6f2672a56b24a015b42db4ac612ee709f3d29f27a00c943d9e851de"}, + {file = "propcache-0.2.0-cp311-cp311-macosx_11_0_arm64.whl", hash = 
"sha256:466c219deee4536fbc83c08d09115249db301550625c7fef1c5563a584c9bc87"}, + {file = "propcache-0.2.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fc2db02409338bf36590aa985a461b2c96fce91f8e7e0f14c50c5fcc4f229016"}, + {file = "propcache-0.2.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a6ed8db0a556343d566a5c124ee483ae113acc9a557a807d439bcecc44e7dfbb"}, + {file = "propcache-0.2.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:91997d9cb4a325b60d4e3f20967f8eb08dfcb32b22554d5ef78e6fd1dda743a2"}, + {file = "propcache-0.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4c7dde9e533c0a49d802b4f3f218fa9ad0a1ce21f2c2eb80d5216565202acab4"}, + {file = "propcache-0.2.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ffcad6c564fe6b9b8916c1aefbb37a362deebf9394bd2974e9d84232e3e08504"}, + {file = "propcache-0.2.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:97a58a28bcf63284e8b4d7b460cbee1edaab24634e82059c7b8c09e65284f178"}, + {file = "propcache-0.2.0-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:945db8ee295d3af9dbdbb698cce9bbc5c59b5c3fe328bbc4387f59a8a35f998d"}, + {file = "propcache-0.2.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:39e104da444a34830751715f45ef9fc537475ba21b7f1f5b0f4d71a3b60d7fe2"}, + {file = "propcache-0.2.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:c5ecca8f9bab618340c8e848d340baf68bcd8ad90a8ecd7a4524a81c1764b3db"}, + {file = "propcache-0.2.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:c436130cc779806bdf5d5fae0d848713105472b8566b75ff70048c47d3961c5b"}, + {file = "propcache-0.2.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:191db28dc6dcd29d1a3e063c3be0b40688ed76434622c53a284e5427565bbd9b"}, + {file = "propcache-0.2.0-cp311-cp311-win32.whl", hash = "sha256:5f2564ec89058ee7c7989a7b719115bdfe2a2fb8e7a4543b8d1c0cc4cf6478c1"}, + {file 
= "propcache-0.2.0-cp311-cp311-win_amd64.whl", hash = "sha256:6e2e54267980349b723cff366d1e29b138b9a60fa376664a157a342689553f71"}, + {file = "propcache-0.2.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:2ee7606193fb267be4b2e3b32714f2d58cad27217638db98a60f9efb5efeccc2"}, + {file = "propcache-0.2.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:91ee8fc02ca52e24bcb77b234f22afc03288e1dafbb1f88fe24db308910c4ac7"}, + {file = "propcache-0.2.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2e900bad2a8456d00a113cad8c13343f3b1f327534e3589acc2219729237a2e8"}, + {file = "propcache-0.2.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f52a68c21363c45297aca15561812d542f8fc683c85201df0bebe209e349f793"}, + {file = "propcache-0.2.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1e41d67757ff4fbc8ef2af99b338bfb955010444b92929e9e55a6d4dcc3c4f09"}, + {file = "propcache-0.2.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a64e32f8bd94c105cc27f42d3b658902b5bcc947ece3c8fe7bc1b05982f60e89"}, + {file = "propcache-0.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:55346705687dbd7ef0d77883ab4f6fabc48232f587925bdaf95219bae072491e"}, + {file = "propcache-0.2.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:00181262b17e517df2cd85656fcd6b4e70946fe62cd625b9d74ac9977b64d8d9"}, + {file = "propcache-0.2.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6994984550eaf25dd7fc7bd1b700ff45c894149341725bb4edc67f0ffa94efa4"}, + {file = "propcache-0.2.0-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:56295eb1e5f3aecd516d91b00cfd8bf3a13991de5a479df9e27dd569ea23959c"}, + {file = "propcache-0.2.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:439e76255daa0f8151d3cb325f6dd4a3e93043e6403e6491813bcaaaa8733887"}, + {file = "propcache-0.2.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = 
"sha256:f6475a1b2ecb310c98c28d271a30df74f9dd436ee46d09236a6b750a7599ce57"}, + {file = "propcache-0.2.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:3444cdba6628accf384e349014084b1cacd866fbb88433cd9d279d90a54e0b23"}, + {file = "propcache-0.2.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:4a9d9b4d0a9b38d1c391bb4ad24aa65f306c6f01b512e10a8a34a2dc5675d348"}, + {file = "propcache-0.2.0-cp312-cp312-win32.whl", hash = "sha256:69d3a98eebae99a420d4b28756c8ce6ea5a29291baf2dc9ff9414b42676f61d5"}, + {file = "propcache-0.2.0-cp312-cp312-win_amd64.whl", hash = "sha256:ad9c9b99b05f163109466638bd30ada1722abb01bbb85c739c50b6dc11f92dc3"}, + {file = "propcache-0.2.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:ecddc221a077a8132cf7c747d5352a15ed763b674c0448d811f408bf803d9ad7"}, + {file = "propcache-0.2.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:0e53cb83fdd61cbd67202735e6a6687a7b491c8742dfc39c9e01e80354956763"}, + {file = "propcache-0.2.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:92fe151145a990c22cbccf9ae15cae8ae9eddabfc949a219c9f667877e40853d"}, + {file = "propcache-0.2.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d6a21ef516d36909931a2967621eecb256018aeb11fc48656e3257e73e2e247a"}, + {file = "propcache-0.2.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3f88a4095e913f98988f5b338c1d4d5d07dbb0b6bad19892fd447484e483ba6b"}, + {file = "propcache-0.2.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5a5b3bb545ead161be780ee85a2b54fdf7092815995661947812dde94a40f6fb"}, + {file = "propcache-0.2.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:67aeb72e0f482709991aa91345a831d0b707d16b0257e8ef88a2ad246a7280bf"}, + {file = "propcache-0.2.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3c997f8c44ec9b9b0bcbf2d422cc00a1d9b9c681f56efa6ca149a941e5560da2"}, + {file = 
"propcache-0.2.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:2a66df3d4992bc1d725b9aa803e8c5a66c010c65c741ad901e260ece77f58d2f"}, + {file = "propcache-0.2.0-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:3ebbcf2a07621f29638799828b8d8668c421bfb94c6cb04269130d8de4fb7136"}, + {file = "propcache-0.2.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:1235c01ddaa80da8235741e80815ce381c5267f96cc49b1477fdcf8c047ef325"}, + {file = "propcache-0.2.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:3947483a381259c06921612550867b37d22e1df6d6d7e8361264b6d037595f44"}, + {file = "propcache-0.2.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:d5bed7f9805cc29c780f3aee05de3262ee7ce1f47083cfe9f77471e9d6777e83"}, + {file = "propcache-0.2.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:e4a91d44379f45f5e540971d41e4626dacd7f01004826a18cb048e7da7e96544"}, + {file = "propcache-0.2.0-cp313-cp313-win32.whl", hash = "sha256:f902804113e032e2cdf8c71015651c97af6418363bea8d78dc0911d56c335032"}, + {file = "propcache-0.2.0-cp313-cp313-win_amd64.whl", hash = "sha256:8f188cfcc64fb1266f4684206c9de0e80f54622c3f22a910cbd200478aeae61e"}, + {file = "propcache-0.2.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:53d1bd3f979ed529f0805dd35ddaca330f80a9a6d90bc0121d2ff398f8ed8861"}, + {file = "propcache-0.2.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:83928404adf8fb3d26793665633ea79b7361efa0287dfbd372a7e74311d51ee6"}, + {file = "propcache-0.2.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:77a86c261679ea5f3896ec060be9dc8e365788248cc1e049632a1be682442063"}, + {file = "propcache-0.2.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:218db2a3c297a3768c11a34812e63b3ac1c3234c3a086def9c0fee50d35add1f"}, + {file = "propcache-0.2.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7735e82e3498c27bcb2d17cb65d62c14f1100b71723b68362872bca7d0913d90"}, + {file = 
"propcache-0.2.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:20a617c776f520c3875cf4511e0d1db847a076d720714ae35ffe0df3e440be68"}, + {file = "propcache-0.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:67b69535c870670c9f9b14a75d28baa32221d06f6b6fa6f77a0a13c5a7b0a5b9"}, + {file = "propcache-0.2.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4569158070180c3855e9c0791c56be3ceeb192defa2cdf6a3f39e54319e56b89"}, + {file = "propcache-0.2.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:db47514ffdbd91ccdc7e6f8407aac4ee94cc871b15b577c1c324236b013ddd04"}, + {file = "propcache-0.2.0-cp38-cp38-musllinux_1_2_armv7l.whl", hash = "sha256:2a60ad3e2553a74168d275a0ef35e8c0a965448ffbc3b300ab3a5bb9956c2162"}, + {file = "propcache-0.2.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:662dd62358bdeaca0aee5761de8727cfd6861432e3bb828dc2a693aa0471a563"}, + {file = "propcache-0.2.0-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:25a1f88b471b3bc911d18b935ecb7115dff3a192b6fef46f0bfaf71ff4f12418"}, + {file = "propcache-0.2.0-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:f60f0ac7005b9f5a6091009b09a419ace1610e163fa5deaba5ce3484341840e7"}, + {file = "propcache-0.2.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:74acd6e291f885678631b7ebc85d2d4aec458dd849b8c841b57ef04047833bed"}, + {file = "propcache-0.2.0-cp38-cp38-win32.whl", hash = "sha256:d9b6ddac6408194e934002a69bcaadbc88c10b5f38fb9307779d1c629181815d"}, + {file = "propcache-0.2.0-cp38-cp38-win_amd64.whl", hash = "sha256:676135dcf3262c9c5081cc8f19ad55c8a64e3f7282a21266d05544450bffc3a5"}, + {file = "propcache-0.2.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:25c8d773a62ce0451b020c7b29a35cfbc05de8b291163a7a0f3b7904f27253e6"}, + {file = "propcache-0.2.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:375a12d7556d462dc64d70475a9ee5982465fbb3d2b364f16b86ba9135793638"}, + {file = 
"propcache-0.2.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:1ec43d76b9677637a89d6ab86e1fef70d739217fefa208c65352ecf0282be957"}, + {file = "propcache-0.2.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f45eec587dafd4b2d41ac189c2156461ebd0c1082d2fe7013571598abb8505d1"}, + {file = "propcache-0.2.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bc092ba439d91df90aea38168e11f75c655880c12782facf5cf9c00f3d42b562"}, + {file = "propcache-0.2.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fa1076244f54bb76e65e22cb6910365779d5c3d71d1f18b275f1dfc7b0d71b4d"}, + {file = "propcache-0.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:682a7c79a2fbf40f5dbb1eb6bfe2cd865376deeac65acf9beb607505dced9e12"}, + {file = "propcache-0.2.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8e40876731f99b6f3c897b66b803c9e1c07a989b366c6b5b475fafd1f7ba3fb8"}, + {file = "propcache-0.2.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:363ea8cd3c5cb6679f1c2f5f1f9669587361c062e4899fce56758efa928728f8"}, + {file = "propcache-0.2.0-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:140fbf08ab3588b3468932974a9331aff43c0ab8a2ec2c608b6d7d1756dbb6cb"}, + {file = "propcache-0.2.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:e70fac33e8b4ac63dfc4c956fd7d85a0b1139adcfc0d964ce288b7c527537fea"}, + {file = "propcache-0.2.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:b33d7a286c0dc1a15f5fc864cc48ae92a846df287ceac2dd499926c3801054a6"}, + {file = "propcache-0.2.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:f6d5749fdd33d90e34c2efb174c7e236829147a2713334d708746e94c4bde40d"}, + {file = "propcache-0.2.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:22aa8f2272d81d9317ff5756bb108021a056805ce63dd3630e27d042c8092798"}, + {file = "propcache-0.2.0-cp39-cp39-win32.whl", hash = 
"sha256:73e4b40ea0eda421b115248d7e79b59214411109a5bc47d0d48e4c73e3b8fcf9"}, + {file = "propcache-0.2.0-cp39-cp39-win_amd64.whl", hash = "sha256:9517d5e9e0731957468c29dbfd0f976736a0e55afaea843726e887f36fe017df"}, + {file = "propcache-0.2.0-py3-none-any.whl", hash = "sha256:2ccc28197af5313706511fab3a8b66dcd6da067a1331372c82ea1cb74285e036"}, + {file = "propcache-0.2.0.tar.gz", hash = "sha256:df81779732feb9d01e5d513fad0122efb3d53bbc75f61b2a4f29a020bc985e70"}, +] + [[package]] name = "psutil" version = "5.9.4" @@ -2392,11 +2480,9 @@ files = [ [package.dependencies] colorama = {version = "*", markers = "sys_platform == \"win32\""} -exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} iniconfig = "*" packaging = "*" pluggy = ">=0.12,<2.0" -tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} [package.extras] testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] @@ -2459,10 +2545,7 @@ files = [ ] [package.dependencies] -pytest = [ - {version = ">=5.0", markers = "python_version < \"3.10\""}, - {version = ">=6.2.4", markers = "python_version >= \"3.10\""}, -] +pytest = {version = ">=6.2.4", markers = "python_version >= \"3.10\""} [[package]] name = "pytest-repeat" @@ -2970,17 +3053,6 @@ files = [ {file = "toml-0.10.2.tar.gz", hash = "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"}, ] -[[package]] -name = "tomli" -version = "2.0.1" -description = "A lil' TOML parser" -optional = false -python-versions = ">=3.7" -files = [ - {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, - {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, -] - [[package]] name = "types-jwcrypto" version = "1.5.0.20240925" @@ -3237,16 +3309,6 @@ files = [ {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", 
hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"}, {file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"}, {file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"}, - {file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"}, - {file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"}, - {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"}, - {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"}, - {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"}, - {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"}, - {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"}, - {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"}, - {file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"}, - {file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = 
"sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"}, @@ -3307,121 +3369,99 @@ files = [ [[package]] name = "yarl" -version = "1.9.4" +version = "1.17.2" description = "Yet another URL library" optional = false -python-versions = ">=3.7" +python-versions = ">=3.9" files = [ - {file = "yarl-1.9.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a8c1df72eb746f4136fe9a2e72b0c9dc1da1cbd23b5372f94b5820ff8ae30e0e"}, - {file = "yarl-1.9.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:a3a6ed1d525bfb91b3fc9b690c5a21bb52de28c018530ad85093cc488bee2dd2"}, - {file = "yarl-1.9.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c38c9ddb6103ceae4e4498f9c08fac9b590c5c71b0370f98714768e22ac6fa66"}, - {file = "yarl-1.9.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d9e09c9d74f4566e905a0b8fa668c58109f7624db96a2171f21747abc7524234"}, - {file = "yarl-1.9.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b8477c1ee4bd47c57d49621a062121c3023609f7a13b8a46953eb6c9716ca392"}, - {file = "yarl-1.9.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d5ff2c858f5f6a42c2a8e751100f237c5e869cbde669a724f2062d4c4ef93551"}, - {file = "yarl-1.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:357495293086c5b6d34ca9616a43d329317feab7917518bc97a08f9e55648455"}, - {file = "yarl-1.9.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:54525ae423d7b7a8ee81ba189f131054defdb122cde31ff17477951464c1691c"}, - {file = "yarl-1.9.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = 
"sha256:801e9264d19643548651b9db361ce3287176671fb0117f96b5ac0ee1c3530d53"}, - {file = "yarl-1.9.4-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:e516dc8baf7b380e6c1c26792610230f37147bb754d6426462ab115a02944385"}, - {file = "yarl-1.9.4-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:7d5aaac37d19b2904bb9dfe12cdb08c8443e7ba7d2852894ad448d4b8f442863"}, - {file = "yarl-1.9.4-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:54beabb809ffcacbd9d28ac57b0db46e42a6e341a030293fb3185c409e626b8b"}, - {file = "yarl-1.9.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:bac8d525a8dbc2a1507ec731d2867025d11ceadcb4dd421423a5d42c56818541"}, - {file = "yarl-1.9.4-cp310-cp310-win32.whl", hash = "sha256:7855426dfbddac81896b6e533ebefc0af2f132d4a47340cee6d22cac7190022d"}, - {file = "yarl-1.9.4-cp310-cp310-win_amd64.whl", hash = "sha256:848cd2a1df56ddbffeb375535fb62c9d1645dde33ca4d51341378b3f5954429b"}, - {file = "yarl-1.9.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:35a2b9396879ce32754bd457d31a51ff0a9d426fd9e0e3c33394bf4b9036b099"}, - {file = "yarl-1.9.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4c7d56b293cc071e82532f70adcbd8b61909eec973ae9d2d1f9b233f3d943f2c"}, - {file = "yarl-1.9.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d8a1c6c0be645c745a081c192e747c5de06e944a0d21245f4cf7c05e457c36e0"}, - {file = "yarl-1.9.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4b3c1ffe10069f655ea2d731808e76e0f452fc6c749bea04781daf18e6039525"}, - {file = "yarl-1.9.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:549d19c84c55d11687ddbd47eeb348a89df9cb30e1993f1b128f4685cd0ebbf8"}, - {file = "yarl-1.9.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a7409f968456111140c1c95301cadf071bd30a81cbd7ab829169fb9e3d72eae9"}, - {file = "yarl-1.9.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:e23a6d84d9d1738dbc6e38167776107e63307dfc8ad108e580548d1f2c587f42"}, - {file = "yarl-1.9.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d8b889777de69897406c9fb0b76cdf2fd0f31267861ae7501d93003d55f54fbe"}, - {file = "yarl-1.9.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:03caa9507d3d3c83bca08650678e25364e1843b484f19986a527630ca376ecce"}, - {file = "yarl-1.9.4-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:4e9035df8d0880b2f1c7f5031f33f69e071dfe72ee9310cfc76f7b605958ceb9"}, - {file = "yarl-1.9.4-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:c0ec0ed476f77db9fb29bca17f0a8fcc7bc97ad4c6c1d8959c507decb22e8572"}, - {file = "yarl-1.9.4-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:ee04010f26d5102399bd17f8df8bc38dc7ccd7701dc77f4a68c5b8d733406958"}, - {file = "yarl-1.9.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49a180c2e0743d5d6e0b4d1a9e5f633c62eca3f8a86ba5dd3c471060e352ca98"}, - {file = "yarl-1.9.4-cp311-cp311-win32.whl", hash = "sha256:81eb57278deb6098a5b62e88ad8281b2ba09f2f1147c4767522353eaa6260b31"}, - {file = "yarl-1.9.4-cp311-cp311-win_amd64.whl", hash = "sha256:d1d2532b340b692880261c15aee4dc94dd22ca5d61b9db9a8a361953d36410b1"}, - {file = "yarl-1.9.4-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:0d2454f0aef65ea81037759be5ca9947539667eecebca092733b2eb43c965a81"}, - {file = "yarl-1.9.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:44d8ffbb9c06e5a7f529f38f53eda23e50d1ed33c6c869e01481d3fafa6b8142"}, - {file = "yarl-1.9.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:aaaea1e536f98754a6e5c56091baa1b6ce2f2700cc4a00b0d49eca8dea471074"}, - {file = "yarl-1.9.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3777ce5536d17989c91696db1d459574e9a9bd37660ea7ee4d3344579bb6f129"}, - {file = "yarl-1.9.4-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:9fc5fc1eeb029757349ad26bbc5880557389a03fa6ada41703db5e068881e5f2"}, - {file = "yarl-1.9.4-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ea65804b5dc88dacd4a40279af0cdadcfe74b3e5b4c897aa0d81cf86927fee78"}, - {file = "yarl-1.9.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aa102d6d280a5455ad6a0f9e6d769989638718e938a6a0a2ff3f4a7ff8c62cc4"}, - {file = "yarl-1.9.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:09efe4615ada057ba2d30df871d2f668af661e971dfeedf0c159927d48bbeff0"}, - {file = "yarl-1.9.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:008d3e808d03ef28542372d01057fd09168419cdc8f848efe2804f894ae03e51"}, - {file = "yarl-1.9.4-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:6f5cb257bc2ec58f437da2b37a8cd48f666db96d47b8a3115c29f316313654ff"}, - {file = "yarl-1.9.4-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:992f18e0ea248ee03b5a6e8b3b4738850ae7dbb172cc41c966462801cbf62cf7"}, - {file = "yarl-1.9.4-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:0e9d124c191d5b881060a9e5060627694c3bdd1fe24c5eecc8d5d7d0eb6faabc"}, - {file = "yarl-1.9.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:3986b6f41ad22988e53d5778f91855dc0399b043fc8946d4f2e68af22ee9ff10"}, - {file = "yarl-1.9.4-cp312-cp312-win32.whl", hash = "sha256:4b21516d181cd77ebd06ce160ef8cc2a5e9ad35fb1c5930882baff5ac865eee7"}, - {file = "yarl-1.9.4-cp312-cp312-win_amd64.whl", hash = "sha256:a9bd00dc3bc395a662900f33f74feb3e757429e545d831eef5bb280252631984"}, - {file = "yarl-1.9.4-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:63b20738b5aac74e239622d2fe30df4fca4942a86e31bf47a81a0e94c14df94f"}, - {file = "yarl-1.9.4-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d7d7f7de27b8944f1fee2c26a88b4dabc2409d2fea7a9ed3df79b67277644e17"}, - {file = "yarl-1.9.4-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:c74018551e31269d56fab81a728f683667e7c28c04e807ba08f8c9e3bba32f14"}, - {file = "yarl-1.9.4-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ca06675212f94e7a610e85ca36948bb8fc023e458dd6c63ef71abfd482481aa5"}, - {file = "yarl-1.9.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5aef935237d60a51a62b86249839b51345f47564208c6ee615ed2a40878dccdd"}, - {file = "yarl-1.9.4-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2b134fd795e2322b7684155b7855cc99409d10b2e408056db2b93b51a52accc7"}, - {file = "yarl-1.9.4-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:d25039a474c4c72a5ad4b52495056f843a7ff07b632c1b92ea9043a3d9950f6e"}, - {file = "yarl-1.9.4-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:f7d6b36dd2e029b6bcb8a13cf19664c7b8e19ab3a58e0fefbb5b8461447ed5ec"}, - {file = "yarl-1.9.4-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:957b4774373cf6f709359e5c8c4a0af9f6d7875db657adb0feaf8d6cb3c3964c"}, - {file = "yarl-1.9.4-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:d7eeb6d22331e2fd42fce928a81c697c9ee2d51400bd1a28803965883e13cead"}, - {file = "yarl-1.9.4-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:6a962e04b8f91f8c4e5917e518d17958e3bdee71fd1d8b88cdce74dd0ebbf434"}, - {file = "yarl-1.9.4-cp37-cp37m-win32.whl", hash = "sha256:f3bc6af6e2b8f92eced34ef6a96ffb248e863af20ef4fde9448cc8c9b858b749"}, - {file = "yarl-1.9.4-cp37-cp37m-win_amd64.whl", hash = "sha256:ad4d7a90a92e528aadf4965d685c17dacff3df282db1121136c382dc0b6014d2"}, - {file = "yarl-1.9.4-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:ec61d826d80fc293ed46c9dd26995921e3a82146feacd952ef0757236fc137be"}, - {file = "yarl-1.9.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:8be9e837ea9113676e5754b43b940b50cce76d9ed7d2461df1af39a8ee674d9f"}, - {file = "yarl-1.9.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:bef596fdaa8f26e3d66af846bbe77057237cb6e8efff8cd7cc8dff9a62278bbf"}, - 
{file = "yarl-1.9.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2d47552b6e52c3319fede1b60b3de120fe83bde9b7bddad11a69fb0af7db32f1"}, - {file = "yarl-1.9.4-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:84fc30f71689d7fc9168b92788abc977dc8cefa806909565fc2951d02f6b7d57"}, - {file = "yarl-1.9.4-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4aa9741085f635934f3a2583e16fcf62ba835719a8b2b28fb2917bb0537c1dfa"}, - {file = "yarl-1.9.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:206a55215e6d05dbc6c98ce598a59e6fbd0c493e2de4ea6cc2f4934d5a18d130"}, - {file = "yarl-1.9.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:07574b007ee20e5c375a8fe4a0789fad26db905f9813be0f9fef5a68080de559"}, - {file = "yarl-1.9.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:5a2e2433eb9344a163aced6a5f6c9222c0786e5a9e9cac2c89f0b28433f56e23"}, - {file = "yarl-1.9.4-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:6ad6d10ed9b67a382b45f29ea028f92d25bc0bc1daf6c5b801b90b5aa70fb9ec"}, - {file = "yarl-1.9.4-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:6fe79f998a4052d79e1c30eeb7d6c1c1056ad33300f682465e1b4e9b5a188b78"}, - {file = "yarl-1.9.4-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:a825ec844298c791fd28ed14ed1bffc56a98d15b8c58a20e0e08c1f5f2bea1be"}, - {file = "yarl-1.9.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:8619d6915b3b0b34420cf9b2bb6d81ef59d984cb0fde7544e9ece32b4b3043c3"}, - {file = "yarl-1.9.4-cp38-cp38-win32.whl", hash = "sha256:686a0c2f85f83463272ddffd4deb5e591c98aac1897d65e92319f729c320eece"}, - {file = "yarl-1.9.4-cp38-cp38-win_amd64.whl", hash = "sha256:a00862fb23195b6b8322f7d781b0dc1d82cb3bcac346d1e38689370cc1cc398b"}, - {file = "yarl-1.9.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:604f31d97fa493083ea21bd9b92c419012531c4e17ea6da0f65cacdcf5d0bd27"}, - {file = 
"yarl-1.9.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:8a854227cf581330ffa2c4824d96e52ee621dd571078a252c25e3a3b3d94a1b1"}, - {file = "yarl-1.9.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ba6f52cbc7809cd8d74604cce9c14868306ae4aa0282016b641c661f981a6e91"}, - {file = "yarl-1.9.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a6327976c7c2f4ee6816eff196e25385ccc02cb81427952414a64811037bbc8b"}, - {file = "yarl-1.9.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8397a3817d7dcdd14bb266283cd1d6fc7264a48c186b986f32e86d86d35fbac5"}, - {file = "yarl-1.9.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e0381b4ce23ff92f8170080c97678040fc5b08da85e9e292292aba67fdac6c34"}, - {file = "yarl-1.9.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:23d32a2594cb5d565d358a92e151315d1b2268bc10f4610d098f96b147370136"}, - {file = "yarl-1.9.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ddb2a5c08a4eaaba605340fdee8fc08e406c56617566d9643ad8bf6852778fc7"}, - {file = "yarl-1.9.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:26a1dc6285e03f3cc9e839a2da83bcbf31dcb0d004c72d0730e755b33466c30e"}, - {file = "yarl-1.9.4-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:18580f672e44ce1238b82f7fb87d727c4a131f3a9d33a5e0e82b793362bf18b4"}, - {file = "yarl-1.9.4-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:29e0f83f37610f173eb7e7b5562dd71467993495e568e708d99e9d1944f561ec"}, - {file = "yarl-1.9.4-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:1f23e4fe1e8794f74b6027d7cf19dc25f8b63af1483d91d595d4a07eca1fb26c"}, - {file = "yarl-1.9.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:db8e58b9d79200c76956cefd14d5c90af54416ff5353c5bfd7cbe58818e26ef0"}, - {file = "yarl-1.9.4-cp39-cp39-win32.whl", hash = "sha256:c7224cab95645c7ab53791022ae77a4509472613e839dab722a72abe5a684575"}, - {file = 
"yarl-1.9.4-cp39-cp39-win_amd64.whl", hash = "sha256:824d6c50492add5da9374875ce72db7a0733b29c2394890aef23d533106e2b15"}, - {file = "yarl-1.9.4-py3-none-any.whl", hash = "sha256:928cecb0ef9d5a7946eb6ff58417ad2fe9375762382f1bf5c55e61645f2c43ad"}, - {file = "yarl-1.9.4.tar.gz", hash = "sha256:566db86717cf8080b99b58b083b773a908ae40f06681e87e589a976faf8246bf"}, + {file = "yarl-1.17.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:93771146ef048b34201bfa382c2bf74c524980870bb278e6df515efaf93699ff"}, + {file = "yarl-1.17.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:8281db240a1616af2f9c5f71d355057e73a1409c4648c8949901396dc0a3c151"}, + {file = "yarl-1.17.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:170ed4971bf9058582b01a8338605f4d8c849bd88834061e60e83b52d0c76870"}, + {file = "yarl-1.17.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bc61b005f6521fcc00ca0d1243559a5850b9dd1e1fe07b891410ee8fe192d0c0"}, + {file = "yarl-1.17.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:871e1b47eec7b6df76b23c642a81db5dd6536cbef26b7e80e7c56c2fd371382e"}, + {file = "yarl-1.17.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3a58a2f2ca7aaf22b265388d40232f453f67a6def7355a840b98c2d547bd037f"}, + {file = "yarl-1.17.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:736bb076f7299c5c55dfef3eb9e96071a795cb08052822c2bb349b06f4cb2e0a"}, + {file = "yarl-1.17.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8fd51299e21da709eabcd5b2dd60e39090804431292daacbee8d3dabe39a6bc0"}, + {file = "yarl-1.17.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:358dc7ddf25e79e1cc8ee16d970c23faee84d532b873519c5036dbb858965795"}, + {file = "yarl-1.17.2-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:50d866f7b1a3f16f98603e095f24c0eeba25eb508c85a2c5939c8b3870ba2df8"}, + {file = 
"yarl-1.17.2-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:8b9c4643e7d843a0dca9cd9d610a0876e90a1b2cbc4c5ba7930a0d90baf6903f"}, + {file = "yarl-1.17.2-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:d63123bfd0dce5f91101e77c8a5427c3872501acece8c90df457b486bc1acd47"}, + {file = "yarl-1.17.2-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:4e76381be3d8ff96a4e6c77815653063e87555981329cf8f85e5be5abf449021"}, + {file = "yarl-1.17.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:734144cd2bd633a1516948e477ff6c835041c0536cef1d5b9a823ae29899665b"}, + {file = "yarl-1.17.2-cp310-cp310-win32.whl", hash = "sha256:26bfb6226e0c157af5da16d2d62258f1ac578d2899130a50433ffee4a5dfa673"}, + {file = "yarl-1.17.2-cp310-cp310-win_amd64.whl", hash = "sha256:76499469dcc24759399accd85ec27f237d52dec300daaca46a5352fcbebb1071"}, + {file = "yarl-1.17.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:792155279dc093839e43f85ff7b9b6493a8eaa0af1f94f1f9c6e8f4de8c63500"}, + {file = "yarl-1.17.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:38bc4ed5cae853409cb193c87c86cd0bc8d3a70fd2268a9807217b9176093ac6"}, + {file = "yarl-1.17.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4a8c83f6fcdc327783bdc737e8e45b2e909b7bd108c4da1892d3bc59c04a6d84"}, + {file = "yarl-1.17.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8c6d5fed96f0646bfdf698b0a1cebf32b8aae6892d1bec0c5d2d6e2df44e1e2d"}, + {file = "yarl-1.17.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:782ca9c58f5c491c7afa55518542b2b005caedaf4685ec814fadfcee51f02493"}, + {file = "yarl-1.17.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ff6af03cac0d1a4c3c19e5dcc4c05252411bf44ccaa2485e20d0a7c77892ab6e"}, + {file = "yarl-1.17.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6a3f47930fbbed0f6377639503848134c4aa25426b08778d641491131351c2c8"}, + {file = 
"yarl-1.17.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d1fa68a3c921365c5745b4bd3af6221ae1f0ea1bf04b69e94eda60e57958907f"}, + {file = "yarl-1.17.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:187df91395c11e9f9dc69b38d12406df85aa5865f1766a47907b1cc9855b6303"}, + {file = "yarl-1.17.2-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:93d1c8cc5bf5df401015c5e2a3ce75a5254a9839e5039c881365d2a9dcfc6dc2"}, + {file = "yarl-1.17.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:11d86c6145ac5c706c53d484784cf504d7d10fa407cb73b9d20f09ff986059ef"}, + {file = "yarl-1.17.2-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:c42774d1d1508ec48c3ed29e7b110e33f5e74a20957ea16197dbcce8be6b52ba"}, + {file = "yarl-1.17.2-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:0c8e589379ef0407b10bed16cc26e7392ef8f86961a706ade0a22309a45414d7"}, + {file = "yarl-1.17.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:1056cadd5e850a1c026f28e0704ab0a94daaa8f887ece8dfed30f88befb87bb0"}, + {file = "yarl-1.17.2-cp311-cp311-win32.whl", hash = "sha256:be4c7b1c49d9917c6e95258d3d07f43cfba2c69a6929816e77daf322aaba6628"}, + {file = "yarl-1.17.2-cp311-cp311-win_amd64.whl", hash = "sha256:ac8eda86cc75859093e9ce390d423aba968f50cf0e481e6c7d7d63f90bae5c9c"}, + {file = "yarl-1.17.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:dd90238d3a77a0e07d4d6ffdebc0c21a9787c5953a508a2231b5f191455f31e9"}, + {file = "yarl-1.17.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:c74f0b0472ac40b04e6d28532f55cac8090e34c3e81f118d12843e6df14d0909"}, + {file = "yarl-1.17.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4d486ddcaca8c68455aa01cf53d28d413fb41a35afc9f6594a730c9779545876"}, + {file = "yarl-1.17.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f25b7e93f5414b9a983e1a6c1820142c13e1782cc9ed354c25e933aebe97fcf2"}, + {file = 
"yarl-1.17.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3a0baff7827a632204060f48dca9e63fbd6a5a0b8790c1a2adfb25dc2c9c0d50"}, + {file = "yarl-1.17.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:460024cacfc3246cc4d9f47a7fc860e4fcea7d1dc651e1256510d8c3c9c7cde0"}, + {file = "yarl-1.17.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5870d620b23b956f72bafed6a0ba9a62edb5f2ef78a8849b7615bd9433384171"}, + {file = "yarl-1.17.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2941756754a10e799e5b87e2319bbec481ed0957421fba0e7b9fb1c11e40509f"}, + {file = "yarl-1.17.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:9611b83810a74a46be88847e0ea616794c406dbcb4e25405e52bff8f4bee2d0a"}, + {file = "yarl-1.17.2-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:cd7e35818d2328b679a13268d9ea505c85cd773572ebb7a0da7ccbca77b6a52e"}, + {file = "yarl-1.17.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:6b981316fcd940f085f646b822c2ff2b8b813cbd61281acad229ea3cbaabeb6b"}, + {file = "yarl-1.17.2-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:688058e89f512fb7541cb85c2f149c292d3fa22f981d5a5453b40c5da49eb9e8"}, + {file = "yarl-1.17.2-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:56afb44a12b0864d17b597210d63a5b88915d680f6484d8d202ed68ade38673d"}, + {file = "yarl-1.17.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:17931dfbb84ae18b287279c1f92b76a3abcd9a49cd69b92e946035cff06bcd20"}, + {file = "yarl-1.17.2-cp312-cp312-win32.whl", hash = "sha256:ff8d95e06546c3a8c188f68040e9d0360feb67ba8498baf018918f669f7bc39b"}, + {file = "yarl-1.17.2-cp312-cp312-win_amd64.whl", hash = "sha256:4c840cc11163d3c01a9d8aad227683c48cd3e5be5a785921bcc2a8b4b758c4f3"}, + {file = "yarl-1.17.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:3294f787a437cb5d81846de3a6697f0c35ecff37a932d73b1fe62490bef69211"}, + {file = 
"yarl-1.17.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f1e7fedb09c059efee2533119666ca7e1a2610072076926fa028c2ba5dfeb78c"}, + {file = "yarl-1.17.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:da9d3061e61e5ae3f753654813bc1cd1c70e02fb72cf871bd6daf78443e9e2b1"}, + {file = "yarl-1.17.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:91c012dceadc695ccf69301bfdccd1fc4472ad714fe2dd3c5ab4d2046afddf29"}, + {file = "yarl-1.17.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f11fd61d72d93ac23718d393d2a64469af40be2116b24da0a4ca6922df26807e"}, + {file = "yarl-1.17.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:46c465ad06971abcf46dd532f77560181387b4eea59084434bdff97524444032"}, + {file = "yarl-1.17.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ef6eee1a61638d29cd7c85f7fd3ac7b22b4c0fabc8fd00a712b727a3e73b0685"}, + {file = "yarl-1.17.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4434b739a8a101a837caeaa0137e0e38cb4ea561f39cb8960f3b1e7f4967a3fc"}, + {file = "yarl-1.17.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:752485cbbb50c1e20908450ff4f94217acba9358ebdce0d8106510859d6eb19a"}, + {file = "yarl-1.17.2-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:17791acaa0c0f89323c57da7b9a79f2174e26d5debbc8c02d84ebd80c2b7bff8"}, + {file = "yarl-1.17.2-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:5c6ea72fe619fee5e6b5d4040a451d45d8175f560b11b3d3e044cd24b2720526"}, + {file = "yarl-1.17.2-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:db5ac3871ed76340210fe028f535392f097fb31b875354bcb69162bba2632ef4"}, + {file = "yarl-1.17.2-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:7a1606ba68e311576bcb1672b2a1543417e7e0aa4c85e9e718ba6466952476c0"}, + {file = "yarl-1.17.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = 
"sha256:9bc27dd5cfdbe3dc7f381b05e6260ca6da41931a6e582267d5ca540270afeeb2"}, + {file = "yarl-1.17.2-cp313-cp313-win32.whl", hash = "sha256:52492b87d5877ec405542f43cd3da80bdcb2d0c2fbc73236526e5f2c28e6db28"}, + {file = "yarl-1.17.2-cp313-cp313-win_amd64.whl", hash = "sha256:8e1bf59e035534ba4077f5361d8d5d9194149f9ed4f823d1ee29ef3e8964ace3"}, + {file = "yarl-1.17.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:c556fbc6820b6e2cda1ca675c5fa5589cf188f8da6b33e9fc05b002e603e44fa"}, + {file = "yarl-1.17.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:f2f44a4247461965fed18b2573f3a9eb5e2c3cad225201ee858726cde610daca"}, + {file = "yarl-1.17.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3a3ede8c248f36b60227eb777eac1dbc2f1022dc4d741b177c4379ca8e75571a"}, + {file = "yarl-1.17.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2654caaf5584449d49c94a6b382b3cb4a246c090e72453493ea168b931206a4d"}, + {file = "yarl-1.17.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0d41c684f286ce41fa05ab6af70f32d6da1b6f0457459a56cf9e393c1c0b2217"}, + {file = "yarl-1.17.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2270d590997445a0dc29afa92e5534bfea76ba3aea026289e811bf9ed4b65a7f"}, + {file = "yarl-1.17.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:18662443c6c3707e2fc7fad184b4dc32dd428710bbe72e1bce7fe1988d4aa654"}, + {file = "yarl-1.17.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:75ac158560dec3ed72f6d604c81090ec44529cfb8169b05ae6fcb3e986b325d9"}, + {file = "yarl-1.17.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:1fee66b32e79264f428dc8da18396ad59cc48eef3c9c13844adec890cd339db5"}, + {file = "yarl-1.17.2-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:585ce7cd97be8f538345de47b279b879e091c8b86d9dbc6d98a96a7ad78876a3"}, + {file = "yarl-1.17.2-cp39-cp39-musllinux_1_2_i686.whl", hash = 
"sha256:c019abc2eca67dfa4d8fb72ba924871d764ec3c92b86d5b53b405ad3d6aa56b0"}, + {file = "yarl-1.17.2-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:c6e659b9a24d145e271c2faf3fa6dd1fcb3e5d3f4e17273d9e0350b6ab0fe6e2"}, + {file = "yarl-1.17.2-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:d17832ba39374134c10e82d137e372b5f7478c4cceeb19d02ae3e3d1daed8721"}, + {file = "yarl-1.17.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:bc3003710e335e3f842ae3fd78efa55f11a863a89a72e9a07da214db3bf7e1f8"}, + {file = "yarl-1.17.2-cp39-cp39-win32.whl", hash = "sha256:f5ffc6b7ace5b22d9e73b2a4c7305740a339fbd55301d52735f73e21d9eb3130"}, + {file = "yarl-1.17.2-cp39-cp39-win_amd64.whl", hash = "sha256:48e424347a45568413deec6f6ee2d720de2cc0385019bedf44cd93e8638aa0ed"}, + {file = "yarl-1.17.2-py3-none-any.whl", hash = "sha256:dd7abf4f717e33b7487121faf23560b3a50924f80e4bef62b22dab441ded8f3b"}, + {file = "yarl-1.17.2.tar.gz", hash = "sha256:753eaaa0c7195244c84b5cc159dc8204b7fd99f716f11198f999f2332a86b178"}, ] [package.dependencies] idna = ">=2.0" multidict = ">=4.0" - -[[package]] -name = "zipp" -version = "3.19.1" -description = "Backport of pathlib-compatible object wrapper for zip files" -optional = false -python-versions = ">=3.8" -files = [ - {file = "zipp-3.19.1-py3-none-any.whl", hash = "sha256:2828e64edb5386ea6a52e7ba7cdb17bb30a73a858f5eb6eb93d8d36f5ea26091"}, - {file = "zipp-3.19.1.tar.gz", hash = "sha256:35427f6d5594f4acf82d25541438348c26736fa9b3afa2754bcd63cdb99d8e8f"}, -] - -[package.extras] -doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] -test = ["big-O", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-ignore-flaky", "pytest-mypy", "pytest-ruff (>=0.2.1)"] +propcache = ">=0.2.0" [[package]] name = "zstandard" @@ -3483,5 +3523,5 @@ cffi = ["cffi (>=1.11)"] [metadata] 
lock-version = "2.0" -python-versions = "^3.9" -content-hash = "c656496f9fbb7c29b2df3143c1d72c95b5e121cb6340134c0b8d070f54a08508" +python-versions = "^3.11" +content-hash = "21debe1116843e5d14bdf37d6e265c68c63a98a64ba04ec8b8a02af2e8d9f486" diff --git a/proxy/src/auth/backend/classic.rs b/proxy/src/auth/backend/classic.rs index 6d26c99832..491b272ac4 100644 --- a/proxy/src/auth/backend/classic.rs +++ b/proxy/src/auth/backend/classic.rs @@ -1,17 +1,17 @@ use tokio::io::{AsyncRead, AsyncWrite}; -use tracing::{info, warn}; +use tracing::{debug, info, warn}; use super::{ComputeCredentials, ComputeUserInfo}; use crate::auth::backend::ComputeCredentialKeys; use crate::auth::{self, AuthFlow}; use crate::config::AuthenticationConfig; -use crate::context::RequestMonitoring; +use crate::context::RequestContext; use crate::control_plane::AuthSecret; use crate::stream::{PqStream, Stream}; use crate::{compute, sasl}; pub(super) async fn authenticate( - ctx: &RequestMonitoring, + ctx: &RequestContext, creds: ComputeUserInfo, client: &mut PqStream>, config: &'static AuthenticationConfig, @@ -21,11 +21,11 @@ pub(super) async fn authenticate( let scram_keys = match secret { #[cfg(any(test, feature = "testing"))] AuthSecret::Md5(_) => { - info!("auth endpoint chooses MD5"); + debug!("auth endpoint chooses MD5"); return Err(auth::AuthError::bad_auth_method("MD5")); } AuthSecret::Scram(secret) => { - info!("auth endpoint chooses SCRAM"); + debug!("auth endpoint chooses SCRAM"); let scram = auth::Scram(&secret, ctx); let auth_outcome = tokio::time::timeout( @@ -50,6 +50,8 @@ pub(super) async fn authenticate( let client_key = match auth_outcome { sasl::Outcome::Success(key) => key, sasl::Outcome::Failure(reason) => { + // TODO: warnings? + // TODO: should we get rid of this because double logging? 
info!("auth backend failed with an error: {reason}"); return Err(auth::AuthError::password_failed(&*creds.user)); } diff --git a/proxy/src/auth/backend/console_redirect.rs b/proxy/src/auth/backend/console_redirect.rs index e25dc3d45e..bf7a1cb070 100644 --- a/proxy/src/auth/backend/console_redirect.rs +++ b/proxy/src/auth/backend/console_redirect.rs @@ -6,9 +6,10 @@ use tokio_postgres::config::SslMode; use tracing::{info, info_span}; use super::ComputeCredentialKeys; +use crate::auth::IpPattern; use crate::cache::Cached; use crate::config::AuthenticationConfig; -use crate::context::RequestMonitoring; +use crate::context::RequestContext; use crate::control_plane::{self, CachedNodeInfo, NodeInfo}; use crate::error::{ReportableError, UserFacingError}; use crate::proxy::connect_compute::ComputeConnectBackend; @@ -71,13 +72,13 @@ impl ConsoleRedirectBackend { pub(crate) async fn authenticate( &self, - ctx: &RequestMonitoring, + ctx: &RequestContext, auth_config: &'static AuthenticationConfig, client: &mut PqStream, - ) -> auth::Result { + ) -> auth::Result<(ConsoleRedirectNodeInfo, Option>)> { authenticate(ctx, auth_config, &self.console_uri, client) .await - .map(ConsoleRedirectNodeInfo) + .map(|(node_info, ip_allowlist)| (ConsoleRedirectNodeInfo(node_info), ip_allowlist)) } } @@ -87,7 +88,7 @@ pub struct ConsoleRedirectNodeInfo(pub(super) NodeInfo); impl ComputeConnectBackend for ConsoleRedirectNodeInfo { async fn wake_compute( &self, - _ctx: &RequestMonitoring, + _ctx: &RequestContext, ) -> Result { Ok(Cached::new_uncached(self.0.clone())) } @@ -98,11 +99,11 @@ impl ComputeConnectBackend for ConsoleRedirectNodeInfo { } async fn authenticate( - ctx: &RequestMonitoring, + ctx: &RequestContext, auth_config: &'static AuthenticationConfig, link_uri: &reqwest::Url, client: &mut PqStream, -) -> auth::Result { +) -> auth::Result<(NodeInfo, Option>)> { ctx.set_auth_method(crate::context::AuthMethod::ConsoleRedirect); // registering waiter can fail if we get unlucky with rng. 
@@ -176,9 +177,12 @@ async fn authenticate( config.password(password.as_ref()); } - Ok(NodeInfo { - config, - aux: db_info.aux, - allow_self_signed_compute: false, // caller may override - }) + Ok(( + NodeInfo { + config, + aux: db_info.aux, + allow_self_signed_compute: false, // caller may override + }, + db_info.allowed_ips, + )) } diff --git a/proxy/src/auth/backend/hacks.rs b/proxy/src/auth/backend/hacks.rs index 1411d908a5..3316543022 100644 --- a/proxy/src/auth/backend/hacks.rs +++ b/proxy/src/auth/backend/hacks.rs @@ -4,7 +4,7 @@ use tracing::{debug, info}; use super::{ComputeCredentials, ComputeUserInfo, ComputeUserInfoNoEndpoint}; use crate::auth::{self, AuthFlow}; use crate::config::AuthenticationConfig; -use crate::context::RequestMonitoring; +use crate::context::RequestContext; use crate::control_plane::AuthSecret; use crate::intern::EndpointIdInt; use crate::sasl; @@ -15,7 +15,7 @@ use crate::stream::{self, Stream}; /// These properties are benefical for serverless JS workers, so we /// use this mechanism for websocket connections. pub(crate) async fn authenticate_cleartext( - ctx: &RequestMonitoring, + ctx: &RequestContext, info: ComputeUserInfo, client: &mut stream::PqStream>, secret: AuthSecret, @@ -57,7 +57,7 @@ pub(crate) async fn authenticate_cleartext( /// Similar to [`authenticate_cleartext`], but there's a specific password format, /// and passwords are not yet validated (we don't know how to validate them!) pub(crate) async fn password_hack_no_authentication( - ctx: &RequestMonitoring, + ctx: &RequestContext, info: ComputeUserInfoNoEndpoint, client: &mut stream::PqStream>, ) -> auth::Result<(ComputeUserInfo, Vec)> { @@ -73,7 +73,7 @@ pub(crate) async fn password_hack_no_authentication( .get_password() .await?; - info!(project = &*payload.endpoint, "received missing parameter"); + debug!(project = &*payload.endpoint, "received missing parameter"); // Report tentative success; compute node will check the password anyway. 
Ok(( diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs index bfc674139b..517d4fd34b 100644 --- a/proxy/src/auth/backend/jwt.rs +++ b/proxy/src/auth/backend/jwt.rs @@ -17,7 +17,7 @@ use thiserror::Error; use tokio::time::Instant; use crate::auth::backend::ComputeCredentialKeys; -use crate::context::RequestMonitoring; +use crate::context::RequestContext; use crate::control_plane::errors::GetEndpointJwksError; use crate::http::read_body_with_limit; use crate::intern::RoleNameInt; @@ -39,7 +39,7 @@ const JWKS_FETCH_RETRIES: u32 = 3; pub(crate) trait FetchAuthRules: Clone + Send + Sync + 'static { fn fetch_auth_rules( &self, - ctx: &RequestMonitoring, + ctx: &RequestContext, endpoint: EndpointId, ) -> impl Future, FetchAuthRulesError>> + Send; } @@ -132,6 +132,93 @@ struct JwkSet<'a> { keys: Vec<&'a RawValue>, } +/// Given a jwks_url, fetch the JWKS and parse out all the signing JWKs. +/// Returns `None` and log a warning if there are any errors. +async fn fetch_jwks( + client: &reqwest_middleware::ClientWithMiddleware, + jwks_url: url::Url, +) -> Option { + let req = client.get(jwks_url.clone()); + // TODO(conrad): We need to filter out URLs that point to local resources. Public internet only. + let resp = req.send().await.and_then(|r| { + r.error_for_status() + .map_err(reqwest_middleware::Error::Reqwest) + }); + + let resp = match resp { + Ok(r) => r, + // TODO: should we re-insert JWKs if we want to keep this JWKs URL? + // I expect these failures would be quite sparse. 
+ Err(e) => { + tracing::warn!(url=?jwks_url, error=?e, "could not fetch JWKs"); + return None; + } + }; + + let resp: http::Response = resp.into(); + + let bytes = match read_body_with_limit(resp.into_body(), MAX_JWK_BODY_SIZE).await { + Ok(bytes) => bytes, + Err(e) => { + tracing::warn!(url=?jwks_url, error=?e, "could not decode JWKs"); + return None; + } + }; + + let jwks = match serde_json::from_slice::(&bytes) { + Ok(jwks) => jwks, + Err(e) => { + tracing::warn!(url=?jwks_url, error=?e, "could not decode JWKs"); + return None; + } + }; + + // `jose_jwk::Jwk` is quite large (288 bytes). Let's not pre-allocate for what we don't need. + // + // Even though we limit our responses to 64KiB, we could still receive a payload like + // `{"keys":[` + repeat(`0`).take(30000).join(`,`) + `]}`. Parsing this as `RawValue` uses 468KiB. + // Pre-allocating the corresponding `Vec::::with_capacity(30000)` uses 8.2MiB. + let mut keys = vec![]; + + let mut failed = 0; + for key in jwks.keys { + let key = match serde_json::from_str::(key.get()) { + Ok(key) => key, + Err(e) => { + tracing::debug!(url=?jwks_url, failed=?e, "could not decode JWK"); + failed += 1; + continue; + } + }; + + // if `use` (called `cls` in rust) is specified to be something other than signing, + // we can skip storing it. 
+ if key + .prm + .cls + .as_ref() + .is_some_and(|c| *c != jose_jwk::Class::Signing) + { + continue; + } + + keys.push(key); + } + + keys.shrink_to_fit(); + + if failed > 0 { + tracing::warn!(url=?jwks_url, failed, "could not decode JWKs"); + } + + if keys.is_empty() { + tracing::warn!(url=?jwks_url, "no valid JWKs found inside the response body"); + return None; + } + + Some(jose_jwk::JwkSet { keys }) +} + impl JwkCacheEntryLock { async fn acquire_permit<'a>(self: &'a Arc) -> JwkRenewalPermit<'a> { JwkRenewalPermit::acquire_permit(self).await @@ -144,7 +231,7 @@ impl JwkCacheEntryLock { async fn renew_jwks( &self, _permit: JwkRenewalPermit<'_>, - ctx: &RequestMonitoring, + ctx: &RequestContext, client: &reqwest_middleware::ClientWithMiddleware, endpoint: EndpointId, auth_rules: &F, @@ -166,87 +253,15 @@ impl JwkCacheEntryLock { // TODO(conrad): run concurrently // TODO(conrad): strip the JWKs urls (should be checked by cplane as well - cloud#16284) for rule in rules { - let req = client.get(rule.jwks_url.clone()); - // TODO(conrad): eventually switch to using reqwest_middleware/`new_client_with_timeout`. - // TODO(conrad): We need to filter out URLs that point to local resources. Public internet only. - match req.send().await.and_then(|r| { - r.error_for_status() - .map_err(reqwest_middleware::Error::Reqwest) - }) { - // todo: should we re-insert JWKs if we want to keep this JWKs URL? - // I expect these failures would be quite sparse. 
- Err(e) => tracing::warn!(url=?rule.jwks_url, error=?e, "could not fetch JWKs"), - Ok(r) => { - let resp: http::Response = r.into(); - - let bytes = match read_body_with_limit(resp.into_body(), MAX_JWK_BODY_SIZE) - .await - { - Ok(bytes) => bytes, - Err(e) => { - tracing::warn!(url=?rule.jwks_url, error=?e, "could not decode JWKs"); - continue; - } - }; - - match serde_json::from_slice::(&bytes) { - Err(e) => { - tracing::warn!(url=?rule.jwks_url, error=?e, "could not decode JWKs"); - } - Ok(jwks) => { - // size_of::<&RawValue>() == 16 - // size_of::() == 288 - // better to not pre-allocate this as it might be pretty large - especially if it has many - // keys we don't want or need. - // trivial 'attack': `{"keys":[` + repeat(`0`).take(30000).join(`,`) + `]}` - // this would consume 8MiB just like that! - let mut keys = vec![]; - let mut failed = 0; - for key in jwks.keys { - match serde_json::from_str::(key.get()) { - Ok(key) => { - // if `use` (called `cls` in rust) is specified to be something other than signing, - // we can skip storing it. 
- if key - .prm - .cls - .as_ref() - .is_some_and(|c| *c != jose_jwk::Class::Signing) - { - continue; - } - - keys.push(key); - } - Err(e) => { - tracing::debug!(url=?rule.jwks_url, failed=?e, "could not decode JWK"); - failed += 1; - } - } - } - keys.shrink_to_fit(); - - if failed > 0 { - tracing::warn!(url=?rule.jwks_url, failed, "could not decode JWKs"); - } - - if keys.is_empty() { - tracing::warn!(url=?rule.jwks_url, "no valid JWKs found inside the response body"); - continue; - } - - let jwks = jose_jwk::JwkSet { keys }; - key_sets.insert( - rule.id, - KeySet { - jwks, - audience: rule.audience, - role_names: rule.role_names, - }, - ); - } - }; - } + if let Some(jwks) = fetch_jwks(client, rule.jwks_url).await { + key_sets.insert( + rule.id, + KeySet { + jwks, + audience: rule.audience, + role_names: rule.role_names, + }, + ); } } @@ -261,7 +276,7 @@ impl JwkCacheEntryLock { async fn get_or_update_jwk_cache( self: &Arc, - ctx: &RequestMonitoring, + ctx: &RequestContext, client: &reqwest_middleware::ClientWithMiddleware, endpoint: EndpointId, fetch: &F, @@ -314,7 +329,7 @@ impl JwkCacheEntryLock { async fn check_jwt( self: &Arc, - ctx: &RequestMonitoring, + ctx: &RequestContext, jwt: &str, client: &reqwest_middleware::ClientWithMiddleware, endpoint: EndpointId, @@ -409,7 +424,7 @@ impl JwkCacheEntryLock { impl JwkCache { pub(crate) async fn check_jwt( &self, - ctx: &RequestMonitoring, + ctx: &RequestContext, endpoint: EndpointId, role_name: &RoleName, fetch: &F, @@ -941,7 +956,7 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL impl FetchAuthRules for Fetch { async fn fetch_auth_rules( &self, - _ctx: &RequestMonitoring, + _ctx: &RequestContext, _endpoint: EndpointId, ) -> Result, FetchAuthRulesError> { Ok(self.0.clone()) @@ -1039,7 +1054,7 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL for token in &tokens { jwk_cache .check_jwt( - &RequestMonitoring::test(), + &RequestContext::test(), endpoint.clone(), role, &fetch, @@ 
-1097,7 +1112,7 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL jwk_cache .check_jwt( - &RequestMonitoring::test(), + &RequestContext::test(), endpoint.clone(), &role_name, &fetch, @@ -1136,7 +1151,7 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL let ep = EndpointId::from("ep"); - let ctx = RequestMonitoring::test(); + let ctx = RequestContext::test(); let err = jwk_cache .check_jwt(&ctx, ep, &role, &fetch, &bad_jwt) .await @@ -1175,7 +1190,7 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL // this role_name is not accepted let bad_role_name = RoleName::from("cloud_admin"); - let ctx = RequestMonitoring::test(); + let ctx = RequestContext::test(); let err = jwk_cache .check_jwt(&ctx, ep, &bad_role_name, &fetch, &jwt) .await @@ -1268,7 +1283,7 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL let ep = EndpointId::from("ep"); - let ctx = RequestMonitoring::test(); + let ctx = RequestContext::test(); for test in table { let jwt = new_custom_ec_jwt("1".into(), &key, test.body); @@ -1336,7 +1351,7 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL jwk_cache .check_jwt( - &RequestMonitoring::test(), + &RequestContext::test(), endpoint.clone(), &role_name, &fetch, diff --git a/proxy/src/auth/backend/local.rs b/proxy/src/auth/backend/local.rs index f9cb085daf..32e0f53615 100644 --- a/proxy/src/auth/backend/local.rs +++ b/proxy/src/auth/backend/local.rs @@ -7,7 +7,7 @@ use super::jwt::{AuthRule, FetchAuthRules}; use crate::auth::backend::jwt::FetchAuthRulesError; use crate::compute::ConnCfg; use crate::compute_ctl::ComputeCtlApi; -use crate::context::RequestMonitoring; +use crate::context::RequestContext; use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse, MetricsAuxInfo}; use crate::control_plane::NodeInfo; use crate::http; @@ -56,7 +56,7 @@ pub static JWKS_ROLE_MAP: ArcSwapOption = ArcSwapOption::c impl FetchAuthRules for StaticAuthRules { async fn 
fetch_auth_rules( &self, - _ctx: &RequestMonitoring, + _ctx: &RequestContext, _endpoint: EndpointId, ) -> Result, FetchAuthRulesError> { let mappings = JWKS_ROLE_MAP.load(); diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs index 242fe99de2..7e1b26a11a 100644 --- a/proxy/src/auth/backend/mod.rs +++ b/proxy/src/auth/backend/mod.rs @@ -6,7 +6,6 @@ pub mod local; use std::net::IpAddr; use std::sync::Arc; -use std::time::Duration; pub use console_redirect::ConsoleRedirectBackend; pub(crate) use console_redirect::ConsoleRedirectError; @@ -14,13 +13,13 @@ use ipnet::{Ipv4Net, Ipv6Net}; use local::LocalBackend; use tokio::io::{AsyncRead, AsyncWrite}; use tokio_postgres::config::AuthKeys; -use tracing::{info, warn}; +use tracing::{debug, info, warn}; use crate::auth::credentials::check_peer_addr_is_in_list; use crate::auth::{self, validate_password_and_exchange, AuthError, ComputeUserInfoMaybeEndpoint}; use crate::cache::Cached; use crate::config::AuthenticationConfig; -use crate::context::RequestMonitoring; +use crate::context::RequestContext; use crate::control_plane::client::ControlPlaneClient; use crate::control_plane::errors::GetAuthInfoError; use crate::control_plane::{ @@ -30,7 +29,7 @@ use crate::intern::EndpointIdInt; use crate::metrics::Metrics; use crate::proxy::connect_compute::ComputeConnectBackend; use crate::proxy::NeonOptions; -use crate::rate_limiter::{BucketRateLimiter, EndpointRateLimiter, RateBucketInfo}; +use crate::rate_limiter::{BucketRateLimiter, EndpointRateLimiter}; use crate::stream::Stream; use crate::types::{EndpointCacheKey, EndpointId, RoleName}; use crate::{scram, stream}; @@ -192,25 +191,10 @@ impl MaskedIp { // This can't be just per IP because that would limit some PaaS that share IP addresses pub type AuthRateLimiter = BucketRateLimiter<(EndpointIdInt, MaskedIp)>; -impl RateBucketInfo { - /// All of these are per endpoint-maskedip pair. 
- /// Context: 4096 rounds of pbkdf2 take about 1ms of cpu time to execute (1 milli-cpu-second or 1mcpus). - /// - /// First bucket: 1000mcpus total per endpoint-ip pair - /// * 4096000 requests per second with 1 hash rounds. - /// * 1000 requests per second with 4096 hash rounds. - /// * 6.8 requests per second with 600000 hash rounds. - pub const DEFAULT_AUTH_SET: [Self; 3] = [ - Self::new(1000 * 4096, Duration::from_secs(1)), - Self::new(600 * 4096, Duration::from_secs(60)), - Self::new(300 * 4096, Duration::from_secs(600)), - ]; -} - impl AuthenticationConfig { pub(crate) fn check_rate_limit( &self, - ctx: &RequestMonitoring, + ctx: &RequestContext, secret: AuthSecret, endpoint: &EndpointId, is_cleartext: bool, @@ -265,7 +249,7 @@ impl AuthenticationConfig { /// /// All authentication flows will emit an AuthenticationOk message if successful. async fn auth_quirks( - ctx: &RequestMonitoring, + ctx: &RequestContext, api: &impl control_plane::ControlPlaneApi, user_info: ComputeUserInfoMaybeEndpoint, client: &mut stream::PqStream>, @@ -286,7 +270,7 @@ async fn auth_quirks( Ok(info) => (info, None), }; - info!("fetching user's authentication info"); + debug!("fetching user's authentication info"); let (allowed_ips, maybe_secret) = api.get_allowed_ips_and_secret(ctx, &info).await?; // check allowed list @@ -343,7 +327,7 @@ async fn auth_quirks( } async fn authenticate_with_secret( - ctx: &RequestMonitoring, + ctx: &RequestContext, secret: AuthSecret, info: ComputeUserInfo, client: &mut stream::PqStream>, @@ -396,7 +380,7 @@ impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint> { #[tracing::instrument(fields(allow_cleartext = allow_cleartext), skip_all)] pub(crate) async fn authenticate( self, - ctx: &RequestMonitoring, + ctx: &RequestContext, client: &mut stream::PqStream>, allow_cleartext: bool, config: &'static AuthenticationConfig, @@ -404,7 +388,7 @@ impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint> { ) -> auth::Result> { let res = match self { 
Self::ControlPlane(api, user_info) => { - info!( + debug!( user = &*user_info.user, project = user_info.endpoint(), "performing authentication using the console" @@ -427,6 +411,7 @@ impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint> { } }; + // TODO: replace with some metric info!("user successfully authenticated"); Ok(res) } @@ -435,7 +420,7 @@ impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint> { impl Backend<'_, ComputeUserInfo> { pub(crate) async fn get_role_secret( &self, - ctx: &RequestMonitoring, + ctx: &RequestContext, ) -> Result { match self { Self::ControlPlane(api, user_info) => api.get_role_secret(ctx, user_info).await, @@ -445,7 +430,7 @@ impl Backend<'_, ComputeUserInfo> { pub(crate) async fn get_allowed_ips_and_secret( &self, - ctx: &RequestMonitoring, + ctx: &RequestContext, ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { match self { Self::ControlPlane(api, user_info) => { @@ -460,7 +445,7 @@ impl Backend<'_, ComputeUserInfo> { impl ComputeConnectBackend for Backend<'_, ComputeCredentials> { async fn wake_compute( &self, - ctx: &RequestMonitoring, + ctx: &RequestContext, ) -> Result { match self { Self::ControlPlane(api, creds) => api.wake_compute(ctx, &creds.info).await, @@ -496,7 +481,7 @@ mod tests { use crate::auth::backend::MaskedIp; use crate::auth::{ComputeUserInfoMaybeEndpoint, IpPattern}; use crate::config::AuthenticationConfig; - use crate::context::RequestMonitoring; + use crate::context::RequestContext; use crate::control_plane::{self, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret}; use crate::proxy::NeonOptions; use crate::rate_limiter::{EndpointRateLimiter, RateBucketInfo}; @@ -512,7 +497,7 @@ mod tests { impl control_plane::ControlPlaneApi for Auth { async fn get_role_secret( &self, - _ctx: &RequestMonitoring, + _ctx: &RequestContext, _user_info: &super::ComputeUserInfo, ) -> Result { Ok(CachedRoleSecret::new_uncached(Some(self.secret.clone()))) @@ -520,7 +505,7 @@ mod tests { async fn get_allowed_ips_and_secret( 
&self, - _ctx: &RequestMonitoring, + _ctx: &RequestContext, _user_info: &super::ComputeUserInfo, ) -> Result< (CachedAllowedIps, Option), @@ -534,7 +519,7 @@ mod tests { async fn get_endpoint_jwks( &self, - _ctx: &RequestMonitoring, + _ctx: &RequestContext, _endpoint: crate::types::EndpointId, ) -> Result, control_plane::errors::GetEndpointJwksError> { @@ -543,7 +528,7 @@ mod tests { async fn wake_compute( &self, - _ctx: &RequestMonitoring, + _ctx: &RequestContext, _user_info: &super::ComputeUserInfo, ) -> Result { unimplemented!() @@ -622,7 +607,7 @@ mod tests { let (mut client, server) = tokio::io::duplex(1024); let mut stream = PqStream::new(Stream::from_raw(server)); - let ctx = RequestMonitoring::test(); + let ctx = RequestContext::test(); let api = Auth { ips: vec![], secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()), @@ -699,7 +684,7 @@ mod tests { let (mut client, server) = tokio::io::duplex(1024); let mut stream = PqStream::new(Stream::from_raw(server)); - let ctx = RequestMonitoring::test(); + let ctx = RequestContext::test(); let api = Auth { ips: vec![], secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()), @@ -751,7 +736,7 @@ mod tests { let (mut client, server) = tokio::io::duplex(1024); let mut stream = PqStream::new(Stream::from_raw(server)); - let ctx = RequestMonitoring::test(); + let ctx = RequestContext::test(); let api = Auth { ips: vec![], secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()), diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index ddecae6af5..f6bce9f2d8 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -7,10 +7,10 @@ use std::str::FromStr; use itertools::Itertools; use pq_proto::StartupMessageParams; use thiserror::Error; -use tracing::{info, warn}; +use tracing::{debug, warn}; use crate::auth::password_hack::parse_endpoint_param; -use crate::context::RequestMonitoring; +use 
crate::context::RequestContext; use crate::error::{ReportableError, UserFacingError}; use crate::metrics::{Metrics, SniKind}; use crate::proxy::NeonOptions; @@ -86,7 +86,7 @@ pub(crate) fn endpoint_sni( impl ComputeUserInfoMaybeEndpoint { pub(crate) fn parse( - ctx: &RequestMonitoring, + ctx: &RequestContext, params: &StartupMessageParams, sni: Option<&str>, common_names: Option<&HashSet>, @@ -147,22 +147,22 @@ impl ComputeUserInfoMaybeEndpoint { } let metrics = Metrics::get(); - info!(%user, "credentials"); + debug!(%user, "credentials"); if sni.is_some() { - info!("Connection with sni"); + debug!("Connection with sni"); metrics.proxy.accepted_connections_by_sni.inc(SniKind::Sni); } else if endpoint.is_some() { metrics .proxy .accepted_connections_by_sni .inc(SniKind::NoSni); - info!("Connection without sni"); + debug!("Connection without sni"); } else { metrics .proxy .accepted_connections_by_sni .inc(SniKind::PasswordHack); - info!("Connection with password hack"); + debug!("Connection with password hack"); } let options = NeonOptions::parse_params(params); @@ -260,7 +260,7 @@ mod tests { fn parse_bare_minimum() -> anyhow::Result<()> { // According to postgresql, only `user` should be required. 
let options = StartupMessageParams::new([("user", "john_doe")]); - let ctx = RequestMonitoring::test(); + let ctx = RequestContext::test(); let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?; assert_eq!(user_info.user, "john_doe"); assert_eq!(user_info.endpoint_id, None); @@ -275,7 +275,7 @@ mod tests { ("database", "world"), // should be ignored ("foo", "bar"), // should be ignored ]); - let ctx = RequestMonitoring::test(); + let ctx = RequestContext::test(); let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?; assert_eq!(user_info.user, "john_doe"); assert_eq!(user_info.endpoint_id, None); @@ -290,7 +290,7 @@ mod tests { let sni = Some("foo.localhost"); let common_names = Some(["localhost".into()].into()); - let ctx = RequestMonitoring::test(); + let ctx = RequestContext::test(); let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?; assert_eq!(user_info.user, "john_doe"); @@ -307,7 +307,7 @@ mod tests { ("options", "-ckey=1 project=bar -c geqo=off"), ]); - let ctx = RequestMonitoring::test(); + let ctx = RequestContext::test(); let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?; assert_eq!(user_info.user, "john_doe"); assert_eq!(user_info.endpoint_id.as_deref(), Some("bar")); @@ -322,7 +322,7 @@ mod tests { ("options", "-ckey=1 endpoint=bar -c geqo=off"), ]); - let ctx = RequestMonitoring::test(); + let ctx = RequestContext::test(); let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?; assert_eq!(user_info.user, "john_doe"); assert_eq!(user_info.endpoint_id.as_deref(), Some("bar")); @@ -340,7 +340,7 @@ mod tests { ), ]); - let ctx = RequestMonitoring::test(); + let ctx = RequestContext::test(); let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?; assert_eq!(user_info.user, "john_doe"); assert!(user_info.endpoint_id.is_none()); @@ -355,7 +355,7 @@ mod tests { ("options", 
"-ckey=1 endpoint=bar project=foo -c geqo=off"), ]); - let ctx = RequestMonitoring::test(); + let ctx = RequestContext::test(); let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?; assert_eq!(user_info.user, "john_doe"); assert!(user_info.endpoint_id.is_none()); @@ -370,7 +370,7 @@ mod tests { let sni = Some("baz.localhost"); let common_names = Some(["localhost".into()].into()); - let ctx = RequestMonitoring::test(); + let ctx = RequestContext::test(); let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?; assert_eq!(user_info.user, "john_doe"); @@ -385,14 +385,14 @@ mod tests { let common_names = Some(["a.com".into(), "b.com".into()].into()); let sni = Some("p1.a.com"); - let ctx = RequestMonitoring::test(); + let ctx = RequestContext::test(); let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?; assert_eq!(user_info.endpoint_id.as_deref(), Some("p1")); let common_names = Some(["a.com".into(), "b.com".into()].into()); let sni = Some("p1.b.com"); - let ctx = RequestMonitoring::test(); + let ctx = RequestContext::test(); let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?; assert_eq!(user_info.endpoint_id.as_deref(), Some("p1")); @@ -408,7 +408,7 @@ mod tests { let sni = Some("second.localhost"); let common_names = Some(["localhost".into()].into()); - let ctx = RequestMonitoring::test(); + let ctx = RequestContext::test(); let err = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref()) .expect_err("should fail"); match err { @@ -427,7 +427,7 @@ mod tests { let sni = Some("project.localhost"); let common_names = Some(["example.com".into()].into()); - let ctx = RequestMonitoring::test(); + let ctx = RequestContext::test(); let err = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref()) .expect_err("should fail"); match err { @@ -447,7 +447,7 @@ mod tests { 
let sni = Some("project.localhost"); let common_names = Some(["localhost".into()].into()); - let ctx = RequestMonitoring::test(); + let ctx = RequestContext::test(); let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?; assert_eq!(user_info.endpoint_id.as_deref(), Some("project")); diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs index 6294549ff6..9c6ce151cb 100644 --- a/proxy/src/auth/flow.rs +++ b/proxy/src/auth/flow.rs @@ -11,7 +11,7 @@ use tracing::info; use super::backend::ComputeCredentialKeys; use super::{AuthError, PasswordHackPayload}; use crate::config::TlsServerEndPoint; -use crate::context::RequestMonitoring; +use crate::context::RequestContext; use crate::control_plane::AuthSecret; use crate::intern::EndpointIdInt; use crate::sasl; @@ -32,7 +32,7 @@ pub(crate) struct Begin; /// Use [SCRAM](crate::scram)-based auth in [`AuthFlow`]. pub(crate) struct Scram<'a>( pub(crate) &'a scram::ServerSecret, - pub(crate) &'a RequestMonitoring, + pub(crate) &'a RequestContext, ); impl AuthMethod for Scram<'_> { @@ -178,6 +178,8 @@ impl AuthFlow<'_, S, Scram<'_>> { SCRAM_SHA_256_PLUS => ctx.set_auth_method(crate::context::AuthMethod::ScramSha256Plus), _ => {} } + + // TODO: make this a metric instead info!("client chooses {}", sasl.method); let outcome = sasl::SaslStream::new(self.stream, sasl.message) diff --git a/proxy/src/bin/local_proxy.rs b/proxy/src/bin/local_proxy.rs index fbdb1dec15..968682cf0f 100644 --- a/proxy/src/bin/local_proxy.rs +++ b/proxy/src/bin/local_proxy.rs @@ -32,11 +32,12 @@ project_git_version!(GIT_VERSION); project_build_tag!(BUILD_TAG); use clap::Parser; +use thiserror::Error; use tokio::net::TcpListener; use tokio::sync::Notify; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; -use tracing::{error, info, warn}; +use tracing::{debug, error, info, warn}; use utils::sentry_init::init_sentry; use utils::{pid_file, project_build_tag, project_git_version}; @@ -110,7 +111,7 
@@ struct SqlOverHttpArgs { sql_over_http_cancel_set_shards: usize, #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB - sql_over_http_max_request_size_bytes: u64, + sql_over_http_max_request_size_bytes: usize, #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB sql_over_http_max_response_size_bytes: usize, @@ -124,8 +125,9 @@ async fn main() -> anyhow::Result<()> { Metrics::install(Arc::new(ThreadPoolMetrics::new(0))); - info!("Version: {GIT_VERSION}"); - info!("Build_tag: {BUILD_TAG}"); + // TODO: refactor these to use labels + debug!("Version: {GIT_VERSION}"); + debug!("Build_tag: {BUILD_TAG}"); let neon_metrics = ::metrics::NeonMetrics::new(::metrics::BuildInfo { revision: GIT_VERSION, build_tag: BUILD_TAG, @@ -305,26 +307,46 @@ fn build_auth_backend( Ok(Box::leak(Box::new(auth_backend))) } +#[derive(Error, Debug)] +enum RefreshConfigError { + #[error(transparent)] + Read(#[from] std::io::Error), + #[error(transparent)] + Parse(#[from] serde_json::Error), + #[error(transparent)] + Validate(anyhow::Error), +} + async fn refresh_config_loop(path: Utf8PathBuf, rx: Arc) { + let mut init = true; loop { rx.notified().await; match refresh_config_inner(&path).await { Ok(()) => {} + // don't log for file not found errors if this is the first time we are checking + // for computes that don't use local_proxy, this is not an error. 
+ Err(RefreshConfigError::Read(e)) + if init && e.kind() == std::io::ErrorKind::NotFound => + { + debug!(error=?e, ?path, "could not read config file"); + } Err(e) => { error!(error=?e, ?path, "could not read config file"); } } + + init = false; } } -async fn refresh_config_inner(path: &Utf8Path) -> anyhow::Result<()> { +async fn refresh_config_inner(path: &Utf8Path) -> Result<(), RefreshConfigError> { let bytes = tokio::fs::read(&path).await?; let data: LocalProxySpec = serde_json::from_slice(&bytes)?; let mut jwks_set = vec![]; - for jwks in data.jwks.into_iter().flatten() { + fn parse_jwks_settings(jwks: compute_api::spec::JwksSettings) -> anyhow::Result { let mut jwks_url = url::Url::from_str(&jwks.jwks_url).context("parsing JWKS url")?; ensure!( @@ -367,7 +389,7 @@ async fn refresh_config_inner(path: &Utf8Path) -> anyhow::Result<()> { } } - jwks_set.push(JwksSettings { + Ok(JwksSettings { id: jwks.id, jwks_url, provider_name: jwks.provider_name, @@ -381,6 +403,10 @@ async fn refresh_config_inner(path: &Utf8Path) -> anyhow::Result<()> { }) } + for jwks in data.jwks.into_iter().flatten() { + jwks_set.push(parse_jwks_settings(jwks).map_err(RefreshConfigError::Validate)?); + } + info!("successfully loaded new config"); JWKS_ROLE_MAP.store(Some(Arc::new(EndpointJwksResponse { jwks: jwks_set }))); diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index ef5b5e8509..623a0fd3b2 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -11,7 +11,7 @@ use futures::future::Either; use futures::TryFutureExt; use itertools::Itertools; use proxy::config::TlsServerEndPoint; -use proxy::context::RequestMonitoring; +use proxy::context::RequestContext; use proxy::metrics::{Metrics, ThreadPoolMetrics}; use proxy::protocol2::ConnectionInfo; use proxy::proxy::{copy_bidirectional_client_compute, run_until_cancelled, ErrorSource}; @@ -177,7 +177,7 @@ async fn task_main( .context("failed to set socket option")?; info!(%peer_addr, 
"serving"); - let ctx = RequestMonitoring::new( + let ctx = RequestContext::new( session_id, ConnectionInfo { addr: peer_addr, @@ -208,7 +208,7 @@ async fn task_main( const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)"; async fn ssl_handshake( - ctx: &RequestMonitoring, + ctx: &RequestContext, raw_stream: S, tls_config: Arc, tls_server_end_point: TlsServerEndPoint, @@ -259,7 +259,7 @@ async fn ssl_handshake( } async fn handle_client( - ctx: RequestMonitoring, + ctx: RequestContext, dest_suffix: Arc, tls_config: Arc, tls_server_end_point: TlsServerEndPoint, diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index fda5b25961..a935378162 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -276,7 +276,7 @@ struct SqlOverHttpArgs { sql_over_http_cancel_set_shards: usize, #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB - sql_over_http_max_request_size_bytes: u64, + sql_over_http_max_request_size_bytes: usize, #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB sql_over_http_max_response_size_bytes: usize, @@ -288,6 +288,7 @@ async fn main() -> anyhow::Result<()> { let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook(); let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); + // TODO: refactor these to use labels info!("Version: {GIT_VERSION}"); info!("Build_tag: {BUILD_TAG}"); let neon_metrics = ::metrics::NeonMetrics::new(::metrics::BuildInfo { @@ -427,8 +428,9 @@ async fn main() -> anyhow::Result<()> { )?))), None => None, }; + let cancellation_handler = Arc::new(CancellationHandler::< - Option>>, + Option>>, >::new( cancel_map.clone(), redis_publisher, diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs index 07769e053c..20db1fbb14 100644 --- a/proxy/src/cache/endpoints.rs +++ b/proxy/src/cache/endpoints.rs @@ -11,7 +11,7 @@ use tokio_util::sync::CancellationToken; use tracing::info; use crate::config::EndpointCacheConfig; 
-use crate::context::RequestMonitoring; +use crate::context::RequestContext; use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt}; use crate::metrics::{Metrics, RedisErrors, RedisEventsCount}; use crate::rate_limiter::GlobalRateLimiter; @@ -75,7 +75,7 @@ impl EndpointsCache { } } - pub(crate) fn is_valid(&self, ctx: &RequestMonitoring, endpoint: &EndpointId) -> bool { + pub(crate) fn is_valid(&self, ctx: &RequestContext, endpoint: &EndpointId) -> bool { if !self.ready.load(Ordering::Acquire) { // the endpoint cache is not yet fully initialised. return true; diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index db0970adcb..4b72a66e63 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -7,19 +7,26 @@ use thiserror::Error; use tokio::net::TcpStream; use tokio::sync::Mutex; use tokio_postgres::{CancelToken, NoTls}; -use tracing::info; +use tracing::{debug, info}; use uuid::Uuid; +use crate::auth::{check_peer_addr_is_in_list, IpPattern}; use crate::error::ReportableError; use crate::metrics::{CancellationRequest, CancellationSource, Metrics}; +use crate::rate_limiter::LeakyBucketRateLimiter; use crate::redis::cancellation_publisher::{ CancellationPublisher, CancellationPublisherMut, RedisPublisherClient, }; +use std::net::IpAddr; + +use ipnet::{IpNet, Ipv4Net, Ipv6Net}; pub type CancelMap = Arc>>; pub type CancellationHandlerMain = CancellationHandler>>>; pub(crate) type CancellationHandlerMainInternal = Option>>; +type IpSubnetKey = IpNet; + /// Enables serving `CancelRequest`s. /// /// If `CancellationPublisher` is available, cancel request will be used to publish the cancellation key to other proxy instances. @@ -29,14 +36,23 @@ pub struct CancellationHandler

{ /// This field used for the monitoring purposes. /// Represents the source of the cancellation request. from: CancellationSource, + // rate limiter of cancellation requests + limiter: Arc>>, } #[derive(Debug, Error)] pub(crate) enum CancelError { #[error("{0}")] IO(#[from] std::io::Error), + #[error("{0}")] Postgres(#[from] tokio_postgres::Error), + + #[error("rate limit exceeded")] + RateLimit, + + #[error("IP is not allowed")] + IpNotAllowed, } impl ReportableError for CancelError { @@ -47,6 +63,8 @@ impl ReportableError for CancelError { crate::error::ErrorKind::Postgres } CancelError::Postgres(_) => crate::error::ErrorKind::Compute, + CancelError::RateLimit => crate::error::ErrorKind::RateLimit, + CancelError::IpNotAllowed => crate::error::ErrorKind::User, } } } @@ -73,19 +91,42 @@ impl CancellationHandler

{ break key; }; - info!("registered new query cancellation key {key}"); + debug!("registered new query cancellation key {key}"); Session { key, cancellation_handler: self, } } + /// Try to cancel a running query for the corresponding connection. /// If the cancellation key is not found, it will be published to Redis. + /// check_allowed - if true, check if the IP is allowed to cancel the query pub(crate) async fn cancel_session( &self, key: CancelKeyData, session_id: Uuid, + peer_addr: &IpAddr, + check_allowed: bool, ) -> Result<(), CancelError> { + // TODO: check for unspecified address is only for backward compatibility, should be removed + if !peer_addr.is_unspecified() { + let subnet_key = match *peer_addr { + IpAddr::V4(ip) => IpNet::V4(Ipv4Net::new_assert(ip, 24).trunc()), // use defaut mask here + IpAddr::V6(ip) => IpNet::V6(Ipv6Net::new_assert(ip, 64).trunc()), + }; + if !self.limiter.lock().unwrap().check(subnet_key, 1) { + tracing::debug!("Rate limit exceeded. Skipping cancellation message"); + Metrics::get() + .proxy + .cancellation_requests_total + .inc(CancellationRequest { + source: self.from, + kind: crate::metrics::CancellationOutcome::RateLimitExceeded, + }); + return Err(CancelError::RateLimit); + } + } + // NB: we should immediately release the lock after cloning the token. let Some(cancel_closure) = self.map.get(&key).and_then(|x| x.clone()) else { tracing::warn!("query cancellation key not found: {key}"); @@ -96,7 +137,13 @@ impl CancellationHandler

{ source: self.from, kind: crate::metrics::CancellationOutcome::NotFound, }); - match self.client.try_publish(key, session_id).await { + + if session_id == Uuid::nil() { + // was already published, do not publish it again + return Ok(()); + } + + match self.client.try_publish(key, session_id, *peer_addr).await { Ok(()) => {} // do nothing Err(e) => { return Err(CancelError::IO(std::io::Error::new( @@ -107,6 +154,13 @@ impl CancellationHandler

{ } return Ok(()); }; + + if check_allowed + && !check_peer_addr_is_in_list(peer_addr, cancel_closure.ip_allowlist.as_slice()) + { + return Err(CancelError::IpNotAllowed); + } + Metrics::get() .proxy .cancellation_requests_total @@ -135,13 +189,29 @@ impl CancellationHandler<()> { map, client: (), from, + limiter: Arc::new(std::sync::Mutex::new( + LeakyBucketRateLimiter::::new_with_shards( + LeakyBucketRateLimiter::::DEFAULT, + 64, + ), + )), } } } impl CancellationHandler>>> { pub fn new(map: CancelMap, client: Option>>, from: CancellationSource) -> Self { - Self { map, client, from } + Self { + map, + client, + from, + limiter: Arc::new(std::sync::Mutex::new( + LeakyBucketRateLimiter::::new_with_shards( + LeakyBucketRateLimiter::::DEFAULT, + 64, + ), + )), + } } } @@ -152,22 +222,31 @@ impl CancellationHandler>>> { pub struct CancelClosure { socket_addr: SocketAddr, cancel_token: CancelToken, + ip_allowlist: Vec, } impl CancelClosure { - pub(crate) fn new(socket_addr: SocketAddr, cancel_token: CancelToken) -> Self { + pub(crate) fn new( + socket_addr: SocketAddr, + cancel_token: CancelToken, + ip_allowlist: Vec, + ) -> Self { Self { socket_addr, cancel_token, + ip_allowlist, } } /// Cancels the query running on user's compute node. pub(crate) async fn try_cancel_query(self) -> Result<(), CancelError> { let socket = TcpStream::connect(self.socket_addr).await?; self.cancel_token.cancel_query_raw(socket, NoTls).await?; - info!("query was cancelled"); + debug!("query was cancelled"); Ok(()) } + pub(crate) fn set_ip_allowlist(&mut self, ip_allowlist: Vec) { + self.ip_allowlist = ip_allowlist; + } } /// Helper for registering query cancellation tokens. @@ -182,7 +261,7 @@ impl

Session

{ /// Store the cancel token for the given session. /// This enables query cancellation in `crate::proxy::prepare_client_connection`. pub(crate) fn enable_query_cancellation(&self, cancel_closure: CancelClosure) -> CancelKeyData { - info!("enabling query cancellation for this session"); + debug!("enabling query cancellation for this session"); self.cancellation_handler .map .insert(self.key, Some(cancel_closure)); @@ -194,7 +273,7 @@ impl

Session

{ impl

Drop for Session

{ fn drop(&mut self) { self.cancellation_handler.map.remove(&self.key); - info!("dropped query cancellation key {}", &self.key); + debug!("dropped query cancellation key {}", &self.key); } } @@ -229,6 +308,8 @@ mod tests { cancel_key: 0, }, Uuid::new_v4(), + &("127.0.0.1".parse().unwrap()), + true, ) .await .unwrap(); diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index ca4a348ed8..8408d4720b 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -14,11 +14,11 @@ use thiserror::Error; use tokio::net::TcpStream; use tokio_postgres::tls::MakeTlsConnect; use tokio_postgres_rustls::MakeRustlsConnect; -use tracing::{error, info, warn}; +use tracing::{debug, error, info, warn}; use crate::auth::parse_endpoint_param; use crate::cancellation::CancelClosure; -use crate::context::RequestMonitoring; +use crate::context::RequestContext; use crate::control_plane::client::ApiLockError; use crate::control_plane::errors::WakeComputeError; use crate::control_plane::messages::MetricsAuxInfo; @@ -213,7 +213,7 @@ impl ConnCfg { }; let connect_once = |host, port| { - info!("trying to connect to compute node at {host}:{port}"); + debug!("trying to connect to compute node at {host}:{port}"); connect_with_timeout(host, port).and_then(|socket| async { let socket_addr = socket.peer_addr()?; // This prevents load balancer from severing the connection. @@ -286,7 +286,7 @@ impl ConnCfg { /// Connect to a corresponding compute node. pub(crate) async fn connect( &self, - ctx: &RequestMonitoring, + ctx: &RequestContext, allow_self_signed_compute: bool, aux: MetricsAuxInfo, timeout: Duration, @@ -328,6 +328,7 @@ impl ConnCfg { tracing::Span::current().record("pid", tracing::field::display(client.get_process_id())); let stream = connection.stream.into_inner(); + // TODO: lots of useful info but maybe we can move it elsewhere (eg traces?) 
info!( cold_start_info = ctx.cold_start_info().as_str(), "connected to compute node at {host} ({socket_addr}) sslmode={:?}", @@ -341,7 +342,7 @@ impl ConnCfg { // NB: CancelToken is supposed to hold socket_addr, but we use connect_raw. // Yet another reason to rework the connection establishing code. - let cancel_closure = CancelClosure::new(socket_addr, client.cancel_token()); + let cancel_closure = CancelClosure::new(socket_addr, client.cancel_token(), vec![]); let connection = PostgresConnection { stream, diff --git a/proxy/src/config.rs b/proxy/src/config.rs index b048c9d389..8bc8e3f96f 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -64,7 +64,7 @@ pub struct HttpConfig { pub pool_options: GlobalConnPoolOptions, pub cancel_set: CancelSet, pub client_conn_threshold: u64, - pub max_request_size_bytes: u64, + pub max_request_size_bytes: usize, pub max_response_size_bytes: usize, } diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs index cc456f3667..fbd0c8e5c5 100644 --- a/proxy/src/console_redirect_proxy.rs +++ b/proxy/src/console_redirect_proxy.rs @@ -8,7 +8,7 @@ use tracing::{debug, error, info, Instrument}; use crate::auth::backend::ConsoleRedirectBackend; use crate::cancellation::{CancellationHandlerMain, CancellationHandlerMainInternal}; use crate::config::{ProxyConfig, ProxyProtocolV2}; -use crate::context::RequestMonitoring; +use crate::context::RequestContext; use crate::error::ReportableError; use crate::metrics::{Metrics, NumClientConnectionsGuard}; use crate::protocol2::{read_proxy_protocol, ConnectHeader, ConnectionInfo}; @@ -82,7 +82,7 @@ pub async fn task_main( } }; - let ctx = RequestMonitoring::new( + let ctx = RequestContext::new( session_id, peer_addr, crate::metrics::Protocol::Tcp, @@ -141,12 +141,12 @@ pub async fn task_main( pub(crate) async fn handle_client( config: &'static ProxyConfig, backend: &'static ConsoleRedirectBackend, - ctx: &RequestMonitoring, + ctx: &RequestContext, 
cancellation_handler: Arc, stream: S, conn_gauge: NumClientConnectionsGuard<'static>, ) -> Result>, ClientRequestError> { - info!( + debug!( protocol = %ctx.protocol(), "handling interactive connection from client" ); @@ -156,16 +156,21 @@ pub(crate) async fn handle_client( let request_gauge = metrics.connection_requests.guard(proto); let tls = config.tls_config.as_ref(); - let record_handshake_error = !ctx.has_private_peer_addr(); let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Client); let do_handshake = handshake(ctx, stream, tls, record_handshake_error); + let (mut stream, params) = match tokio::time::timeout(config.handshake_timeout, do_handshake).await?? { HandshakeData::Startup(stream, params) => (stream, params), HandshakeData::Cancel(cancel_key_data) => { return Ok(cancellation_handler - .cancel_session(cancel_key_data, ctx.session_id()) + .cancel_session( + cancel_key_data, + ctx.session_id(), + &ctx.peer_addr(), + config.authentication_config.ip_allowlist_check_enabled, + ) .await .map(|()| None)?) 
} @@ -174,7 +179,7 @@ pub(crate) async fn handle_client( ctx.set_db_options(params.clone()); - let user_info = match backend + let (user_info, ip_allowlist) = match backend .authenticate(ctx, &config.authentication_config, &mut stream) .await { @@ -198,6 +203,8 @@ pub(crate) async fn handle_client( .or_else(|e| stream.throw_error(e)) .await?; + node.cancel_closure + .set_ip_allowlist(ip_allowlist.unwrap_or_default()); let session = cancellation_handler.get_session(); prepare_client_connection(&node, &session, &mut stream).await?; diff --git a/proxy/src/context/mod.rs b/proxy/src/context/mod.rs index 6cf99c0c97..6d2d2d51ce 100644 --- a/proxy/src/context/mod.rs +++ b/proxy/src/context/mod.rs @@ -8,7 +8,7 @@ use pq_proto::StartupMessageParams; use smol_str::SmolStr; use tokio::sync::mpsc; use tracing::field::display; -use tracing::{debug, info, info_span, Span}; +use tracing::{debug, info_span, Span}; use try_lock::TryLock; use uuid::Uuid; @@ -32,15 +32,15 @@ pub(crate) static LOG_CHAN_DISCONNECT: OnceCell, + TryLock, ); -struct RequestMonitoringInner { +struct RequestContextInner { pub(crate) conn_info: ConnectionInfo, pub(crate) session_id: Uuid, pub(crate) protocol: Protocol, @@ -81,10 +81,10 @@ pub(crate) enum AuthMethod { Cleartext, } -impl Clone for RequestMonitoring { +impl Clone for RequestContext { fn clone(&self) -> Self { let inner = self.0.try_lock().expect("should not deadlock"); - let new = RequestMonitoringInner { + let new = RequestContextInner { conn_info: inner.conn_info.clone(), session_id: inner.session_id, protocol: inner.protocol, @@ -115,13 +115,14 @@ impl Clone for RequestMonitoring { } } -impl RequestMonitoring { +impl RequestContext { pub fn new( session_id: Uuid, conn_info: ConnectionInfo, protocol: Protocol, region: &'static str, ) -> Self { + // TODO: be careful with long lived spans let span = info_span!( "connect_request", %protocol, @@ -131,7 +132,7 @@ impl RequestMonitoring { role = tracing::field::Empty, ); - let inner = 
RequestMonitoringInner { + let inner = RequestContextInner { conn_info, session_id, protocol, @@ -167,7 +168,7 @@ impl RequestMonitoring { let ip = IpAddr::from([127, 0, 0, 1]); let addr = SocketAddr::new(ip, 5432); let conn_info = ConnectionInfo { addr, extra: None }; - RequestMonitoring::new(Uuid::now_v7(), conn_info, Protocol::Tcp, "test") + RequestContext::new(Uuid::now_v7(), conn_info, Protocol::Tcp, "test") } pub(crate) fn console_application_name(&self) -> String { @@ -324,7 +325,7 @@ impl RequestMonitoring { } pub(crate) struct LatencyTimerPause<'a> { - ctx: &'a RequestMonitoring, + ctx: &'a RequestContext, start: tokio::time::Instant, waiting_for: Waiting, } @@ -340,7 +341,7 @@ impl Drop for LatencyTimerPause<'_> { } } -impl RequestMonitoringInner { +impl RequestContextInner { fn set_cold_start_info(&mut self, info: ColdStartInfo) { self.cold_start_info = info; self.latency_timer.cold_start_info(info); @@ -384,6 +385,10 @@ impl RequestMonitoringInner { } else { ConnectOutcome::Failed }; + + // TODO: get rid of entirely/refactor + // check for false positives + // AND false negatives if let Some(rejected) = self.rejected { let ep = self .endpoint_id @@ -391,7 +396,7 @@ impl RequestMonitoringInner { .map(|x| x.as_str()) .unwrap_or_default(); // This makes sense only if cache is disabled - info!( + debug!( ?outcome, ?rejected, ?ep, @@ -425,7 +430,7 @@ impl RequestMonitoringInner { } } -impl Drop for RequestMonitoringInner { +impl Drop for RequestContextInner { fn drop(&mut self) { if self.sender.is_some() { self.log_connect(); diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index 4112de646f..9bf3a275bb 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -20,7 +20,7 @@ use tokio_util::sync::CancellationToken; use tracing::{debug, info, Span}; use utils::backoff; -use super::{RequestMonitoringInner, LOG_CHAN}; +use super::{RequestContextInner, LOG_CHAN}; use crate::config::remote_storage_from_toml; use 
crate::context::LOG_CHAN_DISCONNECT; @@ -117,8 +117,8 @@ impl serde::Serialize for Options<'_> { } } -impl From<&RequestMonitoringInner> for RequestData { - fn from(value: &RequestMonitoringInner) -> Self { +impl From<&RequestContextInner> for RequestData { + fn from(value: &RequestContextInner) -> Self { Self { session_id: value.session_id, peer_addr: value.conn_info.addr.ip().to_string(), diff --git a/proxy/src/control_plane/client/mock.rs b/proxy/src/control_plane/client/mock.rs index fd333d2aac..500acad50f 100644 --- a/proxy/src/control_plane/client/mock.rs +++ b/proxy/src/control_plane/client/mock.rs @@ -13,7 +13,7 @@ use crate::auth::backend::jwt::AuthRule; use crate::auth::backend::ComputeUserInfo; use crate::auth::IpPattern; use crate::cache::Cached; -use crate::context::RequestMonitoring; +use crate::context::RequestContext; use crate::control_plane::client::{CachedAllowedIps, CachedRoleSecret}; use crate::control_plane::errors::{ ControlPlaneError, GetAuthInfoError, GetEndpointJwksError, WakeComputeError, @@ -206,7 +206,7 @@ impl super::ControlPlaneApi for MockControlPlane { #[tracing::instrument(skip_all)] async fn get_role_secret( &self, - _ctx: &RequestMonitoring, + _ctx: &RequestContext, user_info: &ComputeUserInfo, ) -> Result { Ok(CachedRoleSecret::new_uncached( @@ -216,7 +216,7 @@ impl super::ControlPlaneApi for MockControlPlane { async fn get_allowed_ips_and_secret( &self, - _ctx: &RequestMonitoring, + _ctx: &RequestContext, user_info: &ComputeUserInfo, ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { Ok(( @@ -229,7 +229,7 @@ impl super::ControlPlaneApi for MockControlPlane { async fn get_endpoint_jwks( &self, - _ctx: &RequestMonitoring, + _ctx: &RequestContext, endpoint: EndpointId, ) -> Result, GetEndpointJwksError> { self.do_get_endpoint_jwks(endpoint).await @@ -238,7 +238,7 @@ impl super::ControlPlaneApi for MockControlPlane { #[tracing::instrument(skip_all)] async fn wake_compute( &self, - _ctx: &RequestMonitoring, + _ctx: 
&RequestContext, _user_info: &ComputeUserInfo, ) -> Result { self.do_wake_compute().map_ok(Cached::new_uncached).await diff --git a/proxy/src/control_plane/client/mod.rs b/proxy/src/control_plane/client/mod.rs index e388d8a538..f8f74372f0 100644 --- a/proxy/src/control_plane/client/mod.rs +++ b/proxy/src/control_plane/client/mod.rs @@ -8,14 +8,14 @@ use std::time::Duration; use dashmap::DashMap; use tokio::time::Instant; -use tracing::info; +use tracing::{debug, info}; use crate::auth::backend::jwt::{AuthRule, FetchAuthRules, FetchAuthRulesError}; use crate::auth::backend::ComputeUserInfo; use crate::cache::endpoints::EndpointsCache; use crate::cache::project_info::ProjectInfoCacheImpl; use crate::config::{CacheOptions, EndpointCacheConfig, ProjectInfoCacheOptions}; -use crate::context::RequestMonitoring; +use crate::context::RequestContext; use crate::control_plane::{ errors, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, ControlPlaneApi, NodeInfoCache, }; @@ -41,7 +41,7 @@ pub enum ControlPlaneClient { impl ControlPlaneApi for ControlPlaneClient { async fn get_role_secret( &self, - ctx: &RequestMonitoring, + ctx: &RequestContext, user_info: &ComputeUserInfo, ) -> Result { match self { @@ -57,7 +57,7 @@ impl ControlPlaneApi for ControlPlaneClient { async fn get_allowed_ips_and_secret( &self, - ctx: &RequestMonitoring, + ctx: &RequestContext, user_info: &ComputeUserInfo, ) -> Result<(CachedAllowedIps, Option), errors::GetAuthInfoError> { match self { @@ -71,7 +71,7 @@ impl ControlPlaneApi for ControlPlaneClient { async fn get_endpoint_jwks( &self, - ctx: &RequestMonitoring, + ctx: &RequestContext, endpoint: EndpointId, ) -> Result, errors::GetEndpointJwksError> { match self { @@ -85,7 +85,7 @@ impl ControlPlaneApi for ControlPlaneClient { async fn wake_compute( &self, - ctx: &RequestMonitoring, + ctx: &RequestContext, user_info: &ComputeUserInfo, ) -> Result { match self { @@ -214,7 +214,7 @@ impl ApiLocks { self.metrics .semaphore_acquire_seconds 
.observe(now.elapsed().as_secs_f64()); - info!("acquired permit {:?}", now.elapsed().as_secs_f64()); + debug!("acquired permit {:?}", now.elapsed().as_secs_f64()); Ok(WakeComputePermit { permit: permit? }) } @@ -271,7 +271,7 @@ impl WakeComputePermit { impl FetchAuthRules for ControlPlaneClient { async fn fetch_auth_rules( &self, - ctx: &RequestMonitoring, + ctx: &RequestContext, endpoint: EndpointId, ) -> Result, FetchAuthRulesError> { self.get_endpoint_jwks(ctx, endpoint) diff --git a/proxy/src/control_plane/client/neon.rs b/proxy/src/control_plane/client/neon.rs index 26ff4e1402..757ea6720a 100644 --- a/proxy/src/control_plane/client/neon.rs +++ b/proxy/src/control_plane/client/neon.rs @@ -14,7 +14,7 @@ use super::super::messages::{ControlPlaneErrorMessage, GetRoleSecret, WakeComput use crate::auth::backend::jwt::AuthRule; use crate::auth::backend::ComputeUserInfo; use crate::cache::Cached; -use crate::context::RequestMonitoring; +use crate::context::RequestContext; use crate::control_plane::caches::ApiCaches; use crate::control_plane::errors::{ ControlPlaneError, GetAuthInfoError, GetEndpointJwksError, WakeComputeError, @@ -65,7 +65,7 @@ impl NeonControlPlaneClient { async fn do_get_auth_info( &self, - ctx: &RequestMonitoring, + ctx: &RequestContext, user_info: &ComputeUserInfo, ) -> Result { if !self @@ -73,6 +73,8 @@ impl NeonControlPlaneClient { .endpoints_cache .is_valid(ctx, &user_info.endpoint.normalize()) { + // TODO: refactor this because it's weird + // this is a failure to authenticate but we return Ok. 
info!("endpoint is not valid, skipping the request"); return Ok(AuthInfo::default()); } @@ -92,7 +94,7 @@ impl NeonControlPlaneClient { ]) .build()?; - info!(url = request.url().as_str(), "sending http request"); + debug!(url = request.url().as_str(), "sending http request"); let start = Instant::now(); let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane); let response = self.endpoint.execute(request).await?; @@ -104,10 +106,12 @@ impl NeonControlPlaneClient { // TODO(anna): retry Err(e) => { return if e.get_reason().is_not_found() { + // TODO: refactor this because it's weird + // this is a failure to authenticate but we return Ok. Ok(AuthInfo::default()) } else { Err(e.into()) - } + }; } }; @@ -137,7 +141,7 @@ impl NeonControlPlaneClient { async fn do_get_endpoint_jwks( &self, - ctx: &RequestMonitoring, + ctx: &RequestContext, endpoint: EndpointId, ) -> Result, GetEndpointJwksError> { if !self @@ -163,7 +167,7 @@ impl NeonControlPlaneClient { .build() .map_err(GetEndpointJwksError::RequestBuild)?; - info!(url = request.url().as_str(), "sending http request"); + debug!(url = request.url().as_str(), "sending http request"); let start = Instant::now(); let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane); let response = self @@ -196,7 +200,7 @@ impl NeonControlPlaneClient { async fn do_wake_compute( &self, - ctx: &RequestMonitoring, + ctx: &RequestContext, user_info: &ComputeUserInfo, ) -> Result { let request_id = ctx.session_id().to_string(); @@ -220,7 +224,7 @@ impl NeonControlPlaneClient { let request = request_builder.build()?; - info!(url = request.url().as_str(), "sending http request"); + debug!(url = request.url().as_str(), "sending http request"); let start = Instant::now(); let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane); let response = self.endpoint.execute(request).await?; @@ -249,6 +253,7 @@ impl NeonControlPlaneClient { Ok(node) } .map_err(crate::error::log_error) + // TODO: redo this span stuff 
.instrument(info_span!("http", id = request_id)) .await } @@ -258,7 +263,7 @@ impl super::ControlPlaneApi for NeonControlPlaneClient { #[tracing::instrument(skip_all)] async fn get_role_secret( &self, - ctx: &RequestMonitoring, + ctx: &RequestContext, user_info: &ComputeUserInfo, ) -> Result { let normalized_ep = &user_info.endpoint.normalize(); @@ -292,7 +297,7 @@ impl super::ControlPlaneApi for NeonControlPlaneClient { async fn get_allowed_ips_and_secret( &self, - ctx: &RequestMonitoring, + ctx: &RequestContext, user_info: &ComputeUserInfo, ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { let normalized_ep = &user_info.endpoint.normalize(); @@ -334,7 +339,7 @@ impl super::ControlPlaneApi for NeonControlPlaneClient { #[tracing::instrument(skip_all)] async fn get_endpoint_jwks( &self, - ctx: &RequestMonitoring, + ctx: &RequestContext, endpoint: EndpointId, ) -> Result, GetEndpointJwksError> { self.do_get_endpoint_jwks(ctx, endpoint).await @@ -343,7 +348,7 @@ impl super::ControlPlaneApi for NeonControlPlaneClient { #[tracing::instrument(skip_all)] async fn wake_compute( &self, - ctx: &RequestMonitoring, + ctx: &RequestContext, user_info: &ComputeUserInfo, ) -> Result { let key = user_info.endpoint_cache_key(); @@ -375,6 +380,7 @@ impl super::ControlPlaneApi for NeonControlPlaneClient { // after getting back a permit - it's possible the cache was filled // double check if permit.should_check_cache() { + // TODO: if there is something in the cache, mark the permit as success. 
check_cache!(); } diff --git a/proxy/src/control_plane/mod.rs b/proxy/src/control_plane/mod.rs index 70607ac0d0..41972e4e44 100644 --- a/proxy/src/control_plane/mod.rs +++ b/proxy/src/control_plane/mod.rs @@ -17,7 +17,7 @@ use crate::auth::backend::{ComputeCredentialKeys, ComputeUserInfo}; use crate::auth::IpPattern; use crate::cache::project_info::ProjectInfoCacheImpl; use crate::cache::{Cached, TimedLru}; -use crate::context::RequestMonitoring; +use crate::context::RequestContext; use crate::control_plane::messages::{ControlPlaneErrorMessage, MetricsAuxInfo}; use crate::intern::ProjectIdInt; use crate::types::{EndpointCacheKey, EndpointId}; @@ -75,7 +75,7 @@ pub(crate) struct NodeInfo { impl NodeInfo { pub(crate) async fn connect( &self, - ctx: &RequestMonitoring, + ctx: &RequestContext, timeout: Duration, ) -> Result { self.config @@ -116,26 +116,26 @@ pub(crate) trait ControlPlaneApi { /// We still have to mock the scram to avoid leaking information that user doesn't exist. async fn get_role_secret( &self, - ctx: &RequestMonitoring, + ctx: &RequestContext, user_info: &ComputeUserInfo, ) -> Result; async fn get_allowed_ips_and_secret( &self, - ctx: &RequestMonitoring, + ctx: &RequestContext, user_info: &ComputeUserInfo, ) -> Result<(CachedAllowedIps, Option), errors::GetAuthInfoError>; async fn get_endpoint_jwks( &self, - ctx: &RequestMonitoring, + ctx: &RequestContext, endpoint: EndpointId, ) -> Result, errors::GetEndpointJwksError>; /// Wake up the compute node and return the corresponding connection info. 
async fn wake_compute( &self, - ctx: &RequestMonitoring, + ctx: &RequestContext, user_info: &ComputeUserInfo, ) -> Result; } diff --git a/proxy/src/http/mod.rs b/proxy/src/http/mod.rs index b1642cedb3..ed88c77256 100644 --- a/proxy/src/http/mod.rs +++ b/proxy/src/http/mod.rs @@ -122,18 +122,18 @@ impl Endpoint { } #[derive(Error, Debug)] -pub(crate) enum ReadBodyError { +pub(crate) enum ReadBodyError { #[error("Content length exceeds limit of {limit} bytes")] BodyTooLarge { limit: usize }, #[error(transparent)] - Read(#[from] reqwest::Error), + Read(#[from] E), } -pub(crate) async fn read_body_with_limit( - mut b: impl Body + Unpin, +pub(crate) async fn read_body_with_limit( + mut b: impl Body + Unpin, limit: usize, -) -> Result, ReadBodyError> { +) -> Result, ReadBodyError> { // We could use `b.limited().collect().await.to_bytes()` here // but this ends up being slightly more efficient as far as I can tell. diff --git a/proxy/src/jemalloc.rs b/proxy/src/jemalloc.rs index 0fae78b60c..9888458ee2 100644 --- a/proxy/src/jemalloc.rs +++ b/proxy/src/jemalloc.rs @@ -38,7 +38,7 @@ where impl MetricRecorder { pub fn new() -> Result { - tracing::info!( + tracing::debug!( config = config::malloc_conf::read()?, version = version::read()?, "starting jemalloc recorder" diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index f91fcd4120..659c57c865 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -351,6 +351,7 @@ pub enum CancellationSource { pub enum CancellationOutcome { NotFound, Found, + RateLimitExceeded, } #[derive(LabelGroup)] diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index 659b7afa68..2e759b0894 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -7,7 +7,7 @@ use super::retry::ShouldRetryWakeCompute; use crate::auth::backend::ComputeCredentialKeys; use crate::compute::{self, PostgresConnection, COULD_NOT_CONNECT}; use crate::config::RetryConfig; -use 
crate::context::RequestMonitoring; +use crate::context::RequestContext; use crate::control_plane::errors::WakeComputeError; use crate::control_plane::locks::ApiLocks; use crate::control_plane::{self, CachedNodeInfo, NodeInfo}; @@ -47,7 +47,7 @@ pub(crate) trait ConnectMechanism { type Error: From; async fn connect_once( &self, - ctx: &RequestMonitoring, + ctx: &RequestContext, node_info: &control_plane::CachedNodeInfo, timeout: time::Duration, ) -> Result; @@ -59,7 +59,7 @@ pub(crate) trait ConnectMechanism { pub(crate) trait ComputeConnectBackend { async fn wake_compute( &self, - ctx: &RequestMonitoring, + ctx: &RequestContext, ) -> Result; fn get_keys(&self) -> &ComputeCredentialKeys; @@ -82,7 +82,7 @@ impl ConnectMechanism for TcpMechanism<'_> { #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)] async fn connect_once( &self, - ctx: &RequestMonitoring, + ctx: &RequestContext, node_info: &control_plane::CachedNodeInfo, timeout: time::Duration, ) -> Result { @@ -99,7 +99,7 @@ impl ConnectMechanism for TcpMechanism<'_> { /// Try to connect to the compute node, retrying if necessary. 
#[tracing::instrument(skip_all)] pub(crate) async fn connect_to_compute( - ctx: &RequestMonitoring, + ctx: &RequestContext, mechanism: &M, user_info: &B, allow_self_signed_compute: bool, @@ -117,7 +117,6 @@ where node_info.set_keys(user_info.get_keys()); node_info.allow_self_signed_compute = allow_self_signed_compute; mechanism.update_connect_config(&mut node_info.config); - let retry_type = RetryType::ConnectToCompute; // try once let err = match mechanism @@ -129,7 +128,7 @@ where Metrics::get().proxy.retries_metric.observe( RetriesMetricGroup { outcome: ConnectOutcome::Success, - retry_type, + retry_type: RetryType::ConnectToCompute, }, num_retries.into(), ); @@ -147,7 +146,7 @@ where Metrics::get().proxy.retries_metric.observe( RetriesMetricGroup { outcome: ConnectOutcome::Failed, - retry_type, + retry_type: RetryType::ConnectToCompute, }, num_retries.into(), ); @@ -156,8 +155,9 @@ where node_info } else { // if we failed to connect, it's likely that the compute node was suspended, wake a new compute node - info!("compute node's state has likely changed; requesting a wake-up"); + debug!("compute node's state has likely changed; requesting a wake-up"); let old_node_info = invalidate_cache(node_info); + // TODO: increment num_retries? let mut node_info = wake_compute(&mut num_retries, ctx, user_info, wake_compute_retry_config).await?; node_info.reuse_settings(old_node_info); @@ -169,7 +169,7 @@ where // now that we have a new node, try connect to it repeatedly. // this can error for a few reasons, for instance: // * DNS connection settings haven't quite propagated yet - info!("wake_compute success. attempting to connect"); + debug!("wake_compute success. attempting to connect"); num_retries = 1; loop { match mechanism @@ -181,10 +181,11 @@ where Metrics::get().proxy.retries_metric.observe( RetriesMetricGroup { outcome: ConnectOutcome::Success, - retry_type, + retry_type: RetryType::ConnectToCompute, }, num_retries.into(), ); + // TODO: is this necessary? 
We have a metric. info!(?num_retries, "connected to compute node after"); return Ok(res); } @@ -194,7 +195,7 @@ where Metrics::get().proxy.retries_metric.observe( RetriesMetricGroup { outcome: ConnectOutcome::Failed, - retry_type, + retry_type: RetryType::ConnectToCompute, }, num_retries.into(), ); diff --git a/proxy/src/proxy/copy_bidirectional.rs b/proxy/src/proxy/copy_bidirectional.rs index 91a3ceff75..4e4af88634 100644 --- a/proxy/src/proxy/copy_bidirectional.rs +++ b/proxy/src/proxy/copy_bidirectional.rs @@ -87,6 +87,8 @@ where transfer_one_direction(cx, &mut compute_to_client, compute, client) .map_err(ErrorSource::from_compute)?; + // TODO: 1 info log, with a enum label for close direction. + // Early termination checks from compute to client. if let TransferState::Done(_) = compute_to_client { if let TransferState::Running(buf) = &client_to_compute { diff --git a/proxy/src/proxy/handshake.rs b/proxy/src/proxy/handshake.rs index a67f1b8112..e27c211932 100644 --- a/proxy/src/proxy/handshake.rs +++ b/proxy/src/proxy/handshake.rs @@ -5,11 +5,11 @@ use pq_proto::{ }; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; -use tracing::{info, warn}; +use tracing::{debug, info, warn}; use crate::auth::endpoint_sni; use crate::config::{TlsConfig, PG_ALPN_PROTOCOL}; -use crate::context::RequestMonitoring; +use crate::context::RequestContext; use crate::error::ReportableError; use crate::metrics::Metrics; use crate::proxy::ERR_INSECURE_CONNECTION; @@ -66,7 +66,7 @@ pub(crate) enum HandshakeData { /// we also take an extra care of propagating only the select handshake errors to client. #[tracing::instrument(skip_all)] pub(crate) async fn handshake( - ctx: &RequestMonitoring, + ctx: &RequestContext, stream: S, mut tls: Option<&TlsConfig>, record_handshake_error: bool, @@ -199,6 +199,8 @@ pub(crate) async fn handshake( .await?; } + // This log highlights the start of the connection. 
+ // This contains useful information for debugging, not logged elsewhere, like role name and endpoint id. info!( ?version, ?params, @@ -211,7 +213,7 @@ pub(crate) async fn handshake( FeStartupPacket::StartupMessage { params, version } if version.major() == 3 && version > PG_PROTOCOL_LATEST => { - warn!(?version, "unsupported minor version"); + debug!(?version, "unsupported minor version"); // no protocol extensions are supported. // @@ -233,14 +235,16 @@ pub(crate) async fn handshake( info!( ?version, + ?params, session_type = "normal", "successful handshake; unsupported minor version requested" ); break Ok(HandshakeData::Startup(stream, params)); } - FeStartupPacket::StartupMessage { version, .. } => { + FeStartupPacket::StartupMessage { version, params } => { warn!( ?version, + ?params, session_type = "normal", "unsuccessful handshake; unsupported version" ); diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index 17721c23d5..5d9468d89a 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -25,7 +25,7 @@ use self::connect_compute::{connect_to_compute, TcpMechanism}; use self::passthrough::ProxyPassthrough; use crate::cancellation::{self, CancellationHandlerMain, CancellationHandlerMainInternal}; use crate::config::{ProxyConfig, ProxyProtocolV2, TlsConfig}; -use crate::context::RequestMonitoring; +use crate::context::RequestContext; use crate::error::ReportableError; use crate::metrics::{Metrics, NumClientConnectionsGuard}; use crate::protocol2::{read_proxy_protocol, ConnectHeader, ConnectionInfo}; @@ -117,7 +117,7 @@ pub async fn task_main( } }; - let ctx = RequestMonitoring::new( + let ctx = RequestContext::new( session_id, conn_info, crate::metrics::Protocol::Tcp, @@ -247,14 +247,14 @@ impl ReportableError for ClientRequestError { pub(crate) async fn handle_client( config: &'static ProxyConfig, auth_backend: &'static auth::Backend<'static, ()>, - ctx: &RequestMonitoring, + ctx: &RequestContext, cancellation_handler: Arc, stream: S, 
mode: ClientMode, endpoint_rate_limiter: Arc, conn_gauge: NumClientConnectionsGuard<'static>, ) -> Result>, ClientRequestError> { - info!( + debug!( protocol = %ctx.protocol(), "handling interactive connection from client" ); @@ -268,12 +268,18 @@ pub(crate) async fn handle_client( let record_handshake_error = !ctx.has_private_peer_addr(); let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Client); let do_handshake = handshake(ctx, stream, mode.handshake_tls(tls), record_handshake_error); + let (mut stream, params) = match tokio::time::timeout(config.handshake_timeout, do_handshake).await?? { HandshakeData::Startup(stream, params) => (stream, params), HandshakeData::Cancel(cancel_key_data) => { return Ok(cancellation_handler - .cancel_session(cancel_key_data, ctx.session_id()) + .cancel_session( + cancel_key_data, + ctx.session_id(), + &ctx.peer_addr(), + config.authentication_config.ip_allowlist_check_enabled, + ) .await .map(|()| None)?) } diff --git a/proxy/src/proxy/passthrough.rs b/proxy/src/proxy/passthrough.rs index e3b4730982..5e07c8eeae 100644 --- a/proxy/src/proxy/passthrough.rs +++ b/proxy/src/proxy/passthrough.rs @@ -1,5 +1,5 @@ use tokio::io::{AsyncRead, AsyncWrite}; -use tracing::info; +use tracing::debug; use utils::measured_stream::MeasuredStream; use super::copy_bidirectional::ErrorSource; @@ -45,7 +45,7 @@ pub(crate) async fn proxy_pass( ); // Starting from here we only proxy the client's traffic. 
- info!("performing the proxy pass..."); + debug!("performing the proxy pass..."); let _ = crate::proxy::copy_bidirectional::copy_bidirectional_client_compute( &mut client, &mut compute, diff --git a/proxy/src/proxy/tests/mitm.rs b/proxy/src/proxy/tests/mitm.rs index df9f79a7e3..fe211adfeb 100644 --- a/proxy/src/proxy/tests/mitm.rs +++ b/proxy/src/proxy/tests/mitm.rs @@ -36,7 +36,7 @@ async fn proxy_mitm( // begin handshake with end_server let end_server = connect_tls(server2, client_config2.make_tls_connect().unwrap()).await; let (end_client, startup) = match handshake( - &RequestMonitoring::test(), + &RequestContext::test(), client1, Some(&server_config1), false, diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index be821925b5..3de8ca8736 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -162,7 +162,7 @@ impl TestAuth for Scram { stream: &mut PqStream>, ) -> anyhow::Result<()> { let outcome = auth::AuthFlow::new(stream) - .begin(auth::Scram(&self.0, &RequestMonitoring::test())) + .begin(auth::Scram(&self.0, &RequestContext::test())) .await? .authenticate() .await?; @@ -182,11 +182,10 @@ async fn dummy_proxy( auth: impl TestAuth + Send, ) -> anyhow::Result<()> { let (client, _) = read_proxy_protocol(client).await?; - let mut stream = - match handshake(&RequestMonitoring::test(), client, tls.as_ref(), false).await? { - HandshakeData::Startup(stream, _) => stream, - HandshakeData::Cancel(_) => bail!("cancellation not supported"), - }; + let mut stream = match handshake(&RequestContext::test(), client, tls.as_ref(), false).await? 
{ + HandshakeData::Startup(stream, _) => stream, + HandshakeData::Cancel(_) => bail!("cancellation not supported"), + }; auth.authenticate(&mut stream).await?; @@ -466,7 +465,7 @@ impl ConnectMechanism for TestConnectMechanism { async fn connect_once( &self, - _ctx: &RequestMonitoring, + _ctx: &RequestContext, _node_info: &control_plane::CachedNodeInfo, _timeout: std::time::Duration, ) -> Result { @@ -581,7 +580,7 @@ fn helper_create_connect_info( async fn connect_to_compute_success() { let _ = env_logger::try_init(); use ConnectAction::*; - let ctx = RequestMonitoring::test(); + let ctx = RequestContext::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Connect]); let user_info = helper_create_connect_info(&mechanism); let config = RetryConfig { @@ -599,7 +598,7 @@ async fn connect_to_compute_success() { async fn connect_to_compute_retry() { let _ = env_logger::try_init(); use ConnectAction::*; - let ctx = RequestMonitoring::test(); + let ctx = RequestContext::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Connect]); let user_info = helper_create_connect_info(&mechanism); let config = RetryConfig { @@ -618,7 +617,7 @@ async fn connect_to_compute_retry() { async fn connect_to_compute_non_retry_1() { let _ = env_logger::try_init(); use ConnectAction::*; - let ctx = RequestMonitoring::test(); + let ctx = RequestContext::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Fail]); let user_info = helper_create_connect_info(&mechanism); let config = RetryConfig { @@ -637,7 +636,7 @@ async fn connect_to_compute_non_retry_1() { async fn connect_to_compute_non_retry_2() { let _ = env_logger::try_init(); use ConnectAction::*; - let ctx = RequestMonitoring::test(); + let ctx = RequestContext::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Fail, Wake, Connect]); let user_info = helper_create_connect_info(&mechanism); let config = RetryConfig { @@ -657,7 +656,7 @@ async fn 
connect_to_compute_non_retry_3() { let _ = env_logger::try_init(); tokio::time::pause(); use ConnectAction::*; - let ctx = RequestMonitoring::test(); + let ctx = RequestContext::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Retry, Retry, Retry, Retry, Retry]); let user_info = helper_create_connect_info(&mechanism); @@ -689,7 +688,7 @@ async fn connect_to_compute_non_retry_3() { async fn wake_retry() { let _ = env_logger::try_init(); use ConnectAction::*; - let ctx = RequestMonitoring::test(); + let ctx = RequestContext::test(); let mechanism = TestConnectMechanism::new(vec![WakeRetry, Wake, Connect]); let user_info = helper_create_connect_info(&mechanism); let config = RetryConfig { @@ -708,7 +707,7 @@ async fn wake_retry() { async fn wake_non_retry() { let _ = env_logger::try_init(); use ConnectAction::*; - let ctx = RequestMonitoring::test(); + let ctx = RequestContext::test(); let mechanism = TestConnectMechanism::new(vec![WakeRetry, WakeFail]); let user_info = helper_create_connect_info(&mechanism); let config = RetryConfig { diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs index f9f46bb66c..8a672d48dc 100644 --- a/proxy/src/proxy/wake_compute.rs +++ b/proxy/src/proxy/wake_compute.rs @@ -2,7 +2,7 @@ use tracing::{error, info, warn}; use super::connect_compute::ComputeConnectBackend; use crate::config::RetryConfig; -use crate::context::RequestMonitoring; +use crate::context::RequestContext; use crate::control_plane::errors::WakeComputeError; use crate::control_plane::CachedNodeInfo; use crate::error::ReportableError; @@ -13,11 +13,10 @@ use crate::proxy::retry::{retry_after, should_retry}; pub(crate) async fn wake_compute( num_retries: &mut u32, - ctx: &RequestMonitoring, + ctx: &RequestContext, api: &B, config: RetryConfig, ) -> Result { - let retry_type = RetryType::WakeCompute; loop { match api.wake_compute(ctx).await { Err(e) if !should_retry(&e, *num_retries, config) => { @@ -26,7 +25,7 @@ pub(crate) 
async fn wake_compute( Metrics::get().proxy.retries_metric.observe( RetriesMetricGroup { outcome: ConnectOutcome::Failed, - retry_type, + retry_type: RetryType::WakeCompute, }, (*num_retries).into(), ); @@ -40,10 +39,12 @@ pub(crate) async fn wake_compute( Metrics::get().proxy.retries_metric.observe( RetriesMetricGroup { outcome: ConnectOutcome::Success, - retry_type, + retry_type: RetryType::WakeCompute, }, (*num_retries).into(), ); + // TODO: is this necessary? We have a metric. + // TODO: this log line is misleading as "wake_compute" might return cached (and stale) info. info!(?num_retries, "compute node woken up after"); return Ok(n); } diff --git a/proxy/src/rate_limiter/limit_algorithm.rs b/proxy/src/rate_limiter/limit_algorithm.rs index 16c398f303..b74a9ab17e 100644 --- a/proxy/src/rate_limiter/limit_algorithm.rs +++ b/proxy/src/rate_limiter/limit_algorithm.rs @@ -195,7 +195,11 @@ impl DynamicLimiter { /// /// Set the outcome to `None` to ignore the job. fn release_inner(&self, start: Instant, outcome: Option) { - tracing::info!("outcome is {:?}", outcome); + if outcome.is_none() { + tracing::warn!("outcome is {:?}", outcome); + } else { + tracing::debug!("outcome is {:?}", outcome); + } if self.config.initial_limit == 0 { return; } diff --git a/proxy/src/rate_limiter/limit_algorithm/aimd.rs b/proxy/src/rate_limiter/limit_algorithm/aimd.rs index 5332a5184f..3000cc4c2a 100644 --- a/proxy/src/rate_limiter/limit_algorithm/aimd.rs +++ b/proxy/src/rate_limiter/limit_algorithm/aimd.rs @@ -31,26 +31,32 @@ impl LimitAlgorithm for Aimd { if utilisation > self.utilisation { let limit = old_limit + self.inc; - let increased_limit = limit.clamp(self.min, self.max); - if increased_limit > old_limit { - tracing::info!(increased_limit, "limit increased"); + let new_limit = limit.clamp(self.min, self.max); + if new_limit > old_limit { + tracing::info!(old_limit, new_limit, "limit increased"); + } else { + tracing::debug!(old_limit, new_limit, "limit clamped at max"); } - 
increased_limit + new_limit } else { old_limit } } Outcome::Overload => { - let limit = old_limit as f32 * self.dec; + let new_limit = old_limit as f32 * self.dec; // Floor instead of round, so the limit reduces even with small numbers. // E.g. round(2 * 0.9) = 2, but floor(2 * 0.9) = 1 - let limit = limit.floor() as usize; + let new_limit = new_limit.floor() as usize; - let limit = limit.clamp(self.min, self.max); - tracing::info!(limit, "limit decreased"); - limit + let new_limit = new_limit.clamp(self.min, self.max); + if new_limit < old_limit { + tracing::info!(old_limit, new_limit, "limit decreased"); + } else { + tracing::debug!(old_limit, new_limit, "limit clamped at min"); + } + new_limit } } } diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index 4259fd04f4..a048721e77 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -14,13 +14,13 @@ use tracing::info; use crate::intern::EndpointIdInt; -pub(crate) struct GlobalRateLimiter { +pub struct GlobalRateLimiter { data: Vec, info: Vec, } impl GlobalRateLimiter { - pub(crate) fn new(info: Vec) -> Self { + pub fn new(info: Vec) -> Self { Self { data: vec![ RateBucket { @@ -34,7 +34,7 @@ impl GlobalRateLimiter { } /// Check that number of connections is below `max_rps` rps. - pub(crate) fn check(&mut self) -> bool { + pub fn check(&mut self) -> bool { let now = Instant::now(); let should_allow_request = self @@ -137,6 +137,19 @@ impl RateBucketInfo { Self::new(200, Duration::from_secs(600)), ]; + /// All of these are per endpoint-maskedip pair. + /// Context: 4096 rounds of pbkdf2 take about 1ms of cpu time to execute (1 milli-cpu-second or 1mcpus). + /// + /// First bucket: 1000mcpus total per endpoint-ip pair + /// * 4096000 requests per second with 1 hash rounds. + /// * 1000 requests per second with 4096 hash rounds. + /// * 6.8 requests per second with 600000 hash rounds. 
+ pub const DEFAULT_AUTH_SET: [Self; 3] = [ + Self::new(1000 * 4096, Duration::from_secs(1)), + Self::new(600 * 4096, Duration::from_secs(60)), + Self::new(300 * 4096, Duration::from_secs(600)), + ]; + pub fn rps(&self) -> f64 { (self.max_rpi as f64) / self.interval.as_secs_f64() } diff --git a/proxy/src/rate_limiter/mod.rs b/proxy/src/rate_limiter/mod.rs index 3ae2ecaf8f..5f90102da3 100644 --- a/proxy/src/rate_limiter/mod.rs +++ b/proxy/src/rate_limiter/mod.rs @@ -8,5 +8,4 @@ pub(crate) use limit_algorithm::aimd::Aimd; pub(crate) use limit_algorithm::{ DynamicLimiter, Outcome, RateLimitAlgorithm, RateLimiterConfig, Token, }; -pub(crate) use limiter::GlobalRateLimiter; -pub use limiter::{BucketRateLimiter, RateBucketInfo, WakeComputeRateLimiter}; +pub use limiter::{BucketRateLimiter, GlobalRateLimiter, RateBucketInfo, WakeComputeRateLimiter}; diff --git a/proxy/src/redis/cancellation_publisher.rs b/proxy/src/redis/cancellation_publisher.rs index 0000246971..633a2f1b81 100644 --- a/proxy/src/redis/cancellation_publisher.rs +++ b/proxy/src/redis/cancellation_publisher.rs @@ -1,5 +1,6 @@ use std::sync::Arc; +use core::net::IpAddr; use pq_proto::CancelKeyData; use redis::AsyncCommands; use tokio::sync::Mutex; @@ -15,6 +16,7 @@ pub trait CancellationPublisherMut: Send + Sync + 'static { &mut self, cancel_key_data: CancelKeyData, session_id: Uuid, + peer_addr: IpAddr, ) -> anyhow::Result<()>; } @@ -24,6 +26,7 @@ pub trait CancellationPublisher: Send + Sync + 'static { &self, cancel_key_data: CancelKeyData, session_id: Uuid, + peer_addr: IpAddr, ) -> anyhow::Result<()>; } @@ -32,6 +35,7 @@ impl CancellationPublisher for () { &self, _cancel_key_data: CancelKeyData, _session_id: Uuid, + _peer_addr: IpAddr, ) -> anyhow::Result<()> { Ok(()) } @@ -42,8 +46,10 @@ impl CancellationPublisherMut for P { &mut self, cancel_key_data: CancelKeyData, session_id: Uuid, + peer_addr: IpAddr, ) -> anyhow::Result<()> { -

::try_publish(self, cancel_key_data, session_id).await +

::try_publish(self, cancel_key_data, session_id, peer_addr) + .await } } @@ -52,9 +58,10 @@ impl CancellationPublisher for Option

{ &self, cancel_key_data: CancelKeyData, session_id: Uuid, + peer_addr: IpAddr, ) -> anyhow::Result<()> { if let Some(p) = self { - p.try_publish(cancel_key_data, session_id).await + p.try_publish(cancel_key_data, session_id, peer_addr).await } else { Ok(()) } @@ -66,10 +73,11 @@ impl CancellationPublisher for Arc> { &self, cancel_key_data: CancelKeyData, session_id: Uuid, + peer_addr: IpAddr, ) -> anyhow::Result<()> { self.lock() .await - .try_publish(cancel_key_data, session_id) + .try_publish(cancel_key_data, session_id, peer_addr) .await } } @@ -97,11 +105,13 @@ impl RedisPublisherClient { &mut self, cancel_key_data: CancelKeyData, session_id: Uuid, + peer_addr: IpAddr, ) -> anyhow::Result<()> { let payload = serde_json::to_string(&Notification::Cancel(CancelSession { region_id: Some(self.region_id.clone()), cancel_key_data, session_id, + peer_addr: Some(peer_addr), }))?; let _: () = self.client.publish(PROXY_CHANNEL_NAME, payload).await?; Ok(()) @@ -120,12 +130,14 @@ impl RedisPublisherClient { &mut self, cancel_key_data: CancelKeyData, session_id: Uuid, + peer_addr: IpAddr, ) -> anyhow::Result<()> { + // TODO: review redundant error duplication logs. if !self.limiter.check() { tracing::info!("Rate limit exceeded. Skipping cancellation message"); return Err(anyhow::anyhow!("Rate limit exceeded")); } - match self.publish(cancel_key_data, session_id).await { + match self.publish(cancel_key_data, session_id, peer_addr).await { Ok(()) => return Ok(()), Err(e) => { tracing::error!("failed to publish a message: {e}"); @@ -133,7 +145,7 @@ impl RedisPublisherClient { } tracing::info!("Publisher is disconnected. 
Reconnectiong..."); self.try_connect().await?; - self.publish(cancel_key_data, session_id).await + self.publish(cancel_key_data, session_id, peer_addr).await } } @@ -142,11 +154,15 @@ impl CancellationPublisherMut for RedisPublisherClient { &mut self, cancel_key_data: CancelKeyData, session_id: Uuid, + peer_addr: IpAddr, ) -> anyhow::Result<()> { tracing::info!("publishing cancellation key to Redis"); - match self.try_publish_internal(cancel_key_data, session_id).await { + match self + .try_publish_internal(cancel_key_data, session_id, peer_addr) + .await + { Ok(()) => { - tracing::info!("cancellation key successfuly published to Redis"); + tracing::debug!("cancellation key successfuly published to Redis"); Ok(()) } Err(e) => { diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index 62e7b1b565..65008ae943 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -60,6 +60,7 @@ pub(crate) struct CancelSession { pub(crate) region_id: Option, pub(crate) cancel_key_data: CancelKeyData, pub(crate) session_id: Uuid, + pub(crate) peer_addr: Option, } fn deserialize_json_string<'de, D, T>(deserializer: D) -> Result @@ -137,10 +138,20 @@ impl MessageHandler { return Ok(()); } } + + // TODO: Remove unspecified peer_addr after the complete migration to the new format + let peer_addr = cancel_session + .peer_addr + .unwrap_or(std::net::IpAddr::V4(std::net::Ipv4Addr::UNSPECIFIED)); // This instance of cancellation_handler doesn't have a RedisPublisherClient so it can't publish the message. 
match self .cancellation_handler - .cancel_session(cancel_session.cancel_key_data, uuid::Uuid::nil()) + .cancel_session( + cancel_session.cancel_key_data, + uuid::Uuid::nil(), + &peer_addr, + cancel_session.peer_addr.is_some(), + ) .await { Ok(()) => {} @@ -335,6 +346,7 @@ mod tests { cancel_key_data, region_id: None, session_id: uuid, + peer_addr: None, }); let text = serde_json::to_string(&msg)?; let result: Notification = serde_json::from_str(&text)?; @@ -344,6 +356,7 @@ mod tests { cancel_key_data, region_id: Some("region".to_string()), session_id: uuid, + peer_addr: None, }); let text = serde_json::to_string(&msg)?; let result: Notification = serde_json::from_str(&text)?; diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 7fc5bd236d..3037e20888 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -12,8 +12,8 @@ use tracing::field::display; use tracing::{debug, info}; use super::conn_pool::poll_client; -use super::conn_pool_lib::{Client, ConnInfo, GlobalConnPool}; -use super::http_conn_pool::{self, poll_http2_client, Send}; +use super::conn_pool_lib::{Client, ConnInfo, EndpointConnPool, GlobalConnPool}; +use super::http_conn_pool::{self, poll_http2_client, HttpConnPool, Send}; use super::local_conn_pool::{self, LocalConnPool, EXT_NAME, EXT_SCHEMA, EXT_VERSION}; use crate::auth::backend::local::StaticAuthRules; use crate::auth::backend::{ComputeCredentials, ComputeUserInfo}; @@ -23,7 +23,7 @@ use crate::compute_ctl::{ ComputeCtlError, ExtensionInstallRequest, Privilege, SetRoleGrantsRequest, }; use crate::config::ProxyConfig; -use crate::context::RequestMonitoring; +use crate::context::RequestContext; use crate::control_plane::client::ApiLockError; use crate::control_plane::errors::{GetAuthInfoError, WakeComputeError}; use crate::control_plane::locks::ApiLocks; @@ -33,12 +33,13 @@ use crate::intern::EndpointIdInt; use crate::proxy::connect_compute::ConnectMechanism; use 
crate::proxy::retry::{CouldRetry, ShouldRetryWakeCompute}; use crate::rate_limiter::EndpointRateLimiter; -use crate::types::{EndpointId, Host}; +use crate::types::{EndpointId, Host, LOCAL_PROXY_SUFFIX}; pub(crate) struct PoolingBackend { - pub(crate) http_conn_pool: Arc>, + pub(crate) http_conn_pool: Arc>>, pub(crate) local_pool: Arc>, - pub(crate) pool: Arc>, + pub(crate) pool: + Arc>>, pub(crate) config: &'static ProxyConfig, pub(crate) auth_backend: &'static crate::auth::Backend<'static, ()>, @@ -48,7 +49,7 @@ pub(crate) struct PoolingBackend { impl PoolingBackend { pub(crate) async fn authenticate_with_password( &self, - ctx: &RequestMonitoring, + ctx: &RequestContext, user_info: &ComputeUserInfo, password: &[u8], ) -> Result { @@ -110,7 +111,7 @@ impl PoolingBackend { pub(crate) async fn authenticate_with_jwt( &self, - ctx: &RequestMonitoring, + ctx: &RequestContext, user_info: &ComputeUserInfo, jwt: String, ) -> Result { @@ -161,16 +162,16 @@ impl PoolingBackend { #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)] pub(crate) async fn connect_to_compute( &self, - ctx: &RequestMonitoring, + ctx: &RequestContext, conn_info: ConnInfo, keys: ComputeCredentials, force_new: bool, ) -> Result, HttpConnError> { let maybe_client = if force_new { - info!("pool: pool is disabled"); + debug!("pool: pool is disabled"); None } else { - info!("pool: looking for an existing connection"); + debug!("pool: looking for an existing connection"); self.pool.get(ctx, &conn_info)? 
}; @@ -201,21 +202,24 @@ impl PoolingBackend { #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)] pub(crate) async fn connect_to_local_proxy( &self, - ctx: &RequestMonitoring, + ctx: &RequestContext, conn_info: ConnInfo, ) -> Result, HttpConnError> { - info!("pool: looking for an existing connection"); + debug!("pool: looking for an existing connection"); if let Ok(Some(client)) = self.http_conn_pool.get(ctx, &conn_info) { return Ok(client); } let conn_id = uuid::Uuid::new_v4(); tracing::Span::current().record("conn_id", display(conn_id)); - info!(%conn_id, "pool: opening a new connection '{conn_info}'"); + debug!(%conn_id, "pool: opening a new connection '{conn_info}'"); let backend = self.auth_backend.as_ref().map(|()| ComputeCredentials { info: ComputeUserInfo { user: conn_info.user_info.user.clone(), - endpoint: EndpointId::from(format!("{}-local-proxy", conn_info.user_info.endpoint)), + endpoint: EndpointId::from(format!( + "{}{LOCAL_PROXY_SUFFIX}", + conn_info.user_info.endpoint.normalize() + )), options: conn_info.user_info.options.clone(), }, keys: crate::auth::backend::ComputeCredentialKeys::None, @@ -246,7 +250,7 @@ impl PoolingBackend { #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)] pub(crate) async fn connect_to_local_postgres( &self, - ctx: &RequestMonitoring, + ctx: &RequestContext, conn_info: ConnInfo, ) -> Result, HttpConnError> { if let Some(client) = self.local_pool.get(ctx, &conn_info)? 
{ @@ -471,7 +475,7 @@ impl ShouldRetryWakeCompute for LocalProxyConnError { } struct TokioMechanism { - pool: Arc>, + pool: Arc>>, conn_info: ConnInfo, conn_id: uuid::Uuid, @@ -487,7 +491,7 @@ impl ConnectMechanism for TokioMechanism { async fn connect_once( &self, - ctx: &RequestMonitoring, + ctx: &RequestContext, node_info: &CachedNodeInfo, timeout: Duration, ) -> Result { @@ -521,7 +525,7 @@ impl ConnectMechanism for TokioMechanism { } struct HyperMechanism { - pool: Arc>, + pool: Arc>>, conn_info: ConnInfo, conn_id: uuid::Uuid, @@ -537,7 +541,7 @@ impl ConnectMechanism for HyperMechanism { async fn connect_once( &self, - ctx: &RequestMonitoring, + ctx: &RequestContext, node_info: &CachedNodeInfo, timeout: Duration, ) -> Result { diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index 1845603bf7..bd262f45ed 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -19,9 +19,10 @@ use { }; use super::conn_pool_lib::{ - Client, ClientDataEnum, ClientInnerCommon, ClientInnerExt, ConnInfo, GlobalConnPool, + Client, ClientDataEnum, ClientInnerCommon, ClientInnerExt, ConnInfo, EndpointConnPool, + GlobalConnPool, }; -use crate::context::RequestMonitoring; +use crate::context::RequestContext; use crate::control_plane::messages::MetricsAuxInfo; use crate::metrics::Metrics; @@ -52,8 +53,8 @@ impl fmt::Display for ConnInfo { } pub(crate) fn poll_client( - global_pool: Arc>, - ctx: &RequestMonitoring, + global_pool: Arc>>, + ctx: &RequestContext, conn_info: ConnInfo, client: C, mut connection: tokio_postgres::Connection, @@ -167,6 +168,7 @@ pub(crate) fn poll_client( Client::new(inner, conn_info, pool_clone) } +#[derive(Clone)] pub(crate) struct ClientDataRemote { session: tokio::sync::watch::Sender, cancel: CancellationToken, @@ -243,7 +245,7 @@ mod tests { }, cancel_set: CancelSet::new(0), client_conn_threshold: u64::MAX, - max_request_size_bytes: u64::MAX, + max_request_size_bytes: usize::MAX, 
max_response_size_bytes: usize::MAX, })); let pool = GlobalConnPool::new(config); diff --git a/proxy/src/serverless/conn_pool_lib.rs b/proxy/src/serverless/conn_pool_lib.rs index 61c39c32c9..fe1d2563bc 100644 --- a/proxy/src/serverless/conn_pool_lib.rs +++ b/proxy/src/serverless/conn_pool_lib.rs @@ -1,4 +1,5 @@ use std::collections::HashMap; +use std::marker::PhantomData; use std::ops::Deref; use std::sync::atomic::{self, AtomicUsize}; use std::sync::{Arc, Weak}; @@ -15,7 +16,7 @@ use super::conn_pool::ClientDataRemote; use super::http_conn_pool::ClientDataHttp; use super::local_conn_pool::ClientDataLocal; use crate::auth::backend::ComputeUserInfo; -use crate::context::RequestMonitoring; +use crate::context::RequestContext; use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::{HttpEndpointPoolsGuard, Metrics}; use crate::types::{DbName, EndpointCacheKey, RoleName}; @@ -43,13 +44,14 @@ impl ConnInfo { } } +#[derive(Clone)] pub(crate) enum ClientDataEnum { Remote(ClientDataRemote), Local(ClientDataLocal), - #[allow(dead_code)] Http(ClientDataHttp), } +#[derive(Clone)] pub(crate) struct ClientInnerCommon { pub(crate) inner: C, pub(crate) aux: MetricsAuxInfo, @@ -91,6 +93,7 @@ pub(crate) struct ConnPoolEntry { pub(crate) struct EndpointConnPool { pools: HashMap<(DbName, RoleName), DbUserConnPool>, total_conns: usize, + /// max # connections per endpoint max_conns: usize, _guard: HttpEndpointPoolsGuard<'static>, global_connections_count: Arc, @@ -232,7 +235,7 @@ impl EndpointConnPool { // do logging outside of the mutex if returned { - info!(%conn_id, "{pool_name}: returning connection '{conn_info}' back to the pool, total_conns={total_conns}, for this (db, user)={per_db_size}"); + debug!(%conn_id, "{pool_name}: returning connection '{conn_info}' back to the pool, total_conns={total_conns}, for this (db, user)={per_db_size}"); } else { info!(%conn_id, "{pool_name}: throwing away connection '{conn_info}' because pool is full, 
total_conns={total_conns}"); } @@ -317,24 +320,49 @@ impl DbUserConn for DbUserConnPool { } } -pub(crate) struct GlobalConnPool { +pub(crate) trait EndpointConnPoolExt { + fn clear_closed(&mut self) -> usize; + fn total_conns(&self) -> usize; +} + +impl EndpointConnPoolExt for EndpointConnPool { + fn clear_closed(&mut self) -> usize { + let mut clients_removed: usize = 0; + for db_pool in self.pools.values_mut() { + clients_removed += db_pool.clear_closed_clients(&mut self.total_conns); + } + clients_removed + } + + fn total_conns(&self) -> usize { + self.total_conns + } +} + +pub(crate) struct GlobalConnPool +where + C: ClientInnerExt, + P: EndpointConnPoolExt, +{ // endpoint -> per-endpoint connection pool // // That should be a fairly conteded map, so return reference to the per-endpoint // pool as early as possible and release the lock. - global_pool: DashMap>>>, + pub(crate) global_pool: DashMap>>, /// Number of endpoint-connection pools /// /// [`DashMap::len`] iterates over all inner pools and acquires a read lock on each. /// That seems like far too much effort, so we're using a relaxed increment counter instead. /// It's only used for diagnostics. 
- global_pool_size: AtomicUsize, + pub(crate) global_pool_size: AtomicUsize, /// Total number of connections in the pool - global_connections_count: Arc, + pub(crate) global_connections_count: Arc, - config: &'static crate::config::HttpConfig, + pub(crate) config: &'static crate::config::HttpConfig, + + _marker: PhantomData, } #[derive(Debug, Clone, Copy)] @@ -357,7 +385,11 @@ pub struct GlobalConnPoolOptions { pub max_total_conns: usize, } -impl GlobalConnPool { +impl GlobalConnPool +where + C: ClientInnerExt, + P: EndpointConnPoolExt, +{ pub(crate) fn new(config: &'static crate::config::HttpConfig) -> Arc { let shards = config.pool_options.pool_shards; Arc::new(Self { @@ -365,6 +397,7 @@ impl GlobalConnPool { global_pool_size: AtomicUsize::new(0), config, global_connections_count: Arc::new(AtomicUsize::new(0)), + _marker: PhantomData, }) } @@ -378,60 +411,6 @@ impl GlobalConnPool { self.config.pool_options.idle_timeout } - pub(crate) fn get( - self: &Arc, - ctx: &RequestMonitoring, - conn_info: &ConnInfo, - ) -> Result>, HttpConnError> { - let mut client: Option> = None; - let Some(endpoint) = conn_info.endpoint_cache_key() else { - return Ok(None); - }; - - let endpoint_pool = self.get_or_create_endpoint_pool(&endpoint); - if let Some(entry) = endpoint_pool - .write() - .get_conn_entry(conn_info.db_and_user()) - { - client = Some(entry.conn); - } - let endpoint_pool = Arc::downgrade(&endpoint_pool); - - // ok return cached connection if found and establish a new one otherwise - if let Some(mut client) = client { - if client.inner.is_closed() { - info!("pool: cached connection '{conn_info}' is closed, opening a new one"); - return Ok(None); - } - tracing::Span::current() - .record("conn_id", tracing::field::display(client.get_conn_id())); - tracing::Span::current().record( - "pid", - tracing::field::display(client.inner.get_process_id()), - ); - info!( - cold_start_info = ColdStartInfo::HttpPoolHit.as_str(), - "pool: reusing connection '{conn_info}'" - ); - - 
match client.get_data() { - ClientDataEnum::Local(data) => { - data.session().send(ctx.session_id())?; - } - - ClientDataEnum::Remote(data) => { - data.session().send(ctx.session_id())?; - } - ClientDataEnum::Http(_) => (), - } - - ctx.set_cold_start_info(ColdStartInfo::HttpPoolHit); - ctx.success(); - return Ok(Some(Client::new(client, conn_info.clone(), endpoint_pool))); - } - Ok(None) - } - pub(crate) fn shutdown(&self) { // drops all strong references to endpoint-pools self.global_pool.clear(); @@ -464,17 +443,10 @@ impl GlobalConnPool { // if the current endpoint pool is unique (no other strong or weak references) // then it is currently not in use by any connections. if let Some(pool) = Arc::get_mut(x.get_mut()) { - let EndpointConnPool { - pools, total_conns, .. - } = pool.get_mut(); + let endpoints = pool.get_mut(); + clients_removed = endpoints.clear_closed(); - // ensure that closed clients are removed - for db_pool in pools.values_mut() { - clients_removed += db_pool.clear_closed_clients(total_conns); - } - - // we only remove this pool if it has no active connections - if *total_conns == 0 { + if endpoints.total_conns() == 0 { info!("pool: discarding pool for endpoint {endpoint}"); return false; } @@ -510,6 +482,62 @@ impl GlobalConnPool { info!("pool: performed global pool gc. 
size now {global_pool_size}"); } } +} + +impl GlobalConnPool> { + pub(crate) fn get( + self: &Arc, + ctx: &RequestContext, + conn_info: &ConnInfo, + ) -> Result>, HttpConnError> { + let mut client: Option> = None; + let Some(endpoint) = conn_info.endpoint_cache_key() else { + return Ok(None); + }; + + let endpoint_pool = self.get_or_create_endpoint_pool(&endpoint); + if let Some(entry) = endpoint_pool + .write() + .get_conn_entry(conn_info.db_and_user()) + { + client = Some(entry.conn); + } + let endpoint_pool = Arc::downgrade(&endpoint_pool); + + // ok return cached connection if found and establish a new one otherwise + if let Some(mut client) = client { + if client.inner.is_closed() { + info!("pool: cached connection '{conn_info}' is closed, opening a new one"); + return Ok(None); + } + tracing::Span::current() + .record("conn_id", tracing::field::display(client.get_conn_id())); + tracing::Span::current().record( + "pid", + tracing::field::display(client.inner.get_process_id()), + ); + debug!( + cold_start_info = ColdStartInfo::HttpPoolHit.as_str(), + "pool: reusing connection '{conn_info}'" + ); + + match client.get_data() { + ClientDataEnum::Local(data) => { + data.session().send(ctx.session_id())?; + } + + ClientDataEnum::Remote(data) => { + data.session().send(ctx.session_id())?; + } + ClientDataEnum::Http(_) => (), + } + + ctx.set_cold_start_info(ColdStartInfo::HttpPoolHit); + ctx.success(); + return Ok(Some(Client::new(client, conn_info.clone(), endpoint_pool))); + } + Ok(None) + } pub(crate) fn get_or_create_endpoint_pool( self: &Arc, @@ -556,7 +584,6 @@ impl GlobalConnPool { pool } } - pub(crate) struct Client { span: Span, inner: Option>, diff --git a/proxy/src/serverless/http_conn_pool.rs b/proxy/src/serverless/http_conn_pool.rs index a1d4473b01..fde38d0de3 100644 --- a/proxy/src/serverless/http_conn_pool.rs +++ b/proxy/src/serverless/http_conn_pool.rs @@ -2,17 +2,18 @@ use std::collections::VecDeque; use std::sync::atomic::{self, AtomicUsize}; use 
std::sync::{Arc, Weak}; -use dashmap::DashMap; use hyper::client::conn::http2; use hyper_util::rt::{TokioExecutor, TokioIo}; use parking_lot::RwLock; -use rand::Rng; use tokio::net::TcpStream; use tracing::{debug, error, info, info_span, Instrument}; use super::backend::HttpConnError; -use super::conn_pool_lib::{ClientInnerExt, ConnInfo}; -use crate::context::RequestMonitoring; +use super::conn_pool_lib::{ + ClientDataEnum, ClientInnerCommon, ClientInnerExt, ConnInfo, ConnPoolEntry, + EndpointConnPoolExt, GlobalConnPool, +}; +use crate::context::RequestContext; use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::{HttpEndpointPoolsGuard, Metrics}; use crate::types::EndpointCacheKey; @@ -23,17 +24,11 @@ pub(crate) type Connect = http2::Connection, hyper::body::Incoming, TokioExecutor>; #[derive(Clone)] -pub(crate) struct ConnPoolEntry { - conn: C, - conn_id: uuid::Uuid, - aux: MetricsAuxInfo, -} - pub(crate) struct ClientDataHttp(); // Per-endpoint connection pool // Number of open connections is limited by the `max_conns_per_endpoint`. -pub(crate) struct EndpointConnPool { +pub(crate) struct HttpConnPool { // TODO(conrad): // either we should open more connections depending on stream count // (not exposed by hyper, need our own counter) @@ -48,14 +43,19 @@ pub(crate) struct EndpointConnPool { global_connections_count: Arc, } -impl EndpointConnPool { +impl HttpConnPool { fn get_conn_entry(&mut self) -> Option> { let Self { conns, .. 
} = self; loop { let conn = conns.pop_front()?; - if !conn.conn.is_closed() { - conns.push_back(conn.clone()); + if !conn.conn.inner.is_closed() { + let new_conn = ConnPoolEntry { + conn: conn.conn.clone(), + _last_access: std::time::Instant::now(), + }; + + conns.push_back(new_conn); return Some(conn); } } @@ -69,7 +69,7 @@ impl EndpointConnPool { } = self; let old_len = conns.len(); - conns.retain(|conn| conn.conn_id != conn_id); + conns.retain(|entry| entry.conn.conn_id != conn_id); let new_len = conns.len(); let removed = old_len - new_len; if removed > 0 { @@ -84,7 +84,22 @@ impl EndpointConnPool { } } -impl Drop for EndpointConnPool { +impl EndpointConnPoolExt for HttpConnPool { + fn clear_closed(&mut self) -> usize { + let Self { conns, .. } = self; + let old_len = conns.len(); + conns.retain(|entry| !entry.conn.inner.is_closed()); + + let new_len = conns.len(); + old_len - new_len + } + + fn total_conns(&self) -> usize { + self.conns.len() + } +} + +impl Drop for HttpConnPool { fn drop(&mut self) { if !self.conns.is_empty() { self.global_connections_count @@ -98,121 +113,11 @@ impl Drop for EndpointConnPool { } } -pub(crate) struct GlobalConnPool { - // endpoint -> per-endpoint connection pool - // - // That should be a fairly conteded map, so return reference to the per-endpoint - // pool as early as possible and release the lock. - global_pool: DashMap>>>, - - /// Number of endpoint-connection pools - /// - /// [`DashMap::len`] iterates over all inner pools and acquires a read lock on each. - /// That seems like far too much effort, so we're using a relaxed increment counter instead. - /// It's only used for diagnostics. 
- global_pool_size: AtomicUsize, - - /// Total number of connections in the pool - global_connections_count: Arc, - - config: &'static crate::config::HttpConfig, -} - -impl GlobalConnPool { - pub(crate) fn new(config: &'static crate::config::HttpConfig) -> Arc { - let shards = config.pool_options.pool_shards; - Arc::new(Self { - global_pool: DashMap::with_shard_amount(shards), - global_pool_size: AtomicUsize::new(0), - config, - global_connections_count: Arc::new(AtomicUsize::new(0)), - }) - } - - pub(crate) fn shutdown(&self) { - // drops all strong references to endpoint-pools - self.global_pool.clear(); - } - - pub(crate) async fn gc_worker(&self, mut rng: impl Rng) { - let epoch = self.config.pool_options.gc_epoch; - let mut interval = tokio::time::interval(epoch / (self.global_pool.shards().len()) as u32); - loop { - interval.tick().await; - - let shard = rng.gen_range(0..self.global_pool.shards().len()); - self.gc(shard); - } - } - - fn gc(&self, shard: usize) { - debug!(shard, "pool: performing epoch reclamation"); - - // acquire a random shard lock - let mut shard = self.global_pool.shards()[shard].write(); - - let timer = Metrics::get() - .proxy - .http_pool_reclaimation_lag_seconds - .start_timer(); - let current_len = shard.len(); - let mut clients_removed = 0; - shard.retain(|endpoint, x| { - // if the current endpoint pool is unique (no other strong or weak references) - // then it is currently not in use by any connections. - if let Some(pool) = Arc::get_mut(x.get_mut()) { - let EndpointConnPool { conns, .. 
} = pool.get_mut(); - - let old_len = conns.len(); - - conns.retain(|conn| !conn.conn.is_closed()); - - let new_len = conns.len(); - let removed = old_len - new_len; - clients_removed += removed; - - // we only remove this pool if it has no active connections - if conns.is_empty() { - info!("pool: discarding pool for endpoint {endpoint}"); - return false; - } - } - - true - }); - - let new_len = shard.len(); - drop(shard); - timer.observe(); - - // Do logging outside of the lock. - if clients_removed > 0 { - let size = self - .global_connections_count - .fetch_sub(clients_removed, atomic::Ordering::Relaxed) - - clients_removed; - Metrics::get() - .proxy - .http_pool_opened_connections - .get_metric() - .dec_by(clients_removed as i64); - info!("pool: performed global pool gc. removed {clients_removed} clients, total number of clients in pool is {size}"); - } - let removed = current_len - new_len; - - if removed > 0 { - let global_pool_size = self - .global_pool_size - .fetch_sub(removed, atomic::Ordering::Relaxed) - - removed; - info!("pool: performed global pool gc. 
size now {global_pool_size}"); - } - } - +impl GlobalConnPool> { #[expect(unused_results)] pub(crate) fn get( self: &Arc, - ctx: &RequestMonitoring, + ctx: &RequestContext, conn_info: &ConnInfo, ) -> Result>, HttpConnError> { let result: Result>, HttpConnError>; @@ -226,27 +131,28 @@ impl GlobalConnPool { return result; }; - tracing::Span::current().record("conn_id", tracing::field::display(client.conn_id)); - info!( + tracing::Span::current().record("conn_id", tracing::field::display(client.conn.conn_id)); + debug!( cold_start_info = ColdStartInfo::HttpPoolHit.as_str(), "pool: reusing connection '{conn_info}'" ); ctx.set_cold_start_info(ColdStartInfo::HttpPoolHit); ctx.success(); - Ok(Some(Client::new(client.conn, client.aux))) + + Ok(Some(Client::new(client.conn.clone()))) } fn get_or_create_endpoint_pool( self: &Arc, endpoint: &EndpointCacheKey, - ) -> Arc>> { + ) -> Arc>> { // fast path if let Some(pool) = self.global_pool.get(endpoint) { return pool.clone(); } // slow path - let new_pool = Arc::new(RwLock::new(EndpointConnPool { + let new_pool = Arc::new(RwLock::new(HttpConnPool { conns: VecDeque::new(), _guard: Metrics::get().proxy.http_endpoint_pools.guard(), global_connections_count: self.global_connections_count.clone(), @@ -279,8 +185,8 @@ impl GlobalConnPool { } pub(crate) fn poll_http2_client( - global_pool: Arc>, - ctx: &RequestMonitoring, + global_pool: Arc>>, + ctx: &RequestContext, conn_info: &ConnInfo, client: Send, connection: Connect, @@ -299,11 +205,15 @@ pub(crate) fn poll_http2_client( let pool = match conn_info.endpoint_cache_key() { Some(endpoint) => { let pool = global_pool.get_or_create_endpoint_pool(&endpoint); - - pool.write().conns.push_back(ConnPoolEntry { - conn: client.clone(), - conn_id, + let client = ClientInnerCommon { + inner: client.clone(), aux: aux.clone(), + conn_id, + data: ClientDataEnum::Http(ClientDataHttp()), + }; + pool.write().conns.push_back(ConnPoolEntry { + conn: client, + _last_access: std::time::Instant::now(), 
}); Metrics::get() .proxy @@ -335,23 +245,30 @@ pub(crate) fn poll_http2_client( .instrument(span), ); - Client::new(client, aux) + let client = ClientInnerCommon { + inner: client, + aux, + conn_id, + data: ClientDataEnum::Http(ClientDataHttp()), + }; + + Client::new(client) } pub(crate) struct Client { - pub(crate) inner: C, - aux: MetricsAuxInfo, + pub(crate) inner: ClientInnerCommon, } impl Client { - pub(self) fn new(inner: C, aux: MetricsAuxInfo) -> Self { - Self { inner, aux } + pub(self) fn new(inner: ClientInnerCommon) -> Self { + Self { inner } } pub(crate) fn metrics(&self) -> Arc { + let aux = &self.inner.aux; USAGE_METRICS.register(Ids { - endpoint_id: self.aux.endpoint_id, - branch_id: self.aux.branch_id, + endpoint_id: aux.endpoint_id, + branch_id: aux.branch_id, }) } } diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs index 99d4329f88..9abe35db08 100644 --- a/proxy/src/serverless/local_conn_pool.rs +++ b/proxy/src/serverless/local_conn_pool.rs @@ -29,14 +29,14 @@ use tokio_postgres::tls::NoTlsStream; use tokio_postgres::types::ToSql; use tokio_postgres::{AsyncMessage, Socket}; use tokio_util::sync::CancellationToken; -use tracing::{error, info, info_span, warn, Instrument}; +use tracing::{debug, error, info, info_span, warn, Instrument}; use super::backend::HttpConnError; use super::conn_pool_lib::{ Client, ClientDataEnum, ClientInnerCommon, ClientInnerExt, ConnInfo, DbUserConn, EndpointConnPool, }; -use crate::context::RequestMonitoring; +use crate::context::RequestContext; use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::Metrics; @@ -44,6 +44,7 @@ pub(crate) const EXT_NAME: &str = "pg_session_jwt"; pub(crate) const EXT_VERSION: &str = "0.1.2"; pub(crate) const EXT_SCHEMA: &str = "auth"; +#[derive(Clone)] pub(crate) struct ClientDataLocal { session: tokio::sync::watch::Sender, cancel: CancellationToken, @@ -88,7 +89,7 @@ impl LocalConnPool { pub(crate) fn get( 
self: &Arc, - ctx: &RequestMonitoring, + ctx: &RequestContext, conn_info: &ConnInfo, ) -> Result>, HttpConnError> { let client = self @@ -110,7 +111,7 @@ impl LocalConnPool { "pid", tracing::field::display(client.inner.get_process_id()), ); - info!( + debug!( cold_start_info = ColdStartInfo::HttpPoolHit.as_str(), "local_pool: reusing connection '{conn_info}'" ); @@ -159,7 +160,7 @@ impl LocalConnPool { #[allow(clippy::too_many_arguments)] pub(crate) fn poll_client( global_pool: Arc>, - ctx: &RequestMonitoring, + ctx: &RequestContext, conn_info: ConnInfo, client: C, mut connection: tokio_postgres::Connection, diff --git a/proxy/src/serverless/mod.rs b/proxy/src/serverless/mod.rs index cf758855fa..77025f419d 100644 --- a/proxy/src/serverless/mod.rs +++ b/proxy/src/serverless/mod.rs @@ -45,7 +45,7 @@ use utils::http::error::ApiError; use crate::cancellation::CancellationHandlerMain; use crate::config::{ProxyConfig, ProxyProtocolV2}; -use crate::context::RequestMonitoring; +use crate::context::RequestContext; use crate::metrics::Metrics; use crate::protocol2::{read_proxy_protocol, ChainRW, ConnectHeader, ConnectionInfo}; use crate::proxy::run_until_cancelled; @@ -88,7 +88,7 @@ pub async fn task_main( } }); - let http_conn_pool = http_conn_pool::GlobalConnPool::new(&config.http_config); + let http_conn_pool = conn_pool_lib::GlobalConnPool::new(&config.http_config); { let http_conn_pool = Arc::clone(&http_conn_pool); tokio::spawn(async move { @@ -423,7 +423,7 @@ async fn request_handler( if config.http_config.accept_websockets && framed_websockets::upgrade::is_upgrade_request(&request) { - let ctx = RequestMonitoring::new( + let ctx = RequestContext::new( session_id, conn_info, crate::metrics::Protocol::Ws, @@ -458,7 +458,7 @@ async fn request_handler( // Return the response so the spawned future can continue. 
Ok(response.map(|b| b.map_err(|x| match x {}).boxed())) } else if request.uri().path() == "/sql" && *request.method() == Method::POST { - let ctx = RequestMonitoring::new( + let ctx = RequestContext::new( session_id, conn_info, crate::metrics::Protocol::Http, diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index f0975617d4..afd93d02f0 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -8,17 +8,17 @@ use http::header::AUTHORIZATION; use http::Method; use http_body_util::combinators::BoxBody; use http_body_util::{BodyExt, Full}; -use hyper::body::{Body, Incoming}; +use hyper::body::Incoming; use hyper::http::{HeaderName, HeaderValue}; use hyper::{header, HeaderMap, Request, Response, StatusCode}; use pq_proto::StartupMessageParamsBuilder; use serde::Serialize; use serde_json::Value; -use tokio::time; +use tokio::time::{self, Instant}; use tokio_postgres::error::{DbError, ErrorPosition, SqlState}; use tokio_postgres::{GenericClient, IsolationLevel, NoTls, ReadyForQueryStatus, Transaction}; use tokio_util::sync::CancellationToken; -use tracing::{error, info}; +use tracing::{debug, error, info}; use typed_json::json; use url::Url; use urlencoding; @@ -34,8 +34,9 @@ use super::json::{json_to_pg_text, pg_text_row_to_json, JsonConversionError}; use crate::auth::backend::{ComputeCredentialKeys, ComputeUserInfo}; use crate::auth::{endpoint_sni, ComputeUserInfoParseError}; use crate::config::{AuthenticationConfig, HttpConfig, ProxyConfig, TlsConfig}; -use crate::context::RequestMonitoring; +use crate::context::RequestContext; use crate::error::{ErrorKind, ReportableError, UserFacingError}; +use crate::http::{read_body_with_limit, ReadBodyError}; use crate::metrics::{HttpDirection, Metrics}; use crate::proxy::{run_until_cancelled, NeonOptions}; use crate::serverless::backend::HttpConnError; @@ -47,6 +48,7 @@ use crate::usage_metrics::{MetricCounter, MetricCounterRecorder}; struct 
QueryData { query: String, #[serde(deserialize_with = "bytes_to_pg_text")] + #[serde(default)] params: Vec>, #[serde(default)] array_mode: Option, @@ -133,7 +135,7 @@ impl UserFacingError for ConnInfoError { fn get_conn_info( config: &'static AuthenticationConfig, - ctx: &RequestMonitoring, + ctx: &RequestContext, headers: &HeaderMap, tls: Option<&TlsConfig>, ) -> Result { @@ -240,7 +242,7 @@ fn get_conn_info( pub(crate) async fn handle( config: &'static ProxyConfig, - ctx: RequestMonitoring, + ctx: RequestContext, request: Request, backend: Arc, cancel: CancellationToken, @@ -357,8 +359,6 @@ pub(crate) enum SqlOverHttpError { ConnectCompute(#[from] HttpConnError), #[error("{0}")] ConnInfo(#[from] ConnInfoError), - #[error("request is too large (max is {0} bytes)")] - RequestTooLarge(u64), #[error("response is too large (max is {0} bytes)")] ResponseTooLarge(usize), #[error("invalid isolation level")] @@ -377,7 +377,6 @@ impl ReportableError for SqlOverHttpError { SqlOverHttpError::ReadPayload(e) => e.get_error_kind(), SqlOverHttpError::ConnectCompute(e) => e.get_error_kind(), SqlOverHttpError::ConnInfo(e) => e.get_error_kind(), - SqlOverHttpError::RequestTooLarge(_) => ErrorKind::User, SqlOverHttpError::ResponseTooLarge(_) => ErrorKind::User, SqlOverHttpError::InvalidIsolationLevel => ErrorKind::User, SqlOverHttpError::Postgres(p) => p.get_error_kind(), @@ -393,7 +392,6 @@ impl UserFacingError for SqlOverHttpError { SqlOverHttpError::ReadPayload(p) => p.to_string(), SqlOverHttpError::ConnectCompute(c) => c.to_string_client(), SqlOverHttpError::ConnInfo(c) => c.to_string_client(), - SqlOverHttpError::RequestTooLarge(_) => self.to_string(), SqlOverHttpError::ResponseTooLarge(_) => self.to_string(), SqlOverHttpError::InvalidIsolationLevel => self.to_string(), SqlOverHttpError::Postgres(p) => p.to_string(), @@ -406,13 +404,12 @@ impl UserFacingError for SqlOverHttpError { impl HttpCodeError for SqlOverHttpError { fn get_http_status_code(&self) -> StatusCode { match 
self { - SqlOverHttpError::ReadPayload(_) => StatusCode::BAD_REQUEST, + SqlOverHttpError::ReadPayload(e) => e.get_http_status_code(), SqlOverHttpError::ConnectCompute(h) => match h.get_error_kind() { ErrorKind::User => StatusCode::BAD_REQUEST, _ => StatusCode::INTERNAL_SERVER_ERROR, }, SqlOverHttpError::ConnInfo(_) => StatusCode::BAD_REQUEST, - SqlOverHttpError::RequestTooLarge(_) => StatusCode::PAYLOAD_TOO_LARGE, SqlOverHttpError::ResponseTooLarge(_) => StatusCode::INSUFFICIENT_STORAGE, SqlOverHttpError::InvalidIsolationLevel => StatusCode::BAD_REQUEST, SqlOverHttpError::Postgres(_) => StatusCode::BAD_REQUEST, @@ -426,19 +423,41 @@ impl HttpCodeError for SqlOverHttpError { pub(crate) enum ReadPayloadError { #[error("could not read the HTTP request body: {0}")] Read(#[from] hyper::Error), + #[error("request is too large (max is {limit} bytes)")] + BodyTooLarge { limit: usize }, #[error("could not parse the HTTP request body: {0}")] Parse(#[from] serde_json::Error), } +impl From> for ReadPayloadError { + fn from(value: ReadBodyError) -> Self { + match value { + ReadBodyError::BodyTooLarge { limit } => Self::BodyTooLarge { limit }, + ReadBodyError::Read(e) => Self::Read(e), + } + } +} + impl ReportableError for ReadPayloadError { fn get_error_kind(&self) -> ErrorKind { match self { ReadPayloadError::Read(_) => ErrorKind::ClientDisconnect, + ReadPayloadError::BodyTooLarge { .. } => ErrorKind::User, ReadPayloadError::Parse(_) => ErrorKind::User, } } } +impl HttpCodeError for ReadPayloadError { + fn get_http_status_code(&self) -> StatusCode { + match self { + ReadPayloadError::Read(_) => StatusCode::BAD_REQUEST, + ReadPayloadError::BodyTooLarge { .. 
} => StatusCode::PAYLOAD_TOO_LARGE, + ReadPayloadError::Parse(_) => StatusCode::BAD_REQUEST, + } + } +} + #[derive(Debug, thiserror::Error)] pub(crate) enum SqlOverHttpCancel { #[error("query was cancelled")] @@ -516,7 +535,7 @@ fn map_isolation_level_to_headers(level: IsolationLevel) -> Option async fn handle_inner( cancel: CancellationToken, config: &'static ProxyConfig, - ctx: &RequestMonitoring, + ctx: &RequestContext, request: Request, backend: Arc, ) -> Result>, SqlOverHttpError> { @@ -562,7 +581,7 @@ async fn handle_inner( async fn handle_db_inner( cancel: CancellationToken, config: &'static ProxyConfig, - ctx: &RequestMonitoring, + ctx: &RequestContext, request: Request, conn_info: ConnInfo, auth: AuthData, @@ -580,28 +599,20 @@ async fn handle_db_inner( let parsed_headers = HttpHeaders::try_parse(headers)?; - let request_content_length = match request.body().size_hint().upper() { - Some(v) => v, - None => config.http_config.max_request_size_bytes + 1, - }; - info!(request_content_length, "request size in bytes"); - Metrics::get() - .proxy - .http_conn_content_length_bytes - .observe(HttpDirection::Request, request_content_length as f64); - - // we don't have a streaming request support yet so this is to prevent OOM - // from a malicious user sending an extremely large request body - if request_content_length > config.http_config.max_request_size_bytes { - return Err(SqlOverHttpError::RequestTooLarge( - config.http_config.max_request_size_bytes, - )); - } - let fetch_and_process_request = Box::pin( async { - let body = request.into_body().collect().await?.to_bytes(); - info!(length = body.len(), "request payload read"); + let body = read_body_with_limit( + request.into_body(), + config.http_config.max_request_size_bytes, + ) + .await?; + + Metrics::get() + .proxy + .http_conn_content_length_bytes + .observe(HttpDirection::Request, body.len() as f64); + + debug!(length = body.len(), "request payload read"); let payload: Payload = 
serde_json::from_slice(&body)?; Ok::(payload) // Adjust error type accordingly } @@ -733,7 +744,7 @@ pub(crate) fn uuid_to_header_value(id: Uuid) -> HeaderValue { } async fn handle_auth_broker_inner( - ctx: &RequestMonitoring, + ctx: &RequestContext, request: Request, conn_info: ConnInfo, jwt: String, @@ -768,6 +779,7 @@ async fn handle_auth_broker_inner( let _metrics = client.metrics(); Ok(client + .inner .inner .send_request(req) .await @@ -968,10 +980,11 @@ async fn query_to_json( current_size: &mut usize, parsed_headers: HttpHeaders, ) -> Result<(ReadyForQueryStatus, impl Serialize), SqlOverHttpError> { - info!("executing query"); + let query_start = Instant::now(); + let query_params = data.params; let mut row_stream = std::pin::pin!(client.query_raw_txt(&data.query, query_params).await?); - info!("finished executing query"); + let query_acknowledged = Instant::now(); // Manually drain the stream into a vector to leave row_stream hanging // around to get a command tag. Also check that the response is not too @@ -990,6 +1003,7 @@ async fn query_to_json( } } + let query_resp_end = Instant::now(); let ready = row_stream.ready_status(); // grab the command tag and number of rows affected @@ -1009,7 +1023,9 @@ async fn query_to_json( rows = rows.len(), ?ready, command_tag, - "finished reading rows" + acknowledgement = ?(query_acknowledged - query_start), + response = ?(query_resp_end - query_start), + "finished executing query" ); let columns_len = row_stream.columns().len(); @@ -1095,3 +1111,63 @@ impl Discard<'_> { } } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_payload() { + let payload = "{\"query\":\"SELECT * FROM users WHERE name = ?\",\"params\":[\"test\"],\"arrayMode\":true}"; + let deserialized_payload: Payload = serde_json::from_str(payload).unwrap(); + + match deserialized_payload { + Payload::Single(QueryData { + query, + params, + array_mode, + }) => { + assert_eq!(query, "SELECT * FROM users WHERE name = ?"); + 
assert_eq!(params, vec![Some(String::from("test"))]); + assert!(array_mode.unwrap()); + } + Payload::Batch(_) => { + panic!("deserialization failed: case with single query, one param, and array mode") + } + } + + let payload = "{\"queries\":[{\"query\":\"SELECT * FROM users0 WHERE name = ?\",\"params\":[\"test0\"], \"arrayMode\":false},{\"query\":\"SELECT * FROM users1 WHERE name = ?\",\"params\":[\"test1\"],\"arrayMode\":true}]}"; + let deserialized_payload: Payload = serde_json::from_str(payload).unwrap(); + + match deserialized_payload { + Payload::Batch(BatchQueryData { queries }) => { + assert_eq!(queries.len(), 2); + for (i, query) in queries.into_iter().enumerate() { + assert_eq!( + query.query, + format!("SELECT * FROM users{i} WHERE name = ?") + ); + assert_eq!(query.params, vec![Some(format!("test{i}"))]); + assert_eq!(query.array_mode.unwrap(), i > 0); + } + } + Payload::Single(_) => panic!("deserialization failed: case with multiple queries"), + } + + let payload = "{\"query\":\"SELECT 1\"}"; + let deserialized_payload: Payload = serde_json::from_str(payload).unwrap(); + + match deserialized_payload { + Payload::Single(QueryData { + query, + params, + array_mode, + }) => { + assert_eq!(query, "SELECT 1"); + assert_eq!(params, vec![]); + assert!(array_mode.is_none()); + } + Payload::Batch(_) => panic!("deserialization failed: case with only one query"), + } + } +} diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index ba36116c2c..4088fea835 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -14,7 +14,7 @@ use tracing::warn; use crate::cancellation::CancellationHandlerMain; use crate::config::ProxyConfig; -use crate::context::RequestMonitoring; +use crate::context::RequestContext; use crate::error::{io_error, ReportableError}; use crate::metrics::Metrics; use crate::proxy::{handle_client, ClientMode, ErrorSource}; @@ -126,7 +126,7 @@ impl AsyncBufRead for WebSocketRw { pub(crate) 
async fn serve_websocket( config: &'static ProxyConfig, auth_backend: &'static crate::auth::Backend<'static, ()>, - ctx: RequestMonitoring, + ctx: RequestContext, websocket: OnUpgrade, cancellation_handler: Arc, endpoint_rate_limiter: Arc, diff --git a/proxy/src/signals.rs b/proxy/src/signals.rs index 514a83d5eb..0b675683c0 100644 --- a/proxy/src/signals.rs +++ b/proxy/src/signals.rs @@ -2,7 +2,7 @@ use std::convert::Infallible; use anyhow::bail; use tokio_util::sync::CancellationToken; -use tracing::warn; +use tracing::{info, warn}; /// Handle unix signals appropriately. pub async fn handle( @@ -22,7 +22,7 @@ where tokio::select! { // Hangup is commonly used for config reload. _ = hangup.recv() => { - warn!("received SIGHUP"); + info!("received SIGHUP"); refresh_config(); } // Shut down the whole application. diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs index 89df48c5d3..11f426819d 100644 --- a/proxy/src/stream.rs +++ b/proxy/src/stream.rs @@ -133,6 +133,7 @@ impl PqStream { msg: &'static str, error_kind: ErrorKind, ) -> Result { + // TODO: only log this for actually interesting errors tracing::info!( kind = error_kind.to_metric_label(), msg, diff --git a/pyproject.toml b/pyproject.toml index 9ea42bf46f..ccd3ab1864 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ authors = [] package-mode = false [tool.poetry.dependencies] -python = "^3.9" +python = "^3.11" pytest = "^7.4.4" psycopg2-binary = "^2.9.10" typing-extensions = "^4.6.1" @@ -32,7 +32,7 @@ psutil = "^5.9.4" types-psutil = "^5.9.5.12" types-toml = "^0.10.8.6" pytest-httpserver = "^1.0.8" -aiohttp = "3.10.2" +aiohttp = "3.10.11" pytest-rerunfailures = "^13.0" types-pytest-lazy-fixture = "^0.6.3.3" pytest-split = "^0.8.1" @@ -51,7 +51,7 @@ testcontainers = "^4.8.1" jsonnet = "^0.20.0" [tool.poetry.group.dev.dependencies] -mypy = "==1.3.0" +mypy = "==1.13.0" ruff = "^0.7.0" [build-system] @@ -89,7 +89,7 @@ module = [ ignore_missing_imports = true [tool.ruff] -target-version = "py39" 
+target-version = "py311" extend-exclude = [ "vendor/", "target/", @@ -108,6 +108,3 @@ select = [ "B", # bugbear "UP", # pyupgrade ] - -[tool.ruff.lint.pyupgrade] -keep-runtime-typing = true # Remove this stanza when we require Python 3.10 diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index 85561e4aff..ab77b63d54 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -30,6 +30,7 @@ once_cell.workspace = true parking_lot.workspace = true postgres.workspace = true postgres-protocol.workspace = true +pprof.workspace = true rand.workspace = true regex.workspace = true scopeguard.workspace = true diff --git a/safekeeper/benches/README.md b/safekeeper/benches/README.md index 4119cc8d6e..d73fbccf05 100644 --- a/safekeeper/benches/README.md +++ b/safekeeper/benches/README.md @@ -14,6 +14,10 @@ cargo bench --package safekeeper --bench receive_wal process_msg/fsync=false # List available benchmarks. cargo bench --package safekeeper --benches -- --list + +# Generate flamegraph profiles using pprof-rs, profiling for 10 seconds. +# Output in target/criterion/*/profile/flamegraph.svg. +cargo bench --package safekeeper --bench receive_wal process_msg/fsync=false --profile-time 10 ``` Additional charts and statistics are available in `target/criterion/report/index.html`. 
diff --git a/safekeeper/benches/receive_wal.rs b/safekeeper/benches/receive_wal.rs index e32d7526ca..c637b4fb24 100644 --- a/safekeeper/benches/receive_wal.rs +++ b/safekeeper/benches/receive_wal.rs @@ -10,6 +10,7 @@ use camino_tempfile::tempfile; use criterion::{criterion_group, criterion_main, BatchSize, Bencher, Criterion}; use itertools::Itertools as _; use postgres_ffi::v17::wal_generator::{LogicalMessageGenerator, WalGenerator}; +use pprof::criterion::{Output, PProfProfiler}; use safekeeper::receive_wal::{self, WalAcceptor}; use safekeeper::safekeeper::{ AcceptorProposerMessage, AppendRequest, AppendRequestHeader, ProposerAcceptorMessage, @@ -24,8 +25,9 @@ const GB: usize = 1024 * MB; // Register benchmarks with Criterion. criterion_group!( - benches, - bench_process_msg, + name = benches; + config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None))); + targets = bench_process_msg, bench_wal_acceptor, bench_wal_acceptor_throughput, bench_file_write diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index df68f8a68e..28294abdb9 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -1,7 +1,6 @@ -use hyper::{Body, Request, Response, StatusCode, Uri}; -use once_cell::sync::Lazy; +use hyper::{Body, Request, Response, StatusCode}; use serde::{Deserialize, Serialize}; -use std::collections::{HashMap, HashSet}; +use std::collections::HashMap; use std::fmt; use std::io::Write as _; use std::str::FromStr; @@ -14,7 +13,9 @@ use tokio_stream::wrappers::ReceiverStream; use tokio_util::sync::CancellationToken; use tracing::{info_span, Instrument}; use utils::failpoint_support::failpoints_handler; -use utils::http::endpoint::{prometheus_metrics_handler, request_span, ChannelWriter}; +use utils::http::endpoint::{ + profile_cpu_handler, prometheus_metrics_handler, request_span, ChannelWriter, +}; use utils::http::request::parse_query_param; use postgres_ffi::WAL_SEGMENT_SIZE; @@ -572,14 
+573,8 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder let mut router = endpoint::make_router(); if conf.http_auth.is_some() { router = router.middleware(auth_middleware(|request| { - #[allow(clippy::mutable_key_type)] - static ALLOWLIST_ROUTES: Lazy> = Lazy::new(|| { - ["/v1/status", "/metrics"] - .iter() - .map(|v| v.parse().unwrap()) - .collect() - }); - if ALLOWLIST_ROUTES.contains(request.uri()) { + const ALLOWLIST_ROUTES: &[&str] = &["/v1/status", "/metrics", "/profile/cpu"]; + if ALLOWLIST_ROUTES.contains(&request.uri().path()) { None } else { // Option> is always provided as data below, hence unwrap(). @@ -598,6 +593,7 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder .data(Arc::new(conf)) .data(auth) .get("/metrics", |r| request_span(r, prometheus_metrics_handler)) + .get("/profile/cpu", |r| request_span(r, profile_cpu_handler)) .get("/v1/status", |r| request_span(r, status_handler)) .put("/v1/failpoints", |r| { request_span(r, move |r| async { diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index 2edcc4ef6f..bfa1764abf 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -239,6 +239,10 @@ impl SafekeeperPostgresHandler { pgb: &mut PostgresBackend, tli: &mut Option, ) -> Result<(), CopyStreamHandlerEnd> { + // The `tli` parameter is only used for passing _out_ a timeline, one should + // not have been passed in. + assert!(tli.is_none()); + // Notify the libpq client that it's allowed to send `CopyData` messages pgb.write_message(&BeMessage::CopyBothResponse).await?; @@ -256,6 +260,7 @@ impl SafekeeperPostgresHandler { // sends, so this avoids deadlocks. 
let mut pgb_reader = pgb.split().context("START_WAL_PUSH split")?; let peer_addr = *pgb.get_peer_addr(); + let mut network_reader = NetworkReader { ttid: self.ttid, conn_id: self.conn_id, @@ -275,10 +280,14 @@ impl SafekeeperPostgresHandler { .subscribe(); *tli = Some(timeline.wal_residence_guard().await?); + let timeline_cancel = timeline.cancel.clone(); tokio::select! { // todo: add read|write .context to these errors r = network_reader.run(msg_tx, msg_rx, reply_tx, timeline, next_msg) => r, r = network_write(pgb, reply_rx, pageserver_feedback_rx) => r, + _ = timeline_cancel.cancelled() => { + return Err(CopyStreamHandlerEnd::Cancelled); + } } } else { res.map(|_| ()) @@ -303,7 +312,7 @@ impl SafekeeperPostgresHandler { // Otherwise, WalAcceptor thread must have errored. match wal_acceptor_res { - Ok(Ok(_)) => Ok(()), // can't happen currently; would be if we add graceful termination + Ok(Ok(_)) => Ok(()), // Clean shutdown Ok(Err(e)) => Err(CopyStreamHandlerEnd::Other(e.context("WAL acceptor"))), Err(_) => Err(CopyStreamHandlerEnd::Other(anyhow!( "WalAcceptor task panicked", @@ -356,6 +365,7 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> { Ok((tli, next_msg)) } + /// This function is cancellation-safe (only does network I/O and channel read/writes). async fn run( self, msg_tx: Sender, @@ -397,6 +407,7 @@ async fn read_network_loop( loop { let started = Instant::now(); let size = next_msg.size(); + match msg_tx.send_timeout(next_msg, SLOW_THRESHOLD).await { Ok(()) => {} // Slow send, log a message and keep trying. Log context has timeline ID. @@ -428,6 +439,8 @@ async fn read_network_loop( /// Read replies from WalAcceptor and pass them back to socket. Returns Ok(()) /// if reply_rx closed; it must mean WalAcceptor terminated, joining it should /// tell the error. +/// +/// This function is cancellation-safe (only does network I/O and channel read/writes). 
async fn network_write( pgb_writer: &mut PostgresBackend, mut reply_rx: Receiver, @@ -461,7 +474,7 @@ async fn network_write( Some(AcceptorProposerMessage::AppendResponse(append_response)) } _ => None, - } + }, }; let Some(msg) = msg else { @@ -527,6 +540,10 @@ impl WalAcceptor { /// The main loop. Returns Ok(()) if either msg_rx or reply_tx got closed; /// it must mean that network thread terminated. + /// + /// This function is *not* cancellation safe, it does local disk I/O: it should always + /// be allowed to run to completion. It respects Timeline::cancel and shuts down cleanly + /// when that gets triggered. async fn run(&mut self) -> anyhow::Result<()> { let walreceiver_guard = self.tli.get_walreceivers().register(self.conn_id); @@ -541,7 +558,7 @@ impl WalAcceptor { // Tracks whether we have unflushed appends. let mut dirty = false; - loop { + while !self.tli.is_cancelled() { let reply = tokio::select! { // Process inbound message. msg = self.msg_rx.recv() => { @@ -599,6 +616,10 @@ impl WalAcceptor { WAL_RECEIVER_QUEUE_DEPTH.observe(self.msg_rx.len() as f64); None // no reply } + + _ = self.tli.cancel.cancelled() => { + break; + } }; // Send reply, if any. @@ -610,7 +631,7 @@ impl WalAcceptor { } // Flush WAL on disconnect, see https://github.com/neondatabase/neon/issues/9259. - if dirty { + if dirty && !self.tli.cancel.is_cancelled() { self.tli .process_msg(&ProposerAcceptorMessage::FlushWAL) .await?; diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 6d94ff98b1..aa65ec851b 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -456,6 +456,8 @@ impl SafekeeperPostgresHandler { // not synchronized with sends, so this avoids deadlocks. 
let reader = pgb.split().context("START_REPLICATION split")?; + let tli_cancel = tli.cancel.clone(); + let mut sender = WalSender { pgb, // should succeed since we're already holding another guard @@ -479,6 +481,9 @@ impl SafekeeperPostgresHandler { // todo: add read|write .context to these errors r = sender.run() => r, r = reply_reader.run() => r, + _ = tli_cancel.cancelled() => { + return Err(CopyStreamHandlerEnd::Cancelled); + } }; let ws_state = ws_guard @@ -557,6 +562,7 @@ impl WalSender<'_, IO> { /// Send WAL until /// - an error occurs /// - receiver is caughtup and there is no computes (if streaming up to commit_lsn) + /// - timeline's cancellation token fires /// /// Err(CopyStreamHandlerEnd) is always returned; Result is used only for ? /// convenience. @@ -601,15 +607,14 @@ impl WalSender<'_, IO> { }; let send_buf = &send_buf[..send_size]; - // and send it - self.pgb - .write_message(&BeMessage::XLogData(XLogDataBody { - wal_start: self.start_pos.0, - wal_end: self.end_pos.0, - timestamp: get_current_timestamp(), - data: send_buf, - })) - .await?; + // and send it, while respecting Timeline::cancel + let msg = BeMessage::XLogData(XLogDataBody { + wal_start: self.start_pos.0, + wal_end: self.end_pos.0, + timestamp: get_current_timestamp(), + data: send_buf, + }); + self.pgb.write_message(&msg).await?; if let Some(appname) = &self.appname { if appname == "replica" { @@ -674,13 +679,13 @@ impl WalSender<'_, IO> { } } - self.pgb - .write_message(&BeMessage::KeepAlive(WalSndKeepAlive { - wal_end: self.end_pos.0, - timestamp: get_current_timestamp(), - request_reply: true, - })) - .await?; + let msg = BeMessage::KeepAlive(WalSndKeepAlive { + wal_end: self.end_pos.0, + timestamp: get_current_timestamp(), + request_reply: true, + }); + + self.pgb.write_message(&msg).await?; } } diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 85add6bfea..ef928f7633 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -9,6 +9,7 @@ 
use serde::{Deserialize, Serialize}; use tokio::fs::{self}; use tokio_util::sync::CancellationToken; use utils::id::TenantId; +use utils::sync::gate::Gate; use std::cmp::max; use std::ops::{Deref, DerefMut}; @@ -467,6 +468,10 @@ pub struct Timeline { timeline_dir: Utf8PathBuf, manager_ctl: ManagerCtl, + /// Hold this gate from code that depends on the Timeline's non-shut-down state. While holding + /// this gate, you must respect [`Timeline::cancel`] + pub(crate) gate: Gate, + /// Delete/cancel will trigger this, background tasks should drop out as soon as it fires pub(crate) cancel: CancellationToken, @@ -508,6 +513,7 @@ impl Timeline { mutex: RwLock::new(shared_state), walsenders: WalSenders::new(walreceivers.clone()), walreceivers, + gate: Default::default(), cancel: CancellationToken::default(), manager_ctl: ManagerCtl::new(), broker_active: AtomicBool::new(false), @@ -533,56 +539,6 @@ impl Timeline { )) } - /// Initialize fresh timeline on disk and start background tasks. If init - /// fails, timeline is cancelled and cannot be used anymore. - /// - /// Init is transactional, so if it fails, created files will be deleted, - /// and state on disk should remain unchanged. - pub async fn init_new( - self: &Arc, - shared_state: &mut WriteGuardSharedState<'_>, - conf: &SafeKeeperConf, - broker_active_set: Arc, - partial_backup_rate_limiter: RateLimiter, - ) -> Result<()> { - match fs::metadata(&self.timeline_dir).await { - Ok(_) => { - // Timeline directory exists on disk, we should leave state unchanged - // and return error. - bail!(TimelineError::Invalid(self.ttid)); - } - Err(e) if e.kind() == std::io::ErrorKind::NotFound => {} - Err(e) => { - return Err(e.into()); - } - } - - // Create timeline directory. - fs::create_dir_all(&self.timeline_dir).await?; - - // Write timeline to disk and start background tasks. - if let Err(e) = shared_state.sk.state_mut().flush().await { - // Bootstrap failed, cancel timeline and remove timeline directory. 
- self.cancel(shared_state); - - if let Err(fs_err) = fs::remove_dir_all(&self.timeline_dir).await { - warn!( - "failed to remove timeline {} directory after bootstrap failure: {}", - self.ttid, fs_err - ); - } - - return Err(e); - } - self.bootstrap( - shared_state, - conf, - broker_active_set, - partial_backup_rate_limiter, - ); - Ok(()) - } - /// Bootstrap new or existing timeline starting background tasks. pub fn bootstrap( self: &Arc, @@ -593,33 +549,61 @@ impl Timeline { ) { let (tx, rx) = self.manager_ctl.bootstrap_manager(); + let Ok(gate_guard) = self.gate.enter() else { + // Init raced with shutdown + return; + }; + // Start manager task which will monitor timeline state and update // background tasks. - tokio::spawn(timeline_manager::main_task( - ManagerTimeline { tli: self.clone() }, - conf.clone(), - broker_active_set, - tx, - rx, - partial_backup_rate_limiter, - )); + tokio::spawn({ + let this = self.clone(); + let conf = conf.clone(); + async move { + let _gate_guard = gate_guard; + timeline_manager::main_task( + ManagerTimeline { tli: this }, + conf, + broker_active_set, + tx, + rx, + partial_backup_rate_limiter, + ) + .await + } + }); + } + + /// Background timeline activities (which hold Timeline::gate) will no + /// longer run once this function completes. + pub async fn shutdown(&self) { + info!("timeline {} shutting down", self.ttid); + self.cancel.cancel(); + + // Wait for any concurrent tasks to stop using this timeline, to avoid e.g. attempts + // to read deleted files. + self.gate.close().await; } /// Delete timeline from disk completely, by removing timeline directory. - /// Background timeline activities will stop eventually. /// /// Also deletes WAL in s3. Might fail if e.g. s3 is unavailable, but /// deletion API endpoint is retriable. + /// + /// Timeline must be in shut-down state (i.e. 
call [`Self::shutdown`] first) pub async fn delete( &self, shared_state: &mut WriteGuardSharedState<'_>, only_local: bool, ) -> Result { - self.cancel(shared_state); + // Assert that [`Self::shutdown`] was already called + assert!(self.cancel.is_cancelled()); + assert!(self.gate.close_complete()); + + // Close associated FDs. Nobody will be able to touch timeline data once + // it is cancelled, so WAL storage won't be opened again. + shared_state.sk.close_wal_store(); - // TODO: It's better to wait for s3 offloader termination before - // removing data from s3. Though since s3 doesn't have transactions it - // still wouldn't guarantee absense of data after removal. let conf = GlobalTimelines::get_global_config(); if !only_local && conf.is_wal_backup_enabled() { // Note: we concurrently delete remote storage data from multiple @@ -631,16 +615,6 @@ impl Timeline { Ok(dir_existed) } - /// Cancel timeline to prevent further usage. Background tasks will stop - /// eventually after receiving cancellation signal. - fn cancel(&self, shared_state: &mut WriteGuardSharedState<'_>) { - info!("timeline {} is cancelled", self.ttid); - self.cancel.cancel(); - // Close associated FDs. Nobody will be able to touch timeline data once - // it is cancelled, so WAL storage won't be opened again. - shared_state.sk.close_wal_store(); - } - /// Returns if timeline is cancelled. 
pub fn is_cancelled(&self) -> bool { self.cancel.is_cancelled() diff --git a/safekeeper/src/timeline_guard.rs b/safekeeper/src/timeline_guard.rs index 1ddac573d2..9102a40df8 100644 --- a/safekeeper/src/timeline_guard.rs +++ b/safekeeper/src/timeline_guard.rs @@ -7,6 +7,7 @@ use std::collections::HashSet; use tracing::debug; +use utils::sync::gate::GateGuard; use crate::timeline_manager::ManagerCtlMessage; @@ -16,6 +17,12 @@ pub struct GuardId(u64); pub struct ResidenceGuard { manager_tx: tokio::sync::mpsc::UnboundedSender, guard_id: GuardId, + + /// [`ResidenceGuard`] represents a guarantee that a timeline's data remains resident, + /// which by extension also means the timeline is not shut down (since after shut down + /// our data may be deleted). Therefore everyone holding a residence guard must also + /// hold a guard on [`crate::timeline::Timeline::gate`] + _gate_guard: GateGuard, } impl Drop for ResidenceGuard { @@ -52,7 +59,8 @@ impl AccessService { self.guards.is_empty() } - pub(crate) fn create_guard(&mut self) -> ResidenceGuard { + /// `timeline_gate_guard` is a guarantee that the timeline is not shut down + pub(crate) fn create_guard(&mut self, timeline_gate_guard: GateGuard) -> ResidenceGuard { let guard_id = self.next_guard_id; self.next_guard_id += 1; self.guards.insert(guard_id); @@ -63,6 +71,7 @@ impl AccessService { ResidenceGuard { manager_tx: self.manager_tx.clone(), guard_id, + _gate_guard: timeline_gate_guard, } } diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs index e9fed21bf5..c02fb904cf 100644 --- a/safekeeper/src/timeline_manager.rs +++ b/safekeeper/src/timeline_manager.rs @@ -266,8 +266,10 @@ pub async fn main_task( // Start recovery task which always runs on the timeline. 
if !mgr.is_offloaded && mgr.conf.peer_recovery_enabled { - let tli = mgr.wal_resident_timeline(); - mgr.recovery_task = Some(tokio::spawn(recovery_main(tli, mgr.conf.clone()))); + // Recovery task is only spawned if we can get a residence guard (i.e. timeline is not already shutting down) + if let Ok(tli) = mgr.wal_resident_timeline() { + mgr.recovery_task = Some(tokio::spawn(recovery_main(tli, mgr.conf.clone()))); + } } // If timeline is evicted, reflect that in the metric. @@ -375,6 +377,13 @@ pub async fn main_task( // shutdown background tasks if mgr.conf.is_wal_backup_enabled() { + if let Some(backup_task) = mgr.backup_task.take() { + // If we fell through here, then the timeline is shutting down. This is important + // because otherwise joining on the wal_backup handle might hang. + assert!(mgr.tli.cancel.is_cancelled()); + + backup_task.join().await; + } wal_backup::update_task(&mut mgr, false, &last_state).await; } @@ -442,10 +451,18 @@ impl Manager { /// Get a WalResidentTimeline. /// Manager code must use this function instead of one from `Timeline` /// directly, because it will deadlock. - pub(crate) fn wal_resident_timeline(&mut self) -> WalResidentTimeline { + /// + /// This function is fallible because the guard may not be created if the timeline is + /// shutting down. + pub(crate) fn wal_resident_timeline(&mut self) -> anyhow::Result { assert!(!self.is_offloaded); - let guard = self.access_service.create_guard(); - WalResidentTimeline::new(self.tli.clone(), guard) + let guard = self.access_service.create_guard( + self.tli + .gate + .enter() + .map_err(|_| anyhow::anyhow!("Timeline shutting down"))?, + ); + Ok(WalResidentTimeline::new(self.tli.clone(), guard)) } /// Get a snapshot of the timeline state. 
@@ -559,6 +576,11 @@ impl Manager { if removal_horizon_segno > self.last_removed_segno { // we need to remove WAL + let Ok(timeline_gate_guard) = self.tli.gate.enter() else { + tracing::info!("Timeline shutdown, not spawning WAL removal task"); + return; + }; + let remover = match self.tli.read_shared_state().await.sk { StateSK::Loaded(ref sk) => { crate::wal_storage::Storage::remove_up_to(&sk.wal_store, removal_horizon_segno) @@ -573,6 +595,8 @@ impl Manager { self.wal_removal_task = Some(tokio::spawn( async move { + let _timeline_gate_guard = timeline_gate_guard; + remover.await?; Ok(removal_horizon_segno) } @@ -619,10 +643,15 @@ impl Manager { return; } + let Ok(resident) = self.wal_resident_timeline() else { + // Shutting down + return; + }; + // Get WalResidentTimeline and start partial backup task. let cancel = CancellationToken::new(); let handle = tokio::spawn(wal_backup_partial::main_task( - self.wal_resident_timeline(), + resident, self.conf.clone(), self.global_rate_limiter.clone(), cancel.clone(), @@ -664,7 +693,7 @@ impl Manager { self.partial_backup_task = None; } - let tli = self.wal_resident_timeline(); + let tli = self.wal_resident_timeline()?; let mut partial_backup = PartialBackup::new(tli, self.conf.clone()).await; // Reset might fail e.g. when cfile is already reset but s3 removal // failed, so set manager state to None beforehand. 
In any case caller @@ -688,7 +717,12 @@ impl Manager { let guard = if self.is_offloaded { Err(anyhow::anyhow!("timeline is offloaded, can't get a guard")) } else { - Ok(self.access_service.create_guard()) + match self.tli.gate.enter() { + Ok(gate_guard) => Ok(self.access_service.create_guard(gate_guard)), + Err(_) => Err(anyhow::anyhow!( + "timeline is shutting down, can't get a guard" + )), + } }; if tx.send(guard).is_err() { @@ -699,7 +733,10 @@ impl Manager { let result = if self.is_offloaded { None } else { - Some(self.access_service.create_guard()) + match self.tli.gate.enter() { + Ok(gate_guard) => Some(self.access_service.create_guard(gate_guard)), + Err(_) => None, + } }; if tx.send(result).is_err() { diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs index 33d94da034..067945fd5f 100644 --- a/safekeeper/src/timelines_global_map.rs +++ b/safekeeper/src/timelines_global_map.rs @@ -457,10 +457,12 @@ impl GlobalTimelines { Ok(timeline) => { let was_active = timeline.broker_active.load(Ordering::Relaxed); + info!("deleting timeline {}, only_local={}", ttid, only_local); + timeline.shutdown().await; + // Take a lock and finish the deletion holding this mutex. 
let mut shared_state = timeline.write_shared_state().await; - info!("deleting timeline {}, only_local={}", ttid, only_local); let dir_existed = timeline.delete(&mut shared_state, only_local).await?; Ok(TimelineDeleteForceResult { diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 6c87e5a926..34b5dbeaa1 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -25,7 +25,6 @@ use tokio::fs::File; use tokio::select; use tokio::sync::mpsc::{self, Receiver, Sender}; use tokio::sync::{watch, OnceCell}; -use tokio::time::sleep; use tracing::*; use utils::{id::TenantTimelineId, lsn::Lsn}; @@ -46,6 +45,14 @@ pub struct WalBackupTaskHandle { handle: JoinHandle<()>, } +impl WalBackupTaskHandle { + pub(crate) async fn join(self) { + if let Err(e) = self.handle.await { + error!("WAL backup task panicked: {}", e); + } + } +} + /// Do we have anything to upload to S3, i.e. should safekeepers run backup activity? pub(crate) fn is_wal_backup_required( wal_seg_size: usize, @@ -74,11 +81,12 @@ pub(crate) async fn update_task(mgr: &mut Manager, need_backup: bool, state: &St let (shutdown_tx, shutdown_rx) = mpsc::channel(1); - let async_task = backup_task_main( - mgr.wal_resident_timeline(), - mgr.conf.backup_parallel_jobs, - shutdown_rx, - ); + let Ok(resident) = mgr.wal_resident_timeline() else { + info!("Timeline shut down"); + return; + }; + + let async_task = backup_task_main(resident, mgr.conf.backup_parallel_jobs, shutdown_rx); let handle = if mgr.conf.current_thread_runtime { tokio::spawn(async_task) @@ -108,9 +116,7 @@ async fn shut_down_task(entry: &mut Option) { // Tell the task to shutdown. Error means task exited earlier, that's ok. let _ = wb_handle.shutdown_tx.send(()).await; // Await the task itself. TODO: restart panicked tasks earlier. 
- if let Err(e) = wb_handle.handle.await { - warn!("WAL backup task panicked: {}", e); - } + wb_handle.join().await; } } @@ -214,6 +220,7 @@ async fn backup_task_main( let _guard = WAL_BACKUP_TASKS.guard(); info!("started"); + let cancel = tli.tli.cancel.clone(); let mut wb = WalBackupTask { wal_seg_size: tli.get_wal_seg_size().await, commit_lsn_watch_rx: tli.get_commit_lsn_watch_rx(), @@ -230,25 +237,34 @@ async fn backup_task_main( _ = wb.run() => {} _ = shutdown_rx.recv() => { canceled = true; + }, + _ = cancel.cancelled() => { + canceled = true; } } info!("task {}", if canceled { "canceled" } else { "terminated" }); } impl WalBackupTask { + /// This function must be called from a select! that also respects self.timeline's + /// cancellation token. This is done in [`backup_task_main`]. + /// + /// The future returned by this function is safe to drop at any time because it + /// does not write to local disk. async fn run(&mut self) { let mut backup_lsn = Lsn(0); let mut retry_attempt = 0u32; // offload loop - loop { + while !self.timeline.cancel.is_cancelled() { if retry_attempt == 0 { // wait for new WAL to arrive if let Err(e) = self.commit_lsn_watch_rx.changed().await { - // should never happen, as we hold Arc to timeline. 
+ // should never happen, as we hold Arc to timeline and transmitter's lifetime + // is within Timeline's error!("commit_lsn watch shut down: {:?}", e); return; - } + }; } else { // or just sleep if we errored previously let mut retry_delay = UPLOAD_FAILURE_RETRY_MAX_MS; @@ -256,7 +272,7 @@ impl WalBackupTask { { retry_delay = min(retry_delay, backoff_delay); } - sleep(Duration::from_millis(retry_delay)).await; + tokio::time::sleep(Duration::from_millis(retry_delay)).await; } let commit_lsn = *self.commit_lsn_watch_rx.borrow(); diff --git a/scripts/flaky_tests.py b/scripts/flaky_tests.py index 9312f8b3e7..3fb668ed2d 100755 --- a/scripts/flaky_tests.py +++ b/scripts/flaky_tests.py @@ -14,7 +14,7 @@ import psycopg2.extras import toml if TYPE_CHECKING: - from typing import Any, Optional + from typing import Any FLAKY_TESTS_QUERY = """ SELECT @@ -65,7 +65,7 @@ def main(args: argparse.Namespace): pageserver_virtual_file_io_engine_parameter = "" # re-use existing records of flaky tests from before parametrization by compaction_algorithm - def get_pageserver_default_tenant_config_compaction_algorithm() -> Optional[dict[str, Any]]: + def get_pageserver_default_tenant_config_compaction_algorithm() -> dict[str, Any] | None: """Duplicated from parametrize.py""" toml_table = os.getenv("PAGESERVER_DEFAULT_TENANT_CONFIG_COMPACTION_ALGORITHM") if toml_table is None: diff --git a/scripts/force_layer_download.py b/scripts/force_layer_download.py index a4fd3f6132..835e28c5d6 100644 --- a/scripts/force_layer_download.py +++ b/scripts/force_layer_download.py @@ -194,9 +194,11 @@ async def main_impl(args, report_out, client: Client): tenant_ids = await client.get_tenant_ids() get_timeline_id_coros = [client.get_timeline_ids(tenant_id) for tenant_id in tenant_ids] gathered = await asyncio.gather(*get_timeline_id_coros, return_exceptions=True) - assert len(tenant_ids) == len(gathered) tenant_and_timline_ids = [] - for tid, tlids in zip(tenant_ids, gathered): + for tid, tlids in 
zip(tenant_ids, gathered, strict=True): + # TODO: add error handling if tlids isinstance(Exception) + assert isinstance(tlids, list) + for tlid in tlids: tenant_and_timline_ids.append((tid, tlid)) elif len(comps) == 1: diff --git a/scripts/ingest_perf_test_result.py b/scripts/ingest_perf_test_result.py index 40071c01b0..804f8a3cde 100644 --- a/scripts/ingest_perf_test_result.py +++ b/scripts/ingest_perf_test_result.py @@ -25,7 +25,8 @@ CREATE TABLE IF NOT EXISTS perf_test_results ( metric_value NUMERIC, metric_unit VARCHAR(10), metric_report_type TEXT, - recorded_at_timestamp TIMESTAMP WITH TIME ZONE DEFAULT NOW() + recorded_at_timestamp TIMESTAMP WITH TIME ZONE DEFAULT NOW(), + labels JSONB with default '{}' ) """ @@ -91,6 +92,7 @@ def ingest_perf_test_result(cursor, data_file: Path, recorded_at_timestamp: int) "metric_unit": metric["unit"], "metric_report_type": metric["report"], "recorded_at_timestamp": datetime.utcfromtimestamp(recorded_at_timestamp), + "labels": json.dumps(metric.get("labels")), } args_list.append(values) @@ -105,7 +107,8 @@ def ingest_perf_test_result(cursor, data_file: Path, recorded_at_timestamp: int) metric_value, metric_unit, metric_report_type, - recorded_at_timestamp + recorded_at_timestamp, + labels ) VALUES %s """, args_list, @@ -117,7 +120,8 @@ def ingest_perf_test_result(cursor, data_file: Path, recorded_at_timestamp: int) %(metric_value)s, %(metric_unit)s, %(metric_report_type)s, - %(recorded_at_timestamp)s + %(recorded_at_timestamp)s, + %(labels)s )""", ) return len(args_list) diff --git a/scripts/ingest_regress_test_result-new-format.py b/scripts/ingest_regress_test_result-new-format.py index e0dd0a7189..c99cfa2b01 100644 --- a/scripts/ingest_regress_test_result-new-format.py +++ b/scripts/ingest_regress_test_result-new-format.py @@ -11,7 +11,7 @@ import re import sys from contextlib import contextmanager from dataclasses import dataclass -from datetime import datetime, timezone +from datetime import UTC, datetime from pathlib 
import Path import backoff @@ -140,8 +140,8 @@ def ingest_test_result( suite=labels["suite"], name=unparametrized_name, status=test["status"], - started_at=datetime.fromtimestamp(test["time"]["start"] / 1000, tz=timezone.utc), - stopped_at=datetime.fromtimestamp(test["time"]["stop"] / 1000, tz=timezone.utc), + started_at=datetime.fromtimestamp(test["time"]["start"] / 1000, tz=UTC), + stopped_at=datetime.fromtimestamp(test["time"]["stop"] / 1000, tz=UTC), duration=test["time"]["duration"], flaky=test["flaky"] or test["retriesStatusChange"], arch=arch, diff --git a/storage_scrubber/src/find_large_objects.rs b/storage_scrubber/src/find_large_objects.rs index 88e36af560..95d3af1453 100644 --- a/storage_scrubber/src/find_large_objects.rs +++ b/storage_scrubber/src/find_large_objects.rs @@ -106,9 +106,9 @@ pub async fn find_large_objects( } } - let bucket_name = target.bucket_name(); + let desc_str = target.desc_str(); tracing::info!( - "Scan of {bucket_name} finished. Scanned {tenant_ctr} shards. objects={object_ctr}, found={}.", + "Scan of {desc_str} finished. Scanned {tenant_ctr} shards. 
objects={object_ctr}, found={}.", objects.len() ); Ok(LargeObjectListing { objects }) diff --git a/storage_scrubber/src/garbage.rs b/storage_scrubber/src/garbage.rs index 863dbf960d..b026efbc3b 100644 --- a/storage_scrubber/src/garbage.rs +++ b/storage_scrubber/src/garbage.rs @@ -21,7 +21,7 @@ use utils::{backoff, id::TenantId}; use crate::{ cloud_admin_api::{CloudAdminApiClient, MaybeDeleted, ProjectData}, init_remote, list_objects_with_retries, - metadata_stream::{stream_tenant_timelines, stream_tenants}, + metadata_stream::{stream_tenant_timelines, stream_tenants_maybe_prefix}, BucketConfig, ConsoleConfig, NodeKind, TenantShardTimelineId, TraversingDepth, MAX_RETRIES, }; @@ -118,9 +118,17 @@ pub async fn find_garbage( console_config: ConsoleConfig, depth: TraversingDepth, node_kind: NodeKind, + tenant_id_prefix: Option, output_path: String, ) -> anyhow::Result<()> { - let garbage = find_garbage_inner(bucket_config, console_config, depth, node_kind).await?; + let garbage = find_garbage_inner( + bucket_config, + console_config, + depth, + node_kind, + tenant_id_prefix, + ) + .await?; let serialized = serde_json::to_vec_pretty(&garbage)?; tokio::fs::write(&output_path, &serialized).await?; @@ -152,6 +160,7 @@ async fn find_garbage_inner( console_config: ConsoleConfig, depth: TraversingDepth, node_kind: NodeKind, + tenant_id_prefix: Option, ) -> anyhow::Result { // Construct clients for S3 and for Console API let (remote_client, target) = init_remote(bucket_config.clone(), node_kind).await?; @@ -177,8 +186,8 @@ async fn find_garbage_inner( })); // Enumerate Tenants in S3, and check if each one exists in Console - tracing::info!("Finding all tenants in bucket {}...", bucket_config.bucket); - let tenants = stream_tenants(&remote_client, &target); + tracing::info!("Finding all tenants in {}...", bucket_config.desc_str()); + let tenants = stream_tenants_maybe_prefix(&remote_client, &target, tenant_id_prefix); let tenants_checked = tenants.map_ok(|t| { let api_client = 
cloud_admin_api_client.clone(); let console_cache = console_cache.clone(); @@ -524,7 +533,7 @@ pub async fn purge_garbage( init_remote(garbage_list.bucket_config.clone(), garbage_list.node_kind).await?; assert_eq!( - &garbage_list.bucket_config.bucket, + garbage_list.bucket_config.bucket_name().unwrap(), remote_client.bucket_name().unwrap() ); diff --git a/storage_scrubber/src/lib.rs b/storage_scrubber/src/lib.rs index de0857cb5f..1fe4fc58cd 100644 --- a/storage_scrubber/src/lib.rs +++ b/storage_scrubber/src/lib.rs @@ -29,8 +29,7 @@ use pageserver::tenant::TENANTS_SEGMENT_NAME; use pageserver_api::shard::TenantShardId; use remote_storage::{ DownloadOpts, GenericRemoteStorage, Listing, ListingMode, RemotePath, RemoteStorageConfig, - RemoteStorageKind, S3Config, DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, - DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT, + RemoteStorageKind, S3Config, }; use reqwest::Url; use serde::{Deserialize, Serialize}; @@ -48,7 +47,7 @@ const CLOUD_ADMIN_API_TOKEN_ENV_VAR: &str = "CLOUD_ADMIN_API_TOKEN"; #[derive(Debug, Clone)] pub struct S3Target { - pub bucket_name: String, + pub desc_str: String, /// This `prefix_in_bucket` is only equal to the PS/SK config of the same /// name for the RootTarget: other instances of S3Target will have prefix_in_bucket /// with extra parts. 
@@ -172,7 +171,7 @@ impl RootTarget { }; S3Target { - bucket_name: root.bucket_name.clone(), + desc_str: root.desc_str.clone(), prefix_in_bucket: format!( "{}/{TENANTS_SEGMENT_NAME}/{tenant_id}", root.prefix_in_bucket @@ -209,10 +208,10 @@ impl RootTarget { } } - pub fn bucket_name(&self) -> &str { + pub fn desc_str(&self) -> &str { match self { - Self::Pageserver(root) => &root.bucket_name, - Self::Safekeeper(root) => &root.bucket_name, + Self::Pageserver(root) => &root.desc_str, + Self::Safekeeper(root) => &root.desc_str, } } @@ -230,24 +229,61 @@ pub fn remote_timeline_path_id(id: &TenantShardTimelineId) -> RemotePath { #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(deny_unknown_fields)] -pub struct BucketConfig { - pub region: String, - pub bucket: String, - pub prefix_in_bucket: Option, -} +pub struct BucketConfig(RemoteStorageConfig); impl BucketConfig { pub fn from_env() -> anyhow::Result { - let region = env::var("REGION").context("'REGION' param retrieval")?; - let bucket = env::var("BUCKET").context("'BUCKET' param retrieval")?; - let prefix_in_bucket = env::var("BUCKET_PREFIX").ok(); - - Ok(Self { - region, - bucket, - prefix_in_bucket, - }) + if let Ok(legacy) = Self::from_env_legacy() { + return Ok(legacy); + } + let config_toml = + env::var("REMOTE_STORAGE_CONFIG").context("'REMOTE_STORAGE_CONFIG' retrieval")?; + let remote_config = RemoteStorageConfig::from_toml_str(&config_toml)?; + Ok(BucketConfig(remote_config)) } + + fn from_env_legacy() -> anyhow::Result { + let bucket_region = env::var("REGION").context("'REGION' param retrieval")?; + let bucket_name = env::var("BUCKET").context("'BUCKET' param retrieval")?; + let prefix_in_bucket = env::var("BUCKET_PREFIX").ok(); + let endpoint = env::var("AWS_ENDPOINT_URL").ok(); + // Create a json object which we then deserialize so that we don't + // have to repeat all of the S3Config fields. 
+ let s3_config_json = serde_json::json!({ + "bucket_name": bucket_name, + "bucket_region": bucket_region, + "prefix_in_bucket": prefix_in_bucket, + "endpoint": endpoint, + }); + let config: RemoteStorageConfig = serde_json::from_value(s3_config_json)?; + Ok(BucketConfig(config)) + } + pub fn desc_str(&self) -> String { + match &self.0.storage { + RemoteStorageKind::LocalFs { local_path } => { + format!("local path {local_path}") + } + RemoteStorageKind::AwsS3(config) => format!( + "bucket {}, region {}", + config.bucket_name, config.bucket_region + ), + RemoteStorageKind::AzureContainer(config) => format!( + "bucket {}, storage account {:?}, region {}", + config.container_name, config.storage_account, config.container_region + ), + } + } + pub fn bucket_name(&self) -> Option<&str> { + self.0.storage.bucket_name() + } +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(deny_unknown_fields)] +pub struct BucketConfigLegacy { + pub region: String, + pub bucket: String, + pub prefix_in_bucket: Option, } pub struct ControllerClientConfig { @@ -337,13 +373,9 @@ fn default_prefix_in_bucket(node_kind: NodeKind) -> &'static str { } } -fn make_root_target( - bucket_name: String, - prefix_in_bucket: String, - node_kind: NodeKind, -) -> RootTarget { +fn make_root_target(desc_str: String, prefix_in_bucket: String, node_kind: NodeKind) -> RootTarget { let s3_target = S3Target { - bucket_name, + desc_str, prefix_in_bucket, delimiter: "/".to_string(), }; @@ -354,15 +386,15 @@ fn make_root_target( } async fn init_remote_s3( - bucket_config: BucketConfig, + bucket_config: S3Config, node_kind: NodeKind, ) -> anyhow::Result<(Arc, RootTarget)> { - let bucket_region = Region::new(bucket_config.region); + let bucket_region = Region::new(bucket_config.bucket_region); let s3_client = Arc::new(init_s3_client(bucket_region).await); let default_prefix = default_prefix_in_bucket(node_kind).to_string(); let s3_root = make_root_target( - bucket_config.bucket, + 
bucket_config.bucket_name, bucket_config.prefix_in_bucket.unwrap_or(default_prefix), node_kind, ); @@ -371,33 +403,28 @@ async fn init_remote_s3( } async fn init_remote( - bucket_config: BucketConfig, + mut storage_config: BucketConfig, node_kind: NodeKind, ) -> anyhow::Result<(GenericRemoteStorage, RootTarget)> { - let endpoint = env::var("AWS_ENDPOINT_URL").ok(); + let desc_str = storage_config.desc_str(); + let default_prefix = default_prefix_in_bucket(node_kind).to_string(); - let prefix_in_bucket = Some(bucket_config.prefix_in_bucket.unwrap_or(default_prefix)); - let storage = S3Config { - bucket_name: bucket_config.bucket.clone(), - bucket_region: bucket_config.region, - prefix_in_bucket, - endpoint, - concurrency_limit: DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT - .try_into() - .unwrap(), - max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, - upload_storage_class: None, - }; - let storage_config = RemoteStorageConfig { - storage: RemoteStorageKind::AwsS3(storage), - timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, - }; + + match &mut storage_config.0.storage { + RemoteStorageKind::AwsS3(ref mut config) => { + config.prefix_in_bucket.get_or_insert(default_prefix); + } + RemoteStorageKind::AzureContainer(ref mut config) => { + config.prefix_in_container.get_or_insert(default_prefix); + } + RemoteStorageKind::LocalFs { .. 
} => (), + } // We already pass the prefix to the remote client above let prefix_in_root_target = String::new(); - let root_target = make_root_target(bucket_config.bucket, prefix_in_root_target, node_kind); + let root_target = make_root_target(desc_str, prefix_in_root_target, node_kind); - let client = GenericRemoteStorage::from_config(&storage_config).await?; + let client = GenericRemoteStorage::from_config(&storage_config.0).await?; Ok((client, root_target)) } @@ -469,7 +496,7 @@ async fn list_objects_with_retries( } warn!( "list_objects_v2 query failed: bucket_name={}, prefix={}, delimiter={}, error={}", - s3_target.bucket_name, + remote_client.bucket_name().unwrap_or_default(), s3_target.prefix_in_bucket, s3_target.delimiter, DisplayErrorContext(e), diff --git a/storage_scrubber/src/main.rs b/storage_scrubber/src/main.rs index ee816534c6..92979d609e 100644 --- a/storage_scrubber/src/main.rs +++ b/storage_scrubber/src/main.rs @@ -54,6 +54,8 @@ enum Command { node_kind: NodeKind, #[arg(short, long, default_value_t=TraversingDepth::Tenant)] depth: TraversingDepth, + #[arg(short, long, default_value=None)] + tenant_id_prefix: Option, #[arg(short, long, default_value_t = String::from("garbage.json"))] output_path: String, }, @@ -140,7 +142,7 @@ async fn main() -> anyhow::Result<()> { "{}_{}_{}_{}.log", std::env::args().next().unwrap(), command_log_name, - bucket_config.bucket, + bucket_config.bucket_name().unwrap_or("nobucket"), chrono::Utc::now().format("%Y_%m_%d__%H_%M_%S") )); @@ -191,13 +193,7 @@ async fn main() -> anyhow::Result<()> { // Strictly speaking an empty bucket is a valid bucket, but if someone ran the // scrubber they were likely expecting to scan something, and if we see no timelines // at all then it's likely due to some configuration issues like a bad prefix - bail!( - "No timelines found in bucket {} prefix {}", - bucket_config.bucket, - bucket_config - .prefix_in_bucket - .unwrap_or("".to_string()) - ); + bail!("No timelines found in {}", 
bucket_config.desc_str()); } Ok(()) } else { @@ -215,10 +211,19 @@ async fn main() -> anyhow::Result<()> { Command::FindGarbage { node_kind, depth, + tenant_id_prefix, output_path, } => { let console_config = ConsoleConfig::from_env()?; - find_garbage(bucket_config, console_config, depth, node_kind, output_path).await + find_garbage( + bucket_config, + console_config, + depth, + node_kind, + tenant_id_prefix, + output_path, + ) + .await } Command::PurgeGarbage { input_path, @@ -396,13 +401,7 @@ pub async fn scan_pageserver_metadata_cmd( // Strictly speaking an empty bucket is a valid bucket, but if someone ran the // scrubber they were likely expecting to scan something, and if we see no timelines // at all then it's likely due to some configuration issues like a bad prefix - tracing::error!( - "No timelines found in bucket {} prefix {}", - bucket_config.bucket, - bucket_config - .prefix_in_bucket - .unwrap_or("".to_string()) - ); + tracing::error!("No timelines found in {}", bucket_config.desc_str()); if exit_code { std::process::exit(1); } diff --git a/storage_scrubber/src/metadata_stream.rs b/storage_scrubber/src/metadata_stream.rs index f896cff2d5..47447d681c 100644 --- a/storage_scrubber/src/metadata_stream.rs +++ b/storage_scrubber/src/metadata_stream.rs @@ -17,9 +17,20 @@ use utils::id::{TenantId, TimelineId}; pub fn stream_tenants<'a>( remote_client: &'a GenericRemoteStorage, target: &'a RootTarget, +) -> impl Stream> + 'a { + stream_tenants_maybe_prefix(remote_client, target, None) +} +/// Given a remote storage and a target, output a stream of TenantIds discovered via listing prefixes +pub fn stream_tenants_maybe_prefix<'a>( + remote_client: &'a GenericRemoteStorage, + target: &'a RootTarget, + tenant_id_prefix: Option, ) -> impl Stream> + 'a { try_stream! 
{ - let tenants_target = target.tenants_root(); + let mut tenants_target = target.tenants_root(); + if let Some(tenant_id_prefix) = tenant_id_prefix { + tenants_target.prefix_in_bucket += &tenant_id_prefix; + } let mut tenants_stream = std::pin::pin!(stream_objects_with_retries(remote_client, ListingMode::WithDelimiter, &tenants_target)); while let Some(chunk) = tenants_stream.next().await { @@ -60,7 +71,7 @@ pub async fn stream_tenant_shards<'a>( first_part .parse::() - .with_context(|| format!("Incorrect entry id str: {first_part}")) + .with_context(|| format!("Incorrect tenant entry id str: {first_part}")) }) .collect::>(); @@ -114,9 +125,10 @@ pub async fn stream_tenant_timelines<'a>( prefix.get_path().as_str().strip_prefix(prefix_str) }) .map(|entry_id_str| { - entry_id_str + let first_part = entry_id_str.split('/').next().unwrap(); + first_part .parse::() - .with_context(|| format!("Incorrect entry id str: {entry_id_str}")) + .with_context(|| format!("Incorrect timeline entry id str: {entry_id_str}")) }); for i in new_entry_ids { diff --git a/storage_scrubber/src/scan_safekeeper_metadata.rs b/storage_scrubber/src/scan_safekeeper_metadata.rs index 403b4590a8..0a4d4266a0 100644 --- a/storage_scrubber/src/scan_safekeeper_metadata.rs +++ b/storage_scrubber/src/scan_safekeeper_metadata.rs @@ -84,10 +84,7 @@ pub async fn scan_safekeeper_metadata( bucket_config: BucketConfig, db_or_list: DatabaseOrList, ) -> anyhow::Result { - info!( - "checking bucket {}, region {}", - bucket_config.bucket, bucket_config.region - ); + info!("checking {}", bucket_config.desc_str()); let (remote_client, target) = init_remote(bucket_config, NodeKind::Safekeeper).await?; let console_config = ConsoleConfig::from_env()?; diff --git a/storage_scrubber/src/tenant_snapshot.rs b/storage_scrubber/src/tenant_snapshot.rs index bb4079b5f4..39e0b5c9b4 100644 --- a/storage_scrubber/src/tenant_snapshot.rs +++ b/storage_scrubber/src/tenant_snapshot.rs @@ -16,7 +16,7 @@ use 
pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; use pageserver::tenant::storage_layer::LayerName; use pageserver::tenant::IndexPart; use pageserver_api::shard::TenantShardId; -use remote_storage::GenericRemoteStorage; +use remote_storage::{GenericRemoteStorage, S3Config}; use utils::generation::Generation; use utils::id::TenantId; @@ -24,6 +24,7 @@ pub struct SnapshotDownloader { s3_client: Arc, s3_root: RootTarget, bucket_config: BucketConfig, + bucket_config_s3: S3Config, tenant_id: TenantId, output_path: Utf8PathBuf, concurrency: usize, @@ -36,12 +37,17 @@ impl SnapshotDownloader { output_path: Utf8PathBuf, concurrency: usize, ) -> anyhow::Result { + let bucket_config_s3 = match &bucket_config.0.storage { + remote_storage::RemoteStorageKind::AwsS3(config) => config.clone(), + _ => panic!("only S3 configuration is supported for snapshot downloading"), + }; let (s3_client, s3_root) = - init_remote_s3(bucket_config.clone(), NodeKind::Pageserver).await?; + init_remote_s3(bucket_config_s3.clone(), NodeKind::Pageserver).await?; Ok(Self { s3_client, s3_root, bucket_config, + bucket_config_s3, tenant_id, output_path, concurrency, @@ -87,7 +93,7 @@ impl SnapshotDownloader { let versions = self .s3_client .list_object_versions() - .bucket(self.bucket_config.bucket.clone()) + .bucket(self.bucket_config_s3.bucket_name.clone()) .prefix(&remote_layer_path) .send() .await?; @@ -96,7 +102,7 @@ impl SnapshotDownloader { }; download_object_to_file_s3( &self.s3_client, - &self.bucket_config.bucket, + &self.bucket_config_s3.bucket_name, &remote_layer_path, version.version_id.as_deref(), &local_path, diff --git a/test_runner/README.md b/test_runner/README.md index 55d8d2faa9..f342ef8aaa 100644 --- a/test_runner/README.md +++ b/test_runner/README.md @@ -113,7 +113,7 @@ The test suite has a Python enum with equal name but different meaning: ```python @enum.unique -class RemoteStorageKind(str, enum.Enum): +class RemoteStorageKind(StrEnum): LOCAL_FS = "local_fs" 
MOCK_S3 = "mock_s3" REAL_S3 = "real_s3" diff --git a/test_runner/fixtures/auth_tokens.py b/test_runner/fixtures/auth_tokens.py index be16be81de..8382ce20b3 100644 --- a/test_runner/fixtures/auth_tokens.py +++ b/test_runner/fixtures/auth_tokens.py @@ -1,7 +1,7 @@ from __future__ import annotations from dataclasses import dataclass -from enum import Enum +from enum import StrEnum from typing import Any import jwt @@ -37,8 +37,7 @@ class AuthKeys: return self.generate_token(scope=TokenScope.TENANT, tenant_id=str(tenant_id)) -# TODO: Replace with `StrEnum` when we upgrade to python 3.11 -class TokenScope(str, Enum): +class TokenScope(StrEnum): ADMIN = "admin" PAGE_SERVER_API = "pageserverapi" GENERATIONS_API = "generations_api" diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py index d3419bd8b1..bb8e75902e 100644 --- a/test_runner/fixtures/benchmark_fixture.py +++ b/test_runner/fixtures/benchmark_fixture.py @@ -9,6 +9,7 @@ import re import timeit from contextlib import contextmanager from datetime import datetime +from enum import StrEnum from pathlib import Path from typing import TYPE_CHECKING @@ -24,8 +25,7 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import NeonPageserver if TYPE_CHECKING: - from collections.abc import Iterator, Mapping - from typing import Callable, Optional + from collections.abc import Callable, Iterator, Mapping """ @@ -61,7 +61,7 @@ class PgBenchRunResult: number_of_threads: int number_of_transactions_actually_processed: int latency_average: float - latency_stddev: Optional[float] + latency_stddev: float | None tps: float run_duration: float run_start_timestamp: int @@ -171,14 +171,14 @@ _PGBENCH_INIT_EXTRACTORS: Mapping[str, re.Pattern[str]] = { @dataclasses.dataclass class PgBenchInitResult: - total: Optional[float] - drop_tables: Optional[float] - create_tables: Optional[float] - client_side_generate: Optional[float] - server_side_generate: Optional[float] - vacuum: 
Optional[float] - primary_keys: Optional[float] - foreign_keys: Optional[float] + total: float | None + drop_tables: float | None + create_tables: float | None + client_side_generate: float | None + server_side_generate: float | None + vacuum: float | None + primary_keys: float | None + foreign_keys: float | None duration: float start_timestamp: int end_timestamp: int @@ -196,7 +196,7 @@ class PgBenchInitResult: last_line = stderr.splitlines()[-1] - timings: dict[str, Optional[float]] = {} + timings: dict[str, float | None] = {} last_line_items = re.split(r"\(|\)|,", last_line) for item in last_line_items: for key, regex in _PGBENCH_INIT_EXTRACTORS.items(): @@ -227,7 +227,7 @@ class PgBenchInitResult: @enum.unique -class MetricReport(str, enum.Enum): # str is a hack to make it json serializable +class MetricReport(StrEnum): # str is a hack to make it json serializable # this means that this is a constant test parameter # like number of transactions, or number of clients TEST_PARAM = "test_param" @@ -256,12 +256,16 @@ class NeonBenchmarker: metric_value: float, unit: str, report: MetricReport, + # use this to associate additional key/value pairs in json format for associated Neon object IDs like project ID with the metric + labels: dict[str, str] | None = None, ): """ Record a benchmark result. 
""" # just to namespace the value name = f"{self.PROPERTY_PREFIX}_{metric_name}" + if labels is None: + labels = {} self.property_recorder( name, { @@ -269,6 +273,7 @@ class NeonBenchmarker: "value": metric_value, "unit": unit, "report": report, + "labels": labels, }, ) @@ -406,7 +411,7 @@ class NeonBenchmarker: self, pageserver: NeonPageserver, metric_name: str, - label_filters: Optional[dict[str, str]] = None, + label_filters: dict[str, str] | None = None, ) -> int: """Fetch the value of given int counter from pageserver metrics.""" all_metrics = pageserver.http_client().get_metrics() diff --git a/test_runner/fixtures/common_types.py b/test_runner/fixtures/common_types.py index 0ea7148f50..6c22b31e00 100644 --- a/test_runner/fixtures/common_types.py +++ b/test_runner/fixtures/common_types.py @@ -2,14 +2,14 @@ from __future__ import annotations import random from dataclasses import dataclass -from enum import Enum +from enum import StrEnum from functools import total_ordering from typing import TYPE_CHECKING, TypeVar from typing_extensions import override if TYPE_CHECKING: - from typing import Any, Union + from typing import Any T = TypeVar("T", bound="Id") @@ -24,7 +24,7 @@ class Lsn: representation is like "1/0123abcd". 
See also pg_lsn datatype in Postgres """ - def __init__(self, x: Union[int, str]): + def __init__(self, x: int | str): if isinstance(x, int): self.lsn_int = x else: @@ -67,7 +67,7 @@ class Lsn: return NotImplemented return self.lsn_int - other.lsn_int - def __add__(self, other: Union[int, Lsn]) -> Lsn: + def __add__(self, other: int | Lsn) -> Lsn: if isinstance(other, int): return Lsn(self.lsn_int + other) elif isinstance(other, Lsn): @@ -190,8 +190,23 @@ class TenantTimelineId: ) -# Workaround for compat with python 3.9, which does not have `typing.Self` -TTenantShardId = TypeVar("TTenantShardId", bound="TenantShardId") +@dataclass +class ShardIndex: + shard_number: int + shard_count: int + + # cf impl Display for ShardIndex + @override + def __str__(self) -> str: + return f"{self.shard_number:02x}{self.shard_count:02x}" + + @classmethod + def parse(cls: type[ShardIndex], input: str) -> ShardIndex: + assert len(input) == 4 + return cls( + shard_number=int(input[0:2], 16), + shard_count=int(input[2:4], 16), + ) class TenantShardId: @@ -202,7 +217,7 @@ class TenantShardId: assert self.shard_number < self.shard_count or self.shard_count == 0 @classmethod - def parse(cls: type[TTenantShardId], input: str) -> TTenantShardId: + def parse(cls: type[TenantShardId], input: str) -> TenantShardId: if len(input) == 32: return cls( tenant_id=TenantId(input), @@ -226,6 +241,10 @@ class TenantShardId: # Unsharded case: equivalent of Rust TenantShardId::unsharded(tenant_id) return str(self.tenant_id) + @property + def shard_index(self) -> ShardIndex: + return ShardIndex(self.shard_number, self.shard_count) + @override def __repr__(self): return self.__str__() @@ -249,7 +268,6 @@ class TenantShardId: return hash(self._tuple()) -# TODO: Replace with `StrEnum` when we upgrade to python 3.11 -class TimelineArchivalState(str, Enum): +class TimelineArchivalState(StrEnum): ARCHIVED = "Archived" UNARCHIVED = "Unarchived" diff --git a/test_runner/fixtures/compare_fixtures.py 
b/test_runner/fixtures/compare_fixtures.py index 85b6e7a3b8..c0892399bd 100644 --- a/test_runner/fixtures/compare_fixtures.py +++ b/test_runner/fixtures/compare_fixtures.py @@ -99,7 +99,7 @@ class PgCompare(ABC): assert row is not None assert len(row) == len(pg_stat.columns) - for col, val in zip(pg_stat.columns, row): + for col, val in zip(pg_stat.columns, row, strict=False): results[f"{pg_stat.table}.{col}"] = int(val) return results diff --git a/test_runner/fixtures/compute_reconfigure.py b/test_runner/fixtures/compute_reconfigure.py index 6354b7f833..33f01f80fb 100644 --- a/test_runner/fixtures/compute_reconfigure.py +++ b/test_runner/fixtures/compute_reconfigure.py @@ -12,7 +12,8 @@ from fixtures.common_types import TenantId from fixtures.log_helper import log if TYPE_CHECKING: - from typing import Any, Callable, Optional + from collections.abc import Callable + from typing import Any class ComputeReconfigure: @@ -20,12 +21,12 @@ class ComputeReconfigure: self.server = server self.control_plane_compute_hook_api = f"http://{server.host}:{server.port}/notify-attach" self.workloads: dict[TenantId, Any] = {} - self.on_notify: Optional[Callable[[Any], None]] = None + self.on_notify: Callable[[Any], None] | None = None def register_workload(self, workload: Any): self.workloads[workload.tenant_id] = workload - def register_on_notify(self, fn: Optional[Callable[[Any], None]]): + def register_on_notify(self, fn: Callable[[Any], None] | None): """ Add some extra work during a notification, like sleeping to slow things down, or logging what was notified. 
@@ -68,7 +69,7 @@ def compute_reconfigure_listener(make_httpserver: HTTPServer): # This causes the endpoint to query storage controller for its location, which # is redundant since we already have it here, but this avoids extending the # neon_local CLI to take full lists of locations - reconfigure_threads.submit(lambda workload=workload: workload.reconfigure()) # type: ignore[no-any-return] + reconfigure_threads.submit(lambda workload=workload: workload.reconfigure()) # type: ignore[misc] return Response(status=200) diff --git a/test_runner/fixtures/h2server.py b/test_runner/fixtures/h2server.py index e890b2bcf1..3e35af3b5b 100644 --- a/test_runner/fixtures/h2server.py +++ b/test_runner/fixtures/h2server.py @@ -31,7 +31,7 @@ from h2.settings import SettingCodes from typing_extensions import override if TYPE_CHECKING: - from typing import Any, Optional + from typing import Any RequestData = collections.namedtuple("RequestData", ["headers", "data"]) @@ -49,7 +49,7 @@ class H2Protocol(asyncio.Protocol): def __init__(self): config = H2Configuration(client_side=False, header_encoding="utf-8") self.conn = H2Connection(config=config) - self.transport: Optional[asyncio.Transport] = None + self.transport: asyncio.Transport | None = None self.stream_data: dict[int, RequestData] = {} self.flow_control_futures: dict[int, asyncio.Future[Any]] = {} @@ -61,7 +61,7 @@ class H2Protocol(asyncio.Protocol): self.transport.write(self.conn.data_to_send()) @override - def connection_lost(self, exc: Optional[Exception]): + def connection_lost(self, exc: Exception | None): for future in self.flow_control_futures.values(): future.cancel() self.flow_control_futures = {} diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 39c8f70a9c..330f007a77 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -1,16 +1,12 @@ from __future__ import annotations from collections import defaultdict -from typing import TYPE_CHECKING from 
prometheus_client.parser import text_string_to_metric_families from prometheus_client.samples import Sample from fixtures.log_helper import log -if TYPE_CHECKING: - from typing import Optional - class Metrics: metrics: dict[str, list[Sample]] @@ -20,7 +16,7 @@ class Metrics: self.metrics = defaultdict(list) self.name = name - def query_all(self, name: str, filter: Optional[dict[str, str]] = None) -> list[Sample]: + def query_all(self, name: str, filter: dict[str, str] | None = None) -> list[Sample]: filter = filter or {} res: list[Sample] = [] @@ -32,7 +28,7 @@ class Metrics: pass return res - def query_one(self, name: str, filter: Optional[dict[str, str]] = None) -> Sample: + def query_one(self, name: str, filter: dict[str, str] | None = None) -> Sample: res = self.query_all(name, filter or {}) assert len(res) == 1, f"expected single sample for {name} {filter}, found {res}" return res[0] @@ -47,9 +43,7 @@ class MetricsGetter: def get_metrics(self) -> Metrics: raise NotImplementedError() - def get_metric_value( - self, name: str, filter: Optional[dict[str, str]] = None - ) -> Optional[float]: + def get_metric_value(self, name: str, filter: dict[str, str] | None = None) -> float | None: metrics = self.get_metrics() results = metrics.query_all(name, filter=filter) if not results: @@ -59,7 +53,7 @@ class MetricsGetter: return results[0].value def get_metrics_values( - self, names: list[str], filter: Optional[dict[str, str]] = None, absence_ok: bool = False + self, names: list[str], filter: dict[str, str] | None = None, absence_ok: bool = False ) -> dict[str, float]: """ When fetching multiple named metrics, it is more efficient to use this diff --git a/test_runner/fixtures/neon_api.py b/test_runner/fixtures/neon_api.py index 9de6681beb..df80f0683c 100644 --- a/test_runner/fixtures/neon_api.py +++ b/test_runner/fixtures/neon_api.py @@ -8,7 +8,7 @@ import requests from fixtures.log_helper import log if TYPE_CHECKING: - from typing import Any, Literal, Optional + from 
typing import Any, Literal from fixtures.pg_version import PgVersion @@ -40,11 +40,11 @@ class NeonAPI: def create_project( self, - pg_version: Optional[PgVersion] = None, - name: Optional[str] = None, - branch_name: Optional[str] = None, - branch_role_name: Optional[str] = None, - branch_database_name: Optional[str] = None, + pg_version: PgVersion | None = None, + name: str | None = None, + branch_name: str | None = None, + branch_role_name: str | None = None, + branch_database_name: str | None = None, ) -> dict[str, Any]: data: dict[str, Any] = { "project": { @@ -179,8 +179,8 @@ class NeonAPI: def get_connection_uri( self, project_id: str, - branch_id: Optional[str] = None, - endpoint_id: Optional[str] = None, + branch_id: str | None = None, + endpoint_id: str | None = None, database_name: str = "neondb", role_name: str = "neondb_owner", pooled: bool = True, @@ -249,7 +249,7 @@ class NeonAPI: @final class NeonApiEndpoint: - def __init__(self, neon_api: NeonAPI, pg_version: PgVersion, project_id: Optional[str]): + def __init__(self, neon_api: NeonAPI, pg_version: PgVersion, project_id: str | None): self.neon_api = neon_api self.project_id: str self.endpoint_id: str diff --git a/test_runner/fixtures/neon_cli.py b/test_runner/fixtures/neon_cli.py index d220ea57a2..a85a191455 100644 --- a/test_runner/fixtures/neon_cli.py +++ b/test_runner/fixtures/neon_cli.py @@ -20,13 +20,9 @@ from fixtures.pg_version import PgVersion if TYPE_CHECKING: from typing import ( Any, - Optional, - TypeVar, cast, ) - T = TypeVar("T") - # Used to be an ABC. abc.ABC removed due to linter without name change. class AbstractNeonCli: @@ -36,7 +32,7 @@ class AbstractNeonCli: Do not use directly, use specific subclasses instead. 
""" - def __init__(self, extra_env: Optional[dict[str, str]], binpath: Path): + def __init__(self, extra_env: dict[str, str] | None, binpath: Path): self.extra_env = extra_env self.binpath = binpath @@ -45,7 +41,7 @@ class AbstractNeonCli: def raw_cli( self, arguments: list[str], - extra_env_vars: Optional[dict[str, str]] = None, + extra_env_vars: dict[str, str] | None = None, check_return_code=True, timeout=None, ) -> subprocess.CompletedProcess[str]: @@ -173,7 +169,7 @@ class NeonLocalCli(AbstractNeonCli): def __init__( self, - extra_env: Optional[dict[str, str]], + extra_env: dict[str, str] | None, binpath: Path, repo_dir: Path, pg_distrib_dir: Path, @@ -195,10 +191,10 @@ class NeonLocalCli(AbstractNeonCli): tenant_id: TenantId, timeline_id: TimelineId, pg_version: PgVersion, - conf: Optional[dict[str, Any]] = None, - shard_count: Optional[int] = None, - shard_stripe_size: Optional[int] = None, - placement_policy: Optional[str] = None, + conf: dict[str, Any] | None = None, + shard_count: int | None = None, + shard_stripe_size: int | None = None, + placement_policy: str | None = None, set_default: bool = False, ): """ @@ -302,8 +298,8 @@ class NeonLocalCli(AbstractNeonCli): tenant_id: TenantId, timeline_id: TimelineId, new_branch_name, - ancestor_branch_name: Optional[str] = None, - ancestor_start_lsn: Optional[Lsn] = None, + ancestor_branch_name: str | None = None, + ancestor_start_lsn: Lsn | None = None, ): cmd = [ "timeline", @@ -331,8 +327,8 @@ class NeonLocalCli(AbstractNeonCli): base_lsn: Lsn, base_tarfile: Path, pg_version: PgVersion, - end_lsn: Optional[Lsn] = None, - wal_tarfile: Optional[Path] = None, + end_lsn: Lsn | None = None, + wal_tarfile: Path | None = None, ): cmd = [ "timeline", @@ -380,7 +376,7 @@ class NeonLocalCli(AbstractNeonCli): def init( self, init_config: dict[str, Any], - force: Optional[str] = None, + force: str | None = None, ) -> subprocess.CompletedProcess[str]: with tempfile.NamedTemporaryFile(mode="w+") as init_config_tmpfile: 
init_config_tmpfile.write(toml.dumps(init_config)) @@ -400,9 +396,9 @@ class NeonLocalCli(AbstractNeonCli): def storage_controller_start( self, - timeout_in_seconds: Optional[int] = None, - instance_id: Optional[int] = None, - base_port: Optional[int] = None, + timeout_in_seconds: int | None = None, + instance_id: int | None = None, + base_port: int | None = None, ): cmd = ["storage_controller", "start"] if timeout_in_seconds is not None: @@ -413,7 +409,7 @@ class NeonLocalCli(AbstractNeonCli): cmd.append(f"--base-port={base_port}") return self.raw_cli(cmd) - def storage_controller_stop(self, immediate: bool, instance_id: Optional[int] = None): + def storage_controller_stop(self, immediate: bool, instance_id: int | None = None): cmd = ["storage_controller", "stop"] if immediate: cmd.extend(["-m", "immediate"]) @@ -424,8 +420,8 @@ class NeonLocalCli(AbstractNeonCli): def pageserver_start( self, id: int, - extra_env_vars: Optional[dict[str, str]] = None, - timeout_in_seconds: Optional[int] = None, + extra_env_vars: dict[str, str] | None = None, + timeout_in_seconds: int | None = None, ) -> subprocess.CompletedProcess[str]: start_args = ["pageserver", "start", f"--id={id}"] if timeout_in_seconds is not None: @@ -442,9 +438,9 @@ class NeonLocalCli(AbstractNeonCli): def safekeeper_start( self, id: int, - extra_opts: Optional[list[str]] = None, - extra_env_vars: Optional[dict[str, str]] = None, - timeout_in_seconds: Optional[int] = None, + extra_opts: list[str] | None = None, + extra_env_vars: dict[str, str] | None = None, + timeout_in_seconds: int | None = None, ) -> subprocess.CompletedProcess[str]: if extra_opts is not None: extra_opts = [f"-e={opt}" for opt in extra_opts] @@ -457,7 +453,7 @@ class NeonLocalCli(AbstractNeonCli): ) def safekeeper_stop( - self, id: Optional[int] = None, immediate=False + self, id: int | None = None, immediate=False ) -> subprocess.CompletedProcess[str]: args = ["safekeeper", "stop"] if id is not None: @@ -467,7 +463,7 @@ class 
NeonLocalCli(AbstractNeonCli): return self.raw_cli(args) def storage_broker_start( - self, timeout_in_seconds: Optional[int] = None + self, timeout_in_seconds: int | None = None ) -> subprocess.CompletedProcess[str]: cmd = ["storage_broker", "start"] if timeout_in_seconds is not None: @@ -485,10 +481,10 @@ class NeonLocalCli(AbstractNeonCli): http_port: int, tenant_id: TenantId, pg_version: PgVersion, - endpoint_id: Optional[str] = None, + endpoint_id: str | None = None, hot_standby: bool = False, - lsn: Optional[Lsn] = None, - pageserver_id: Optional[int] = None, + lsn: Lsn | None = None, + pageserver_id: int | None = None, allow_multiple=False, ) -> subprocess.CompletedProcess[str]: args = [ @@ -523,11 +519,11 @@ class NeonLocalCli(AbstractNeonCli): def endpoint_start( self, endpoint_id: str, - safekeepers: Optional[list[int]] = None, - remote_ext_config: Optional[str] = None, - pageserver_id: Optional[int] = None, + safekeepers: list[int] | None = None, + remote_ext_config: str | None = None, + pageserver_id: int | None = None, allow_multiple=False, - basebackup_request_tries: Optional[int] = None, + basebackup_request_tries: int | None = None, ) -> subprocess.CompletedProcess[str]: args = [ "endpoint", @@ -555,9 +551,9 @@ class NeonLocalCli(AbstractNeonCli): def endpoint_reconfigure( self, endpoint_id: str, - tenant_id: Optional[TenantId] = None, - pageserver_id: Optional[int] = None, - safekeepers: Optional[list[int]] = None, + tenant_id: TenantId | None = None, + pageserver_id: int | None = None, + safekeepers: list[int] | None = None, check_return_code=True, ) -> subprocess.CompletedProcess[str]: args = ["endpoint", "reconfigure", endpoint_id] @@ -574,7 +570,7 @@ class NeonLocalCli(AbstractNeonCli): endpoint_id: str, destroy=False, check_return_code=True, - mode: Optional[str] = None, + mode: str | None = None, ) -> subprocess.CompletedProcess[str]: args = [ "endpoint", diff --git a/test_runner/fixtures/neon_fixtures.py 
b/test_runner/fixtures/neon_fixtures.py index 205a47a9d5..78e2422171 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -17,7 +17,7 @@ from collections.abc import Iterable, Iterator from contextlib import closing, contextmanager from dataclasses import dataclass from datetime import datetime -from enum import Enum +from enum import StrEnum from functools import cached_property from pathlib import Path from types import TracebackType @@ -101,13 +101,8 @@ from fixtures.utils import ( from .neon_api import NeonAPI, NeonApiEndpoint if TYPE_CHECKING: - from typing import ( - Any, - Callable, - Optional, - TypeVar, - Union, - ) + from collections.abc import Callable + from typing import Any, Self, TypeVar from fixtures.paths import SnapshotDirLocked @@ -338,10 +333,10 @@ class NeonEnvBuilder: top_output_dir: Path, test_output_dir: Path, combination, - test_overlay_dir: Optional[Path] = None, - pageserver_remote_storage: Optional[RemoteStorage] = None, + test_overlay_dir: Path | None = None, + pageserver_remote_storage: RemoteStorage | None = None, # toml that will be decomposed into `--config-override` flags during `pageserver --init` - pageserver_config_override: Optional[str | Callable[[dict[str, Any]], None]] = None, + pageserver_config_override: str | Callable[[dict[str, Any]], None] | None = None, num_safekeepers: int = 1, num_pageservers: int = 1, # Use non-standard SK ids to check for various parsing bugs @@ -349,16 +344,16 @@ class NeonEnvBuilder: # fsync is disabled by default to make the tests go faster safekeepers_enable_fsync: bool = False, auth_enabled: bool = False, - rust_log_override: Optional[str] = None, + rust_log_override: str | None = None, default_branch_name: str = DEFAULT_BRANCH_NAME, preserve_database_files: bool = False, - initial_tenant: Optional[TenantId] = None, - initial_timeline: Optional[TimelineId] = None, - pageserver_virtual_file_io_engine: Optional[str] = None, - 
pageserver_default_tenant_config_compaction_algorithm: Optional[dict[str, Any]] = None, - safekeeper_extra_opts: Optional[list[str]] = None, - storage_controller_port_override: Optional[int] = None, - pageserver_virtual_file_io_mode: Optional[str] = None, + initial_tenant: TenantId | None = None, + initial_timeline: TimelineId | None = None, + pageserver_virtual_file_io_engine: str | None = None, + pageserver_default_tenant_config_compaction_algorithm: dict[str, Any] | None = None, + safekeeper_extra_opts: list[str] | None = None, + storage_controller_port_override: int | None = None, + pageserver_virtual_file_io_mode: str | None = None, ): self.repo_dir = repo_dir self.rust_log_override = rust_log_override @@ -367,7 +362,7 @@ class NeonEnvBuilder: # Pageserver remote storage self.pageserver_remote_storage = pageserver_remote_storage # Safekeepers remote storage - self.safekeepers_remote_storage: Optional[RemoteStorage] = None + self.safekeepers_remote_storage: RemoteStorage | None = None self.run_id = run_id self.mock_s3_server: MockS3Server = mock_s3_server @@ -378,7 +373,7 @@ class NeonEnvBuilder: self.safekeepers_enable_fsync = safekeepers_enable_fsync self.auth_enabled = auth_enabled self.default_branch_name = default_branch_name - self.env: Optional[NeonEnv] = None + self.env: NeonEnv | None = None self.keep_remote_storage_contents: bool = True self.neon_binpath = neon_binpath self.neon_local_binpath = neon_binpath @@ -391,14 +386,14 @@ class NeonEnvBuilder: self.test_output_dir = test_output_dir self.test_overlay_dir = test_overlay_dir self.overlay_mounts_created_by_us: list[tuple[str, Path]] = [] - self.config_init_force: Optional[str] = None + self.config_init_force: str | None = None self.top_output_dir = top_output_dir - self.control_plane_compute_hook_api: Optional[str] = None - self.storage_controller_config: Optional[dict[Any, Any]] = None + self.control_plane_compute_hook_api: str | None = None + self.storage_controller_config: dict[Any, Any] | None 
= None - self.pageserver_virtual_file_io_engine: Optional[str] = pageserver_virtual_file_io_engine + self.pageserver_virtual_file_io_engine: str | None = pageserver_virtual_file_io_engine - self.pageserver_default_tenant_config_compaction_algorithm: Optional[dict[str, Any]] = ( + self.pageserver_default_tenant_config_compaction_algorithm: dict[str, Any] | None = ( pageserver_default_tenant_config_compaction_algorithm ) if self.pageserver_default_tenant_config_compaction_algorithm is not None: @@ -440,10 +435,10 @@ class NeonEnvBuilder: def init_start( self, - initial_tenant_conf: Optional[dict[str, Any]] = None, + initial_tenant_conf: dict[str, Any] | None = None, default_remote_storage_if_missing: bool = True, - initial_tenant_shard_count: Optional[int] = None, - initial_tenant_shard_stripe_size: Optional[int] = None, + initial_tenant_shard_count: int | None = None, + initial_tenant_shard_stripe_size: int | None = None, ) -> NeonEnv: """ Default way to create and start NeonEnv. Also creates the initial_tenant with root initial_timeline. @@ -781,8 +776,8 @@ class NeonEnvBuilder: self, kind: RemoteStorageKind, user: RemoteStorageUser, - bucket_name: Optional[str] = None, - bucket_region: Optional[str] = None, + bucket_name: str | None = None, + bucket_region: str | None = None, ) -> RemoteStorage: ret = kind.configure( self.repo_dir, @@ -840,14 +835,14 @@ class NeonEnvBuilder: if isinstance(x, S3Storage): x.do_cleanup() - def __enter__(self) -> NeonEnvBuilder: + def __enter__(self) -> Self: return self def __exit__( self, - exc_type: Optional[type[BaseException]], - exc_value: Optional[BaseException], - traceback: Optional[TracebackType], + exc_type: type[BaseException] | None, + exc_value: BaseException | None, + traceback: TracebackType | None, ): # Stop all the nodes. 
if self.env: @@ -1136,7 +1131,7 @@ class NeonEnv: force=config.config_init_force, ) - def start(self, timeout_in_seconds: Optional[int] = None): + def start(self, timeout_in_seconds: int | None = None): # Storage controller starts first, so that pageserver /re-attach calls don't # bounce through retries on startup self.storage_controller.start(timeout_in_seconds=timeout_in_seconds) @@ -1150,21 +1145,19 @@ class NeonEnv: with concurrent.futures.ThreadPoolExecutor( max_workers=2 + len(self.pageservers) + len(self.safekeepers) ) as executor: - futs.append( - executor.submit(lambda: self.broker.start() or None) - ) # The `or None` is for the linter + futs.append(executor.submit(lambda: self.broker.start())) for pageserver in self.pageservers: futs.append( executor.submit( - lambda ps=pageserver: ps.start(timeout_in_seconds=timeout_in_seconds) + lambda ps=pageserver: ps.start(timeout_in_seconds=timeout_in_seconds) # type: ignore[misc] ) ) for safekeeper in self.safekeepers: futs.append( executor.submit( - lambda sk=safekeeper: sk.start(timeout_in_seconds=timeout_in_seconds) + lambda sk=safekeeper: sk.start(timeout_in_seconds=timeout_in_seconds) # type: ignore[misc] ) ) @@ -1237,7 +1230,7 @@ class NeonEnv: ), "env.pageserver must only be used with single pageserver NeonEnv" return self.pageservers[0] - def get_pageserver(self, id: Optional[int]) -> NeonPageserver: + def get_pageserver(self, id: int | None) -> NeonPageserver: """ Look up a pageserver by its node ID. @@ -1254,7 +1247,7 @@ class NeonEnv: raise RuntimeError(f"Pageserver with ID {id} not found") - def get_tenant_pageserver(self, tenant_id: Union[TenantId, TenantShardId]): + def get_tenant_pageserver(self, tenant_id: TenantId | TenantShardId): """ Get the NeonPageserver where this tenant shard is currently attached, according to the storage controller. 
@@ -1316,12 +1309,12 @@ class NeonEnv: def create_tenant( self, - tenant_id: Optional[TenantId] = None, - timeline_id: Optional[TimelineId] = None, - conf: Optional[dict[str, Any]] = None, - shard_count: Optional[int] = None, - shard_stripe_size: Optional[int] = None, - placement_policy: Optional[str] = None, + tenant_id: TenantId | None = None, + timeline_id: TimelineId | None = None, + conf: dict[str, Any] | None = None, + shard_count: int | None = None, + shard_stripe_size: int | None = None, + placement_policy: str | None = None, set_default: bool = False, ) -> tuple[TenantId, TimelineId]: """ @@ -1343,7 +1336,7 @@ class NeonEnv: return tenant_id, timeline_id - def config_tenant(self, tenant_id: Optional[TenantId], conf: dict[str, str]): + def config_tenant(self, tenant_id: TenantId | None, conf: dict[str, str]): """ Update tenant config. """ @@ -1353,10 +1346,10 @@ class NeonEnv: def create_branch( self, new_branch_name: str = DEFAULT_BRANCH_NAME, - tenant_id: Optional[TenantId] = None, - ancestor_branch_name: Optional[str] = None, - ancestor_start_lsn: Optional[Lsn] = None, - new_timeline_id: Optional[TimelineId] = None, + tenant_id: TenantId | None = None, + ancestor_branch_name: str | None = None, + ancestor_start_lsn: Lsn | None = None, + new_timeline_id: TimelineId | None = None, ) -> TimelineId: new_timeline_id = new_timeline_id or TimelineId.generate() tenant_id = tenant_id or self.initial_tenant @@ -1370,8 +1363,8 @@ class NeonEnv: def create_timeline( self, new_branch_name: str, - tenant_id: Optional[TenantId] = None, - timeline_id: Optional[TimelineId] = None, + tenant_id: TenantId | None = None, + timeline_id: TimelineId | None = None, ) -> TimelineId: timeline_id = timeline_id or TimelineId.generate() tenant_id = tenant_id or self.initial_tenant @@ -1396,8 +1389,8 @@ def neon_simple_env( compatibility_pg_distrib_dir: Path, pg_version: PgVersion, pageserver_virtual_file_io_engine: str, - pageserver_default_tenant_config_compaction_algorithm: 
Optional[dict[str, Any]], - pageserver_virtual_file_io_mode: Optional[str], + pageserver_default_tenant_config_compaction_algorithm: dict[str, Any] | None, + pageserver_virtual_file_io_mode: str | None, ) -> Iterator[NeonEnv]: """ Simple Neon environment, with 1 safekeeper and 1 pageserver. No authentication, no fsync. @@ -1453,9 +1446,9 @@ def neon_env_builder( test_overlay_dir: Path, top_output_dir: Path, pageserver_virtual_file_io_engine: str, - pageserver_default_tenant_config_compaction_algorithm: Optional[dict[str, Any]], + pageserver_default_tenant_config_compaction_algorithm: dict[str, Any] | None, record_property: Callable[[str, object], None], - pageserver_virtual_file_io_mode: Optional[str], + pageserver_virtual_file_io_mode: str | None, ) -> Iterator[NeonEnvBuilder]: """ Fixture to create a Neon environment for test. @@ -1530,7 +1523,7 @@ class LogUtils: def log_contains( self, pattern: str, offset: None | LogCursor = None - ) -> Optional[tuple[str, LogCursor]]: + ) -> tuple[str, LogCursor] | None: """Check that the log contains a line that matches the given regex""" logfile = self.logfile if not logfile.exists(): @@ -1569,14 +1562,13 @@ class StorageControllerApiException(Exception): # See libs/pageserver_api/src/controller_api.rs # for the rust definitions of the enums below -# TODO: Replace with `StrEnum` when we upgrade to python 3.11 -class PageserverAvailability(str, Enum): +class PageserverAvailability(StrEnum): ACTIVE = "Active" UNAVAILABLE = "Unavailable" OFFLINE = "Offline" -class PageserverSchedulingPolicy(str, Enum): +class PageserverSchedulingPolicy(StrEnum): ACTIVE = "Active" DRAINING = "Draining" FILLING = "Filling" @@ -1584,7 +1576,7 @@ class PageserverSchedulingPolicy(str, Enum): PAUSE_FOR_RESTART = "PauseForRestart" -class StorageControllerLeadershipStatus(str, Enum): +class StorageControllerLeadershipStatus(StrEnum): LEADER = "leader" STEPPED_DOWN = "stepped_down" CANDIDATE = "candidate" @@ -1602,16 +1594,16 @@ class 
NeonStorageController(MetricsGetter, LogUtils): def start( self, - timeout_in_seconds: Optional[int] = None, - instance_id: Optional[int] = None, - base_port: Optional[int] = None, - ): + timeout_in_seconds: int | None = None, + instance_id: int | None = None, + base_port: int | None = None, + ) -> Self: assert not self.running self.env.neon_cli.storage_controller_start(timeout_in_seconds, instance_id, base_port) self.running = True return self - def stop(self, immediate: bool = False) -> NeonStorageController: + def stop(self, immediate: bool = False) -> Self: if self.running: self.env.neon_cli.storage_controller_stop(immediate) self.running = False @@ -1673,7 +1665,7 @@ class NeonStorageController(MetricsGetter, LogUtils): return resp - def headers(self, scope: Optional[TokenScope]) -> dict[str, str]: + def headers(self, scope: TokenScope | None) -> dict[str, str]: headers = {} if self.auth_enabled and scope is not None: jwt_token = self.env.auth_keys.generate_token(scope=scope) @@ -1711,9 +1703,9 @@ class NeonStorageController(MetricsGetter, LogUtils): def attach_hook_issue( self, - tenant_shard_id: Union[TenantId, TenantShardId], + tenant_shard_id: TenantId | TenantShardId, pageserver_id: int, - generation_override: Optional[int] = None, + generation_override: int | None = None, ) -> int: body = {"tenant_shard_id": str(tenant_shard_id), "node_id": pageserver_id} if generation_override is not None: @@ -1729,7 +1721,7 @@ class NeonStorageController(MetricsGetter, LogUtils): assert isinstance(gen, int) return gen - def attach_hook_drop(self, tenant_shard_id: Union[TenantId, TenantShardId]): + def attach_hook_drop(self, tenant_shard_id: TenantId | TenantShardId): self.request( "POST", f"{self.api}/debug/v1/attach-hook", @@ -1737,7 +1729,7 @@ class NeonStorageController(MetricsGetter, LogUtils): headers=self.headers(TokenScope.ADMIN), ) - def inspect(self, tenant_shard_id: Union[TenantId, TenantShardId]) -> Optional[tuple[int, int]]: + def inspect(self, 
tenant_shard_id: TenantId | TenantShardId) -> tuple[int, int] | None: """ :return: 2-tuple of (generation, pageserver id), or None if unknown """ @@ -1857,10 +1849,10 @@ class NeonStorageController(MetricsGetter, LogUtils): def tenant_create( self, tenant_id: TenantId, - shard_count: Optional[int] = None, - shard_stripe_size: Optional[int] = None, - tenant_config: Optional[dict[Any, Any]] = None, - placement_policy: Optional[Union[dict[Any, Any], str]] = None, + shard_count: int | None = None, + shard_stripe_size: int | None = None, + tenant_config: dict[Any, Any] | None = None, + placement_policy: dict[Any, Any] | str | None = None, ): """ Use this rather than pageserver_api() when you need to include shard parameters @@ -1891,6 +1883,20 @@ class NeonStorageController(MetricsGetter, LogUtils): response.raise_for_status() log.info(f"tenant_create success: {response.json()}") + def timeline_create( + self, + tenant_id: TenantId, + body: dict[str, Any], + ): + response = self.request( + "POST", + f"{self.api}/v1/tenant/{tenant_id}/timeline", + json=body, + headers=self.headers(TokenScope.PAGE_SERVER_API), + ) + response.raise_for_status() + log.info(f"timeline_create success: {response.json()}") + def locate(self, tenant_id: TenantId) -> list[dict[str, Any]]: """ :return: list of {"shard_id": "", "node_id": int, "listen_pg_addr": str, "listen_pg_port": int, "listen_http_addr": str, "listen_http_port": int} @@ -1941,7 +1947,7 @@ class NeonStorageController(MetricsGetter, LogUtils): return response.json() def tenant_shard_split( - self, tenant_id: TenantId, shard_count: int, shard_stripe_size: Optional[int] = None + self, tenant_id: TenantId, shard_count: int, shard_stripe_size: int | None = None ) -> list[TenantShardId]: response = self.request( "PUT", @@ -2039,8 +2045,8 @@ class NeonStorageController(MetricsGetter, LogUtils): def poll_node_status( self, node_id: int, - desired_availability: Optional[PageserverAvailability], - desired_scheduling_policy: 
Optional[PageserverSchedulingPolicy], + desired_availability: PageserverAvailability | None, + desired_scheduling_policy: PageserverSchedulingPolicy | None, max_attempts: int, backoff: float, ): @@ -2259,7 +2265,7 @@ class NeonStorageController(MetricsGetter, LogUtils): json=body, ) - def get_safekeeper(self, id: int) -> Optional[dict[str, Any]]: + def get_safekeeper(self, id: int) -> dict[str, Any] | None: try: response = self.request( "GET", @@ -2285,14 +2291,14 @@ class NeonStorageController(MetricsGetter, LogUtils): response.raise_for_status() return [TenantShardId.parse(tid) for tid in response.json()["updated"]] - def __enter__(self) -> NeonStorageController: + def __enter__(self) -> Self: return self def __exit__( self, - exc_type: Optional[type[BaseException]], - exc: Optional[BaseException], - tb: Optional[TracebackType], + exc_type: type[BaseException] | None, + exc: BaseException | None, + tb: TracebackType | None, ): self.stop(immediate=True) @@ -2304,10 +2310,10 @@ class NeonProxiedStorageController(NeonStorageController): def start( self, - timeout_in_seconds: Optional[int] = None, - instance_id: Optional[int] = None, - base_port: Optional[int] = None, - ): + timeout_in_seconds: int | None = None, + instance_id: int | None = None, + base_port: int | None = None, + ) -> Self: assert instance_id is not None and base_port is not None self.env.neon_cli.storage_controller_start(timeout_in_seconds, instance_id, base_port) @@ -2317,7 +2323,7 @@ class NeonProxiedStorageController(NeonStorageController): return self def stop_instance( - self, immediate: bool = False, instance_id: Optional[int] = None + self, immediate: bool = False, instance_id: int | None = None ) -> NeonStorageController: assert instance_id in self.instances if self.instances[instance_id]["running"]: @@ -2327,7 +2333,7 @@ class NeonProxiedStorageController(NeonStorageController): self.running = any(meta["running"] for meta in self.instances.values()) return self - def stop(self, immediate: 
bool = False) -> NeonStorageController: + def stop(self, immediate: bool = False) -> Self: for iid, details in self.instances.items(): if details["running"]: self.env.neon_cli.storage_controller_stop(immediate, iid) @@ -2346,7 +2352,7 @@ class NeonProxiedStorageController(NeonStorageController): def log_contains( self, pattern: str, offset: None | LogCursor = None - ) -> Optional[tuple[str, LogCursor]]: + ) -> tuple[str, LogCursor] | None: raise NotImplementedError() @@ -2393,8 +2399,8 @@ class NeonPageserver(PgProtocol, LogUtils): def timeline_dir( self, - tenant_shard_id: Union[TenantId, TenantShardId], - timeline_id: Optional[TimelineId] = None, + tenant_shard_id: TenantId | TenantShardId, + timeline_id: TimelineId | None = None, ) -> Path: """Get a timeline directory's path based on the repo directory of the test environment""" if timeline_id is None: @@ -2403,7 +2409,7 @@ class NeonPageserver(PgProtocol, LogUtils): def tenant_dir( self, - tenant_shard_id: Optional[Union[TenantId, TenantShardId]] = None, + tenant_shard_id: TenantId | TenantShardId | None = None, ) -> Path: """Get a tenant directory's path based on the repo directory of the test environment""" if tenant_shard_id is None: @@ -2447,9 +2453,9 @@ class NeonPageserver(PgProtocol, LogUtils): def start( self, - extra_env_vars: Optional[dict[str, str]] = None, - timeout_in_seconds: Optional[int] = None, - ) -> NeonPageserver: + extra_env_vars: dict[str, str] | None = None, + timeout_in_seconds: int | None = None, + ) -> Self: """ Start the page server. `overrides` allows to add some config to this pageserver start. @@ -2484,7 +2490,7 @@ class NeonPageserver(PgProtocol, LogUtils): return self - def stop(self, immediate: bool = False) -> NeonPageserver: + def stop(self, immediate: bool = False) -> Self: """ Stop the page server. Returns self. 
@@ -2497,7 +2503,7 @@ class NeonPageserver(PgProtocol, LogUtils): def restart( self, immediate: bool = False, - timeout_in_seconds: Optional[int] = None, + timeout_in_seconds: int | None = None, ): """ High level wrapper for restart: restarts the process, and waits for @@ -2532,14 +2538,14 @@ class NeonPageserver(PgProtocol, LogUtils): wait_until(20, 0.5, complete) - def __enter__(self) -> NeonPageserver: + def __enter__(self) -> Self: return self def __exit__( self, - exc_type: Optional[type[BaseException]], - exc: Optional[BaseException], - tb: Optional[TracebackType], + exc_type: type[BaseException] | None, + exc: BaseException | None, + tb: TracebackType | None, ): self.stop(immediate=True) @@ -2548,7 +2554,7 @@ class NeonPageserver(PgProtocol, LogUtils): pytest.skip("pageserver was built without 'testing' feature") def http_client( - self, auth_token: Optional[str] = None, retries: Optional[Retry] = None + self, auth_token: str | None = None, retries: Retry | None = None ) -> PageserverHttpClient: return PageserverHttpClient( port=self.service_port.http, @@ -2585,7 +2591,7 @@ class NeonPageserver(PgProtocol, LogUtils): self, tenant_id: TenantId, config: None | dict[str, Any] = None, - generation: Optional[int] = None, + generation: int | None = None, override_storage_controller_generation: bool = False, ): """ @@ -2619,7 +2625,7 @@ class NeonPageserver(PgProtocol, LogUtils): return client.tenant_location_conf(tenant_id, config, **kwargs) def read_tenant_location_conf( - self, tenant_shard_id: Union[TenantId, TenantShardId] + self, tenant_shard_id: TenantId | TenantShardId ) -> dict[str, Any]: path = self.tenant_dir(tenant_shard_id) / "config-v1" log.info(f"Reading location conf from {path}") @@ -2634,9 +2640,9 @@ class NeonPageserver(PgProtocol, LogUtils): def tenant_create( self, tenant_id: TenantId, - conf: Optional[dict[str, Any]] = None, - auth_token: Optional[str] = None, - generation: Optional[int] = None, + conf: dict[str, Any] | None = None, + 
auth_token: str | None = None, + generation: int | None = None, ) -> TenantId: if generation is None: generation = self.env.storage_controller.attach_hook_issue(tenant_id, self.id) @@ -2656,7 +2662,7 @@ class NeonPageserver(PgProtocol, LogUtils): return tenant_id def list_layers( - self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId + self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId ) -> list[Path]: """ Inspect local storage on a pageserver to discover which layer files are present. @@ -2749,7 +2755,7 @@ class PgBin: if "/" not in str(command[0]): command[0] = str(self.pg_bin_path / command[0]) - def _build_env(self, env_add: Optional[Env]) -> Env: + def _build_env(self, env_add: Env | None) -> Env: if env_add is None: return self.env env = self.env.copy() @@ -2766,8 +2772,8 @@ class PgBin: def run_nonblocking( self, command: list[str], - env: Optional[Env] = None, - cwd: Optional[Union[str, Path]] = None, + env: Env | None = None, + cwd: str | Path | None = None, ) -> subprocess.Popen[Any]: """ Run one of the postgres binaries, not waiting for it to finish @@ -2790,8 +2796,8 @@ class PgBin: def run( self, command: list[str], - env: Optional[Env] = None, - cwd: Optional[Union[str, Path]] = None, + env: Env | None = None, + cwd: str | Path | None = None, ) -> None: """ Run one of the postgres binaries, waiting for it to finish @@ -2813,8 +2819,8 @@ class PgBin: def run_capture( self, command: list[str], - env: Optional[Env] = None, - cwd: Optional[str] = None, + env: Env | None = None, + cwd: str | None = None, with_command_header=True, **popen_kwargs: Any, ) -> str: @@ -2941,7 +2947,7 @@ class VanillaPostgres(PgProtocol): conf_file.write("\n".join(hba) + "\n") conf_file.write(data) - def start(self, log_path: Optional[str] = None): + def start(self, log_path: str | None = None): assert not self.running self.running = True @@ -2960,14 +2966,14 @@ class VanillaPostgres(PgProtocol): """Return size of pgdatadir subdirectory in 
bytes.""" return get_dir_size(self.pgdatadir / subdir) - def __enter__(self) -> VanillaPostgres: + def __enter__(self) -> Self: return self def __exit__( self, - exc_type: Optional[type[BaseException]], - exc: Optional[BaseException], - tb: Optional[TracebackType], + exc_type: type[BaseException] | None, + exc: BaseException | None, + tb: TracebackType | None, ): if self.running: self.stop() @@ -3009,14 +3015,14 @@ class RemotePostgres(PgProtocol): # See https://www.postgresql.org/docs/14/functions-admin.html#FUNCTIONS-ADMIN-GENFILE raise Exception("cannot get size of a Postgres instance") - def __enter__(self) -> RemotePostgres: + def __enter__(self) -> Self: return self def __exit__( self, - exc_type: Optional[type[BaseException]], - exc: Optional[BaseException], - tb: Optional[TracebackType], + exc_type: type[BaseException] | None, + exc: BaseException | None, + tb: TracebackType | None, ): # do nothing pass @@ -3092,7 +3098,7 @@ class PSQL: self.path = full_path self.database_url = f"postgres://{host}:{port}/main?options=project%3Dgeneric-project-name" - async def run(self, query: Optional[str] = None) -> asyncio.subprocess.Process: + async def run(self, query: str | None = None) -> asyncio.subprocess.Process: run_args = [self.path, "--no-psqlrc", "--quiet", "--tuples-only", self.database_url] if query is not None: run_args += ["--command", query] @@ -3138,7 +3144,7 @@ class NeonProxy(PgProtocol): """All auth backends must inherit from this class""" @property - def default_conn_url(self) -> Optional[str]: + def default_conn_url(self) -> str | None: return None @abc.abstractmethod @@ -3155,7 +3161,7 @@ class NeonProxy(PgProtocol): ] class Console(AuthBackend): - def __init__(self, endpoint: str, fixed_rate_limit: Optional[int] = None): + def __init__(self, endpoint: str, fixed_rate_limit: int | None = None): self.endpoint = endpoint self.fixed_rate_limit = fixed_rate_limit @@ -3183,7 +3189,7 @@ class NeonProxy(PgProtocol): pg_conn_url: str @property - def 
default_conn_url(self) -> Optional[str]: + def default_conn_url(self) -> str | None: return self.pg_conn_url def extra_args(self) -> list[str]: @@ -3202,8 +3208,8 @@ class NeonProxy(PgProtocol): mgmt_port: int, external_http_port: int, auth_backend: NeonProxy.AuthBackend, - metric_collection_endpoint: Optional[str] = None, - metric_collection_interval: Optional[str] = None, + metric_collection_endpoint: str | None = None, + metric_collection_interval: str | None = None, ): host = "127.0.0.1" domain = "proxy.localtest.me" # resolves to 127.0.0.1 @@ -3221,9 +3227,9 @@ class NeonProxy(PgProtocol): self.metric_collection_endpoint = metric_collection_endpoint self.metric_collection_interval = metric_collection_interval self.http_timeout_seconds = 15 - self._popen: Optional[subprocess.Popen[bytes]] = None + self._popen: subprocess.Popen[bytes] | None = None - def start(self) -> NeonProxy: + def start(self) -> Self: assert self._popen is None # generate key of it doesn't exist @@ -3351,14 +3357,14 @@ class NeonProxy(PgProtocol): log.info(f"SUCCESS, found auth url: {line}") return line - def __enter__(self) -> NeonProxy: + def __enter__(self) -> Self: return self def __exit__( self, - exc_type: Optional[type[BaseException]], - exc: Optional[BaseException], - tb: Optional[TracebackType], + exc_type: type[BaseException] | None, + exc: BaseException | None, + tb: TracebackType | None, ): if self._popen is not None: self._popen.terminate() @@ -3439,9 +3445,9 @@ class NeonAuthBroker: self.mgmt_port = mgmt_port self.auth_backend = auth_backend self.http_timeout_seconds = 15 - self._popen: Optional[subprocess.Popen[bytes]] = None + self._popen: subprocess.Popen[bytes] | None = None - def start(self) -> NeonAuthBroker: + def start(self) -> Self: assert self._popen is None # generate key of it doesn't exist @@ -3510,14 +3516,14 @@ class NeonAuthBroker: request_result = requests.get(f"http://{self.host}:{self.http_port}/metrics") return request_result.text - def __enter__(self) -> 
NeonAuthBroker: + def __enter__(self) -> Self: return self def __exit__( self, - _exc_type: Optional[type[BaseException]], - _exc_value: Optional[BaseException], - _traceback: Optional[TracebackType], + _exc_type: type[BaseException] | None, + _exc_value: BaseException | None, + _traceback: TracebackType | None, ): if self._popen is not None: self._popen.terminate() @@ -3673,9 +3679,9 @@ class Endpoint(PgProtocol, LogUtils): ): super().__init__(host="localhost", port=pg_port, user="cloud_admin", dbname="postgres") self.env = env - self.branch_name: Optional[str] = None # dubious - self.endpoint_id: Optional[str] = None # dubious, see asserts below - self.pgdata_dir: Optional[Path] = None # Path to computenode PGDATA + self.branch_name: str | None = None # dubious + self.endpoint_id: str | None = None # dubious, see asserts below + self.pgdata_dir: Path | None = None # Path to computenode PGDATA self.tenant_id = tenant_id self.pg_port = pg_port self.http_port = http_port @@ -3692,7 +3698,7 @@ class Endpoint(PgProtocol, LogUtils): self._running = threading.Semaphore(0) def http_client( - self, auth_token: Optional[str] = None, retries: Optional[Retry] = None + self, auth_token: str | None = None, retries: Retry | None = None ) -> EndpointHttpClient: return EndpointHttpClient( port=self.http_port, @@ -3701,13 +3707,13 @@ class Endpoint(PgProtocol, LogUtils): def create( self, branch_name: str, - endpoint_id: Optional[str] = None, + endpoint_id: str | None = None, hot_standby: bool = False, - lsn: Optional[Lsn] = None, - config_lines: Optional[list[str]] = None, - pageserver_id: Optional[int] = None, + lsn: Lsn | None = None, + config_lines: list[str] | None = None, + pageserver_id: int | None = None, allow_multiple: bool = False, - ) -> Endpoint: + ) -> Self: """ Create a new Postgres endpoint. Returns self. 
@@ -3748,12 +3754,12 @@ class Endpoint(PgProtocol, LogUtils): def start( self, - remote_ext_config: Optional[str] = None, - pageserver_id: Optional[int] = None, - safekeepers: Optional[list[int]] = None, + remote_ext_config: str | None = None, + pageserver_id: int | None = None, + safekeepers: list[int] | None = None, allow_multiple: bool = False, - basebackup_request_tries: Optional[int] = None, - ) -> Endpoint: + basebackup_request_tries: int | None = None, + ) -> Self: """ Start the Postgres instance. Returns self. @@ -3800,7 +3806,7 @@ class Endpoint(PgProtocol, LogUtils): """Path to the postgresql.conf in the endpoint directory (not the one in pgdata)""" return self.endpoint_path() / "postgresql.conf" - def config(self, lines: list[str]) -> Endpoint: + def config(self, lines: list[str]) -> Self: """ Add lines to postgresql.conf. Lines should be an array of valid postgresql.conf rows. @@ -3828,9 +3834,7 @@ class Endpoint(PgProtocol, LogUtils): def is_running(self): return self._running._value > 0 - def reconfigure( - self, pageserver_id: Optional[int] = None, safekeepers: Optional[list[int]] = None - ): + def reconfigure(self, pageserver_id: int | None = None, safekeepers: list[int] | None = None): assert self.endpoint_id is not None # If `safekeepers` is not None, they are remember them as active and use # in the following commands. @@ -3877,8 +3881,8 @@ class Endpoint(PgProtocol, LogUtils): def stop( self, mode: str = "fast", - sks_wait_walreceiver_gone: Optional[tuple[list[Safekeeper], TimelineId]] = None, - ) -> Endpoint: + sks_wait_walreceiver_gone: tuple[list[Safekeeper], TimelineId] | None = None, + ) -> Self: """ Stop the Postgres instance if it's running. @@ -3912,7 +3916,7 @@ class Endpoint(PgProtocol, LogUtils): return self - def stop_and_destroy(self, mode: str = "immediate") -> Endpoint: + def stop_and_destroy(self, mode: str = "immediate") -> Self: """ Stop the Postgres instance, then destroy the endpoint. Returns self. 
@@ -3931,15 +3935,15 @@ class Endpoint(PgProtocol, LogUtils): def create_start( self, branch_name: str, - endpoint_id: Optional[str] = None, + endpoint_id: str | None = None, hot_standby: bool = False, - lsn: Optional[Lsn] = None, - config_lines: Optional[list[str]] = None, - remote_ext_config: Optional[str] = None, - pageserver_id: Optional[int] = None, + lsn: Lsn | None = None, + config_lines: list[str] | None = None, + remote_ext_config: str | None = None, + pageserver_id: int | None = None, allow_multiple: bool = False, - basebackup_request_tries: Optional[int] = None, - ) -> Endpoint: + basebackup_request_tries: int | None = None, + ) -> Self: """ Create an endpoint, apply config, and start Postgres. Returns self. @@ -3962,14 +3966,14 @@ class Endpoint(PgProtocol, LogUtils): return self - def __enter__(self) -> Endpoint: + def __enter__(self) -> Self: return self def __exit__( self, - exc_type: Optional[type[BaseException]], - exc: Optional[BaseException], - tb: Optional[TracebackType], + exc_type: type[BaseException] | None, + exc: BaseException | None, + tb: TracebackType | None, ): self.stop() @@ -3980,7 +3984,7 @@ class Endpoint(PgProtocol, LogUtils): assert self.pgdata_dir is not None # please mypy return get_dir_size(self.pgdata_dir / "pg_wal") / 1024 / 1024 - def clear_shared_buffers(self, cursor: Optional[Any] = None): + def clear_shared_buffers(self, cursor: Any | None = None): """ Best-effort way to clear postgres buffers. Pinned buffers will not be 'cleared.' 
@@ -4003,14 +4007,14 @@ class EndpointFactory: def create_start( self, branch_name: str, - endpoint_id: Optional[str] = None, - tenant_id: Optional[TenantId] = None, - lsn: Optional[Lsn] = None, + endpoint_id: str | None = None, + tenant_id: TenantId | None = None, + lsn: Lsn | None = None, hot_standby: bool = False, - config_lines: Optional[list[str]] = None, - remote_ext_config: Optional[str] = None, - pageserver_id: Optional[int] = None, - basebackup_request_tries: Optional[int] = None, + config_lines: list[str] | None = None, + remote_ext_config: str | None = None, + pageserver_id: int | None = None, + basebackup_request_tries: int | None = None, ) -> Endpoint: ep = Endpoint( self.env, @@ -4035,12 +4039,12 @@ class EndpointFactory: def create( self, branch_name: str, - endpoint_id: Optional[str] = None, - tenant_id: Optional[TenantId] = None, - lsn: Optional[Lsn] = None, + endpoint_id: str | None = None, + tenant_id: TenantId | None = None, + lsn: Lsn | None = None, hot_standby: bool = False, - config_lines: Optional[list[str]] = None, - pageserver_id: Optional[int] = None, + config_lines: list[str] | None = None, + pageserver_id: int | None = None, ) -> Endpoint: ep = Endpoint( self.env, @@ -4063,7 +4067,7 @@ class EndpointFactory: pageserver_id=pageserver_id, ) - def stop_all(self, fail_on_error=True) -> EndpointFactory: + def stop_all(self, fail_on_error=True) -> Self: exception = None for ep in self.endpoints: try: @@ -4078,7 +4082,7 @@ class EndpointFactory: return self def new_replica( - self, origin: Endpoint, endpoint_id: str, config_lines: Optional[list[str]] = None + self, origin: Endpoint, endpoint_id: str, config_lines: list[str] | None = None ): branch_name = origin.branch_name assert origin in self.endpoints @@ -4094,7 +4098,7 @@ class EndpointFactory: ) def new_replica_start( - self, origin: Endpoint, endpoint_id: str, config_lines: Optional[list[str]] = None + self, origin: Endpoint, endpoint_id: str, config_lines: list[str] | None = None ): 
branch_name = origin.branch_name assert origin in self.endpoints @@ -4132,7 +4136,7 @@ class Safekeeper(LogUtils): port: SafekeeperPort, id: int, running: bool = False, - extra_opts: Optional[list[str]] = None, + extra_opts: list[str] | None = None, ): self.env = env self.port = port @@ -4158,8 +4162,8 @@ class Safekeeper(LogUtils): self.extra_opts = extra_opts def start( - self, extra_opts: Optional[list[str]] = None, timeout_in_seconds: Optional[int] = None - ) -> Safekeeper: + self, extra_opts: list[str] | None = None, timeout_in_seconds: int | None = None + ) -> Self: if extra_opts is None: # Apply either the extra_opts passed in, or the ones from our constructor: we do not merge the two. extra_opts = self.extra_opts @@ -4194,7 +4198,7 @@ class Safekeeper(LogUtils): break # success return self - def stop(self, immediate: bool = False) -> Safekeeper: + def stop(self, immediate: bool = False) -> Self: self.env.neon_cli.safekeeper_stop(self.id, immediate) self.running = False return self @@ -4238,7 +4242,7 @@ class Safekeeper(LogUtils): return res def http_client( - self, auth_token: Optional[str] = None, gen_sk_wide_token: bool = True + self, auth_token: str | None = None, gen_sk_wide_token: bool = True ) -> SafekeeperHttpClient: """ When auth_token is None but gen_sk_wide is True creates safekeeper wide @@ -4371,14 +4375,14 @@ class NeonBroker(LogUtils): def start( self, - timeout_in_seconds: Optional[int] = None, - ): + timeout_in_seconds: int | None = None, + ) -> Self: assert not self.running self.env.neon_cli.storage_broker_start(timeout_in_seconds) self.running = True return self - def stop(self): + def stop(self) -> Self: if self.running: self.env.neon_cli.storage_broker_stop() self.running = False @@ -4394,8 +4398,7 @@ class NeonBroker(LogUtils): assert_no_errors(self.logfile, "storage_controller", []) -# TODO: Replace with `StrEnum` when we upgrade to python 3.11 -class NodeKind(str, Enum): +class NodeKind(StrEnum): PAGESERVER = "pageserver" SAFEKEEPER = 
"safekeeper" @@ -4406,7 +4409,7 @@ class StorageScrubber: self.log_dir = log_dir def scrubber_cli( - self, args: list[str], timeout, extra_env: Optional[dict[str, str]] = None + self, args: list[str], timeout, extra_env: dict[str, str] | None = None ) -> str: assert isinstance(self.env.pageserver_remote_storage, S3Storage) s3_storage = self.env.pageserver_remote_storage @@ -4469,8 +4472,8 @@ class StorageScrubber: self, post_to_storage_controller: bool = False, node_kind: NodeKind = NodeKind.PAGESERVER, - timeline_lsns: Optional[list[dict[str, Any]]] = None, - extra_env: Optional[dict[str, str]] = None, + timeline_lsns: list[dict[str, Any]] | None = None, + extra_env: dict[str, str] | None = None, ) -> tuple[bool, Any]: """ Returns the health status and the metadata summary. @@ -4504,8 +4507,8 @@ class StorageScrubber: def pageserver_physical_gc( self, min_age_secs: int, - tenant_ids: Optional[list[TenantId]] = None, - mode: Optional[str] = None, + tenant_ids: list[TenantId] | None = None, + mode: str | None = None, ): args = ["pageserver-physical-gc", "--min-age", f"{min_age_secs}s"] @@ -4619,7 +4622,7 @@ def check_restored_datadir_content( test_output_dir: Path, env: NeonEnv, endpoint: Endpoint, - ignored_files: Optional[list[str]] = None, + ignored_files: list[str] | None = None, ): pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version) @@ -4721,7 +4724,7 @@ def logical_replication_sync(subscriber: PgProtocol, publisher: PgProtocol) -> L def tenant_get_shards( - env: NeonEnv, tenant_id: TenantId, pageserver_id: Optional[int] = None + env: NeonEnv, tenant_id: TenantId, pageserver_id: int | None = None ) -> list[tuple[TenantShardId, NeonPageserver]]: """ Helper for when you want to talk to one or more pageservers, and the @@ -4784,8 +4787,8 @@ def wait_for_last_flush_lsn( endpoint: Endpoint, tenant: TenantId, timeline: TimelineId, - pageserver_id: Optional[int] = None, - auth_token: Optional[str] = None, + pageserver_id: int | None = None, + 
auth_token: str | None = None, ) -> Lsn: """Wait for pageserver to catch up the latest flush LSN, returns the last observed lsn.""" @@ -4814,7 +4817,7 @@ def flush_ep_to_pageserver( ep: Endpoint, tenant: TenantId, timeline: TimelineId, - pageserver_id: Optional[int] = None, + pageserver_id: int | None = None, ) -> Lsn: """ Stop endpoint and wait until all committed WAL reaches the pageserver @@ -4857,7 +4860,7 @@ def wait_for_wal_insert_lsn( endpoint: Endpoint, tenant: TenantId, timeline: TimelineId, - pageserver_id: Optional[int] = None, + pageserver_id: int | None = None, ) -> Lsn: """Wait for pageserver to catch up the latest flush LSN, returns the last observed lsn.""" last_flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_insert_lsn()")[0][0]) @@ -4878,7 +4881,7 @@ def fork_at_current_lsn( endpoint: Endpoint, new_branch_name: str, ancestor_branch_name: str, - tenant_id: Optional[TenantId] = None, + tenant_id: TenantId | None = None, ) -> TimelineId: """ Create new branch at the last LSN of an existing branch. 
@@ -4951,8 +4954,9 @@ def last_flush_lsn_upload( endpoint: Endpoint, tenant_id: TenantId, timeline_id: TimelineId, - pageserver_id: Optional[int] = None, - auth_token: Optional[str] = None, + pageserver_id: int | None = None, + auth_token: str | None = None, + wait_until_uploaded: bool = True, ) -> Lsn: """ Wait for pageserver to catch to the latest flush LSN of given endpoint, @@ -4966,7 +4970,9 @@ def last_flush_lsn_upload( for tenant_shard_id, pageserver in shards: ps_http = pageserver.http_client(auth_token=auth_token) wait_for_last_record_lsn(ps_http, tenant_shard_id, timeline_id, last_flush_lsn) - ps_http.timeline_checkpoint(tenant_shard_id, timeline_id, wait_until_uploaded=True) + ps_http.timeline_checkpoint( + tenant_shard_id, timeline_id, wait_until_uploaded=wait_until_uploaded + ) return last_flush_lsn @@ -4987,10 +4993,11 @@ def generate_uploads_and_deletions( env: NeonEnv, *, init: bool = True, - tenant_id: Optional[TenantId] = None, - timeline_id: Optional[TimelineId] = None, - data: Optional[str] = None, + tenant_id: TenantId | None = None, + timeline_id: TimelineId | None = None, + data: str | None = None, pageserver: NeonPageserver, + wait_until_uploaded: bool = True, ): """ Using the environment's default tenant + timeline, generate a load pattern @@ -5013,7 +5020,12 @@ def generate_uploads_and_deletions( if init: endpoint.safe_psql("CREATE TABLE foo (id INTEGER PRIMARY KEY, val text)") last_flush_lsn_upload( - env, endpoint, tenant_id, timeline_id, pageserver_id=pageserver.id + env, + endpoint, + tenant_id, + timeline_id, + pageserver_id=pageserver.id, + wait_until_uploaded=wait_until_uploaded, ) def churn(data): @@ -5036,7 +5048,12 @@ def generate_uploads_and_deletions( # in a state where there are "future layers" in remote storage that will generate deletions # after a restart. 
last_flush_lsn_upload( - env, endpoint, tenant_id, timeline_id, pageserver_id=pageserver.id + env, + endpoint, + tenant_id, + timeline_id, + pageserver_id=pageserver.id, + wait_until_uploaded=wait_until_uploaded, ) # Compaction should generate some GC-elegible layers @@ -5052,4 +5069,4 @@ def generate_uploads_and_deletions( # background ingest, no more uploads pending, and therefore no non-determinism # in subsequent actions like pageserver restarts. flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id, pageserver.id) - ps_http.timeline_checkpoint(tenant_id, timeline_id, wait_until_uploaded=True) + ps_http.timeline_checkpoint(tenant_id, timeline_id, wait_until_uploaded=wait_until_uploaded) diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index d05704c8e0..5059039678 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -25,8 +25,14 @@ def scan_pageserver_log_for_errors( # It's an ERROR or WARN. Is it in the allow-list? 
for a in allowed_errors: - if re.match(a, line): - break + try: + if re.match(a, line): + break + # We can switch `re.error` with `re.PatternError` after 3.13 + # https://docs.python.org/3/library/re.html#re.PatternError + except re.error: + print(f"Invalid regex: '{a}'", file=sys.stderr) + raise else: errors.append((lineno, line)) return errors diff --git a/test_runner/fixtures/pageserver/common_types.py b/test_runner/fixtures/pageserver/common_types.py index 2319701e0b..0e068db593 100644 --- a/test_runner/fixtures/pageserver/common_types.py +++ b/test_runner/fixtures/pageserver/common_types.py @@ -2,7 +2,7 @@ from __future__ import annotations import re from dataclasses import dataclass -from typing import TYPE_CHECKING, Union +from typing import TYPE_CHECKING from fixtures.common_types import KEY_MAX, KEY_MIN, Key, Lsn @@ -46,7 +46,7 @@ class DeltaLayerName: return ret -LayerName = Union[ImageLayerName, DeltaLayerName] +LayerName = ImageLayerName | DeltaLayerName class InvalidFileName(Exception): diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index d1a9b5921a..4cf3ece396 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -1,24 +1,32 @@ from __future__ import annotations +import dataclasses +import json +import random +import string import time from collections import defaultdict from dataclasses import dataclass from datetime import datetime -from typing import TYPE_CHECKING, Any +from typing import Any import requests from requests.adapters import HTTPAdapter from urllib3.util.retry import Retry -from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineArchivalState, TimelineId +from fixtures.common_types import ( + Id, + Lsn, + TenantId, + TenantShardId, + TimelineArchivalState, + TimelineId, +) from fixtures.log_helper import log from fixtures.metrics import Metrics, MetricsGetter, parse_metrics from fixtures.pg_version import PgVersion from 
fixtures.utils import Fn -if TYPE_CHECKING: - from typing import Optional, Union - class PageserverApiException(Exception): def __init__(self, message, status_code: int): @@ -27,6 +35,69 @@ class PageserverApiException(Exception): self.status_code = status_code +@dataclass +class ImportPgdataIdemptencyKey: + key: str + + @staticmethod + def random() -> ImportPgdataIdemptencyKey: + return ImportPgdataIdemptencyKey( + "".join(random.choices(string.ascii_letters + string.digits, k=20)) + ) + + +@dataclass +class LocalFs: + path: str + + +@dataclass +class AwsS3: + region: str + bucket: str + key: str + + +@dataclass +class ImportPgdataLocation: + LocalFs: None | LocalFs = None + AwsS3: None | AwsS3 = None + + +@dataclass +class TimelineCreateRequestModeImportPgdata: + location: ImportPgdataLocation + idempotency_key: ImportPgdataIdemptencyKey + + +@dataclass +class TimelineCreateRequestMode: + Branch: None | dict[str, Any] = None + Bootstrap: None | dict[str, Any] = None + ImportPgdata: None | TimelineCreateRequestModeImportPgdata = None + + +@dataclass +class TimelineCreateRequest: + new_timeline_id: TimelineId + mode: TimelineCreateRequestMode + + def to_json(self) -> str: + class EnhancedJSONEncoder(json.JSONEncoder): + def default(self, o): + if dataclasses.is_dataclass(o) and not isinstance(o, type): + return dataclasses.asdict(o) + elif isinstance(o, Id): + return o.id.hex() + return super().default(o) + + # mode is flattened + this = dataclasses.asdict(self) + mode = this.pop("mode") + this.update(mode) + return json.dumps(self, cls=EnhancedJSONEncoder) + + class TimelineCreate406(PageserverApiException): def __init__(self, res: requests.Response): assert res.status_code == 406 @@ -43,7 +114,7 @@ class TimelineCreate409(PageserverApiException): class InMemoryLayerInfo: kind: str lsn_start: str - lsn_end: Optional[str] + lsn_end: str | None @classmethod def from_json(cls, d: dict[str, Any]) -> InMemoryLayerInfo: @@ -60,10 +131,10 @@ class HistoricLayerInfo: 
layer_file_name: str layer_file_size: int lsn_start: str - lsn_end: Optional[str] + lsn_end: str | None remote: bool # None for image layers, true if pageserver thinks this is an L0 delta layer - l0: Optional[bool] + l0: bool | None visible: bool @classmethod @@ -180,8 +251,8 @@ class PageserverHttpClient(requests.Session, MetricsGetter): self, port: int, is_testing_enabled_or_skip: Fn, - auth_token: Optional[str] = None, - retries: Optional[Retry] = None, + auth_token: str | None = None, + retries: Retry | None = None, ): super().__init__() self.port = port @@ -278,7 +349,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): def tenant_attach( self, - tenant_id: Union[TenantId, TenantShardId], + tenant_id: TenantId | TenantShardId, generation: int, config: None | dict[str, Any] = None, ): @@ -305,7 +376,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): }, ) - def tenant_reset(self, tenant_id: Union[TenantId, TenantShardId], drop_cache: bool): + def tenant_reset(self, tenant_id: TenantId | TenantShardId, drop_cache: bool): params = {} if drop_cache: params["drop_cache"] = "true" @@ -315,10 +386,10 @@ class PageserverHttpClient(requests.Session, MetricsGetter): def tenant_location_conf( self, - tenant_id: Union[TenantId, TenantShardId], + tenant_id: TenantId | TenantShardId, location_conf: dict[str, Any], flush_ms=None, - lazy: Optional[bool] = None, + lazy: bool | None = None, ): body = location_conf.copy() @@ -346,20 +417,20 @@ class PageserverHttpClient(requests.Session, MetricsGetter): assert isinstance(res_json["tenant_shards"], list) return res_json - def tenant_get_location(self, tenant_id: TenantShardId): + def tenant_get_location(self, tenant_id: TenantId | TenantShardId): res = self.get( f"http://localhost:{self.port}/v1/location_config/{tenant_id}", ) self.verbose_error(res) return res.json() - def tenant_delete(self, tenant_id: Union[TenantId, TenantShardId]): + def tenant_delete(self, tenant_id: TenantId | TenantShardId): res 
= self.delete(f"http://localhost:{self.port}/v1/tenant/{tenant_id}") self.verbose_error(res) return res def tenant_status( - self, tenant_id: Union[TenantId, TenantShardId], activate: bool = False + self, tenant_id: TenantId | TenantShardId, activate: bool = False ) -> dict[Any, Any]: """ :activate: hint the server not to accelerate activation of this tenant in response @@ -378,17 +449,17 @@ class PageserverHttpClient(requests.Session, MetricsGetter): assert isinstance(res_json, dict) return res_json - def tenant_config(self, tenant_id: Union[TenantId, TenantShardId]) -> TenantConfig: + def tenant_config(self, tenant_id: TenantId | TenantShardId) -> TenantConfig: res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/config") self.verbose_error(res) return TenantConfig.from_json(res.json()) - def tenant_heatmap_upload(self, tenant_id: Union[TenantId, TenantShardId]): + def tenant_heatmap_upload(self, tenant_id: TenantId | TenantShardId): res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/heatmap_upload") self.verbose_error(res) def tenant_secondary_download( - self, tenant_id: Union[TenantId, TenantShardId], wait_ms: Optional[int] = None + self, tenant_id: TenantId | TenantShardId, wait_ms: int | None = None ) -> tuple[int, dict[Any, Any]]: url = f"http://localhost:{self.port}/v1/tenant/{tenant_id}/secondary/download" if wait_ms is not None: @@ -397,13 +468,13 @@ class PageserverHttpClient(requests.Session, MetricsGetter): self.verbose_error(res) return (res.status_code, res.json()) - def tenant_secondary_status(self, tenant_id: Union[TenantId, TenantShardId]): + def tenant_secondary_status(self, tenant_id: TenantId | TenantShardId): url = f"http://localhost:{self.port}/v1/tenant/{tenant_id}/secondary/status" res = self.get(url) self.verbose_error(res) return res.json() - def set_tenant_config(self, tenant_id: Union[TenantId, TenantShardId], config: dict[str, Any]): + def set_tenant_config(self, tenant_id: TenantId | TenantShardId, 
config: dict[str, Any]): """ Only use this via storage_controller.pageserver_api(). @@ -420,8 +491,8 @@ class PageserverHttpClient(requests.Session, MetricsGetter): def patch_tenant_config_client_side( self, tenant_id: TenantId, - inserts: Optional[dict[str, Any]] = None, - removes: Optional[list[str]] = None, + inserts: dict[str, Any] | None = None, + removes: list[str] | None = None, ): """ Only use this via storage_controller.pageserver_api(). @@ -436,11 +507,11 @@ class PageserverHttpClient(requests.Session, MetricsGetter): del current[key] self.set_tenant_config(tenant_id, current) - def tenant_size(self, tenant_id: Union[TenantId, TenantShardId]) -> int: + def tenant_size(self, tenant_id: TenantId | TenantShardId) -> int: return self.tenant_size_and_modelinputs(tenant_id)[0] def tenant_size_and_modelinputs( - self, tenant_id: Union[TenantId, TenantShardId] + self, tenant_id: TenantId | TenantShardId ) -> tuple[int, dict[str, Any]]: """ Returns the tenant size, together with the model inputs as the second tuple item. 
@@ -456,7 +527,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): assert isinstance(inputs, dict) return (size, inputs) - def tenant_size_debug(self, tenant_id: Union[TenantId, TenantShardId]) -> str: + def tenant_size_debug(self, tenant_id: TenantId | TenantShardId) -> str: """ Returns the tenant size debug info, as an HTML string """ @@ -468,10 +539,10 @@ class PageserverHttpClient(requests.Session, MetricsGetter): def tenant_time_travel_remote_storage( self, - tenant_id: Union[TenantId, TenantShardId], + tenant_id: TenantId | TenantShardId, timestamp: datetime, done_if_after: datetime, - shard_counts: Optional[list[int]] = None, + shard_counts: list[int] | None = None, ): """ Issues a request to perform time travel operations on the remote storage @@ -490,7 +561,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): def timeline_list( self, - tenant_id: Union[TenantId, TenantShardId], + tenant_id: TenantId | TenantShardId, include_non_incremental_logical_size: bool = False, include_timeline_dir_layer_file_size_sum: bool = False, ) -> list[dict[str, Any]]: @@ -510,7 +581,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): def timeline_and_offloaded_list( self, - tenant_id: Union[TenantId, TenantShardId], + tenant_id: TenantId | TenantShardId, ) -> TimelinesInfoAndOffloaded: res = self.get( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline_and_offloaded", @@ -523,11 +594,11 @@ class PageserverHttpClient(requests.Session, MetricsGetter): def timeline_create( self, pg_version: PgVersion, - tenant_id: Union[TenantId, TenantShardId], + tenant_id: TenantId | TenantShardId, new_timeline_id: TimelineId, - ancestor_timeline_id: Optional[TimelineId] = None, - ancestor_start_lsn: Optional[Lsn] = None, - existing_initdb_timeline_id: Optional[TimelineId] = None, + ancestor_timeline_id: TimelineId | None = None, + ancestor_start_lsn: Lsn | None = None, + existing_initdb_timeline_id: TimelineId | None = None, **kwargs, ) -> 
dict[Any, Any]: body: dict[str, Any] = { @@ -558,7 +629,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): def timeline_detail( self, - tenant_id: Union[TenantId, TenantShardId], + tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, include_non_incremental_logical_size: bool = False, include_timeline_dir_layer_file_size_sum: bool = False, @@ -584,7 +655,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): return res_json def timeline_delete( - self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, **kwargs + self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, **kwargs ): """ Note that deletion is not instant, it is scheduled and performed mostly in the background. @@ -600,9 +671,9 @@ class PageserverHttpClient(requests.Session, MetricsGetter): def timeline_gc( self, - tenant_id: Union[TenantId, TenantShardId], + tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, - gc_horizon: Optional[int], + gc_horizon: int | None, ) -> dict[str, Any]: """ Unlike most handlers, this will wait for the layers to be actually @@ -624,16 +695,14 @@ class PageserverHttpClient(requests.Session, MetricsGetter): assert isinstance(res_json, dict) return res_json - def timeline_block_gc(self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId): + def timeline_block_gc(self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId): res = self.post( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/block_gc", ) log.info(f"Got GC request response code: {res.status_code}") self.verbose_error(res) - def timeline_unblock_gc( - self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId - ): + def timeline_unblock_gc(self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId): res = self.post( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/unblock_gc", ) @@ -642,7 +711,7 @@ class PageserverHttpClient(requests.Session, 
MetricsGetter): def timeline_offload( self, - tenant_id: Union[TenantId, TenantShardId], + tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, ): self.is_testing_enabled_or_skip() @@ -658,13 +727,14 @@ class PageserverHttpClient(requests.Session, MetricsGetter): def timeline_compact( self, - tenant_id: Union[TenantId, TenantShardId], + tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, force_repartition=False, force_image_layer_creation=False, force_l0_compaction=False, wait_until_uploaded=False, enhanced_gc_bottom_most_compaction=False, + body: dict[str, Any] | None = None, ): self.is_testing_enabled_or_skip() query = {} @@ -683,6 +753,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): res = self.put( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/compact", params=query, + json=body, ) log.info(f"Got compact request response code: {res.status_code}") self.verbose_error(res) @@ -690,7 +761,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): assert res_json is None def timeline_preserve_initdb_archive( - self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId + self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId ): log.info( f"Requesting initdb archive preservation for tenant {tenant_id} and timeline {timeline_id}" @@ -702,7 +773,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): def timeline_archival_config( self, - tenant_id: Union[TenantId, TenantShardId], + tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, state: TimelineArchivalState, ): @@ -718,7 +789,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): def timeline_get_lsn_by_timestamp( self, - tenant_id: Union[TenantId, TenantShardId], + tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, timestamp: datetime, with_lease: bool = False, @@ -737,7 +808,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): return res_json def 
timeline_lsn_lease( - self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, lsn: Lsn + self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, lsn: Lsn ): data = { "lsn": str(lsn), @@ -753,7 +824,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): return res_json def timeline_get_timestamp_of_lsn( - self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, lsn: Lsn + self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, lsn: Lsn ): log.info(f"Requesting time range of lsn {lsn}, tenant {tenant_id}, timeline {timeline_id}") res = self.get( @@ -763,9 +834,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): res_json = res.json() return res_json - def timeline_layer_map_info( - self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId - ): + def timeline_layer_map_info(self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId): log.info(f"Requesting layer map info of tenant {tenant_id}, timeline {timeline_id}") res = self.get( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/layer", @@ -776,13 +845,13 @@ class PageserverHttpClient(requests.Session, MetricsGetter): def timeline_checkpoint( self, - tenant_id: Union[TenantId, TenantShardId], + tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, force_repartition=False, force_image_layer_creation=False, force_l0_compaction=False, wait_until_uploaded=False, - compact: Optional[bool] = None, + compact: bool | None = None, **kwargs, ): self.is_testing_enabled_or_skip() @@ -799,7 +868,9 @@ class PageserverHttpClient(requests.Session, MetricsGetter): if compact is not None: query["compact"] = "true" if compact else "false" - log.info(f"Requesting checkpoint: tenant {tenant_id}, timeline {timeline_id}") + log.info( + f"Requesting checkpoint: tenant {tenant_id}, timeline {timeline_id}, wait_until_uploaded={wait_until_uploaded}" + ) res = self.put( 
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/checkpoint", params=query, @@ -812,7 +883,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): def timeline_spawn_download_remote_layers( self, - tenant_id: Union[TenantId, TenantShardId], + tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, max_concurrent_downloads: int, ) -> dict[str, Any]: @@ -831,7 +902,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): def timeline_poll_download_remote_layers_status( self, - tenant_id: Union[TenantId, TenantShardId], + tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, spawn_response: dict[str, Any], poll_state=None, @@ -853,7 +924,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): def timeline_download_remote_layers( self, - tenant_id: Union[TenantId, TenantShardId], + tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, max_concurrent_downloads: int, errors_ok=False, @@ -903,7 +974,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): timeline_id: TimelineId, file_kind: str, op_kind: str, - ) -> Optional[int]: + ) -> int | None: metrics = [ "pageserver_remote_timeline_client_calls_started_total", "pageserver_remote_timeline_client_calls_finished_total", @@ -927,7 +998,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): def layer_map_info( self, - tenant_id: Union[TenantId, TenantShardId], + tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, ) -> LayerMapInfo: res = self.get( @@ -937,7 +1008,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): return LayerMapInfo.from_json(res.json()) def timeline_layer_scan_disposable_keys( - self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, layer_name: str + self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, layer_name: str ) -> ScanDisposableKeysResponse: res = self.post( 
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/layer/{layer_name}/scan_disposable_keys", @@ -947,7 +1018,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): return ScanDisposableKeysResponse.from_json(res.json()) def download_layer( - self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, layer_name: str + self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, layer_name: str ): res = self.get( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/layer/{layer_name}", @@ -956,9 +1027,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): assert res.status_code == 200 - def download_all_layers( - self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId - ): + def download_all_layers(self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId): info = self.layer_map_info(tenant_id, timeline_id) for layer in info.historic_layers: if not layer.remote: @@ -967,9 +1036,9 @@ class PageserverHttpClient(requests.Session, MetricsGetter): def detach_ancestor( self, - tenant_id: Union[TenantId, TenantShardId], + tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, - batch_size: Optional[int] = None, + batch_size: int | None = None, **kwargs, ) -> set[TimelineId]: params = {} @@ -985,7 +1054,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): return set(map(TimelineId, json["reparented_timelines"])) def evict_layer( - self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, layer_name: str + self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, layer_name: str ): res = self.delete( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/layer/{layer_name}", @@ -994,7 +1063,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): assert res.status_code in (200, 304) - def evict_all_layers(self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId): + def 
evict_all_layers(self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId): info = self.layer_map_info(tenant_id, timeline_id) for layer in info.historic_layers: self.evict_layer(tenant_id, timeline_id, layer.layer_file_name) @@ -1007,7 +1076,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): self.verbose_error(res) return res.json() - def tenant_break(self, tenant_id: Union[TenantId, TenantShardId]): + def tenant_break(self, tenant_id: TenantId | TenantShardId): res = self.put(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/break") self.verbose_error(res) @@ -1056,7 +1125,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): def perf_info( self, - tenant_id: Union[TenantId, TenantShardId], + tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, ): self.is_testing_enabled_or_skip() diff --git a/test_runner/fixtures/pageserver/many_tenants.py b/test_runner/fixtures/pageserver/many_tenants.py index 37b4246d40..b6d19af84c 100644 --- a/test_runner/fixtures/pageserver/many_tenants.py +++ b/test_runner/fixtures/pageserver/many_tenants.py @@ -13,7 +13,8 @@ from fixtures.neon_fixtures import ( from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind if TYPE_CHECKING: - from typing import Any, Callable + from collections.abc import Callable + from typing import Any def single_timeline( diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index ac7497ee6c..46700e3fe3 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -17,14 +17,14 @@ from fixtures.remote_storage import RemoteStorage, RemoteStorageKind, S3Storage from fixtures.utils import wait_until if TYPE_CHECKING: - from typing import Any, Optional, Union + from typing import Any def assert_tenant_state( pageserver_http: PageserverHttpClient, tenant: TenantId, expected_state: str, - message: Optional[str] = None, + message: str | None = None, ) -> None: 
tenant_status = pageserver_http.tenant_status(tenant) log.info(f"tenant_status: {tenant_status}") @@ -33,7 +33,7 @@ def assert_tenant_state( def remote_consistent_lsn( pageserver_http: PageserverHttpClient, - tenant: Union[TenantId, TenantShardId], + tenant: TenantId | TenantShardId, timeline: TimelineId, ) -> Lsn: detail = pageserver_http.timeline_detail(tenant, timeline) @@ -51,7 +51,7 @@ def remote_consistent_lsn( def wait_for_upload( pageserver_http: PageserverHttpClient, - tenant: Union[TenantId, TenantShardId], + tenant: TenantId | TenantShardId, timeline: TimelineId, lsn: Lsn, ): @@ -138,7 +138,7 @@ def wait_until_all_tenants_state( def wait_until_timeline_state( pageserver_http: PageserverHttpClient, - tenant_id: Union[TenantId, TenantShardId], + tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, expected_state: str, iterations: int, @@ -188,7 +188,7 @@ def wait_until_tenant_active( def last_record_lsn( pageserver_http_client: PageserverHttpClient, - tenant: Union[TenantId, TenantShardId], + tenant: TenantId | TenantShardId, timeline: TimelineId, ) -> Lsn: detail = pageserver_http_client.timeline_detail(tenant, timeline) @@ -200,7 +200,7 @@ def last_record_lsn( def wait_for_last_record_lsn( pageserver_http: PageserverHttpClient, - tenant: Union[TenantId, TenantShardId], + tenant: TenantId | TenantShardId, timeline: TimelineId, lsn: Lsn, ) -> Lsn: @@ -267,10 +267,10 @@ def wait_for_upload_queue_empty( def wait_timeline_detail_404( pageserver_http: PageserverHttpClient, - tenant_id: Union[TenantId, TenantShardId], + tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, iterations: int, - interval: Optional[float] = None, + interval: float | None = None, ): if interval is None: interval = 0.25 @@ -292,10 +292,10 @@ def wait_timeline_detail_404( def timeline_delete_wait_completed( pageserver_http: PageserverHttpClient, - tenant_id: Union[TenantId, TenantShardId], + tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, iterations: int 
= 20, - interval: Optional[float] = None, + interval: float | None = None, **delete_args, ) -> None: pageserver_http.timeline_delete(tenant_id=tenant_id, timeline_id=timeline_id, **delete_args) @@ -304,9 +304,9 @@ def timeline_delete_wait_completed( # remote_storage must not be None, but that's easier for callers to make mypy happy def assert_prefix_empty( - remote_storage: Optional[RemoteStorage], - prefix: Optional[str] = None, - allowed_postfix: Optional[str] = None, + remote_storage: RemoteStorage | None, + prefix: str | None = None, + allowed_postfix: str | None = None, delimiter: str = "/", ) -> None: assert remote_storage is not None @@ -348,8 +348,8 @@ def assert_prefix_empty( # remote_storage must not be None, but that's easier for callers to make mypy happy def assert_prefix_not_empty( - remote_storage: Optional[RemoteStorage], - prefix: Optional[str] = None, + remote_storage: RemoteStorage | None, + prefix: str | None = None, delimiter: str = "/", ): assert remote_storage is not None @@ -358,7 +358,7 @@ def assert_prefix_not_empty( def list_prefix( - remote: RemoteStorage, prefix: Optional[str] = None, delimiter: str = "/" + remote: RemoteStorage, prefix: str | None = None, delimiter: str = "/" ) -> ListObjectsV2OutputTypeDef: """ Note that this function takes into account prefix_in_bucket. 
diff --git a/test_runner/fixtures/parametrize.py b/test_runner/fixtures/parametrize.py index 1131bf090f..2c6adb8a33 100644 --- a/test_runner/fixtures/parametrize.py +++ b/test_runner/fixtures/parametrize.py @@ -11,7 +11,7 @@ from _pytest.python import Metafunc from fixtures.pg_version import PgVersion if TYPE_CHECKING: - from typing import Any, Optional + from typing import Any """ @@ -20,31 +20,31 @@ Dynamically parametrize tests by different parameters @pytest.fixture(scope="function", autouse=True) -def pg_version() -> Optional[PgVersion]: +def pg_version() -> PgVersion | None: return None @pytest.fixture(scope="function", autouse=True) -def build_type() -> Optional[str]: +def build_type() -> str | None: return None @pytest.fixture(scope="session", autouse=True) -def platform() -> Optional[str]: +def platform() -> str | None: return None @pytest.fixture(scope="function", autouse=True) -def pageserver_virtual_file_io_engine() -> Optional[str]: +def pageserver_virtual_file_io_engine() -> str | None: return os.getenv("PAGESERVER_VIRTUAL_FILE_IO_ENGINE") @pytest.fixture(scope="function", autouse=True) -def pageserver_virtual_file_io_mode() -> Optional[str]: +def pageserver_virtual_file_io_mode() -> str | None: return os.getenv("PAGESERVER_VIRTUAL_FILE_IO_MODE") -def get_pageserver_default_tenant_config_compaction_algorithm() -> Optional[dict[str, Any]]: +def get_pageserver_default_tenant_config_compaction_algorithm() -> dict[str, Any] | None: toml_table = os.getenv("PAGESERVER_DEFAULT_TENANT_CONFIG_COMPACTION_ALGORITHM") if toml_table is None: return None @@ -54,7 +54,7 @@ def get_pageserver_default_tenant_config_compaction_algorithm() -> Optional[dict @pytest.fixture(scope="function", autouse=True) -def pageserver_default_tenant_config_compaction_algorithm() -> Optional[dict[str, Any]]: +def pageserver_default_tenant_config_compaction_algorithm() -> dict[str, Any] | None: return get_pageserver_default_tenant_config_compaction_algorithm() @@ -66,6 +66,7 @@ def 
pytest_generate_tests(metafunc: Metafunc): metafunc.parametrize("build_type", build_types) + pg_versions: list[PgVersion] if (v := os.getenv("DEFAULT_PG_VERSION")) is None: pg_versions = [version for version in PgVersion if version != PgVersion.NOT_SET] else: diff --git a/test_runner/fixtures/paths.py b/test_runner/fixtures/paths.py index 60221573eb..1c71abea19 100644 --- a/test_runner/fixtures/paths.py +++ b/test_runner/fixtures/paths.py @@ -18,7 +18,6 @@ from fixtures.utils import allure_attach_from_dir if TYPE_CHECKING: from collections.abc import Iterator - from typing import Optional BASE_DIR = Path(__file__).parents[2] @@ -26,9 +25,7 @@ COMPUTE_CONFIG_DIR = BASE_DIR / "compute" / "etc" DEFAULT_OUTPUT_DIR: str = "test_output" -def get_test_dir( - request: FixtureRequest, top_output_dir: Path, prefix: Optional[str] = None -) -> Path: +def get_test_dir(request: FixtureRequest, top_output_dir: Path, prefix: str | None = None) -> Path: """Compute the path to a working directory for an individual test.""" test_name = request.node.name test_dir = top_output_dir / f"{prefix or ''}{test_name.replace('/', '-')}" @@ -112,7 +109,7 @@ def compatibility_snapshot_dir() -> Iterator[Path]: @pytest.fixture(scope="session") -def compatibility_neon_binpath() -> Iterator[Optional[Path]]: +def compatibility_neon_binpath() -> Iterator[Path | None]: if os.getenv("REMOTE_ENV"): return comp_binpath = None @@ -133,7 +130,7 @@ def pg_distrib_dir(base_dir: Path) -> Iterator[Path]: @pytest.fixture(scope="session") -def compatibility_pg_distrib_dir() -> Iterator[Optional[Path]]: +def compatibility_pg_distrib_dir() -> Iterator[Path | None]: compat_distrib_dir = None if env_compat_postgres_bin := os.environ.get("COMPATIBILITY_POSTGRES_DISTRIB_DIR"): compat_distrib_dir = Path(env_compat_postgres_bin).resolve() @@ -197,7 +194,7 @@ class FileAndThreadLock: def __init__(self, path: Path): self.path = path self.thread_lock = threading.Lock() - self.fd: Optional[int] = None + self.fd: int | None = 
None def __enter__(self): self.fd = os.open(self.path, os.O_CREAT | os.O_WRONLY) @@ -208,9 +205,9 @@ class FileAndThreadLock: def __exit__( self, - exc_type: Optional[type[BaseException]], - exc_value: Optional[BaseException], - exc_traceback: Optional[TracebackType], + exc_type: type[BaseException] | None, + exc_value: BaseException | None, + exc_traceback: TracebackType | None, ): assert self.fd is not None assert self.thread_lock.locked() # ... by us @@ -263,9 +260,9 @@ class SnapshotDir: def __exit__( self, - exc_type: Optional[type[BaseException]], - exc_value: Optional[BaseException], - exc_traceback: Optional[TracebackType], + exc_type: type[BaseException] | None, + exc_value: BaseException | None, + exc_traceback: TracebackType | None, ): self._lock.__exit__(exc_type, exc_value, exc_traceback) @@ -277,7 +274,7 @@ def shared_snapshot_dir(top_output_dir: Path, ident: str) -> SnapshotDir: @pytest.fixture(scope="function") -def test_overlay_dir(request: FixtureRequest, top_output_dir: Path) -> Optional[Path]: +def test_overlay_dir(request: FixtureRequest, top_output_dir: Path) -> Path | None: """ Idempotently create a test's overlayfs mount state directory. If the functionality isn't enabled via env var, returns None. diff --git a/test_runner/fixtures/pg_version.py b/test_runner/fixtures/pg_version.py index 4feab52c43..46423e8c76 100644 --- a/test_runner/fixtures/pg_version.py +++ b/test_runner/fixtures/pg_version.py @@ -1,29 +1,23 @@ from __future__ import annotations -import enum -from typing import TYPE_CHECKING +from enum import StrEnum from typing_extensions import override -if TYPE_CHECKING: - from typing import Optional - - """ This fixture is used to determine which version of Postgres to use for tests. 
""" # Inherit PgVersion from str rather than int to make it easier to pass as a command-line argument -# TODO: use enum.StrEnum for Python >= 3.11 -class PgVersion(str, enum.Enum): +class PgVersion(StrEnum): V14 = "14" V15 = "15" V16 = "16" V17 = "17" - # Default Postgres Version for tests that don't really depend on Postgres itself - DEFAULT = V16 + # Postgres Version for tests that uses `fixtures.utils.run_only_on_default_postgres` + DEFAULT = V17 # Instead of making version an optional parameter in methods, we can use this fake entry # to explicitly rely on the default server version (could be different from pg_version fixture value) @@ -34,7 +28,6 @@ class PgVersion(str, enum.Enum): def __repr__(self) -> str: return f"'{self.value}'" - # Make this explicit for Python 3.11 compatibility, which changes the behavior of enums @override def __str__(self) -> str: return self.value @@ -47,16 +40,18 @@ class PgVersion(str, enum.Enum): @classmethod @override - def _missing_(cls, value: object) -> Optional[PgVersion]: - known_values = {v.value for _, v in cls.__members__.items()} + def _missing_(cls, value: object) -> PgVersion | None: + if not isinstance(value, str): + return None - # Allow passing version as a string with "v" prefix (e.g. "v14") - if isinstance(value, str) and value.lower().startswith("v") and value[1:] in known_values: - return cls(value[1:]) - # Allow passing version as an int (e.g. 15 or 150002, both will be converted to PgVersion.V15) - elif isinstance(value, int) and str(value)[:2] in known_values: - return cls(str(value)[:2]) + known_values = set(cls.__members__.values()) + + # Allow passing version as v-prefixed string (e.g. "v14") + if value.lower().startswith("v") and (v := value[1:]) in known_values: + return cls(v) + + # Allow passing version as an int (i.e. 
both "15" and "150002" matches PgVersion.V15) + if value.isdigit() and (v := value[:2]) in known_values: + return cls(v) - # Make mypy happy - # See https://github.com/python/mypy/issues/3974 return None diff --git a/test_runner/fixtures/port_distributor.py b/test_runner/fixtures/port_distributor.py index df0eb2a809..6a829a9399 100644 --- a/test_runner/fixtures/port_distributor.py +++ b/test_runner/fixtures/port_distributor.py @@ -3,13 +3,9 @@ from __future__ import annotations import re import socket from contextlib import closing -from typing import TYPE_CHECKING from fixtures.log_helper import log -if TYPE_CHECKING: - from typing import Union - def can_bind(host: str, port: int) -> bool: """ @@ -49,17 +45,19 @@ class PortDistributor: "port range configured for test is exhausted, consider enlarging the range" ) - def replace_with_new_port(self, value: Union[int, str]) -> Union[int, str]: + def replace_with_new_port(self, value: int | str) -> int | str: """ Returns a new port for a port number in a string (like "localhost:1234") or int. Replacements are memorised, so a substitution for the same port is always the same. 
""" - # TODO: replace with structural pattern matching for Python >= 3.10 - if isinstance(value, int): - return self._replace_port_int(value) - - return self._replace_port_str(value) + match value: + case int(): + return self._replace_port_int(value) + case str(): + return self._replace_port_str(value) + case _: + raise TypeError(f"Unsupported type {type(value)}, should be int | str") def _replace_port_int(self, value: int) -> int: known_port = self.port_map.get(value) diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py index 7024953661..4e1e8a884f 100644 --- a/test_runner/fixtures/remote_storage.py +++ b/test_runner/fixtures/remote_storage.py @@ -6,8 +6,9 @@ import json import os import re from dataclasses import dataclass +from enum import StrEnum from pathlib import Path -from typing import TYPE_CHECKING, Union +from typing import TYPE_CHECKING import boto3 import toml @@ -20,7 +21,7 @@ from fixtures.log_helper import log from fixtures.pageserver.common_types import IndexPartDump if TYPE_CHECKING: - from typing import Any, Optional + from typing import Any TIMELINE_INDEX_PART_FILE_NAME = "index_part.json" @@ -28,7 +29,7 @@ TENANT_HEATMAP_FILE_NAME = "heatmap-v1.json" @enum.unique -class RemoteStorageUser(str, enum.Enum): +class RemoteStorageUser(StrEnum): """ Instead of using strings for the users, use a more strict enum. 
""" @@ -77,19 +78,19 @@ class MockS3Server: class LocalFsStorage: root: Path - def tenant_path(self, tenant_id: TenantId) -> Path: + def tenant_path(self, tenant_id: TenantId | TenantShardId) -> Path: return self.root / "tenants" / str(tenant_id) - def timeline_path(self, tenant_id: TenantId, timeline_id: TimelineId) -> Path: + def timeline_path(self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId) -> Path: return self.tenant_path(tenant_id) / "timelines" / str(timeline_id) def timeline_latest_generation( - self, tenant_id: TenantId, timeline_id: TimelineId - ) -> Optional[int]: + self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId + ) -> int | None: timeline_files = os.listdir(self.timeline_path(tenant_id, timeline_id)) index_parts = [f for f in timeline_files if f.startswith("index_part")] - def parse_gen(filename: str) -> Optional[int]: + def parse_gen(filename: str) -> int | None: log.info(f"parsing index_part '{filename}'") parts = filename.split("-") if len(parts) == 2: @@ -102,7 +103,7 @@ class LocalFsStorage: raise RuntimeError(f"No index_part found for {tenant_id}/{timeline_id}") return generations[-1] - def index_path(self, tenant_id: TenantId, timeline_id: TimelineId) -> Path: + def index_path(self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId) -> Path: latest_gen = self.timeline_latest_generation(tenant_id, timeline_id) if latest_gen is None: filename = TIMELINE_INDEX_PART_FILE_NAME @@ -116,7 +117,7 @@ class LocalFsStorage: tenant_id: TenantId, timeline_id: TimelineId, local_name: str, - generation: Optional[int] = None, + generation: int | None = None, ): if generation is None: generation = self.timeline_latest_generation(tenant_id, timeline_id) @@ -126,7 +127,7 @@ class LocalFsStorage: filename = f"{local_name}-{generation:08x}" return self.timeline_path(tenant_id, timeline_id) / filename - def index_content(self, tenant_id: TenantId, timeline_id: TimelineId) -> Any: + def index_content(self, tenant_id: 
TenantId | TenantShardId, timeline_id: TimelineId) -> Any: with self.index_path(tenant_id, timeline_id).open("r") as f: return json.load(f) @@ -158,17 +159,17 @@ class LocalFsStorage: class S3Storage: bucket_name: str bucket_region: str - access_key: Optional[str] - secret_key: Optional[str] - aws_profile: Optional[str] + access_key: str | None + secret_key: str | None + aws_profile: str | None prefix_in_bucket: str client: S3Client cleanup: bool """Is this MOCK_S3 (false) or REAL_S3 (true)""" real: bool - endpoint: Optional[str] = None + endpoint: str | None = None """formatting deserialized with humantime crate, for example "1s".""" - custom_timeout: Optional[str] = None + custom_timeout: str | None = None def access_env_vars(self) -> dict[str, str]: if self.aws_profile is not None: @@ -266,12 +267,10 @@ class S3Storage: def tenants_path(self) -> str: return f"{self.prefix_in_bucket}/tenants" - def tenant_path(self, tenant_id: Union[TenantShardId, TenantId]) -> str: + def tenant_path(self, tenant_id: TenantShardId | TenantId) -> str: return f"{self.tenants_path()}/{tenant_id}" - def timeline_path( - self, tenant_id: Union[TenantShardId, TenantId], timeline_id: TimelineId - ) -> str: + def timeline_path(self, tenant_id: TenantShardId | TenantId, timeline_id: TimelineId) -> str: return f"{self.tenant_path(tenant_id)}/timelines/{timeline_id}" def get_latest_index_key(self, index_keys: list[str]) -> str: @@ -309,11 +308,11 @@ class S3Storage: assert self.real is False -RemoteStorage = Union[LocalFsStorage, S3Storage] +RemoteStorage = LocalFsStorage | S3Storage @enum.unique -class RemoteStorageKind(str, enum.Enum): +class RemoteStorageKind(StrEnum): LOCAL_FS = "local_fs" MOCK_S3 = "mock_s3" REAL_S3 = "real_s3" @@ -325,8 +324,8 @@ class RemoteStorageKind(str, enum.Enum): run_id: str, test_name: str, user: RemoteStorageUser, - bucket_name: Optional[str] = None, - bucket_region: Optional[str] = None, + bucket_name: str | None = None, + bucket_region: str | None = None, ) 
-> RemoteStorage: if self == RemoteStorageKind.LOCAL_FS: return LocalFsStorage(LocalFsStorage.component_path(repo_dir, user)) diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py index 5d9a3bd149..094188c0b5 100644 --- a/test_runner/fixtures/safekeeper/http.py +++ b/test_runner/fixtures/safekeeper/http.py @@ -13,7 +13,7 @@ from fixtures.metrics import Metrics, MetricsGetter, parse_metrics from fixtures.utils import wait_until if TYPE_CHECKING: - from typing import Any, Optional, Union + from typing import Any # Walreceiver as returned by sk's timeline status endpoint. @@ -72,7 +72,7 @@ class TermBumpResponse: class SafekeeperHttpClient(requests.Session, MetricsGetter): HTTPError = requests.HTTPError - def __init__(self, port: int, auth_token: Optional[str] = None, is_testing_enabled=False): + def __init__(self, port: int, auth_token: str | None = None, is_testing_enabled=False): super().__init__() self.port = port self.auth_token = auth_token @@ -98,7 +98,7 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter): if not self.is_testing_enabled: pytest.skip("safekeeper was built without 'testing' feature") - def configure_failpoints(self, config_strings: Union[tuple[str, str], list[tuple[str, str]]]): + def configure_failpoints(self, config_strings: tuple[str, str] | list[tuple[str, str]]): self.is_testing_enabled_or_skip() if isinstance(config_strings, tuple): @@ -195,7 +195,7 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter): assert isinstance(res_json, dict) return res_json - def debug_dump(self, params: Optional[dict[str, str]] = None) -> dict[str, Any]: + def debug_dump(self, params: dict[str, str] | None = None) -> dict[str, Any]: params = params or {} res = self.get(f"http://localhost:{self.port}/v1/debug_dump", params=params) res.raise_for_status() @@ -204,7 +204,7 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter): return res_json def debug_dump_timeline( - self, timeline_id: 
TimelineId, params: Optional[dict[str, str]] = None + self, timeline_id: TimelineId, params: dict[str, str] | None = None ) -> Any: params = params or {} params["timeline_id"] = str(timeline_id) @@ -285,7 +285,7 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter): self, tenant_id: TenantId, timeline_id: TimelineId, - term: Optional[int], + term: int | None, ) -> TermBumpResponse: body = {} if term is not None: diff --git a/test_runner/fixtures/storage_controller_proxy.py b/test_runner/fixtures/storage_controller_proxy.py index c174358ef5..be95a98ff9 100644 --- a/test_runner/fixtures/storage_controller_proxy.py +++ b/test_runner/fixtures/storage_controller_proxy.py @@ -13,14 +13,14 @@ from werkzeug.wrappers.response import Response from fixtures.log_helper import log if TYPE_CHECKING: - from typing import Any, Optional + from typing import Any class StorageControllerProxy: def __init__(self, server: HTTPServer): self.server: HTTPServer = server self.listen: str = f"http://{server.host}:{server.port}" - self.routing_to: Optional[str] = None + self.routing_to: str | None = None def route_to(self, storage_controller_api: str): self.routing_to = storage_controller_api diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 96a651f0db..30720e648d 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -8,10 +8,10 @@ import subprocess import tarfile import threading import time -from collections.abc import Iterable +from collections.abc import Callable, Iterable from hashlib import sha256 from pathlib import Path -from typing import TYPE_CHECKING, Any, Callable, TypeVar +from typing import TYPE_CHECKING, Any, TypeVar from urllib.parse import urlencode import allure @@ -29,7 +29,7 @@ from fixtures.pg_version import PgVersion if TYPE_CHECKING: from collections.abc import Iterable - from typing import IO, Optional + from typing import IO from fixtures.common_types import TimelineId from fixtures.neon_fixtures 
import PgBin @@ -66,10 +66,10 @@ def subprocess_capture( echo_stderr: bool = False, echo_stdout: bool = False, capture_stdout: bool = False, - timeout: Optional[float] = None, + timeout: float | None = None, with_command_header: bool = True, **popen_kwargs: Any, -) -> tuple[str, Optional[str], int]: +) -> tuple[str, str | None, int]: """Run a process and bifurcate its output to files and the `log` logger stderr and stdout are always captured in files. They are also optionally @@ -495,8 +495,14 @@ def scan_log_for_errors(input: Iterable[str], allowed_errors: list[str]) -> list # It's an ERROR or WARN. Is it in the allow-list? for a in allowed_errors: - if re.match(a, line): - break + try: + if re.match(a, line): + break + # We can switch `re.error` with `re.PatternError` after 3.13 + # https://docs.python.org/3/library/re.html#re.PatternError + except re.error: + log.error(f"Invalid regex: '{a}'") + raise else: errors.append((lineno, line)) return errors @@ -530,7 +536,7 @@ def assert_pageserver_backups_equal(left: Path, right: Path, skip_files: set[str """ started_at = time.time() - def hash_extracted(reader: Optional[IO[bytes]]) -> bytes: + def hash_extracted(reader: IO[bytes] | None) -> bytes: assert reader is not None digest = sha256(usedforsecurity=False) while True: @@ -557,7 +563,7 @@ def assert_pageserver_backups_equal(left: Path, right: Path, skip_files: set[str mismatching: set[str] = set() - for left_tuple, right_tuple in zip(left_list, right_list): + for left_tuple, right_tuple in zip(left_list, right_list, strict=False): left_path, left_hash = left_tuple right_path, right_hash = right_tuple assert ( @@ -589,7 +595,7 @@ class PropagatingThread(threading.Thread): self.exc = e @override - def join(self, timeout: Optional[float] = None) -> Any: + def join(self, timeout: float | None = None) -> Any: super().join(timeout) if self.exc: raise self.exc @@ -668,6 +674,13 @@ def run_only_on_default_postgres(reason: str): ) +def run_only_on_postgres(versions: 
Iterable[PgVersion], reason: str): + return pytest.mark.skipif( + PgVersion(os.getenv("DEFAULT_PG_VERSION", PgVersion.DEFAULT)) not in versions, + reason=reason, + ) + + def skip_in_debug_build(reason: str): return pytest.mark.skipif( os.getenv("BUILD_TYPE", "debug") == "debug", diff --git a/test_runner/fixtures/workload.py b/test_runner/fixtures/workload.py index e869c43185..4c6b2b6b3e 100644 --- a/test_runner/fixtures/workload.py +++ b/test_runner/fixtures/workload.py @@ -15,7 +15,7 @@ from fixtures.neon_fixtures import ( from fixtures.pageserver.utils import wait_for_last_record_lsn if TYPE_CHECKING: - from typing import Any, Optional + from typing import Any # neon_local doesn't handle creating/modifying endpoints concurrently, so we use a mutex # to ensure we don't do that: this enables running lots of Workloads in parallel safely. @@ -36,8 +36,8 @@ class Workload: env: NeonEnv, tenant_id: TenantId, timeline_id: TimelineId, - branch_name: Optional[str] = None, - endpoint_opts: Optional[dict[str, Any]] = None, + branch_name: str | None = None, + endpoint_opts: dict[str, Any] | None = None, ): self.env = env self.tenant_id = tenant_id @@ -50,10 +50,10 @@ class Workload: self.expect_rows = 0 self.churn_cursor = 0 - self._endpoint: Optional[Endpoint] = None + self._endpoint: Endpoint | None = None self._endpoint_opts = endpoint_opts or {} - def reconfigure(self): + def reconfigure(self) -> None: """ Request the endpoint to reconfigure based on location reported by storage controller """ @@ -61,7 +61,7 @@ class Workload: with ENDPOINT_LOCK: self._endpoint.reconfigure() - def endpoint(self, pageserver_id: Optional[int] = None) -> Endpoint: + def endpoint(self, pageserver_id: int | None = None) -> Endpoint: # We may be running alongside other Workloads for different tenants. Full TTID is # obnoxiously long for use here, but a cut-down version is still unique enough for tests. 
endpoint_id = f"ep-workload-{str(self.tenant_id)[0:4]}-{str(self.timeline_id)[0:4]}" @@ -94,16 +94,17 @@ class Workload: def __del__(self): self.stop() - def init(self, pageserver_id: Optional[int] = None): + def init(self, pageserver_id: int | None = None, allow_recreate=False): endpoint = self.endpoint(pageserver_id) - + if allow_recreate: + endpoint.safe_psql(f"DROP TABLE IF EXISTS {self.table};") endpoint.safe_psql(f"CREATE TABLE {self.table} (id INTEGER PRIMARY KEY, val text);") endpoint.safe_psql("CREATE EXTENSION IF NOT EXISTS neon_test_utils;") last_flush_lsn_upload( self.env, endpoint, self.tenant_id, self.timeline_id, pageserver_id=pageserver_id ) - def write_rows(self, n: int, pageserver_id: Optional[int] = None, upload: bool = True): + def write_rows(self, n: int, pageserver_id: int | None = None, upload: bool = True): endpoint = self.endpoint(pageserver_id) start = self.expect_rows end = start + n - 1 @@ -125,7 +126,7 @@ class Workload: return False def churn_rows( - self, n: int, pageserver_id: Optional[int] = None, upload: bool = True, ingest: bool = True + self, n: int, pageserver_id: int | None = None, upload: bool = True, ingest: bool = True ): assert self.expect_rows >= n @@ -190,7 +191,7 @@ class Workload: else: log.info(f"Churn: not waiting for upload, disk LSN {last_flush_lsn}") - def validate(self, pageserver_id: Optional[int] = None): + def validate(self, pageserver_id: int | None = None): endpoint = self.endpoint(pageserver_id) endpoint.clear_shared_buffers() result = endpoint.safe_psql(f"SELECT COUNT(*) FROM {self.table}") diff --git a/test_runner/performance/pageserver/util.py b/test_runner/performance/pageserver/util.py index 227319c425..bcc3db69f0 100644 --- a/test_runner/performance/pageserver/util.py +++ b/test_runner/performance/pageserver/util.py @@ -16,7 +16,8 @@ from fixtures.neon_fixtures import ( from fixtures.pageserver.utils import wait_until_all_tenants_state if TYPE_CHECKING: - from typing import Any, Callable, Optional + 
from collections.abc import Callable + from typing import Any def ensure_pageserver_ready_for_benchmarking(env: NeonEnv, n_tenants: int): @@ -46,7 +47,7 @@ def setup_pageserver_with_tenants( name: str, n_tenants: int, setup: Callable[[NeonEnv], tuple[TenantId, TimelineId, dict[str, Any]]], - timeout_in_seconds: Optional[int] = None, + timeout_in_seconds: int | None = None, ) -> NeonEnv: """ Utility function to set up a pageserver with a given number of identical tenants. diff --git a/test_runner/performance/test_bulk_insert.py b/test_runner/performance/test_bulk_insert.py index 36090dcad7..680eb62b39 100644 --- a/test_runner/performance/test_bulk_insert.py +++ b/test_runner/performance/test_bulk_insert.py @@ -56,7 +56,7 @@ def test_bulk_insert(neon_with_baseline: PgCompare): def measure_recovery_time(env: NeonCompare): client = env.env.pageserver.http_client() - pg_version = PgVersion(client.timeline_detail(env.tenant, env.timeline)["pg_version"]) + pg_version = PgVersion(str(client.timeline_detail(env.tenant, env.timeline)["pg_version"])) # Delete the Tenant in the pageserver: this will drop local and remote layers, such that # when we "create" the Tenant again, we will replay the WAL from the beginning. 
diff --git a/test_runner/performance/test_copy.py b/test_runner/performance/test_copy.py index d571fab6b5..0e56fdc96f 100644 --- a/test_runner/performance/test_copy.py +++ b/test_runner/performance/test_copy.py @@ -2,7 +2,7 @@ from __future__ import annotations from contextlib import closing from io import BufferedReader, RawIOBase -from typing import Optional, final +from typing import final from fixtures.compare_fixtures import PgCompare from typing_extensions import override @@ -13,7 +13,7 @@ class CopyTestData(RawIOBase): def __init__(self, rows: int): self.rows = rows self.rownum = 0 - self.linebuf: Optional[bytes] = None + self.linebuf: bytes | None = None self.ptr = 0 @override diff --git a/test_runner/performance/test_perf_ingest_using_pgcopydb.py b/test_runner/performance/test_perf_ingest_using_pgcopydb.py new file mode 100644 index 0000000000..2f4574ba88 --- /dev/null +++ b/test_runner/performance/test_perf_ingest_using_pgcopydb.py @@ -0,0 +1,267 @@ +import os +import re +import subprocess +import sys +import textwrap +from pathlib import Path +from typing import cast +from urllib.parse import urlparse + +import pytest +from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker +from fixtures.utils import humantime_to_ms + + +def setup_environment(): + """Set up necessary environment variables for pgcopydb execution. + + Expects the following variables to be set in the environment: + - PG_CONFIG: e.g. /tmp/neon/pg_install/v16/bin/pg_config + - PSQL: e.g. /tmp/neon/pg_install/v16/bin/psql + - PG_16_LIB_PATH: e.g. /tmp/neon/pg_install/v16/lib + - PGCOPYDB: e.g. /pgcopydb/bin/pgcopydb + - PGCOPYDB_LIB_PATH: e.g. 
/pgcopydb/lib + - BENCHMARK_INGEST_SOURCE_CONNSTR + - BENCHMARK_INGEST_TARGET_CONNSTR + - PERF_TEST_RESULT_CONNSTR + - TARGET_PROJECT_TYPE + + """ + # Ensure required environment variables are set + required_env_vars = [ + "PGCOPYDB", + "PGCOPYDB_LIB_PATH", + "PG_CONFIG", + "PSQL", + "PG_16_LIB_PATH", + "BENCHMARK_INGEST_SOURCE_CONNSTR", + "BENCHMARK_INGEST_TARGET_CONNSTR", + "PERF_TEST_RESULT_CONNSTR", + "TARGET_PROJECT_TYPE", + ] + for var in required_env_vars: + if not os.getenv(var): + raise OSError(f"Required environment variable '{var}' is not set.") + + +def build_pgcopydb_command(pgcopydb_filter_file: Path, test_output_dir: Path): + """Builds the pgcopydb command to execute using existing environment variables.""" + pgcopydb_executable = os.getenv("PGCOPYDB") + if not pgcopydb_executable: + raise OSError("PGCOPYDB environment variable is not set.") + + return [ + pgcopydb_executable, + "clone", + "--dir", + str(test_output_dir), + "--skip-vacuum", + "--no-owner", + "--no-acl", + "--skip-db-properties", + "--table-jobs", + "4", + "--index-jobs", + "4", + "--restore-jobs", + "4", + "--split-tables-larger-than", + "10GB", + "--skip-extensions", + "--use-copy-binary", + "--filters", + str(pgcopydb_filter_file), + ] + + +@pytest.fixture() # must be function scoped because test_output_dir is function scoped +def pgcopydb_filter_file(test_output_dir: Path) -> Path: + """Creates the pgcopydb_filter.txt file required by pgcopydb.""" + filter_content = textwrap.dedent("""\ + [include-only-table] + public.events + public.emails + public.email_transmissions + public.payments + public.editions + public.edition_modules + public.sp_content + public.email_broadcasts + public.user_collections + public.devices + public.user_accounts + public.lessons + public.lesson_users + public.payment_methods + public.orders + public.course_emails + public.modules + public.users + public.module_users + public.courses + public.payment_gateway_keys + public.accounts + public.roles + 
public.payment_gateways + public.management + public.event_names + """) + filter_path = test_output_dir / "pgcopydb_filter.txt" + filter_path.write_text(filter_content) + return filter_path + + +def get_backpressure_time(connstr): + """Executes a query to get the backpressure throttling time in seconds.""" + query = "select backpressure_throttling_time()/1000000;" + psql_path = os.getenv("PSQL") + if psql_path is None: + raise OSError("The PSQL environment variable is not set.") + result = subprocess.run( + [psql_path, connstr, "-t", "-c", query], capture_output=True, text=True, check=True + ) + return float(result.stdout.strip()) + + +def run_command_and_log_output(command, log_file_path: Path): + """ + Runs a command and logs output to both a file and GitHub Actions console. + + Args: + command (list): The command to execute. + log_file_path (Path): Path object for the log file where output is written. + """ + # Define a list of necessary environment variables for pgcopydb + custom_env_vars = { + "LD_LIBRARY_PATH": f"{os.getenv('PGCOPYDB_LIB_PATH')}:{os.getenv('PG_16_LIB_PATH')}", + "PGCOPYDB_SOURCE_PGURI": cast(str, os.getenv("BENCHMARK_INGEST_SOURCE_CONNSTR")), + "PGCOPYDB_TARGET_PGURI": cast(str, os.getenv("BENCHMARK_INGEST_TARGET_CONNSTR")), + "PGOPTIONS": "-c maintenance_work_mem=8388608 -c max_parallel_maintenance_workers=7", + } + # Combine the current environment with custom variables + env = os.environ.copy() + env.update(custom_env_vars) + + with log_file_path.open("w") as log_file: + process = subprocess.Popen( + command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, env=env + ) + + assert process.stdout is not None, "process.stdout should not be None" + + # Stream output to both log file and console + for line in process.stdout: + print(line, end="") # Stream to GitHub Actions log + sys.stdout.flush() + log_file.write(line) # Write to log file + + process.wait() # Wait for the process to finish + if process.returncode != 0: + raise 
subprocess.CalledProcessError(process.returncode, command) + + +def parse_log_and_report_metrics( + zenbenchmark: NeonBenchmarker, log_file_path: Path, backpressure_time_diff: float +): + """Parses the pgcopydb log file for performance metrics and reports them to the database.""" + metrics = {"backpressure_time": backpressure_time_diff} + + # Define regex patterns to capture metrics + metric_patterns = { + "COPY_INDEX_CONSTRAINTS_VACUUM": re.compile( + r"COPY, INDEX, CONSTRAINTS, VACUUM \(wall clock\).*" + ), + "COPY_CUMULATIVE": re.compile(r"COPY \(cumulative\).*"), + "CREATE_INDEX_CUMULATIVE": re.compile(r"CREATE INDEX \(cumulative\).*"), + "CONSTRAINTS_CUMULATIVE": re.compile(r"CONSTRAINTS \(cumulative\).*"), + "FINALIZE_SCHEMA": re.compile(r"Finalize Schema.*"), + "TOTAL_DURATION": re.compile(r"Total Wall Clock Duration.*"), + } + + # Parse log file + with log_file_path.open("r") as log_file: + for line in log_file: + for metric_name, pattern in metric_patterns.items(): + if pattern.search(line): + # Extract duration and convert it to seconds + duration_match = re.search(r"\d+h\d+m|\d+s|\d+ms|\d+\.\d+s", line) + if duration_match: + duration_str = duration_match.group(0) + parts = re.findall(r"\d+[a-zA-Z]+", duration_str) + rust_like_humantime = " ".join(parts) + duration_seconds = humantime_to_ms(rust_like_humantime) / 1000.0 + metrics[metric_name] = duration_seconds + + endpoint_id = {"endpoint_id": get_endpoint_id()} + for metric_name, duration_seconds in metrics.items(): + zenbenchmark.record( + metric_name, duration_seconds, "s", MetricReport.LOWER_IS_BETTER, endpoint_id + ) + + +def get_endpoint_id(): + """Extracts and returns the first segment of the hostname from the PostgreSQL URI stored in BENCHMARK_INGEST_TARGET_CONNSTR.""" + connstr = os.getenv("BENCHMARK_INGEST_TARGET_CONNSTR") + if connstr is None: + raise OSError("BENCHMARK_INGEST_TARGET_CONNSTR environment variable is not set.") + + # Parse the URI + parsed_url = urlparse(connstr) + + # Extract 
the hostname and split to get the first segment + hostname = parsed_url.hostname + if hostname is None: + raise ValueError("Unable to parse hostname from BENCHMARK_INGEST_TARGET_CONNSTR") + + # Split the hostname by dots and take the first segment + endpoint_id = hostname.split(".")[0] + + return endpoint_id + + +@pytest.fixture() # must be function scoped because test_output_dir is function scoped +def log_file_path(test_output_dir): + """Fixture to provide a temporary log file path.""" + if not os.getenv("TARGET_PROJECT_TYPE"): + raise OSError("Required environment variable 'TARGET_PROJECT_TYPE' is not set.") + return (test_output_dir / os.getenv("TARGET_PROJECT_TYPE")).with_suffix(".log") + + +@pytest.mark.remote_cluster +def test_ingest_performance_using_pgcopydb( + zenbenchmark: NeonBenchmarker, + log_file_path: Path, + pgcopydb_filter_file: Path, + test_output_dir: Path, +): + """ + Simulate project migration from another PostgreSQL provider to Neon. + + Measure performance for Neon ingest steps + - COPY + - CREATE INDEX + - CREATE CONSTRAINT + - VACUUM ANALYZE + - create foreign keys + + Use pgcopydb to copy data from the source database to the destination database. 
+ """ + # Set up environment and create filter file + setup_environment() + + # Get backpressure time before ingest + backpressure_time_before = get_backpressure_time(os.getenv("BENCHMARK_INGEST_TARGET_CONNSTR")) + + # Build and run the pgcopydb command + command = build_pgcopydb_command(pgcopydb_filter_file, test_output_dir) + try: + run_command_and_log_output(command, log_file_path) + except subprocess.CalledProcessError as e: + pytest.fail(f"pgcopydb command failed with error: {e}") + + # Get backpressure time after ingest and calculate the difference + backpressure_time_after = get_backpressure_time(os.getenv("BENCHMARK_INGEST_TARGET_CONNSTR")) + backpressure_time_diff = backpressure_time_after - backpressure_time_before + + # Parse log file and report metrics, including backpressure time difference + parse_log_and_report_metrics(zenbenchmark, log_file_path, backpressure_time_diff) diff --git a/test_runner/performance/test_physical_replication.py b/test_runner/performance/test_physical_replication.py index d56f6dce09..38b04b9114 100644 --- a/test_runner/performance/test_physical_replication.py +++ b/test_runner/performance/test_physical_replication.py @@ -18,7 +18,7 @@ from fixtures.neon_api import connection_parameters_to_env from fixtures.pg_version import PgVersion if TYPE_CHECKING: - from typing import Any, Optional + from typing import Any from fixtures.benchmark_fixture import NeonBenchmarker from fixtures.neon_api import NeonAPI @@ -247,7 +247,7 @@ def test_replication_start_stop( ], env=master_env, ) - replica_pgbench: list[Optional[subprocess.Popen[Any]]] = [None for _ in range(num_replicas)] + replica_pgbench: list[subprocess.Popen[Any] | None] = [None] * num_replicas # Use the bits of iconfig to tell us which configuration we are on. 
For example # a iconfig of 2 is 10 in binary, indicating replica 0 is suspended and replica 1 is diff --git a/test_runner/performance/test_storage_controller_scale.py b/test_runner/performance/test_storage_controller_scale.py index d2eba751f8..142bd3d669 100644 --- a/test_runner/performance/test_storage_controller_scale.py +++ b/test_runner/performance/test_storage_controller_scale.py @@ -4,7 +4,7 @@ import concurrent.futures import random import time from collections import defaultdict -from enum import Enum +from enum import StrEnum import pytest from fixtures.common_types import TenantId, TenantShardId, TimelineArchivalState, TimelineId @@ -16,7 +16,7 @@ from fixtures.neon_fixtures import ( PageserverAvailability, PageserverSchedulingPolicy, ) -from fixtures.pageserver.http import PageserverHttpClient +from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient from fixtures.pg_version import PgVersion @@ -139,7 +139,7 @@ def test_storage_controller_many_tenants( tenant_timelines_count = 100 # These lists are maintained for use with rng.choice - tenants_with_timelines = list(rng.sample(tenants.keys(), tenant_timelines_count)) + tenants_with_timelines = list(rng.sample(list(tenants.keys()), tenant_timelines_count)) tenants_without_timelines = list( tenant_id for tenant_id in tenants if tenant_id not in tenants_with_timelines ) @@ -171,7 +171,7 @@ def test_storage_controller_many_tenants( # start timing on test nodes if we aren't a bit careful. 
create_concurrency = 16 - class Operation(str, Enum): + class Operation(StrEnum): TIMELINE_OPS = "timeline_ops" SHARD_MIGRATE = "shard_migrate" TENANT_PASSTHROUGH = "tenant_passthrough" @@ -273,7 +273,17 @@ def test_storage_controller_many_tenants( archival_state = rng.choice( [TimelineArchivalState.ARCHIVED, TimelineArchivalState.UNARCHIVED] ) - virtual_ps_http.timeline_archival_config(tenant_id, timeline_id, archival_state) + try: + virtual_ps_http.timeline_archival_config(tenant_id, timeline_id, archival_state) + except PageserverApiException as e: + if e.status_code == 404: + # FIXME: there is an edge case where timeline ops can encounter a 404 during + # a very short time window between generating a new generation number and + # attaching this tenant to its new pageserver. + # See https://github.com/neondatabase/neon/issues/9471 + pass + else: + raise # Generate a mixture of operations and dispatch them all concurrently futs = [] diff --git a/test_runner/performance/test_wal_backpressure.py b/test_runner/performance/test_wal_backpressure.py index 576a4f0467..c6d795ce4d 100644 --- a/test_runner/performance/test_wal_backpressure.py +++ b/test_runner/performance/test_wal_backpressure.py @@ -17,7 +17,8 @@ from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, flush_ep_to_pageserver from performance.test_perf_pgbench import get_durations_matrix, get_scales_matrix if TYPE_CHECKING: - from typing import Any, Callable + from collections.abc import Callable + from typing import Any @pytest.fixture(params=["vanilla", "neon_off", "neon_on"]) diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index 7d19ba3b5d..5744c445f6 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -2,7 +2,6 @@ from __future__ import annotations from collections.abc import Generator from dataclasses import dataclass -from typing import Optional import pytest from 
fixtures.common_types import TenantId @@ -105,7 +104,7 @@ def test_null_config(negative_env: NegativeTests): @pytest.mark.parametrize("content_type", [None, "application/json"]) -def test_empty_config(positive_env: NeonEnv, content_type: Optional[str]): +def test_empty_config(positive_env: NeonEnv, content_type: str | None): """ When the 'config' body attribute is omitted, the request should be accepted and the tenant should use the default configuration diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index 370df3c379..f71e05924a 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -1,9 +1,8 @@ from __future__ import annotations -import enum import json import time -from typing import TYPE_CHECKING +from enum import StrEnum import pytest from fixtures.log_helper import log @@ -15,10 +14,6 @@ from fixtures.pageserver.http import PageserverApiException from fixtures.utils import skip_in_debug_build, wait_until from fixtures.workload import Workload -if TYPE_CHECKING: - from typing import Optional - - AGGRESIVE_COMPACTION_TENANT_CONF = { # Disable gc and compaction. The test runs compaction manually. 
"gc_period": "0s", @@ -116,16 +111,64 @@ page_cache_size=10 assert vectored_average < 8 +def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start(initial_tenant_conf=AGGRESIVE_COMPACTION_TENANT_CONF) + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + row_count = 1000 + churn_rounds = 10 + + ps_http = env.pageserver.http_client() + + workload = Workload(env, tenant_id, timeline_id) + workload.init(env.pageserver.id) + + log.info("Writing initial data ...") + workload.write_rows(row_count, env.pageserver.id) + + for i in range(1, churn_rounds + 1): + if i % 10 == 0: + log.info(f"Running churn round {i}/{churn_rounds} ...") + + workload.churn_rows(row_count, env.pageserver.id) + # Force L0 compaction to ensure the number of layers is within bounds, so that gc-compaction can run. + ps_http.timeline_compact(tenant_id, timeline_id, force_l0_compaction=True) + assert ps_http.perf_info(tenant_id, timeline_id)[0]["num_of_l0"] <= 1 + ps_http.timeline_compact( + tenant_id, + timeline_id, + enhanced_gc_bottom_most_compaction=True, + body={ + "start": "000000000000000000000000000000000000", + "end": "030000000000000000000000000000000000", + }, + ) + + log.info("Validating at workload end ...") + workload.validate(env.pageserver.id) + + # Stripe sizes in number of pages. 
TINY_STRIPES = 16 LARGE_STRIPES = 32768 @pytest.mark.parametrize( - "shard_count,stripe_size", [(None, None), (4, TINY_STRIPES), (4, LARGE_STRIPES)] + "shard_count,stripe_size,gc_compaction", + [ + (None, None, False), + (4, TINY_STRIPES, False), + (4, LARGE_STRIPES, False), + (4, LARGE_STRIPES, True), + ], ) def test_sharding_compaction( - neon_env_builder: NeonEnvBuilder, stripe_size: int, shard_count: Optional[int] + neon_env_builder: NeonEnvBuilder, + stripe_size: int, + shard_count: int | None, + gc_compaction: bool, ): """ Use small stripes, small layers, and small compaction thresholds to exercise how compaction @@ -217,8 +260,19 @@ def test_sharding_compaction( # Assert that everything is still readable workload.validate() + if gc_compaction: + # trigger gc compaction to get more coverage for that, piggyback on the existing workload + for shard in env.storage_controller.locate(tenant_id): + pageserver = env.get_pageserver(shard["node_id"]) + tenant_shard_id = shard["shard_id"] + pageserver.http_client().timeline_compact( + tenant_shard_id, + timeline_id, + enhanced_gc_bottom_most_compaction=True, + ) -class CompactionAlgorithm(str, enum.Enum): + +class CompactionAlgorithm(StrEnum): LEGACY = "legacy" TIERED = "tiered" diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 96ba3dd5a4..ba7305148f 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -7,7 +7,6 @@ import subprocess import tempfile from dataclasses import dataclass from pathlib import Path -from typing import TYPE_CHECKING import fixtures.utils import pytest @@ -28,10 +27,6 @@ from fixtures.pg_version import PgVersion from fixtures.remote_storage import RemoteStorageKind, S3Storage, s3_storage from fixtures.workload import Workload -if TYPE_CHECKING: - from typing import Optional - - # # A test suite that help to prevent unintentionally breaking backward or forward compatibility between Neon 
releases. # - `test_create_snapshot` a script wrapped in a test that creates a data snapshot. @@ -385,7 +380,7 @@ def check_neon_works(env: NeonEnv, test_output_dir: Path, sql_dump_path: Path, r def dump_differs( - first: Path, second: Path, output: Path, allowed_diffs: Optional[list[str]] = None + first: Path, second: Path, output: Path, allowed_diffs: list[str] | None = None ) -> bool: """ Runs diff(1) command on two SQL dumps and write the output to the given output file. diff --git a/test_runner/regress/test_compute_metrics.py b/test_runner/regress/test_compute_metrics.py index c5e3034591..1b15c5f15e 100644 --- a/test_runner/regress/test_compute_metrics.py +++ b/test_runner/regress/test_compute_metrics.py @@ -3,6 +3,7 @@ from __future__ import annotations import enum import os import shutil +from enum import StrEnum from pathlib import Path from typing import TYPE_CHECKING, cast @@ -16,7 +17,7 @@ from fixtures.paths import BASE_DIR, COMPUTE_CONFIG_DIR if TYPE_CHECKING: from types import TracebackType - from typing import Optional, TypedDict, Union + from typing import Self, TypedDict from fixtures.neon_fixtures import NeonEnv from fixtures.pg_version import PgVersion @@ -26,15 +27,15 @@ if TYPE_CHECKING: metric_name: str type: str help: str - key_labels: Optional[list[str]] - values: Optional[list[str]] - query: Optional[str] - query_ref: Optional[str] + key_labels: list[str] | None + values: list[str] | None + query: str | None + query_ref: str | None class Collector(TypedDict): collector_name: str metrics: list[Metric] - queries: Optional[list[Query]] + queries: list[Query] | None class Query(TypedDict): query_name: str @@ -53,12 +54,12 @@ def __import_callback(dir: str, rel: str) -> tuple[str, bytes]: if not rel: raise RuntimeError("Empty filename") - full_path: Optional[str] = None + full_path: str | None = None if os.path.isabs(rel): full_path = rel else: for p in (dir, *JSONNET_PATH): - assert isinstance(p, (str, Path)), "for mypy" + assert isinstance(p, 
str | Path), "for mypy" full_path = os.path.join(p, rel) assert isinstance(full_path, str), "for mypy" @@ -82,9 +83,9 @@ def __import_callback(dir: str, rel: str) -> tuple[str, bytes]: def jsonnet_evaluate_file( - jsonnet_file: Union[str, Path], - ext_vars: Optional[Union[str, dict[str, str]]] = None, - tla_vars: Optional[Union[str, dict[str, str]]] = None, + jsonnet_file: str | Path, + ext_vars: str | dict[str, str] | None = None, + tla_vars: str | dict[str, str] | None = None, ) -> str: return cast( "str", @@ -102,7 +103,7 @@ def evaluate_collector(jsonnet_file: Path, pg_version: PgVersion) -> str: def evaluate_config( - jsonnet_file: Path, collector_name: str, collector_file: Union[str, Path], connstr: str + jsonnet_file: Path, collector_name: str, collector_file: str | Path, connstr: str ) -> str: return jsonnet_evaluate_file( jsonnet_file, @@ -115,7 +116,7 @@ def evaluate_config( @enum.unique -class SqlExporterProcess(str, enum.Enum): +class SqlExporterProcess(StrEnum): COMPUTE = "compute" AUTOSCALING = "autoscaling" @@ -184,16 +185,16 @@ class SqlExporterRunner: def stop(self) -> None: raise NotImplementedError() - def __enter__(self) -> SqlExporterRunner: + def __enter__(self) -> Self: self.start() return self def __exit__( self, - exc_type: Optional[type[BaseException]], - exc: Optional[BaseException], - tb: Optional[TracebackType], + exc_type: type[BaseException] | None, + exc: BaseException | None, + tb: TracebackType | None, ): self.stop() @@ -241,8 +242,7 @@ if SQL_EXPORTER is None: self.with_volume_mapping(str(config_file), container_config_file, "z") self.with_volume_mapping(str(collector_file), container_collector_file, "z") - @override - def start(self) -> SqlExporterContainer: + def start(self) -> Self: super().start() log.info("Waiting for sql_exporter to be ready") diff --git a/test_runner/regress/test_ddl_forwarding.py b/test_runner/regress/test_ddl_forwarding.py index e517e83e6f..1c5554c379 100644 --- 
a/test_runner/regress/test_ddl_forwarding.py +++ b/test_runner/regress/test_ddl_forwarding.py @@ -13,7 +13,7 @@ from werkzeug.wrappers.request import Request from werkzeug.wrappers.response import Response if TYPE_CHECKING: - from typing import Any, Optional + from typing import Any, Self def handle_db(dbs, roles, operation): @@ -91,15 +91,15 @@ class DdlForwardingContext: lambda request: ddl_forward_handler(request, self.dbs, self.roles, self) ) - def __enter__(self): + def __enter__(self) -> Self: self.pg.start() return self def __exit__( self, - exc_type: Optional[type[BaseException]], - exc: Optional[BaseException], - tb: Optional[TracebackType], + exc_type: type[BaseException] | None, + exc: BaseException | None, + tb: TracebackType | None, ): self.pg.stop() diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index c8d3b2ff3e..1807511008 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -5,6 +5,7 @@ import time from collections import Counter from collections.abc import Iterable from dataclasses import dataclass +from enum import StrEnum from typing import TYPE_CHECKING import pytest @@ -80,7 +81,7 @@ def test_min_resident_size_override_handling( @enum.unique -class EvictionOrder(str, enum.Enum): +class EvictionOrder(StrEnum): RELATIVE_ORDER_EQUAL = "relative_equal" RELATIVE_ORDER_SPARE = "relative_spare" diff --git a/test_runner/regress/test_import_pgdata.py b/test_runner/regress/test_import_pgdata.py new file mode 100644 index 0000000000..29229b73c1 --- /dev/null +++ b/test_runner/regress/test_import_pgdata.py @@ -0,0 +1,307 @@ +import json +import re +import time +from enum import Enum + +import psycopg2 +import psycopg2.errors +import pytest +from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder, VanillaPostgres +from 
fixtures.pageserver.http import ( + ImportPgdataIdemptencyKey, + PageserverApiException, +) +from fixtures.pg_version import PgVersion +from fixtures.remote_storage import RemoteStorageKind +from fixtures.utils import run_only_on_postgres +from pytest_httpserver import HTTPServer +from werkzeug.wrappers.request import Request +from werkzeug.wrappers.response import Response + +num_rows = 1000 + + +class RelBlockSize(Enum): + ONE_STRIPE_SIZE = 1 + TWO_STRPES_PER_SHARD = 2 + MULTIPLE_RELATION_SEGMENTS = 3 + + +smoke_params = [ + # unsharded (the stripe size needs to be given for rel block size calculations) + *[(None, 1024, s) for s in RelBlockSize], + # many shards, small stripe size to speed up test + *[(8, 1024, s) for s in RelBlockSize], +] + + +@run_only_on_postgres( + [PgVersion.V14, PgVersion.V15, PgVersion.V16], + "newer control file catalog version and struct format isn't supported", +) +@pytest.mark.parametrize("shard_count,stripe_size,rel_block_size", smoke_params) +def test_pgdata_import_smoke( + vanilla_pg: VanillaPostgres, + neon_env_builder: NeonEnvBuilder, + shard_count: int | None, + stripe_size: int, + rel_block_size: RelBlockSize, + make_httpserver: HTTPServer, +): + # + # Setup fake control plane for import progress + # + def handler(request: Request) -> Response: + log.info(f"control plane request: {request.json}") + return Response(json.dumps({}), status=200) + + cplane_mgmt_api_server = make_httpserver + cplane_mgmt_api_server.expect_request(re.compile(".*")).respond_with_handler(handler) + + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) + env = neon_env_builder.init_start() + + env.pageserver.patch_config_toml_nonrecursive( + { + "import_pgdata_upcall_api": f"http://{cplane_mgmt_api_server.host}:{cplane_mgmt_api_server.port}/path/to/mgmt/api" + } + ) + env.pageserver.stop() + env.pageserver.start() + + # + # Put data in vanilla pg + # + + vanilla_pg.start() + vanilla_pg.safe_psql("create user cloud_admin with 
password 'postgres' superuser") + + log.info("create relblock data") + if rel_block_size == RelBlockSize.ONE_STRIPE_SIZE: + target_relblock_size = stripe_size * 8192 + elif rel_block_size == RelBlockSize.TWO_STRPES_PER_SHARD: + target_relblock_size = (shard_count or 1) * stripe_size * 8192 * 2 + elif rel_block_size == RelBlockSize.MULTIPLE_RELATION_SEGMENTS: + target_relblock_size = int(((2.333 * 1024 * 1024 * 1024) // 8192) * 8192) + else: + raise ValueError + + # fillfactor so we don't need to produce that much data + # 900 byte per row is > 10% => 1 row per page + vanilla_pg.safe_psql("""create table t (data char(900)) with (fillfactor = 10)""") + + nrows = 0 + while True: + relblock_size = vanilla_pg.safe_psql_scalar("select pg_relation_size('t')") + log.info( + f"relblock size: {relblock_size/8192} pages (target: {target_relblock_size//8192}) pages" + ) + if relblock_size >= target_relblock_size: + break + addrows = int((target_relblock_size - relblock_size) // 8192) + assert addrows >= 1, "forward progress" + vanilla_pg.safe_psql(f"insert into t select generate_series({nrows+1}, {nrows + addrows})") + nrows += addrows + expect_nrows = nrows + expect_sum = ( + (nrows) * (nrows + 1) // 2 + ) # https://stackoverflow.com/questions/43901484/sum-of-the-integers-from-1-to-n + + def validate_vanilla_equivalence(ep): + # TODO: would be nicer to just compare pgdump + assert ep.safe_psql("select count(*), sum(data::bigint)::bigint from t") == [ + (expect_nrows, expect_sum) + ] + + validate_vanilla_equivalence(vanilla_pg) + + vanilla_pg.stop() + + # + # We have a Postgres data directory now. + # Make a localfs remote storage that looks like how after `fast_import` ran. 
+ # TODO: actually exercise fast_import here + # TODO: test s3 remote storage + # + importbucket = neon_env_builder.repo_dir / "importbucket" + importbucket.mkdir() + # what cplane writes before scheduling fast_import + specpath = importbucket / "spec.json" + specpath.write_text(json.dumps({"branch_id": "somebranch", "project_id": "someproject"})) + # what fast_import writes + vanilla_pg.pgdatadir.rename(importbucket / "pgdata") + statusdir = importbucket / "status" + statusdir.mkdir() + (statusdir / "pgdata").write_text(json.dumps({"done": True})) + + # + # Do the import + # + + tenant_id = TenantId.generate() + env.storage_controller.tenant_create( + tenant_id, shard_count=shard_count, shard_stripe_size=stripe_size + ) + + timeline_id = TimelineId.generate() + log.info("starting import") + start = time.monotonic() + + idempotency = ImportPgdataIdemptencyKey.random() + log.info(f"idempotency key {idempotency}") + # TODO: teach neon_local CLI about the idempotency & 429 error so we can run inside the loop + # and check for 429 + + import_branch_name = "imported" + env.storage_controller.timeline_create( + tenant_id, + { + "new_timeline_id": str(timeline_id), + "import_pgdata": { + "idempotency_key": str(idempotency), + "location": {"LocalFs": {"path": str(importbucket.absolute())}}, + }, + }, + ) + env.neon_cli.mappings_map_branch(import_branch_name, tenant_id, timeline_id) + + while True: + locations = env.storage_controller.locate(tenant_id) + active_count = 0 + for location in locations: + shard_id = TenantShardId.parse(location["shard_id"]) + ps = env.get_pageserver(location["node_id"]) + try: + detail = ps.http_client().timeline_detail(shard_id, timeline_id) + state = detail["state"] + log.info(f"shard {shard_id} state: {state}") + if state == "Active": + active_count += 1 + except PageserverApiException as e: + if e.status_code == 404: + log.info("not found, import is in progress") + continue + elif e.status_code == 429: + log.info("import is in progress") + 
continue + else: + raise + + shard_status_file = statusdir / f"shard-{shard_id.shard_index}" + if state == "Active": + shard_status_file_contents = ( + shard_status_file.read_text() + ) # Active state implies import is done + shard_status = json.loads(shard_status_file_contents) + assert shard_status["done"] is True + + if active_count == len(locations): + log.info("all shards are active") + break + time.sleep(1) + + import_duration = time.monotonic() - start + log.info(f"import complete; duration={import_duration:.2f}s") + + # + # Get some timeline details for later. + # + locations = env.storage_controller.locate(tenant_id) + [shard_zero] = [ + loc for loc in locations if TenantShardId.parse(loc["shard_id"]).shard_number == 0 + ] + shard_zero_ps = env.get_pageserver(shard_zero["node_id"]) + shard_zero_http = shard_zero_ps.http_client() + shard_zero_timeline_info = shard_zero_http.timeline_detail(shard_zero["shard_id"], timeline_id) + initdb_lsn = Lsn(shard_zero_timeline_info["initdb_lsn"]) + latest_gc_cutoff_lsn = Lsn(shard_zero_timeline_info["latest_gc_cutoff_lsn"]) + last_record_lsn = Lsn(shard_zero_timeline_info["last_record_lsn"]) + disk_consistent_lsn = Lsn(shard_zero_timeline_info["disk_consistent_lsn"]) + _remote_consistent_lsn = Lsn(shard_zero_timeline_info["remote_consistent_lsn"]) + remote_consistent_lsn_visible = Lsn(shard_zero_timeline_info["remote_consistent_lsn_visible"]) + # assert remote_consistent_lsn_visible == remote_consistent_lsn TODO: this fails initially and after restart, presumably because `UploadQueue::clean.1` is still `None` + assert remote_consistent_lsn_visible == disk_consistent_lsn + assert initdb_lsn == latest_gc_cutoff_lsn + assert disk_consistent_lsn == initdb_lsn + 8 + assert last_record_lsn == disk_consistent_lsn + # TODO: assert these values are the same everywhere + + # + # Validate the resulting remote storage state. 
+ # + + # + # Validate the imported data + # + + ro_endpoint = env.endpoints.create_start( + branch_name=import_branch_name, endpoint_id="ro", tenant_id=tenant_id, lsn=last_record_lsn + ) + + validate_vanilla_equivalence(ro_endpoint) + + # ensure the import survives restarts + ro_endpoint.stop() + env.pageserver.stop(immediate=True) + env.pageserver.start() + ro_endpoint.start() + validate_vanilla_equivalence(ro_endpoint) + + # + # validate the layer files in each shard only have the shard-specific data + # (the implementation would be functional but not efficient without this characteristic) + # + + shards = env.storage_controller.locate(tenant_id) + for shard in shards: + shard_ps = env.get_pageserver(shard["node_id"]) + result = shard_ps.timeline_scan_no_disposable_keys(shard["shard_id"], timeline_id) + assert result.tally.disposable_count == 0 + assert ( + result.tally.not_disposable_count > 0 + ), "sanity check, each shard should have some data" + + # + # validate that we can write + # + rw_endpoint = env.endpoints.create_start( + branch_name=import_branch_name, endpoint_id="rw", tenant_id=tenant_id + ) + rw_endpoint.safe_psql("create table othertable(values text)") + rw_lsn = Lsn(rw_endpoint.safe_psql_scalar("select pg_current_wal_flush_lsn()")) + + # TODO: consider using `class Workload` here + # to do compaction and whatnot? + + # + # validate that we can branch (important use case) + # + + # ... at the tip + _ = env.create_branch( + new_branch_name="br-tip", + ancestor_branch_name=import_branch_name, + tenant_id=tenant_id, + ancestor_start_lsn=rw_lsn, + ) + br_tip_endpoint = env.endpoints.create_start( + branch_name="br-tip", endpoint_id="br-tip-ro", tenant_id=tenant_id + ) + validate_vanilla_equivalence(br_tip_endpoint) + br_tip_endpoint.safe_psql("select * from othertable") + + # ... 
at the initdb lsn + _ = env.create_branch( + new_branch_name="br-initdb", + ancestor_branch_name=import_branch_name, + tenant_id=tenant_id, + ancestor_start_lsn=initdb_lsn, + ) + br_initdb_endpoint = env.endpoints.create_start( + branch_name="br-initdb", endpoint_id="br-initdb-ro", tenant_id=tenant_id + ) + validate_vanilla_equivalence(br_initdb_endpoint) + with pytest.raises(psycopg2.errors.UndefinedTable): + br_initdb_endpoint.safe_psql("select * from othertable") diff --git a/test_runner/regress/test_ingestion_layer_size.py b/test_runner/regress/test_ingestion_layer_size.py index 2916748925..9c9bc5b519 100644 --- a/test_runner/regress/test_ingestion_layer_size.py +++ b/test_runner/regress/test_ingestion_layer_size.py @@ -2,16 +2,12 @@ from __future__ import annotations from collections.abc import Iterable from dataclasses import dataclass -from typing import TYPE_CHECKING from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_last_flush_lsn from fixtures.pageserver.http import HistoricLayerInfo, LayerMapInfo from fixtures.utils import human_bytes, skip_in_debug_build -if TYPE_CHECKING: - from typing import Union - @skip_in_debug_build("debug run is unnecessarily slow") def test_ingesting_large_batches_of_images(neon_env_builder: NeonEnvBuilder): @@ -109,14 +105,12 @@ def test_ingesting_large_batches_of_images(neon_env_builder: NeonEnvBuilder): @dataclass class Histogram: - buckets: list[Union[int, float]] + buckets: list[int | float] counts: list[int] sums: list[int] -def histogram_historic_layers( - infos: LayerMapInfo, minimum_sizes: list[Union[int, float]] -) -> Histogram: +def histogram_historic_layers(infos: LayerMapInfo, minimum_sizes: list[int | float]) -> Histogram: def log_layer(layer: HistoricLayerInfo) -> HistoricLayerInfo: log.info( f"{layer.layer_file_name} {human_bytes(layer.layer_file_size)} ({layer.layer_file_size} bytes)" @@ -128,7 +122,7 @@ def histogram_historic_layers( return histogram(sizes, 
minimum_sizes) -def histogram(sizes: Iterable[int], minimum_sizes: list[Union[int, float]]) -> Histogram: +def histogram(sizes: Iterable[int], minimum_sizes: list[int | float]) -> Histogram: assert all(minimum_sizes[i] < minimum_sizes[i + 1] for i in range(len(minimum_sizes) - 1)) buckets = list(enumerate(minimum_sizes)) counts = [0 for _ in buckets] diff --git a/test_runner/regress/test_installed_extensions.py b/test_runner/regress/test_installed_extensions.py index 54ce7c8340..04ccec5875 100644 --- a/test_runner/regress/test_installed_extensions.py +++ b/test_runner/regress/test_installed_extensions.py @@ -99,11 +99,15 @@ def test_installed_extensions(neon_simple_env: NeonEnv): res = client.metrics() info("Metrics: %s", res) m = parse_metrics(res) - neon_m = m.query_all("installed_extensions", {"extension_name": "neon", "version": "1.2"}) + neon_m = m.query_all( + "compute_installed_extensions", {"extension_name": "neon", "version": "1.2"} + ) assert len(neon_m) == 1 for sample in neon_m: assert sample.value == 2 - neon_m = m.query_all("installed_extensions", {"extension_name": "neon", "version": "1.3"}) + neon_m = m.query_all( + "compute_installed_extensions", {"extension_name": "neon", "version": "1.3"} + ) assert len(neon_m) == 1 for sample in neon_m: assert sample.value == 1 @@ -116,7 +120,7 @@ def test_installed_extensions(neon_simple_env: NeonEnv): try: res = client.metrics() timeout = -1 - if len(parse_metrics(res).query_all("installed_extensions")) < 4: + if len(parse_metrics(res).query_all("compute_installed_extensions")) < 4: # Assume that not all metrics that are collected yet time.sleep(1) timeout -= 1 @@ -128,17 +132,21 @@ def test_installed_extensions(neon_simple_env: NeonEnv): continue assert ( - len(parse_metrics(res).query_all("installed_extensions")) >= 4 + len(parse_metrics(res).query_all("compute_installed_extensions")) >= 4 ), "Not all metrics are collected" info("After restart metrics: %s", res) m = parse_metrics(res) - neon_m = 
m.query_all("installed_extensions", {"extension_name": "neon", "version": "1.2"}) + neon_m = m.query_all( + "compute_installed_extensions", {"extension_name": "neon", "version": "1.2"} + ) assert len(neon_m) == 1 for sample in neon_m: assert sample.value == 1 - neon_m = m.query_all("installed_extensions", {"extension_name": "neon", "version": "1.3"}) + neon_m = m.query_all( + "compute_installed_extensions", {"extension_name": "neon", "version": "1.3"} + ) assert len(neon_m) == 1 for sample in neon_m: assert sample.value == 1 diff --git a/test_runner/regress/test_layers_from_future.py b/test_runner/regress/test_layers_from_future.py index 309e0f3015..761ec7568f 100644 --- a/test_runner/regress/test_layers_from_future.py +++ b/test_runner/regress/test_layers_from_future.py @@ -2,6 +2,7 @@ from __future__ import annotations import time +import pytest from fixtures.common_types import Lsn from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder, flush_ep_to_pageserver @@ -19,7 +20,11 @@ from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind from fixtures.utils import query_scalar, wait_until -def test_issue_5878(neon_env_builder: NeonEnvBuilder): +@pytest.mark.parametrize( + "attach_mode", + ["default_generation", "same_generation"], +) +def test_issue_5878(neon_env_builder: NeonEnvBuilder, attach_mode: str): """ Regression test for issue https://github.com/neondatabase/neon/issues/5878 . 
@@ -168,11 +173,32 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder): tenant_conf = ps_http.tenant_config(tenant_id) generation_before_detach = get_generation_number() env.pageserver.tenant_detach(tenant_id) - failpoint_name = "before-delete-layer-pausable" + failpoint_deletion_queue = "deletion-queue-before-execute-pause" - ps_http.configure_failpoints((failpoint_name, "pause")) - env.pageserver.tenant_attach(tenant_id, tenant_conf.tenant_specific_overrides) - generation_after_reattach = get_generation_number() + ps_http.configure_failpoints((failpoint_deletion_queue, "pause")) + + if attach_mode == "default_generation": + env.pageserver.tenant_attach(tenant_id, tenant_conf.tenant_specific_overrides) + elif attach_mode == "same_generation": + # Attach with the same generation number -- this is possible with timeline offload and detach ancestor + env.pageserver.tenant_attach( + tenant_id, + tenant_conf.tenant_specific_overrides, + generation=generation_before_detach, + # We want to avoid the generation bump and don't want to talk with the storcon + override_storage_controller_generation=False, + ) + else: + raise AssertionError(f"Unknown attach_mode: {attach_mode}") + + # Get it from pageserver API instead of storcon API b/c we might not have attached using the storcon + # API if attach_mode == "same_generation" + tenant_location = env.pageserver.http_client().tenant_get_location(tenant_id) + generation_after_reattach = tenant_location["generation"] + + if attach_mode == "same_generation": + # The generation number should be the same as before the detach + assert generation_before_detach == generation_after_reattach wait_until_tenant_active(ps_http, tenant_id) # Ensure the IndexPart upload that unlinks the layer file finishes, i.e., doesn't clog the queue. 
@@ -182,15 +208,8 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder): wait_until(10, 0.5, future_layer_is_gone_from_index_part) - # NB: the layer file is unlinked index part now, but, because we made the delete - # operation stuck, the layer file itself is still in the remote_storage - wait_until( - 10, - 0.5, - lambda: env.pageserver.assert_log_contains( - f".*{tenant_id}.*at failpoint.*{failpoint_name}" - ), - ) + # We already make deletion stuck here, but we don't necessarily hit the failpoint + # because deletions are batched. future_layer_path = env.pageserver_remote_storage.remote_layer_path( tenant_id, timeline_id, future_layer.to_str(), generation=generation_before_detach ) @@ -224,11 +243,13 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder): break time.sleep(1) - # Window has passed, unstuck the delete, let upload queue drain. + # Window has passed, unstuck the delete, let deletion queue drain; the upload queue should + # have drained because we put these layer deletion operations into the deletion queue and + # have consumed the operation from the upload queue. log.info("unstuck the DELETE") - ps_http.configure_failpoints(("before-delete-layer-pausable", "off")) - + ps_http.configure_failpoints((failpoint_deletion_queue, "off")) wait_for_upload_queue_empty(ps_http, tenant_id, timeline_id) + env.pageserver.http_client().deletion_queue_flush(True) # Examine the resulting S3 state. log.info("integrity-check the remote storage") @@ -247,3 +268,12 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder): final_stat = future_layer_path.stat() log.info(f"future layer path: {future_layer_path}") assert final_stat.st_mtime != pre_stat.st_mtime + + # Ensure no weird errors in the end... 
+ wait_for_upload_queue_empty(ps_http, tenant_id, timeline_id) + + if attach_mode == "same_generation": + # we should have detected a race upload and deferred it + env.pageserver.assert_log_contains( + "waiting for deletion queue flush to complete before uploading layer" + ) diff --git a/test_runner/regress/test_lsn_mapping.py b/test_runner/regress/test_lsn_mapping.py index 8b41d0cb1c..7f0b541128 100644 --- a/test_runner/regress/test_lsn_mapping.py +++ b/test_runner/regress/test_lsn_mapping.py @@ -3,7 +3,7 @@ from __future__ import annotations import re import time from concurrent.futures import ThreadPoolExecutor -from datetime import datetime, timedelta, timezone +from datetime import UTC, datetime, timedelta import pytest from fixtures.common_types import Lsn @@ -207,7 +207,7 @@ def test_ts_of_lsn_api(neon_env_builder: NeonEnvBuilder): for i in range(1000): cur.execute("INSERT INTO foo VALUES(%s)", (i,)) # Get the timestamp at UTC - after_timestamp = query_scalar(cur, "SELECT clock_timestamp()").replace(tzinfo=timezone.utc) + after_timestamp = query_scalar(cur, "SELECT clock_timestamp()").replace(tzinfo=UTC) after_lsn = query_scalar(cur, "SELECT pg_current_wal_lsn()") tbl.append([i, after_timestamp, after_lsn]) time.sleep(0.02) @@ -273,11 +273,7 @@ def test_ts_of_lsn_api(neon_env_builder: NeonEnvBuilder): ) log.info("result: %s, after_ts: %s", result, after_timestamp) - # TODO use fromisoformat once we have Python 3.11+ - # which has https://github.com/python/cpython/pull/92177 - timestamp = datetime.strptime(result, "%Y-%m-%dT%H:%M:%S.%f000Z").replace( - tzinfo=timezone.utc - ) + timestamp = datetime.fromisoformat(result).replace(tzinfo=UTC) assert timestamp < after_timestamp, "after_timestamp after timestamp" if i > 1: before_timestamp = tbl[i - step_size][1] diff --git a/test_runner/regress/test_ondemand_slru_download.py b/test_runner/regress/test_ondemand_slru_download.py index 5eaba78331..f0f12290cc 100644 --- 
a/test_runner/regress/test_ondemand_slru_download.py +++ b/test_runner/regress/test_ondemand_slru_download.py @@ -1,7 +1,5 @@ from __future__ import annotations -from typing import Optional - import pytest from fixtures.common_types import Lsn from fixtures.log_helper import log @@ -13,7 +11,7 @@ from fixtures.utils import query_scalar # Test on-demand download of the pg_xact SLRUs # @pytest.mark.parametrize("shard_count", [None, 4]) -def test_ondemand_download_pg_xact(neon_env_builder: NeonEnvBuilder, shard_count: Optional[int]): +def test_ondemand_download_pg_xact(neon_env_builder: NeonEnvBuilder, shard_count: int | None): if shard_count is not None: neon_env_builder.num_pageservers = shard_count @@ -79,7 +77,7 @@ def test_ondemand_download_pg_xact(neon_env_builder: NeonEnvBuilder, shard_count @pytest.mark.parametrize("shard_count", [None, 4]) -def test_ondemand_download_replica(neon_env_builder: NeonEnvBuilder, shard_count: Optional[int]): +def test_ondemand_download_replica(neon_env_builder: NeonEnvBuilder, shard_count: int | None): if shard_count is not None: neon_env_builder.num_pageservers = shard_count diff --git a/test_runner/regress/test_ondemand_wal_download.py b/test_runner/regress/test_ondemand_wal_download.py new file mode 100644 index 0000000000..a7eb3e6625 --- /dev/null +++ b/test_runner/regress/test_ondemand_wal_download.py @@ -0,0 +1,27 @@ +from fixtures.neon_fixtures import NeonEnv + + +def test_on_demand_wal_download(neon_simple_env: NeonEnv): + env = neon_simple_env + ep = env.endpoints.create_start( + branch_name="main", + endpoint_id="primary", + config_lines=[ + "max_wal_size=32MB", + "min_wal_size=32MB", + "neon.logical_replication_max_snap_files=10000", + ], + ) + + con = ep.connect() + cur = con.cursor() + cur.execute("CREATE TABLE t(pk bigint primary key, payload text)") + cur.execute("ALTER TABLE t ALTER payload SET STORAGE external") + cur.execute("select pg_create_logical_replication_slot('myslot', 'test_decoding', false, true)") + 
cur.execute("insert into t values (generate_series(1,100000),repeat('?',10000))") + + ep.stop("fast") + ep.start() + con = ep.connect() + cur = con.cursor() + cur.execute("select pg_replication_slot_advance('myslot', pg_current_wal_insert_lsn())") diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py index d1b70b9ee6..05e81b82e0 100644 --- a/test_runner/regress/test_pageserver_api.py +++ b/test_runner/regress/test_pageserver_api.py @@ -1,7 +1,5 @@ from __future__ import annotations -from typing import Optional - from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.neon_fixtures import ( DEFAULT_BRANCH_NAME, @@ -82,7 +80,7 @@ def expect_updated_msg_lsn( client: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId, - prev_msg_lsn: Optional[Lsn], + prev_msg_lsn: Lsn | None, ) -> Lsn: timeline_details = client.timeline_detail(tenant_id, timeline_id=timeline_id) diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 4f59efb8b3..6ba5753420 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -11,11 +11,10 @@ of the pageserver are: from __future__ import annotations -import enum import os import re import time -from typing import TYPE_CHECKING +from enum import StrEnum import pytest from fixtures.common_types import TenantId, TimelineId @@ -41,10 +40,6 @@ from fixtures.remote_storage import ( from fixtures.utils import run_only_on_default_postgres, wait_until from fixtures.workload import Workload -if TYPE_CHECKING: - from typing import Optional - - # A tenant configuration that is convenient for generating uploads and deletions # without a large amount of postgres traffic. 
TENANT_CONF = { @@ -65,7 +60,7 @@ TENANT_CONF = { def read_all( - env: NeonEnv, tenant_id: Optional[TenantId] = None, timeline_id: Optional[TimelineId] = None + env: NeonEnv, tenant_id: TenantId | None = None, timeline_id: TimelineId | None = None ): if tenant_id is None: tenant_id = env.initial_tenant @@ -286,12 +281,12 @@ def test_deferred_deletion(neon_env_builder: NeonEnvBuilder): assert get_deletion_queue_unexpected_errors(ps_http) == 0 -class KeepAttachment(str, enum.Enum): +class KeepAttachment(StrEnum): KEEP = "keep" LOSE = "lose" -class ValidateBefore(str, enum.Enum): +class ValidateBefore(StrEnum): VALIDATE = "validate" NO_VALIDATE = "no-validate" @@ -464,7 +459,11 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): env.pageserver.start() # The pageserver should provide service to clients - generate_uploads_and_deletions(env, init=False, pageserver=env.pageserver) + # Because it is in emergency mode, it will not attempt to validate deletions required by the initial barrier, and therefore + # other files cannot be uploaded b/c it's waiting for the initial barrier to be validated. 
+ generate_uploads_and_deletions( + env, init=False, pageserver=env.pageserver, wait_until_uploaded=False + ) # The pageserver should neither validate nor execute any deletions, it should have # loaded the DeletionLists from before though diff --git a/test_runner/regress/test_pageserver_layer_rolling.py b/test_runner/regress/test_pageserver_layer_rolling.py index 200a323a3a..f6a7bfa1ad 100644 --- a/test_runner/regress/test_pageserver_layer_rolling.py +++ b/test_runner/regress/test_pageserver_layer_rolling.py @@ -2,7 +2,6 @@ from __future__ import annotations import asyncio import time -from typing import TYPE_CHECKING import psutil import pytest @@ -17,17 +16,13 @@ from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload from fixtures.utils import skip_in_debug_build, wait_until -if TYPE_CHECKING: - from typing import Optional - - TIMELINE_COUNT = 10 ENTRIES_PER_TIMELINE = 10_000 CHECKPOINT_TIMEOUT_SECONDS = 60 async def run_worker_for_tenant( - env: NeonEnv, entries: int, tenant: TenantId, offset: Optional[int] = None + env: NeonEnv, entries: int, tenant: TenantId, offset: int | None = None ) -> Lsn: if offset is None: offset = 0 @@ -136,7 +131,7 @@ def test_pageserver_small_inmemory_layers( wait_until_pageserver_is_caught_up(env, last_flush_lsns) # We didn't write enough data to trigger a size-based checkpoint: we should see dirty data. 
- wait_until(10, 1, lambda: assert_dirty_bytes_nonzero(env)) # type: ignore + wait_until(10, 1, lambda: assert_dirty_bytes_nonzero(env)) ps_http_client = env.pageserver.http_client() total_wal_ingested_before_restart = wait_for_wal_ingest_metric(ps_http_client) @@ -144,7 +139,7 @@ def test_pageserver_small_inmemory_layers( # Within ~ the checkpoint interval, all the ephemeral layers should be frozen and flushed, # such that there are zero bytes of ephemeral layer left on the pageserver log.info("Waiting for background checkpoints...") - wait_until(CHECKPOINT_TIMEOUT_SECONDS * 2, 1, lambda: assert_dirty_bytes(env, 0)) # type: ignore + wait_until(CHECKPOINT_TIMEOUT_SECONDS * 2, 1, lambda: assert_dirty_bytes(env, 0)) # Zero ephemeral layer bytes does not imply that all the frozen layers were uploaded: they # must be uploaded to remain visible to the pageserver after restart. @@ -185,7 +180,7 @@ def test_idle_checkpoints(neon_env_builder: NeonEnvBuilder): wait_until_pageserver_is_caught_up(env, last_flush_lsns) # We didn't write enough data to trigger a size-based checkpoint: we should see dirty data. - wait_until(10, 1, lambda: assert_dirty_bytes_nonzero(env)) # type: ignore + wait_until(10, 1, lambda: assert_dirty_bytes_nonzero(env)) # Stop the safekeepers, so that we cannot have any more WAL receiver connections for sk in env.safekeepers: @@ -198,7 +193,7 @@ def test_idle_checkpoints(neon_env_builder: NeonEnvBuilder): # Within ~ the checkpoint interval, all the ephemeral layers should be frozen and flushed, # such that there are zero bytes of ephemeral layer left on the pageserver log.info("Waiting for background checkpoints...") - wait_until(CHECKPOINT_TIMEOUT_SECONDS * 2, 1, lambda: assert_dirty_bytes(env, 0)) # type: ignore + wait_until(CHECKPOINT_TIMEOUT_SECONDS * 2, 1, lambda: assert_dirty_bytes(env, 0)) # The code below verifies that we do not flush on the first write # after an idle period longer than the checkpoint timeout. 
@@ -215,7 +210,7 @@ def test_idle_checkpoints(neon_env_builder: NeonEnvBuilder): run_worker_for_tenant(env, 5, tenant_with_extra_writes, offset=ENTRIES_PER_TIMELINE) ) - dirty_after_write = wait_until(10, 1, lambda: assert_dirty_bytes_nonzero(env)) # type: ignore + dirty_after_write = wait_until(10, 1, lambda: assert_dirty_bytes_nonzero(env)) # We shouldn't flush since we've just opened a new layer waited_for = 0 @@ -317,4 +312,4 @@ def test_total_size_limit(neon_env_builder: NeonEnvBuilder): dirty_bytes = get_dirty_bytes(env) assert dirty_bytes < max_dirty_data - wait_until(compaction_period_s * 2, 1, lambda: assert_dirty_data_limited()) # type: ignore + wait_until(compaction_period_s * 2, 1, lambda: assert_dirty_data_limited()) diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py index fb6050689c..4bf5705517 100644 --- a/test_runner/regress/test_pageserver_restart.py +++ b/test_runner/regress/test_pageserver_restart.py @@ -2,7 +2,6 @@ from __future__ import annotations import random from contextlib import closing -from typing import Optional import pytest from fixtures.log_helper import log @@ -156,7 +155,7 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): @pytest.mark.timeout(540) @pytest.mark.parametrize("shard_count", [None, 4]) @skip_in_debug_build("times out in debug builds") -def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder, shard_count: Optional[int]): +def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder, shard_count: int | None): # same rationale as with the immediate stop; we might leave orphan layers behind. 
neon_env_builder.disable_scrub_on_exit() neon_env_builder.enable_pageserver_remote_storage(s3_storage()) diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index d4aef96735..a264f4d3c9 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -23,7 +23,7 @@ from werkzeug.wrappers.request import Request from werkzeug.wrappers.response import Response if TYPE_CHECKING: - from typing import Any, Optional, Union + from typing import Any # A tenant configuration that is convenient for generating uploads and deletions @@ -199,7 +199,7 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, make_httpserver, # state if it was running attached with a stale generation last_state[pageserver.id] = ("Detached", None) else: - secondary_conf: Optional[dict[str, Any]] = None + secondary_conf: dict[str, Any] | None = None if mode == "Secondary": secondary_conf = {"warm": rng.choice([True, False])} @@ -365,6 +365,19 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder): workload.validate(pageserver_a.id) workload.validate(pageserver_b.id) + # Force compaction on destination pageserver + pageserver_b.http_client().timeline_compact(tenant_id, timeline_id, force_l0_compaction=True) + + # Destination pageserver is in AttachedMulti, it should have generated deletions but + # not enqueued them yet. + # Check deletion metrics via prometheus - should be 0 since we're in AttachedMulti + assert ( + pageserver_b.http_client().get_metric_value( + "pageserver_deletion_queue_submitted_total", + ) + == 0 + ) + # Revert the origin to secondary log.info("Setting origin to Secondary") pageserver_a.tenant_location_configure( @@ -389,6 +402,17 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder): }, ) + # Transition to AttachedSingle should have drained deletions generated by doing a compaction + # while in AttachedMulti. 
+ def blocked_deletions_drained(): + submitted = pageserver_b.http_client().get_metric_value( + "pageserver_deletion_queue_submitted_total" + ) + assert submitted is not None + assert submitted > 0 + + wait_until(10, 0.1, blocked_deletions_drained) + workload.churn_rows(64, pageserver_b.id) workload.validate(pageserver_b.id) del workload @@ -445,7 +469,7 @@ def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder): def list_elegible_layers( - pageserver, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId + pageserver, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId ) -> list[Path]: """ The subset of layer filenames that are elegible for secondary download: at time of writing this @@ -678,7 +702,7 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder): else: timeout = int(deadline - now) + 1 try: - wait_until(timeout, 1, lambda: pageserver.assert_log_contains(expression)) # type: ignore + wait_until(timeout, 1, lambda: pageserver.assert_log_contains(expression)) except: log.error(f"Timed out waiting for '{expression}'") raise diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index f4698191eb..2877f14e0e 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -21,8 +21,6 @@ from fixtures.remote_storage import s3_storage from fixtures.utils import skip_in_debug_build if TYPE_CHECKING: - from typing import Optional - from fixtures.neon_fixtures import PgBin from pytest import CaptureFixture @@ -48,7 +46,7 @@ def post_checks(env: NeonEnv, test_output_dir: Path, db_name: str, endpoint: End data properly. """ - ignored_files: Optional[list[str]] = None + ignored_files: list[str] | None = None # Neon handles unlogged relations in a special manner. During a # basebackup, we ship the init fork as the main fork. 
This presents a @@ -110,13 +108,15 @@ def post_checks(env: NeonEnv, test_output_dir: Path, db_name: str, endpoint: End check_restored_datadir_content(test_output_dir, env, endpoint, ignored_files=ignored_files) - # Ensure that compaction works, on a timeline containing all the diversity that postgres regression tests create. + # Ensure that compaction/GC works, on a timeline containing all the diversity that postgres regression tests create. # There should have been compactions mid-test as well, this final check is in addition those. for shard, pageserver in tenant_get_shards(env, env.initial_tenant): pageserver.http_client().timeline_checkpoint( shard, env.initial_timeline, force_repartition=True, force_image_layer_creation=True ) + pageserver.http_client().timeline_gc(shard, env.initial_timeline, None) + # Run the main PostgreSQL regression tests, in src/test/regress. # @@ -129,7 +129,7 @@ def test_pg_regress( capsys: CaptureFixture[str], base_dir: Path, pg_distrib_dir: Path, - shard_count: Optional[int], + shard_count: int | None, ): DBNAME = "regression" @@ -203,7 +203,7 @@ def test_isolation( capsys: CaptureFixture[str], base_dir: Path, pg_distrib_dir: Path, - shard_count: Optional[int], + shard_count: int | None, ): DBNAME = "isolation_regression" @@ -272,7 +272,7 @@ def test_sql_regress( capsys: CaptureFixture[str], base_dir: Path, pg_distrib_dir: Path, - shard_count: Optional[int], + shard_count: int | None, ): DBNAME = "regression" diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py index e59d46e352..5a01d90d85 100644 --- a/test_runner/regress/test_proxy.py +++ b/test_runner/regress/test_proxy.py @@ -13,7 +13,7 @@ import requests from fixtures.neon_fixtures import PSQL, NeonProxy, VanillaPostgres if TYPE_CHECKING: - from typing import Any, Optional + from typing import Any GET_CONNECTION_PID_QUERY = "SELECT pid FROM pg_stat_activity WHERE state = 'active'" @@ -228,7 +228,7 @@ def 
test_sql_over_http_serverless_driver(static_proxy: NeonProxy): def test_sql_over_http(static_proxy: NeonProxy): static_proxy.safe_psql("create role http with login password 'http' superuser") - def q(sql: str, params: Optional[list[Any]] = None) -> Any: + def q(sql: str, params: list[Any] | None = None) -> Any: params = params or [] connstr = f"postgresql://http:http@{static_proxy.domain}:{static_proxy.proxy_port}/postgres" response = requests.post( @@ -291,7 +291,7 @@ def test_sql_over_http_db_name_with_space(static_proxy: NeonProxy): ) ) - def q(sql: str, params: Optional[list[Any]] = None) -> Any: + def q(sql: str, params: list[Any] | None = None) -> Any: params = params or [] connstr = f"postgresql://http:http@{static_proxy.domain}:{static_proxy.proxy_port}/{urllib.parse.quote(db)}" response = requests.post( @@ -310,7 +310,7 @@ def test_sql_over_http_db_name_with_space(static_proxy: NeonProxy): def test_sql_over_http_output_options(static_proxy: NeonProxy): static_proxy.safe_psql("create role http2 with login password 'http2' superuser") - def q(sql: str, raw_text: bool, array_mode: bool, params: Optional[list[Any]] = None) -> Any: + def q(sql: str, raw_text: bool, array_mode: bool, params: list[Any] | None = None) -> Any: params = params or [] connstr = ( f"postgresql://http2:http2@{static_proxy.domain}:{static_proxy.proxy_port}/postgres" @@ -346,7 +346,7 @@ def test_sql_over_http_batch(static_proxy: NeonProxy): static_proxy.safe_psql("create role http with login password 'http' superuser") def qq( - queries: list[tuple[str, Optional[list[Any]]]], + queries: list[tuple[str, list[Any] | None]], read_only: bool = False, deferrable: bool = False, ) -> Any: diff --git a/test_runner/regress/test_readonly_node.py b/test_runner/regress/test_readonly_node.py index 826136d5f9..fcebf8d23a 100644 --- a/test_runner/regress/test_readonly_node.py +++ b/test_runner/regress/test_readonly_node.py @@ -1,7 +1,6 @@ from __future__ import annotations import time -from typing 
import Union import pytest from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId @@ -122,7 +121,6 @@ def test_readonly_node(neon_simple_env: NeonEnv): ) -@pytest.mark.skip("See https://github.com/neondatabase/neon/issues/9754") def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder): """ Test static endpoint is protected from GC by acquiring and renewing lsn leases. @@ -175,7 +173,7 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder): def get_layers_protected_by_lease( ps_http: PageserverHttpClient, - tenant_id: Union[TenantId, TenantShardId], + tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, lease_lsn: Lsn, ) -> set[str]: diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 79b5ebe39a..137e75f784 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -5,7 +5,6 @@ import queue import shutil import threading import time -from typing import TYPE_CHECKING import pytest from fixtures.common_types import Lsn, TenantId, TimelineId @@ -37,9 +36,6 @@ from fixtures.utils import ( ) from requests import ReadTimeout -if TYPE_CHECKING: - from typing import Optional - # # Tests that a piece of data is backed up and restored correctly: @@ -452,7 +448,7 @@ def test_remote_timeline_client_calls_started_metric( for (file_kind, op_kind), observations in calls_started.items(): log.info(f"ensure_calls_started_grew: {file_kind} {op_kind}: {observations}") assert all( - x < y for x, y in zip(observations, observations[1:]) + x < y for x, y in zip(observations, observations[1:], strict=False) ), f"observations for {file_kind} {op_kind} did not grow monotonically: {observations}" def churn(data_pass1, data_pass2): @@ -731,7 +727,7 @@ def test_empty_branch_remote_storage_upload_on_restart(neon_env_builder: NeonEnv # sleep a bit to force the upload task go into exponential backoff time.sleep(1) - q: 
queue.Queue[Optional[PageserverApiException]] = queue.Queue() + q: queue.Queue[PageserverApiException | None] = queue.Queue() barrier = threading.Barrier(2) def create_in_background(): diff --git a/test_runner/regress/test_s3_restore.py b/test_runner/regress/test_s3_restore.py index 7a9e6d62b2..8764da3c2f 100644 --- a/test_runner/regress/test_s3_restore.py +++ b/test_runner/regress/test_s3_restore.py @@ -1,7 +1,7 @@ from __future__ import annotations import time -from datetime import datetime, timezone +from datetime import UTC, datetime from fixtures.common_types import Lsn from fixtures.log_helper import log @@ -77,7 +77,7 @@ def test_tenant_s3_restore( # These sleeps are important because they fend off differences in clocks between us and S3 time.sleep(4) - ts_before_deletion = datetime.now(tz=timezone.utc).replace(tzinfo=None) + ts_before_deletion = datetime.now(tz=UTC).replace(tzinfo=None) time.sleep(4) assert ( @@ -104,7 +104,7 @@ def test_tenant_s3_restore( ) time.sleep(4) - ts_after_deletion = datetime.now(tz=timezone.utc).replace(tzinfo=None) + ts_after_deletion = datetime.now(tz=UTC).replace(tzinfo=None) time.sleep(4) ps_http.tenant_time_travel_remote_storage( diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 0a4a53356d..411574bd86 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -3,7 +3,7 @@ from __future__ import annotations import os import time from collections import defaultdict -from typing import TYPE_CHECKING, Any +from typing import Any import pytest import requests @@ -19,7 +19,7 @@ from fixtures.neon_fixtures import ( wait_for_last_flush_lsn, ) from fixtures.pageserver.utils import assert_prefix_empty, assert_prefix_not_empty -from fixtures.remote_storage import s3_storage +from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind, s3_storage from fixtures.utils import skip_in_debug_build, wait_until from fixtures.workload import Workload 
from pytest_httpserver import HTTPServer @@ -27,9 +27,6 @@ from typing_extensions import override from werkzeug.wrappers.request import Request from werkzeug.wrappers.response import Response -if TYPE_CHECKING: - from typing import Optional, Union - def test_sharding_smoke( neon_env_builder: NeonEnvBuilder, @@ -189,7 +186,7 @@ def test_sharding_split_unsharded( ], ) def test_sharding_split_compaction( - neon_env_builder: NeonEnvBuilder, failpoint: Optional[str], build_type: str + neon_env_builder: NeonEnvBuilder, failpoint: str | None, build_type: str ): """ Test that after a split, we clean up parent layer data in the child shards via compaction. @@ -515,11 +512,12 @@ def test_sharding_split_smoke( """ - # We will start with 4 shards and split into 8, then migrate all those - # 8 shards onto separate pageservers - shard_count = 4 - split_shard_count = 8 - neon_env_builder.num_pageservers = split_shard_count * 2 + # Shard count we start with + shard_count = 2 + # Shard count we split into + split_shard_count = 4 + # We will have 2 shards per pageserver once done (including secondaries) + neon_env_builder.num_pageservers = split_shard_count # 1MiB stripes: enable getting some meaningful data distribution without # writing large quantities of data in this test. The stripe size is given @@ -591,7 +589,7 @@ def test_sharding_split_smoke( workload.validate() - assert len(pre_split_pageserver_ids) == 4 + assert len(pre_split_pageserver_ids) == shard_count def shards_on_disk(shard_ids): for pageserver in env.pageservers: @@ -654,9 +652,9 @@ def test_sharding_split_smoke( # - shard_count reconciles for the original setup of the tenant # - shard_count reconciles for detaching the original secondary locations during split # - split_shard_count reconciles during shard splitting, for setting up secondaries. 
- # - shard_count of the child shards will need to fail over to their secondaries - # - shard_count of the child shard secondary locations will get moved to emptier nodes - expect_reconciles = shard_count * 2 + split_shard_count + shard_count * 2 + # - split_shard_count/2 of the child shards will need to fail over to their secondaries (since we have 8 shards and 4 pageservers, only 4 will move) + expect_reconciles = shard_count * 2 + split_shard_count + split_shard_count / 2 + reconcile_ok = env.storage_controller.get_metric_value( "storage_controller_reconcile_complete_total", filter={"status": "ok"} ) @@ -720,22 +718,10 @@ def test_sharding_split_smoke( # dominated by shard count. log.info(f"total: {total}") assert total == { - 1: 1, - 2: 1, - 3: 1, - 4: 1, - 5: 1, - 6: 1, - 7: 1, - 8: 1, - 9: 1, - 10: 1, - 11: 1, - 12: 1, - 13: 1, - 14: 1, - 15: 1, - 16: 1, + 1: 2, + 2: 2, + 3: 2, + 4: 2, } # The controller is not required to lay out the attached locations in any particular way, but @@ -793,7 +779,7 @@ def test_sharding_split_stripe_size( tenant_id = env.initial_tenant assert len(notifications) == 1 - expect: dict[str, Union[list[dict[str, int]], str, None, int]] = { + expect: dict[str, list[dict[str, int]] | str | None | int] = { "tenant_id": str(env.initial_tenant), "stripe_size": None, "shards": [{"node_id": int(env.pageservers[0].id), "shard_number": 0}], @@ -809,7 +795,7 @@ def test_sharding_split_stripe_size( # Check that we ended up with the stripe size that we expected, both on the pageserver # and in the notifications to compute assert len(notifications) == 2 - expect_after: dict[str, Union[list[dict[str, int]], str, None, int]] = { + expect_after: dict[str, list[dict[str, int]] | str | None | int] = { "tenant_id": str(env.initial_tenant), "stripe_size": new_stripe_size, "shards": [ @@ -1057,7 +1043,7 @@ def test_sharding_ingest_gaps( class Failure: - pageserver_id: Optional[int] + pageserver_id: int | None def apply(self, env: NeonEnv): raise 
NotImplementedError() @@ -1381,7 +1367,7 @@ def test_sharding_split_failures( assert attached_count == initial_shard_count - def assert_split_done(exclude_ps_id: Optional[int] = None) -> None: + def assert_split_done(exclude_ps_id: int | None = None) -> None: secondary_count = 0 attached_count = 0 for ps in env.pageservers: @@ -1419,7 +1405,7 @@ def test_sharding_split_failures( # e.g. while waiting for a storage controller to re-attach a parent shard if we failed # inside the pageserver and the storage controller responds by detaching children and attaching # parents concurrently (https://github.com/neondatabase/neon/issues/7148) - wait_until(10, 1, lambda: workload.churn_rows(10, upload=False, ingest=False)) # type: ignore + wait_until(10, 1, lambda: workload.churn_rows(10, upload=False, ingest=False)) workload.validate() @@ -1685,3 +1671,111 @@ def test_top_tenants(neon_env_builder: NeonEnvBuilder): ) assert len(top["shards"]) == n_tenants - 4 assert set(i["id"] for i in top["shards"]) == set(str(i[0]) for i in tenants[4:]) + + +def test_sharding_gc( + neon_env_builder: NeonEnvBuilder, +): + """ + Exercise GC in a sharded tenant: because only shard 0 holds SLRU content, it acts as + the "leader" for GC, and other shards read its index to learn what LSN they should + GC up to. + """ + + shard_count = 4 + neon_env_builder.num_pageservers = shard_count + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) + + TENANT_CONF = { + # small checkpointing and compaction targets to ensure we generate many upload operations + "checkpoint_distance": 128 * 1024, + "compaction_threshold": 1, + "compaction_target_size": 128 * 1024, + # A short PITR horizon, so that we won't have to sleep too long in the test to wait for it to + # happen. + "pitr_interval": "1s", + # disable background compaction and GC. We invoke it manually when we want it to happen. 
+ "gc_period": "0s", + "compaction_period": "0s", + # Disable automatic creation of image layers, as we will create them explicitly when we want them + "image_creation_threshold": 9999, + "image_layer_creation_check_threshold": 0, + "lsn_lease_length": "0s", + } + env = neon_env_builder.init_start( + initial_tenant_shard_count=shard_count, initial_tenant_conf=TENANT_CONF + ) + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + # Create a branch and write some data + workload = Workload(env, tenant_id, timeline_id) + initial_lsn = Lsn(workload.endpoint().safe_psql("SELECT pg_current_wal_lsn()")[0][0]) + log.info(f"Started at LSN: {initial_lsn}") + + workload.init() + + # Write enough data to generate multiple layers + for _i in range(10): + last_lsn = workload.write_rows(32) + + assert last_lsn > initial_lsn + + log.info(f"Wrote up to last LSN: {last_lsn}") + + # Do full image layer generation. When we subsequently wait for PITR, all historic deltas + # should be GC-able + for shard_number in range(shard_count): + shard = TenantShardId(tenant_id, shard_number, shard_count) + env.get_tenant_pageserver(shard).http_client().timeline_compact( + shard, timeline_id, force_image_layer_creation=True + ) + + workload.churn_rows(32) + + time.sleep(5) + + # Invoke GC on a non-zero shard and verify its GC cutoff LSN does not advance + shard_one = TenantShardId(tenant_id, 1, shard_count) + env.get_tenant_pageserver(shard_one).http_client().timeline_gc( + shard_one, timeline_id, gc_horizon=None + ) + + # Check shard 1's index - GC cutoff LSN should not have advanced + assert isinstance(env.pageserver_remote_storage, LocalFsStorage) + shard_1_index = env.pageserver_remote_storage.index_content( + tenant_id=shard_one, timeline_id=timeline_id + ) + shard_1_gc_cutoff_lsn = Lsn(shard_1_index["metadata_bytes"]["latest_gc_cutoff_lsn"]) + log.info(f"Shard 1 cutoff LSN: {shard_1_gc_cutoff_lsn}") + assert shard_1_gc_cutoff_lsn <= last_lsn + + shard_zero = 
TenantShardId(tenant_id, 0, shard_count) + env.get_tenant_pageserver(shard_zero).http_client().timeline_gc( + shard_zero, timeline_id, gc_horizon=None + ) + + # TODO: observe that GC LSN of shard 0 has moved forward in remote storage + assert isinstance(env.pageserver_remote_storage, LocalFsStorage) + shard_0_index = env.pageserver_remote_storage.index_content( + tenant_id=shard_zero, timeline_id=timeline_id + ) + shard_0_gc_cutoff_lsn = Lsn(shard_0_index["metadata_bytes"]["latest_gc_cutoff_lsn"]) + log.info(f"Shard 0 cutoff LSN: {shard_0_gc_cutoff_lsn}") + assert shard_0_gc_cutoff_lsn >= last_lsn + + # Invoke GC on all other shards and verify their GC cutoff LSNs + for shard_number in range(1, shard_count): + shard = TenantShardId(tenant_id, shard_number, shard_count) + env.get_tenant_pageserver(shard).http_client().timeline_gc( + shard, timeline_id, gc_horizon=None + ) + + # Verify GC cutoff LSN advanced to match shard 0 + shard_index = env.pageserver_remote_storage.index_content( + tenant_id=shard, timeline_id=timeline_id + ) + shard_gc_cutoff_lsn = Lsn(shard_index["metadata_bytes"]["latest_gc_cutoff_lsn"]) + log.info(f"Shard {shard_number} cutoff LSN: {shard_gc_cutoff_lsn}") + assert shard_gc_cutoff_lsn == shard_0_gc_cutoff_lsn diff --git a/test_runner/regress/test_sni_router.py b/test_runner/regress/test_sni_router.py index 402f27b384..2a26fef59a 100644 --- a/test_runner/regress/test_sni_router.py +++ b/test_runner/regress/test_sni_router.py @@ -3,7 +3,6 @@ from __future__ import annotations import socket import subprocess from pathlib import Path -from types import TracebackType from typing import TYPE_CHECKING import backoff @@ -12,7 +11,8 @@ from fixtures.neon_fixtures import PgProtocol, VanillaPostgres from fixtures.port_distributor import PortDistributor if TYPE_CHECKING: - from typing import Optional + from types import TracebackType + from typing import Self def generate_tls_cert(cn, certout, keyout): @@ -55,10 +55,10 @@ class PgSniRouter(PgProtocol): 
self.destination = destination self.tls_cert = tls_cert self.tls_key = tls_key - self._popen: Optional[subprocess.Popen[bytes]] = None + self._popen: subprocess.Popen[bytes] | None = None self.test_output_dir = test_output_dir - def start(self) -> PgSniRouter: + def start(self) -> Self: assert self._popen is None args = [ str(self.neon_binpath / "pg_sni_router"), @@ -91,14 +91,14 @@ class PgSniRouter(PgProtocol): if self._popen: self._popen.wait(timeout=2) - def __enter__(self) -> PgSniRouter: + def __enter__(self) -> Self: return self def __exit__( self, - exc_type: Optional[type[BaseException]], - exc: Optional[BaseException], - tb: Optional[TracebackType], + exc_type: type[BaseException] | None, + exc: BaseException | None, + tb: TracebackType | None, ): if self._popen is not None: self._popen.terminate() diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 2c3d79b18a..13bc54a114 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -5,7 +5,7 @@ import json import threading import time from collections import defaultdict -from datetime import datetime, timezone +from datetime import UTC, datetime from enum import Enum from typing import TYPE_CHECKING @@ -56,7 +56,7 @@ from werkzeug.wrappers.request import Request from werkzeug.wrappers.response import Response if TYPE_CHECKING: - from typing import Any, Optional, Union + from typing import Any def get_node_shard_counts(env: NeonEnv, tenant_ids): @@ -593,7 +593,7 @@ def test_storage_controller_compute_hook( # Initial notification from tenant creation assert len(notifications) == 1 - expect: dict[str, Union[list[dict[str, int]], str, None, int]] = { + expect: dict[str, list[dict[str, int]] | str | None | int] = { "tenant_id": str(env.initial_tenant), "stripe_size": None, "shards": [{"node_id": int(env.pageservers[0].id), "shard_number": 0}], @@ -708,7 +708,7 @@ def 
test_storage_controller_stuck_compute_hook( # Initial notification from tenant creation assert len(notifications) == 1 - expect: dict[str, Union[list[dict[str, int]], str, None, int]] = { + expect: dict[str, list[dict[str, int]] | str | None | int] = { "tenant_id": str(env.initial_tenant), "stripe_size": None, "shards": [{"node_id": int(env.pageservers[0].id), "shard_number": 0}], @@ -1048,7 +1048,7 @@ def test_storage_controller_s3_time_travel_recovery( ) time.sleep(4) - ts_before_disaster = datetime.now(tz=timezone.utc).replace(tzinfo=None) + ts_before_disaster = datetime.now(tz=UTC).replace(tzinfo=None) time.sleep(4) # Simulate a "disaster": delete some random files from remote storage for one of the shards @@ -1072,7 +1072,7 @@ def test_storage_controller_s3_time_travel_recovery( pass time.sleep(4) - ts_after_disaster = datetime.now(tz=timezone.utc).replace(tzinfo=None) + ts_after_disaster = datetime.now(tz=UTC).replace(tzinfo=None) time.sleep(4) # Do time travel recovery @@ -2274,7 +2274,7 @@ def test_storage_controller_node_deletion( @pytest.mark.parametrize("shard_count", [None, 2]) def test_storage_controller_metadata_health( neon_env_builder: NeonEnvBuilder, - shard_count: Optional[int], + shard_count: int | None, ): """ Create three tenants A, B, C. 
@@ -2494,14 +2494,14 @@ def start_env(env: NeonEnv, storage_controller_port: int): for pageserver in env.pageservers: futs.append( executor.submit( - lambda ps=pageserver: ps.start(timeout_in_seconds=timeout_in_seconds) + lambda ps=pageserver: ps.start(timeout_in_seconds=timeout_in_seconds) # type: ignore[misc] ) ) for safekeeper in env.safekeepers: futs.append( executor.submit( - lambda sk=safekeeper: sk.start(timeout_in_seconds=timeout_in_seconds) + lambda sk=safekeeper: sk.start(timeout_in_seconds=timeout_in_seconds) # type: ignore[misc] ) ) diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py index 11ad2173ae..3991bd7061 100644 --- a/test_runner/regress/test_storage_scrubber.py +++ b/test_runner/regress/test_storage_scrubber.py @@ -6,7 +6,6 @@ import shutil import threading import time from concurrent.futures import ThreadPoolExecutor -from typing import TYPE_CHECKING import pytest from fixtures.common_types import TenantId, TenantShardId, TimelineId @@ -20,12 +19,9 @@ from fixtures.remote_storage import S3Storage, s3_storage from fixtures.utils import wait_until from fixtures.workload import Workload -if TYPE_CHECKING: - from typing import Optional - @pytest.mark.parametrize("shard_count", [None, 4]) -def test_scrubber_tenant_snapshot(neon_env_builder: NeonEnvBuilder, shard_count: Optional[int]): +def test_scrubber_tenant_snapshot(neon_env_builder: NeonEnvBuilder, shard_count: int | None): """ Test the `tenant-snapshot` subcommand, which grabs data from remote storage @@ -131,7 +127,7 @@ def drop_local_state(env: NeonEnv, tenant_id: TenantId): @pytest.mark.parametrize("shard_count", [None, 4]) -def test_scrubber_physical_gc(neon_env_builder: NeonEnvBuilder, shard_count: Optional[int]): +def test_scrubber_physical_gc(neon_env_builder: NeonEnvBuilder, shard_count: int | None): neon_env_builder.enable_pageserver_remote_storage(s3_storage()) neon_env_builder.num_pageservers = 2 @@ -179,9 +175,7 @@ def 
test_scrubber_physical_gc(neon_env_builder: NeonEnvBuilder, shard_count: Opt @pytest.mark.parametrize("shard_count", [None, 2]) -def test_scrubber_physical_gc_ancestors( - neon_env_builder: NeonEnvBuilder, shard_count: Optional[int] -): +def test_scrubber_physical_gc_ancestors(neon_env_builder: NeonEnvBuilder, shard_count: int | None): neon_env_builder.enable_pageserver_remote_storage(s3_storage()) neon_env_builder.num_pageservers = 2 @@ -499,7 +493,7 @@ def test_scrubber_physical_gc_ancestors_split(neon_env_builder: NeonEnvBuilder): @pytest.mark.parametrize("shard_count", [None, 4]) def test_scrubber_scan_pageserver_metadata( - neon_env_builder: NeonEnvBuilder, shard_count: Optional[int] + neon_env_builder: NeonEnvBuilder, shard_count: int | None ): """ Create some layers. Delete an object listed in index. Run scrubber and see if it detects the defect. diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index 59c14b3263..8d7ca7bc4e 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -1,11 +1,10 @@ from __future__ import annotations import asyncio -import enum import random import time +from enum import StrEnum from threading import Thread -from typing import TYPE_CHECKING import asyncpg import pytest @@ -28,10 +27,6 @@ from fixtures.remote_storage import ( from fixtures.utils import query_scalar, wait_until from prometheus_client.samples import Sample -if TYPE_CHECKING: - from typing import Optional - - # In tests that overlap endpoint activity with tenant attach/detach, there are # a variety of warnings that the page service may emit when it cannot acquire # an active tenant to serve a request @@ -57,7 +52,7 @@ def do_gc_target( log.info("gc http thread returning") -class ReattachMode(str, enum.Enum): +class ReattachMode(StrEnum): REATTACH_EXPLICIT = "explicit" REATTACH_RESET = "reset" REATTACH_RESET_DROP = "reset_drop" @@ -498,7 +493,7 @@ def 
test_metrics_while_ignoring_broken_tenant_and_reloading( r".* Changing Active tenant to Broken state, reason: broken from test" ) - def only_int(samples: list[Sample]) -> Optional[int]: + def only_int(samples: list[Sample]) -> int | None: if len(samples) == 1: return int(samples[0].value) assert len(samples) == 0 diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index fc9adb14c9..bf6120aa0a 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -28,7 +28,7 @@ from fixtures.utils import ( ) if TYPE_CHECKING: - from typing import Any, Optional + from typing import Any def assert_abs_margin_ratio(a: float, b: float, margin_ratio: float): @@ -78,7 +78,7 @@ def populate_branch( tenant_id: TenantId, ps_http: PageserverHttpClient, create_table: bool, - expected_sum: Optional[int], + expected_sum: int | None, ) -> tuple[TimelineId, Lsn]: # insert some data with pg_cur(endpoint) as cur: diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 5a499ea98b..158c3fddb0 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -369,12 +369,16 @@ def test_create_churn_during_restart(neon_env_builder: NeonEnvBuilder): - Bad response codes during shutdown (e.g. returning 500 instead of 503) - Issues where a tenant is still starting up while we receive a request for it - Issues with interrupting/resuming tenant/timeline creation in shutdown + - Issues with a timeline is not created successfully because of restart. """ env = neon_env_builder.init_configs() env.start() tenant_id: TenantId = env.initial_tenant timeline_id = env.initial_timeline + # At this point, the initial tenant/timeline might not have been created successfully, + # and this is the case we want to test. 
+ # Multiple creation requests which race will generate this error on the pageserver # and storage controller respectively env.pageserver.allowed_errors.append(".*Conflict: Tenant is already being modified.*") diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py index ba4e79c343..bc2e048f69 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -4,7 +4,6 @@ import json import random import threading import time -from typing import Optional import pytest import requests @@ -23,7 +22,7 @@ from fixtures.pageserver.utils import ( ) from fixtures.pg_version import PgVersion from fixtures.remote_storage import S3Storage, s3_storage -from fixtures.utils import run_only_on_default_postgres, wait_until +from fixtures.utils import run_only_on_default_postgres, skip_in_debug_build, wait_until from mypy_boto3_s3.type_defs import ( ObjectTypeDef, ) @@ -390,6 +389,7 @@ def test_timeline_offload_persist(neon_env_builder: NeonEnvBuilder, delete_timel @run_only_on_default_postgres("this test isn't sensitive to the contents of timelines") +@skip_in_debug_build("times out in debug builds") def test_timeline_archival_chaos(neon_env_builder: NeonEnvBuilder): """ A general consistency check on archival/offload timeline state, and its intersection @@ -416,7 +416,7 @@ def test_timeline_archival_chaos(neon_env_builder: NeonEnvBuilder): [ ".*error sending request.*", # FIXME: the pageserver should not return 500s on cancellation (https://github.com/neondatabase/neon/issues/97680) - ".*InternalServerError(Error deleting timeline .* on .* on .*: pageserver API: error: Cancelled", + ".*InternalServerError\\(Error deleting timeline .* on .* on .*: pageserver API: error: Cancelled", ] ) @@ -660,7 +660,7 @@ def test_timeline_archival_chaos(neon_env_builder: NeonEnvBuilder): ], ) def test_timeline_retain_lsn( - neon_env_builder: NeonEnvBuilder, with_intermediary: bool, offload_child: 
Optional[str] + neon_env_builder: NeonEnvBuilder, with_intermediary: bool, offload_child: str | None ): """ Ensure that retain_lsn functionality for timelines works, both for offloaded and non-offloaded ones diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index ef0eb05612..cd4e0a5f3b 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -5,6 +5,7 @@ import enum import threading import time from concurrent.futures import ThreadPoolExecutor +from enum import StrEnum from queue import Empty, Queue from threading import Barrier @@ -22,7 +23,8 @@ from fixtures.neon_fixtures import ( from fixtures.pageserver.http import HistoricLayerInfo, PageserverApiException from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_timeline_detail_404 from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind -from fixtures.utils import assert_pageserver_backups_equal, wait_until +from fixtures.utils import assert_pageserver_backups_equal, skip_in_debug_build, wait_until +from fixtures.workload import Workload from requests import ReadTimeout @@ -36,7 +38,7 @@ def layer_name(info: HistoricLayerInfo) -> str: @enum.unique -class Branchpoint(str, enum.Enum): +class Branchpoint(StrEnum): """ Have branches at these Lsns possibly relative to L0 layer boundary. """ @@ -1549,6 +1551,57 @@ def test_timeline_is_deleted_before_timeline_detach_ancestor_completes( env.pageserver.assert_log_contains(".* gc_loop.*: 1 timelines need GC", offset) +@skip_in_debug_build("only run with release build") +def test_pageserver_compaction_detach_ancestor_smoke(neon_env_builder: NeonEnvBuilder): + SMOKE_CONF = { + # Run both gc and gc-compaction. 
+ "gc_period": "5s", + "compaction_period": "5s", + # No PiTR interval and small GC horizon + "pitr_interval": "0s", + "gc_horizon": f"{1024 ** 2}", + "lsn_lease_length": "0s", + # Small checkpoint distance to create many layers + "checkpoint_distance": 1024**2, + # Compact small layers + "compaction_target_size": 1024**2, + "image_creation_threshold": 2, + } + + env = neon_env_builder.init_start(initial_tenant_conf=SMOKE_CONF) + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + row_count = 10000 + churn_rounds = 50 + + ps_http = env.pageserver.http_client() + + workload_parent = Workload(env, tenant_id, timeline_id) + workload_parent.init(env.pageserver.id) + log.info("Writing initial data ...") + workload_parent.write_rows(row_count, env.pageserver.id) + branch_id = env.create_branch("child") + workload_child = Workload(env, tenant_id, branch_id, branch_name="child") + workload_child.init(env.pageserver.id, allow_recreate=True) + log.info("Writing initial data on child...") + workload_child.write_rows(row_count, env.pageserver.id) + + for i in range(1, churn_rounds + 1): + if i % 10 == 0: + log.info(f"Running churn round {i}/{churn_rounds} ...") + + workload_parent.churn_rows(row_count, env.pageserver.id) + workload_child.churn_rows(row_count, env.pageserver.id) + + ps_http.detach_ancestor(tenant_id, branch_id) + + log.info("Validating at workload end ...") + workload_parent.validate(env.pageserver.id) + workload_child.validate(env.pageserver.id) + + # TODO: # - branch near existing L1 boundary, image layers? # - investigate: why are layers started at uneven lsn? not just after branching, but in general. 
diff --git a/test_runner/regress/test_timeline_gc_blocking.py b/test_runner/regress/test_timeline_gc_blocking.py index c19c78e251..5a5ca3290a 100644 --- a/test_runner/regress/test_timeline_gc_blocking.py +++ b/test_runner/regress/test_timeline_gc_blocking.py @@ -3,7 +3,6 @@ from __future__ import annotations import time from concurrent.futures import ThreadPoolExecutor from dataclasses import dataclass -from typing import TYPE_CHECKING import pytest from fixtures.log_helper import log @@ -14,9 +13,6 @@ from fixtures.neon_fixtures import ( ) from fixtures.pageserver.utils import wait_timeline_detail_404 -if TYPE_CHECKING: - from typing import Optional - @pytest.mark.parametrize("sharded", [True, False]) def test_gc_blocking_by_timeline(neon_env_builder: NeonEnvBuilder, sharded: bool): @@ -89,7 +85,7 @@ def wait_for_another_gc_round(): @dataclass class ScrollableLog: pageserver: NeonPageserver - offset: Optional[LogCursor] + offset: LogCursor | None def assert_log_contains(self, what: str): msg, offset = self.pageserver.assert_log_contains(what, offset=self.offset) diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 85c6d17142..4528bc6180 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -7,7 +7,6 @@ import time from collections import defaultdict from contextlib import closing from pathlib import Path -from typing import Optional import psycopg2.errors import psycopg2.extras @@ -668,7 +667,7 @@ def test_tenant_physical_size(neon_env_builder: NeonEnvBuilder): class TimelinePhysicalSizeValues: api_current_physical: int prometheus_resident_physical: float - prometheus_remote_physical: Optional[float] = None + prometheus_remote_physical: float | None = None python_timelinedir_layerfiles_physical: int layer_map_file_size_sum: int diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 0676b3dd9a..405f15e488 100644 --- 
a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -61,7 +61,7 @@ from fixtures.utils import ( ) if TYPE_CHECKING: - from typing import Any, Optional + from typing import Any, Self def wait_lsn_force_checkpoint( @@ -189,7 +189,7 @@ def test_many_timelines(neon_env_builder: NeonEnvBuilder): m.flush_lsns.append(Lsn(int(sk_m.flush_lsn_inexact(tenant_id, timeline_id)))) m.commit_lsns.append(Lsn(int(sk_m.commit_lsn_inexact(tenant_id, timeline_id)))) - for flush_lsn, commit_lsn in zip(m.flush_lsns, m.commit_lsns): + for flush_lsn, commit_lsn in zip(m.flush_lsns, m.commit_lsns, strict=False): # Invariant. May be < when transaction is in progress. assert ( commit_lsn <= flush_lsn @@ -224,7 +224,7 @@ def test_many_timelines(neon_env_builder: NeonEnvBuilder): def __init__(self) -> None: super().__init__(daemon=True) self.should_stop = threading.Event() - self.exception: Optional[BaseException] = None + self.exception: BaseException | None = None def run(self) -> None: try: @@ -521,7 +521,7 @@ def test_wal_backup(neon_env_builder: NeonEnvBuilder): # Shut down subsequently each of safekeepers and fill a segment while sk is # down; ensure segment gets offloaded by others. offloaded_seg_end = [Lsn("0/2000000"), Lsn("0/3000000"), Lsn("0/4000000")] - for victim, seg_end in zip(env.safekeepers, offloaded_seg_end): + for victim, seg_end in zip(env.safekeepers, offloaded_seg_end, strict=False): victim.stop() # roughly fills one segment cur.execute("insert into t select generate_series(1,250000), 'payload'") @@ -666,7 +666,7 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder): # recreate timeline on pageserver from scratch ps_http.timeline_create( - pg_version=PgVersion(pg_version), + pg_version=PgVersion(str(pg_version)), tenant_id=tenant_id, new_timeline_id=timeline_id, ) @@ -1177,14 +1177,14 @@ def cmp_sk_wal(sks: list[Safekeeper], tenant_id: TenantId, timeline_id: Timeline # report/understand if WALs are different due to that. 
statuses = [sk_http_cli.timeline_status(tenant_id, timeline_id) for sk_http_cli in sk_http_clis] term_flush_lsns = [(s.last_log_term, s.flush_lsn) for s in statuses] - for tfl, sk in zip(term_flush_lsns[1:], sks[1:]): + for tfl, sk in zip(term_flush_lsns[1:], sks[1:], strict=False): assert ( term_flush_lsns[0] == tfl ), f"(last_log_term, flush_lsn) are not equal on sks {sks[0].id} and {sk.id}: {term_flush_lsns[0]} != {tfl}" # check that WALs are identic. segs = [sk.list_segments(tenant_id, timeline_id) for sk in sks] - for cmp_segs, sk in zip(segs[1:], sks[1:]): + for cmp_segs, sk in zip(segs[1:], sks[1:], strict=False): assert ( segs[0] == cmp_segs ), f"lists of segments on sks {sks[0].id} and {sk.id} are not identic: {segs[0]} and {cmp_segs}" @@ -1455,12 +1455,12 @@ class SafekeeperEnv: self.pg_bin = pg_bin self.num_safekeepers = num_safekeepers self.bin_safekeeper = str(neon_binpath / "safekeeper") - self.safekeepers: Optional[list[subprocess.CompletedProcess[Any]]] = None - self.postgres: Optional[ProposerPostgres] = None - self.tenant_id: Optional[TenantId] = None - self.timeline_id: Optional[TimelineId] = None + self.safekeepers: list[subprocess.CompletedProcess[Any]] | None = None + self.postgres: ProposerPostgres | None = None + self.tenant_id: TenantId | None = None + self.timeline_id: TimelineId | None = None - def init(self) -> SafekeeperEnv: + def init(self) -> Self: assert self.postgres is None, "postgres is already initialized" assert self.safekeepers is None, "safekeepers are already initialized" @@ -1541,7 +1541,7 @@ class SafekeeperEnv: log.info(f"Killing safekeeper with pid {pid}") os.kill(pid, signal.SIGKILL) - def __enter__(self): + def __enter__(self) -> Self: return self def __exit__(self, exc_type, exc_value, traceback): @@ -1784,6 +1784,89 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): cur.execute("INSERT INTO t (key) VALUES (123)") +def test_delete_timeline_under_load(neon_env_builder: NeonEnvBuilder): + """ 
+ Test deleting timelines on a safekeeper while they're under load. + + This should not happen under normal operation, but it can happen if + there is some rogue compute/pageserver that is writing/reading to a + safekeeper that we're migrating a timeline away from, or if the timeline + is being deleted while such a rogue client is running. + """ + neon_env_builder.auth_enabled = True + env = neon_env_builder.init_start() + + # Create two endpoints that will generate load + timeline_id_a = env.create_branch("deleteme_a") + timeline_id_b = env.create_branch("deleteme_b") + + endpoint_a = env.endpoints.create("deleteme_a") + endpoint_a.start() + endpoint_b = env.endpoints.create("deleteme_b") + endpoint_b.start() + + # Get tenant and timeline IDs + tenant_id = env.initial_tenant + + # Start generating load on both timelines + def generate_load(endpoint: Endpoint): + with closing(endpoint.connect()) as conn: + with conn.cursor() as cur: + cur.execute("CREATE TABLE IF NOT EXISTS t(key int, value text)") + while True: + try: + cur.execute("INSERT INTO t SELECT generate_series(1,1000), 'data'") + except: # noqa + # Ignore errors since timeline may be deleted + break + + t_a = threading.Thread(target=generate_load, args=(endpoint_a,)) + t_b = threading.Thread(target=generate_load, args=(endpoint_b,)) + try: + t_a.start() + t_b.start() + + # Let the load run for a bit + log.info("Warming up...") + time.sleep(2) + + # Safekeeper errors will propagate to the pageserver: it is correct that these are + # logged at error severity because they indicate the pageserver is trying to read + # a timeline that it shouldn't. 
+ env.pageserver.allowed_errors.extend( + [ + ".*Timeline.*was cancelled.*", + ".*Timeline.*was not found.*", + ] + ) + + # Try deleting timelines while under load + sk = env.safekeepers[0] + sk_http = sk.http_client(auth_token=env.auth_keys.generate_tenant_token(tenant_id)) + + # Delete first timeline + log.info(f"Deleting {timeline_id_a}...") + assert sk_http.timeline_delete(tenant_id, timeline_id_a, only_local=True)["dir_existed"] + + # Delete second timeline + log.info(f"Deleting {timeline_id_b}...") + assert sk_http.timeline_delete(tenant_id, timeline_id_b, only_local=True)["dir_existed"] + + # Verify timelines are gone from disk + sk_data_dir = sk.data_dir + assert not (sk_data_dir / str(tenant_id) / str(timeline_id_a)).exists() + # assert not (sk_data_dir / str(tenant_id) / str(timeline_id_b)).exists() + + finally: + log.info("Stopping endpoints...") + # Stop endpoints with immediate mode because we deleted the timeline out from under the compute, which may cause it to hang + endpoint_a.stop(mode="immediate") + endpoint_b.stop(mode="immediate") + log.info("Joining threads...") + t_a.join() + t_b.join() + + # Basic pull_timeline test. # When live_sk_change is False, compute is restarted to change set of # safekeepers; otherwise it is live reload. 
diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py index d3e989afa8..18408b0619 100644 --- a/test_runner/regress/test_wal_acceptor_async.py +++ b/test_runner/regress/test_wal_acceptor_async.py @@ -5,7 +5,6 @@ import random import time from dataclasses import dataclass from pathlib import Path -from typing import TYPE_CHECKING import asyncpg import pytest @@ -16,10 +15,6 @@ from fixtures.neon_fixtures import Endpoint, NeonEnv, NeonEnvBuilder, Safekeeper from fixtures.remote_storage import RemoteStorageKind from fixtures.utils import skip_in_debug_build -if TYPE_CHECKING: - from typing import Optional - - log = getLogger("root.safekeeper_async") @@ -261,7 +256,7 @@ def test_restarts_frequent_checkpoints(neon_env_builder: NeonEnvBuilder): def endpoint_create_start( - env: NeonEnv, branch: str, pgdir_name: Optional[str], allow_multiple: bool = False + env: NeonEnv, branch: str, pgdir_name: str | None, allow_multiple: bool = False ): endpoint = Endpoint( env, @@ -287,7 +282,7 @@ async def exec_compute_query( env: NeonEnv, branch: str, query: str, - pgdir_name: Optional[str] = None, + pgdir_name: str | None = None, allow_multiple: bool = False, ): with endpoint_create_start( @@ -705,7 +700,7 @@ async def run_wal_lagging(env: NeonEnv, endpoint: Endpoint, test_output_dir: Pat # invalid, to make them unavailable to the endpoint. We use # ports 10, 11 and 12 to simulate unavailable safekeepers. 
config = toml.load(test_output_dir / "repo" / "config") - for i, (_sk, active) in enumerate(zip(env.safekeepers, active_sk)): + for i, (_sk, active) in enumerate(zip(env.safekeepers, active_sk, strict=False)): if active: config["safekeepers"][i]["pg_port"] = env.safekeepers[i].port.pg else: diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index c5e0d642ef..284ae56be2 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit c5e0d642efb02e4bfedc283b0a7707fe6c79cc89 +Subproject commit 284ae56be2397fd3eaf20777fa220b2d0ad968f5 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 1feff6b60f..aed79ee87b 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 1feff6b60f07cb71b665d0f5ead71a4320a71743 +Subproject commit aed79ee87b94779cc52ec13e3b74eba6ada93f05 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index b0b693ea29..f5cfc6fa89 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit b0b693ea298454e95e6b154780d1fd586a244dfd +Subproject commit f5cfc6fa898544050e821ac688adafece1ac3cff diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index aa2e29f2b6..3c15b6565f 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit aa2e29f2b6952140dfe51876bbd11054acae776f +Subproject commit 3c15b6565f6c8d36d169ed9ea7412cf90cfb2a8f diff --git a/vendor/revisions.json b/vendor/revisions.json index a1f2bc5dd1..4dae88e73d 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,18 +1,18 @@ { "v17": [ - "17.1", - "aa2e29f2b6952140dfe51876bbd11054acae776f" + "17.2", + "3c15b6565f6c8d36d169ed9ea7412cf90cfb2a8f" ], "v16": [ - "16.5", - "b0b693ea298454e95e6b154780d1fd586a244dfd" + "16.6", + "f5cfc6fa898544050e821ac688adafece1ac3cff" ], "v15": [ - "15.9", - "1feff6b60f07cb71b665d0f5ead71a4320a71743" + "15.10", + "aed79ee87b94779cc52ec13e3b74eba6ada93f05" ], "v14": [ - "14.14", - "c5e0d642efb02e4bfedc283b0a7707fe6c79cc89" + 
"14.15", + "284ae56be2397fd3eaf20777fa220b2d0ad968f5" ] } diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 53d3a7364b..a73d9d6352 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -19,7 +19,8 @@ ahash = { version = "0.8" } anyhow = { version = "1", features = ["backtrace"] } axum = { version = "0.7", features = ["ws"] } axum-core = { version = "0.4", default-features = false, features = ["tracing"] } -base64 = { version = "0.21", features = ["alloc"] } +base64-594e8ee84c453af0 = { package = "base64", version = "0.13", features = ["alloc"] } +base64-647d43efb71741da = { package = "base64", version = "0.21", features = ["alloc"] } base64ct = { version = "1", default-features = false, features = ["std"] } bytes = { version = "1", features = ["serde"] } camino = { version = "1", default-features = false, features = ["serde1"] } @@ -52,6 +53,7 @@ lazy_static = { version = "1", default-features = false, features = ["spin_no_st libc = { version = "0.2", features = ["extra_traits", "use_std"] } log = { version = "0.4", default-features = false, features = ["std"] } memchr = { version = "2" } +nix = { version = "0.26" } nom = { version = "7" } num-bigint = { version = "0.4" } num-integer = { version = "0.1", features = ["i128"] }