diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 0000000000..c8fd1209de --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1,6 @@ + +blank_issues_enabled: true +contact_links: + - name: Feature request + url: https://console.neon.tech/app/projects?modal=feedback + about: For feature requests in the Neon product, please submit via the feedback form on `https://console.neon.tech` diff --git a/.github/actionlint.yml b/.github/actionlint.yml index 4ad8a7b460..1b602883c5 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -7,6 +7,13 @@ self-hosted-runner: - small-arm64 - us-east-2 config-variables: + - AZURE_DEV_CLIENT_ID + - AZURE_DEV_REGISTRY_NAME + - AZURE_DEV_SUBSCRIPTION_ID + - AZURE_PROD_CLIENT_ID + - AZURE_PROD_REGISTRY_NAME + - AZURE_PROD_SUBSCRIPTION_ID + - AZURE_TENANT_ID - BENCHMARK_PROJECT_ID_PUB - BENCHMARK_PROJECT_ID_SUB - REMOTE_STORAGE_AZURE_CONTAINER diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 6c2cee0971..4008cd0d36 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -71,7 +71,7 @@ runs: if: inputs.build_type != 'remote' uses: ./.github/actions/download with: - name: compatibility-snapshot-${{ inputs.build_type }}-pg${{ inputs.pg_version }} + name: compatibility-snapshot-${{ runner.arch }}-${{ inputs.build_type }}-pg${{ inputs.pg_version }} path: /tmp/compatibility_snapshot_pg${{ inputs.pg_version }} prefix: latest # The lack of compatibility snapshot (for example, for the new Postgres version) @@ -211,13 +211,13 @@ runs: fi - name: Upload compatibility snapshot - if: github.ref_name == 'release' + # Note, that we use `github.base_ref` which is a target branch for a PR + if: github.event_name == 'pull_request' && github.base_ref == 'release' uses: ./.github/actions/upload with: - name: compatibility-snapshot-${{ inputs.build_type 
}}-pg${{ inputs.pg_version }}-${{ github.run_id }} + name: compatibility-snapshot-${{ runner.arch }}-${{ inputs.build_type }}-pg${{ inputs.pg_version }} # Directory is created by test_compatibility.py::test_create_snapshot, keep the path in sync with the test path: /tmp/test_output/compatibility_snapshot_pg${{ inputs.pg_version }}/ - prefix: latest - name: Upload test results if: ${{ !cancelled() }} diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index 5e9fff0e6a..67152b6991 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -62,7 +62,7 @@ jobs: # git config --global --add safe.directory ${{ github.workspace }} git config --global --add safe.directory ${GITHUB_WORKSPACE} - for r in 14 15 16; do + for r in 14 15 16 17; do git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r" git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r" done @@ -83,6 +83,10 @@ jobs: id: pg_v16_rev run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT + - name: Set pg 17 revision for caching + id: pg_v17_rev + run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v17) >> $GITHUB_OUTPUT + # Set some environment variables used by all the steps. # # CARGO_FLAGS is extra options to pass to "cargo build", "cargo test" etc. 
@@ -136,6 +140,13 @@ jobs: path: pg_install/v16 key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} + - name: Cache postgres v17 build + id: cache_pg_17 + uses: actions/cache@v4 + with: + path: pg_install/v17 + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} + - name: Build postgres v14 if: steps.cache_pg_14.outputs.cache-hit != 'true' run: mold -run make postgres-v14 -j$(nproc) @@ -148,6 +159,10 @@ jobs: if: steps.cache_pg_16.outputs.cache-hit != 'true' run: mold -run make postgres-v16 -j$(nproc) + - name: Build postgres v17 + if: steps.cache_pg_17.outputs.cache-hit != 'true' + run: mold -run make postgres-v17 -j$(nproc) + - name: Build neon extensions run: mold -run make neon-pg-ext -j$(nproc) @@ -210,14 +225,20 @@ jobs: run: | PQ_LIB_DIR=$(pwd)/pg_install/v16/lib export PQ_LIB_DIR - LD_LIBRARY_PATH=$(pwd)/pg_install/v16/lib + LD_LIBRARY_PATH=$(pwd)/pg_install/v17/lib export LD_LIBRARY_PATH #nextest does not yet support running doctests ${cov_prefix} cargo test --doc $CARGO_FLAGS $CARGO_FEATURES + # run all non-pageserver tests + ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E '!package(pageserver)' + + # run pageserver tests with different settings for io_engine in std-fs tokio-epoll-uring ; do - NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES + for io_buffer_alignment in 0 1 512 ; do + NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine NEON_PAGESERVER_UNIT_TEST_IO_BUFFER_ALIGNMENT=$io_buffer_alignment ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(pageserver)' + done done # Run separate tests for real S3 diff --git a/.github/workflows/_push-to-acr.yml b/.github/workflows/_push-to-acr.yml new file mode 100644 index 
0000000000..c304172ff7 --- /dev/null +++ b/.github/workflows/_push-to-acr.yml @@ -0,0 +1,56 @@ +name: Push images to ACR +on: + workflow_call: + inputs: + client_id: + description: Client ID of Azure managed identity or Entra app + required: true + type: string + image_tag: + description: Tag for the container image + required: true + type: string + images: + description: Images to push + required: true + type: string + registry_name: + description: Name of the container registry + required: true + type: string + subscription_id: + description: Azure subscription ID + required: true + type: string + tenant_id: + description: Azure tenant ID + required: true + type: string + +jobs: + push-to-acr: + runs-on: ubuntu-22.04 + permissions: + contents: read # This is required for actions/checkout + id-token: write # This is required for Azure Login to work. + + steps: + - name: Azure login + uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # @v2.1.1 + with: + client-id: ${{ inputs.client_id }} + subscription-id: ${{ inputs.subscription_id }} + tenant-id: ${{ inputs.tenant_id }} + + - name: Login to ACR + run: | + az acr login --name=${{ inputs.registry_name }} + + - name: Copy docker images to ACR ${{ inputs.registry_name }} + run: | + images='${{ inputs.images }}' + for image in ${images}; do + docker buildx imagetools create \ + -t ${{ inputs.registry_name }}.azurecr.io/neondatabase/${image}:${{ inputs.image_tag }} \ + neondatabase/${image}:${{ inputs.image_tag }} + done diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 1e7f3598c2..a210c962cb 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -54,8 +54,8 @@ jobs: build-tag: ${{steps.build-tag.outputs.tag}} steps: - - name: Checkout - uses: actions/checkout@v4 + # Need `fetch-depth: 0` to count the number of commits in the branch + - uses: actions/checkout@v4 with: fetch-depth: 0 @@ -211,7 +211,7 @@ jobs: build-tag: ${{ 
needs.tag.outputs.build-tag }} build-type: ${{ matrix.build-type }} # Run tests on all Postgres versions in release builds and only on the latest version in debug builds - pg-versions: ${{ matrix.build-type == 'release' && '["v14", "v15", "v16"]' || '["v16"]' }} + pg-versions: ${{ matrix.build-type == 'release' && '["v14", "v15", "v16", "v17"]' || '["v17"]' }} secrets: inherit # Keep `benchmarks` job outside of `build-and-test-locally` workflow to make job failures non-blocking @@ -286,6 +286,7 @@ jobs: PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}" PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring + SYNC_AFTER_EACH_TEST: true # XXX: no coverage data handling here, since benchmarks are run on release builds, # while coverage is currently collected for the debug ones @@ -356,6 +357,7 @@ jobs: }) coverage-report: + if: ${{ !startsWith(github.ref_name, 'release') }} needs: [ check-permissions, build-build-tools-image, build-and-test-locally ] runs-on: [ self-hosted, small ] container: @@ -372,8 +374,8 @@ jobs: coverage-html: ${{ steps.upload-coverage-report-new.outputs.report-url }} coverage-json: ${{ steps.upload-coverage-report-new.outputs.summary-json }} steps: - - name: Checkout - uses: actions/checkout@v4 + # Need `fetch-depth: 0` for differential coverage (to get diff between two commits) + - uses: actions/checkout@v4 with: submodules: true fetch-depth: 0 @@ -474,11 +476,9 @@ jobs: runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} steps: - - name: Checkout - uses: actions/checkout@v4 + - uses: actions/checkout@v4 with: submodules: true - fetch-depth: 0 - uses: ./.github/actions/set-docker-config-dir - uses: docker/setup-buildx-action@v3 @@ -547,17 +547,15 @@ jobs: strategy: fail-fast: false matrix: - version: [ v14, v15, v16 ] + version: [ v14, v15, v16, v17 ] arch: [ x64, arm64 ] runs-on: ${{ 
fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} steps: - - name: Checkout - uses: actions/checkout@v4 + - uses: actions/checkout@v4 with: submodules: true - fetch-depth: 0 - uses: ./.github/actions/set-docker-config-dir - uses: docker/setup-buildx-action@v3 @@ -626,7 +624,7 @@ jobs: - name: Build compute-tools image # compute-tools are Postgres independent, so build it only once - if: matrix.version == 'v16' + if: matrix.version == 'v17' uses: docker/build-push-action@v6 with: target: compute-tools-image @@ -648,7 +646,7 @@ jobs: strategy: matrix: - version: [ v14, v15, v16 ] + version: [ v14, v15, v16, v17 ] steps: - uses: docker/login-action@v3 @@ -670,7 +668,7 @@ jobs: neondatabase/neon-test-extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-arm64 - name: Create multi-arch compute-tools image - if: matrix.version == 'v16' + if: matrix.version == 'v17' run: | docker buildx imagetools create -t neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} \ neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-x64 \ @@ -688,7 +686,7 @@ jobs: neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} - name: Push multi-arch compute-tools image to ECR - if: matrix.version == 'v16' + if: matrix.version == 'v17' run: | docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }} \ neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} @@ -699,15 +697,12 @@ jobs: strategy: fail-fast: false matrix: - version: [ v14, v15, v16 ] + version: [ v14, v15, v16, v17 ] env: VM_BUILDER_VERSION: v0.29.3 steps: - - name: Checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 + - uses: actions/checkout@v4 - name: Downloading vm-builder run: | @@ -747,10 +742,7 @@ jobs: runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }} steps: - - name: 
Checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 + - uses: actions/checkout@v4 - uses: ./.github/actions/set-docker-config-dir - uses: docker/login-action@v3 @@ -793,14 +785,11 @@ jobs: docker compose -f ./docker-compose/docker-compose.yml down promote-images: - permissions: - contents: read # This is required for actions/checkout - id-token: write # This is required for Azure Login to work. needs: [ check-permissions, tag, test-images, vm-compute-node-image ] runs-on: ubuntu-22.04 env: - VERSIONS: v14 v15 v16 + VERSIONS: v14 v15 v16 v17 steps: - uses: docker/login-action@v3 @@ -822,28 +811,6 @@ jobs: neondatabase/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }} done - - name: Azure login - if: github.ref_name == 'main' - uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # @v2.1.1 - with: - client-id: ${{ secrets.AZURE_DEV_CLIENT_ID }} - tenant-id: ${{ secrets.AZURE_TENANT_ID }} - subscription-id: ${{ secrets.AZURE_DEV_SUBSCRIPTION_ID }} - - - name: Login to ACR - if: github.ref_name == 'main' - run: | - az acr login --name=neoneastus2 - - - name: Copy docker images to ACR-dev - if: github.ref_name == 'main' - run: | - for image in neon compute-tools {vm-,}compute-node-{v14,v15,v16}; do - docker buildx imagetools create \ - -t neoneastus2.azurecr.io/neondatabase/${image}:${{ needs.tag.outputs.build-tag }} \ - neondatabase/${image}:${{ needs.tag.outputs.build-tag }} - done - - name: Add latest tag to images if: github.ref_name == 'main' run: | @@ -863,7 +830,7 @@ jobs: done done docker buildx imagetools create -t neondatabase/neon-test-extensions-v16:latest \ - neondatabase/neon-test-extensions-v16:${{ needs.tag.outputs.build-tag }} + neondatabase/neon-test-extensions-v16:${{ needs.tag.outputs.build-tag }} - name: Login to prod ECR uses: docker/login-action@v3 @@ -876,11 +843,35 @@ jobs: - name: Copy all images to prod ECR if: github.ref_name == 'release'|| github.ref_name == 'release-proxy' run: | - for image in neon compute-tools 
{vm-,}compute-node-{v14,v15,v16}; do + for image in neon compute-tools {vm-,}compute-node-{v14,v15,v16,v17}; do docker buildx imagetools create -t 093970136003.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }} \ 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }} done + push-to-acr-dev: + if: github.ref_name == 'main' + needs: [ tag, promote-images ] + uses: ./.github/workflows/_push-to-acr.yml + with: + client_id: ${{ vars.AZURE_DEV_CLIENT_ID }} + image_tag: ${{ needs.tag.outputs.build-tag }} + images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17 + registry_name: ${{ vars.AZURE_DEV_REGISTRY_NAME }} + subscription_id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }} + tenant_id: ${{ vars.AZURE_TENANT_ID }} + + push-to-acr-prod: + if: github.ref_name == 'release'|| github.ref_name == 'release-proxy' + needs: [ tag, promote-images ] + uses: ./.github/workflows/_push-to-acr.yml + with: + client_id: ${{ vars.AZURE_PROD_CLIENT_ID }} + image_tag: ${{ needs.tag.outputs.build-tag }} + images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17 + registry_name: ${{ vars.AZURE_PROD_REGISTRY_NAME }} + subscription_id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }} + tenant_id: ${{ vars.AZURE_TENANT_ID }} + trigger-custom-extensions-build-and-wait: needs: [ check-permissions, tag ] runs-on: ubuntu-22.04 @@ -956,8 +947,9 @@ jobs: exit 1 deploy: - needs: [ check-permissions, promote-images, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait ] - if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy' + needs: [ check-permissions, promote-images, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait, push-to-acr-dev, 
push-to-acr-prod ] + # `!failure() && !cancelled()` is required because the workflow depends on the job that can be skipped: `push-to-acr-dev` and `push-to-acr-prod` + if: (github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy') && !failure() && !cancelled() runs-on: [ self-hosted, small ] container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest @@ -971,15 +963,12 @@ jobs: # git config --global --add safe.directory ${{ github.workspace }} git config --global --add safe.directory ${GITHUB_WORKSPACE} - for r in 14 15 16; do + for r in 14 15 16 17; do git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r" git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r" done - - name: Checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 + - uses: actions/checkout@v4 - name: Trigger deploy workflow env: @@ -1055,43 +1044,90 @@ jobs: generate_release_notes: true, }) + # The job runs on `release` branch and copies compatibility data and Neon artifact from the last *release PR* to the latest directory promote-compatibility-data: - needs: [ check-permissions, promote-images, tag, build-and-test-locally ] - if: github.ref_name == 'release' + needs: [ deploy ] + # `!failure() && !cancelled()` is required because the workflow transitively depends on the job that can be skipped: `push-to-acr-dev` and `push-to-acr-prod` + if: github.ref_name == 'release' && !failure() && !cancelled() - runs-on: [ self-hosted, small ] - container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned - options: --init + runs-on: ubuntu-22.04 steps: - - name: Promote compatibility snapshot for the release + - name: Fetch GITHUB_RUN_ID and COMMIT_SHA for the last merged release PR + id: fetch-last-release-pr-info + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + branch_name_and_pr_number=$(gh pr list \ + --repo "${GITHUB_REPOSITORY}" \ + --base release \ + 
--state merged \ + --limit 10 \ + --json mergeCommit,headRefName,number \ + --jq ".[] | select(.mergeCommit.oid==\"${GITHUB_SHA}\") | { branch_name: .headRefName, pr_number: .number }") + branch_name=$(echo "${branch_name_and_pr_number}" | jq -r '.branch_name') + pr_number=$(echo "${branch_name_and_pr_number}" | jq -r '.pr_number') + + run_id=$(gh run list \ + --repo "${GITHUB_REPOSITORY}" \ + --workflow build_and_test.yml \ + --branch "${branch_name}" \ + --json databaseId \ + --limit 1 \ + --jq '.[].databaseId') + + last_commit_sha=$(gh pr view "${pr_number}" \ + --repo "${GITHUB_REPOSITORY}" \ + --json commits \ + --jq '.commits[-1].oid') + + echo "run-id=${run_id}" | tee -a ${GITHUB_OUTPUT} + echo "commit-sha=${last_commit_sha}" | tee -a ${GITHUB_OUTPUT} + + - name: Promote compatibility snapshot and Neon artifact env: BUCKET: neon-github-public-dev - PREFIX: artifacts/latest - COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} + AWS_REGION: eu-central-1 + COMMIT_SHA: ${{ steps.fetch-last-release-pr-info.outputs.commit-sha }} + RUN_ID: ${{ steps.fetch-last-release-pr-info.outputs.run-id }} run: | - # Update compatibility snapshot for the release - for pg_version in v14 v15 v16; do - for build_type in debug release; do - OLD_FILENAME=compatibility-snapshot-${build_type}-pg${pg_version}-${GITHUB_RUN_ID}.tar.zst - NEW_FILENAME=compatibility-snapshot-${build_type}-pg${pg_version}.tar.zst + old_prefix="artifacts/${COMMIT_SHA}/${RUN_ID}" + new_prefix="artifacts/latest" - time aws s3 mv --only-show-errors s3://${BUCKET}/${PREFIX}/${OLD_FILENAME} s3://${BUCKET}/${PREFIX}/${NEW_FILENAME} + files_to_promote=() + files_on_s3=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${old_prefix} | jq -r '.Contents[]?.Key' || true) + + for arch in X64 ARM64; do + for build_type in debug release; do + neon_artifact_filename="neon-Linux-${arch}-${build_type}-artifact.tar.zst" + s3_key=$(echo "${files_on_s3}" | grep ${neon_artifact_filename} | sort 
--version-sort | tail -1 || true) + if [ -z "${s3_key}" ]; then + echo >&2 "Neither s3://${BUCKET}/${old_prefix}/${neon_artifact_filename} nor its version from previous attempts exist" + exit 1 + fi + + files_to_promote+=("s3://${BUCKET}/${s3_key}") + + # TODO Add v17 + for pg_version in v14 v15 v16; do + # We run fewer tests for debug builds, so only the X64 pg v16 debug snapshot is promoted. NOTE(review): this patch also switches debug test runs to '["v17"]' only; confirm a debug pg v16 snapshot is still uploaded, otherwise the missing-key check below exits 1 + if [ "${build_type}" == "debug" ] && { [ "${arch}" == "ARM64" ] || [ "${pg_version}" != "v16" ] ; }; then + continue + fi + + compatibility_data_filename="compatibility-snapshot-${arch}-${build_type}-pg${pg_version}.tar.zst" + s3_key=$(echo "${files_on_s3}" | grep ${compatibility_data_filename} | sort --version-sort | tail -1 || true) + if [ -z "${s3_key}" ]; then + echo >&2 "Neither s3://${BUCKET}/${old_prefix}/${compatibility_data_filename} nor its version from previous attempts exist" + exit 1 + fi + + files_to_promote+=("s3://${BUCKET}/${s3_key}") + done done done - # Update Neon artifact for the release (reuse already uploaded artifact) - for build_type in debug release; do - OLD_PREFIX=artifacts/${COMMIT_SHA}/${GITHUB_RUN_ID} - FILENAME=neon-${{ runner.os }}-${{ runner.arch }}-${build_type}-artifact.tar.zst - - S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${OLD_PREFIX} | jq -r '.Contents[]?.Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true) - if [ -z "${S3_KEY}" ]; then - echo >&2 "Neither s3://${BUCKET}/${OLD_PREFIX}/${FILENAME} nor its version from previous attempts exist" - exit 1 - fi - - time aws s3 cp --only-show-errors s3://${BUCKET}/${S3_KEY} s3://${BUCKET}/${PREFIX}/${FILENAME} + for f in "${files_to_promote[@]}"; do + time aws s3 cp --only-show-errors ${f} s3://${BUCKET}/${new_prefix}/ done pin-build-tools-image: diff --git a/.github/workflows/label-for-external-users.yml b/.github/workflows/label-for-external-users.yml index 585d118dfb..b7cbc06a73 100644 --- a/.github/workflows/label-for-external-users.yml +++
b/.github/workflows/label-for-external-users.yml @@ -7,6 +7,11 @@ on: pull_request_target: types: - opened + workflow_dispatch: + inputs: + github-actor: + description: 'GitHub username. If empty, the username of the current user will be used' + required: false # No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. permissions: {} @@ -26,12 +31,31 @@ jobs: id: check-user env: GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} + ACTOR: ${{ inputs.github-actor || github.actor }} run: | - if gh api -H "Accept: application/vnd.github+json" -H "X-GitHub-Api-Version: 2022-11-28" "/orgs/${GITHUB_REPOSITORY_OWNER}/members/${GITHUB_ACTOR}"; then - is_member=true - else - is_member=false - fi + expected_error="User does not exist or is not a member of the organization" + output_file=output.txt + + for i in $(seq 1 10); do + if gh api "/orgs/${GITHUB_REPOSITORY_OWNER}/members/${ACTOR}" \ + -H "Accept: application/vnd.github+json" \ + -H "X-GitHub-Api-Version: 2022-11-28" > ${output_file}; then + + is_member=true + break + elif grep -q "${expected_error}" ${output_file}; then + is_member=false + break + elif [ $i -eq 10 ]; then + title="Failed to get membership status for ${ACTOR}" + message="The latest GitHub API error message: '$(cat ${output_file})'" + echo "::error file=.github/workflows/label-for-external-users.yml,title=${title}::${message}" + + exit 1 + fi + + sleep 1 + done echo "is-member=${is_member}" | tee -a ${GITHUB_OUTPUT} diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index 7fecdbde8c..140aac032a 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -72,6 +72,10 @@ jobs: id: pg_v16_rev run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT + - name: Set pg 17 revision for caching + id: pg_v17_rev + run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v17) >> $GITHUB_OUTPUT + - name: Cache
postgres v14 build id: cache_pg_14 uses: actions/cache@v4 @@ -93,6 +97,13 @@ jobs: path: pg_install/v16 key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + - name: Cache postgres v17 build + id: cache_pg_17 + uses: actions/cache@v4 + with: + path: pg_install/v17 + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + - name: Set extra env for macOS run: | echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV @@ -120,6 +131,10 @@ jobs: if: steps.cache_pg_16.outputs.cache-hit != 'true' run: make postgres-v16 -j$(sysctl -n hw.ncpu) + - name: Build postgres v17 + if: steps.cache_pg_17.outputs.cache-hit != 'true' + run: make postgres-v17 -j$(sysctl -n hw.ncpu) + - name: Build neon extensions run: make neon-pg-ext -j$(sysctl -n hw.ncpu) @@ -166,7 +181,7 @@ jobs: run: make walproposer-lib -j$(nproc) - name: Produce the build stats - run: PQ_LIB_DIR=$(pwd)/pg_install/v16/lib cargo build --all --release --timings -j$(nproc) + run: PQ_LIB_DIR=$(pwd)/pg_install/v17/lib cargo build --all --release --timings -j$(nproc) - name: Upload the build stats id: upload-stats diff --git a/.github/workflows/trigger-e2e-tests.yml b/.github/workflows/trigger-e2e-tests.yml index 6fbe785c56..b299cf9b99 100644 --- a/.github/workflows/trigger-e2e-tests.yml +++ b/.github/workflows/trigger-e2e-tests.yml @@ -34,8 +34,8 @@ jobs: build-tag: ${{ steps.build-tag.outputs.tag }} steps: - - name: Checkout - uses: actions/checkout@v4 + # Need `fetch-depth: 0` to count the number of commits in the branch + - uses: actions/checkout@v4 with: fetch-depth: 0 diff --git a/.gitmodules b/.gitmodules index 1d925674a1..d1330bf28c 100644 --- a/.gitmodules +++ b/.gitmodules @@ -10,3 +10,7 @@ path = vendor/postgres-v16 url = https://github.com/neondatabase/postgres.git branch = REL_16_STABLE_neon +[submodule "vendor/postgres-v17"] + path = 
vendor/postgres-v17 + url = https://github.com/neondatabase/postgres.git + branch = REL_17_STABLE_neon diff --git a/Cargo.lock b/Cargo.lock index a506da8c02..136f07956f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -915,27 +915,30 @@ dependencies = [ [[package]] name = "bindgen" -version = "0.65.1" +version = "0.70.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cfdf7b466f9a4903edc73f95d6d2bcd5baf8ae620638762244d3f60143643cc5" +checksum = "f49d8fed880d473ea71efb9bf597651e77201bdd4893efe54c9e5d65ae04ce6f" dependencies = [ - "bitflags 1.3.2", + "bitflags 2.4.1", "cexpr", "clang-sys", - "lazy_static", - "lazycell", + "itertools 0.12.1", "log", - "peeking_take_while", - "prettyplease 0.2.6", + "prettyplease 0.2.17", "proc-macro2", "quote", "regex", "rustc-hash", "shlex", "syn 2.0.52", - "which", ] +[[package]] +name = "bit_field" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc827186963e592360843fb5ba4b973e145841266c1357f7180c43526f2e5b61" + [[package]] name = "bitflags" version = "1.3.2" @@ -1186,9 +1189,9 @@ dependencies = [ [[package]] name = "comfy-table" -version = "6.1.4" +version = "7.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e7b787b0dc42e8111badfdbe4c3059158ccb2db8780352fa1b01e8ccf45cc4d" +checksum = "b34115915337defe99b2aff5c2ce6771e5fbc4079f4b506301f5cf394c8452f7" dependencies = [ "crossterm", "strum", @@ -1206,7 +1209,6 @@ dependencies = [ "remote_storage", "serde", "serde_json", - "serde_with", "utils", ] @@ -1215,7 +1217,6 @@ name = "compute_tools" version = "0.1.0" dependencies = [ "anyhow", - "async-compression", "bytes", "cfg-if", "chrono", @@ -1234,7 +1235,6 @@ dependencies = [ "reqwest 0.12.4", "rlimit", "rust-ini", - "serde", "serde_json", "signal-hook", "tar", @@ -1243,7 +1243,6 @@ dependencies = [ "tokio-postgres", "tokio-stream", "tokio-util", - "toml_edit 0.19.10", "tracing", "tracing-opentelemetry", "tracing-subscriber", 
@@ -1314,12 +1313,9 @@ dependencies = [ name = "consumption_metrics" version = "0.1.0" dependencies = [ - "anyhow", "chrono", "rand 0.8.5", "serde", - "serde_with", - "utils", ] [[package]] @@ -1327,14 +1323,11 @@ name = "control_plane" version = "0.1.0" dependencies = [ "anyhow", - "async-trait", "camino", "clap", "comfy-table", "compute_api", - "futures", "git-version", - "hex", "humantime", "humantime-serde", "hyper 0.14.26", @@ -1342,7 +1335,6 @@ dependencies = [ "once_cell", "pageserver_api", "pageserver_client", - "postgres", "postgres_backend", "postgres_connection", "regex", @@ -1351,15 +1343,13 @@ dependencies = [ "scopeguard", "serde", "serde_json", - "serde_with", "storage_broker", - "tar", "thiserror", "tokio", "tokio-postgres", "tokio-util", - "toml 0.7.4", - "toml_edit 0.19.10", + "toml", + "toml_edit", "tracing", "url", "utils", @@ -1483,25 +1473,22 @@ checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345" [[package]] name = "crossterm" -version = "0.25.0" +version = "0.27.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e64e6c0fbe2c17357405f7c758c1ef960fce08bdfb2c03d88d2a18d7e09c4b67" +checksum = "f476fe445d41c9e991fd07515a6f463074b782242ccf4a5b7b1d1012e70824df" dependencies = [ - "bitflags 1.3.2", + "bitflags 2.4.1", "crossterm_winapi", "libc", - "mio", "parking_lot 0.12.1", - "signal-hook", - "signal-hook-mio", "winapi", ] [[package]] name = "crossterm_winapi" -version = "0.9.0" +version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2ae1b35a484aa10e07fe0638d02301c5ad24de82d310ccbd2f3693da5f09bf1c" +checksum = "acdd7c62a3665c7f6830a51635d9ac9b23ed385797f70a83bb8bafe9c572ab2b" dependencies = [ "winapi", ] @@ -1664,7 +1651,6 @@ dependencies = [ "hex", "parking_lot 0.12.1", "rand 0.8.5", - "scopeguard", "smallvec", "tracing", "utils", @@ -1672,9 +1658,9 @@ dependencies = [ [[package]] name = "diesel" -version = "2.2.1" +version = "2.2.3" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "62d6dcd069e7b5fe49a302411f759d4cf1cf2c27fe798ef46fb8baefc053dd2b" +checksum = "65e13bab2796f412722112327f3e575601a3e9cdcbe426f0d30dbf43f3f5dc71" dependencies = [ "bitflags 2.4.1", "byteorder", @@ -2234,24 +2220,22 @@ checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253" [[package]] name = "git-version" -version = "0.3.5" +version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6b0decc02f4636b9ccad390dcbe77b722a77efedfa393caf8379a51d5c61899" +checksum = "1ad568aa3db0fcbc81f2f116137f263d7304f512a1209b35b85150d3ef88ad19" dependencies = [ "git-version-macro", - "proc-macro-hack", ] [[package]] name = "git-version-macro" -version = "0.3.5" +version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fe69f1cbdb6e28af2bac214e943b99ce8a0a06b447d15d3e61161b0423139f3f" +checksum = "53010ccb100b96a67bc32c0175f0ed1426b31b655d562898e57325f81c023ac0" dependencies = [ - "proc-macro-hack", "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.52", ] [[package]] @@ -2722,6 +2706,12 @@ dependencies = [ "hashbrown 0.14.5", ] +[[package]] +name = "indoc" +version = "2.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b248f5224d1d606005e02c97f5aa4e88eeb230488bcc03bc9ca4d7991399f2b5" + [[package]] name = "infer" version = "0.2.3" @@ -2739,19 +2729,6 @@ dependencies = [ "libc", ] -[[package]] -name = "inotify" -version = "0.10.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fdd168d97690d0b8c412d6b6c10360277f4d7ee495c5d0d5d5fe0854923255cc" -dependencies = [ - "bitflags 1.3.2", - "futures-core", - "inotify-sys", - "libc", - "tokio", -] - [[package]] name = "inotify-sys" version = "0.1.5" @@ -2938,23 +2915,6 @@ dependencies = [ "spin 0.5.2", ] -[[package]] -name = "lazycell" -version = "1.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" - -[[package]] -name = "leaky-bucket" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8eb491abd89e9794d50f93c8db610a29509123e3fbbc9c8c67a528e9391cd853" -dependencies = [ - "parking_lot 0.12.1", - "tokio", - "tracing", -] - [[package]] name = "libc" version = "0.2.150" @@ -3153,7 +3113,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fd01039851e82f8799046eabbb354056283fb265c8ec0996af940f4e85a380ff" dependencies = [ "serde", - "toml 0.8.14", + "toml", ] [[package]] @@ -3263,7 +3223,7 @@ dependencies = [ "crossbeam-channel", "filetime", "fsevent-sys", - "inotify 0.9.6", + "inotify", "kqueue", "libc", "log", @@ -3654,7 +3614,6 @@ name = "pagectl" version = "0.1.0" dependencies = [ "anyhow", - "bytes", "camino", "clap", "git-version", @@ -3663,13 +3622,12 @@ dependencies = [ "pageserver_api", "postgres_ffi", "remote_storage", - "serde", "serde_json", "svg_fmt", "thiserror", "tokio", "tokio-util", - "toml_edit 0.19.10", + "toml_edit", "utils", "workspace_hack", ] @@ -3682,23 +3640,20 @@ dependencies = [ "arc-swap", "async-compression", "async-stream", - "async-trait", + "bit_field", "byteorder", "bytes", "camino", "camino-tempfile", "chrono", "clap", - "const_format", "consumption_metrics", "crc32c", "criterion", - "crossbeam-utils", "either", "enum-map", "enumset", "fail", - "flate2", "futures", "git-version", "hex", @@ -3706,8 +3661,8 @@ dependencies = [ "humantime", "humantime-serde", "hyper 0.14.26", + "indoc", "itertools 0.10.5", - "leaky-bucket", "md5", "metrics", "nix 0.27.1", @@ -3732,17 +3687,14 @@ dependencies = [ "reqwest 0.12.4", "rpds", "scopeguard", + "send-future", "serde", "serde_json", "serde_path_to_error", "serde_with", - "signal-hook", - "smallvec", "storage_broker", "strum", "strum_macros", - "svg_fmt", - "sync_wrapper", "sysinfo", "tenant_size_model", "thiserror", @@ -3754,9 +3706,8 @@ 
dependencies = [ "tokio-stream", "tokio-tar", "tokio-util", - "toml_edit 0.19.10", + "toml_edit", "tracing", - "twox-hash", "url", "utils", "walkdir", @@ -3771,6 +3722,7 @@ dependencies = [ "bincode", "byteorder", "bytes", + "camino", "chrono", "const_format", "enum-map", @@ -3778,11 +3730,16 @@ dependencies = [ "humantime", "humantime-serde", "itertools 0.10.5", + "nix 0.27.1", + "postgres_backend", "postgres_ffi", "rand 0.8.5", + "remote_storage", + "reqwest 0.12.4", "serde", "serde_json", "serde_with", + "storage_broker", "strum", "strum_macros", "thiserror", @@ -3794,7 +3751,6 @@ name = "pageserver_client" version = "0.1.0" dependencies = [ "anyhow", - "async-trait", "bytes", "futures", "pageserver_api", @@ -3815,44 +3771,22 @@ name = "pageserver_compaction" version = "0.1.0" dependencies = [ "anyhow", - "async-compression", "async-stream", - "byteorder", - "bytes", - "chrono", "clap", - "const_format", - "consumption_metrics", "criterion", - "crossbeam-utils", - "either", - "fail", - "flate2", "futures", "git-version", - "hex", "hex-literal", - "humantime", - "humantime-serde", "itertools 0.10.5", - "metrics", "once_cell", "pageserver_api", "pin-project-lite", "rand 0.8.5", - "smallvec", "svg_fmt", - "sync_wrapper", - "thiserror", "tokio", - "tokio-io-timeout", - "tokio-util", "tracing", - "tracing-error", "tracing-subscriber", - "url", "utils", - "walkdir", "workspace_hack", ] @@ -3912,8 +3846,9 @@ dependencies = [ [[package]] name = "parquet" -version = "51.0.0" -source = "git+https://github.com/apache/arrow-rs?branch=master#2534976a564be3d2d56312dc88fb1b6ed4cef829" +version = "53.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0fbf928021131daaa57d334ca8e3904fe9ae22f73c56244fc7db9b04eedc3d8" dependencies = [ "ahash", "bytes", @@ -3932,8 +3867,9 @@ dependencies = [ [[package]] name = "parquet_derive" -version = "51.0.0" -source = 
"git+https://github.com/apache/arrow-rs?branch=master#2534976a564be3d2d56312dc88fb1b6ed4cef829" +version = "53.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "86e9fcfae007533a06b580429a3f7e07cb833ec8aa37c041c16563e7918f057e" dependencies = [ "parquet", "proc-macro2", @@ -3970,12 +3906,6 @@ dependencies = [ "sha2", ] -[[package]] -name = "peeking_take_while" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" - [[package]] name = "pem" version = "3.0.3" @@ -4129,7 +4059,7 @@ dependencies = [ [[package]] name = "postgres" version = "0.19.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=20031d7a9ee1addeae6e0968e3899ae6bf01cee2#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" dependencies = [ "bytes", "fallible-iterator", @@ -4142,7 +4072,7 @@ dependencies = [ [[package]] name = "postgres-protocol" version = "0.6.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=20031d7a9ee1addeae6e0968e3899ae6bf01cee2#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" dependencies = [ "base64 0.20.0", "byteorder", @@ -4161,7 +4091,7 @@ dependencies = [ [[package]] name = "postgres-types" version = "0.2.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=20031d7a9ee1addeae6e0968e3899ae6bf01cee2#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" dependencies = [ "bytes", "fallible-iterator", @@ -4173,9 +4103,7 @@ name = "postgres_backend" version = "0.1.0" dependencies = [ "anyhow", - "async-trait", "bytes", - "futures", "once_cell", "pq_proto", 
"rustls 0.22.4", @@ -4208,16 +4136,13 @@ version = "0.1.0" dependencies = [ "anyhow", "bindgen", - "byteorder", "bytes", "crc32c", "env_logger", - "hex", "log", "memoffset 0.8.0", "once_cell", "postgres", - "rand 0.8.5", "regex", "serde", "thiserror", @@ -4252,13 +4177,11 @@ dependencies = [ "byteorder", "bytes", "itertools 0.10.5", - "pin-project-lite", "postgres-protocol", "rand 0.8.5", "serde", "thiserror", "tokio", - "tracing", ] [[package]] @@ -4273,9 +4196,9 @@ dependencies = [ [[package]] name = "prettyplease" -version = "0.2.6" +version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b69d39aab54d069e7f2fe8cb970493e7834601ca2d8c65fd7bbd183578080d1" +checksum = "8d3928fb5db768cb86f891ff014f0144589297e3c6a1aba6ed7cecfdace270c7" dependencies = [ "proc-macro2", "syn 2.0.52", @@ -4290,12 +4213,6 @@ dependencies = [ "elliptic-curve 0.13.8", ] -[[package]] -name = "proc-macro-hack" -version = "0.5.20+deprecated" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068" - [[package]] name = "proc-macro2" version = "1.0.78" @@ -4414,7 +4331,6 @@ dependencies = [ "aws-config", "aws-sdk-iam", "aws-sigv4", - "aws-types", "base64 0.13.1", "bstr", "bytes", @@ -4423,7 +4339,6 @@ dependencies = [ "chrono", "clap", "consumption_metrics", - "crossbeam-deque", "dashmap", "ecdsa 0.16.9", "env_logger", @@ -4449,11 +4364,9 @@ dependencies = [ "jose-jwa", "jose-jwk", "lasso", - "md5", "measured", "metrics", "once_cell", - "opentelemetry", "p256 0.13.2", "parking_lot 0.12.1", "parquet", @@ -4474,7 +4387,6 @@ dependencies = [ "reqwest-middleware", "reqwest-retry", "reqwest-tracing", - "routerify", "rsa", "rstest", "rustc-hash", @@ -4490,7 +4402,6 @@ dependencies = [ "smol_str", "socket2 0.5.5", "subtle", - "task-local-extensions", "thiserror", "tikv-jemalloc-ctl", "tikv-jemallocator", @@ -4500,7 +4411,6 @@ dependencies = [ "tokio-rustls 0.25.0", 
"tokio-tungstenite", "tokio-util", - "tower-service", "tracing", "tracing-opentelemetry", "tracing-subscriber", @@ -4790,7 +4700,6 @@ dependencies = [ "async-stream", "async-trait", "aws-config", - "aws-credential-types", "aws-sdk-s3", "aws-smithy-async", "aws-smithy-types", @@ -4804,7 +4713,6 @@ dependencies = [ "futures", "futures-util", "http-types", - "humantime", "humantime-serde", "hyper 0.14.26", "itertools 0.10.5", @@ -4820,7 +4728,7 @@ dependencies = [ "tokio", "tokio-stream", "tokio-util", - "toml_edit 0.19.10", + "toml_edit", "tracing", "utils", ] @@ -5284,14 +5192,12 @@ version = "0.1.0" dependencies = [ "anyhow", "async-stream", - "async-trait", "byteorder", "bytes", "camino", "camino-tempfile", "chrono", "clap", - "const_format", "crc32c", "desim", "fail", @@ -5317,9 +5223,7 @@ dependencies = [ "sd-notify", "serde", "serde_json", - "serde_with", "sha2", - "signal-hook", "storage_broker", "strum", "strum_macros", @@ -5330,7 +5234,6 @@ dependencies = [ "tokio-stream", "tokio-tar", "tokio-util", - "toml_edit 0.19.10", "tracing", "tracing-subscriber", "url", @@ -5345,7 +5248,6 @@ version = "0.1.0" dependencies = [ "const_format", "serde", - "serde_with", "utils", ] @@ -5455,6 +5357,12 @@ version = "1.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bebd363326d05ec3e2f532ab7660680f3b02130d780c299bca73469d521bc0ed" +[[package]] +name = "send-future" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "224e328af6e080cddbab3c770b1cf50f0351ba0577091ef2410c3951d835ff87" + [[package]] name = "sentry" version = "0.32.3" @@ -5590,11 +5498,12 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.96" +version = "1.0.125" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "057d394a50403bcac12672b2b18fb387ab6d289d957dab67dd201875391e52f1" +checksum = "83c8e735a073ccf5be70aa8066aa984eaf2fa000db6c8d0100ae605b366d31ed" dependencies = [ "itoa", + 
"memchr", "ryu", "serde", ] @@ -5732,17 +5641,6 @@ dependencies = [ "signal-hook-registry", ] -[[package]] -name = "signal-hook-mio" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "29ad2e15f37ec9a6cc544097b78a1ec90001e9f71b81338ca39f430adaca99af" -dependencies = [ - "libc", - "mio", - "signal-hook", -] - [[package]] name = "signal-hook-registry" version = "1.4.1" @@ -5878,7 +5776,6 @@ version = "0.1.0" dependencies = [ "anyhow", "async-stream", - "bytes", "clap", "const_format", "futures", @@ -5892,7 +5789,6 @@ dependencies = [ "parking_lot 0.12.1", "prost", "tokio", - "tokio-stream", "tonic", "tonic-build", "tracing", @@ -5905,9 +5801,7 @@ name = "storage_controller" version = "0.1.0" dependencies = [ "anyhow", - "aws-config", "bytes", - "camino", "chrono", "clap", "control_plane", @@ -5948,21 +5842,9 @@ dependencies = [ name = "storage_controller_client" version = "0.1.0" dependencies = [ - "anyhow", - "async-trait", - "bytes", - "futures", - "pageserver_api", "pageserver_client", - "postgres", "reqwest 0.12.4", "serde", - "thiserror", - "tokio", - "tokio-postgres", - "tokio-stream", - "tokio-util", - "utils", "workspace_hack", ] @@ -5974,13 +5856,9 @@ dependencies = [ "async-stream", "aws-config", "aws-sdk-s3", - "aws-smithy-async", - "bincode", - "bytes", "camino", "chrono", "clap", - "crc32c", "either", "futures", "futures-util", @@ -5992,20 +5870,16 @@ dependencies = [ "pageserver", "pageserver_api", "postgres_ffi", - "rand 0.8.5", "remote_storage", "reqwest 0.12.4", "rustls 0.22.4", "rustls-native-certs 0.7.0", "serde", "serde_json", - "serde_with", "storage_controller_client", - "thiserror", "tokio", "tokio-postgres", "tokio-postgres-rustls", - "tokio-rustls 0.25.0", "tokio-stream", "tokio-util", "tracing", @@ -6024,14 +5898,11 @@ dependencies = [ "comfy-table", "futures", "humantime", - "hyper 0.14.26", "pageserver_api", "pageserver_client", "reqwest 0.12.4", - "serde", "serde_json", 
"storage_controller_client", - "thiserror", "tokio", "tracing", "utils", @@ -6056,21 +5927,21 @@ checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" [[package]] name = "strum" -version = "0.24.1" +version = "0.26.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "063e6045c0e62079840579a7e47a355ae92f60eb74daaf156fb1e84ba164e63f" +checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06" [[package]] name = "strum_macros" -version = "0.24.3" +version = "0.26.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e385be0d24f186b4ce2f9982191e7101bb737312ad61c1f2f984f34bcf85d59" +checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be" dependencies = [ - "heck 0.4.1", + "heck 0.5.0", "proc-macro2", "quote", "rustversion", - "syn 1.0.109", + "syn 2.0.52", ] [[package]] @@ -6081,8 +5952,9 @@ checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" [[package]] name = "svg_fmt" -version = "0.4.2" -source = "git+https://github.com/nical/rust_debug?rev=28a7d96eecff2f28e75b1ea09f2d499a60d0e3b4#28a7d96eecff2f28e75b1ea09f2d499a60d0e3b4" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "20e16a0f46cf5fd675563ef54f26e83e20f2366bcf027bcb3cc3ed2b98aaf2ca" [[package]] name = "syn" @@ -6153,15 +6025,6 @@ dependencies = [ "xattr", ] -[[package]] -name = "task-local-extensions" -version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba323866e5d033818e3240feeb9f7db2c4296674e4d9e16b97b7bf8f490434e8" -dependencies = [ - "pin-utils", -] - [[package]] name = "tempfile" version = "3.9.0" @@ -6410,7 +6273,7 @@ dependencies = [ [[package]] name = "tokio-postgres" version = "0.7.7" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" +source = 
"git+https://github.com/neondatabase/rust-postgres.git?rev=20031d7a9ee1addeae6e0968e3899ae6bf01cee2#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" dependencies = [ "async-trait", "byteorder", @@ -6521,18 +6384,6 @@ dependencies = [ "tracing", ] -[[package]] -name = "toml" -version = "0.7.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d6135d499e69981f9ff0ef2167955a5333c35e36f6937d382974566b3d5b94ec" -dependencies = [ - "serde", - "serde_spanned", - "toml_datetime", - "toml_edit 0.19.10", -] - [[package]] name = "toml" version = "0.8.14" @@ -6542,7 +6393,7 @@ dependencies = [ "serde", "serde_spanned", "toml_datetime", - "toml_edit 0.22.14", + "toml_edit", ] [[package]] @@ -6554,19 +6405,6 @@ dependencies = [ "serde", ] -[[package]] -name = "toml_edit" -version = "0.19.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2380d56e8670370eee6566b0bfd4265f65b3f432e8c6d85623f728d4fa31f739" -dependencies = [ - "indexmap 1.9.3", - "serde", - "serde_spanned", - "toml_datetime", - "winnow 0.4.6", -] - [[package]] name = "toml_edit" version = "0.22.14" @@ -6577,7 +6415,7 @@ dependencies = [ "serde", "serde_spanned", "toml_datetime", - "winnow 0.6.13", + "winnow", ] [[package]] @@ -6777,7 +6615,6 @@ dependencies = [ "opentelemetry", "opentelemetry-otlp", "opentelemetry-semantic-conventions", - "reqwest 0.12.4", "tokio", "tracing", "tracing-opentelemetry", @@ -6952,7 +6789,6 @@ dependencies = [ "anyhow", "arc-swap", "async-compression", - "async-trait", "bincode", "byteorder", "bytes", @@ -6968,7 +6804,6 @@ dependencies = [ "humantime", "hyper 0.14.26", "jsonwebtoken", - "leaky-bucket", "metrics", "nix 0.27.1", "once_cell", @@ -6983,7 +6818,6 @@ dependencies = [ "serde_assert", "serde_json", "serde_path_to_error", - "serde_with", "signal-hook", "strum", "strum_macros", @@ -6992,7 +6826,7 @@ dependencies = [ "tokio-stream", "tokio-tar", "tokio-util", - "toml_edit 0.19.10", + "toml_edit", "tracing", "tracing-error", 
"tracing-subscriber", @@ -7039,13 +6873,11 @@ dependencies = [ "cgroups-rs", "clap", "futures", - "inotify 0.10.2", "serde", "serde_json", "sysinfo", "tokio", "tokio-postgres", - "tokio-stream", "tokio-util", "tracing", "tracing-subscriber", @@ -7072,7 +6904,6 @@ dependencies = [ "clap", "env_logger", "log", - "once_cell", "postgres", "postgres_ffi", "regex", @@ -7538,15 +7369,6 @@ version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8" -[[package]] -name = "winnow" -version = "0.4.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61de7bac303dc551fe038e2b3cef0f571087a47571ea6e79a87692ac99b99699" -dependencies = [ - "memchr", -] - [[package]] name = "winnow" version = "0.6.13" @@ -7604,6 +7426,7 @@ dependencies = [ "digest", "either", "fail", + "futures", "futures-channel", "futures-executor", "futures-io", @@ -7616,6 +7439,7 @@ dependencies = [ "hyper 0.14.26", "indexmap 1.9.3", "itertools 0.10.5", + "itertools 0.12.1", "lazy_static", "libc", "log", @@ -7653,10 +7477,13 @@ dependencies = [ "tokio", "tokio-rustls 0.24.0", "tokio-util", + "toml_edit", "tonic", "tower", "tracing", "tracing-core", + "tracing-log", + "tracing-subscriber", "url", "uuid", "zeroize", diff --git a/Cargo.toml b/Cargo.toml index 963841e340..fd1d4e016c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -64,7 +64,8 @@ aws-types = "1.2.0" axum = { version = "0.6.20", features = ["ws"] } base64 = "0.13.0" bincode = "1.3" -bindgen = "0.65" +bindgen = "0.70" +bit_field = "0.10.2" bstr = "1.0" byteorder = "1.4" bytes = "1.0" @@ -72,7 +73,7 @@ camino = "1.1.6" cfg-if = "1.0.0" chrono = { version = "0.4", default-features = false, features = ["clock"] } clap = { version = "4.0", features = ["derive"] } -comfy-table = "6.1" +comfy-table = "7.1" const_format = "0.2" crc32c = "0.6" crossbeam-deque = "0.8.5" @@ -102,18 +103,18 @@ humantime-serde = "1.1.1" hyper = "0.14" 
tokio-tungstenite = "0.20.0" indexmap = "2" +indoc = "2" inotify = "0.10.2" ipnet = "2.9.0" itertools = "0.10" jsonwebtoken = "9" lasso = "0.7" -leaky-bucket = "1.0.1" libc = "0.2" md5 = "0.7.0" measured = { version = "0.0.22", features=["lasso"] } measured-process = { version = "0.0.22" } memoffset = "0.8" -nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] } +nix = { version = "0.27", features = ["dir", "fs", "process", "socket", "signal", "poll"] } notify = "6.0.0" num_cpus = "1.15" num-traits = "0.2.15" @@ -122,8 +123,8 @@ opentelemetry = "0.20.0" opentelemetry-otlp = { version = "0.13.0", default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] } opentelemetry-semantic-conventions = "0.12.0" parking_lot = "0.12" -parquet = { version = "51.0.0", default-features = false, features = ["zstd"] } -parquet_derive = "51.0.0" +parquet = { version = "53", default-features = false, features = ["zstd"] } +parquet_derive = "53" pbkdf2 = { version = "0.12.1", features = ["simple", "std"] } pin-project-lite = "0.2" procfs = "0.16" @@ -145,6 +146,7 @@ rustls-split = "0.3" scopeguard = "1.1" sysinfo = "0.29.2" sd-notify = "0.4.1" +send-future = "0.1.0" sentry = { version = "0.32", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] } serde = { version = "1.0", features = ["derive"] } serde_json = "1" @@ -156,11 +158,10 @@ signal-hook = "0.3" smallvec = "1.11" smol_str = { version = "0.2.0", features = ["serde"] } socket2 = "0.5" -strum = "0.24" -strum_macros = "0.24" +strum = "0.26" +strum_macros = "0.26" "subtle" = "2.5.0" -# Our PR https://github.com/nical/rust_debug/pull/4 has been merged but no new version released yet -svg_fmt = { git = "https://github.com/nical/rust_debug", rev = "28a7d96eecff2f28e75b1ea09f2d499a60d0e3b4" } +svg_fmt = "0.4.3" sync_wrapper = "0.1.2" tar = "0.4" task-local-extensions = "0.1.4" @@ -176,8 +177,8 @@ tokio-rustls = "0.25" tokio-stream = "0.1" 
tokio-tar = "0.3" tokio-util = { version = "0.7.10", features = ["io", "rt"] } -toml = "0.7" -toml_edit = "0.19" +toml = "0.8" +toml_edit = "0.22" tonic = {version = "0.9", features = ["tls", "tls-roots"]} tower-service = "0.3.2" tracing = "0.1" @@ -200,10 +201,21 @@ env_logger = "0.10" log = "0.4" ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed -postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" } -postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" } -postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" } -tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" } + +# We want to use the 'neon' branch for these, but there's currently one +# incompatible change on the branch. See: +# +# - PR #8076 which contained changes that depended on the new changes in +# the rust-postgres crate, and +# - PR #8654 which reverted those changes and made the code in proxy incompatible +# with the tip of the 'neon' branch again. +# +# When those proxy changes are re-applied (see PR #8747), we can switch back to +# using the tip of the 'neon' branch. +postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" } +postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" } +postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" } ## Local libraries compute_api = { version = "0.1", path = "./libs/compute_api/" } @@ -240,11 +252,7 @@ tonic-build = "0.9" [patch.crates-io] # Needed to get `tokio-postgres-rustls` to depend on our fork.
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" } - -# bug fixes for UUID -parquet = { git = "https://github.com/apache/arrow-rs", branch = "master" } -parquet_derive = { git = "https://github.com/apache/arrow-rs", branch = "master" } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" } ################# Binary contents sections diff --git a/Dockerfile b/Dockerfile index d3d12330c6..bdb76a4f4f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,6 +5,8 @@ ARG REPOSITORY=neondatabase ARG IMAGE=build-tools ARG TAG=pinned +ARG DEFAULT_PG_VERSION=17 +ARG STABLE_PG_VERSION=16 # Build Postgres FROM $REPOSITORY/$IMAGE:$TAG AS pg-build @@ -13,6 +15,7 @@ WORKDIR /home/nonroot COPY --chown=nonroot vendor/postgres-v14 vendor/postgres-v14 COPY --chown=nonroot vendor/postgres-v15 vendor/postgres-v15 COPY --chown=nonroot vendor/postgres-v16 vendor/postgres-v16 +COPY --chown=nonroot vendor/postgres-v17 vendor/postgres-v17 COPY --chown=nonroot pgxn pgxn COPY --chown=nonroot Makefile Makefile COPY --chown=nonroot scripts/ninstall.sh scripts/ninstall.sh @@ -28,16 +31,19 @@ FROM $REPOSITORY/$IMAGE:$TAG AS build WORKDIR /home/nonroot ARG GIT_VERSION=local ARG BUILD_TAG +ARG STABLE_PG_VERSION COPY --from=pg-build /home/nonroot/pg_install/v14/include/postgresql/server pg_install/v14/include/postgresql/server COPY --from=pg-build /home/nonroot/pg_install/v15/include/postgresql/server pg_install/v15/include/postgresql/server COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_install/v16/include/postgresql/server +COPY --from=pg-build /home/nonroot/pg_install/v17/include/postgresql/server pg_install/v17/include/postgresql/server COPY --from=pg-build /home/nonroot/pg_install/v16/lib pg_install/v16/lib +COPY --from=pg-build /home/nonroot/pg_install/v17/lib pg_install/v17/lib COPY --chown=nonroot . . 
ARG ADDITIONAL_RUSTFLAGS RUN set -e \ - && PQ_LIB_DIR=$(pwd)/pg_install/v16/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment ${ADDITIONAL_RUSTFLAGS}" cargo build \ + && PQ_LIB_DIR=$(pwd)/pg_install/v${STABLE_PG_VERSION}/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment ${ADDITIONAL_RUSTFLAGS}" cargo build \ --bin pg_sni_router \ --bin pageserver \ --bin pagectl \ @@ -52,6 +58,7 @@ RUN set -e \ # Build final image # FROM debian:bullseye-slim +ARG DEFAULT_PG_VERSION WORKDIR /data RUN set -e \ @@ -77,6 +84,7 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_scrubbe COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/ COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/ COPY --from=pg-build /home/nonroot/pg_install/v16 /usr/local/v16/ +COPY --from=pg-build /home/nonroot/pg_install/v17 /usr/local/v17/ COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/ # By default, pageserver uses `.neon/` working directory in WORKDIR, so create one and fill it with the dummy config. @@ -87,12 +95,13 @@ RUN mkdir -p /data/.neon/ && \ "pg_distrib_dir='/usr/local/'\n" \ "listen_pg_addr='0.0.0.0:6400'\n" \ "listen_http_addr='0.0.0.0:9898'\n" \ + "availability_zone='local'\n" \ > /data/.neon/pageserver.toml && \ chown -R neon:neon /data/.neon # When running a binary that links with libpq, default to using our most recent postgres version. Binaries # that want a particular postgres version will select it explicitly: this is just a default. 
-ENV LD_LIBRARY_PATH=/usr/local/v16/lib +ENV LD_LIBRARY_PATH=/usr/local/v${DEFAULT_PG_VERSION}/lib VOLUME ["/data"] diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools index d6beb61369..c4209c7a12 100644 --- a/Dockerfile.build-tools +++ b/Dockerfile.build-tools @@ -192,7 +192,7 @@ WORKDIR /home/nonroot # Rust # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`) -ENV RUSTC_VERSION=1.80.1 +ENV RUSTC_VERSION=1.81.0 ENV RUSTUP_HOME="/home/nonroot/.rustup" ENV PATH="/home/nonroot/.cargo/bin:${PATH}" ARG RUSTFILT_VERSION=0.2.1 @@ -207,7 +207,7 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux export PATH="$HOME/.cargo/bin:$PATH" && \ . "$HOME/.cargo/env" && \ cargo --version && rustup --version && \ - rustup component add llvm-tools-preview rustfmt clippy && \ + rustup component add llvm-tools rustfmt clippy && \ cargo install rustfilt --version ${RUSTFILT_VERSION} && \ cargo install cargo-hakari --version ${CARGO_HAKARI_VERSION} && \ cargo install cargo-deny --locked --version ${CARGO_DENY_VERSION} && \ diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 7acaf2f2fd..6bf6fb650f 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -55,22 +55,27 @@ RUN cd postgres && \ # We could add the additional grant statements to the postgres repository but it would be hard to maintain, # whenever we need to pick up a new postgres version and we want to limit the changes in our postgres fork, # so we do it here. 
- old_list="pg_stat_statements--1.0--1.1.sql pg_stat_statements--1.1--1.2.sql pg_stat_statements--1.2--1.3.sql pg_stat_statements--1.3--1.4.sql pg_stat_statements--1.4--1.5.sql pg_stat_statements--1.4.sql pg_stat_statements--1.5--1.6.sql"; \ - # the first loop is for pg_stat_statement extension version <= 1.6 for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \ filename=$(basename "$file"); \ - if echo "$old_list" | grep -q -F "$filename"; then \ + # Note that there are no downgrade scripts for pg_stat_statements, so we \ + # don't have to modify any downgrade paths or (much) older versions: we only \ + # have to make sure every creation of the pg_stat_statements_reset function \ + # also adds execute permissions to the neon_superuser. + case $filename in \ + pg_stat_statements--1.4.sql) \ + # pg_stat_statements_reset is first created with 1.4 echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset() TO neon_superuser;' >> $file; \ - fi; \ - done; \ - # the second loop is for pg_stat_statement extension versions >= 1.7, - # where pg_stat_statement_reset() got 3 additional arguments - for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \ - filename=$(basename "$file"); \ - if ! 
echo "$old_list" | grep -q -F "$filename"; then \ + ;; \ + pg_stat_statements--1.6--1.7.sql) \ + # Then with the 1.6-1.7 migration it is re-created with a new signature, thus add the permissions back echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) TO neon_superuser;' >> $file; \ - fi; \ - done + ;; \ + pg_stat_statements--1.10--1.11.sql) \ + # Then with the 1.10-1.11 migration it is re-created with a new signature again, thus add the permissions back + echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint, boolean) TO neon_superuser;' >> $file; \ + ;; \ + esac; \ + done; ######################################################################################### # @@ -79,6 +84,7 @@ RUN cd postgres && \ # ######################################################################################### FROM build-deps AS postgis-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt update && \ apt install -y cmake gdal-bin libboost-dev libboost-thread-dev libboost-filesystem-dev \ @@ -87,7 +93,11 @@ RUN apt update && \ protobuf-c-compiler xsltproc # SFCGAL > 1.3 requires CGAL > 5.2, Bullseye's libcgal-dev is 5.2 -RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz -O SFCGAL.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + mkdir -p /sfcgal && \ + echo "Postgis doesn't yet support PG17 (needs 3.4.3, if not higher)" && exit 0;; \ + esac && \ + wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz -O SFCGAL.tar.gz && \ echo "4e39b3b2adada6254a7bdba6d297bb28e1a9835a9f879b74f37e2dab70203232 SFCGAL.tar.gz" | sha256sum --check && \ mkdir sfcgal-src && cd sfcgal-src && tar xzf ../SFCGAL.tar.gz --strip-components=1 -C . && \ cmake -DCMAKE_BUILD_TYPE=Release .
&& make -j $(getconf _NPROCESSORS_ONLN) && \ @@ -96,7 +106,10 @@ RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar ENV PATH="/usr/local/pgsql/bin:$PATH" -RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postgis.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "Postgis doesn't yet support PG17 (needs 3.4.3, if not higher)" && exit 0;; \ + esac && \ + wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postgis.tar.gz && \ echo "74eb356e3f85f14233791013360881b6748f78081cc688ff9d6f0f673a762d13 postgis.tar.gz" | sha256sum --check && \ mkdir postgis-src && cd postgis-src && tar xzf ../postgis.tar.gz --strip-components=1 -C . && \ find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\ @@ -122,7 +135,10 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postg cp /usr/local/pgsql/share/extension/address_standardizer.control /extensions/postgis && \ cp /usr/local/pgsql/share/extension/address_standardizer_data_us.control /extensions/postgis -RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouting.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouting.tar.gz && \ echo "cac297c07d34460887c4f3b522b35c470138760fe358e351ad1db4edb6ee306e pgrouting.tar.gz" | sha256sum --check && \ mkdir pgrouting-src && cd pgrouting-src && tar xzf ../pgrouting.tar.gz --strip-components=1 -C .
&& \ mkdir build && cd build && \ @@ -142,12 +158,19 @@ RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouti # ######################################################################################### FROM build-deps AS plv8-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN apt update && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + apt update && \ apt install -y ninja-build python3-dev libncurses5 binutils clang -RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.10.tar.gz -O plv8.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.10.tar.gz -O plv8.tar.gz && \ echo "7096c3290928561f0d4901b7a52794295dc47f6303102fae3f8e42dd575ad97d plv8.tar.gz" | sha256sum --check && \ mkdir plv8-src && cd plv8-src && tar xzf ../plv8.tar.gz --strip-components=1 -C . && \ # generate and copy upgrade scripts @@ -172,9 +195,13 @@ RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.10.tar.gz -O plv8.t # ######################################################################################### FROM build-deps AS h3-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN case "$(uname -m)" in \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + case "$(uname -m)" in \ "x86_64") \ export CMAKE_CHECKSUM=739d372726cb23129d57a539ce1432453448816e345e1545f6127296926b6754 \ ;; \ @@ -192,7 +219,11 @@ RUN case "$(uname -m)" in \ && /tmp/cmake-install.sh --skip-license --prefix=/usr/local/ \ && rm /tmp/cmake-install.sh -RUN wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + mkdir -p /h3/usr/ && \ + echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ + esac && \ + wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz && \ echo "ec99f1f5974846bde64f4513cf8d2ea1b8d172d2218ab41803bf6a63532272bc h3.tar.gz" | sha256sum --check && \ mkdir h3-src && cd h3-src && tar xzf ../h3.tar.gz --strip-components=1 -C . && \ mkdir build && cd build && \ @@ -202,7 +233,10 @@ RUN wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz cp -R /h3/usr / && \ rm -rf build -RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3-pg.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3-pg.tar.gz && \ echo "5c17f09a820859ffe949f847bebf1be98511fb8f1bd86f94932512c00479e324 h3-pg.tar.gz" | sha256sum --check && \ mkdir h3-pg-src && cd h3-pg-src && tar xzf ../h3-pg.tar.gz --strip-components=1 -C . && \ export PATH="/usr/local/pgsql/bin:$PATH" && \ @@ -218,9 +252,13 @@ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3 # ######################################################################################### FROM build-deps AS unit-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -O postgresql-unit.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -O postgresql-unit.tar.gz && \ echo "411d05beeb97e5a4abf17572bfcfbb5a68d98d1018918feff995f6ee3bb03e79 postgresql-unit.tar.gz" | sha256sum --check && \ mkdir postgresql-unit-src && cd postgresql-unit-src && tar xzf ../postgresql-unit.tar.gz --strip-components=1 -C . 
&& \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -239,6 +277,7 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz - # ######################################################################################### FROM build-deps AS vector-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY patches/pgvector.patch /pgvector.patch @@ -246,7 +285,10 @@ COPY patches/pgvector.patch /pgvector.patch # By default, pgvector Makefile uses `-march=native`. We don't want that, # because we build the images on different machines than where we run them. # Pass OPTFLAGS="" to remove it. -RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.2.tar.gz -O pgvector.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.2.tar.gz -O pgvector.tar.gz && \ echo "617fba855c9bcb41a2a9bc78a78567fd2e147c72afd5bf9d37b31b9591632b30 pgvector.tar.gz" | sha256sum --check && \ mkdir pgvector-src && cd pgvector-src && tar xzf ../pgvector.tar.gz --strip-components=1 -C . && \ patch -p1 < /pgvector.patch && \ @@ -261,10 +303,14 @@ RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.2.tar.gz -O # ######################################################################################### FROM build-deps AS pgjwt-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # 9742dab1b2f297ad3811120db7b21451bca2d3c9 made on 13/11/2021 -RUN wget https://github.com/michelp/pgjwt/archive/9742dab1b2f297ad3811120db7b21451bca2d3c9.tar.gz -O pgjwt.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ + esac && \ + wget https://github.com/michelp/pgjwt/archive/9742dab1b2f297ad3811120db7b21451bca2d3c9.tar.gz -O pgjwt.tar.gz && \ echo "cfdefb15007286f67d3d45510f04a6a7a495004be5b3aecb12cda667e774203f pgjwt.tar.gz" | sha256sum --check && \ mkdir pgjwt-src && cd pgjwt-src && tar xzf ../pgjwt.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -277,9 +323,13 @@ RUN wget https://github.com/michelp/pgjwt/archive/9742dab1b2f297ad3811120db7b214 # ######################################################################################### FROM build-deps AS hypopg-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.0.tar.gz -O hypopg.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.0.tar.gz -O hypopg.tar.gz && \ echo "0821011743083226fc9b813c1f2ef5897a91901b57b6bea85a78e466187c6819 hypopg.tar.gz" | sha256sum --check && \ mkdir hypopg-src && cd hypopg-src && tar xzf ../hypopg.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -293,9 +343,13 @@ RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.0.tar.gz -O hypo # ######################################################################################### FROM build-deps AS pg-hashids-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz -O pg_hashids.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ + esac && \ + wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz -O pg_hashids.tar.gz && \ echo "74576b992d9277c92196dd8d816baa2cc2d8046fe102f3dcd7f3c3febed6822a pg_hashids.tar.gz" | sha256sum --check && \ mkdir pg_hashids-src && cd pg_hashids-src && tar xzf ../pg_hashids.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ @@ -309,11 +363,15 @@ RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz # ######################################################################################### FROM build-deps AS rum-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY patches/rum.patch /rum.patch -RUN wget https://github.com/postgrespro/rum/archive/refs/tags/1.3.13.tar.gz -O rum.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/postgrespro/rum/archive/refs/tags/1.3.13.tar.gz -O rum.tar.gz && \ echo "6ab370532c965568df6210bd844ac6ba649f53055e48243525b0b7e5c4d69a7d rum.tar.gz" | sha256sum --check && \ mkdir rum-src && cd rum-src && tar xzf ../rum.tar.gz --strip-components=1 -C . && \ patch -p1 < /rum.patch && \ @@ -328,9 +386,13 @@ RUN wget https://github.com/postgrespro/rum/archive/refs/tags/1.3.13.tar.gz -O r # ######################################################################################### FROM build-deps AS pgtap-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.2.0.tar.gz -O pgtap.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ + esac && \ + wget https://github.com/theory/pgtap/archive/refs/tags/v1.2.0.tar.gz -O pgtap.tar.gz && \ echo "9c7c3de67ea41638e14f06da5da57bac6f5bd03fea05c165a0ec862205a5c052 pgtap.tar.gz" | sha256sum --check && \ mkdir pgtap-src && cd pgtap-src && tar xzf ../pgtap.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -344,9 +406,13 @@ RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.2.0.tar.gz -O pgta # ######################################################################################### FROM build-deps AS ip4r-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O ip4r.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O ip4r.tar.gz && \ echo "0f7b1f159974f49a47842a8ab6751aecca1ed1142b6d5e38d81b064b2ead1b4b ip4r.tar.gz" | sha256sum --check && \ mkdir ip4r-src && cd ip4r-src && tar xzf ../ip4r.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -360,9 +426,13 @@ RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O i # ######################################################################################### FROM build-deps AS prefix-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O prefix.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ + esac && \ + wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O prefix.tar.gz && \ echo "4342f251432a5f6fb05b8597139d3ccde8dcf87e8ca1498e7ee931ca057a8575 prefix.tar.gz" | sha256sum --check && \ mkdir prefix-src && cd prefix-src && tar xzf ../prefix.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -376,9 +446,13 @@ RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O p # ######################################################################################### FROM build-deps AS hll-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar.gz -O hll.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar.gz -O hll.tar.gz && \ echo "e2f55a6f4c4ab95ee4f1b4a2b73280258c5136b161fe9d059559556079694f0e hll.tar.gz" | sha256sum --check && \ mkdir hll-src && cd hll-src && tar xzf ../hll.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -392,9 +466,13 @@ RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar # ######################################################################################### FROM build-deps AS plpgsql-check-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.5.3.tar.gz -O plpgsql_check.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ + esac && \ + wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.5.3.tar.gz -O plpgsql_check.tar.gz && \ echo "6631ec3e7fb3769eaaf56e3dfedb829aa761abf163d13dba354b4c218508e1c0 plpgsql_check.tar.gz" | sha256sum --check && \ mkdir plpgsql_check-src && cd plpgsql_check-src && tar xzf ../plpgsql_check.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ @@ -413,7 +491,10 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ARG PG_VERSION ENV PATH="/usr/local/pgsql/bin:$PATH" -RUN case "${PG_VERSION}" in \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + case "${PG_VERSION}" in \ "v14" | "v15") \ export TIMESCALEDB_VERSION=2.10.1 \ export TIMESCALEDB_CHECKSUM=6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 \ @@ -446,7 +527,10 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ARG PG_VERSION ENV PATH="/usr/local/pgsql/bin:$PATH" -RUN case "${PG_VERSION}" in \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ + esac && \ + case "${PG_VERSION}" in \ "v14") \ export PG_HINT_PLAN_VERSION=14_1_4_1 \ export PG_HINT_PLAN_CHECKSUM=c3501becf70ead27f70626bce80ea401ceac6a77e2083ee5f3ff1f1444ec1ad1 \ @@ -459,6 +543,9 @@ RUN case "${PG_VERSION}" in \ export PG_HINT_PLAN_VERSION=16_1_6_0 \ export PG_HINT_PLAN_CHECKSUM=fc85a9212e7d2819d4ae4ac75817481101833c3cfa9f0fe1f980984e12347d00 \ ;; \ + "v17") \ + echo "TODO: PG17 pg_hint_plan support" && exit 0 \ + ;; \ *) \ echo "Export the valid PG_HINT_PLAN_VERSION variable" && exit 1 \ ;; \ @@ -478,10 +565,14 @@ RUN case "${PG_VERSION}" in \ # ######################################################################################### FROM build-deps AS pg-cron-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.0.tar.gz -O pg_cron.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.0.tar.gz -O pg_cron.tar.gz && \ echo "383a627867d730222c272bfd25cd5e151c578d73f696d32910c7db8c665cc7db pg_cron.tar.gz" | sha256sum --check && \ mkdir pg_cron-src && cd pg_cron-src && tar xzf ../pg_cron.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ @@ -495,9 +586,13 @@ RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.0.tar.gz -O # ######################################################################################### FROM build-deps AS rdkit-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN apt-get update && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ + esac && \ + apt-get update && \ apt-get install -y \ cmake \ libboost-iostreams1.74-dev \ @@ -507,7 +602,10 @@ RUN apt-get update && \ libeigen3-dev ENV PATH="/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH" -RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \ echo "bdbf9a2e6988526bfeb8c56ce3cdfe2998d60ac289078e2215374288185e8c8d rdkit.tar.gz" | sha256sum --check && \ mkdir rdkit-src && cd rdkit-src && tar xzf ../rdkit.tar.gz --strip-components=1 -C . && \ cmake \ @@ -544,10 +642,14 @@ RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar. # ######################################################################################### FROM build-deps AS pg-uuidv7-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz -O pg_uuidv7.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz -O pg_uuidv7.tar.gz && \ echo "0d0759ab01b7fb23851ecffb0bce27822e1868a4a5819bfd276101c716637a7a pg_uuidv7.tar.gz" | sha256sum --check && \ mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xzf ../pg_uuidv7.tar.gz --strip-components=1 -C . 
&& \ make -j $(getconf _NPROCESSORS_ONLN) && \ @@ -561,10 +663,14 @@ RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz # ######################################################################################### FROM build-deps AS pg-roaringbitmap-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions is not supported yet by pg_roaringbitmap. Quit" && exit 0;; \ + esac && \ + wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \ echo "b75201efcb1c2d1b014ec4ae6a22769cc7a224e6e406a587f5784a37b6b5a2aa pg_roaringbitmap.tar.gz" | sha256sum --check && \ mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ @@ -578,10 +684,14 @@ RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4 # ######################################################################################### FROM build-deps AS pg-semver-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN wget https://github.com/theory/pg-semver/archive/refs/tags/v0.32.1.tar.gz -O pg_semver.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 is not supported yet by pg_semver. Quit" && exit 0;; \ + esac && \ + wget https://github.com/theory/pg-semver/archive/refs/tags/v0.32.1.tar.gz -O pg_semver.tar.gz && \ echo "fbdaf7512026d62eec03fad8687c15ed509b6ba395bff140acd63d2e4fbe25d7 pg_semver.tar.gz" | sha256sum --check && \ mkdir pg_semver-src && cd pg_semver-src && tar xzf ../pg_semver.tar.gz --strip-components=1 -C . 
&& \ make -j $(getconf _NPROCESSORS_ONLN) && \ @@ -620,10 +730,14 @@ RUN case "${PG_VERSION}" in \ # ######################################################################################### FROM build-deps AS pg-anon-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN wget https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "postgresql_anonymizer does not yet support PG17" && exit 0;; \ + esac && \ + wget https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \ echo "321ea8d5c1648880aafde850a2c576e4a9e7b9933a34ce272efc839328999fa9 pg_anon.tar.gz" | sha256sum --check && \ mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C . && \ find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\ @@ -641,6 +755,7 @@ RUN wget https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tag # ######################################################################################### FROM build-deps AS rust-extensions-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt-get update && \ @@ -651,9 +766,11 @@ ENV HOME=/home/nonroot ENV PATH="/home/nonroot/.cargo/bin:/usr/local/pgsql/bin/:$PATH" USER nonroot WORKDIR /home/nonroot -ARG PG_VERSION -RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 is not supported yet by pgrx. 
Quit" && exit 0;; \ + esac && \ + curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \ chmod +x rustup-init && \ ./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \ rm rustup-init && \ @@ -672,7 +789,10 @@ USER root FROM rust-extensions-build AS pg-jsonschema-pg-build ARG PG_VERSION -RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.3.1.tar.gz -O pg_jsonschema.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "pg_jsonschema does not yet have a release that supports pg17" && exit 0;; \ + esac && \ + wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.3.1.tar.gz -O pg_jsonschema.tar.gz && \ echo "61df3db1ed83cf24f6aa39c826f8818bfa4f0bd33b587fd6b2b1747985642297 pg_jsonschema.tar.gz" | sha256sum --check && \ mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \ # see commit 252b3685a27a0f4c31a0f91e983c6314838e89e8 @@ -694,7 +814,10 @@ RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.3.1.tar. FROM rust-extensions-build AS pg-graphql-pg-build ARG PG_VERSION -RUN wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.5.7.tar.gz -O pg_graphql.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "pg_graphql does not yet have a release that supports pg17 as of now" && exit 0;; \ + esac && \ + wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.5.7.tar.gz -O pg_graphql.tar.gz && \ echo "2b3e567a5b31019cb97ae0e33263c1bcc28580be5a444ac4c8ece5c4be2aea41 pg_graphql.tar.gz" | sha256sum --check && \ mkdir pg_graphql-src && cd pg_graphql-src && tar xzf ../pg_graphql.tar.gz --strip-components=1 -C . 
&& \ sed -i 's/pgrx = "=0.11.3"/pgrx = { version = "0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ @@ -714,7 +837,10 @@ FROM rust-extensions-build AS pg-tiktoken-pg-build ARG PG_VERSION # 26806147b17b60763039c6a6878884c41a262318 made on 26/09/2023 -RUN wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6878884c41a262318.tar.gz -O pg_tiktoken.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "pg_tiktoken does not have versions, nor support for pg17" && exit 0;; \ + esac && \ + wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6878884c41a262318.tar.gz -O pg_tiktoken.tar.gz && \ echo "e64e55aaa38c259512d3e27c572da22c4637418cf124caba904cd50944e5004e pg_tiktoken.tar.gz" | sha256sum --check && \ mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xzf ../pg_tiktoken.tar.gz --strip-components=1 -C . && \ # TODO update pgrx version in the pg_tiktoken repo and remove this line @@ -733,7 +859,10 @@ RUN wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6 FROM rust-extensions-build AS pg-pgx-ulid-build ARG PG_VERSION -RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.5.tar.gz -O pgx_ulid.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "pgx_ulid does not support pg17 as of the latest version (0.1.5)" && exit 0;; \ + esac && \ + wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.5.tar.gz -O pgx_ulid.tar.gz && \ echo "9d1659a2da65af0133d5451c454de31b37364e3502087dadf579f790bc8bef17 pgx_ulid.tar.gz" | sha256sum --check && \ mkdir pgx_ulid-src && cd pgx_ulid-src && tar xzf ../pgx_ulid.tar.gz --strip-components=1 -C . 
&& \ sed -i 's/pgrx = "^0.11.2"/pgrx = { version = "=0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ @@ -748,10 +877,14 @@ RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.5.tar.gz - ######################################################################################### FROM build-deps AS wal2json-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "We'll need to update wal2json to 2.6+ for pg17 support" && exit 0;; \ + esac && \ + wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.gz && \ echo "b516653575541cf221b99cf3f8be9b6821f6dbcfc125675c85f35090f824f00e wal2json_2_5.tar.gz" | sha256sum --check && \ mkdir wal2json-src && cd wal2json-src && tar xzf ../wal2json_2_5.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ @@ -764,10 +897,14 @@ RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar. # ######################################################################################### FROM build-deps AS pg-ivm-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_ivm.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "We'll need to update pg_ivm to 1.9+ for pg17 support" && exit 0;; \ + esac && \ + wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_ivm.tar.gz && \ echo "ebfde04f99203c7be4b0e873f91104090e2e83e5429c32ac242d00f334224d5e pg_ivm.tar.gz" | sha256sum --check && \ mkdir pg_ivm-src && cd pg_ivm-src && tar xzf ../pg_ivm.tar.gz --strip-components=1 -C . 
&& \ make -j $(getconf _NPROCESSORS_ONLN) && \ @@ -781,10 +918,14 @@ RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_iv # ######################################################################################### FROM build-deps AS pg-partman-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.0.1.tar.gz -O pg_partman.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "pg_partman doesn't support PG17 yet" && exit 0;; \ + esac && \ + wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.0.1.tar.gz -O pg_partman.tar.gz && \ echo "75b541733a9659a6c90dbd40fccb904a630a32880a6e3044d0c4c5f4c8a65525 pg_partman.tar.gz" | sha256sum --check && \ mkdir pg_partman-src && cd pg_partman-src && tar xzf ../pg_partman.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ @@ -854,8 +995,8 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \ case "${PG_VERSION}" in \ "v14" | "v15") \ ;; \ - "v16") \ - echo "Skipping HNSW for PostgreSQL 16" && exit 0 \ + "v16" | "v17") \ + echo "Skipping HNSW for PostgreSQL ${PG_VERSION}" && exit 0 \ ;; \ *) \ echo "unexpected PostgreSQL version" && exit 1 \ @@ -899,7 +1040,7 @@ FROM neon-pg-ext-build AS postgres-cleanup-layer COPY --from=neon-pg-ext-build /usr/local/pgsql /usr/local/pgsql # Remove binaries from /bin/ that we won't use (or would manually copy & install otherwise) -RUN cd /usr/local/pgsql/bin && rm ecpg raster2pgsql shp2pgsql pgtopo_export pgtopo_import pgsql2shp +RUN cd /usr/local/pgsql/bin && rm -f ecpg raster2pgsql shp2pgsql pgtopo_export pgtopo_import pgsql2shp # Remove headers that we won't need anymore - we've completed installation of all extensions RUN rm -r /usr/local/pgsql/include @@ -918,7 +1059,10 @@ RUN rm /usr/local/pgsql/lib/lib*.a FROM neon-pg-ext-build AS neon-pg-ext-test ARG PG_VERSION -RUN mkdir /ext-src +RUN case 
"${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + mkdir /ext-src #COPY --from=postgis-build /postgis.tar.gz /ext-src/ #COPY --from=postgis-build /sfcgal/* /usr @@ -942,7 +1086,7 @@ COPY --from=hll-pg-build /hll.tar.gz /ext-src COPY --from=plpgsql-check-pg-build /plpgsql_check.tar.gz /ext-src #COPY --from=timescaledb-pg-build /timescaledb.tar.gz /ext-src COPY --from=pg-hint-plan-pg-build /pg_hint_plan.tar.gz /ext-src -COPY patches/pg_hintplan.patch /ext-src +COPY patches/pg_hint_plan.patch /ext-src COPY --from=pg-cron-pg-build /pg_cron.tar.gz /ext-src COPY patches/pg_cron.patch /ext-src #COPY --from=pg-pgx-ulid-build /home/nonroot/pgx_ulid.tar.gz /ext-src @@ -956,18 +1100,39 @@ COPY --from=pg-anon-pg-build /pg_anon.tar.gz /ext-src COPY patches/pg_anon.patch /ext-src COPY --from=pg-ivm-build /pg_ivm.tar.gz /ext-src COPY --from=pg-partman-build /pg_partman.tar.gz /ext-src -RUN cd /ext-src/ && for f in *.tar.gz; \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + cd /ext-src/ && for f in *.tar.gz; \ do echo $f; dname=$(echo $f | sed 's/\.tar.*//')-src; \ rm -rf $dname; mkdir $dname; tar xzf $f --strip-components=1 -C $dname \ || exit 1; rm -f $f; done -RUN cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch -RUN cd /ext-src/rum-src && patch -p1 <../rum.patch +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + cd /ext-src/rum-src && patch -p1 <../rum.patch +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch # cmake is required for the h3 test -RUN apt-get update && apt-get install -y cmake -RUN patch -p1 < /ext-src/pg_hintplan.patch +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ + esac && \ + apt-get update && apt-get install -y cmake +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + cd /ext-src/pg_hint_plan-src && patch -p1 < /ext-src/pg_hint_plan.patch COPY --chmod=755 docker-compose/run-tests.sh /run-tests.sh -RUN patch -p1 > ~/.zshrc ``` +If you get errors about missing `m4` you may have to install it manually: +``` +brew install m4 +brew link --force m4 +``` + 2. [Install Rust](https://www.rust-lang.org/tools/install) ``` # recommended approach from https://www.rust-lang.org/tools/install @@ -126,7 +132,7 @@ make -j`sysctl -n hw.logicalcpu` -s To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `pg_install/bin` and `pg_install/lib`, respectively. To run the integration tests or Python scripts (not required to use the code), install -Python (3.9 or higher), and install the python3 packages using `./scripts/pysync` (requires [poetry>=1.3](https://python-poetry.org/)) in the project directory. +Python (3.9 or higher), and install the python3 packages using `./scripts/pysync` (requires [poetry>=1.8](https://python-poetry.org/)) in the project directory. 
#### Running neon database diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 8af0ed43ce..00a82e4be6 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -11,7 +11,6 @@ testing = [] [dependencies] anyhow.workspace = true -async-compression.workspace = true chrono.workspace = true cfg-if.workspace = true clap.workspace = true @@ -24,7 +23,6 @@ num_cpus.workspace = true opentelemetry.workspace = true postgres.workspace = true regex.workspace = true -serde.workspace = true serde_json.workspace = true signal-hook.workspace = true tar.workspace = true @@ -43,7 +41,6 @@ url.workspace = true compute_api.workspace = true utils.workspace = true workspace_hack.workspace = true -toml_edit.workspace = true remote_storage = { version = "0.1", path = "../libs/remote_storage/" } vm_monitor = { version = "0.1", path = "../libs/vm_monitor/" } zstd = "0.13" diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 0ba2c1aeb4..9499a7186e 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -44,6 +44,7 @@ use std::{thread, time::Duration}; use anyhow::{Context, Result}; use chrono::Utc; use clap::Arg; +use compute_tools::lsn_lease::launch_lsn_lease_bg_task_for_static; use signal_hook::consts::{SIGQUIT, SIGTERM}; use signal_hook::{consts::SIGINT, iterator::Signals}; use tracing::{error, info, warn}; @@ -366,6 +367,8 @@ fn wait_spec( state.start_time = now; } + launch_lsn_lease_bg_task_for_static(&compute); + Ok(WaitSpecResult { compute, http_port, diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 5bd6897fe3..1f47bb58a3 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -1052,26 +1052,19 @@ impl ComputeNode { let pg_process = self.start_postgres(pspec.storage_auth_token.clone())?; let config_time = Utc::now(); - if pspec.spec.mode == ComputeMode::Primary { - if !pspec.spec.skip_pg_catalog_updates { - let 
pgdata_path = Path::new(&self.pgdata); - // temporarily reset max_cluster_size in config - // to avoid the possibility of hitting the limit, while we are applying config: - // creating new extensions, roles, etc... - config::with_compute_ctl_tmp_override( - pgdata_path, - "neon.max_cluster_size=-1", - || { - self.pg_reload_conf()?; - - self.apply_config(&compute_state)?; - - Ok(()) - }, - )?; + if pspec.spec.mode == ComputeMode::Primary && !pspec.spec.skip_pg_catalog_updates { + let pgdata_path = Path::new(&self.pgdata); + // temporarily reset max_cluster_size in config + // to avoid the possibility of hitting the limit, while we are applying config: + // creating new extensions, roles, etc... + config::with_compute_ctl_tmp_override(pgdata_path, "neon.max_cluster_size=-1", || { self.pg_reload_conf()?; - } - self.post_apply_config()?; + + self.apply_config(&compute_state)?; + + Ok(()) + })?; + self.pg_reload_conf()?; } let startup_end_time = Utc::now(); diff --git a/compute_tools/src/extension_server.rs b/compute_tools/src/extension_server.rs index ef1db73982..6ef7e0837f 100644 --- a/compute_tools/src/extension_server.rs +++ b/compute_tools/src/extension_server.rs @@ -124,6 +124,7 @@ fn parse_pg_version(human_version: &str) -> &str { "14" => return "v14", "15" => return "v15", "16" => return "v16", + "17" => return "v17", _ => {} }, _ => {} diff --git a/compute_tools/src/lib.rs b/compute_tools/src/lib.rs index 543d4462ed..c402d63305 100644 --- a/compute_tools/src/lib.rs +++ b/compute_tools/src/lib.rs @@ -11,6 +11,7 @@ pub mod logger; pub mod catalog; pub mod compute; pub mod extension_server; +pub mod lsn_lease; mod migration; pub mod monitor; pub mod params; diff --git a/compute_tools/src/lsn_lease.rs b/compute_tools/src/lsn_lease.rs new file mode 100644 index 0000000000..7e5917c55f --- /dev/null +++ b/compute_tools/src/lsn_lease.rs @@ -0,0 +1,186 @@ +use anyhow::bail; +use anyhow::Result; +use postgres::{NoTls, SimpleQueryMessage}; +use std::time::SystemTime; +use 
std::{str::FromStr, sync::Arc, thread, time::Duration}; +use utils::id::TenantId; +use utils::id::TimelineId; + +use compute_api::spec::ComputeMode; +use tracing::{info, warn}; +use utils::{ + lsn::Lsn, + shard::{ShardCount, ShardNumber, TenantShardId}, +}; + +use crate::compute::ComputeNode; + +/// Spawns a background thread to periodically renew LSN leases for static compute. +/// Do nothing if the compute is not in static mode. +pub fn launch_lsn_lease_bg_task_for_static(compute: &Arc) { + let (tenant_id, timeline_id, lsn) = { + let state = compute.state.lock().unwrap(); + let spec = state.pspec.as_ref().expect("Spec must be set"); + match spec.spec.mode { + ComputeMode::Static(lsn) => (spec.tenant_id, spec.timeline_id, lsn), + _ => return, + } + }; + let compute = compute.clone(); + + let span = tracing::info_span!("lsn_lease_bg_task", %tenant_id, %timeline_id, %lsn); + thread::spawn(move || { + let _entered = span.entered(); + if let Err(e) = lsn_lease_bg_task(compute, tenant_id, timeline_id, lsn) { + // TODO: might need stronger error feedback than logging an warning. + warn!("Exited with error: {e}"); + } + }); +} + +/// Renews lsn lease periodically so static compute are not affected by GC. +fn lsn_lease_bg_task( + compute: Arc, + tenant_id: TenantId, + timeline_id: TimelineId, + lsn: Lsn, +) -> Result<()> { + loop { + let valid_until = acquire_lsn_lease_with_retry(&compute, tenant_id, timeline_id, lsn)?; + let valid_duration = valid_until + .duration_since(SystemTime::now()) + .unwrap_or(Duration::ZERO); + + // Sleep for 60 seconds less than the valid duration but no more than half of the valid duration. + let sleep_duration = valid_duration + .saturating_sub(Duration::from_secs(60)) + .max(valid_duration / 2); + + info!( + "Succeeded, sleeping for {} seconds", + sleep_duration.as_secs() + ); + thread::sleep(sleep_duration); + } +} + +/// Acquires lsn lease in a retry loop. Returns the expiration time if a lease is granted. 
+/// Returns an error if a lease is explicitly not granted. Otherwise, we keep sending requests. +fn acquire_lsn_lease_with_retry( + compute: &Arc, + tenant_id: TenantId, + timeline_id: TimelineId, + lsn: Lsn, +) -> Result { + let mut attempts = 0usize; + let mut retry_period_ms: f64 = 500.0; + const MAX_RETRY_PERIOD_MS: f64 = 60.0 * 1000.0; + + loop { + // Note: List of pageservers is dynamic, need to re-read configs before each attempt. + let configs = { + let state = compute.state.lock().unwrap(); + + let spec = state.pspec.as_ref().expect("spec must be set"); + + let conn_strings = spec.pageserver_connstr.split(','); + + conn_strings + .map(|connstr| { + let mut config = postgres::Config::from_str(connstr).expect("Invalid connstr"); + if let Some(storage_auth_token) = &spec.storage_auth_token { + info!("Got storage auth token from spec file"); + config.password(storage_auth_token.clone()); + } else { + info!("Storage auth token not set"); + } + config + }) + .collect::>() + }; + + let result = try_acquire_lsn_lease(tenant_id, timeline_id, lsn, &configs); + match result { + Ok(Some(res)) => { + return Ok(res); + } + Ok(None) => { + bail!("Permanent error: lease could not be obtained, LSN is behind the GC cutoff"); + } + Err(e) => { + warn!("Failed to acquire lsn lease: {e} (attempt {attempts}"); + + thread::sleep(Duration::from_millis(retry_period_ms as u64)); + retry_period_ms *= 1.5; + retry_period_ms = retry_period_ms.min(MAX_RETRY_PERIOD_MS); + } + } + attempts += 1; + } +} + +/// Tries to acquire an LSN lease through PS page_service API. 
+fn try_acquire_lsn_lease( + tenant_id: TenantId, + timeline_id: TimelineId, + lsn: Lsn, + configs: &[postgres::Config], +) -> Result> { + fn get_valid_until( + config: &postgres::Config, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + lsn: Lsn, + ) -> Result> { + let mut client = config.connect(NoTls)?; + let cmd = format!("lease lsn {} {} {} ", tenant_shard_id, timeline_id, lsn); + let res = client.simple_query(&cmd)?; + let msg = match res.first() { + Some(msg) => msg, + None => bail!("empty response"), + }; + let row = match msg { + SimpleQueryMessage::Row(row) => row, + _ => bail!("error parsing lsn lease response"), + }; + + // Note: this will be None if a lease is explicitly not granted. + let valid_until_str = row.get("valid_until"); + + let valid_until = valid_until_str.map(|s| { + SystemTime::UNIX_EPOCH + .checked_add(Duration::from_millis(u128::from_str(s).unwrap() as u64)) + .expect("Time larger than max SystemTime could handle") + }); + Ok(valid_until) + } + + let shard_count = configs.len(); + + let valid_until = if shard_count > 1 { + configs + .iter() + .enumerate() + .map(|(shard_number, config)| { + let tenant_shard_id = TenantShardId { + tenant_id, + shard_count: ShardCount::new(shard_count as u8), + shard_number: ShardNumber(shard_number as u8), + }; + get_valid_until(config, tenant_shard_id, timeline_id, lsn) + }) + .collect::>>>()? + .into_iter() + .min() + .unwrap() + } else { + get_valid_until( + &configs[0], + TenantShardId::unsharded(tenant_id), + timeline_id, + lsn, + )? 
+ }; + + Ok(valid_until) +} diff --git a/compute_tools/src/migrations/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql b/compute_tools/src/migrations/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql new file mode 100644 index 0000000000..425ed8cd3d --- /dev/null +++ b/compute_tools/src/migrations/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql @@ -0,0 +1 @@ +GRANT EXECUTE ON FUNCTION pg_show_replication_origin_status TO neon_superuser; diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index 863fa9468f..b2dc265864 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -22,9 +22,10 @@ use compute_api::spec::{Database, GenericOption, GenericOptions, PgIdent, Role}; const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // milliseconds -/// Escape a string for including it in a SQL literal. Wrapping the result -/// with `E'{}'` or `'{}'` is not required, as it returns a ready-to-use -/// SQL string literal, e.g. `'db'''` or `E'db\\'`. +/// Escape a string for including it in a SQL literal. +/// +/// Wrapping the result with `E'{}'` or `'{}'` is not required, +/// as it returns a ready-to-use SQL string literal, e.g. `'db'''` or `E'db\\'`. /// See /// for the original implementation. 
pub fn escape_literal(s: &str) -> String { diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 6a87263821..aa9405d28d 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -793,6 +793,9 @@ pub fn handle_migrations(client: &mut Client) -> Result<()> { include_str!( "./migrations/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql" ), + include_str!( + "./migrations/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql" + ), ]; MigrationRunner::new(client, &migrations).run_migrations()?; diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index 487ac8f047..c185d20484 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -6,17 +6,13 @@ license.workspace = true [dependencies] anyhow.workspace = true -async-trait.workspace = true camino.workspace = true clap.workspace = true comfy-table.workspace = true -futures.workspace = true git-version.workspace = true humantime.workspace = true nix.workspace = true once_cell.workspace = true -postgres.workspace = true -hex.workspace = true humantime-serde.workspace = true hyper.workspace = true regex.workspace = true @@ -24,8 +20,6 @@ reqwest = { workspace = true, features = ["blocking", "json"] } scopeguard.workspace = true serde.workspace = true serde_json.workspace = true -serde_with.workspace = true -tar.workspace = true thiserror.workspace = true toml.workspace = true toml_edit.workspace = true diff --git a/control_plane/src/background_process.rs b/control_plane/src/background_process.rs index 619c5bce3e..94a072e394 100644 --- a/control_plane/src/background_process.rs +++ b/control_plane/src/background_process.rs @@ -151,7 +151,7 @@ where print!("."); io::stdout().flush().unwrap(); } - thread::sleep(RETRY_INTERVAL); + tokio::time::sleep(RETRY_INTERVAL).await; } Err(e) => { println!("error starting process {process_name:?}: {e:#}"); diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 
1d66532d49..92f609761a 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -34,12 +34,14 @@ use safekeeper_api::{ DEFAULT_HTTP_LISTEN_PORT as DEFAULT_SAFEKEEPER_HTTP_PORT, DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT, }; +use std::borrow::Cow; use std::collections::{BTreeSet, HashMap}; use std::path::PathBuf; use std::process::exit; use std::str::FromStr; use std::time::Duration; use storage_broker::DEFAULT_LISTEN_ADDR as DEFAULT_BROKER_ADDR; +use tokio::task::JoinSet; use url::Host; use utils::{ auth::{Claims, Scope}, @@ -87,34 +89,35 @@ fn main() -> Result<()> { // Check for 'neon init' command first. let subcommand_result = if sub_name == "init" { - handle_init(sub_args).map(Some) + handle_init(sub_args).map(|env| Some(Cow::Owned(env))) } else { // all other commands need an existing config - let mut env = - LocalEnv::load_config(&local_env::base_path()).context("Error loading config")?; - let original_env = env.clone(); + let env = LocalEnv::load_config(&local_env::base_path()).context("Error loading config")?; + let original_env = env.clone(); + let env = Box::leak(Box::new(env)); let rt = tokio::runtime::Builder::new_current_thread() .enable_all() .build() .unwrap(); let subcommand_result = match sub_name { - "tenant" => rt.block_on(handle_tenant(sub_args, &mut env)), - "timeline" => rt.block_on(handle_timeline(sub_args, &mut env)), - "start" => rt.block_on(handle_start_all(&env, get_start_timeout(sub_args))), - "stop" => rt.block_on(handle_stop_all(sub_args, &env)), - "pageserver" => rt.block_on(handle_pageserver(sub_args, &env)), - "storage_controller" => rt.block_on(handle_storage_controller(sub_args, &env)), - "safekeeper" => rt.block_on(handle_safekeeper(sub_args, &env)), - "endpoint" => rt.block_on(handle_endpoint(sub_args, &env)), - "mappings" => handle_mappings(sub_args, &mut env), + "tenant" => rt.block_on(handle_tenant(sub_args, env)), + "timeline" => rt.block_on(handle_timeline(sub_args, env)), + 
"start" => rt.block_on(handle_start_all(env, get_start_timeout(sub_args))), + "stop" => rt.block_on(handle_stop_all(sub_args, env)), + "pageserver" => rt.block_on(handle_pageserver(sub_args, env)), + "storage_controller" => rt.block_on(handle_storage_controller(sub_args, env)), + "storage_broker" => rt.block_on(handle_storage_broker(sub_args, env)), + "safekeeper" => rt.block_on(handle_safekeeper(sub_args, env)), + "endpoint" => rt.block_on(handle_endpoint(sub_args, env)), + "mappings" => handle_mappings(sub_args, env), "pg" => bail!("'pg' subcommand has been renamed to 'endpoint'"), _ => bail!("unexpected subcommand {sub_name}"), }; - if original_env != env { - subcommand_result.map(|()| Some(env)) + if &original_env != env { + subcommand_result.map(|()| Some(Cow::Borrowed(env))) } else { subcommand_result.map(|()| None) } @@ -640,6 +643,8 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local } Some(("branch", branch_match)) => { let tenant_id = get_tenant_id(branch_match, env)?; + let new_timeline_id = + parse_timeline_id(branch_match)?.unwrap_or(TimelineId::generate()); let new_branch_name = branch_match .get_one::("branch-name") .ok_or_else(|| anyhow!("No branch name provided"))?; @@ -658,7 +663,6 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local .map(|lsn_str| Lsn::from_str(lsn_str)) .transpose() .context("Failed to parse ancestor start Lsn from the request")?; - let new_timeline_id = TimelineId::generate(); let storage_controller = StorageController::from_env(env); let create_req = TimelineCreateRequest { new_timeline_id, @@ -1244,49 +1248,122 @@ async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Ok(()) } +async fn handle_storage_broker(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { + let (sub_name, sub_args) = match sub_match.subcommand() { + Some(broker_command_data) => broker_command_data, + None => bail!("no broker subcommand provided"), + }; 
+ + match sub_name { + "start" => { + if let Err(e) = broker::start_broker_process(env, get_start_timeout(sub_args)).await { + eprintln!("broker start failed: {e}"); + exit(1); + } + } + + "stop" => { + if let Err(e) = broker::stop_broker_process(env) { + eprintln!("broker stop failed: {e}"); + exit(1); + } + } + + _ => bail!("Unexpected broker subcommand '{}'", sub_name), + } + Ok(()) +} + async fn handle_start_all( - env: &local_env::LocalEnv, + env: &'static local_env::LocalEnv, retry_timeout: &Duration, ) -> anyhow::Result<()> { + let Err(errors) = handle_start_all_impl(env, *retry_timeout).await else { + neon_start_status_check(env, retry_timeout) + .await + .context("status check after successful startup of all services")?; + return Ok(()); + }; + + eprintln!("startup failed because one or more services could not be started"); + + for e in errors { + eprintln!("{e}"); + let debug_repr = format!("{e:?}"); + for line in debug_repr.lines() { + eprintln!(" {line}"); + } + } + + try_stop_all(env, true).await; + + exit(2); +} + +/// Returns Ok() if and only if all services could be started successfully. +/// Otherwise, returns the list of errors that occurred during startup. 
+async fn handle_start_all_impl( + env: &'static local_env::LocalEnv, + retry_timeout: Duration, +) -> Result<(), Vec> { // Endpoints are not started automatically - broker::start_broker_process(env, retry_timeout).await?; + let mut js = JoinSet::new(); - // Only start the storage controller if the pageserver is configured to need it - if env.control_plane_api.is_some() { - let storage_controller = StorageController::from_env(env); - if let Err(e) = storage_controller - .start(NeonStorageControllerStartArgs::with_default_instance_id( - (*retry_timeout).into(), - )) - .await - { - eprintln!("storage_controller start failed: {:#}", e); - try_stop_all(env, true).await; - exit(1); + // force infalliblity through closure + #[allow(clippy::redundant_closure_call)] + (|| { + js.spawn(async move { + let retry_timeout = retry_timeout; + broker::start_broker_process(env, &retry_timeout).await + }); + + // Only start the storage controller if the pageserver is configured to need it + if env.control_plane_api.is_some() { + js.spawn(async move { + let storage_controller = StorageController::from_env(env); + storage_controller + .start(NeonStorageControllerStartArgs::with_default_instance_id( + retry_timeout.into(), + )) + .await + .map_err(|e| e.context("start storage_controller")) + }); + } + + for ps_conf in &env.pageservers { + js.spawn(async move { + let pageserver = PageServerNode::from_env(env, ps_conf); + pageserver + .start(&retry_timeout) + .await + .map_err(|e| e.context(format!("start pageserver {}", ps_conf.id))) + }); + } + + for node in env.safekeepers.iter() { + js.spawn(async move { + let safekeeper = SafekeeperNode::from_env(env, node); + safekeeper + .start(vec![], &retry_timeout) + .await + .map_err(|e| e.context(format!("start safekeeper {}", safekeeper.id))) + }); + } + })(); + + let mut errors = Vec::new(); + while let Some(result) = js.join_next().await { + let result = result.expect("we don't panic or cancel the tasks"); + if let Err(e) = result { + 
errors.push(e); } } - for ps_conf in &env.pageservers { - let pageserver = PageServerNode::from_env(env, ps_conf); - if let Err(e) = pageserver.start(retry_timeout).await { - eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e); - try_stop_all(env, true).await; - exit(1); - } + if !errors.is_empty() { + return Err(errors); } - for node in env.safekeepers.iter() { - let safekeeper = SafekeeperNode::from_env(env, node); - if let Err(e) = safekeeper.start(vec![], retry_timeout).await { - eprintln!("safekeeper {} start failed: {:#}", safekeeper.id, e); - try_stop_all(env, false).await; - exit(1); - } - } - - neon_start_status_check(env, retry_timeout).await?; - Ok(()) } @@ -1570,7 +1647,6 @@ fn cli() -> Command { .value_parser(value_parser!(PathBuf)) .value_name("config") ) - .arg(pg_version_arg.clone()) .arg(force_arg) ) .subcommand( @@ -1583,6 +1659,7 @@ fn cli() -> Command { .subcommand(Command::new("branch") .about("Create a new timeline, using another timeline as a base, copying its data") .arg(tenant_id_arg.clone()) + .arg(timeline_id_arg.clone()) .arg(branch_name_arg.clone()) .arg(Arg::new("ancestor-branch-name").long("ancestor-branch-name") .help("Use last Lsn of another timeline (and its data) as base when creating the new timeline. 
The timeline gets resolved by its branch name.").required(false)) @@ -1671,6 +1748,19 @@ fn cli() -> Command { .arg(stop_mode_arg.clone()) .arg(instance_id)) ) + .subcommand( + Command::new("storage_broker") + .arg_required_else_help(true) + .about("Manage broker") + .subcommand(Command::new("start") + .about("Start broker") + .arg(timeout_arg.clone()) + ) + .subcommand(Command::new("stop") + .about("Stop broker") + .arg(stop_mode_arg.clone()) + ) + ) .subcommand( Command::new("safekeeper") .arg_required_else_help(true) diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 9f879c4b08..7554a03a68 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -702,7 +702,7 @@ impl Endpoint { } } } - std::thread::sleep(ATTEMPT_INTERVAL); + tokio::time::sleep(ATTEMPT_INTERVAL).await; } // disarm the scopeguard, let the child outlive this function (and neon_local invoction) diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 74caba2b56..d616154af6 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -165,6 +165,9 @@ pub struct NeonStorageControllerConf { pub split_threshold: Option, pub max_secondary_lag_bytes: Option, + + #[serde(with = "humantime_serde")] + pub heartbeat_interval: Duration, } impl NeonStorageControllerConf { @@ -172,6 +175,9 @@ impl NeonStorageControllerConf { const DEFAULT_MAX_OFFLINE_INTERVAL: std::time::Duration = std::time::Duration::from_secs(10); const DEFAULT_MAX_WARMING_UP_INTERVAL: std::time::Duration = std::time::Duration::from_secs(30); + + // Very tight heartbeat interval to speed up tests + const DEFAULT_HEARTBEAT_INTERVAL: std::time::Duration = std::time::Duration::from_millis(100); } impl Default for NeonStorageControllerConf { @@ -183,6 +189,7 @@ impl Default for NeonStorageControllerConf { database_url: None, split_threshold: None, max_secondary_lag_bytes: None, + heartbeat_interval: Self::DEFAULT_HEARTBEAT_INTERVAL, } } } @@ 
-335,7 +342,7 @@ impl LocalEnv { #[allow(clippy::manual_range_patterns)] match pg_version { - 14 | 15 | 16 => Ok(path.join(format!("v{pg_version}"))), + 14 | 15 | 16 | 17 => Ok(path.join(format!("v{pg_version}"))), _ => bail!("Unsupported postgres version: {}", pg_version), } } diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 399b1c2653..33ca70af96 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -75,14 +75,14 @@ impl PageServerNode { } } - fn pageserver_make_identity_toml(&self, node_id: NodeId) -> toml_edit::Document { - toml_edit::Document::from_str(&format!("id={node_id}")).unwrap() + fn pageserver_make_identity_toml(&self, node_id: NodeId) -> toml_edit::DocumentMut { + toml_edit::DocumentMut::from_str(&format!("id={node_id}")).unwrap() } fn pageserver_init_make_toml( &self, conf: NeonLocalInitPageserverConf, - ) -> anyhow::Result { + ) -> anyhow::Result { assert_eq!(&PageServerConf::from(&conf), &self.conf, "during neon_local init, we derive the runtime state of ps conf (self.conf) from the --config flag fully"); // TODO(christian): instead of what we do here, create a pageserver_api::config::ConfigToml (PR #7656) @@ -137,9 +137,9 @@ impl PageServerNode { // Turn `overrides` into a toml document. // TODO: above code is legacy code, it should be refactored to use toml_edit directly. 
- let mut config_toml = toml_edit::Document::new(); + let mut config_toml = toml_edit::DocumentMut::new(); for fragment_str in overrides { - let fragment = toml_edit::Document::from_str(&fragment_str) + let fragment = toml_edit::DocumentMut::from_str(&fragment_str) .expect("all fragments in `overrides` are valid toml documents, this function controls that"); for (key, item) in fragment.iter() { config_toml.insert(key, item.clone()); @@ -181,6 +181,23 @@ impl PageServerNode { ); io::stdout().flush()?; + // If the config file we got as a CLI argument includes the `availability_zone` + // config, then use that to populate the `metadata.json` file for the pageserver. + // In production the deployment orchestrator does this for us. + let az_id = conf + .other + .get("availability_zone") + .map(|toml| { + let az_str = toml.to_string(); + // Trim the (") chars from the toml representation + if az_str.starts_with('"') && az_str.ends_with('"') { + az_str[1..az_str.len() - 1].to_string() + } else { + az_str + } + }) + .unwrap_or("local".to_string()); + let config = self .pageserver_init_make_toml(conf) .context("make pageserver toml")?; @@ -216,6 +233,7 @@ impl PageServerNode { let (_http_host, http_port) = parse_host_port(&self.conf.listen_http_addr).expect("Unable to parse listen_http_addr"); let http_port = http_port.unwrap_or(9898); + // Intentionally hand-craft JSON: this acts as an implicit format compat test // in case the pageserver-side structure is edited, and reflects the real life // situation: the metadata is written by some other script. 
@@ -226,7 +244,10 @@ impl PageServerNode { postgres_port: self.pg_connection_config.port(), http_host: "localhost".to_string(), http_port, - other: HashMap::new(), + other: HashMap::from([( + "availability_zone_id".to_string(), + serde_json::json!(az_id), + )]), }) .unwrap(), ) diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index a0a73f5609..573f1688d5 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -5,6 +5,7 @@ //! ```text //! .neon/safekeepers/ //! ``` +use std::future::Future; use std::io::Write; use std::path::PathBuf; use std::time::Duration; @@ -34,12 +35,10 @@ pub enum SafekeeperHttpError { type Result = result::Result; -#[async_trait::async_trait] -pub trait ResponseErrorMessageExt: Sized { - async fn error_from_body(self) -> Result; +pub(crate) trait ResponseErrorMessageExt: Sized { + fn error_from_body(self) -> impl Future> + Send; } -#[async_trait::async_trait] impl ResponseErrorMessageExt for reqwest::Response { async fn error_from_body(self) -> Result { let status = self.status(); diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index 27d8e2de0c..2b714fbfbf 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -28,6 +28,7 @@ use utils::{ auth::{encode_from_key_file, Claims, Scope}, id::{NodeId, TenantId}, }; +use whoami::username; pub struct StorageController { env: LocalEnv, @@ -183,7 +184,7 @@ impl StorageController { /// to other versions if that one isn't found. Some automated tests create circumstances /// where only one version is available in pg_distrib_dir, such as `test_remote_extensions`. 
async fn get_pg_dir(&self, dir_name: &str) -> anyhow::Result { - let prefer_versions = [STORAGE_CONTROLLER_POSTGRES_VERSION, 15, 14]; + let prefer_versions = [STORAGE_CONTROLLER_POSTGRES_VERSION, 16, 15, 14]; for v in prefer_versions { let path = Utf8PathBuf::from_path_buf(self.env.pg_dir(v, dir_name)?).unwrap(); @@ -211,7 +212,16 @@ impl StorageController { /// Readiness check for our postgres process async fn pg_isready(&self, pg_bin_dir: &Utf8Path, postgres_port: u16) -> anyhow::Result { let bin_path = pg_bin_dir.join("pg_isready"); - let args = ["-h", "localhost", "-p", &format!("{}", postgres_port)]; + let args = [ + "-h", + "localhost", + "-U", + &username(), + "-d", + DB_NAME, + "-p", + &format!("{}", postgres_port), + ]; let exitcode = Command::new(bin_path).args(args).spawn()?.wait().await?; Ok(exitcode.success()) @@ -225,7 +235,11 @@ impl StorageController { /// /// Returns the database url pub async fn setup_database(&self, postgres_port: u16) -> anyhow::Result { - let database_url = format!("postgresql://localhost:{}/{DB_NAME}", postgres_port); + let database_url = format!( + "postgresql://{}@localhost:{}/{DB_NAME}", + &username(), + postgres_port + ); let pg_bin_dir = self.get_pg_bin_dir().await?; let createdb_path = pg_bin_dir.join("createdb"); @@ -235,6 +249,10 @@ impl StorageController { "localhost", "-p", &format!("{}", postgres_port), + "-U", + &username(), + "-O", + &username(), DB_NAME, ]) .output() @@ -271,7 +289,7 @@ impl StorageController { // But tokio-postgres fork doesn't have this upstream commit: // https://github.com/sfackler/rust-postgres/commit/cb609be758f3fb5af537f04b584a2ee0cebd5e79 // => we should rebase our fork => TODO https://github.com/neondatabase/neon/issues/8399 - .user(&whoami::username()) + .user(&username()) .dbname(DB_NAME) .connect(tokio_postgres::NoTls) .await @@ -328,6 +346,12 @@ impl StorageController { let pg_log_path = pg_data_path.join("postgres.log"); if !tokio::fs::try_exists(&pg_data_path).await? 
{ + let initdb_args = ["-D", pg_data_path.as_ref(), "--username", &username()]; + tracing::info!( + "Initializing storage controller database with args: {:?}", + initdb_args + ); + // Initialize empty database let initdb_path = pg_bin_dir.join("initdb"); let mut child = Command::new(&initdb_path) @@ -335,7 +359,7 @@ impl StorageController { ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), ]) - .args(["-D", pg_data_path.as_ref()]) + .args(initdb_args) .spawn() .expect("Failed to spawn initdb"); let status = child.wait().await?; @@ -364,8 +388,14 @@ impl StorageController { pg_data_path.as_ref(), "-l", pg_log_path.as_ref(), + "-U", + &username(), "start", ]; + tracing::info!( + "Starting storage controller database with args: {:?}", + db_start_args + ); background_process::start_process( "storage_controller_db", @@ -437,6 +467,8 @@ impl StorageController { &humantime::Duration::from(self.config.max_offline).to_string(), "--max-warming-up-interval", &humantime::Duration::from(self.config.max_warming_up).to_string(), + "--heartbeat-interval", + &humantime::Duration::from(self.config.heartbeat_interval).to_string(), "--address-for-peers", &address_for_peers.to_string(), ] diff --git a/control_plane/storcon_cli/Cargo.toml b/control_plane/storcon_cli/Cargo.toml index be69208d0d..ce89116691 100644 --- a/control_plane/storcon_cli/Cargo.toml +++ b/control_plane/storcon_cli/Cargo.toml @@ -11,14 +11,11 @@ clap.workspace = true comfy-table.workspace = true futures.workspace = true humantime.workspace = true -hyper.workspace = true pageserver_api.workspace = true pageserver_client.workspace = true reqwest.workspace = true -serde.workspace = true serde_json = { workspace = true, features = ["raw_value"] } storage_controller_client.workspace = true -thiserror.workspace = true tokio.workspace = true tracing.workspace = true utils.workspace = true diff --git a/control_plane/storcon_cli/src/main.rs 
b/control_plane/storcon_cli/src/main.rs index e27491c1c8..651fcda8db 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -4,8 +4,8 @@ use std::{str::FromStr, time::Duration}; use clap::{Parser, Subcommand}; use pageserver_api::{ controller_api::{ - NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy, TenantCreateRequest, - TenantDescribeResponse, TenantPolicyRequest, + NodeAvailabilityWrapper, NodeDescribeResponse, NodeShardResponse, ShardSchedulingPolicy, + TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest, }, models::{ EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary, @@ -41,6 +41,8 @@ enum Command { listen_http_addr: String, #[arg(long)] listen_http_port: u16, + #[arg(long)] + availability_zone_id: String, }, /// Modify a node's configuration in the storage controller @@ -78,7 +80,10 @@ enum Command { /// List nodes known to the storage controller Nodes {}, /// List tenants known to the storage controller - Tenants {}, + Tenants { + /// If this field is set, it will list the tenants on a specific node + node_id: Option, + }, /// Create a new tenant in the storage controller, and by extension on pageservers. TenantCreate { #[arg(long)] @@ -147,9 +152,9 @@ enum Command { #[arg(long)] threshold: humantime::Duration, }, - // Drain a set of specified pageservers by moving the primary attachments to pageservers + // Migrate away from a set of specified pageservers by moving the primary attachments to pageservers // outside of the specified set. - Drain { + BulkMigrate { // Set of pageserver node ids to drain. #[arg(long)] nodes: Vec, @@ -163,6 +168,34 @@ enum Command { #[arg(long)] dry_run: Option, }, + /// Start draining the specified pageserver. + /// The drain is complete when the schedulling policy returns to active. 
+ StartDrain { + #[arg(long)] + node_id: NodeId, + }, + /// Cancel draining the specified pageserver and wait for `timeout` + /// for the operation to be canceled. May be retried. + CancelDrain { + #[arg(long)] + node_id: NodeId, + #[arg(long)] + timeout: humantime::Duration, + }, + /// Start filling the specified pageserver. + /// The drain is complete when the schedulling policy returns to active. + StartFill { + #[arg(long)] + node_id: NodeId, + }, + /// Cancel filling the specified pageserver and wait for `timeout` + /// for the operation to be canceled. May be retried. + CancelFill { + #[arg(long)] + node_id: NodeId, + #[arg(long)] + timeout: humantime::Duration, + }, } #[derive(Parser)] @@ -249,6 +282,34 @@ impl FromStr for NodeAvailabilityArg { } } +async fn wait_for_scheduling_policy( + client: Client, + node_id: NodeId, + timeout: Duration, + f: F, +) -> anyhow::Result +where + F: Fn(NodeSchedulingPolicy) -> bool, +{ + let waiter = tokio::time::timeout(timeout, async move { + loop { + let node = client + .dispatch::<(), NodeDescribeResponse>( + Method::GET, + format!("control/v1/node/{node_id}"), + None, + ) + .await?; + + if f(node.scheduling) { + return Ok::(node.scheduling); + } + } + }); + + Ok(waiter.await??) 
+} + #[tokio::main] async fn main() -> anyhow::Result<()> { let cli = Cli::parse(); @@ -266,6 +327,7 @@ async fn main() -> anyhow::Result<()> { listen_pg_port, listen_http_addr, listen_http_port, + availability_zone_id, } => { storcon_client .dispatch::<_, ()>( @@ -277,6 +339,7 @@ async fn main() -> anyhow::Result<()> { listen_pg_port, listen_http_addr, listen_http_port, + availability_zone_id, }), ) .await?; @@ -343,7 +406,41 @@ async fn main() -> anyhow::Result<()> { ) .await?; } - Command::Tenants {} => { + Command::Tenants { + node_id: Some(node_id), + } => { + let describe_response = storcon_client + .dispatch::<(), NodeShardResponse>( + Method::GET, + format!("control/v1/node/{node_id}/shards"), + None, + ) + .await?; + let shards = describe_response.shards; + let mut table = comfy_table::Table::new(); + table.set_header([ + "Shard", + "Intended Primary/Secondary", + "Observed Primary/Secondary", + ]); + for shard in shards { + table.add_row([ + format!("{}", shard.tenant_shard_id), + match shard.is_intended_secondary { + None => "".to_string(), + Some(true) => "Secondary".to_string(), + Some(false) => "Primary".to_string(), + }, + match shard.is_observed_secondary { + None => "".to_string(), + Some(true) => "Secondary".to_string(), + Some(false) => "Primary".to_string(), + }, + ]); + } + println!("{table}"); + } + Command::Tenants { node_id: None } => { let mut resp = storcon_client .dispatch::<(), Vec>( Method::GET, @@ -628,7 +725,7 @@ async fn main() -> anyhow::Result<()> { }) .await?; } - Command::Drain { + Command::BulkMigrate { nodes, concurrency, max_shards, @@ -657,7 +754,7 @@ async fn main() -> anyhow::Result<()> { } if nodes.len() != node_to_drain_descs.len() { - anyhow::bail!("Drain requested for node which doesn't exist.") + anyhow::bail!("Bulk migration requested away from node which doesn't exist.") } node_to_fill_descs.retain(|desc| { @@ -669,7 +766,7 @@ async fn main() -> anyhow::Result<()> { }); if node_to_fill_descs.is_empty() { - 
anyhow::bail!("There are no nodes to drain to") + anyhow::bail!("There are no nodes to migrate to") } // Set the node scheduling policy to draining for the nodes which @@ -690,7 +787,7 @@ async fn main() -> anyhow::Result<()> { .await?; } - // Perform the drain: move each tenant shard scheduled on a node to + // Perform the migration: move each tenant shard scheduled on a node to // be drained to a node which is being filled. A simple round robin // strategy is used to pick the new node. let tenants = storcon_client @@ -703,13 +800,13 @@ async fn main() -> anyhow::Result<()> { let mut selected_node_idx = 0; - struct DrainMove { + struct MigrationMove { tenant_shard_id: TenantShardId, from: NodeId, to: NodeId, } - let mut moves: Vec = Vec::new(); + let mut moves: Vec = Vec::new(); let shards = tenants .into_iter() @@ -739,7 +836,7 @@ async fn main() -> anyhow::Result<()> { continue; } - moves.push(DrainMove { + moves.push(MigrationMove { tenant_shard_id: shard.tenant_shard_id, from: shard .node_attached @@ -816,6 +913,67 @@ async fn main() -> anyhow::Result<()> { failure ); } + Command::StartDrain { node_id } => { + storcon_client + .dispatch::<(), ()>( + Method::PUT, + format!("control/v1/node/{node_id}/drain"), + None, + ) + .await?; + println!("Drain started for {node_id}"); + } + Command::CancelDrain { node_id, timeout } => { + storcon_client + .dispatch::<(), ()>( + Method::DELETE, + format!("control/v1/node/{node_id}/drain"), + None, + ) + .await?; + + println!("Waiting for node {node_id} to quiesce on scheduling policy ..."); + + let final_policy = + wait_for_scheduling_policy(storcon_client, node_id, *timeout, |sched| { + use NodeSchedulingPolicy::*; + matches!(sched, Active | PauseForRestart) + }) + .await?; + + println!( + "Drain was cancelled for node {node_id}. 
Schedulling policy is now {final_policy:?}" + ); + } + Command::StartFill { node_id } => { + storcon_client + .dispatch::<(), ()>(Method::PUT, format!("control/v1/node/{node_id}/fill"), None) + .await?; + + println!("Fill started for {node_id}"); + } + Command::CancelFill { node_id, timeout } => { + storcon_client + .dispatch::<(), ()>( + Method::DELETE, + format!("control/v1/node/{node_id}/fill"), + None, + ) + .await?; + + println!("Waiting for node {node_id} to quiesce on scheduling policy ..."); + + let final_policy = + wait_for_scheduling_policy(storcon_client, node_id, *timeout, |sched| { + use NodeSchedulingPolicy::*; + matches!(sched, Active) + }) + .await?; + + println!( + "Fill was cancelled for node {node_id}. Schedulling policy is now {final_policy:?}" + ); + } } Ok(()) diff --git a/docker-compose/run-tests.sh b/docker-compose/run-tests.sh index 58b2581197..3fc0b90071 100644 --- a/docker-compose/run-tests.sh +++ b/docker-compose/run-tests.sh @@ -3,7 +3,7 @@ set -x cd /ext-src || exit 2 FAILED= -LIST=$( (echo "${SKIP//","/"\n"}"; ls -d -- *-src) | sort | uniq -u) +LIST=$( (echo -e "${SKIP//","/"\n"}"; ls -d -- *-src) | sort | uniq -u) for d in ${LIST} do [ -d "${d}" ] || continue diff --git a/docs/rfcs/037-storage-controller-restarts.md b/docs/rfcs/037-storage-controller-restarts.md new file mode 100644 index 0000000000..bad422344f --- /dev/null +++ b/docs/rfcs/037-storage-controller-restarts.md @@ -0,0 +1,259 @@ +# Rolling Storage Controller Restarts + +## Summary + +This RFC describes the issues around the current storage controller restart procedure +and describes an implementation which reduces downtime to a few milliseconds on the happy path. + +## Motivation + +Storage controller upgrades (restarts, more generally) can cause multi-second availability gaps. +While the storage controller does not sit on the main data path, it's generally not acceptable +to block management requests for extended periods of time (e.g. 
https://github.com/neondatabase/neon/issues/8034). + +### Current Implementation + +The storage controller runs in a Kubernetes Deployment configured for one replica and strategy set to [Recreate](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#recreate-deployment). +In non Kubernetes terms, during an upgrade, the currently running storage controller is stopped and, only after, +a new instance is created. + +At start-up, the storage controller calls into all the pageservers it manages (retrieved from DB) to learn the +latest locations of all tenant shards present on them. This is usually fast, but can push into tens of seconds +under unfavourable circumstances: pageservers are heavily loaded or unavailable. + +## Prior Art + +There's probably as many ways of handling restarts gracefully as there are distributed systems. Some examples include: +* Active/Standby architectures: Two or more instance of the same service run, but traffic is only routed to one of them. +For fail-over, traffic is routed to one of the standbys (which becomes active). +* Consensus Algorithms (Raft, Paxos and friends): The part of consensus we care about here is leader election: peers communicate to each other +and use a voting scheme that ensures the existence of a single leader (e.g. Raft epochs). + +## Requirements + +* Reduce storage controller unavailability during upgrades to milliseconds +* Minimize the interval in which it's possible for more than one storage controller +to issue reconciles. +* Have one uniform implementation for restarts and upgrades +* Fit in with the current Kubernetes deployment scheme + +## Non Goals + +* Implement our own consensus algorithm from scratch +* Completely eliminate downtime storage controller downtime. Instead we aim to reduce it to the point where it looks +like a transient error to the control plane + +## Impacted Components + +* storage controller +* deployment orchestration (i.e. 
Ansible) +* helm charts + +## Terminology + +* Observed State: in-memory mapping between tenant shards and their current pageserver locations - currently built up +at start-up by querying pageservers +* Deployment: Kubernetes [primitive](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/) that models +a set of replicas + +## Implementation + +### High Level Flow + +At a very high level the proposed idea is to start a new storage controller instance while +the previous one is still running and cut-over to it when it becomes ready. The new instance +should coordinate with the existing one and transition responsibility gracefully. While the controller +has built in safety against split-brain situations (via generation numbers), we'd like to avoid such +scenarios since they can lead to availability issues for tenants that underwent changes while two controllers +were operating at the same time and require operator intervention to remedy. + +### Kubernetes Deployment Configuration + +On the Kubernetes configuration side, the proposal is to update the storage controller `Deployment` +to use `spec.strategy.type = RollingUpdate`, `spec.strategy.rollingUpdate.maxSurge=1` and `spec.strategy.rollingUpdate.maxUnavailable=0`. +Under the hood, Kubernetes creates a new replica set and adds one pod to it (`maxSurge=1`). The old replica set does not +scale down until the new replica set has one replica in the ready state (`maxUnavailable=0`). + +The various possible failure scenarios are investigated in the [Handling Failures](#handling-failures) section. + +### Storage Controller Start-Up + +This section describes the primitives required on the storage controller side and the flow of the happy path. + +#### Database Table For Leader Synchronization + +A new table should be added to the storage controller database for leader synchronization during startup. +This table will always contain at most one row.
The proposed name for the table is `leader` and the schema +contains two elements: +* `hostname`: represents the hostname for the current storage controller leader - should be addressable +from other pods in the deployment +* `start_timestamp`: holds the start timestamp for the current storage controller leader (UTC timezone) - only required +for failure case handling: see [Previous Leader Crashes Before New Leader Readiness](#previous-leader-crashes-before-new-leader-readiness) + +Storage controllers will read the leader row at start-up and then update it to mark themselves as the leader +at the end of the start-up sequence. We want compare-and-exchange semantics for the update: avoid the +situation where two concurrent updates succeed and overwrite each other. The default Postgres isolation +level is `READ COMMITTED`, which isn't strict enough here. This update transaction should use at least `REPEATABLE +READ` isolation level in order to [prevent lost updates](https://www.interdb.jp/pg/pgsql05/08.html). Currently, +the storage controller uses the stricter `SERIALIZABLE` isolation level for all transactions. This more than suits +our needs here. + +``` +START TRANSACTION ISOLATION LEVEL REPEATABLE READ; +UPDATE leader SET hostname=:new_hostname, start_timestamp=:new_start_timestamp +WHERE hostname=:old_hostname AND start_timestamp=:old_start_timestamp; +``` + +If the transaction fails or if no rows have been updated, then the compare-and-exchange is regarded as a failure. + +#### Step Down API + +A new HTTP endpoint should be added to the storage controller: `POST /control/v1/step_down`. Upon receiving this +request the leader cancels any pending reconciles and goes into a mode where it replies with 503 to all other APIs +and does not issue any location configurations to its pageservers. The successful HTTP response will return a serialized +snapshot of the observed state.
+ +If other step down requests come in after the initial one, the request is handled and the observed state is returned (required +for failure scenario handling - see [Handling Failures](#handling-failures)). + +#### Graceful Restart Happy Path + +At start-up, the first thing the storage controller does is retrieve the sole row from the new +`leader` table. If such an entry exists, send a `/step_down` POST API call to the current leader. +This should be retried a few times with a short backoff (see [1]). The aspiring leader loads the +observed state into memory and the start-up sequence proceeds as usual, but *without* querying the +pageservers in order to build up the observed state. + +Before doing any reconciliations or persistence changes, update the `leader` database table as described in the [Database Table For Leader Synchronization](#database-table-for-leader-synchronization) +section. If this step fails, the storage controller process exits. + +Note that no row will exist in the `leader` table for the first graceful restart. In that case, force update the `leader` table +(without the WHERE clause) and proceed with the pre-existing start-up procedure (i.e. build observed state by querying pageservers). + +Summary of proposed new start-up sequence: +1. Call `/step_down` +2. Perform any pending database migrations +3. Load state from database +4. Load observed state returned in step (1) into memory +5. Do initial heartbeat round (may be moved after 5) +7. Mark self as leader by updating the database +8. Reschedule and reconcile everything + +Some things to note from the steps above: +* The storage controller makes no changes to the cluster state before step (5) (i.e. no location config +calls to the pageserver and no compute notifications) +* Ask the current leader to step down before loading state from database so we don't get a lost update +if the transactions overlap. +* Before loading the observed state at step (3), cross-validate against the database.
If validation fails, +fall back to asking the pageservers about their current locations. +* Database migrations should only run **after** the previous instance steps down (or the step down times out). + + +[1] The API call might fail because there's no storage controller running (i.e. [restart](#storage-controller-crash-or-restart)), +so we don't want to extend the unavailability period by much. We still want to retry since that's not the common case. + +### Handling Failures + +#### Storage Controller Crash Or Restart + +The storage controller may crash or be restarted outside of roll-outs. When a new pod is created, its call to +`/step_down` will fail since the previous leader is no longer reachable. In this case perform the pre-existing +start-up procedure and update the leader table (with the WHERE clause). If the update fails, the storage controller +exits and consistency is maintained. + +#### Previous Leader Crashes Before New Leader Readiness + +When the previous leader (P1) crashes before the new leader (P2) passes the readiness check, Kubernetes will +reconcile the old replica set and create a new pod for it (P1'). The `/step_down` API call will fail for P1' +(see [2]). + +Now we have two cases to consider: +* P2 updates the `leader` table first: The database update from P1' will fail and P1' will exit, or be terminated +by Kubernetes depending on timings. +* P1' updates the `leader` table first: The `hostname` field of the `leader` row stays the same, but the `start_timestamp` field changes. +The database update from P2 will fail (since `start_timestamp` does not match). P2 will exit and Kubernetes will +create a new replacement pod for it (P2'). Now the entire dance starts again, but with P1' as the leader and P2' as the aspiring leader. + +[2] P1 and P1' may (more likely than not) be the same pod and have the same hostname.
The implementation +should avoid this self reference and fail the API call at the client if the persisted hostname matches +the current one. + +#### Previous Leader Crashes After New Leader Readiness + +The deployment's replica sets already satisfy the deployment's replica count requirements and the +Kubernetes deployment rollout will just clean up the dead pod. + +#### New Leader Crashes Before Passing Readiness Check + +The deployment controller scales up the new replica sets by creating a new pod. The entire procedure is repeated +with the new pod. + +#### Network Partition Between New Pod and Previous Leader + +This feels very unlikely, but should be considered in any case. P2 (the new aspiring leader) fails the `/step_down` +API call into P1 (the current leader). P2 proceeds with the pre-existing startup procedure and updates the `leader` table. +Kubernetes will terminate P1, but there may be a brief period where both storage controllers can drive reconciles. + +### Dealing With Split Brain Scenarios + +As we've seen in the previous section, we can end up with two storage controllers running at the same time. The split brain +duration is not bounded since the Kubernetes controller might become partitioned from the pods (unlikely though). While these +scenarios are not fatal, they can cause tenant unavailability, so we'd like to reduce the chances of this happening. +The rest of this section sketches some safety measures. It's likely overkill to implement all of them however. + +### Ensure Leadership Before Producing Side Effects + +The storage controller has two types of side effects: location config requests into pageservers and compute notifications into the control plane. +Before issuing either, the storage controller could check that it is indeed still the leader by querying the database. Side effects might still be +applied if they race with the database update, but the situation will eventually be detected.
The storage controller process should terminate in these cases. + +### Leadership Lease + +Up until now, the leadership defined by this RFC is static. In order to bound the length of the split brain scenario, we could require the leadership +to be renewed periodically. Two new columns would be added to the `leader` table: +1. `last_renewed` - timestamp indicating when the lease was last renewed +2. `lease_duration` - duration indicating the amount of time after which the lease expires + +The leader periodically attempts to renew the lease by checking that it is in fact still the legitimate leader and updating `last_renewed` in the +same transaction. If the update fails, the process exits. New storage controller instances wishing to become leaders must wait for the current lease +to expire before acquiring leadership if they have not successfully received a response to the `/step_down` request. + +### Notify Pageserver Of Storage Controller Term + +Each time that leadership changes, we can bump a `term` integer column in the `leader` table. This term uniquely identifies a leader. +Location config requests and re-attach responses can include this term. On the pageserver side, keep the latest term in memory and refuse +anything which contains a stale term (i.e. smaller than the current one). + +### Observability + +* The storage controller should expose a metric which describes its state (`Active | WarmingUp | SteppedDown`). +Per region alerts should be added on this metric which trigger when: + + no storage controller has been in the `Active` state for an extended period of time + + more than one storage controller is in the `Active` state + +* An alert that periodically verifies that the `leader` table is in sync with the metric above would be very useful. +We'd have to expose the storage controller read only database to Grafana (perhaps it is already done).
+ +## Alternatives + +### Kubernetes Leases + +Kubernetes has a [lease primitive](https://kubernetes.io/docs/concepts/architecture/leases/) which can be used to implement leader election. +Only one instance may hold a lease at any given time. This lease needs to be periodically renewed and has an expiration period. + +In our case, it would work something like this: +* `/step_down` deletes the lease or stops it from renewing +* lease acquisition becomes part of the start-up procedure + +The kubert crate implements a [lightweight lease API](https://docs.rs/kubert/latest/kubert/lease/struct.LeaseManager.html), but it's still +not exactly trivial to implement. + +This approach has the benefit of baked in observability (`kubectl describe lease`), but: +* We offload the responsibility to Kubernetes which makes it harder to debug when things go wrong. +* More code surface than the simple "row in database" approach. Also, most of this code would be in +a dependency not subject to code review, etc. +* Hard to test. Our testing infra does not run the storage controller in Kubernetes and changing it to do +so is not simple and complicates the test set-up. + +To my mind, the "row in database" approach is straightforward enough that we don't have to offload this +to something external. diff --git a/docs/updating-postgres.md b/docs/updating-postgres.md index 1868bbf5f7..7913b0a9e2 100644 --- a/docs/updating-postgres.md +++ b/docs/updating-postgres.md @@ -21,30 +21,21 @@ _Example: 15.4 is the new minor version to upgrade to from 15.3._ 1. Create a new branch based on the stable branch you are updating. ```shell - git checkout -b my-branch REL_15_STABLE_neon + git checkout -b my-branch-15 REL_15_STABLE_neon ``` -1. Tag the last commit on the stable branch you are updating. +1. Find the upstream release tags you're looking for. They are of the form `REL_X_Y`. - ```shell - git tag REL_15_3_neon - ``` - -1. Push the new tag to the Neon Postgres repository.
- - ```shell - git push origin REL_15_3_neon - ``` - -1. Find the release tags you're looking for. They are of the form `REL_X_Y`. - -1. Rebase the branch you created on the tag and resolve any conflicts. +1. Merge the upstream tag into the branch you created on the tag and resolve any conflicts. ```shell git fetch upstream REL_15_4 - git rebase REL_15_4 + git merge REL_15_4 ``` + In the commit message of the merge commit, mention if there were + any non-trivial conflicts or other issues. + 1. Run the Postgres test suite to make sure our commits have not affected Postgres in a negative way. @@ -57,7 +48,7 @@ Postgres in a negative way. 1. Push your branch to the Neon Postgres repository. ```shell - git push origin my-branch + git push origin my-branch-15 ``` 1. Clone the Neon repository if you have not done so already. @@ -74,7 +65,7 @@ branch. 1. Update the Git submodule. ```shell - git submodule set-branch --branch my-branch vendor/postgres-v15 + git submodule set-branch --branch my-branch-15 vendor/postgres-v15 git submodule update --remote vendor/postgres-v15 ``` @@ -89,14 +80,12 @@ minor Postgres release. 1. Create a pull request, and wait for CI to go green. -1. Force push the rebased Postgres branches into the Neon Postgres repository. +1. Push the Postgres branches with the merge commits into the Neon Postgres repository. ```shell - git push --force origin my-branch:REL_15_STABLE_neon + git push origin my-branch-15:REL_15_STABLE_neon ``` - It may require disabling various branch protections. - 1. Update your Neon PR to point at the branches. 
```shell diff --git a/libs/compute_api/Cargo.toml b/libs/compute_api/Cargo.toml index 8aaa481f8c..c0ec40a6c2 100644 --- a/libs/compute_api/Cargo.toml +++ b/libs/compute_api/Cargo.toml @@ -8,7 +8,6 @@ license.workspace = true anyhow.workspace = true chrono.workspace = true serde.workspace = true -serde_with.workspace = true serde_json.workspace = true regex.workspace = true diff --git a/libs/consumption_metrics/Cargo.toml b/libs/consumption_metrics/Cargo.toml index a40b74b952..0e517e3856 100644 --- a/libs/consumption_metrics/Cargo.toml +++ b/libs/consumption_metrics/Cargo.toml @@ -5,9 +5,6 @@ edition = "2021" license = "Apache-2.0" [dependencies] -anyhow.workspace = true chrono = { workspace = true, features = ["serde"] } rand.workspace = true serde.workspace = true -serde_with.workspace = true -utils.workspace = true diff --git a/libs/desim/Cargo.toml b/libs/desim/Cargo.toml index 0c4be90267..473f3a2a13 100644 --- a/libs/desim/Cargo.toml +++ b/libs/desim/Cargo.toml @@ -12,5 +12,4 @@ bytes.workspace = true utils.workspace = true parking_lot.workspace = true hex.workspace = true -scopeguard.workspace = true smallvec = { workspace = true, features = ["write"] } diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs index df000cd0fb..cd4526c089 100644 --- a/libs/metrics/src/lib.rs +++ b/libs/metrics/src/lib.rs @@ -68,6 +68,7 @@ macro_rules! register_uint_gauge { static INTERNAL_REGISTRY: Lazy = Lazy::new(Registry::new); /// Register a collector in the internal registry. MUST be called before the first call to `gather()`. +/// /// Otherwise, we can have a deadlock in the `gather()` call, trying to register a new collector /// while holding the lock. 
pub fn register_internal(c: Box) -> prometheus::Result<()> { diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml index cb28359ac3..8710904cec 100644 --- a/libs/pageserver_api/Cargo.toml +++ b/libs/pageserver_api/Cargo.toml @@ -4,6 +4,10 @@ version = "0.1.0" edition.workspace = true license.workspace = true +[features] +# See pageserver/Cargo.toml +testing = ["dep:nix"] + [dependencies] serde.workspace = true serde_with.workspace = true @@ -23,6 +27,12 @@ thiserror.workspace = true humantime-serde.workspace = true chrono = { workspace = true, features = ["serde"] } itertools.workspace = true +storage_broker.workspace = true +camino = {workspace = true, features = ["serde1"]} +remote_storage.workspace = true +postgres_backend.workspace = true +nix = {workspace = true, optional = true} +reqwest.workspace = true [dev-dependencies] bincode.workspace = true diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index d996a62349..1194ee93ef 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -1,15 +1,28 @@ -use std::collections::HashMap; - -use const_format::formatcp; +use camino::Utf8PathBuf; #[cfg(test)] mod tests; +use const_format::formatcp; pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000; pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}"); pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898; pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}"); +use postgres_backend::AuthType; +use remote_storage::RemoteStorageConfig; +use serde_with::serde_as; +use std::{ + collections::HashMap, + num::{NonZeroU64, NonZeroUsize}, + str::FromStr, + time::Duration, +}; +use utils::logging::LogFormat; + +use crate::models::ImageCompressionAlgorithm; +use crate::models::LsnLease; + // Certain metadata (e.g. externally-addressable name, AZ) is delivered // as a separate structure. 
This information is not neeed by the pageserver // itself, it is only used for registering the pageserver with the control @@ -29,3 +42,476 @@ pub struct NodeMetadata { #[serde(flatten)] pub other: HashMap, } + +/// `pageserver.toml` +/// +/// We use serde derive with `#[serde(default)]` to generate a deserializer +/// that fills in the default values for each config field. +/// +/// If there cannot be a static default value because we need to make runtime +/// checks to determine the default, make it an `Option` (which defaults to None). +/// The runtime check should be done in the consuming crate, i.e., `pageserver`. +#[serde_as] +#[derive(Clone, Debug, serde::Deserialize, serde::Serialize)] +#[serde(default, deny_unknown_fields)] +pub struct ConfigToml { + // types mapped 1:1 into the runtime PageServerConfig type + pub listen_pg_addr: String, + pub listen_http_addr: String, + pub availability_zone: Option, + #[serde(with = "humantime_serde")] + pub wait_lsn_timeout: Duration, + #[serde(with = "humantime_serde")] + pub wal_redo_timeout: Duration, + pub superuser: String, + pub page_cache_size: usize, + pub max_file_descriptors: usize, + pub pg_distrib_dir: Option, + #[serde_as(as = "serde_with::DisplayFromStr")] + pub http_auth_type: AuthType, + #[serde_as(as = "serde_with::DisplayFromStr")] + pub pg_auth_type: AuthType, + pub auth_validation_public_key_path: Option, + pub remote_storage: Option, + pub tenant_config: TenantConfigToml, + #[serde_as(as = "serde_with::DisplayFromStr")] + pub broker_endpoint: storage_broker::Uri, + #[serde(with = "humantime_serde")] + pub broker_keepalive_interval: Duration, + #[serde_as(as = "serde_with::DisplayFromStr")] + pub log_format: LogFormat, + pub concurrent_tenant_warmup: NonZeroUsize, + pub concurrent_tenant_size_logical_size_queries: NonZeroUsize, + #[serde(with = "humantime_serde")] + pub metric_collection_interval: Duration, + pub metric_collection_endpoint: Option, + pub metric_collection_bucket: Option, + 
#[serde(with = "humantime_serde")] + pub synthetic_size_calculation_interval: Duration, + pub disk_usage_based_eviction: Option, + pub test_remote_failures: u64, + pub ondemand_download_behavior_treat_error_as_warn: bool, + #[serde(with = "humantime_serde")] + pub background_task_maximum_delay: Duration, + pub control_plane_api: Option, + pub control_plane_api_token: Option, + pub control_plane_emergency_mode: bool, + pub heatmap_upload_concurrency: usize, + pub secondary_download_concurrency: usize, + pub virtual_file_io_engine: Option, + pub ingest_batch_size: u64, + pub max_vectored_read_bytes: MaxVectoredReadBytes, + pub image_compression: ImageCompressionAlgorithm, + pub ephemeral_bytes_per_memory_kb: usize, + pub l0_flush: Option, + #[serde(skip_serializing)] + // TODO(https://github.com/neondatabase/neon/issues/8184): remove after this field is removed from all pageserver.toml's + pub compact_level0_phase1_value_access: serde::de::IgnoredAny, + pub virtual_file_direct_io: crate::models::virtual_file::DirectIoMode, + pub io_buffer_alignment: usize, +} + +#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +#[serde(deny_unknown_fields)] +pub struct DiskUsageEvictionTaskConfig { + pub max_usage_pct: utils::serde_percent::Percent, + pub min_avail_bytes: u64, + #[serde(with = "humantime_serde")] + pub period: Duration, + #[cfg(feature = "testing")] + pub mock_statvfs: Option, + /// Select sorting for evicted layers + #[serde(default)] + pub eviction_order: EvictionOrder, +} + +pub mod statvfs { + pub mod mock { + #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] + #[serde(tag = "type")] + pub enum Behavior { + Success { + blocksize: u64, + total_blocks: u64, + name_filter: Option, + }, + #[cfg(feature = "testing")] + Failure { mocked_error: MockedError }, + } + + #[cfg(feature = "testing")] + #[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] + 
#[allow(clippy::upper_case_acronyms)] + pub enum MockedError { + EIO, + } + + #[cfg(feature = "testing")] + impl From for nix::Error { + fn from(e: MockedError) -> Self { + match e { + MockedError::EIO => nix::Error::EIO, + } + } + } + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +#[serde(tag = "type", content = "args")] +pub enum EvictionOrder { + RelativeAccessed { + highest_layer_count_loses_first: bool, + }, +} + +impl Default for EvictionOrder { + fn default() -> Self { + Self::RelativeAccessed { + highest_layer_count_loses_first: true, + } + } +} + +#[derive( + Eq, + PartialEq, + Debug, + Copy, + Clone, + strum_macros::EnumString, + strum_macros::Display, + serde_with::DeserializeFromStr, + serde_with::SerializeDisplay, +)] +#[strum(serialize_all = "kebab-case")] +pub enum GetVectoredImpl { + Sequential, + Vectored, +} + +#[derive( + Eq, + PartialEq, + Debug, + Copy, + Clone, + strum_macros::EnumString, + strum_macros::Display, + serde_with::DeserializeFromStr, + serde_with::SerializeDisplay, +)] +#[strum(serialize_all = "kebab-case")] +pub enum GetImpl { + Legacy, + Vectored, +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +#[serde(transparent)] +pub struct MaxVectoredReadBytes(pub NonZeroUsize); + +/// A tenant's calculated configuration, which is the result of merging a +/// tenant's TenantConfOpt with the global TenantConf from PageServerConf. +/// +/// For storing and transmitting individual tenant's configuration, see +/// TenantConfOpt. +#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +#[serde(deny_unknown_fields, default)] +pub struct TenantConfigToml { + // Flush out an inmemory layer, if it's holding WAL older than this + // This puts a backstop on how much WAL needs to be re-digested if the + // page server crashes. + // This parameter actually determines L0 layer file size. 
+ pub checkpoint_distance: u64, + // Inmemory layer is also flushed at least once in checkpoint_timeout to + // eventually upload WAL after activity is stopped. + #[serde(with = "humantime_serde")] + pub checkpoint_timeout: Duration, + // Target file size, when creating image and delta layers. + // This parameter determines L1 layer file size. + pub compaction_target_size: u64, + // How often to check if there's compaction work to be done. + // Duration::ZERO means automatic compaction is disabled. + #[serde(with = "humantime_serde")] + pub compaction_period: Duration, + // Level0 delta layer threshold for compaction. + pub compaction_threshold: usize, + pub compaction_algorithm: crate::models::CompactionAlgorithmSettings, + // Determines how much history is retained, to allow + // branching and read replicas at an older point in time. + // The unit is #of bytes of WAL. + // Page versions older than this are garbage collected away. + pub gc_horizon: u64, + // Interval at which garbage collection is triggered. + // Duration::ZERO means automatic GC is disabled + #[serde(with = "humantime_serde")] + pub gc_period: Duration, + // Delta layer churn threshold to create L1 image layers. + pub image_creation_threshold: usize, + // Determines how much history is retained, to allow + // branching and read replicas at an older point in time. + // The unit is time. + // Page versions older than this are garbage collected away. + #[serde(with = "humantime_serde")] + pub pitr_interval: Duration, + /// Maximum amount of time to wait while opening a connection to receive wal, before erroring. + #[serde(with = "humantime_serde")] + pub walreceiver_connect_timeout: Duration, + /// Considers safekeepers stalled after no WAL updates were received longer than this threshold. + /// A stalled safekeeper will be changed to a newer one when it appears. 
+ #[serde(with = "humantime_serde")] + pub lagging_wal_timeout: Duration, + /// Considers safekeepers lagging when their WAL is behind another safekeeper for more than this threshold. + /// A lagging safekeeper will be changed after `lagging_wal_timeout` time elapses since the last WAL update, + /// to avoid eager reconnects. + pub max_lsn_wal_lag: NonZeroU64, + pub eviction_policy: crate::models::EvictionPolicy, + pub min_resident_size_override: Option, + // See the corresponding metric's help string. + #[serde(with = "humantime_serde")] + pub evictions_low_residence_duration_metric_threshold: Duration, + + /// If non-zero, the period between uploads of a heatmap from attached tenants. This + /// may be disabled if a Tenant will not have secondary locations: only secondary + /// locations will use the heatmap uploaded by attached locations. + #[serde(with = "humantime_serde")] + pub heatmap_period: Duration, + + /// If true then SLRU segments are downloaded on demand, if false SLRU segments are included in basebackup + pub lazy_slru_download: bool, + + pub timeline_get_throttle: crate::models::ThrottleConfig, + + // How much WAL must be ingested before checking again whether a new image layer is required. + // Expressed in multiples of checkpoint distance. + pub image_layer_creation_check_threshold: u8, + + /// Switch to a new aux file policy. Switching this flag requires the user has not written any aux file into + /// the storage before, and this flag cannot be switched back. Otherwise there will be data corruptions. + /// There is a `last_aux_file_policy` flag which gets persisted in `index_part.json` once the first aux + /// file is written. + pub switch_aux_file_policy: crate::models::AuxFilePolicy, + + /// The length for an explicit LSN lease request. + /// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval. 
+ #[serde(with = "humantime_serde")] + pub lsn_lease_length: Duration, + + /// The length for an implicit LSN lease granted as part of `get_lsn_by_timestamp` request. + /// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval. + #[serde(with = "humantime_serde")] + pub lsn_lease_length_for_ts: Duration, +} + +pub mod defaults { + use crate::models::ImageCompressionAlgorithm; + + pub use storage_broker::DEFAULT_ENDPOINT as BROKER_DEFAULT_ENDPOINT; + + pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "300 s"; + pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s"; + + pub const DEFAULT_SUPERUSER: &str = "cloud_admin"; + + pub const DEFAULT_PAGE_CACHE_SIZE: usize = 8192; + pub const DEFAULT_MAX_FILE_DESCRIPTORS: usize = 100; + + pub const DEFAULT_LOG_FORMAT: &str = "plain"; + + pub const DEFAULT_CONCURRENT_TENANT_WARMUP: usize = 8; + + pub const DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES: usize = 1; + + pub const DEFAULT_METRIC_COLLECTION_INTERVAL: &str = "10 min"; + pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option = None; + pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min"; + pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s"; + + pub const DEFAULT_HEATMAP_UPLOAD_CONCURRENCY: usize = 8; + pub const DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY: usize = 1; + + pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100; + + pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB + + pub const DEFAULT_IMAGE_COMPRESSION: ImageCompressionAlgorithm = + ImageCompressionAlgorithm::Zstd { level: Some(1) }; + + pub const DEFAULT_VALIDATE_VECTORED_GET: bool = false; + + pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0; + + pub const DEFAULT_IO_BUFFER_ALIGNMENT: usize = 512; +} + +impl Default for ConfigToml { + fn default() -> Self { + use defaults::*; + + Self { + listen_pg_addr: (DEFAULT_PG_LISTEN_ADDR.to_string()), + listen_http_addr: (DEFAULT_HTTP_LISTEN_ADDR.to_string()), + availability_zone: (None), 
+ wait_lsn_timeout: (humantime::parse_duration(DEFAULT_WAIT_LSN_TIMEOUT) + .expect("cannot parse default wait lsn timeout")), + wal_redo_timeout: (humantime::parse_duration(DEFAULT_WAL_REDO_TIMEOUT) + .expect("cannot parse default wal redo timeout")), + superuser: (DEFAULT_SUPERUSER.to_string()), + page_cache_size: (DEFAULT_PAGE_CACHE_SIZE), + max_file_descriptors: (DEFAULT_MAX_FILE_DESCRIPTORS), + pg_distrib_dir: None, // Utf8PathBuf::from("./pg_install"), // TODO: formerly, this was std::env::current_dir() + http_auth_type: (AuthType::Trust), + pg_auth_type: (AuthType::Trust), + auth_validation_public_key_path: (None), + remote_storage: None, + broker_endpoint: (storage_broker::DEFAULT_ENDPOINT + .parse() + .expect("failed to parse default broker endpoint")), + broker_keepalive_interval: (humantime::parse_duration( + storage_broker::DEFAULT_KEEPALIVE_INTERVAL, + ) + .expect("cannot parse default keepalive interval")), + log_format: (LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()), + + concurrent_tenant_warmup: (NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP) + .expect("Invalid default constant")), + concurrent_tenant_size_logical_size_queries: NonZeroUsize::new(1).unwrap(), + metric_collection_interval: (humantime::parse_duration( + DEFAULT_METRIC_COLLECTION_INTERVAL, + ) + .expect("cannot parse default metric collection interval")), + synthetic_size_calculation_interval: (humantime::parse_duration( + DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL, + ) + .expect("cannot parse default synthetic size calculation interval")), + metric_collection_endpoint: (DEFAULT_METRIC_COLLECTION_ENDPOINT), + + metric_collection_bucket: (None), + + disk_usage_based_eviction: (None), + + test_remote_failures: (0), + + ondemand_download_behavior_treat_error_as_warn: (false), + + background_task_maximum_delay: (humantime::parse_duration( + DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY, + ) + .unwrap()), + + control_plane_api: (None), + control_plane_api_token: (None), +
control_plane_emergency_mode: (false), + + heatmap_upload_concurrency: (DEFAULT_HEATMAP_UPLOAD_CONCURRENCY), + secondary_download_concurrency: (DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY), + + ingest_batch_size: (DEFAULT_INGEST_BATCH_SIZE), + + virtual_file_io_engine: None, + + max_vectored_read_bytes: (MaxVectoredReadBytes( + NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(), + )), + image_compression: (DEFAULT_IMAGE_COMPRESSION), + ephemeral_bytes_per_memory_kb: (DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB), + l0_flush: None, + compact_level0_phase1_value_access: Default::default(), + virtual_file_direct_io: crate::models::virtual_file::DirectIoMode::default(), + + io_buffer_alignment: DEFAULT_IO_BUFFER_ALIGNMENT, + + tenant_config: TenantConfigToml::default(), + } + } +} + +pub mod tenant_conf_defaults { + + // FIXME: This current value is very low. I would imagine something like 1 GB or 10 GB + // would be more appropriate. But a low value forces the code to be exercised more, + // which is good for now to trigger bugs. + // This parameter actually determines L0 layer file size. + pub const DEFAULT_CHECKPOINT_DISTANCE: u64 = 256 * 1024 * 1024; + pub const DEFAULT_CHECKPOINT_TIMEOUT: &str = "10 m"; + + // FIXME the below configs are only used by legacy algorithm. The new algorithm + // has different parameters. + + // Target file size, when creating image and delta layers. + // This parameter determines L1 layer file size. + pub const DEFAULT_COMPACTION_TARGET_SIZE: u64 = 128 * 1024 * 1024; + + pub const DEFAULT_COMPACTION_PERIOD: &str = "20 s"; + pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10; + pub const DEFAULT_COMPACTION_ALGORITHM: crate::models::CompactionAlgorithm = + crate::models::CompactionAlgorithm::Legacy; + + pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024; + + // Large DEFAULT_GC_PERIOD is fine as long as PITR_INTERVAL is larger. 
+ // If there's a need to decrease this value, first make sure that GC + // doesn't hold a layer map write lock for non-trivial operations. + // Relevant: https://github.com/neondatabase/neon/issues/3394 + pub const DEFAULT_GC_PERIOD: &str = "1 hr"; + pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3; + pub const DEFAULT_PITR_INTERVAL: &str = "7 days"; + pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "10 seconds"; + pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds"; + // The default limit on WAL lag should be set to avoid causing disconnects under high throughput + // scenarios: since the broker stats are updated ~1/s, a value of 1GiB should be sufficient for + // throughputs up to 1GiB/s per timeline. + pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 1024 * 1024 * 1024; + pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour"; + // By default ingest enough WAL for two new L0 layers before checking if new image + // layers should be created. 
+ pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2; + + pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100; +} + +impl Default for TenantConfigToml { + fn default() -> Self { + use tenant_conf_defaults::*; + Self { + checkpoint_distance: DEFAULT_CHECKPOINT_DISTANCE, + checkpoint_timeout: humantime::parse_duration(DEFAULT_CHECKPOINT_TIMEOUT) + .expect("cannot parse default checkpoint timeout"), + compaction_target_size: DEFAULT_COMPACTION_TARGET_SIZE, + compaction_period: humantime::parse_duration(DEFAULT_COMPACTION_PERIOD) + .expect("cannot parse default compaction period"), + compaction_threshold: DEFAULT_COMPACTION_THRESHOLD, + compaction_algorithm: crate::models::CompactionAlgorithmSettings { + kind: DEFAULT_COMPACTION_ALGORITHM, + }, + gc_horizon: DEFAULT_GC_HORIZON, + gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD) + .expect("cannot parse default gc period"), + image_creation_threshold: DEFAULT_IMAGE_CREATION_THRESHOLD, + pitr_interval: humantime::parse_duration(DEFAULT_PITR_INTERVAL) + .expect("cannot parse default PITR interval"), + walreceiver_connect_timeout: humantime::parse_duration( + DEFAULT_WALRECEIVER_CONNECT_TIMEOUT, + ) + .expect("cannot parse default walreceiver connect timeout"), + lagging_wal_timeout: humantime::parse_duration(DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT) + .expect("cannot parse default walreceiver lagging wal timeout"), + max_lsn_wal_lag: NonZeroU64::new(DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG) + .expect("cannot parse default max walreceiver Lsn wal lag"), + eviction_policy: crate::models::EvictionPolicy::NoEviction, + min_resident_size_override: None, + evictions_low_residence_duration_metric_threshold: humantime::parse_duration( + DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD, + ) + .expect("cannot parse default evictions_low_residence_duration_metric_threshold"), + heatmap_period: Duration::ZERO, + lazy_slru_download: false, + timeline_get_throttle: crate::models::ThrottleConfig::disabled(), + 
image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD, + switch_aux_file_policy: crate::models::AuxFilePolicy::default_tenant_config(), + lsn_lease_length: LsnLease::DEFAULT_LENGTH, + lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS, + } + } +} diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index a50707a1b8..40b7dbbbc2 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -1,4 +1,4 @@ -use std::collections::HashSet; +use std::collections::{HashMap, HashSet}; use std::str::FromStr; use std::time::{Duration, Instant}; @@ -8,6 +8,7 @@ use std::time::{Duration, Instant}; use serde::{Deserialize, Serialize}; use utils::id::{NodeId, TenantId}; +use crate::models::PageserverUtilization; use crate::{ models::{ShardParameters, TenantConfig}, shard::{ShardStripeSize, TenantShardId}, @@ -55,6 +56,8 @@ pub struct NodeRegisterRequest { pub listen_http_addr: String, pub listen_http_port: u16, + + pub availability_zone_id: String, } #[derive(Serialize, Deserialize)] @@ -71,6 +74,17 @@ pub struct TenantPolicyRequest { pub scheduling: Option, } +#[derive(Serialize, Deserialize)] +pub struct ShardsPreferredAzsRequest { + #[serde(flatten)] + pub preferred_az_ids: HashMap, +} + +#[derive(Serialize, Deserialize)] +pub struct ShardsPreferredAzsResponse { + pub updated: Vec, +} + #[derive(Serialize, Deserialize, Debug)] pub struct TenantLocateResponseShard { pub shard_id: TenantShardId, @@ -98,6 +112,21 @@ pub struct TenantDescribeResponse { pub config: TenantConfig, } +#[derive(Serialize, Deserialize, Debug)] +pub struct NodeShardResponse { + pub node_id: NodeId, + pub shards: Vec, +} + +#[derive(Serialize, Deserialize, Debug)] +pub struct NodeShard { + pub tenant_shard_id: TenantShardId, + /// Whether the shard is observed secondary on a specific node. True = yes, False = no, None = not on this node. 
+ pub is_observed_secondary: Option, + /// Whether the shard is intended to be a secondary on a specific node. True = yes, False = no, None = not on this node. + pub is_intended_secondary: Option, +} + #[derive(Serialize, Deserialize)] pub struct NodeDescribeResponse { pub id: NodeId, @@ -129,8 +158,12 @@ pub struct TenantDescribeResponseShard { pub is_splitting: bool, pub scheduling_policy: ShardSchedulingPolicy, + + pub preferred_az_id: Option, } +/// Migration request for a given tenant shard to a given node. +/// /// Explicitly migrating a particular shard is a low level operation /// TODO: higher level "Reschedule tenant" operation where the request /// specifies some constraints, e.g. asking it to get off particular node(s) @@ -140,23 +173,11 @@ pub struct TenantShardMigrateRequest { pub node_id: NodeId, } -/// Utilisation score indicating how good a candidate a pageserver -/// is for scheduling the next tenant. See [`crate::models::PageserverUtilization`]. -/// Lower values are better. -#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Debug)] -pub struct UtilizationScore(pub u64); - -impl UtilizationScore { - pub fn worst() -> Self { - UtilizationScore(u64::MAX) - } -} - -#[derive(Serialize, Clone, Copy, Debug)] +#[derive(Serialize, Clone, Debug)] #[serde(into = "NodeAvailabilityWrapper")] pub enum NodeAvailability { // Normal, happy state - Active(UtilizationScore), + Active(PageserverUtilization), // Node is warming up, but we expect it to become available soon. Covers // the time span between the re-attach response being composed on the storage controller // and the first successful heartbeat after the processing of the re-attach response @@ -195,7 +216,9 @@ impl From for NodeAvailability { match val { // Assume the worst utilisation score to begin with. It will later be updated by // the heartbeats. 
- NodeAvailabilityWrapper::Active => NodeAvailability::Active(UtilizationScore::worst()), + NodeAvailabilityWrapper::Active => { + NodeAvailability::Active(PageserverUtilization::full()) + } NodeAvailabilityWrapper::WarmingUp => NodeAvailability::WarmingUp(Instant::now()), NodeAvailabilityWrapper::Offline => NodeAvailability::Offline, } diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index 2fdd7de38f..4a776709c9 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -1,8 +1,8 @@ use anyhow::{bail, Result}; use byteorder::{ByteOrder, BE}; use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; +use postgres_ffi::Oid; use postgres_ffi::RepOriginId; -use postgres_ffi::{Oid, TransactionId}; use serde::{Deserialize, Serialize}; use std::{fmt, ops::Range}; @@ -108,14 +108,41 @@ impl Key { } } + /// This function checks more extensively what keys we can take on the write path. + /// If a key beginning with 00 does not have a global/default tablespace OID, it + /// will be rejected on the write path. + #[allow(dead_code)] + pub fn is_valid_key_on_write_path_strong(&self) -> bool { + use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID}; + if !self.is_i128_representable() { + return false; + } + if self.field1 == 0 + && !(self.field2 == GLOBALTABLESPACE_OID + || self.field2 == DEFAULTTABLESPACE_OID + || self.field2 == 0) + { + return false; // User defined tablespaces are not supported + } + true + } + + /// This is a weaker version of `is_valid_key_on_write_path_strong` that simply + /// checks if the key is i128 representable. Note that some keys can be successfully + /// ingested into the pageserver, but will cause errors on generating basebackup. 
+ pub fn is_valid_key_on_write_path(&self) -> bool { + self.is_i128_representable() + } + + pub fn is_i128_representable(&self) -> bool { + self.field2 <= 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222 + } + /// 'field2' is used to store tablespaceid for relations and small enum numbers for other relish. /// As long as Neon does not support tablespace (because of lack of access to local file system), /// we can assume that only some predefined namespace OIDs are used which can fit in u16 pub fn to_i128(&self) -> i128 { - assert!( - self.field2 <= 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222, - "invalid key: {self}", - ); + assert!(self.is_i128_representable(), "invalid key: {self}"); (((self.field1 & 0x7F) as i128) << 120) | (((self.field2 & 0xFFFF) as i128) << 104) | ((self.field3 as i128) << 72) @@ -323,7 +350,17 @@ impl Key { // 02 00000000 00000000 00000000 00 00000000 // // TwoPhaseFile: -// 02 00000000 00000000 00000000 00 XID +// +// 02 00000000 00000000 00XXXXXX XX XXXXXXXX +// +// \______XID_________/ +// +// The 64-bit XID is stored a little awkwardly in field6, field5 and +// field4. PostgreSQL v16 and below only stored a 32-bit XID, which +// fit completely in field6, but starting with PostgreSQL v17, a full +// 64-bit XID is used. Most pageserver code that accesses +// TwoPhaseFiles now deals with 64-bit XIDs even on v16, the high bits +// are just unused. 
// // ControlFile: // 03 00000000 00000000 00000000 00 00000000 @@ -555,35 +592,36 @@ pub const TWOPHASEDIR_KEY: Key = Key { }; #[inline(always)] -pub fn twophase_file_key(xid: TransactionId) -> Key { +pub fn twophase_file_key(xid: u64) -> Key { Key { field1: 0x02, field2: 0, field3: 0, - field4: 0, - field5: 0, - field6: xid, + field4: ((xid & 0xFFFFFF0000000000) >> 40) as u32, + field5: ((xid & 0x000000FF00000000) >> 32) as u8, + field6: (xid & 0x00000000FFFFFFFF) as u32, } } #[inline(always)] -pub fn twophase_key_range(xid: TransactionId) -> Range { +pub fn twophase_key_range(xid: u64) -> Range { + // 64-bit XIDs really should not overflow let (next_xid, overflowed) = xid.overflowing_add(1); Key { field1: 0x02, field2: 0, field3: 0, - field4: 0, - field5: 0, - field6: xid, + field4: ((xid & 0xFFFFFF0000000000) >> 40) as u32, + field5: ((xid & 0x000000FF00000000) >> 32) as u8, + field6: (xid & 0x00000000FFFFFFFF) as u32, }..Key { field1: 0x02, field2: 0, - field3: 0, - field4: 0, - field5: u8::from(overflowed), - field6: next_xid, + field3: u32::from(overflowed), + field4: ((next_xid & 0xFFFFFF0000000000) >> 40) as u32, + field5: ((next_xid & 0x000000FF00000000) >> 32) as u8, + field6: (next_xid & 0x00000000FFFFFFFF) as u32, } } diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index ab4adfbebe..45e84baa1f 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -6,8 +6,9 @@ pub use utilization::PageserverUtilization; use std::{ collections::HashMap, + fmt::Display, io::{BufRead, Read}, - num::{NonZeroU64, NonZeroUsize}, + num::{NonZeroU32, NonZeroU64, NonZeroUsize}, str::FromStr, sync::atomic::AtomicUsize, time::{Duration, SystemTime}, @@ -61,7 +62,7 @@ use bytes::{Buf, BufMut, Bytes, BytesMut}; serde::Serialize, serde::Deserialize, strum_macros::Display, - strum_macros::EnumVariantNames, + strum_macros::VariantNames, strum_macros::AsRefStr, strum_macros::IntoStaticStr, )] @@ -304,8 +305,10 @@ pub 
struct TenantConfig { pub lsn_lease_length_for_ts: Option, } -/// The policy for the aux file storage. It can be switched through `switch_aux_file_policy` -/// tenant config. When the first aux file written, the policy will be persisted in the +/// The policy for the aux file storage. +/// +/// It can be switched through `switch_aux_file_policy` tenant config. +/// When the first aux file written, the policy will be persisted in the /// `index_part.json` file and has a limited migration path. /// /// Currently, we only allow the following migration path: @@ -348,7 +351,7 @@ impl AuxFilePolicy { /// If a tenant writes aux files without setting `switch_aux_policy`, this value will be used. pub fn default_tenant_config() -> Self { - Self::V1 + Self::V2 } } @@ -435,7 +438,9 @@ pub enum CompactionAlgorithm { Tiered, } -#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[derive( + Debug, Clone, Copy, PartialEq, Eq, serde_with::DeserializeFromStr, serde_with::SerializeDisplay, +)] pub enum ImageCompressionAlgorithm { // Disabled for writes, support decompressing during read path Disabled, @@ -470,11 +475,33 @@ impl FromStr for ImageCompressionAlgorithm { } } +impl Display for ImageCompressionAlgorithm { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + ImageCompressionAlgorithm::Disabled => write!(f, "disabled"), + ImageCompressionAlgorithm::Zstd { level } => { + if let Some(level) = level { + write!(f, "zstd({})", level) + } else { + write!(f, "zstd") + } + } + } + } +} + #[derive(Eq, PartialEq, Debug, Clone, Serialize, Deserialize)] pub struct CompactionAlgorithmSettings { pub kind: CompactionAlgorithm, } +#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)] +#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)] +pub enum L0FlushConfig { + #[serde(rename_all = "snake_case")] + Direct { max_concurrency: NonZeroUsize }, +} + #[derive(Debug, Clone, Copy, PartialEq, Eq, 
Serialize, Deserialize)] pub struct EvictionPolicyLayerAccessThreshold { #[serde(with = "humantime_serde")] @@ -486,12 +513,11 @@ pub struct EvictionPolicyLayerAccessThreshold { #[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)] pub struct ThrottleConfig { pub task_kinds: Vec, // TaskKind - pub initial: usize, + pub initial: u32, #[serde(with = "humantime_serde")] pub refill_interval: Duration, - pub refill_amount: NonZeroUsize, - pub max: usize, - pub fair: bool, + pub refill_amount: NonZeroU32, + pub max: u32, } impl ThrottleConfig { @@ -501,9 +527,8 @@ impl ThrottleConfig { // other values don't matter with emtpy `task_kinds`. initial: 0, refill_interval: Duration::from_millis(1), - refill_amount: NonZeroUsize::new(1).unwrap(), + refill_amount: NonZeroU32::new(1).unwrap(), max: 1, - fair: true, } } /// The requests per second allowed by the given config. @@ -721,8 +746,14 @@ pub struct TimelineInfo { pub walreceiver_status: String, + // ALWAYS add new fields at the end of the struct with `Option` to ensure forward/backward compatibility. + // Backward compatibility: you will get a JSON not containing the newly-added field. + // Forward compatibility: a previous version of the pageserver will receive a JSON. serde::Deserialize does + // not deny unknown fields by default so it's safe to set the field to some value, though it won't be + // read. /// The last aux file policy being used on this timeline pub last_aux_file_policy: Option, + pub is_archived: Option, } #[derive(Debug, Clone, Serialize, Deserialize)] @@ -867,7 +898,9 @@ pub struct WalRedoManagerStatus { pub process: Option, } -/// The progress of a secondary tenant is mostly useful when doing a long running download: e.g. initiating +/// The progress of a secondary tenant. +/// +/// It is mostly useful when doing a long running download: e.g. initiating /// a download job, timing out while waiting for it to run, and then inspecting this status to understand /// what's happening. 
#[derive(Default, Debug, Serialize, Deserialize, Clone)] @@ -1062,7 +1095,7 @@ impl TryFrom for PagestreamBeMessageTag { } } -// In the V2 protocol version, a GetPage request contains two LSN values: +// A GetPage request contains two LSN values: // // request_lsn: Get the page version at this point in time. Lsn::Max is a special value that means // "get the latest version present". It's used by the primary server, which knows that no one else @@ -1075,7 +1108,7 @@ impl TryFrom for PagestreamBeMessageTag { // passing an earlier LSN can speed up the request, by allowing the pageserver to process the // request without waiting for 'request_lsn' to arrive. // -// The legacy V1 interface contained only one LSN, and a boolean 'latest' flag. The V1 interface was +// The now-defunct V1 interface contained only one LSN, and a boolean 'latest' flag. The V1 interface was // sufficient for the primary; the 'lsn' was equivalent to the 'not_modified_since' value, and // 'latest' was set to true. The V2 interface was added because there was no correct way for a // standby to request a page at a particular non-latest LSN, and also include the @@ -1083,15 +1116,11 @@ impl TryFrom for PagestreamBeMessageTag { // request, if the standby knows that the page hasn't been modified since, and risk getting an error // if that LSN has fallen behind the GC horizon, or requesting the current replay LSN, which could // require the pageserver unnecessarily to wait for the WAL to arrive up to that point. The new V2 -// interface allows sending both LSNs, and let the pageserver do the right thing. There is no +// interface allows sending both LSNs, and let the pageserver do the right thing. There was no // difference in the responses between V1 and V2. // -// The Request structs below reflect the V2 interface. If V1 is used, the parse function -// maps the old format requests to the new format. 
-// #[derive(Clone, Copy)] pub enum PagestreamProtocolVersion { - V1, V2, } @@ -1230,36 +1259,17 @@ impl PagestreamFeMessage { bytes.into() } - pub fn parse( - body: &mut R, - protocol_version: PagestreamProtocolVersion, - ) -> anyhow::Result { + pub fn parse(body: &mut R) -> anyhow::Result { // these correspond to the NeonMessageTag enum in pagestore_client.h // // TODO: consider using protobuf or serde bincode for less error prone // serialization. let msg_tag = body.read_u8()?; - let (request_lsn, not_modified_since) = match protocol_version { - PagestreamProtocolVersion::V2 => ( - Lsn::from(body.read_u64::()?), - Lsn::from(body.read_u64::()?), - ), - PagestreamProtocolVersion::V1 => { - // In the old protocol, each message starts with a boolean 'latest' flag, - // followed by 'lsn'. Convert that to the two LSNs, 'request_lsn' and - // 'not_modified_since', used in the new protocol version. - let latest = body.read_u8()? != 0; - let request_lsn = Lsn::from(body.read_u64::()?); - if latest { - (Lsn::MAX, request_lsn) // get latest version - } else { - (request_lsn, request_lsn) // get version at specified LSN - } - } - }; + // these two fields are the same for every request type + let request_lsn = Lsn::from(body.read_u64::()?); + let not_modified_since = Lsn::from(body.read_u64::()?); - // The rest of the messages are the same between V1 and V2 match msg_tag { 0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest { request_lsn, @@ -1467,9 +1477,7 @@ mod tests { ]; for msg in messages { let bytes = msg.serialize(); - let reconstructed = - PagestreamFeMessage::parse(&mut bytes.reader(), PagestreamProtocolVersion::V2) - .unwrap(); + let reconstructed = PagestreamFeMessage::parse(&mut bytes.reader()).unwrap(); assert!(msg == reconstructed); } } @@ -1677,21 +1685,33 @@ mod tests { #[test] fn test_image_compression_algorithm_parsing() { use ImageCompressionAlgorithm::*; - assert_eq!( - ImageCompressionAlgorithm::from_str("disabled").unwrap(), - Disabled - ); - 
assert_eq!( - ImageCompressionAlgorithm::from_str("zstd").unwrap(), - Zstd { level: None } - ); - assert_eq!( - ImageCompressionAlgorithm::from_str("zstd(18)").unwrap(), - Zstd { level: Some(18) } - ); - assert_eq!( - ImageCompressionAlgorithm::from_str("zstd(-3)").unwrap(), - Zstd { level: Some(-3) } - ); + let cases = [ + ("disabled", Disabled), + ("zstd", Zstd { level: None }), + ("zstd(18)", Zstd { level: Some(18) }), + ("zstd(-3)", Zstd { level: Some(-3) }), + ]; + + for (display, expected) in cases { + assert_eq!( + ImageCompressionAlgorithm::from_str(display).unwrap(), + expected, + "parsing works" + ); + assert_eq!(format!("{expected}"), display, "Display FromStr roundtrip"); + + let ser = serde_json::to_string(&expected).expect("serialization"); + assert_eq!( + serde_json::from_str::(&ser).unwrap(), + expected, + "serde roundtrip" + ); + + assert_eq!( + serde_json::Value::String(display.to_string()), + serde_json::to_value(expected).unwrap(), + "Display is the serde serialization" + ); + } } } diff --git a/libs/pageserver_api/src/models/utilization.rs b/libs/pageserver_api/src/models/utilization.rs index 0fec221276..641aa51989 100644 --- a/libs/pageserver_api/src/models/utilization.rs +++ b/libs/pageserver_api/src/models/utilization.rs @@ -38,7 +38,7 @@ pub struct PageserverUtilization { pub max_shard_count: u32, /// Cached result of [`Self::score`] - pub utilization_score: u64, + pub utilization_score: Option, /// When was this snapshot captured, pageserver local time. /// @@ -50,6 +50,8 @@ fn unity_percent() -> Percent { Percent::new(0).unwrap() } +pub type RawScore = u64; + impl PageserverUtilization { const UTILIZATION_FULL: u64 = 1000000; @@ -62,7 +64,7 @@ impl PageserverUtilization { /// - Negative values are forbidden /// - Values over UTILIZATION_FULL indicate an overloaded node, which may show degraded performance due to /// layer eviction. 
- pub fn score(&self) -> u64 { + pub fn score(&self) -> RawScore { let disk_usable_capacity = ((self.disk_usage_bytes + self.free_space_bytes) * self.disk_usable_pct.get() as u64) / 100; @@ -74,8 +76,41 @@ impl PageserverUtilization { std::cmp::max(disk_utilization_score, shard_utilization_score) } - pub fn refresh_score(&mut self) { - self.utilization_score = self.score(); + pub fn cached_score(&mut self) -> RawScore { + match self.utilization_score { + None => { + let s = self.score(); + self.utilization_score = Some(s); + s + } + Some(s) => s, + } + } + + /// Whether a node is currently hosting more work than it can comfortably handle. This does not indicate that + /// it will fail, but it is a strong signal that more work should not be added unless there is no alternative. + /// + /// When a node is overloaded, we may override soft affinity preferences and do things like scheduling + /// into a node in a less desirable AZ, if all the nodes in the preferred AZ are overloaded. + pub fn is_overloaded(score: RawScore) -> bool { + // Why the factor of two? This is unscientific but reflects behavior of real systems: + // - In terms of shard counts, a node's preferred max count is a soft limit intended to keep + // startup and housekeeping jobs nice and responsive. We can go to double this limit if needed + // until some more nodes are deployed. + // - In terms of disk space, the node's utilization heuristic assumes every tenant needs to + // hold its biggest timeline fully on disk, which tends to be an overestimate when + // some tenants are very idle and have dropped layers from disk. In practice going up to + // double is generally better than giving up and scheduling in a sub-optimal AZ. 
+ score >= 2 * Self::UTILIZATION_FULL + } + + pub fn adjust_shard_count_max(&mut self, shard_count: u32) { + if self.shard_count < shard_count { + self.shard_count = shard_count; + + // Dirty cache: this will be calculated next time someone retrieves the score + self.utilization_score = None; + } + } /// A utilization structure that has a full utilization score: use this as a placeholder when @@ -88,7 +123,38 @@ impl PageserverUtilization { disk_usable_pct: Percent::new(100).unwrap(), shard_count: 1, max_shard_count: 1, - utilization_score: Self::UTILIZATION_FULL, + utilization_score: Some(Self::UTILIZATION_FULL), + captured_at: serde_system_time::SystemTime(SystemTime::now()), + } + } +} + +/// Test helper +pub mod test_utilization { + use super::PageserverUtilization; + use std::time::SystemTime; + use utils::{ + serde_percent::Percent, + serde_system_time::{self}, + }; + + // Parameters of the imaginary node used for test utilization instances + const TEST_DISK_SIZE: u64 = 1024 * 1024 * 1024 * 1024; + const TEST_SHARDS_MAX: u32 = 1000; + + /// Unit test helper. Unconditionally compiled because cfg(test) doesn't carry across crates. Do + /// not abuse this function from non-test code. + /// + /// Emulates a node with a 1000 shard limit and a 1TB disk. 
+ pub fn simple(shard_count: u32, disk_wanted_bytes: u64) -> PageserverUtilization { + PageserverUtilization { + disk_usage_bytes: disk_wanted_bytes, + free_space_bytes: TEST_DISK_SIZE - std::cmp::min(disk_wanted_bytes, TEST_DISK_SIZE), + disk_wanted_bytes, + disk_usable_pct: Percent::new(100).unwrap(), + shard_count, + max_shard_count: TEST_SHARDS_MAX, + utilization_score: None, captured_at: serde_system_time::SystemTime(SystemTime::now()), } } @@ -120,7 +186,7 @@ mod tests { disk_usage_bytes: u64::MAX, free_space_bytes: 0, disk_wanted_bytes: u64::MAX, - utilization_score: 13, + utilization_score: Some(13), disk_usable_pct: Percent::new(90).unwrap(), shard_count: 100, max_shard_count: 200, diff --git a/libs/postgres_backend/Cargo.toml b/libs/postgres_backend/Cargo.toml index f6854328fc..a0c87263ed 100644 --- a/libs/postgres_backend/Cargo.toml +++ b/libs/postgres_backend/Cargo.toml @@ -5,10 +5,8 @@ edition.workspace = true license.workspace = true [dependencies] -async-trait.workspace = true anyhow.workspace = true bytes.workspace = true -futures.workspace = true rustls.workspace = true serde.workspace = true thiserror.workspace = true diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index 7c7c6535b3..8ea4b93fb1 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -69,8 +69,10 @@ impl QueryError { } /// Returns true if the given error is a normal consequence of a network issue, -/// or the client closing the connection. These errors can happen during normal -/// operations, and don't indicate a bug in our code. +/// or the client closing the connection. +/// +/// These errors can happen during normal operations, +/// and don't indicate a bug in our code. pub fn is_expected_io_error(e: &io::Error) -> bool { use io::ErrorKind::*; matches!( @@ -79,17 +81,16 @@ pub fn is_expected_io_error(e: &io::Error) -> bool { ) } -#[async_trait::async_trait] pub trait Handler { /// Handle single query. 
/// postgres_backend will issue ReadyForQuery after calling this (this /// might be not what we want after CopyData streaming, but currently we don't /// care). It will also flush out the output buffer. - async fn process_query( + fn process_query( &mut self, pgb: &mut PostgresBackend, query_string: &str, - ) -> Result<(), QueryError>; + ) -> impl Future>; /// Called on startup packet receival, allows to process params. /// diff --git a/libs/postgres_backend/tests/simple_select.rs b/libs/postgres_backend/tests/simple_select.rs index 7ec85f0dbe..900083ea7f 100644 --- a/libs/postgres_backend/tests/simple_select.rs +++ b/libs/postgres_backend/tests/simple_select.rs @@ -23,7 +23,6 @@ async fn make_tcp_pair() -> (TcpStream, TcpStream) { struct TestHandler {} -#[async_trait::async_trait] impl Handler for TestHandler { // return single col 'hey' for any query async fn process_query( diff --git a/libs/postgres_connection/src/lib.rs b/libs/postgres_connection/src/lib.rs index 9f57f3d507..ddf9f7b610 100644 --- a/libs/postgres_connection/src/lib.rs +++ b/libs/postgres_connection/src/lib.rs @@ -7,6 +7,7 @@ use std::fmt; use url::Host; /// Parses a string of format either `host:port` or `host` into a corresponding pair. +/// /// The `host` part should be a correct `url::Host`, while `port` (if present) should be /// a valid decimal u16 of digits only. 
pub fn parse_host_port>(host_port: S) -> Result<(Host, Option), anyhow::Error> { diff --git a/libs/postgres_ffi/Cargo.toml b/libs/postgres_ffi/Cargo.toml index ee69878f69..ef17833a48 100644 --- a/libs/postgres_ffi/Cargo.toml +++ b/libs/postgres_ffi/Cargo.toml @@ -5,13 +5,10 @@ edition.workspace = true license.workspace = true [dependencies] -rand.workspace = true regex.workspace = true bytes.workspace = true -byteorder.workspace = true anyhow.workspace = true crc32c.workspace = true -hex.workspace = true once_cell.workspace = true log.workspace = true memoffset.workspace = true diff --git a/libs/postgres_ffi/build.rs b/libs/postgres_ffi/build.rs index 370d9e9a6f..d3a85f2683 100644 --- a/libs/postgres_ffi/build.rs +++ b/libs/postgres_ffi/build.rs @@ -14,7 +14,7 @@ impl ParseCallbacks for PostgresFfiCallbacks { fn include_file(&self, filename: &str) { // This does the equivalent of passing bindgen::CargoCallbacks // to the builder .parse_callbacks() method. - let cargo_callbacks = bindgen::CargoCallbacks; + let cargo_callbacks = bindgen::CargoCallbacks::new(); cargo_callbacks.include_file(filename) } @@ -56,7 +56,7 @@ fn main() -> anyhow::Result<()> { PathBuf::from("pg_install") }; - for pg_version in &["v14", "v15", "v16"] { + for pg_version in &["v14", "v15", "v16", "v17"] { let mut pg_install_dir_versioned = pg_install_dir.join(pg_version); if pg_install_dir_versioned.is_relative() { let cwd = env::current_dir().context("Failed to get current_dir")?; @@ -121,6 +121,7 @@ fn main() -> anyhow::Result<()> { .allowlist_type("XLogPageHeaderData") .allowlist_type("XLogLongPageHeaderData") .allowlist_var("XLOG_PAGE_MAGIC") + .allowlist_var("PG_MAJORVERSION_NUM") .allowlist_var("PG_CONTROL_FILE_SIZE") .allowlist_var("PG_CONTROLFILEDATA_OFFSETOF_CRC") .allowlist_type("PageHeaderData") diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs index 0940ad207f..0d46ed6aac 100644 --- a/libs/postgres_ffi/src/lib.rs +++ b/libs/postgres_ffi/src/lib.rs @@ -44,6 
+44,9 @@ macro_rules! postgres_ffi { // Re-export some symbols from bindings pub use bindings::DBState_DB_SHUTDOWNED; pub use bindings::{CheckPoint, ControlFileData, XLogRecord}; + + pub const ZERO_CHECKPOINT: bytes::Bytes = + bytes::Bytes::from_static(&[0u8; xlog_utils::SIZEOF_CHECKPOINT]); } }; } @@ -54,6 +57,7 @@ macro_rules! for_all_postgres_versions { $macro!(v14); $macro!(v15); $macro!(v16); + $macro!(v17); }; } @@ -88,6 +92,7 @@ macro_rules! dispatch_pgversion { 14 : v14, 15 : v15, 16 : v16, + 17 : v17, ] ) }; @@ -106,6 +111,110 @@ macro_rules! dispatch_pgversion { }; } +#[macro_export] +macro_rules! enum_pgversion_dispatch { + ($name:expr, $typ:ident, $bind:ident, $code:block) => { + enum_pgversion_dispatch!( + name = $name, + bind = $bind, + typ = $typ, + code = $code, + pgversions = [ + V14 : v14, + V15 : v15, + V16 : v16, + V17 : v17, + ] + ) + }; + (name = $name:expr, + bind = $bind:ident, + typ = $typ:ident, + code = $code:block, + pgversions = [$($variant:ident : $md:ident),+ $(,)?]) => { + match $name { + $( + self::$typ::$variant($bind) => { + use $crate::$md as pgv; + $code + } + ),+, + } + }; +} + +#[macro_export] +macro_rules! 
enum_pgversion { + {$name:ident, pgv :: $t:ident} => { + enum_pgversion!{ + name = $name, + typ = $t, + pgversions = [ + V14 : v14, + V15 : v15, + V16 : v16, + V17 : v17, + ] + } + }; + {$name:ident, pgv :: $p:ident :: $t:ident} => { + enum_pgversion!{ + name = $name, + path = $p, + typ = $t, + pgversions = [ + V14 : v14, + V15 : v15, + V16 : v16, + V17 : v17, + ] + } + }; + {name = $name:ident, + typ = $t:ident, + pgversions = [$($variant:ident : $md:ident),+ $(,)?]} => { + pub enum $name { + $($variant ( $crate::$md::$t )),+ + } + impl self::$name { + pub fn pg_version(&self) -> u32 { + enum_pgversion_dispatch!(self, $name, _ign, { + pgv::bindings::PG_MAJORVERSION_NUM + }) + } + } + $( + impl Into for $crate::$md::$t { + fn into(self) -> self::$name { + self::$name::$variant (self) + } + } + )+ + }; + {name = $name:ident, + path = $p:ident, + typ = $t:ident, + pgversions = [$($variant:ident : $md:ident),+ $(,)?]} => { + pub enum $name { + $($variant ($crate::$md::$p::$t)),+ + } + impl $name { + pub fn pg_version(&self) -> u32 { + enum_pgversion_dispatch!(self, $name, _ign, { + pgv::bindings::PG_MAJORVERSION_NUM + }) + } + } + $( + impl Into<$name> for $crate::$md::$p::$t { + fn into(self) -> $name { + $name::$variant (self) + } + } + )+ + }; +} + pub mod pg_constants; pub mod relfile_utils; @@ -136,9 +245,9 @@ pub const MAX_SEND_SIZE: usize = XLOG_BLCKSZ * 16; // Export some version independent functions that are used outside of this mod pub use v14::xlog_utils::encode_logical_message; -pub use v14::xlog_utils::from_pg_timestamp; pub use v14::xlog_utils::get_current_timestamp; pub use v14::xlog_utils::to_pg_timestamp; +pub use v14::xlog_utils::try_from_pg_timestamp; pub use v14::xlog_utils::XLogFileName; pub use v14::bindings::DBState_DB_SHUTDOWNED; diff --git a/libs/postgres_ffi/src/pg_constants.rs b/libs/postgres_ffi/src/pg_constants.rs index 6ce855c78e..61b49a634d 100644 --- a/libs/postgres_ffi/src/pg_constants.rs +++ b/libs/postgres_ffi/src/pg_constants.rs @@ 
-152,6 +152,9 @@ pub const XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8; pub const XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED: u8 = (1 << 1) as u8; pub const XLH_DELETE_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8; +// From heapam_xlog.h +pub const XLOG_HEAP2_REWRITE: u8 = 0x00; + // From replication/message.h pub const XLOG_LOGICAL_MESSAGE: u8 = 0x00; @@ -219,15 +222,20 @@ pub const INVALID_TRANSACTION_ID: u32 = 0; pub const FIRST_BOOTSTRAP_OBJECT_ID: u32 = 12000; pub const FIRST_NORMAL_OBJECT_ID: u32 = 16384; +/* pg_control.h */ pub const XLOG_CHECKPOINT_SHUTDOWN: u8 = 0x00; pub const XLOG_CHECKPOINT_ONLINE: u8 = 0x10; -pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001; -pub const XLP_LONG_HEADER: u16 = 0x0002; +pub const XLOG_PARAMETER_CHANGE: u8 = 0x60; +pub const XLOG_END_OF_RECOVERY: u8 = 0x90; /* From xlog.h */ pub const XLOG_REPLORIGIN_SET: u8 = 0x00; pub const XLOG_REPLORIGIN_DROP: u8 = 0x10; +/* xlog_internal.h */ +pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001; +pub const XLP_LONG_HEADER: u16 = 0x0002; + /* From replication/slot.h */ pub const REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN: usize = 4*4 /* offset of `slotdata` in ReplicationSlotOnDisk */ + 64 /* NameData */ + 4*4; @@ -245,33 +253,6 @@ pub const VM_HEAPBLOCKS_PER_PAGE: u32 = /* From origin.c */ pub const REPLICATION_STATE_MAGIC: u32 = 0x1257DADE; -// List of subdirectories inside pgdata. 
-// Copied from src/bin/initdb/initdb.c -pub const PGDATA_SUBDIRS: [&str; 22] = [ - "global", - "pg_wal/archive_status", - "pg_commit_ts", - "pg_dynshmem", - "pg_notify", - "pg_serial", - "pg_snapshots", - "pg_subtrans", - "pg_twophase", - "pg_multixact", - "pg_multixact/members", - "pg_multixact/offsets", - "base", - "base/1", - "pg_replslot", - "pg_tblspc", - "pg_stat", - "pg_stat_tmp", - "pg_xact", - "pg_logical", - "pg_logical/snapshots", - "pg_logical/mappings", -]; - // Don't include postgresql.conf as it is inconvenient on node start: // we need postgresql.conf before basebackup to synchronize safekeepers // so no point in overwriting it during backup restore. Rest of the files diff --git a/libs/postgres_ffi/src/pg_constants_v14.rs b/libs/postgres_ffi/src/pg_constants_v14.rs index 32f8f51114..fe01a5df7c 100644 --- a/libs/postgres_ffi/src/pg_constants_v14.rs +++ b/libs/postgres_ffi/src/pg_constants_v14.rs @@ -5,6 +5,33 @@ pub const BKPIMAGE_IS_COMPRESSED: u8 = 0x02; /* page image is compressed */ pub const BKPIMAGE_APPLY: u8 = 0x04; /* page image should be restored during replay */ pub const SIZEOF_RELMAPFILE: usize = 512; /* sizeof(RelMapFile) in relmapper.c */ +// List of subdirectories inside pgdata. 
+// Copied from src/bin/initdb/initdb.c +pub const PGDATA_SUBDIRS: [&str; 22] = [ + "global", + "pg_wal/archive_status", + "pg_commit_ts", + "pg_dynshmem", + "pg_notify", + "pg_serial", + "pg_snapshots", + "pg_subtrans", + "pg_twophase", + "pg_multixact", + "pg_multixact/members", + "pg_multixact/offsets", + "base", + "base/1", + "pg_replslot", + "pg_tblspc", + "pg_stat", + "pg_stat_tmp", + "pg_xact", + "pg_logical", + "pg_logical/snapshots", + "pg_logical/mappings", +]; + pub fn bkpimg_is_compressed(bimg_info: u8) -> bool { (bimg_info & BKPIMAGE_IS_COMPRESSED) != 0 } diff --git a/libs/postgres_ffi/src/pg_constants_v15.rs b/libs/postgres_ffi/src/pg_constants_v15.rs index 626a23c7ea..3cd1b7aec5 100644 --- a/libs/postgres_ffi/src/pg_constants_v15.rs +++ b/libs/postgres_ffi/src/pg_constants_v15.rs @@ -11,6 +11,8 @@ pub const BKPIMAGE_COMPRESS_ZSTD: u8 = 0x10; /* page image is compressed */ pub const SIZEOF_RELMAPFILE: usize = 512; /* sizeof(RelMapFile) in relmapper.c */ +pub use super::super::v14::bindings::PGDATA_SUBDIRS; + pub fn bkpimg_is_compressed(bimg_info: u8) -> bool { const ANY_COMPRESS_FLAG: u8 = BKPIMAGE_COMPRESS_PGLZ | BKPIMAGE_COMPRESS_LZ4 | BKPIMAGE_COMPRESS_ZSTD; diff --git a/libs/postgres_ffi/src/pg_constants_v16.rs b/libs/postgres_ffi/src/pg_constants_v16.rs index 587be71cb3..31bd5b68fd 100644 --- a/libs/postgres_ffi/src/pg_constants_v16.rs +++ b/libs/postgres_ffi/src/pg_constants_v16.rs @@ -11,6 +11,8 @@ pub const BKPIMAGE_COMPRESS_ZSTD: u8 = 0x10; /* page image is compressed */ pub const SIZEOF_RELMAPFILE: usize = 524; /* sizeof(RelMapFile) in relmapper.c */ +pub use super::super::v14::bindings::PGDATA_SUBDIRS; + pub fn bkpimg_is_compressed(bimg_info: u8) -> bool { const ANY_COMPRESS_FLAG: u8 = BKPIMAGE_COMPRESS_PGLZ | BKPIMAGE_COMPRESS_LZ4 | BKPIMAGE_COMPRESS_ZSTD; diff --git a/libs/postgres_ffi/src/pg_constants_v17.rs b/libs/postgres_ffi/src/pg_constants_v17.rs new file mode 100644 index 0000000000..2132938680 --- /dev/null +++ 
b/libs/postgres_ffi/src/pg_constants_v17.rs @@ -0,0 +1,55 @@ +pub const XACT_XINFO_HAS_DROPPED_STATS: u32 = 1u32 << 8; + +pub const XLOG_DBASE_CREATE_FILE_COPY: u8 = 0x00; +pub const XLOG_DBASE_CREATE_WAL_LOG: u8 = 0x10; +pub const XLOG_DBASE_DROP: u8 = 0x20; + +pub const BKPIMAGE_APPLY: u8 = 0x02; /* page image should be restored during replay */ +pub const BKPIMAGE_COMPRESS_PGLZ: u8 = 0x04; /* page image is compressed */ +pub const BKPIMAGE_COMPRESS_LZ4: u8 = 0x08; /* page image is compressed */ +pub const BKPIMAGE_COMPRESS_ZSTD: u8 = 0x10; /* page image is compressed */ + +pub const SIZEOF_RELMAPFILE: usize = 524; /* sizeof(RelMapFile) in relmapper.c */ + +// List of subdirectories inside pgdata. +// Copied from src/bin/initdb/initdb.c +pub const PGDATA_SUBDIRS: [&str; 23] = [ + "global", + "pg_wal/archive_status", + "pg_wal/summaries", + "pg_commit_ts", + "pg_dynshmem", + "pg_notify", + "pg_serial", + "pg_snapshots", + "pg_subtrans", + "pg_twophase", + "pg_multixact", + "pg_multixact/members", + "pg_multixact/offsets", + "base", + "base/1", + "pg_replslot", + "pg_tblspc", + "pg_stat", + "pg_stat_tmp", + "pg_xact", + "pg_logical", + "pg_logical/snapshots", + "pg_logical/mappings", +]; + +pub fn bkpimg_is_compressed(bimg_info: u8) -> bool { + const ANY_COMPRESS_FLAG: u8 = BKPIMAGE_COMPRESS_PGLZ | BKPIMAGE_COMPRESS_LZ4 | BKPIMAGE_COMPRESS_ZSTD; + + (bimg_info & ANY_COMPRESS_FLAG) != 0 +} + + +pub const XLOG_HEAP2_PRUNE_ON_ACCESS: u8 = 0x10; +pub const XLOG_HEAP2_PRUNE_VACUUM_SCAN: u8 = 0x20; +pub const XLOG_HEAP2_PRUNE_VACUUM_CLEANUP: u8 = 0x30; + + +pub const XLOG_OVERWRITE_CONTRECORD: u8 = 0xD0; +pub const XLOG_CHECKPOINT_REDO: u8 = 0xE0; diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index 9fe7e8198b..1873734753 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -30,7 +30,7 @@ use std::fs::File; use std::io::prelude::*; use std::io::ErrorKind; use std::io::SeekFrom; -use 
std::path::{Path, PathBuf}; +use std::path::Path; use std::time::SystemTime; use utils::bin_ser::DeserializeError; use utils::bin_ser::SerializeError; @@ -135,6 +135,8 @@ pub fn get_current_timestamp() -> TimestampTz { mod timestamp_conversions { use std::time::Duration; + use anyhow::Context; + use super::*; const UNIX_EPOCH_JDATE: u64 = 2440588; // == date2j(1970, 1, 1) @@ -154,18 +156,18 @@ mod timestamp_conversions { } } - pub fn from_pg_timestamp(time: TimestampTz) -> SystemTime { + pub fn try_from_pg_timestamp(time: TimestampTz) -> anyhow::Result { let time: u64 = time .try_into() - .expect("timestamp before millenium (postgres epoch)"); + .context("timestamp before millenium (postgres epoch)")?; let since_unix_epoch = time + SECS_DIFF_UNIX_TO_POSTGRES_EPOCH * USECS_PER_SEC; SystemTime::UNIX_EPOCH .checked_add(Duration::from_micros(since_unix_epoch)) - .expect("SystemTime overflow") + .context("SystemTime overflow") } } -pub use timestamp_conversions::{from_pg_timestamp, to_pg_timestamp}; +pub use timestamp_conversions::{to_pg_timestamp, try_from_pg_timestamp}; // Returns (aligned) end_lsn of the last record in data_dir with WAL segments. 
// start_lsn must point to some previously known record boundary (beginning of @@ -258,13 +260,6 @@ fn open_wal_segment(seg_file_path: &Path) -> anyhow::Result> { } } -pub fn main() { - let mut data_dir = PathBuf::new(); - data_dir.push("."); - let wal_end = find_end_of_wal(&data_dir, WAL_SEGMENT_SIZE, Lsn(0)).unwrap(); - println!("wal_end={:?}", wal_end); -} - impl XLogRecord { pub fn from_slice(buf: &[u8]) -> Result { use utils::bin_ser::LeSer; @@ -545,14 +540,14 @@ mod tests { #[test] fn test_ts_conversion() { let now = SystemTime::now(); - let round_trip = from_pg_timestamp(to_pg_timestamp(now)); + let round_trip = try_from_pg_timestamp(to_pg_timestamp(now)).unwrap(); let now_since = now.duration_since(SystemTime::UNIX_EPOCH).unwrap(); let round_trip_since = round_trip.duration_since(SystemTime::UNIX_EPOCH).unwrap(); assert_eq!(now_since.as_micros(), round_trip_since.as_micros()); let now_pg = get_current_timestamp(); - let round_trip_pg = to_pg_timestamp(from_pg_timestamp(now_pg)); + let round_trip_pg = to_pg_timestamp(try_from_pg_timestamp(now_pg).unwrap()); assert_eq!(now_pg, round_trip_pg); } diff --git a/libs/postgres_ffi/wal_craft/Cargo.toml b/libs/postgres_ffi/wal_craft/Cargo.toml index 29dd01a936..14c7d2e340 100644 --- a/libs/postgres_ffi/wal_craft/Cargo.toml +++ b/libs/postgres_ffi/wal_craft/Cargo.toml @@ -9,7 +9,6 @@ anyhow.workspace = true clap.workspace = true env_logger.workspace = true log.workspace = true -once_cell.workspace = true postgres.workspace = true postgres_ffi.workspace = true camino-tempfile.workspace = true diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs index 6052f04d11..949e3f4251 100644 --- a/libs/postgres_ffi/wal_craft/src/lib.rs +++ b/libs/postgres_ffi/wal_craft/src/lib.rs @@ -53,7 +53,7 @@ impl Conf { #[allow(clippy::manual_range_patterns)] match self.pg_version { - 14 | 15 | 16 => Ok(path.join(format!("v{}", self.pg_version))), + 14 | 15 | 16 | 17 => Ok(path.join(format!("v{}", 
self.pg_version))), _ => bail!("Unsupported postgres version: {}", self.pg_version), } } diff --git a/libs/pq_proto/Cargo.toml b/libs/pq_proto/Cargo.toml index 66bbe03ebc..9524a1490d 100644 --- a/libs/pq_proto/Cargo.toml +++ b/libs/pq_proto/Cargo.toml @@ -8,10 +8,8 @@ license.workspace = true bytes.workspace = true byteorder.workspace = true itertools.workspace = true -pin-project-lite.workspace = true postgres-protocol.workspace = true rand.workspace = true tokio = { workspace = true, features = ["io-util"] } -tracing.workspace = true thiserror.workspace = true serde.workspace = true diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index 02adee058f..f48f1801a4 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -13,14 +13,11 @@ aws-smithy-async.workspace = true aws-smithy-types.workspace = true aws-config.workspace = true aws-sdk-s3.workspace = true -aws-credential-types.workspace = true bytes.workspace = true camino = { workspace = true, features = ["serde1"] } -humantime.workspace = true humantime-serde.workspace = true hyper = { workspace = true, features = ["stream"] } futures.workspace = true -rand.workspace = true serde.workspace = true serde_json.workspace = true tokio = { workspace = true, features = ["sync", "fs", "io-util"] } diff --git a/libs/remote_storage/src/config.rs b/libs/remote_storage/src/config.rs index fa3f2cba58..d0e92411da 100644 --- a/libs/remote_storage/src/config.rs +++ b/libs/remote_storage/src/config.rs @@ -185,7 +185,7 @@ mod tests { use super::*; fn parse(input: &str) -> anyhow::Result { - let toml = input.parse::().unwrap(); + let toml = input.parse::().unwrap(); RemoteStorageConfig::from_toml(toml.as_item()) } @@ -235,6 +235,31 @@ timeout = '5s'"; ); } + #[test] + fn test_storage_class_serde_roundtrip() { + let classes = [ + None, + Some(StorageClass::Standard), + Some(StorageClass::IntelligentTiering), + ]; + for class in classes { + #[derive(Serialize, Deserialize)] + 
struct Wrapper { + #[serde( + deserialize_with = "deserialize_storage_class", + serialize_with = "serialize_storage_class" + )] + class: Option, + } + let wrapped = Wrapper { + class: class.clone(), + }; + let serialized = serde_json::to_string(&wrapped).unwrap(); + let deserialized: Wrapper = serde_json::from_str(&serialized).unwrap(); + assert_eq!(class, deserialized.class); + } + } + #[test] fn test_azure_parsing() { let toml = "\ diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index cc1d3e0ae4..b5b69c9faf 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -45,6 +45,8 @@ pub use azure_core::Etag; pub use error::{DownloadError, TimeTravelError, TimeoutOrCancel}; +/// Default concurrency limit for S3 operations +/// /// Currently, sync happens with AWS S3, that has two limits on requests per second: /// ~200 RPS for IAM services /// @@ -300,7 +302,9 @@ pub trait RemoteStorage: Send + Sync + 'static { ) -> Result<(), TimeTravelError>; } -/// DownloadStream is sensitive to the timeout and cancellation used with the original +/// Data part of an ongoing [`Download`]. +/// +/// `DownloadStream` is sensitive to the timeout and cancellation used with the original /// [`RemoteStorage::download`] request. The type yields `std::io::Result` to be compatible /// with `tokio::io::copy_buf`. 
// This has 'static because safekeepers do not use cancellation tokens (yet) diff --git a/libs/safekeeper_api/Cargo.toml b/libs/safekeeper_api/Cargo.toml index e1f4bcca46..14811232d3 100644 --- a/libs/safekeeper_api/Cargo.toml +++ b/libs/safekeeper_api/Cargo.toml @@ -6,6 +6,5 @@ license.workspace = true [dependencies] serde.workspace = true -serde_with.workspace = true const_format.workspace = true utils.workspace = true diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs index 2fbc333075..28666d197a 100644 --- a/libs/safekeeper_api/src/models.rs +++ b/libs/safekeeper_api/src/models.rs @@ -60,3 +60,16 @@ pub struct TimelineCopyRequest { pub target_timeline_id: TimelineId, pub until_lsn: Lsn, } + +#[derive(Debug, Clone, Deserialize, Serialize)] +pub struct TimelineTermBumpRequest { + /// bump to + pub term: Option, +} + +#[derive(Debug, Clone, Deserialize, Serialize)] +pub struct TimelineTermBumpResponse { + // before the request + pub previous_term: u64, + pub current_term: u64, +} diff --git a/libs/tenant_size_model/src/lib.rs b/libs/tenant_size_model/src/lib.rs index a3e12cf0e3..974a498404 100644 --- a/libs/tenant_size_model/src/lib.rs +++ b/libs/tenant_size_model/src/lib.rs @@ -5,9 +5,10 @@ mod calculation; pub mod svg; -/// StorageModel is the input to the synthetic size calculation. It represents -/// a tree of timelines, with just the information that's needed for the -/// calculation. This doesn't track timeline names or where each timeline +/// StorageModel is the input to the synthetic size calculation. +/// +/// It represents a tree of timelines, with just the information that's needed +/// for the calculation. This doesn't track timeline names or where each timeline /// begins and ends, for example. Instead, it consists of "points of interest" /// on the timelines. 
A point of interest could be the timeline start or end point, /// the oldest point on a timeline that needs to be retained because of PITR diff --git a/libs/tracing-utils/Cargo.toml b/libs/tracing-utils/Cargo.toml index 5ea8db6b42..05eb538d42 100644 --- a/libs/tracing-utils/Cargo.toml +++ b/libs/tracing-utils/Cargo.toml @@ -9,8 +9,9 @@ hyper.workspace = true opentelemetry = { workspace = true, features=["rt-tokio"] } opentelemetry-otlp = { workspace = true, default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] } opentelemetry-semantic-conventions.workspace = true -reqwest = { workspace = true, default-features = false, features = ["rustls-tls"] } tokio = { workspace = true, features = ["rt", "rt-multi-thread"] } tracing.workspace = true tracing-opentelemetry.workspace = true -tracing-subscriber.workspace = true + +[dev-dependencies] +tracing-subscriber.workspace = true # For examples in docs diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 6e593eeac1..f199b15554 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -14,7 +14,6 @@ testing = ["fail/failpoints"] arc-swap.workspace = true sentry.workspace = true async-compression.workspace = true -async-trait.workspace = true anyhow.workspace = true bincode.workspace = true bytes.workspace = true @@ -26,7 +25,6 @@ hyper = { workspace = true, features = ["full"] } fail.workspace = true futures = { workspace = true} jsonwebtoken.workspace = true -leaky-bucket.workspace = true nix.workspace = true once_cell.workspace = true pin-project-lite.workspace = true @@ -44,7 +42,6 @@ tracing.workspace = true tracing-error.workspace = true tracing-subscriber = { workspace = true, features = ["json", "registry"] } rand.workspace = true -serde_with.workspace = true strum.workspace = true strum_macros.workspace = true url.workspace = true diff --git a/libs/utils/src/accum.rs b/libs/utils/src/accum.rs deleted file mode 100644 index 0fb0190a92..0000000000 --- 
a/libs/utils/src/accum.rs +++ /dev/null @@ -1,33 +0,0 @@ -/// A helper to "accumulate" a value similar to `Iterator::reduce`, but lets you -/// feed the accumulated values by calling the 'accum' function, instead of having an -/// iterator. -/// -/// For example, to calculate the smallest value among some integers: -/// -/// ``` -/// use utils::accum::Accum; -/// -/// let values = [1, 2, 3]; -/// -/// let mut min_value: Accum = Accum(None); -/// for new_value in &values { -/// min_value.accum(std::cmp::min, *new_value); -/// } -/// -/// assert_eq!(min_value.0.unwrap(), 1); -/// ``` -pub struct Accum(pub Option); -impl Accum { - pub fn accum(&mut self, func: F, new_value: T) - where - F: FnOnce(T, T) -> T, - { - // If there is no previous value, just store the new value. - // Otherwise call the function to decide which one to keep. - self.0 = Some(if let Some(accum) = self.0 { - func(accum, new_value) - } else { - new_value - }); - } -} diff --git a/libs/utils/src/circuit_breaker.rs b/libs/utils/src/circuit_breaker.rs index 720ea39d4f..e1ddfd8650 100644 --- a/libs/utils/src/circuit_breaker.rs +++ b/libs/utils/src/circuit_breaker.rs @@ -5,8 +5,10 @@ use std::{ use metrics::IntCounter; -/// Circuit breakers are for operations that are expensive and fallible: if they fail repeatedly, -/// we will stop attempting them for some period of time, to avoid denial-of-service from retries, and +/// Circuit breakers are for operations that are expensive and fallible. +/// +/// If a circuit breaker fails repeatedly, we will stop attempting it for some +/// period of time, to avoid denial-of-service from retries, and /// to mitigate the log spam from repeated failures. 
pub struct CircuitBreaker { /// An identifier that enables us to log useful errors when a circuit is broken diff --git a/libs/utils/src/crashsafe.rs b/libs/utils/src/crashsafe.rs index 756b19138c..b97c6c7a45 100644 --- a/libs/utils/src/crashsafe.rs +++ b/libs/utils/src/crashsafe.rs @@ -1,3 +1,4 @@ +use std::os::fd::AsRawFd; use std::{ borrow::Cow, fs::{self, File}, @@ -203,6 +204,27 @@ pub fn overwrite( Ok(()) } +/// Syncs the filesystem for the given file descriptor. +#[cfg_attr(target_os = "macos", allow(unused_variables))] +pub fn syncfs(fd: impl AsRawFd) -> anyhow::Result<()> { + // Linux guarantees durability for syncfs. + // POSIX doesn't have syncfs, and further does not actually guarantee durability of sync(). + #[cfg(target_os = "linux")] + { + use anyhow::Context; + nix::unistd::syncfs(fd.as_raw_fd()).context("syncfs")?; + } + #[cfg(target_os = "macos")] + { + // macOS is not a production platform for Neon, don't even bother. + } + #[cfg(not(any(target_os = "linux", target_os = "macos")))] + { + compile_error!("Unsupported OS"); + } + Ok(()) +} + #[cfg(test)] mod tests { diff --git a/libs/utils/src/id.rs b/libs/utils/src/id.rs index db468e3054..eb91839504 100644 --- a/libs/utils/src/id.rs +++ b/libs/utils/src/id.rs @@ -88,12 +88,6 @@ impl<'de> Deserialize<'de> for Id { } impl Id { - pub fn get_from_buf(buf: &mut impl bytes::Buf) -> Id { - let mut arr = [0u8; 16]; - buf.copy_to_slice(&mut arr); - Id::from(arr) - } - pub fn from_slice(src: &[u8]) -> Result { if src.len() != 16 { return Err(IdError::SliceParseError(src.len())); @@ -179,10 +173,6 @@ impl fmt::Debug for Id { macro_rules! id_newtype { ($t:ident) => { impl $t { - pub fn get_from_buf(buf: &mut impl bytes::Buf) -> $t { - $t(Id::get_from_buf(buf)) - } - pub fn from_slice(src: &[u8]) -> Result<$t, IdError> { Ok($t(Id::from_slice(src)?)) } @@ -249,8 +239,10 @@ macro_rules! id_newtype { }; } -/// Neon timeline IDs are different from PostgreSQL timeline -/// IDs. 
They serve a similar purpose though: they differentiate +/// Neon timeline ID. +/// +/// They are different from PostgreSQL timeline +/// IDs, but serve a similar purpose: they differentiate /// between different "histories" of the same cluster. However, /// PostgreSQL timeline IDs are a bit cumbersome, because they are only /// 32-bits wide, and they must be in ascending order in any given diff --git a/libs/utils/src/leaky_bucket.rs b/libs/utils/src/leaky_bucket.rs new file mode 100644 index 0000000000..a120dc0ac5 --- /dev/null +++ b/libs/utils/src/leaky_bucket.rs @@ -0,0 +1,280 @@ +//! This module implements the Generic Cell Rate Algorithm for a simplified +//! version of the Leaky Bucket rate limiting system. +//! +//! # Leaky Bucket +//! +//! If the bucket is full, no new requests are allowed and are throttled/errored. +//! If the bucket is partially full/empty, new requests are added to the bucket in +//! terms of "tokens". +//! +//! Over time, tokens are removed from the bucket, naturally allowing new requests at a steady rate. +//! +//! The bucket size tunes the burst support. The drain rate tunes the steady-rate requests per second. +//! +//! # [GCRA](https://en.wikipedia.org/wiki/Generic_cell_rate_algorithm) +//! +//! GCRA is a continuous rate leaky-bucket impl that stores minimal state and requires +//! no background jobs to drain tokens, as the design utilises timestamps to drain automatically over time. +//! +//! We store an "empty_at" timestamp as the only state. As time progresses, we will naturally approach +//! the empty state. The full-bucket state is calculated from `empty_at - config.bucket_width`. +//! +//! Another explanation can be found here: + +use std::{sync::Mutex, time::Duration}; + +use tokio::{sync::Notify, time::Instant}; + +pub struct LeakyBucketConfig { + /// This is the "time cost" of a single request unit. + /// Should loosely represent how long it takes to handle a request unit in active resource time.
+ /// Loosely speaking this is the inverse of the steady-rate requests-per-second + pub cost: Duration, + + /// total size of the bucket + pub bucket_width: Duration, +} + +impl LeakyBucketConfig { + pub fn new(rps: f64, bucket_size: f64) -> Self { + let cost = Duration::from_secs_f64(rps.recip()); + let bucket_width = cost.mul_f64(bucket_size); + Self { cost, bucket_width } + } +} + +pub struct LeakyBucketState { + /// Bucket is represented by `allow_at..empty_at` where `allow_at = empty_at - config.bucket_width`. + /// + /// At any given time, `empty_at - now` represents the number of tokens in the bucket, multiplied by the "time_cost". + /// Adding `n` tokens to the bucket is done by moving `empty_at` forward by `n * config.cost`. + /// If `now < allow_at`, the bucket is considered filled and cannot accept any more tokens. + /// Draining the bucket will happen naturally as `now` moves forward. + /// + /// Let `n` be some "time cost" for the request, + /// If now is after empty_at, the bucket is empty and the empty_at is reset to now, + /// If now is within the `bucket window + n`, we are within time budget. + /// If now is before the `bucket window + n`, we have run out of budget. + /// + /// This is inspired by the generic cell rate algorithm (GCRA) and works + /// exactly the same as a leaky-bucket. + pub empty_at: Instant, +} + +impl LeakyBucketState { + pub fn with_initial_tokens(config: &LeakyBucketConfig, initial_tokens: f64) -> Self { + LeakyBucketState { + empty_at: Instant::now() + config.cost.mul_f64(initial_tokens), + } + } + + pub fn bucket_is_empty(&self, now: Instant) -> bool { + // if self.empty_at is after now, the bucket is not empty + self.empty_at <= now + } + + /// Immediately adds tokens to the bucket, if there is space. + /// + /// In a scenario where you are waiting for available rate, + /// rather than just erroring immediately, `started` corresponds to when this waiting started.
+ /// + /// `n` is the number of tokens that will be filled in the bucket. + /// + /// # Errors + /// + /// If there is not enough space, no tokens are added. Instead, an error is returned with the time when + /// there will be space again. + pub fn add_tokens( + &mut self, + config: &LeakyBucketConfig, + started: Instant, + n: f64, + ) -> Result<(), Instant> { + let now = Instant::now(); + + // invariant: started <= now + debug_assert!(started <= now); + + // If the bucket was empty when we started our search, + // we should update the `empty_at` value accordingly. + // this prevents us from having negative tokens in the bucket. + let mut empty_at = self.empty_at; + if empty_at < started { + empty_at = started; + } + + let n = config.cost.mul_f64(n); + let new_empty_at = empty_at + n; + let allow_at = new_empty_at.checked_sub(config.bucket_width); + + // empty_at + // allow_at | new_empty_at + // / | / + // -------o-[---------o-|--]--------- + // now1 ^ now2 ^ + // + // at now1, the bucket would be completely filled if we add n tokens. + // at now2, the bucket would be partially filled if we add n tokens. + + match allow_at { + Some(allow_at) if now < allow_at => Err(allow_at), + _ => { + self.empty_at = new_empty_at; + Ok(()) + } + } + } +} + +pub struct RateLimiter { + pub config: LeakyBucketConfig, + pub state: Mutex, + /// a queue to provide this fair ordering. 
+ pub queue: Notify, +} + +struct Requeue<'a>(&'a Notify); + +impl Drop for Requeue<'_> { + fn drop(&mut self) { + self.0.notify_one(); + } +} + +impl RateLimiter { + pub fn with_initial_tokens(config: LeakyBucketConfig, initial_tokens: f64) -> Self { + RateLimiter { + state: Mutex::new(LeakyBucketState::with_initial_tokens( + &config, + initial_tokens, + )), + config, + queue: { + let queue = Notify::new(); + queue.notify_one(); + queue + }, + } + } + + pub fn steady_rps(&self) -> f64 { + self.config.cost.as_secs_f64().recip() + } + + /// returns true if we did throttle + pub async fn acquire(&self, count: usize) -> bool { + let mut throttled = false; + + let start = tokio::time::Instant::now(); + + // wait until we are the first in the queue + let mut notified = std::pin::pin!(self.queue.notified()); + if !notified.as_mut().enable() { + throttled = true; + notified.await; + } + + // notify the next waiter in the queue when we are done. + let _guard = Requeue(&self.queue); + + loop { + let res = self + .state + .lock() + .unwrap() + .add_tokens(&self.config, start, count as f64); + match res { + Ok(()) => return throttled, + Err(ready_at) => { + throttled = true; + tokio::time::sleep_until(ready_at).await; + } + } + } + } +} + +#[cfg(test)] +mod tests { + use std::time::Duration; + + use tokio::time::Instant; + + use super::{LeakyBucketConfig, LeakyBucketState}; + + #[tokio::test(start_paused = true)] + async fn check() { + let config = LeakyBucketConfig { + // average 100rps + cost: Duration::from_millis(10), + // burst up to 100 requests + bucket_width: Duration::from_millis(1000), + }; + + let mut state = LeakyBucketState { + empty_at: Instant::now(), + }; + + // supports burst + { + // should work for 100 requests this instant + for _ in 0..100 { + state.add_tokens(&config, Instant::now(), 1.0).unwrap(); + } + let ready = state.add_tokens(&config, Instant::now(), 1.0).unwrap_err(); + assert_eq!(ready - Instant::now(), Duration::from_millis(10)); + } + + // 
doesn't overfill + { + // after 1s we should have an empty bucket again. + tokio::time::advance(Duration::from_secs(1)).await; + assert!(state.bucket_is_empty(Instant::now())); + + // after 1s more, we should not over count the tokens and allow more than 200 requests. + tokio::time::advance(Duration::from_secs(1)).await; + for _ in 0..100 { + state.add_tokens(&config, Instant::now(), 1.0).unwrap(); + } + let ready = state.add_tokens(&config, Instant::now(), 1.0).unwrap_err(); + assert_eq!(ready - Instant::now(), Duration::from_millis(10)); + } + + // supports sustained rate over a long period + { + tokio::time::advance(Duration::from_secs(1)).await; + + // should sustain 100rps + for _ in 0..2000 { + tokio::time::advance(Duration::from_millis(10)).await; + state.add_tokens(&config, Instant::now(), 1.0).unwrap(); + } + } + + // supports requesting more tokens than can be stored in the bucket + // we just wait a little bit longer upfront. + { + // start the bucket completely empty + tokio::time::advance(Duration::from_secs(5)).await; + assert!(state.bucket_is_empty(Instant::now())); + + // requesting 200 tokens of space should take 200*cost = 2s + // but we already have 1s available, so we wait 1s from start. 
+ let start = Instant::now(); + + let ready = state.add_tokens(&config, start, 200.0).unwrap_err(); + assert_eq!(ready - Instant::now(), Duration::from_secs(1)); + + tokio::time::advance(Duration::from_millis(500)).await; + let ready = state.add_tokens(&config, start, 200.0).unwrap_err(); + assert_eq!(ready - Instant::now(), Duration::from_millis(500)); + + tokio::time::advance(Duration::from_millis(500)).await; + state.add_tokens(&config, start, 200.0).unwrap(); + + // bucket should be completely full now + let ready = state.add_tokens(&config, Instant::now(), 1.0).unwrap_err(); + assert_eq!(ready - Instant::now(), Duration::from_millis(10)); + } + } +} diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index f4fc0ba57b..03fb36caf8 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -43,16 +43,9 @@ pub mod logging; pub mod lock_file; pub mod pid_file; -// Misc -pub mod accum; -pub mod shutdown; - // Utility for binding TcpListeners with proper socket options. pub mod tcp_listener; -// Utility for putting a raw file descriptor into non-blocking mode -pub mod nonblock; - // Default signal handling pub mod sentry_init; pub mod signals; @@ -71,6 +64,7 @@ pub mod postgres_client; pub mod tracing_span_assert; +pub mod leaky_bucket; pub mod rate_limit; /// Simple once-barrier and a guard which keeps barrier awaiting. diff --git a/libs/utils/src/lock_file.rs b/libs/utils/src/lock_file.rs index 59c66ca757..3a2ed3e830 100644 --- a/libs/utils/src/lock_file.rs +++ b/libs/utils/src/lock_file.rs @@ -100,7 +100,9 @@ pub enum LockFileRead { } /// Open & try to lock the lock file at the given `path`, returning a [handle][`LockFileRead`] to -/// inspect its content. It is not an `Err(...)` if the file does not exist or is already locked. +/// inspect its content. +/// +/// It is not an `Err(...)` if the file does not exist or is already locked. /// Check the [`LockFileRead`] variants for details. 
pub fn read_and_hold_lock_file(path: &Utf8Path) -> anyhow::Result { let res = fs::OpenOptions::new().read(true).open(path); diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs index f7b73dc984..e205d60d74 100644 --- a/libs/utils/src/logging.rs +++ b/libs/utils/src/logging.rs @@ -3,9 +3,9 @@ use std::str::FromStr; use anyhow::Context; use metrics::{IntCounter, IntCounterVec}; use once_cell::sync::Lazy; -use strum_macros::{EnumString, EnumVariantNames}; +use strum_macros::{EnumString, VariantNames}; -#[derive(EnumString, EnumVariantNames, Eq, PartialEq, Debug, Clone, Copy)] +#[derive(EnumString, strum_macros::Display, VariantNames, Eq, PartialEq, Debug, Clone, Copy)] #[strum(serialize_all = "snake_case")] pub enum LogFormat { Plain, @@ -188,7 +188,7 @@ impl Drop for TracingPanicHookGuard { } /// Named symbol for our panic hook, which logs the panic. -fn tracing_panic_hook(info: &std::panic::PanicInfo) { +fn tracing_panic_hook(info: &std::panic::PanicHookInfo) { // following rust 1.66.1 std implementation: // https://github.com/rust-lang/rust/blob/90743e7298aca107ddaa0c202a4d3604e29bfeb6/library/std/src/panicking.rs#L235-L288 let location = info.location(); @@ -274,6 +274,14 @@ impl From for SecretString { } } +impl FromStr for SecretString { + type Err = std::convert::Infallible; + + fn from_str(s: &str) -> Result { + Ok(Self(s.to_string())) + } +} + impl std::fmt::Debug for SecretString { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "[SECRET]") diff --git a/libs/utils/src/lsn.rs b/libs/utils/src/lsn.rs index 1aebe91428..06d5c27ebf 100644 --- a/libs/utils/src/lsn.rs +++ b/libs/utils/src/lsn.rs @@ -1,6 +1,5 @@ #![warn(missing_docs)] -use camino::Utf8Path; use serde::{de::Visitor, Deserialize, Serialize}; use std::fmt; use std::ops::{Add, AddAssign}; @@ -145,14 +144,6 @@ impl Lsn { i128::from(self.0) - i128::from(other) } - /// Parse an LSN from a filename in the form `0000000000000000` - pub fn from_filename(filename: 
F) -> Result - where - F: AsRef, - { - Lsn::from_hex(filename.as_ref().as_str()) - } - /// Parse an LSN from a string in the form `0000000000000000` pub fn from_hex(s: S) -> Result where diff --git a/libs/utils/src/nonblock.rs b/libs/utils/src/nonblock.rs deleted file mode 100644 index 05e2e3af4c..0000000000 --- a/libs/utils/src/nonblock.rs +++ /dev/null @@ -1,17 +0,0 @@ -use nix::fcntl::{fcntl, OFlag, F_GETFL, F_SETFL}; -use std::os::unix::io::RawFd; - -/// Put a file descriptor into non-blocking mode -pub fn set_nonblock(fd: RawFd) -> Result<(), std::io::Error> { - let bits = fcntl(fd, F_GETFL)?; - - // If F_GETFL returns some unknown bits, they should be valid - // for passing back to F_SETFL, too. If we left them out, the F_SETFL - // would effectively clear them, which is not what we want. - let mut flags = OFlag::from_bits_retain(bits); - flags |= OFlag::O_NONBLOCK; - - fcntl(fd, F_SETFL(flags))?; - - Ok(()) -} diff --git a/libs/utils/src/pageserver_feedback.rs b/libs/utils/src/pageserver_feedback.rs index 3ddfa44f41..dede65e699 100644 --- a/libs/utils/src/pageserver_feedback.rs +++ b/libs/utils/src/pageserver_feedback.rs @@ -8,6 +8,7 @@ use tracing::{trace, warn}; use crate::lsn::Lsn; /// Feedback pageserver sends to safekeeper and safekeeper resends to compute. +/// /// Serialized in custom flexible key/value format. In replication protocol, it /// is marked with NEON_STATUS_UPDATE_TAG_BYTE to differentiate from postgres /// Standby status update / Hot standby feedback messages. diff --git a/libs/utils/src/poison.rs b/libs/utils/src/poison.rs index 27378c69fc..c3e2fba20c 100644 --- a/libs/utils/src/poison.rs +++ b/libs/utils/src/poison.rs @@ -65,6 +65,8 @@ impl Poison { } } +/// Armed pointer to a [`Poison`]. +/// /// Use [`Self::data`] and [`Self::data_mut`] to access the wrapped state. /// Once modifications are done, use [`Self::disarm`]. 
/// If [`Guard`] gets dropped instead of calling [`Self::disarm`], the state is poisoned diff --git a/libs/utils/src/rate_limit.rs b/libs/utils/src/rate_limit.rs index 557955bb88..f3f8f219e3 100644 --- a/libs/utils/src/rate_limit.rs +++ b/libs/utils/src/rate_limit.rs @@ -5,6 +5,15 @@ use std::time::{Duration, Instant}; pub struct RateLimit { last: Option, interval: Duration, + dropped: u64, +} + +pub struct RateLimitStats(u64); + +impl std::fmt::Display for RateLimitStats { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "{} dropped calls", self.0) + } } impl RateLimit { @@ -12,20 +21,27 @@ impl RateLimit { Self { last: None, interval, + dropped: 0, } } /// Call `f` if the rate limit allows. /// Don't call it otherwise. pub fn call(&mut self, f: F) { + self.call2(|_| f()) + } + + pub fn call2(&mut self, f: F) { let now = Instant::now(); match self.last { Some(last) if now - last <= self.interval => { // ratelimit + self.dropped += 1; } _ => { self.last = Some(now); - f(); + f(RateLimitStats(self.dropped)); + self.dropped = 0; } } } diff --git a/libs/utils/src/shard.rs b/libs/utils/src/shard.rs index f6b430657e..d146010b41 100644 --- a/libs/utils/src/shard.rs +++ b/libs/utils/src/shard.rs @@ -13,10 +13,11 @@ pub struct ShardNumber(pub u8); #[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)] pub struct ShardCount(pub u8); -/// Combination of ShardNumber and ShardCount. For use within the context of a particular tenant, -/// when we need to know which shard we're dealing with, but do not need to know the full -/// ShardIdentity (because we won't be doing any page->shard mapping), and do not need to know -/// the fully qualified TenantShardId. +/// Combination of ShardNumber and ShardCount. 
+/// +/// For use within the context of a particular tenant, when we need to know which shard we're +/// dealing with, but do not need to know the full ShardIdentity (because we won't be doing +/// any page->shard mapping), and do not need to know the fully qualified TenantShardId. #[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)] pub struct ShardIndex { pub shard_number: ShardNumber, diff --git a/libs/utils/src/shutdown.rs b/libs/utils/src/shutdown.rs deleted file mode 100644 index cb5a44d664..0000000000 --- a/libs/utils/src/shutdown.rs +++ /dev/null @@ -1,7 +0,0 @@ -/// Immediately terminate the calling process without calling -/// atexit callbacks, C runtime destructors etc. We mainly use -/// this to protect coverage data from concurrent writes. -pub fn exit_now(code: u8) -> ! { - // SAFETY: exiting is safe, the ffi is not safe - unsafe { nix::libc::_exit(code as _) }; -} diff --git a/libs/utils/src/simple_rcu.rs b/libs/utils/src/simple_rcu.rs index ecc5353be3..01750b2aef 100644 --- a/libs/utils/src/simple_rcu.rs +++ b/libs/utils/src/simple_rcu.rs @@ -49,12 +49,11 @@ use std::sync::{RwLock, RwLockWriteGuard}; use tokio::sync::watch; -/// /// Rcu allows multiple readers to read and hold onto a value without blocking -/// (for very long). Storing to the Rcu updates the value, making new readers -/// immediately see the new value, but it also waits for all current readers to -/// finish. +/// (for very long). /// +/// Storing to the Rcu updates the value, making new readers immediately see +/// the new value, but it also waits for all current readers to finish. 
pub struct Rcu { inner: RwLock>, } diff --git a/libs/utils/src/sync/heavier_once_cell.rs b/libs/utils/src/sync/heavier_once_cell.rs index 1abd3d9861..dc711fb028 100644 --- a/libs/utils/src/sync/heavier_once_cell.rs +++ b/libs/utils/src/sync/heavier_once_cell.rs @@ -5,7 +5,9 @@ use std::sync::{ use tokio::sync::Semaphore; /// Custom design like [`tokio::sync::OnceCell`] but using [`OwnedSemaphorePermit`] instead of -/// `SemaphorePermit`, allowing use of `take` which does not require holding an outer mutex guard +/// `SemaphorePermit`. +/// +/// Allows use of `take` which does not require holding an outer mutex guard /// for the duration of initialization. /// /// Has no unsafe, builds upon [`tokio::sync::Semaphore`] and [`std::sync::Mutex`]. diff --git a/libs/utils/src/toml_edit_ext.rs b/libs/utils/src/toml_edit_ext.rs index ab5f7bdd95..1359e27b77 100644 --- a/libs/utils/src/toml_edit_ext.rs +++ b/libs/utils/src/toml_edit_ext.rs @@ -10,7 +10,7 @@ pub fn deserialize_item(item: &toml_edit::Item) -> Result where T: serde::de::DeserializeOwned, { - let document: toml_edit::Document = match item { + let document: toml_edit::DocumentMut = match item { toml_edit::Item::Table(toml) => toml.clone().into(), toml_edit::Item::Value(toml_edit::Value::InlineTable(toml)) => { toml.clone().into_table().into() diff --git a/libs/utils/src/vec_map.rs b/libs/utils/src/vec_map.rs index 18b2af14f1..5f0028bacd 100644 --- a/libs/utils/src/vec_map.rs +++ b/libs/utils/src/vec_map.rs @@ -7,6 +7,7 @@ pub enum VecMapOrdering { } /// Ordered map datastructure implemented in a Vec. +/// /// Append only - can only add keys that are larger than the /// current max key. 
/// Ordering can be adjusted using [`VecMapOrdering`] diff --git a/libs/utils/src/yielding_loop.rs b/libs/utils/src/yielding_loop.rs index 963279eb4c..68274f0631 100644 --- a/libs/utils/src/yielding_loop.rs +++ b/libs/utils/src/yielding_loop.rs @@ -6,9 +6,10 @@ pub enum YieldingLoopError { Cancelled, } -/// Helper for long synchronous loops, e.g. over all tenants in the system. Periodically -/// yields to avoid blocking the executor, and after resuming checks the provided -/// cancellation token to drop out promptly on shutdown. +/// Helper for long synchronous loops, e.g. over all tenants in the system. +/// +/// Periodically yields to avoid blocking the executor, and after resuming +/// checks the provided cancellation token to drop out promptly on shutdown. #[inline(always)] pub async fn yielding_loop( interval: usize, @@ -23,7 +24,7 @@ where for (i, item) in iter.enumerate() { visitor(item); - if i + 1 % interval == 0 { + if (i + 1) % interval == 0 { tokio::task::yield_now().await; if cancel.is_cancelled() { return Err(YieldingLoopError::Cancelled); diff --git a/libs/vm_monitor/Cargo.toml b/libs/vm_monitor/Cargo.toml index 46e9f880a1..ba73902d38 100644 --- a/libs/vm_monitor/Cargo.toml +++ b/libs/vm_monitor/Cargo.toml @@ -15,13 +15,11 @@ anyhow.workspace = true axum.workspace = true clap.workspace = true futures.workspace = true -inotify.workspace = true serde.workspace = true serde_json.workspace = true sysinfo.workspace = true tokio = { workspace = true, features = ["rt-multi-thread"] } tokio-postgres.workspace = true -tokio-stream.workspace = true tokio-util.workspace = true tracing.workspace = true tracing-subscriber.workspace = true diff --git a/libs/walproposer/build.rs b/libs/walproposer/build.rs index 3126b170a4..3f549889b8 100644 --- a/libs/walproposer/build.rs +++ b/libs/walproposer/build.rs @@ -4,7 +4,8 @@ use std::{env, path::PathBuf, process::Command}; use anyhow::{anyhow, Context}; -use bindgen::CargoCallbacks; + +const WALPROPOSER_PG_VERSION: &str 
= "v17"; fn main() -> anyhow::Result<()> { // Tell cargo to invalidate the built crate whenever the wrapper changes @@ -37,7 +38,10 @@ fn main() -> anyhow::Result<()> { // Rebuild crate when libwalproposer.a changes println!("cargo:rerun-if-changed={walproposer_lib_search_str}/libwalproposer.a"); - let pg_config_bin = pg_install_abs.join("v16").join("bin").join("pg_config"); + let pg_config_bin = pg_install_abs + .join(WALPROPOSER_PG_VERSION) + .join("bin") + .join("pg_config"); let inc_server_path: String = if pg_config_bin.exists() { let output = Command::new(pg_config_bin) .arg("--includedir-server") @@ -54,7 +58,7 @@ fn main() -> anyhow::Result<()> { .into() } else { let server_path = pg_install_abs - .join("v16") + .join(WALPROPOSER_PG_VERSION) .join("include") .join("postgresql") .join("server") @@ -64,16 +68,25 @@ fn main() -> anyhow::Result<()> { .map_err(|s| anyhow!("Bad postgres server path {s:?}"))? }; + let unwind_abi_functions = [ + "log_internal", + "recovery_download", + "start_streaming", + "finish_sync_safekeepers", + "wait_event_set", + "WalProposerStart", + ]; + // The bindgen::Builder is the main entry point // to bindgen, and lets you build up options for // the resulting bindings. - let bindings = bindgen::Builder::default() + let mut builder = bindgen::Builder::default() // The input header we would like to generate // bindings for. .header("bindgen_deps.h") // Tell cargo to invalidate the built crate whenever any of the // included header files changed. 
- .parse_callbacks(Box::new(CargoCallbacks)) + .parse_callbacks(Box::new(bindgen::CargoCallbacks::new())) .allowlist_type("WalProposer") .allowlist_type("WalProposerConfig") .allowlist_type("walproposer_api") @@ -95,6 +108,7 @@ fn main() -> anyhow::Result<()> { .allowlist_var("ERROR") .allowlist_var("FATAL") .allowlist_var("PANIC") + .allowlist_var("PG_VERSION_NUM") .allowlist_var("WPEVENT") .allowlist_var("WL_LATCH_SET") .allowlist_var("WL_SOCKET_READABLE") @@ -104,7 +118,12 @@ fn main() -> anyhow::Result<()> { .allowlist_var("WL_SOCKET_MASK") .clang_arg("-DWALPROPOSER_LIB") .clang_arg(format!("-I{pgxn_neon}")) - .clang_arg(format!("-I{inc_server_path}")) + .clang_arg(format!("-I{inc_server_path}")); + + for name in unwind_abi_functions { + builder = builder.override_abi(bindgen::Abi::CUnwind, name); + } + let bindings = builder // Finish the builder and generate the bindings. .generate() // Unwrap the Result and panic on failure. diff --git a/libs/walproposer/src/api_bindings.rs b/libs/walproposer/src/api_bindings.rs index bbc3663402..2fbea3fe45 100644 --- a/libs/walproposer/src/api_bindings.rs +++ b/libs/walproposer/src/api_bindings.rs @@ -33,7 +33,7 @@ extern "C" fn get_shmem_state(wp: *mut WalProposer) -> *mut WalproposerShmemStat } } -extern "C" fn start_streaming(wp: *mut WalProposer, startpos: XLogRecPtr) { +extern "C-unwind" fn start_streaming(wp: *mut WalProposer, startpos: XLogRecPtr) { unsafe { let callback_data = (*(*wp).config).callback_data; let api = callback_data as *mut Box; @@ -187,7 +187,7 @@ extern "C" fn conn_blocking_write( } } -extern "C" fn recovery_download(wp: *mut WalProposer, sk: *mut Safekeeper) -> bool { +extern "C-unwind" fn recovery_download(wp: *mut WalProposer, sk: *mut Safekeeper) -> bool { unsafe { let callback_data = (*(*(*sk).wp).config).callback_data; let api = callback_data as *mut Box; @@ -272,7 +272,7 @@ extern "C" fn rm_safekeeper_event_set(sk: *mut Safekeeper) { } } -extern "C" fn wait_event_set( +extern "C-unwind" fn 
wait_event_set( wp: *mut WalProposer, timeout: ::std::os::raw::c_long, event_sk: *mut *mut Safekeeper, @@ -324,7 +324,7 @@ extern "C" fn get_redo_start_lsn(wp: *mut WalProposer) -> XLogRecPtr { } } -extern "C" fn finish_sync_safekeepers(wp: *mut WalProposer, lsn: XLogRecPtr) { +extern "C-unwind" fn finish_sync_safekeepers(wp: *mut WalProposer, lsn: XLogRecPtr) { unsafe { let callback_data = (*(*wp).config).callback_data; let api = callback_data as *mut Box; @@ -340,7 +340,7 @@ extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer, sk: *mut Safekee } } -extern "C" fn log_internal( +extern "C-unwind" fn log_internal( wp: *mut WalProposer, level: ::std::os::raw::c_int, line: *const ::std::os::raw::c_char, diff --git a/libs/walproposer/src/walproposer.rs b/libs/walproposer/src/walproposer.rs index 37b1e0fa87..ba75171db2 100644 --- a/libs/walproposer/src/walproposer.rs +++ b/libs/walproposer/src/walproposer.rs @@ -282,7 +282,11 @@ mod tests { use std::cell::UnsafeCell; use utils::id::TenantTimelineId; - use crate::{api_bindings::Level, bindings::NeonWALReadResult, walproposer::Wrapper}; + use crate::{ + api_bindings::Level, + bindings::{NeonWALReadResult, PG_VERSION_NUM}, + walproposer::Wrapper, + }; use super::ApiImpl; @@ -489,41 +493,79 @@ mod tests { let (sender, receiver) = sync_channel(1); + // Messages definitions are at walproposer.h + // xxx: it would be better to extract them from safekeeper crate and + // use serialization/deserialization here. 
+ let greeting_tag = (b'g' as u64).to_ne_bytes(); + let proto_version = 2_u32.to_ne_bytes(); + let pg_version: [u8; 4] = PG_VERSION_NUM.to_ne_bytes(); + let proposer_id = [0; 16]; + let system_id = 0_u64.to_ne_bytes(); + let tenant_id = ttid.tenant_id.as_arr(); + let timeline_id = ttid.timeline_id.as_arr(); + let pg_tli = 1_u32.to_ne_bytes(); + let wal_seg_size = 16777216_u32.to_ne_bytes(); + let proposer_greeting = [ + greeting_tag.as_slice(), + proto_version.as_slice(), + pg_version.as_slice(), + proposer_id.as_slice(), + system_id.as_slice(), + tenant_id.as_slice(), + timeline_id.as_slice(), + pg_tli.as_slice(), + wal_seg_size.as_slice(), + ] + .concat(); + + let voting_tag = (b'v' as u64).to_ne_bytes(); + let vote_request_term = 3_u64.to_ne_bytes(); + let proposer_id = [0; 16]; + let vote_request = [ + voting_tag.as_slice(), + vote_request_term.as_slice(), + proposer_id.as_slice(), + ] + .concat(); + + let acceptor_greeting_term = 2_u64.to_ne_bytes(); + let acceptor_greeting_node_id = 1_u64.to_ne_bytes(); + let acceptor_greeting = [ + greeting_tag.as_slice(), + acceptor_greeting_term.as_slice(), + acceptor_greeting_node_id.as_slice(), + ] + .concat(); + + let vote_response_term = 3_u64.to_ne_bytes(); + let vote_given = 1_u64.to_ne_bytes(); + let flush_lsn = 0x539_u64.to_ne_bytes(); + let truncate_lsn = 0x539_u64.to_ne_bytes(); + let th_len = 1_u32.to_ne_bytes(); + let th_term = 2_u64.to_ne_bytes(); + let th_lsn = 0x539_u64.to_ne_bytes(); + let timeline_start_lsn = 0x539_u64.to_ne_bytes(); + let vote_response = [ + voting_tag.as_slice(), + vote_response_term.as_slice(), + vote_given.as_slice(), + flush_lsn.as_slice(), + truncate_lsn.as_slice(), + th_len.as_slice(), + th_term.as_slice(), + th_lsn.as_slice(), + timeline_start_lsn.as_slice(), + ] + .concat(); + let my_impl: Box = Box::new(MockImpl { wait_events: Cell::new(WaitEventsData { sk: std::ptr::null_mut(), event_mask: 0, }), - expected_messages: vec![ - // TODO: When updating Postgres versions, this test 
will cause - // problems. Postgres version in message needs updating. - // - // Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160003, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 }) - vec![ - 103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 3, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 158, 76, 143, 54, 6, 60, 108, 110, - 147, 188, 32, 214, 90, 130, 15, 61, 158, 76, 143, 54, 6, 60, 108, 110, 147, - 188, 32, 214, 90, 130, 15, 61, 1, 0, 0, 0, 0, 0, 0, 1, - ], - // VoteRequest(VoteRequest { term: 3 }) - vec![ - 118, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, - ], - ], + expected_messages: vec![proposer_greeting, vote_request], expected_ptr: AtomicUsize::new(0), - safekeeper_replies: vec![ - // Greeting(AcceptorGreeting { term: 2, node_id: NodeId(1) }) - vec![ - 103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, - ], - // VoteResponse(VoteResponse { term: 3, vote_given: 1, flush_lsn: 0/539, truncate_lsn: 0/539, term_history: [(2, 0/539)], timeline_start_lsn: 0/539 }) - vec![ - 118, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 57, - 5, 0, 0, 0, 0, 0, 0, 57, 5, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, - 0, 57, 5, 0, 0, 0, 0, 0, 0, 57, 5, 0, 0, 0, 0, 0, 0, - ], - ], + safekeeper_replies: vec![acceptor_greeting, vote_response], replies_ptr: AtomicUsize::new(0), sync_channel: sender, shmem: UnsafeCell::new(crate::api_bindings::empty_shmem()), diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 0e748ee3db..0eb48d6823 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -8,26 +8,23 @@ license.workspace = true default = [] # Enables test-only APIs, incuding failpoints. 
In particular, enables the `fail_point!` macro, # which adds some runtime cost to run tests on outage conditions -testing = ["fail/failpoints"] +testing = ["fail/failpoints", "pageserver_api/testing" ] [dependencies] anyhow.workspace = true arc-swap.workspace = true async-compression.workspace = true async-stream.workspace = true -async-trait.workspace = true +bit_field.workspace = true byteorder.workspace = true bytes.workspace = true camino.workspace = true camino-tempfile.workspace = true chrono = { workspace = true, features = ["serde"] } clap = { workspace = true, features = ["string"] } -const_format.workspace = true consumption_metrics.workspace = true crc32c.workspace = true -crossbeam-utils.workspace = true either.workspace = true -flate2.workspace = true fail.workspace = true futures.workspace = true git-version.workspace = true @@ -36,7 +33,6 @@ humantime.workspace = true humantime-serde.workspace = true hyper.workspace = true itertools.workspace = true -leaky-bucket.workspace = true md5.workspace = true nix.workspace = true # hack to get the number of worker threads tokio uses @@ -52,14 +48,11 @@ rand.workspace = true range-set-blaze = { version = "0.1.16", features = ["alloc"] } regex.workspace = true scopeguard.workspace = true +send-future.workspace = true serde.workspace = true serde_json = { workspace = true, features = ["raw_value"] } serde_path_to_error.workspace = true serde_with.workspace = true -signal-hook.workspace = true -smallvec = { workspace = true, features = ["write"] } -svg_fmt.workspace = true -sync_wrapper.workspace = true sysinfo.workspace = true tokio-tar.workspace = true thiserror.workspace = true @@ -72,7 +65,6 @@ tokio-stream.workspace = true tokio-util.workspace = true toml_edit = { workspace = true, features = [ "serde" ] } tracing.workspace = true -twox-hash.workspace = true url.workspace = true walkdir.workspace = true metrics.workspace = true @@ -100,6 +92,7 @@ procfs.workspace = true criterion.workspace = true 
hex-literal.workspace = true tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] } +indoc.workspace = true [[bench]] name = "bench_layer_map" diff --git a/pageserver/benches/bench_ingest.rs b/pageserver/benches/bench_ingest.rs index 0336302de0..72cbb6beab 100644 --- a/pageserver/benches/bench_ingest.rs +++ b/pageserver/benches/bench_ingest.rs @@ -10,6 +10,7 @@ use pageserver::{ page_cache, repository::Value, task_mgr::TaskKind, + tenant::storage_layer::inmemory_layer::SerializedBatch, tenant::storage_layer::InMemoryLayer, virtual_file, }; @@ -67,12 +68,16 @@ async fn ingest( let layer = InMemoryLayer::create(conf, timeline_id, tenant_shard_id, lsn, entered, &ctx).await?; - let data = Value::Image(Bytes::from(vec![0u8; put_size])).ser()?; + let data = Value::Image(Bytes::from(vec![0u8; put_size])); + let data_ser_size = data.serialized_size().unwrap() as usize; let ctx = RequestContext::new( pageserver::task_mgr::TaskKind::WalReceiverConnectionHandler, pageserver::context::DownloadBehavior::Download, ); + const BATCH_SIZE: usize = 16; + let mut batch = Vec::new(); + for i in 0..put_count { lsn += put_size as u64; @@ -95,7 +100,17 @@ async fn ingest( } } - layer.put_value(key.to_compact(), lsn, &data, &ctx).await?; + batch.push((key.to_compact(), lsn, data_ser_size, data.clone())); + if batch.len() >= BATCH_SIZE { + let this_batch = std::mem::take(&mut batch); + let serialized = SerializedBatch::from_values(this_batch).unwrap(); + layer.put_batch(serialized, &ctx).await?; + } + } + if !batch.is_empty() { + let this_batch = std::mem::take(&mut batch); + let serialized = SerializedBatch::from_values(this_batch).unwrap(); + layer.put_batch(serialized, &ctx).await?; } layer.freeze(lsn + 1).await; @@ -149,7 +164,11 @@ fn criterion_benchmark(c: &mut Criterion) { let conf: &'static PageServerConf = Box::leak(Box::new( pageserver::config::PageServerConf::dummy_conf(temp_dir.path().to_path_buf()), )); - 
virtual_file::init(16384, virtual_file::io_engine_for_bench()); + virtual_file::init( + 16384, + virtual_file::io_engine_for_bench(), + pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT, + ); page_cache::init(conf.page_cache_size); { diff --git a/pageserver/client/Cargo.toml b/pageserver/client/Cargo.toml index a938367334..d9b36bf3d4 100644 --- a/pageserver/client/Cargo.toml +++ b/pageserver/client/Cargo.toml @@ -7,7 +7,6 @@ license.workspace = true [dependencies] pageserver_api.workspace = true thiserror.workspace = true -async-trait.workspace = true reqwest = { workspace = true, features = [ "stream" ] } utils.workspace = true serde.workspace = true diff --git a/pageserver/client/src/lib.rs b/pageserver/client/src/lib.rs index 4a3f4dea47..cc8db37173 100644 --- a/pageserver/client/src/lib.rs +++ b/pageserver/client/src/lib.rs @@ -1,2 +1,20 @@ pub mod mgmt_api; pub mod page_service; + +/// For timeline_block_unblock_gc, distinguish the two different operations. This could be a bool. +// If file structure is per-kind not per-feature then where to put this? 
+#[derive(Clone, Copy)] +pub enum BlockUnblock { + Block, + Unblock, +} + +impl std::fmt::Display for BlockUnblock { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let s = match self { + BlockUnblock::Block => "block", + BlockUnblock::Unblock => "unblock", + }; + f.write_str(s) + } +} diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index ac3ff1bb89..a68f45a6d9 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -12,6 +12,8 @@ use utils::{ pub use reqwest::Body as ReqwestBody; +use crate::BlockUnblock; + pub mod util; #[derive(Debug, Clone)] @@ -419,6 +421,24 @@ impl Client { } } + pub async fn timeline_archival_config( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + req: &TimelineArchivalConfigRequest, + ) -> Result<()> { + let uri = format!( + "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/archival_config", + self.mgmt_api_endpoint + ); + + self.request(Method::POST, &uri, req) + .await? + .json() + .await + .map_err(Error::ReceiveBody) + } + pub async fn timeline_detach_ancestor( &self, tenant_shard_id: TenantShardId, @@ -436,6 +456,20 @@ impl Client { .map_err(Error::ReceiveBody) } + pub async fn timeline_block_unblock_gc( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + dir: BlockUnblock, + ) -> Result<()> { + let uri = format!( + "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/{dir}_gc", + self.mgmt_api_endpoint, + ); + + self.request(Method::POST, &uri, ()).await.map(|_| ()) + } + pub async fn tenant_reset(&self, tenant_shard_id: TenantShardId) -> Result<()> { let uri = format!( "{}/v1/tenant/{}/reset", @@ -506,6 +540,16 @@ impl Client { .map_err(Error::ReceiveBody) } + /// Configs io buffer alignment at runtime. + pub async fn put_io_alignment(&self, align: usize) -> Result<()> { + let uri = format!("{}/v1/io_alignment", self.mgmt_api_endpoint); + self.request(Method::PUT, uri, align) + .await? 
+ .json() + .await + .map_err(Error::ReceiveBody) + } + pub async fn get_utilization(&self) -> Result { let uri = format!("{}/v1/utilization", self.mgmt_api_endpoint); self.get(uri) diff --git a/pageserver/compaction/Cargo.toml b/pageserver/compaction/Cargo.toml index 0fd1d81845..52b58fc298 100644 --- a/pageserver/compaction/Cargo.toml +++ b/pageserver/compaction/Cargo.toml @@ -9,41 +9,19 @@ default = [] [dependencies] anyhow.workspace = true -async-compression.workspace = true async-stream.workspace = true -byteorder.workspace = true -bytes.workspace = true -chrono = { workspace = true, features = ["serde"] } clap = { workspace = true, features = ["string"] } -const_format.workspace = true -consumption_metrics.workspace = true -crossbeam-utils.workspace = true -either.workspace = true -flate2.workspace = true -fail.workspace = true futures.workspace = true git-version.workspace = true -hex.workspace = true -humantime.workspace = true -humantime-serde.workspace = true itertools.workspace = true once_cell.workspace = true pageserver_api.workspace = true pin-project-lite.workspace = true rand.workspace = true -smallvec = { workspace = true, features = ["write"] } svg_fmt.workspace = true -sync_wrapper.workspace = true -thiserror.workspace = true tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] } -tokio-io-timeout.workspace = true -tokio-util.workspace = true tracing.workspace = true -tracing-error.workspace = true tracing-subscriber.workspace = true -url.workspace = true -walkdir.workspace = true -metrics.workspace = true utils.workspace = true workspace_hack.workspace = true diff --git a/pageserver/ctl/Cargo.toml b/pageserver/ctl/Cargo.toml index be5626040b..9592002de1 100644 --- a/pageserver/ctl/Cargo.toml +++ b/pageserver/ctl/Cargo.toml @@ -8,7 +8,6 @@ license.workspace = true [dependencies] anyhow.workspace = true -bytes.workspace = true camino.workspace = true clap = { workspace = true, features = ["string"] } 
git-version.workspace = true @@ -24,5 +23,4 @@ toml_edit.workspace = true utils.workspace = true svg_fmt.workspace = true workspace_hack.workspace = true -serde.workspace = true serde_json.workspace = true diff --git a/pageserver/ctl/src/layer_map_analyzer.rs b/pageserver/ctl/src/layer_map_analyzer.rs index b4bb239f44..adc090823d 100644 --- a/pageserver/ctl/src/layer_map_analyzer.rs +++ b/pageserver/ctl/src/layer_map_analyzer.rs @@ -79,16 +79,24 @@ pub(crate) fn parse_filename(name: &str) -> Option { return None; } let keys: Vec<&str> = split[0].split('-').collect(); - let mut lsns: Vec<&str> = split[1].split('-').collect(); - let is_delta = if lsns.len() == 1 { - lsns.push(lsns[0]); + let lsn_and_opt_generation: Vec<&str> = split[1].split('v').collect(); + let lsns: Vec<&str> = lsn_and_opt_generation[0].split('-').collect(); + let the_lsns: [&str; 2]; + + /* + * Generations add a -vX-XXXXXX postfix, which causes issues when we try to + * parse 'vX' as an LSN. + */ + let is_delta = if lsns.len() == 1 || lsns[1].is_empty() { + the_lsns = [lsns[0], lsns[0]]; false } else { + the_lsns = [lsns[0], lsns[1]]; true }; let key_range = Key::from_hex(keys[0]).unwrap()..Key::from_hex(keys[1]).unwrap(); - let lsn_range = Lsn::from_hex(lsns[0]).unwrap()..Lsn::from_hex(lsns[1]).unwrap(); + let lsn_range = Lsn::from_hex(the_lsns[0]).unwrap()..Lsn::from_hex(the_lsns[1]).unwrap(); let holes = Vec::new(); Some(LayerFile { key_range, @@ -144,7 +152,11 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> { let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); // Initialize virtual_file (file desriptor cache) and page cache which are needed to access layer persistent B-Tree. 
- pageserver::virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs); + pageserver::virtual_file::init( + 10, + virtual_file::api::IoEngineKind::StdFs, + pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT, + ); pageserver::page_cache::init(100); let mut total_delta_layers = 0usize; diff --git a/pageserver/ctl/src/layers.rs b/pageserver/ctl/src/layers.rs index 3611b0baab..dd753398e2 100644 --- a/pageserver/ctl/src/layers.rs +++ b/pageserver/ctl/src/layers.rs @@ -59,7 +59,7 @@ pub(crate) enum LayerCmd { async fn read_delta_file(path: impl AsRef, ctx: &RequestContext) -> Result<()> { let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path"); - virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs); + virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs, 1); page_cache::init(100); let file = VirtualFile::open(path, ctx).await?; let file_id = page_cache::next_file_id(); @@ -89,6 +89,7 @@ async fn read_delta_file(path: impl AsRef, ctx: &RequestContext) -> Result for (k, v) in all { let value = cursor.read_blob(v.pos(), ctx).await?; println!("key:{} value_len:{}", k, value.len()); + assert!(k.is_i128_representable(), "invalid key: "); } // TODO(chi): special handling for last key? 
Ok(()) @@ -189,7 +190,11 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> { new_tenant_id, new_timeline_id, } => { - pageserver::virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs); + pageserver::virtual_file::init( + 10, + virtual_file::api::IoEngineKind::StdFs, + pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT, + ); pageserver::page_cache::init(100); let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs index 3fabf62987..cf001ef0d5 100644 --- a/pageserver/ctl/src/main.rs +++ b/pageserver/ctl/src/main.rs @@ -26,7 +26,7 @@ use pageserver::{ tenant::{dump_layerfile_from_path, metadata::TimelineMetadata}, virtual_file, }; -use pageserver_api::shard::TenantShardId; +use pageserver_api::{config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT, shard::TenantShardId}; use postgres_ffi::ControlFileData; use remote_storage::{RemotePath, RemoteStorageConfig}; use tokio_util::sync::CancellationToken; @@ -174,7 +174,7 @@ async fn main() -> anyhow::Result<()> { println!("specified prefix '{}' failed validation", cmd.prefix); return Ok(()); }; - let toml_document = toml_edit::Document::from_str(&cmd.config_toml_str)?; + let toml_document = toml_edit::DocumentMut::from_str(&cmd.config_toml_str)?; let toml_item = toml_document .get("remote_storage") .expect("need remote_storage"); @@ -205,7 +205,11 @@ fn read_pg_control_file(control_file_path: &Utf8Path) -> anyhow::Result<()> { async fn print_layerfile(path: &Utf8Path) -> anyhow::Result<()> { // Basic initialization of things that don't change after startup - virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs); + virtual_file::init( + 10, + virtual_file::api::IoEngineKind::StdFs, + DEFAULT_IO_BUFFER_ALIGNMENT, + ); page_cache::init(100); let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); dump_layerfile_from_path(path, true, &ctx).await diff --git 
a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs index 4992f37465..ac4a732377 100644 --- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs +++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs @@ -58,6 +58,11 @@ pub(crate) struct Args { /// [`pageserver_api::models::virtual_file::IoEngineKind`]. #[clap(long)] set_io_engine: Option, + + /// Before starting the benchmark, live-reconfigure the pageserver to use specified alignment for io buffers. + #[clap(long)] + set_io_alignment: Option, + targets: Option>, } @@ -124,6 +129,10 @@ async fn main_impl( mgmt_api_client.put_io_engine(engine_str).await?; } + if let Some(align) = args.set_io_alignment { + mgmt_api_client.put_io_alignment(align).await?; + } + // discover targets let timelines: Vec = crate::util::cli::targets::discover( &mgmt_api_client, diff --git a/pageserver/src/assert_u64_eq_usize.rs b/pageserver/src/assert_u64_eq_usize.rs new file mode 100644 index 0000000000..66ca7fd057 --- /dev/null +++ b/pageserver/src/assert_u64_eq_usize.rs @@ -0,0 +1,39 @@ +//! `u64` and `usize` aren't guaranteed to be identical in Rust, but life is much simpler if that's the case. 
+ +pub(crate) const _ASSERT_U64_EQ_USIZE: () = { + if std::mem::size_of::() != std::mem::size_of::() { + panic!("the traits defined in this module assume that usize and u64 can be converted to each other without loss of information"); + } +}; + +pub(crate) trait U64IsUsize { + fn into_usize(self) -> usize; +} + +impl U64IsUsize for u64 { + #[inline(always)] + fn into_usize(self) -> usize { + #[allow(clippy::let_unit_value)] + let _ = _ASSERT_U64_EQ_USIZE; + self as usize + } +} + +pub(crate) trait UsizeIsU64 { + fn into_u64(self) -> u64; +} + +impl UsizeIsU64 for usize { + #[inline(always)] + fn into_u64(self) -> u64 { + #[allow(clippy::let_unit_value)] + let _ = _ASSERT_U64_EQ_USIZE; + self as u64 + } +} + +pub const fn u64_to_usize(x: u64) -> usize { + #[allow(clippy::let_unit_value)] + let _ = _ASSERT_U64_EQ_USIZE; + x as usize +} diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 207f781e1b..a32d09f3b3 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -30,9 +30,8 @@ use pageserver_api::reltag::{RelTag, SlruKind}; use postgres_ffi::dispatch_pgversion; use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID}; -use postgres_ffi::pg_constants::{PGDATA_SPECIAL_FILES, PGDATA_SUBDIRS, PG_HBA}; +use postgres_ffi::pg_constants::{PGDATA_SPECIAL_FILES, PG_HBA}; use postgres_ffi::relfile_utils::{INIT_FORKNUM, MAIN_FORKNUM}; -use postgres_ffi::TransactionId; use postgres_ffi::XLogFileName; use postgres_ffi::PG_TLI; use postgres_ffi::{BLCKSZ, RELSEG_SIZE, WAL_SEGMENT_SIZE}; @@ -255,8 +254,11 @@ where let lazy_slru_download = self.timeline.get_lazy_slru_download() && !self.full_backup; + let pgversion = self.timeline.pg_version; + let subdirs = dispatch_pgversion!(pgversion, &pgv::bindings::PGDATA_SUBDIRS[..]); + // Create pgdata subdirs structure - for dir in PGDATA_SUBDIRS.iter() { + for dir in subdirs.iter() { let header = new_tar_header_dir(dir)?; self.ar .append(&header, &mut io::empty()) @@ 
-606,7 +608,7 @@ where // // Extract twophase state files // - async fn add_twophase_file(&mut self, xid: TransactionId) -> Result<(), BasebackupError> { + async fn add_twophase_file(&mut self, xid: u64) -> Result<(), BasebackupError> { let img = self .timeline .get_twophase_file(xid, self.lsn, self.ctx) @@ -617,7 +619,11 @@ where buf.extend_from_slice(&img[..]); let crc = crc32c::crc32c(&img[..]); buf.put_u32_le(crc); - let path = format!("pg_twophase/{:>08X}", xid); + let path = if self.timeline.pg_version < 17 { + format!("pg_twophase/{:>08X}", xid) + } else { + format!("pg_twophase/{:>016X}", xid) + }; let header = new_tar_header(&path, buf.len() as u64)?; self.ar .append(&header, &buf[..]) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index da0c11d9bf..d15a0e47a4 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -5,6 +5,7 @@ use std::env; use std::env::{var, VarError}; use std::io::Read; +use std::str::FromStr; use std::sync::Arc; use std::time::Duration; @@ -36,6 +37,7 @@ use pageserver::{ virtual_file, }; use postgres_backend::AuthType; +use utils::crashsafe::syncfs; use utils::failpoint_support; use utils::logging::TracingErrorLayerEnablement; use utils::{ @@ -124,19 +126,53 @@ fn main() -> anyhow::Result<()> { // after setting up logging, log the effective IO engine choice and read path implementations info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine"); info!(?conf.virtual_file_direct_io, "starting with virtual_file Direct IO settings"); - info!(?conf.compact_level0_phase1_value_access, "starting with setting for compact_level0_phase1_value_access"); + info!(?conf.io_buffer_alignment, "starting with setting for IO buffer alignment"); + // The tenants directory contains all the pageserver local disk state. + // Create if not exists and make sure all the contents are durable before proceeding. 
+ // Ensuring durability eliminates a whole bug class where we come up after an unclean shutdown. + // After an unclean shutdown, we don't know if all the filesystem content we can read via syscalls is actually durable or not. + // Examples for that: OOM kill, systemd killing us during shutdown, self abort due to unrecoverable IO error. let tenants_path = conf.tenants_path(); - if !tenants_path.exists() { - utils::crashsafe::create_dir_all(conf.tenants_path()) - .with_context(|| format!("Failed to create tenants root dir at '{tenants_path}'"))?; + { + let open = || { + nix::dir::Dir::open( + tenants_path.as_std_path(), + nix::fcntl::OFlag::O_DIRECTORY | nix::fcntl::OFlag::O_RDONLY, + nix::sys::stat::Mode::empty(), + ) + }; + let dirfd = match open() { + Ok(dirfd) => dirfd, + Err(e) => match e { + nix::errno::Errno::ENOENT => { + utils::crashsafe::create_dir_all(&tenants_path).with_context(|| { + format!("Failed to create tenants root dir at '{tenants_path}'") + })?; + open().context("open tenants dir after creating it")? 
+ } + e => anyhow::bail!(e), + }, + }; + + let started = Instant::now(); + syncfs(dirfd)?; + let elapsed = started.elapsed(); + info!( + elapsed_ms = elapsed.as_millis(), + "made tenant directory contents durable" + ); } // Initialize up failpoints support let scenario = failpoint_support::init(); // Basic initialization of things that don't change after startup - virtual_file::init(conf.max_file_descriptors, conf.virtual_file_io_engine); + virtual_file::init( + conf.max_file_descriptors, + conf.virtual_file_io_engine, + conf.io_buffer_alignment, + ); page_cache::init(conf.page_cache_size); start_pageserver(launch_ts, conf).context("Failed to start pageserver")?; @@ -172,27 +208,15 @@ fn initialize_config( } }; - let config: toml_edit::Document = match std::fs::File::open(cfg_file_path) { - Ok(mut f) => { - let md = f.metadata().context("stat config file")?; - if md.is_file() { - let mut s = String::new(); - f.read_to_string(&mut s).context("read config file")?; - s.parse().context("parse config file toml")? 
- } else { - anyhow::bail!("directory entry exists but is not a file: {cfg_file_path}"); - } - } - Err(e) => { - anyhow::bail!("open pageserver config: {e}: {cfg_file_path}"); - } - }; - - debug!("Using pageserver toml: {config}"); - - // Construct the runtime representation - let conf = PageServerConf::parse_and_validate(identity.id, &config, workdir) - .context("Failed to parse pageserver configuration")?; + let config_file_contents = + std::fs::read_to_string(cfg_file_path).context("read config file from filesystem")?; + let config_toml = serde_path_to_error::deserialize( + toml_edit::de::Deserializer::from_str(&config_file_contents) + .context("build toml deserializer")?, + ) + .context("deserialize config toml")?; + let conf = PageServerConf::parse_and_validate(identity.id, config_toml, workdir) + .context("runtime-validation of config toml")?; Ok(Box::leak(Box::new(conf))) } diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 0ebaf78840..e9f197ec2d 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -4,11 +4,13 @@ //! file, or on the command line. //! See also `settings.md` for better description on every parameter. 
-use anyhow::{anyhow, bail, ensure, Context, Result}; -use pageserver_api::{models::ImageCompressionAlgorithm, shard::TenantShardId}; +use anyhow::{bail, ensure, Context}; +use pageserver_api::models::ImageCompressionAlgorithm; +use pageserver_api::{ + config::{DiskUsageEvictionTaskConfig, MaxVectoredReadBytes}, + shard::TenantShardId, +}; use remote_storage::{RemotePath, RemoteStorageConfig}; -use serde::de::IntoDeserializer; -use serde::{self, Deserialize}; use std::env; use storage_broker::Uri; use utils::crashsafe::path_with_suffix_extension; @@ -17,10 +19,8 @@ use utils::logging::SecretString; use once_cell::sync::OnceCell; use reqwest::Url; use std::num::NonZeroUsize; -use std::str::FromStr; use std::sync::Arc; use std::time::Duration; -use toml_edit::{Document, Item}; use camino::{Utf8Path, Utf8PathBuf}; use postgres_backend::AuthType; @@ -29,136 +29,27 @@ use utils::{ logging::LogFormat, }; -use crate::l0_flush::L0FlushConfig; -use crate::tenant::config::TenantConfOpt; -use crate::tenant::timeline::compaction::CompactL0Phase1ValueAccess; -use crate::tenant::vectored_blob_io::MaxVectoredReadBytes; +use crate::tenant::storage_layer::inmemory_layer::IndexEntry; use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; -use crate::{disk_usage_eviction_task::DiskUsageEvictionTaskConfig, virtual_file::io_engine}; -use crate::{tenant::config::TenantConf, virtual_file}; +use crate::virtual_file; +use crate::virtual_file::io_engine; use crate::{TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX}; -use self::defaults::DEFAULT_CONCURRENT_TENANT_WARMUP; - -use self::defaults::DEFAULT_VIRTUAL_FILE_IO_ENGINE; - -pub mod defaults { - use crate::tenant::config::defaults::*; - use const_format::formatcp; - - pub use pageserver_api::config::{ - DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_HTTP_LISTEN_PORT, DEFAULT_PG_LISTEN_ADDR, - DEFAULT_PG_LISTEN_PORT, - }; - pub use storage_broker::DEFAULT_ENDPOINT as BROKER_DEFAULT_ENDPOINT; - - pub const 
DEFAULT_WAIT_LSN_TIMEOUT: &str = "300 s"; - pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s"; - - pub const DEFAULT_SUPERUSER: &str = "cloud_admin"; - - pub const DEFAULT_PAGE_CACHE_SIZE: usize = 8192; - pub const DEFAULT_MAX_FILE_DESCRIPTORS: usize = 100; - - pub const DEFAULT_LOG_FORMAT: &str = "plain"; - - pub const DEFAULT_CONCURRENT_TENANT_WARMUP: usize = 8; - - pub const DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES: usize = - super::ConfigurableSemaphore::DEFAULT_INITIAL.get(); - - pub const DEFAULT_METRIC_COLLECTION_INTERVAL: &str = "10 min"; - pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option = None; - pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min"; - pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s"; - - pub const DEFAULT_HEATMAP_UPLOAD_CONCURRENCY: usize = 8; - pub const DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY: usize = 1; - - pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100; - - #[cfg(target_os = "linux")] - pub const DEFAULT_VIRTUAL_FILE_IO_ENGINE: &str = "tokio-epoll-uring"; - - #[cfg(not(target_os = "linux"))] - pub const DEFAULT_VIRTUAL_FILE_IO_ENGINE: &str = "std-fs"; - - pub const DEFAULT_GET_VECTORED_IMPL: &str = "vectored"; - - pub const DEFAULT_GET_IMPL: &str = "vectored"; - - pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB - - pub const DEFAULT_IMAGE_COMPRESSION: &str = "zstd(1)"; - - pub const DEFAULT_VALIDATE_VECTORED_GET: bool = false; - - pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0; - - /// - /// Default built-in configuration file. 
- /// - pub const DEFAULT_CONFIG_FILE: &str = formatcp!( - r#" -# Initial configuration file created by 'pageserver --init' -#listen_pg_addr = '{DEFAULT_PG_LISTEN_ADDR}' -#listen_http_addr = '{DEFAULT_HTTP_LISTEN_ADDR}' - -#wait_lsn_timeout = '{DEFAULT_WAIT_LSN_TIMEOUT}' -#wal_redo_timeout = '{DEFAULT_WAL_REDO_TIMEOUT}' - -#page_cache_size = {DEFAULT_PAGE_CACHE_SIZE} -#max_file_descriptors = {DEFAULT_MAX_FILE_DESCRIPTORS} - -# initial superuser role name to use when creating a new tenant -#initial_superuser_name = '{DEFAULT_SUPERUSER}' - -#broker_endpoint = '{BROKER_DEFAULT_ENDPOINT}' - -#log_format = '{DEFAULT_LOG_FORMAT}' - -#concurrent_tenant_size_logical_size_queries = '{DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES}' -#concurrent_tenant_warmup = '{DEFAULT_CONCURRENT_TENANT_WARMUP}' - -#metric_collection_interval = '{DEFAULT_METRIC_COLLECTION_INTERVAL}' -#synthetic_size_calculation_interval = '{DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL}' - -#disk_usage_based_eviction = {{ max_usage_pct = .., min_avail_bytes = .., period = "10s"}} - -#background_task_maximum_delay = '{DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY}' - -#ingest_batch_size = {DEFAULT_INGEST_BATCH_SIZE} - -#virtual_file_io_engine = '{DEFAULT_VIRTUAL_FILE_IO_ENGINE}' - -#max_vectored_read_bytes = '{DEFAULT_MAX_VECTORED_READ_BYTES}' - -[tenant_config] -#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes -#checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT} -#compaction_target_size = {DEFAULT_COMPACTION_TARGET_SIZE} # in bytes -#compaction_period = '{DEFAULT_COMPACTION_PERIOD}' -#compaction_threshold = {DEFAULT_COMPACTION_THRESHOLD} - -#gc_period = '{DEFAULT_GC_PERIOD}' -#gc_horizon = {DEFAULT_GC_HORIZON} -#image_creation_threshold = {DEFAULT_IMAGE_CREATION_THRESHOLD} -#pitr_interval = '{DEFAULT_PITR_INTERVAL}' - -#min_resident_size_override = .. 
# in bytes -#evictions_low_residence_duration_metric_threshold = '{DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD}' - -#heatmap_upload_concurrency = {DEFAULT_HEATMAP_UPLOAD_CONCURRENCY} -#secondary_download_concurrency = {DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY} - -#ephemeral_bytes_per_memory_kb = {DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB} - -#[remote_storage] - -"# - ); -} - +/// Global state of pageserver. +/// +/// It's mostly immutable configuration, but some semaphores and the +/// like crept in over time and the name stuck. +/// +/// Instantiated by deserializing `pageserver.toml` into [`pageserver_api::config::ConfigToml`] +/// and passing that to [`PageServerConf::parse_and_validate`]. +/// +/// # Adding a New Field +/// +/// 1. Add the field to `pageserver_api::config::ConfigToml`. +/// 2. Fix compiler errors (exhaustive destructuring will guide you). +/// +/// For fields that require additional validation or filling in of defaults at runtime, +/// check for examples in the [`PageServerConf::parse_and_validate`] method. #[derive(Debug, Clone, PartialEq, Eq)] pub struct PageServerConf { // Identifier of that particular pageserver so e g safekeepers @@ -204,7 +95,7 @@ pub struct PageServerConf { pub remote_storage_config: Option, - pub default_tenant_conf: TenantConf, + pub default_tenant_conf: crate::tenant::config::TenantConf, /// Storage broker endpoints to connect to. pub broker_endpoint: Uri, @@ -281,16 +172,16 @@ pub struct PageServerConf { /// Setting this to zero disables limits on total ephemeral layer size. pub ephemeral_bytes_per_memory_kb: usize, - pub l0_flush: L0FlushConfig, - - /// This flag is temporary and will be removed after gradual rollout. - /// See . 
- pub compact_level0_phase1_value_access: CompactL0Phase1ValueAccess, + pub l0_flush: crate::l0_flush::L0FlushConfig, /// Direct IO settings pub virtual_file_direct_io: virtual_file::DirectIoMode, + + pub io_buffer_alignment: usize, } +/// Token for authentication to safekeepers +/// /// We do not want to store this in a PageServerConf because the latter may be logged /// and/or serialized at a whim, while the token is secret. Currently this token is the /// same for accessing all tenants/timelines, but may become per-tenant/per-timeline in @@ -299,464 +190,6 @@ pub struct PageServerConf { /// startup code to the connection code through a dozen layers. pub static SAFEKEEPER_AUTH_TOKEN: OnceCell> = OnceCell::new(); -// use dedicated enum for builder to better indicate the intention -// and avoid possible confusion with nested options -#[derive(Clone, Default)] -pub enum BuilderValue { - Set(T), - #[default] - NotSet, -} - -impl BuilderValue { - pub fn ok_or(&self, field_name: &'static str, default: BuilderValue) -> anyhow::Result { - match self { - Self::Set(v) => Ok(v.clone()), - Self::NotSet => match default { - BuilderValue::Set(v) => Ok(v.clone()), - BuilderValue::NotSet => { - anyhow::bail!("missing config value {field_name:?}") - } - }, - } - } -} - -// needed to simplify config construction -#[derive(Default)] -struct PageServerConfigBuilder { - listen_pg_addr: BuilderValue, - - listen_http_addr: BuilderValue, - - availability_zone: BuilderValue>, - - wait_lsn_timeout: BuilderValue, - wal_redo_timeout: BuilderValue, - - superuser: BuilderValue, - - page_cache_size: BuilderValue, - max_file_descriptors: BuilderValue, - - workdir: BuilderValue, - - pg_distrib_dir: BuilderValue, - - http_auth_type: BuilderValue, - pg_auth_type: BuilderValue, - - // - auth_validation_public_key_path: BuilderValue>, - remote_storage_config: BuilderValue>, - - broker_endpoint: BuilderValue, - broker_keepalive_interval: BuilderValue, - - log_format: BuilderValue, - - 
concurrent_tenant_warmup: BuilderValue, - concurrent_tenant_size_logical_size_queries: BuilderValue, - - metric_collection_interval: BuilderValue, - metric_collection_endpoint: BuilderValue>, - synthetic_size_calculation_interval: BuilderValue, - metric_collection_bucket: BuilderValue>, - - disk_usage_based_eviction: BuilderValue>, - - test_remote_failures: BuilderValue, - - ondemand_download_behavior_treat_error_as_warn: BuilderValue, - - background_task_maximum_delay: BuilderValue, - - control_plane_api: BuilderValue>, - control_plane_api_token: BuilderValue>, - control_plane_emergency_mode: BuilderValue, - - heatmap_upload_concurrency: BuilderValue, - secondary_download_concurrency: BuilderValue, - - ingest_batch_size: BuilderValue, - - virtual_file_io_engine: BuilderValue, - - max_vectored_read_bytes: BuilderValue, - - image_compression: BuilderValue, - - ephemeral_bytes_per_memory_kb: BuilderValue, - - l0_flush: BuilderValue, - - compact_level0_phase1_value_access: BuilderValue, - - virtual_file_direct_io: BuilderValue, -} - -impl PageServerConfigBuilder { - fn new() -> Self { - Self::default() - } - - #[inline(always)] - fn default_values() -> Self { - use self::BuilderValue::*; - use defaults::*; - Self { - listen_pg_addr: Set(DEFAULT_PG_LISTEN_ADDR.to_string()), - listen_http_addr: Set(DEFAULT_HTTP_LISTEN_ADDR.to_string()), - availability_zone: Set(None), - wait_lsn_timeout: Set(humantime::parse_duration(DEFAULT_WAIT_LSN_TIMEOUT) - .expect("cannot parse default wait lsn timeout")), - wal_redo_timeout: Set(humantime::parse_duration(DEFAULT_WAL_REDO_TIMEOUT) - .expect("cannot parse default wal redo timeout")), - superuser: Set(DEFAULT_SUPERUSER.to_string()), - page_cache_size: Set(DEFAULT_PAGE_CACHE_SIZE), - max_file_descriptors: Set(DEFAULT_MAX_FILE_DESCRIPTORS), - workdir: Set(Utf8PathBuf::new()), - pg_distrib_dir: Set(Utf8PathBuf::from_path_buf( - env::current_dir().expect("cannot access current directory"), - ) - .expect("non-Unicode path") - 
.join("pg_install")), - http_auth_type: Set(AuthType::Trust), - pg_auth_type: Set(AuthType::Trust), - auth_validation_public_key_path: Set(None), - remote_storage_config: Set(None), - broker_endpoint: Set(storage_broker::DEFAULT_ENDPOINT - .parse() - .expect("failed to parse default broker endpoint")), - broker_keepalive_interval: Set(humantime::parse_duration( - storage_broker::DEFAULT_KEEPALIVE_INTERVAL, - ) - .expect("cannot parse default keepalive interval")), - log_format: Set(LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()), - - concurrent_tenant_warmup: Set(NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP) - .expect("Invalid default constant")), - concurrent_tenant_size_logical_size_queries: Set( - ConfigurableSemaphore::DEFAULT_INITIAL, - ), - metric_collection_interval: Set(humantime::parse_duration( - DEFAULT_METRIC_COLLECTION_INTERVAL, - ) - .expect("cannot parse default metric collection interval")), - synthetic_size_calculation_interval: Set(humantime::parse_duration( - DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL, - ) - .expect("cannot parse default synthetic size calculation interval")), - metric_collection_endpoint: Set(DEFAULT_METRIC_COLLECTION_ENDPOINT), - - metric_collection_bucket: Set(None), - - disk_usage_based_eviction: Set(None), - - test_remote_failures: Set(0), - - ondemand_download_behavior_treat_error_as_warn: Set(false), - - background_task_maximum_delay: Set(humantime::parse_duration( - DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY, - ) - .unwrap()), - - control_plane_api: Set(None), - control_plane_api_token: Set(None), - control_plane_emergency_mode: Set(false), - - heatmap_upload_concurrency: Set(DEFAULT_HEATMAP_UPLOAD_CONCURRENCY), - secondary_download_concurrency: Set(DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY), - - ingest_batch_size: Set(DEFAULT_INGEST_BATCH_SIZE), - - virtual_file_io_engine: Set(DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap()), - - max_vectored_read_bytes: Set(MaxVectoredReadBytes( - 
NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(), - )), - image_compression: Set(DEFAULT_IMAGE_COMPRESSION.parse().unwrap()), - ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB), - l0_flush: Set(L0FlushConfig::default()), - compact_level0_phase1_value_access: Set(CompactL0Phase1ValueAccess::default()), - virtual_file_direct_io: Set(virtual_file::DirectIoMode::default()), - } - } -} - -impl PageServerConfigBuilder { - pub fn listen_pg_addr(&mut self, listen_pg_addr: String) { - self.listen_pg_addr = BuilderValue::Set(listen_pg_addr) - } - - pub fn listen_http_addr(&mut self, listen_http_addr: String) { - self.listen_http_addr = BuilderValue::Set(listen_http_addr) - } - - pub fn availability_zone(&mut self, availability_zone: Option) { - self.availability_zone = BuilderValue::Set(availability_zone) - } - - pub fn wait_lsn_timeout(&mut self, wait_lsn_timeout: Duration) { - self.wait_lsn_timeout = BuilderValue::Set(wait_lsn_timeout) - } - - pub fn wal_redo_timeout(&mut self, wal_redo_timeout: Duration) { - self.wal_redo_timeout = BuilderValue::Set(wal_redo_timeout) - } - - pub fn superuser(&mut self, superuser: String) { - self.superuser = BuilderValue::Set(superuser) - } - - pub fn page_cache_size(&mut self, page_cache_size: usize) { - self.page_cache_size = BuilderValue::Set(page_cache_size) - } - - pub fn max_file_descriptors(&mut self, max_file_descriptors: usize) { - self.max_file_descriptors = BuilderValue::Set(max_file_descriptors) - } - - pub fn workdir(&mut self, workdir: Utf8PathBuf) { - self.workdir = BuilderValue::Set(workdir) - } - - pub fn pg_distrib_dir(&mut self, pg_distrib_dir: Utf8PathBuf) { - self.pg_distrib_dir = BuilderValue::Set(pg_distrib_dir) - } - - pub fn http_auth_type(&mut self, auth_type: AuthType) { - self.http_auth_type = BuilderValue::Set(auth_type) - } - - pub fn pg_auth_type(&mut self, auth_type: AuthType) { - self.pg_auth_type = BuilderValue::Set(auth_type) - } - - pub fn 
auth_validation_public_key_path( - &mut self, - auth_validation_public_key_path: Option, - ) { - self.auth_validation_public_key_path = BuilderValue::Set(auth_validation_public_key_path) - } - - pub fn remote_storage_config(&mut self, remote_storage_config: Option) { - self.remote_storage_config = BuilderValue::Set(remote_storage_config) - } - - pub fn broker_endpoint(&mut self, broker_endpoint: Uri) { - self.broker_endpoint = BuilderValue::Set(broker_endpoint) - } - - pub fn broker_keepalive_interval(&mut self, broker_keepalive_interval: Duration) { - self.broker_keepalive_interval = BuilderValue::Set(broker_keepalive_interval) - } - - pub fn log_format(&mut self, log_format: LogFormat) { - self.log_format = BuilderValue::Set(log_format) - } - - pub fn concurrent_tenant_warmup(&mut self, u: NonZeroUsize) { - self.concurrent_tenant_warmup = BuilderValue::Set(u); - } - - pub fn concurrent_tenant_size_logical_size_queries(&mut self, u: NonZeroUsize) { - self.concurrent_tenant_size_logical_size_queries = BuilderValue::Set(u); - } - - pub fn metric_collection_interval(&mut self, metric_collection_interval: Duration) { - self.metric_collection_interval = BuilderValue::Set(metric_collection_interval) - } - - pub fn metric_collection_endpoint(&mut self, metric_collection_endpoint: Option) { - self.metric_collection_endpoint = BuilderValue::Set(metric_collection_endpoint) - } - - pub fn metric_collection_bucket( - &mut self, - metric_collection_bucket: Option, - ) { - self.metric_collection_bucket = BuilderValue::Set(metric_collection_bucket) - } - - pub fn synthetic_size_calculation_interval( - &mut self, - synthetic_size_calculation_interval: Duration, - ) { - self.synthetic_size_calculation_interval = - BuilderValue::Set(synthetic_size_calculation_interval) - } - - pub fn test_remote_failures(&mut self, fail_first: u64) { - self.test_remote_failures = BuilderValue::Set(fail_first); - } - - pub fn disk_usage_based_eviction(&mut self, value: Option) { - 
self.disk_usage_based_eviction = BuilderValue::Set(value); - } - - pub fn ondemand_download_behavior_treat_error_as_warn( - &mut self, - ondemand_download_behavior_treat_error_as_warn: bool, - ) { - self.ondemand_download_behavior_treat_error_as_warn = - BuilderValue::Set(ondemand_download_behavior_treat_error_as_warn); - } - - pub fn background_task_maximum_delay(&mut self, delay: Duration) { - self.background_task_maximum_delay = BuilderValue::Set(delay); - } - - pub fn control_plane_api(&mut self, api: Option) { - self.control_plane_api = BuilderValue::Set(api) - } - - pub fn control_plane_api_token(&mut self, token: Option) { - self.control_plane_api_token = BuilderValue::Set(token) - } - - pub fn control_plane_emergency_mode(&mut self, enabled: bool) { - self.control_plane_emergency_mode = BuilderValue::Set(enabled) - } - - pub fn heatmap_upload_concurrency(&mut self, value: usize) { - self.heatmap_upload_concurrency = BuilderValue::Set(value) - } - - pub fn secondary_download_concurrency(&mut self, value: usize) { - self.secondary_download_concurrency = BuilderValue::Set(value) - } - - pub fn ingest_batch_size(&mut self, ingest_batch_size: u64) { - self.ingest_batch_size = BuilderValue::Set(ingest_batch_size) - } - - pub fn virtual_file_io_engine(&mut self, value: virtual_file::IoEngineKind) { - self.virtual_file_io_engine = BuilderValue::Set(value); - } - - pub fn get_max_vectored_read_bytes(&mut self, value: MaxVectoredReadBytes) { - self.max_vectored_read_bytes = BuilderValue::Set(value); - } - - pub fn get_image_compression(&mut self, value: ImageCompressionAlgorithm) { - self.image_compression = BuilderValue::Set(value); - } - - pub fn get_ephemeral_bytes_per_memory_kb(&mut self, value: usize) { - self.ephemeral_bytes_per_memory_kb = BuilderValue::Set(value); - } - - pub fn l0_flush(&mut self, value: L0FlushConfig) { - self.l0_flush = BuilderValue::Set(value); - } - - pub fn compact_level0_phase1_value_access(&mut self, value: CompactL0Phase1ValueAccess) 
{ - self.compact_level0_phase1_value_access = BuilderValue::Set(value); - } - - pub fn virtual_file_direct_io(&mut self, value: virtual_file::DirectIoMode) { - self.virtual_file_direct_io = BuilderValue::Set(value); - } - - pub fn build(self, id: NodeId) -> anyhow::Result { - let default = Self::default_values(); - - macro_rules! conf { - (USING DEFAULT { $($field:ident,)* } CUSTOM LOGIC { $($custom_field:ident : $custom_value:expr,)* } ) => { - PageServerConf { - $( - $field: self.$field.ok_or(stringify!($field), default.$field)?, - )* - $( - $custom_field: $custom_value, - )* - } - }; - } - - Ok(conf!( - USING DEFAULT - { - listen_pg_addr, - listen_http_addr, - availability_zone, - wait_lsn_timeout, - wal_redo_timeout, - superuser, - page_cache_size, - max_file_descriptors, - workdir, - pg_distrib_dir, - http_auth_type, - pg_auth_type, - auth_validation_public_key_path, - remote_storage_config, - broker_endpoint, - broker_keepalive_interval, - log_format, - metric_collection_interval, - metric_collection_endpoint, - metric_collection_bucket, - synthetic_size_calculation_interval, - disk_usage_based_eviction, - test_remote_failures, - ondemand_download_behavior_treat_error_as_warn, - background_task_maximum_delay, - control_plane_api, - control_plane_api_token, - control_plane_emergency_mode, - heatmap_upload_concurrency, - secondary_download_concurrency, - ingest_batch_size, - max_vectored_read_bytes, - image_compression, - ephemeral_bytes_per_memory_kb, - l0_flush, - compact_level0_phase1_value_access, - virtual_file_direct_io, - } - CUSTOM LOGIC - { - id: id, - // TenantConf is handled separately - default_tenant_conf: TenantConf::default(), - concurrent_tenant_warmup: ConfigurableSemaphore::new({ - self - .concurrent_tenant_warmup - .ok_or("concurrent_tenant_warmpup", - default.concurrent_tenant_warmup)? 
- }), - concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::new( - self - .concurrent_tenant_size_logical_size_queries - .ok_or("concurrent_tenant_size_logical_size_queries", - default.concurrent_tenant_size_logical_size_queries.clone())? - ), - eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::new( - // re-use `concurrent_tenant_size_logical_size_queries` - self - .concurrent_tenant_size_logical_size_queries - .ok_or("eviction_task_immitated_concurrent_logical_size_queries", - default.concurrent_tenant_size_logical_size_queries.clone())?, - ), - virtual_file_io_engine: match self.virtual_file_io_engine { - BuilderValue::Set(v) => v, - BuilderValue::NotSet => match crate::virtual_file::io_engine_feature_test().context("auto-detect virtual_file_io_engine")? { - io_engine::FeatureTestResult::PlatformPreferred(v) => v, // make no noise - io_engine::FeatureTestResult::Worse { engine, remark } => { - // TODO: bubble this up to the caller so we can tracing::warn! it. - eprintln!("auto-detected IO engine is not platform-preferred: engine={engine:?} remark={remark:?}"); - engine - } - }, - }, - } - )) - } -} - impl PageServerConf { // // Repository paths, relative to workdir. @@ -848,7 +281,7 @@ impl PageServerConf { #[allow(clippy::manual_range_patterns)] match pg_version { - 14 | 15 | 16 => Ok(path.join(format!("v{pg_version}"))), + 14 | 15 | 16 | 17 => Ok(path.join(format!("v{pg_version}"))), _ => bail!("Unsupported postgres version: {}", pg_version), } } @@ -865,131 +298,134 @@ impl PageServerConf { /// /// This leaves any options not present in the file in the built-in defaults. 
pub fn parse_and_validate( - node_id: NodeId, - toml: &Document, + id: NodeId, + config_toml: pageserver_api::config::ConfigToml, workdir: &Utf8Path, ) -> anyhow::Result { - let mut builder = PageServerConfigBuilder::new(); - builder.workdir(workdir.to_owned()); + let pageserver_api::config::ConfigToml { + listen_pg_addr, + listen_http_addr, + availability_zone, + wait_lsn_timeout, + wal_redo_timeout, + superuser, + page_cache_size, + max_file_descriptors, + pg_distrib_dir, + http_auth_type, + pg_auth_type, + auth_validation_public_key_path, + remote_storage, + broker_endpoint, + broker_keepalive_interval, + log_format, + metric_collection_interval, + metric_collection_endpoint, + metric_collection_bucket, + synthetic_size_calculation_interval, + disk_usage_based_eviction, + test_remote_failures, + ondemand_download_behavior_treat_error_as_warn, + background_task_maximum_delay, + control_plane_api, + control_plane_api_token, + control_plane_emergency_mode, + heatmap_upload_concurrency, + secondary_download_concurrency, + ingest_batch_size, + max_vectored_read_bytes, + image_compression, + ephemeral_bytes_per_memory_kb, + compact_level0_phase1_value_access: _, + l0_flush, + virtual_file_direct_io, + concurrent_tenant_warmup, + concurrent_tenant_size_logical_size_queries, + virtual_file_io_engine, + io_buffer_alignment, + tenant_config, + } = config_toml; - let mut t_conf = TenantConfOpt::default(); + let mut conf = PageServerConf { + // ------------------------------------------------------------ + // fields that are already fully validated by the ConfigToml Deserialize impl + // ------------------------------------------------------------ + listen_pg_addr, + listen_http_addr, + availability_zone, + wait_lsn_timeout, + wal_redo_timeout, + superuser, + page_cache_size, + max_file_descriptors, + http_auth_type, + pg_auth_type, + auth_validation_public_key_path, + remote_storage_config: remote_storage, + broker_endpoint, + broker_keepalive_interval, + log_format, + 
metric_collection_interval, + metric_collection_endpoint, + metric_collection_bucket, + synthetic_size_calculation_interval, + disk_usage_based_eviction, + test_remote_failures, + ondemand_download_behavior_treat_error_as_warn, + background_task_maximum_delay, + control_plane_api, + control_plane_emergency_mode, + heatmap_upload_concurrency, + secondary_download_concurrency, + ingest_batch_size, + max_vectored_read_bytes, + image_compression, + ephemeral_bytes_per_memory_kb, + virtual_file_direct_io, + io_buffer_alignment, - for (key, item) in toml.iter() { - match key { - "listen_pg_addr" => builder.listen_pg_addr(parse_toml_string(key, item)?), - "listen_http_addr" => builder.listen_http_addr(parse_toml_string(key, item)?), - "availability_zone" => builder.availability_zone(Some(parse_toml_string(key, item)?)), - "wait_lsn_timeout" => builder.wait_lsn_timeout(parse_toml_duration(key, item)?), - "wal_redo_timeout" => builder.wal_redo_timeout(parse_toml_duration(key, item)?), - "initial_superuser_name" => builder.superuser(parse_toml_string(key, item)?), - "page_cache_size" => builder.page_cache_size(parse_toml_u64(key, item)? as usize), - "max_file_descriptors" => { - builder.max_file_descriptors(parse_toml_u64(key, item)? 
as usize) - } - "pg_distrib_dir" => { - builder.pg_distrib_dir(Utf8PathBuf::from(parse_toml_string(key, item)?)) - } - "auth_validation_public_key_path" => builder.auth_validation_public_key_path(Some( - Utf8PathBuf::from(parse_toml_string(key, item)?), - )), - "http_auth_type" => builder.http_auth_type(parse_toml_from_str(key, item)?), - "pg_auth_type" => builder.pg_auth_type(parse_toml_from_str(key, item)?), - "remote_storage" => { - builder.remote_storage_config(Some(RemoteStorageConfig::from_toml(item).context("remote_storage")?)) - } - "tenant_config" => { - t_conf = TenantConfOpt::try_from(item.to_owned()).context(format!("failed to parse: '{key}'"))?; - } - "broker_endpoint" => builder.broker_endpoint(parse_toml_string(key, item)?.parse().context("failed to parse broker endpoint")?), - "broker_keepalive_interval" => builder.broker_keepalive_interval(parse_toml_duration(key, item)?), - "log_format" => builder.log_format( - LogFormat::from_config(&parse_toml_string(key, item)?)? - ), - "concurrent_tenant_warmup" => builder.concurrent_tenant_warmup({ - let input = parse_toml_string(key, item)?; - let permits = input.parse::().context("expected a number of initial permits, not {s:?}")?; - NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")? - }), - "concurrent_tenant_size_logical_size_queries" => builder.concurrent_tenant_size_logical_size_queries({ - let input = parse_toml_string(key, item)?; - let permits = input.parse::().context("expected a number of initial permits, not {s:?}")?; - NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")? 
- }), - "metric_collection_interval" => builder.metric_collection_interval(parse_toml_duration(key, item)?), - "metric_collection_endpoint" => { - let endpoint = parse_toml_string(key, item)?.parse().context("failed to parse metric_collection_endpoint")?; - builder.metric_collection_endpoint(Some(endpoint)); - }, - "metric_collection_bucket" => { - builder.metric_collection_bucket(Some(RemoteStorageConfig::from_toml(item)?)) - } - "synthetic_size_calculation_interval" => - builder.synthetic_size_calculation_interval(parse_toml_duration(key, item)?), - "test_remote_failures" => builder.test_remote_failures(parse_toml_u64(key, item)?), - "disk_usage_based_eviction" => { - tracing::info!("disk_usage_based_eviction: {:#?}", &item); - builder.disk_usage_based_eviction( - deserialize_from_item("disk_usage_based_eviction", item) - .context("parse disk_usage_based_eviction")? - ) - }, - "ondemand_download_behavior_treat_error_as_warn" => builder.ondemand_download_behavior_treat_error_as_warn(parse_toml_bool(key, item)?), - "background_task_maximum_delay" => builder.background_task_maximum_delay(parse_toml_duration(key, item)?), - "control_plane_api" => { - let parsed = parse_toml_string(key, item)?; - if parsed.is_empty() { - builder.control_plane_api(None) - } else { - builder.control_plane_api(Some(parsed.parse().context("failed to parse control plane URL")?)) + // ------------------------------------------------------------ + // fields that require additional validation or custom handling + // ------------------------------------------------------------ + workdir: workdir.to_owned(), + pg_distrib_dir: pg_distrib_dir.unwrap_or_else(|| { + std::env::current_dir() + .expect("current_dir() failed") + .try_into() + .expect("current_dir() is not a valid Utf8Path") + }), + control_plane_api_token: control_plane_api_token.map(SecretString::from), + id, + default_tenant_conf: tenant_config, + concurrent_tenant_warmup: ConfigurableSemaphore::new(concurrent_tenant_warmup), + 
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::new( + concurrent_tenant_size_logical_size_queries, + ), + eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::new( + // re-use `concurrent_tenant_size_logical_size_queries` + concurrent_tenant_size_logical_size_queries, + ), + virtual_file_io_engine: match virtual_file_io_engine { + Some(v) => v, + None => match crate::virtual_file::io_engine_feature_test() + .context("auto-detect virtual_file_io_engine")? + { + io_engine::FeatureTestResult::PlatformPreferred(v) => v, // make no noise + io_engine::FeatureTestResult::Worse { engine, remark } => { + // TODO: bubble this up to the caller so we can tracing::warn! it. + eprintln!("auto-detected IO engine is not platform-preferred: engine={engine:?} remark={remark:?}"); + engine } }, - "control_plane_api_token" => { - let parsed = parse_toml_string(key, item)?; - if parsed.is_empty() { - builder.control_plane_api_token(None) - } else { - builder.control_plane_api_token(Some(parsed.into())) - } - }, - "control_plane_emergency_mode" => { - builder.control_plane_emergency_mode(parse_toml_bool(key, item)?) - }, - "heatmap_upload_concurrency" => { - builder.heatmap_upload_concurrency(parse_toml_u64(key, item)? as usize) - }, - "secondary_download_concurrency" => { - builder.secondary_download_concurrency(parse_toml_u64(key, item)? as usize) - }, - "ingest_batch_size" => builder.ingest_batch_size(parse_toml_u64(key, item)?), - "virtual_file_io_engine" => { - builder.virtual_file_io_engine(parse_toml_from_str("virtual_file_io_engine", item)?) - } - "max_vectored_read_bytes" => { - let bytes = parse_toml_u64("max_vectored_read_bytes", item)? as usize; - builder.get_max_vectored_read_bytes( - MaxVectoredReadBytes( - NonZeroUsize::new(bytes).expect("Max byte size of vectored read must be greater than 0"))) - } - "image_compression" => { - builder.get_image_compression(parse_toml_from_str("image_compression", item)?) 
- } - "ephemeral_bytes_per_memory_kb" => { - builder.get_ephemeral_bytes_per_memory_kb(parse_toml_u64("ephemeral_bytes_per_memory_kb", item)? as usize) - } - "l0_flush" => { - builder.l0_flush(utils::toml_edit_ext::deserialize_item(item).context("l0_flush")?) - } - "compact_level0_phase1_value_access" => { - builder.compact_level0_phase1_value_access(utils::toml_edit_ext::deserialize_item(item).context("compact_level0_phase1_value_access")?) - } - "virtual_file_direct_io" => { - builder.virtual_file_direct_io(utils::toml_edit_ext::deserialize_item(item).context("virtual_file_direct_io")?) - } - _ => bail!("unrecognized pageserver option '{key}'"), - } - } + }, + l0_flush: l0_flush + .map(crate::l0_flush::L0FlushConfig::from) + .unwrap_or_default(), + }; - let mut conf = builder.build(node_id).context("invalid config")?; + // ------------------------------------------------------------ + // custom validation code that covers more than one field in isolation + // ------------------------------------------------------------ if conf.http_auth_type == AuthType::NeonJWT || conf.pg_auth_type == AuthType::NeonJWT { let auth_validation_public_key_path = conf @@ -1003,7 +439,14 @@ impl PageServerConf { ); } - conf.default_tenant_conf = t_conf.merge(TenantConf::default()); + IndexEntry::validate_checkpoint_distance(conf.default_tenant_conf.checkpoint_distance) + .map_err(anyhow::Error::msg) + .with_context(|| { + format!( + "effective checkpoint distance is unsupported: {}", + conf.default_tenant_conf.checkpoint_distance + ) + })?; Ok(conf) } @@ -1017,129 +460,25 @@ impl PageServerConf { pub fn dummy_conf(repo_dir: Utf8PathBuf) -> Self { let pg_distrib_dir = Utf8PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../pg_install"); - PageServerConf { - id: NodeId(0), + let config_toml = pageserver_api::config::ConfigToml { wait_lsn_timeout: Duration::from_secs(60), wal_redo_timeout: Duration::from_secs(60), - page_cache_size: defaults::DEFAULT_PAGE_CACHE_SIZE, - 
max_file_descriptors: defaults::DEFAULT_MAX_FILE_DESCRIPTORS, - listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(), - listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(), - availability_zone: None, - superuser: "cloud_admin".to_string(), - workdir: repo_dir, - pg_distrib_dir, - http_auth_type: AuthType::Trust, - pg_auth_type: AuthType::Trust, - auth_validation_public_key_path: None, - remote_storage_config: None, - default_tenant_conf: TenantConf::default(), - broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(), - broker_keepalive_interval: Duration::from_secs(5000), - log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(), - concurrent_tenant_warmup: ConfigurableSemaphore::new( - NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP) - .expect("Invalid default constant"), - ), - concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(), - eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::default( - ), + pg_distrib_dir: Some(pg_distrib_dir), metric_collection_interval: Duration::from_secs(60), - metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT, - metric_collection_bucket: None, synthetic_size_calculation_interval: Duration::from_secs(60), - disk_usage_based_eviction: None, - test_remote_failures: 0, - ondemand_download_behavior_treat_error_as_warn: false, background_task_maximum_delay: Duration::ZERO, - control_plane_api: None, - control_plane_api_token: None, - control_plane_emergency_mode: false, - heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY, - secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY, - ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE, - virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(), - max_vectored_read_bytes: MaxVectoredReadBytes( - NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES) - .expect("Invalid default constant"), - ), - 
image_compression: defaults::DEFAULT_IMAGE_COMPRESSION.parse().unwrap(), - ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, - l0_flush: L0FlushConfig::default(), - compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(), - virtual_file_direct_io: virtual_file::DirectIoMode::default(), - } + ..Default::default() + }; + PageServerConf::parse_and_validate(NodeId(0), config_toml, &repo_dir).unwrap() } } -#[derive(Deserialize)] +#[derive(serde::Deserialize, serde::Serialize)] #[serde(deny_unknown_fields)] pub struct PageserverIdentity { pub id: NodeId, } -// Helper functions to parse a toml Item - -fn parse_toml_string(name: &str, item: &Item) -> Result { - let s = item - .as_str() - .with_context(|| format!("configure option {name} is not a string"))?; - Ok(s.to_string()) -} - -fn parse_toml_u64(name: &str, item: &Item) -> Result { - // A toml integer is signed, so it cannot represent the full range of an u64. That's OK - // for our use, though. - let i: i64 = item - .as_integer() - .with_context(|| format!("configure option {name} is not an integer"))?; - if i < 0 { - bail!("configure option {name} cannot be negative"); - } - Ok(i as u64) -} - -fn parse_toml_bool(name: &str, item: &Item) -> Result { - item.as_bool() - .with_context(|| format!("configure option {name} is not a bool")) -} - -fn parse_toml_duration(name: &str, item: &Item) -> Result { - let s = item - .as_str() - .with_context(|| format!("configure option {name} is not a string"))?; - - Ok(humantime::parse_duration(s)?) 
-} - -fn parse_toml_from_str(name: &str, item: &Item) -> anyhow::Result -where - T: FromStr, - ::Err: std::fmt::Display, -{ - let v = item - .as_str() - .with_context(|| format!("configure option {name} is not a string"))?; - T::from_str(v).map_err(|e| { - anyhow!( - "Failed to parse string as {parse_type} for configure option {name}: {e}", - parse_type = stringify!(T) - ) - }) -} - -fn deserialize_from_item(name: &str, item: &Item) -> anyhow::Result -where - T: serde::de::DeserializeOwned, -{ - // ValueDeserializer::new is not public, so use the ValueDeserializer's documented way - let deserializer = match item.clone().into_value() { - Ok(value) => value.into_deserializer(), - Err(item) => anyhow::bail!("toml_edit::Item '{item}' is not a toml_edit::Value"), - }; - T::deserialize(deserializer).with_context(|| format!("deserializing item for node {name}")) -} - /// Configurable semaphore permits setting. /// /// Does not allow semaphore permits to be zero, because at runtime initially zero permits and empty @@ -1201,467 +540,109 @@ impl ConfigurableSemaphore { #[cfg(test)] mod tests { - use std::{fs, num::NonZeroU32}; - use camino_tempfile::{tempdir, Utf8TempDir}; - use pageserver_api::models::EvictionPolicy; - use remote_storage::{RemoteStorageKind, S3Config}; - use utils::serde_percent::Percent; + use camino::Utf8PathBuf; + use utils::id::NodeId; - use super::*; - use crate::DEFAULT_PG_VERSION; - - const ALL_BASE_VALUES_TOML: &str = r#" -# Initial configuration file created by 'pageserver --init' - -listen_pg_addr = '127.0.0.1:64000' -listen_http_addr = '127.0.0.1:9898' - -wait_lsn_timeout = '111 s' -wal_redo_timeout = '111 s' - -page_cache_size = 444 -max_file_descriptors = 333 - -# initial superuser role name to use when creating a new tenant -initial_superuser_name = 'zzzz' - -metric_collection_interval = '222 s' -metric_collection_endpoint = 'http://localhost:80/metrics' -synthetic_size_calculation_interval = '333 s' - -log_format = 'json' 
-background_task_maximum_delay = '334 s' - -"#; + use super::PageServerConf; #[test] - fn parse_defaults() -> anyhow::Result<()> { - let tempdir = tempdir()?; - let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?; - let broker_endpoint = storage_broker::DEFAULT_ENDPOINT; - // we have to create dummy values to overcome the validation errors - let config_string = - format!("pg_distrib_dir='{pg_distrib_dir}'\nbroker_endpoint = '{broker_endpoint}'",); - let toml = config_string.parse()?; - - let parsed_config = PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir) - .unwrap_or_else(|e| panic!("Failed to parse config '{config_string}', reason: {e:?}")); - - assert_eq!( - parsed_config, - PageServerConf { - id: NodeId(10), - listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(), - listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(), - availability_zone: None, - wait_lsn_timeout: humantime::parse_duration(defaults::DEFAULT_WAIT_LSN_TIMEOUT)?, - wal_redo_timeout: humantime::parse_duration(defaults::DEFAULT_WAL_REDO_TIMEOUT)?, - superuser: defaults::DEFAULT_SUPERUSER.to_string(), - page_cache_size: defaults::DEFAULT_PAGE_CACHE_SIZE, - max_file_descriptors: defaults::DEFAULT_MAX_FILE_DESCRIPTORS, - workdir, - pg_distrib_dir, - http_auth_type: AuthType::Trust, - pg_auth_type: AuthType::Trust, - auth_validation_public_key_path: None, - remote_storage_config: None, - default_tenant_conf: TenantConf::default(), - broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(), - broker_keepalive_interval: humantime::parse_duration( - storage_broker::DEFAULT_KEEPALIVE_INTERVAL - )?, - log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(), - concurrent_tenant_warmup: ConfigurableSemaphore::new( - NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP).unwrap() - ), - concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(), - eviction_task_immitated_concurrent_logical_size_queries: - 
ConfigurableSemaphore::default(), - metric_collection_interval: humantime::parse_duration( - defaults::DEFAULT_METRIC_COLLECTION_INTERVAL - )?, - metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT, - metric_collection_bucket: None, - synthetic_size_calculation_interval: humantime::parse_duration( - defaults::DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL - )?, - disk_usage_based_eviction: None, - test_remote_failures: 0, - ondemand_download_behavior_treat_error_as_warn: false, - background_task_maximum_delay: humantime::parse_duration( - defaults::DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY - )?, - control_plane_api: None, - control_plane_api_token: None, - control_plane_emergency_mode: false, - heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY, - secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY, - ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE, - virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(), - max_vectored_read_bytes: MaxVectoredReadBytes( - NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES) - .expect("Invalid default constant") - ), - image_compression: defaults::DEFAULT_IMAGE_COMPRESSION.parse().unwrap(), - ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, - l0_flush: L0FlushConfig::default(), - compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(), - virtual_file_direct_io: virtual_file::DirectIoMode::default(), - }, - "Correct defaults should be used when no config values are provided" - ); - - Ok(()) - } - - #[test] - fn parse_basic_config() -> anyhow::Result<()> { - let tempdir = tempdir()?; - let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?; - let broker_endpoint = storage_broker::DEFAULT_ENDPOINT; - - let config_string = format!( - "{ALL_BASE_VALUES_TOML}pg_distrib_dir='{pg_distrib_dir}'\nbroker_endpoint = '{broker_endpoint}'", - ); - let toml = config_string.parse()?; - - let parsed_config 
= PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir) - .unwrap_or_else(|e| panic!("Failed to parse config '{config_string}', reason: {e:?}")); - - assert_eq!( - parsed_config, - PageServerConf { - id: NodeId(10), - listen_pg_addr: "127.0.0.1:64000".to_string(), - listen_http_addr: "127.0.0.1:9898".to_string(), - availability_zone: None, - wait_lsn_timeout: Duration::from_secs(111), - wal_redo_timeout: Duration::from_secs(111), - superuser: "zzzz".to_string(), - page_cache_size: 444, - max_file_descriptors: 333, - workdir, - pg_distrib_dir, - http_auth_type: AuthType::Trust, - pg_auth_type: AuthType::Trust, - auth_validation_public_key_path: None, - remote_storage_config: None, - default_tenant_conf: TenantConf::default(), - broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(), - broker_keepalive_interval: Duration::from_secs(5), - log_format: LogFormat::Json, - concurrent_tenant_warmup: ConfigurableSemaphore::new( - NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP).unwrap() - ), - concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(), - eviction_task_immitated_concurrent_logical_size_queries: - ConfigurableSemaphore::default(), - metric_collection_interval: Duration::from_secs(222), - metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?), - metric_collection_bucket: None, - synthetic_size_calculation_interval: Duration::from_secs(333), - disk_usage_based_eviction: None, - test_remote_failures: 0, - ondemand_download_behavior_treat_error_as_warn: false, - background_task_maximum_delay: Duration::from_secs(334), - control_plane_api: None, - control_plane_api_token: None, - control_plane_emergency_mode: false, - heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY, - secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY, - ingest_batch_size: 100, - virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(), - max_vectored_read_bytes: 
MaxVectoredReadBytes( - NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES) - .expect("Invalid default constant") - ), - image_compression: defaults::DEFAULT_IMAGE_COMPRESSION.parse().unwrap(), - ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, - l0_flush: L0FlushConfig::default(), - compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(), - virtual_file_direct_io: virtual_file::DirectIoMode::default(), - }, - "Should be able to parse all basic config values correctly" - ); - - Ok(()) - } - - #[test] - fn parse_remote_fs_storage_config() -> anyhow::Result<()> { - let tempdir = tempdir()?; - let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?; - let broker_endpoint = "http://127.0.0.1:7777"; - - let local_storage_path = tempdir.path().join("local_remote_storage"); - - let identical_toml_declarations = &[ - format!( - r#"[remote_storage] -local_path = '{local_storage_path}'"#, - ), - format!("remote_storage={{local_path='{local_storage_path}'}}"), - ]; - - for remote_storage_config_str in identical_toml_declarations { - let config_string = format!( - r#"{ALL_BASE_VALUES_TOML} -pg_distrib_dir='{pg_distrib_dir}' -broker_endpoint = '{broker_endpoint}' - -{remote_storage_config_str}"#, - ); - - let toml = config_string.parse()?; - - let parsed_remote_storage_config = - PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir) - .unwrap_or_else(|e| { - panic!("Failed to parse config '{config_string}', reason: {e:?}") - }) - .remote_storage_config - .expect("Should have remote storage config for the local FS"); - - assert_eq!( - parsed_remote_storage_config, - RemoteStorageConfig { - storage: RemoteStorageKind::LocalFs { local_path: local_storage_path.clone() }, - timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, - }, - "Remote storage config should correctly parse the local FS config and fill other storage defaults" - ); - } - Ok(()) - } - - #[test] - fn parse_remote_s3_storage_config() -> anyhow::Result<()> { 
- let tempdir = tempdir()?; - let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?; - - let bucket_name = "some-sample-bucket".to_string(); - let bucket_region = "eu-north-1".to_string(); - let prefix_in_bucket = "test_prefix".to_string(); - let endpoint = "http://localhost:5000".to_string(); - let max_concurrent_syncs = NonZeroUsize::new(111).unwrap(); - let max_sync_errors = NonZeroU32::new(222).unwrap(); - let s3_concurrency_limit = NonZeroUsize::new(333).unwrap(); - let broker_endpoint = "http://127.0.0.1:7777"; - - let identical_toml_declarations = &[ - format!( - r#"[remote_storage] -max_concurrent_syncs = {max_concurrent_syncs} -max_sync_errors = {max_sync_errors} -bucket_name = '{bucket_name}' -bucket_region = '{bucket_region}' -prefix_in_bucket = '{prefix_in_bucket}' -endpoint = '{endpoint}' -concurrency_limit = {s3_concurrency_limit}"# - ), - format!( - "remote_storage={{max_concurrent_syncs={max_concurrent_syncs}, max_sync_errors={max_sync_errors}, bucket_name='{bucket_name}',\ - bucket_region='{bucket_region}', prefix_in_bucket='{prefix_in_bucket}', endpoint='{endpoint}', concurrency_limit={s3_concurrency_limit}}}", - ), - ]; - - for remote_storage_config_str in identical_toml_declarations { - let config_string = format!( - r#"{ALL_BASE_VALUES_TOML} -pg_distrib_dir='{pg_distrib_dir}' -broker_endpoint = '{broker_endpoint}' - -{remote_storage_config_str}"#, - ); - - let toml = config_string.parse()?; - - let parsed_remote_storage_config = - PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir) - .unwrap_or_else(|e| { - panic!("Failed to parse config '{config_string}', reason: {e:?}") - }) - .remote_storage_config - .expect("Should have remote storage config for S3"); - - assert_eq!( - parsed_remote_storage_config, - RemoteStorageConfig { - storage: RemoteStorageKind::AwsS3(S3Config { - bucket_name: bucket_name.clone(), - bucket_region: bucket_region.clone(), - prefix_in_bucket: Some(prefix_in_bucket.clone()), - endpoint: Some(endpoint.clone()), 
- concurrency_limit: s3_concurrency_limit, - max_keys_per_list_response: None, - upload_storage_class: None, - }), - timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, - }, - "Remote storage config should correctly parse the S3 config" - ); - } - Ok(()) - } - - #[test] - fn parse_incorrect_tenant_config() -> anyhow::Result<()> { - let config_string = r#" - [tenant_config] - checkpoint_distance = -1 # supposed to be an u64 - "# - .to_string(); - - let toml: Document = config_string.parse()?; - let item = toml.get("tenant_config").unwrap(); - let error = TenantConfOpt::try_from(item.to_owned()).unwrap_err(); - - let expected_error_str = "checkpoint_distance: invalid value: integer `-1`, expected u64"; - assert_eq!(error.to_string(), expected_error_str); - - Ok(()) - } - - #[test] - fn parse_override_tenant_config() -> anyhow::Result<()> { - let config_string = r#"tenant_config={ min_resident_size_override = 400 }"#.to_string(); - - let toml: Document = config_string.parse()?; - let item = toml.get("tenant_config").unwrap(); - let conf = TenantConfOpt::try_from(item.to_owned()).unwrap(); - - assert_eq!(conf.min_resident_size_override, Some(400)); - - Ok(()) - } - - #[test] - fn eviction_pageserver_config_parse() -> anyhow::Result<()> { - let tempdir = tempdir()?; - let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?; - - let pageserver_conf_toml = format!( - r#"pg_distrib_dir = "{pg_distrib_dir}" -metric_collection_endpoint = "http://sample.url" -metric_collection_interval = "10min" - -[disk_usage_based_eviction] -max_usage_pct = 80 -min_avail_bytes = 0 -period = "10s" - -[tenant_config] -evictions_low_residence_duration_metric_threshold = "20m" - -[tenant_config.eviction_policy] -kind = "LayerAccessThreshold" -period = "20m" -threshold = "20m" -"#, - ); - let toml: Document = pageserver_conf_toml.parse()?; - let conf = PageServerConf::parse_and_validate(NodeId(333), &toml, &workdir)?; - - assert_eq!(conf.pg_distrib_dir, pg_distrib_dir); - assert_eq!( - 
conf.metric_collection_endpoint, - Some("http://sample.url".parse().unwrap()) - ); - assert_eq!( - conf.metric_collection_interval, - Duration::from_secs(10 * 60) - ); - assert_eq!( - conf.default_tenant_conf - .evictions_low_residence_duration_metric_threshold, - Duration::from_secs(20 * 60) - ); - - // Assert that the node id provided by the indentity file (threaded - // through the call to [`PageServerConf::parse_and_validate`] is - // used. - assert_eq!(conf.id, NodeId(333)); - assert_eq!( - conf.disk_usage_based_eviction, - Some(DiskUsageEvictionTaskConfig { - max_usage_pct: Percent::new(80).unwrap(), - min_avail_bytes: 0, - period: Duration::from_secs(10), - #[cfg(feature = "testing")] - mock_statvfs: None, - eviction_order: Default::default(), - }) - ); - - match &conf.default_tenant_conf.eviction_policy { - EvictionPolicy::LayerAccessThreshold(eviction_threshold) => { - assert_eq!(eviction_threshold.period, Duration::from_secs(20 * 60)); - assert_eq!(eviction_threshold.threshold, Duration::from_secs(20 * 60)); - } - other => unreachable!("Unexpected eviction policy tenant settings: {other:?}"), - } - - Ok(()) - } - - #[test] - fn parse_imitation_only_pageserver_config() { - let tempdir = tempdir().unwrap(); - let (workdir, pg_distrib_dir) = prepare_fs(&tempdir).unwrap(); - - let pageserver_conf_toml = format!( - r#"pg_distrib_dir = "{pg_distrib_dir}" -metric_collection_endpoint = "http://sample.url" -metric_collection_interval = "10min" - -[tenant_config] -evictions_low_residence_duration_metric_threshold = "20m" - -[tenant_config.eviction_policy] -kind = "OnlyImitiate" -period = "20m" -threshold = "20m" -"#, - ); - let toml: Document = pageserver_conf_toml.parse().unwrap(); - let conf = PageServerConf::parse_and_validate(NodeId(222), &toml, &workdir).unwrap(); - - match &conf.default_tenant_conf.eviction_policy { - EvictionPolicy::OnlyImitiate(t) => { - assert_eq!(t.period, Duration::from_secs(20 * 60)); - assert_eq!(t.threshold, Duration::from_secs(20 * 
60)); - } - other => unreachable!("Unexpected eviction policy tenant settings: {other:?}"), - } - } - - #[test] - fn empty_remote_storage_is_error() { - let tempdir = tempdir().unwrap(); - let (workdir, _) = prepare_fs(&tempdir).unwrap(); + fn test_empty_config_toml_is_valid() { + // we use Default impl of everything in this situation let input = r#" -remote_storage = {} "#; - let doc = toml_edit::Document::from_str(input).unwrap(); - let err = PageServerConf::parse_and_validate(NodeId(222), &doc, &workdir) - .expect_err("empty remote_storage field should fail, don't specify it if you want no remote_storage"); - assert!(format!("{err}").contains("remote_storage"), "{err}"); + let config_toml = toml_edit::de::from_str::(input) + .expect("empty config is valid"); + let workdir = Utf8PathBuf::from("/nonexistent"); + PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir) + .expect("parse_and_validate"); } - fn prepare_fs(tempdir: &Utf8TempDir) -> anyhow::Result<(Utf8PathBuf, Utf8PathBuf)> { - let tempdir_path = tempdir.path(); + #[test] + fn test_compactl0_phase1_access_mode_is_ignored_silently() { + let input = indoc::indoc! {r#" + [compact_level0_phase1_value_access] + mode = "streaming-kmerge" + validate = "key-lsn-value" + "#}; + toml_edit::de::from_str::(input).unwrap(); + } - let workdir = tempdir_path.join("workdir"); - fs::create_dir_all(&workdir)?; + /// If there's a typo in the pageserver config, we'd rather catch that typo + /// and fail pageserver startup than silently ignoring the typo, leaving whoever + /// made it in the believe that their config change is effective. + /// + /// The default in serde is to allow unknown fields, so, we rely + /// on developer+review discipline to add `deny_unknown_fields` when adding + /// new structs to the config, and these tests here as a regression test. + /// + /// The alternative to all of this would be to allow unknown fields in the config. 
+ /// To catch them, we could have a config check tool or mgmt API endpoint that + /// compares the effective config with the TOML on disk and makes sure that + /// the on-disk TOML is a strict subset of the effective config. + mod unknown_fields_handling { + macro_rules! test { + ($short_name:ident, $input:expr) => { + #[test] + fn $short_name() { + let input = $input; + let err = toml_edit::de::from_str::(&input) + .expect_err("some_invalid_field is an invalid field"); + dbg!(&err); + assert!(err.to_string().contains("some_invalid_field")); + } + }; + } + use indoc::indoc; - let pg_distrib_dir = tempdir_path.join("pg_distrib"); - let pg_distrib_dir_versioned = pg_distrib_dir.join(format!("v{DEFAULT_PG_VERSION}")); - fs::create_dir_all(&pg_distrib_dir_versioned)?; - let postgres_bin_dir = pg_distrib_dir_versioned.join("bin"); - fs::create_dir_all(&postgres_bin_dir)?; - fs::write(postgres_bin_dir.join("postgres"), "I'm postgres, trust me")?; + test!( + toplevel, + indoc! {r#" + some_invalid_field = 23 + "#} + ); - Ok((workdir, pg_distrib_dir)) + test!( + toplevel_nested, + indoc! {r#" + [some_invalid_field] + foo = 23 + "#} + ); + + test!( + disk_usage_based_eviction, + indoc! {r#" + [disk_usage_based_eviction] + some_invalid_field = 23 + "#} + ); + + test!( + tenant_config, + indoc! {r#" + [tenant_config] + some_invalid_field = 23 + "#} + ); + + test!( + l0_flush, + indoc! {r#" + [l0_flush] + mode = "direct" + some_invalid_field = 23 + "#} + ); + + // TODO: fix this => https://github.com/neondatabase/neon/issues/8915 + // test!( + // remote_storage_config, + // indoc! {r#" + // [remote_storage_config] + // local_path = "/nonexistent" + // some_invalid_field = 23 + // "#} + // ); } } diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index f94d945d46..0c7630edca 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -1,6 +1,8 @@ //! 
Periodically collect consumption metrics for all active tenants //! and push them to a HTTP endpoint. use crate::config::PageServerConf; +use crate::consumption_metrics::metrics::MetricsKey; +use crate::consumption_metrics::upload::KeyGen as _; use crate::context::{DownloadBehavior, RequestContext}; use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}; use crate::tenant::size::CalculateSyntheticSizeError; @@ -8,6 +10,7 @@ use crate::tenant::tasks::BackgroundLoopKind; use crate::tenant::{mgr::TenantManager, LogicalSizeCalculationCause, Tenant}; use camino::Utf8PathBuf; use consumption_metrics::EventType; +use itertools::Itertools as _; use pageserver_api::models::TenantState; use remote_storage::{GenericRemoteStorage, RemoteStorageConfig}; use reqwest::Url; @@ -19,9 +22,8 @@ use tokio_util::sync::CancellationToken; use tracing::*; use utils::id::NodeId; -mod metrics; -use crate::consumption_metrics::metrics::MetricsKey; mod disk_cache; +mod metrics; mod upload; const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60); @@ -143,6 +145,12 @@ async fn collect_metrics( // these are point in time, with variable "now" let metrics = metrics::collect_all_metrics(&tenant_manager, &cached_metrics, &ctx).await; + // Pre-generate event idempotency keys, to reuse them across the bucket + // and HTTP sinks. + let idempotency_keys = std::iter::repeat_with(|| node_id.as_str().generate()) + .take(metrics.len()) + .collect_vec(); + let metrics = Arc::new(metrics); // why not race cancellation here? 
because we are one of the last tasks, and if we are @@ -161,10 +169,16 @@ async fn collect_metrics( } if let Some(bucket_client) = &bucket_client { - let res = - upload::upload_metrics_bucket(bucket_client, &cancel, &node_id, &metrics).await; + let res = upload::upload_metrics_bucket( + bucket_client, + &cancel, + &node_id, + &metrics, + &idempotency_keys, + ) + .await; if let Err(e) = res { - tracing::error!("failed to upload to S3: {e:#}"); + tracing::error!("failed to upload to remote storage: {e:#}"); } } }; @@ -174,9 +188,9 @@ async fn collect_metrics( &client, metric_collection_endpoint, &cancel, - &node_id, &metrics, &mut cached_metrics, + &idempotency_keys, ) .await; if let Err(e) = res { diff --git a/pageserver/src/consumption_metrics/upload.rs b/pageserver/src/consumption_metrics/upload.rs index 4e8283c3e4..0325ee403a 100644 --- a/pageserver/src/consumption_metrics/upload.rs +++ b/pageserver/src/consumption_metrics/upload.rs @@ -24,16 +24,16 @@ pub(super) async fn upload_metrics_http( client: &reqwest::Client, metric_collection_endpoint: &reqwest::Url, cancel: &CancellationToken, - node_id: &str, metrics: &[RawMetric], cached_metrics: &mut Cache, + idempotency_keys: &[IdempotencyKey<'_>], ) -> anyhow::Result<()> { let mut uploaded = 0; let mut failed = 0; let started_at = std::time::Instant::now(); - let mut iter = serialize_in_chunks(CHUNK_SIZE, metrics, node_id); + let mut iter = serialize_in_chunks(CHUNK_SIZE, metrics, idempotency_keys); while let Some(res) = iter.next() { let (chunk, body) = res?; @@ -87,6 +87,7 @@ pub(super) async fn upload_metrics_bucket( cancel: &CancellationToken, node_id: &str, metrics: &[RawMetric], + idempotency_keys: &[IdempotencyKey<'_>], ) -> anyhow::Result<()> { if metrics.is_empty() { // Skip uploads if we have no metrics, so that readers don't have to handle the edge case @@ -106,7 +107,7 @@ pub(super) async fn upload_metrics_bucket( // Serialize and write into compressed buffer let started_at = std::time::Instant::now(); 
- for res in serialize_in_chunks(CHUNK_SIZE, metrics, node_id) { + for res in serialize_in_chunks(CHUNK_SIZE, metrics, idempotency_keys) { let (_chunk, body) = res?; gzip_writer.write_all(&body).await?; } @@ -134,29 +135,31 @@ pub(super) async fn upload_metrics_bucket( Ok(()) } -// The return type is quite ugly, but we gain testability in isolation -fn serialize_in_chunks<'a, F>( +/// Serializes the input metrics as JSON in chunks of chunk_size. The provided +/// idempotency keys are injected into the corresponding metric events (reused +/// across different metrics sinks), and must have the same length as input. +fn serialize_in_chunks<'a>( chunk_size: usize, input: &'a [RawMetric], - factory: F, + idempotency_keys: &'a [IdempotencyKey<'a>], ) -> impl ExactSizeIterator> + 'a -where - F: KeyGen<'a> + 'a, { use bytes::BufMut; - struct Iter<'a, F> { + assert_eq!(input.len(), idempotency_keys.len()); + + struct Iter<'a> { inner: std::slice::Chunks<'a, RawMetric>, + idempotency_keys: std::slice::Iter<'a, IdempotencyKey<'a>>, chunk_size: usize, // write to a BytesMut so that we can cheaply clone the frozen Bytes for retries buffer: bytes::BytesMut, // chunk amount of events are reused to produce the serialized document scratch: Vec>, - factory: F, } - impl<'a, F: KeyGen<'a>> Iterator for Iter<'a, F> { + impl<'a> Iterator for Iter<'a> { type Item = Result<(&'a [RawMetric], bytes::Bytes), serde_json::Error>; fn next(&mut self) -> Option { @@ -167,17 +170,14 @@ where self.scratch.extend( chunk .iter() - .map(|raw_metric| raw_metric.as_event(&self.factory.generate())), + .zip(&mut self.idempotency_keys) + .map(|(raw_metric, key)| raw_metric.as_event(key)), ); } else { // next rounds: update_in_place to reuse allocations assert_eq!(self.scratch.len(), self.chunk_size); - self.scratch - .iter_mut() - .zip(chunk.iter()) - .for_each(|(slot, raw_metric)| { - raw_metric.update_in_place(slot, &self.factory.generate()) - }); + itertools::izip!(self.scratch.iter_mut(), chunk, &mut 
self.idempotency_keys) + .for_each(|(slot, raw_metric, key)| raw_metric.update_in_place(slot, key)); } let res = serde_json::to_writer( @@ -198,18 +198,19 @@ where } } - impl<'a, F: KeyGen<'a>> ExactSizeIterator for Iter<'a, F> {} + impl<'a> ExactSizeIterator for Iter<'a> {} let buffer = bytes::BytesMut::new(); let inner = input.chunks(chunk_size); + let idempotency_keys = idempotency_keys.iter(); let scratch = Vec::new(); Iter { inner, + idempotency_keys, chunk_size, buffer, scratch, - factory, } } @@ -268,7 +269,7 @@ impl RawMetricExt for RawMetric { } } -trait KeyGen<'a>: Copy { +pub(crate) trait KeyGen<'a> { fn generate(&self) -> IdempotencyKey<'a>; } @@ -389,7 +390,10 @@ mod tests { let examples = metric_samples(); assert!(examples.len() > 1); - let factory = FixedGen::new(Utc::now(), "1", 42); + let now = Utc::now(); + let idempotency_keys = (0..examples.len()) + .map(|i| FixedGen::new(now, "1", i as u16).generate()) + .collect::>(); // need to use Event here because serde_json::Value uses default hashmap, not linked // hashmap @@ -398,13 +402,13 @@ mod tests { events: Vec>, } - let correct = serialize_in_chunks(examples.len(), &examples, factory) + let correct = serialize_in_chunks(examples.len(), &examples, &idempotency_keys) .map(|res| res.unwrap().1) .flat_map(|body| serde_json::from_slice::(&body).unwrap().events) .collect::>(); for chunk_size in 1..examples.len() { - let actual = serialize_in_chunks(chunk_size, &examples, factory) + let actual = serialize_in_chunks(chunk_size, &examples, &idempotency_keys) .map(|res| res.unwrap().1) .flat_map(|body| serde_json::from_slice::(&body).unwrap().events) .collect::>(); diff --git a/pageserver/src/context.rs b/pageserver/src/context.rs index 0b07e07524..7afcf52cf2 100644 --- a/pageserver/src/context.rs +++ b/pageserver/src/context.rs @@ -1,7 +1,9 @@ -//! This module defines `RequestContext`, a structure that we use throughout -//! the pageserver to propagate high-level context from places -//! 
that _originate_ activity down to the shared code paths at the -//! heart of the pageserver. It's inspired by Golang's `context.Context`. +//! Defines [`RequestContext`]. +//! +//! It is a structure that we use throughout the pageserver to propagate +//! high-level context from places that _originate_ activity down to the +//! shared code paths at the heart of the pageserver. It's inspired by +//! Golang's `context.Context`. //! //! For example, in `Timeline::get(page_nr, lsn)` we need to answer the following questions: //! 1. What high-level activity ([`TaskKind`]) needs this page? @@ -105,8 +107,10 @@ pub struct RequestContext { #[derive(Clone, Copy, PartialEq, Eq, Debug, enum_map::Enum, strum_macros::IntoStaticStr)] pub enum PageContentKind { Unknown, + DeltaLayerSummary, DeltaLayerBtreeNode, DeltaLayerValue, + ImageLayerSummary, ImageLayerBtreeNode, ImageLayerValue, InMemoryLayer, diff --git a/pageserver/src/control_plane_client.rs b/pageserver/src/control_plane_client.rs index b5d9267d79..f6d1c35a8c 100644 --- a/pageserver/src/control_plane_client.rs +++ b/pageserver/src/control_plane_client.rs @@ -141,12 +141,32 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient { m.other ); + let az_id = { + let az_id_from_metadata = m + .other + .get("availability_zone_id") + .and_then(|jv| jv.as_str().map(|str| str.to_owned())); + + match az_id_from_metadata { + Some(az_id) => Some(az_id), + None => { + tracing::warn!("metadata.json does not contain an 'availability_zone_id' field"); + conf.availability_zone.clone() + } + } + }; + + if az_id.is_none() { + panic!("Availablity zone id could not be inferred from metadata.json or pageserver config"); + } + Some(NodeRegisterRequest { node_id: conf.id, listen_pg_addr: m.postgres_host, listen_pg_port: m.postgres_port, listen_http_addr: m.http_host, listen_http_port: m.http_port, + availability_zone_id: az_id.expect("Checked above"), }) } Err(e) => { diff --git a/pageserver/src/disk_usage_eviction_task.rs 
b/pageserver/src/disk_usage_eviction_task.rs index 5e4a49bc56..a58fa2c0b1 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -41,19 +41,15 @@ // - The `#[allow(dead_code)]` above various structs are to suppress warnings about only the Debug impl // reading these fields. We use the Debug impl for semi-structured logging, though. -use std::{ - sync::Arc, - time::{Duration, SystemTime}, -}; +use std::{sync::Arc, time::SystemTime}; use anyhow::Context; -use pageserver_api::shard::TenantShardId; +use pageserver_api::{config::DiskUsageEvictionTaskConfig, shard::TenantShardId}; use remote_storage::GenericRemoteStorage; -use serde::{Deserialize, Serialize}; +use serde::Serialize; use tokio::time::Instant; use tokio_util::sync::CancellationToken; use tracing::{debug, error, info, instrument, warn, Instrument}; -use utils::serde_percent::Percent; use utils::{completion, id::TimelineId}; use crate::{ @@ -69,23 +65,9 @@ use crate::{ CancellableTask, DiskUsageEvictionTask, }; -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] -pub struct DiskUsageEvictionTaskConfig { - pub max_usage_pct: Percent, - pub min_avail_bytes: u64, - #[serde(with = "humantime_serde")] - pub period: Duration, - #[cfg(feature = "testing")] - pub mock_statvfs: Option, - /// Select sorting for evicted layers - #[serde(default)] - pub eviction_order: EvictionOrder, -} - /// Selects the sort order for eviction candidates *after* per tenant `min_resident_size` /// partitioning. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] -#[serde(tag = "type", content = "args")] +#[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum EvictionOrder { /// Order the layers to be evicted by how recently they have been accessed relatively within /// the set of resident layers of a tenant. @@ -96,23 +78,22 @@ pub enum EvictionOrder { /// we read tenants is deterministic. 
If we find the need to use this as `false`, we need /// to ensure nondeterminism by adding in a random number to break the /// `relative_last_activity==0.0` ties. - #[serde(default = "default_highest_layer_count_loses_first")] highest_layer_count_loses_first: bool, }, } -impl Default for EvictionOrder { - fn default() -> Self { - Self::RelativeAccessed { - highest_layer_count_loses_first: true, +impl From for EvictionOrder { + fn from(value: pageserver_api::config::EvictionOrder) -> Self { + match value { + pageserver_api::config::EvictionOrder::RelativeAccessed { + highest_layer_count_loses_first, + } => Self::RelativeAccessed { + highest_layer_count_loses_first, + }, } } } -fn default_highest_layer_count_loses_first() -> bool { - true -} - impl EvictionOrder { fn sort(&self, candidates: &mut [(EvictionPartition, EvictionCandidate)]) { use EvictionOrder::*; @@ -295,7 +276,7 @@ async fn disk_usage_eviction_task_iteration( storage, usage_pre, tenant_manager, - task_config.eviction_order, + task_config.eviction_order.into(), cancel, ) .await; @@ -1257,7 +1238,6 @@ mod filesystem_level_usage { #[test] fn max_usage_pct_pressure() { - use super::EvictionOrder; use super::Usage as _; use std::time::Duration; use utils::serde_percent::Percent; @@ -1269,7 +1249,7 @@ mod filesystem_level_usage { period: Duration::MAX, #[cfg(feature = "testing")] mock_statvfs: None, - eviction_order: EvictionOrder::default(), + eviction_order: pageserver_api::config::EvictionOrder::default(), }, total_bytes: 100_000, avail_bytes: 0, diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index a4da8506d6..d645f3b7b6 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -318,6 +318,27 @@ impl From for ApiError { } } +impl From for ApiError { + fn from(value: crate::tenant::TimelineArchivalError) -> Self { + use crate::tenant::TimelineArchivalError::*; + match value { + NotFound => ApiError::NotFound(anyhow::anyhow!("timeline not found").into()), 
+ Timeout => ApiError::Timeout("hit pageserver internal timeout".into()), + e @ HasArchivedParent(_) => { + ApiError::PreconditionFailed(e.to_string().into_boxed_str()) + } + HasUnarchivedChildren(children) => ApiError::PreconditionFailed( + format!( + "Cannot archive timeline which has non-archived child timelines: {children:?}" + ) + .into_boxed_str(), + ), + a @ AlreadyInProgress => ApiError::Conflict(a.to_string()), + Other(e) => ApiError::InternalServerError(e), + } + } +} + impl From for ApiError { fn from(value: crate::tenant::mgr::DeleteTimelineError) -> Self { use crate::tenant::mgr::DeleteTimelineError::*; @@ -405,6 +426,8 @@ async fn build_timeline_info_common( let current_logical_size = timeline.get_current_logical_size(logical_size_task_priority, ctx); let current_physical_size = Some(timeline.layer_size_sum().await); let state = timeline.current_state(); + // Report is_archived = false if the timeline is still loading + let is_archived = timeline.is_archived().unwrap_or(false); let remote_consistent_lsn_projected = timeline .get_remote_consistent_lsn_projected() .unwrap_or(Lsn(0)); @@ -445,6 +468,7 @@ async fn build_timeline_info_common( pg_version: timeline.pg_version, state, + is_archived: Some(is_archived), walreceiver_status, @@ -686,9 +710,7 @@ async fn timeline_archival_config_handler( tenant .apply_timeline_archival_config(timeline_id, request_data.state) - .await - .context("applying archival config") - .map_err(ApiError::InternalServerError)?; + .await?; Ok::<_, ApiError>(()) } .instrument(info_span!("timeline_archival_config", @@ -852,7 +874,10 @@ async fn get_timestamp_of_lsn_handler( match result { Some(time) => { - let time = format_rfc3339(postgres_ffi::from_pg_timestamp(time)).to_string(); + let time = format_rfc3339( + postgres_ffi::try_from_pg_timestamp(time).map_err(ApiError::InternalServerError)?, + ) + .to_string(); json_response(StatusCode::OK, time) } None => Err(ApiError::NotFound( @@ -1706,13 +1731,12 @@ async fn 
timeline_compact_handler( flags |= CompactFlags::ForceImageLayerCreation; } if Some(true) == parse_query_param::<_, bool>(&request, "enhanced_gc_bottom_most_compaction")? { - if !cfg!(feature = "testing") { - return Err(ApiError::InternalServerError(anyhow!( - "enhanced_gc_bottom_most_compaction is only available in testing mode" - ))); - } flags |= CompactFlags::EnhancedGcBottomMostCompaction; } + if Some(true) == parse_query_param::<_, bool>(&request, "dry_run")? { + flags |= CompactFlags::DryRun; + } + let wait_until_uploaded = parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false); @@ -2052,7 +2076,7 @@ async fn disk_usage_eviction_run( evict_bytes: u64, #[serde(default)] - eviction_order: crate::disk_usage_eviction_task::EvictionOrder, + eviction_order: pageserver_api::config::EvictionOrder, } #[derive(Debug, Clone, Copy, serde::Serialize)] @@ -2088,7 +2112,7 @@ async fn disk_usage_eviction_run( &state.remote_storage, usage, &state.tenant_manager, - config.eviction_order, + config.eviction_order.into(), &cancel, ) .await; @@ -2330,6 +2354,20 @@ async fn put_io_engine_handler( json_response(StatusCode::OK, ()) } +async fn put_io_alignment_handler( + mut r: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + check_permission(&r, None)?; + let align: usize = json_request(&mut r).await?; + crate::virtual_file::set_io_buffer_alignment(align).map_err(|align| { + ApiError::PreconditionFailed( + format!("Requested io alignment ({align}) is not a power of two").into(), + ) + })?; + json_response(StatusCode::OK, ()) +} + /// Polled by control plane. /// /// See [`crate::utilization`]. 
@@ -2942,7 +2980,7 @@ pub fn make_router( ) .put( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/compact", - |r| testing_api_handler("run timeline compaction", r, timeline_compact_handler), + |r| api_handler(r, timeline_compact_handler), ) .put( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/checkpoint", @@ -3017,6 +3055,9 @@ pub fn make_router( |r| api_handler(r, timeline_collect_keyspace), ) .put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler)) + .put("/v1/io_alignment", |r| { + api_handler(r, put_io_alignment_handler) + }) .put( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/force_aux_policy_switch", |r| api_handler(r, force_aux_policy_switch_handler), diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index ed409d3130..ca87f1d080 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -19,6 +19,7 @@ use crate::metrics::WAL_INGEST; use crate::pgdatadir_mapping::*; use crate::tenant::Timeline; use crate::walingest::WalIngest; +use crate::walrecord::decode_wal_record; use crate::walrecord::DecodedWALRecord; use pageserver_api::reltag::{RelTag, SlruKind}; use postgres_ffi::pg_constants; @@ -310,11 +311,13 @@ async fn import_wal( let mut nrecords = 0; let mut modification = tline.begin_modification(last_lsn); - let mut decoded = DecodedWALRecord::default(); while last_lsn <= endpoint { if let Some((lsn, recdata)) = waldecoder.poll_decode()? 
{ + let mut decoded = DecodedWALRecord::default(); + decode_wal_record(recdata, &mut decoded, tline.pg_version)?; + walingest - .ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx) + .ingest_record(decoded, lsn, &mut modification, ctx) .await?; WAL_INGEST.records_committed.inc(); @@ -449,11 +452,12 @@ pub async fn import_wal_from_tar( waldecoder.feed_bytes(&bytes[offset..]); let mut modification = tline.begin_modification(last_lsn); - let mut decoded = DecodedWALRecord::default(); while last_lsn <= end_lsn { if let Some((lsn, recdata)) = waldecoder.poll_decode()? { + let mut decoded = DecodedWALRecord::default(); + decode_wal_record(recdata, &mut decoded, tline.pg_version)?; walingest - .ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx) + .ingest_record(decoded, lsn, &mut modification, ctx) .await?; modification.commit(ctx).await?; last_lsn = lsn; @@ -576,9 +580,11 @@ async fn import_file( import_slru(modification, slru, file_path, reader, len, ctx).await?; debug!("imported multixact members slru"); } else if file_path.starts_with("pg_twophase") { - let xid = u32::from_str_radix(file_name.as_ref(), 16)?; - let bytes = read_all_bytes(reader).await?; + + // In PostgreSQL v17, this is a 64-bit FullTransactionid. In previous versions, + // it's a 32-bit TransactionId, which fits in u64 anyway. 
+ let xid = u64::from_str_radix(file_name.as_ref(), 16)?; modification .put_twophase_file(xid, Bytes::copy_from_slice(&bytes[..]), ctx) .await?; diff --git a/pageserver/src/l0_flush.rs b/pageserver/src/l0_flush.rs index 313a7961a6..491c9fb96c 100644 --- a/pageserver/src/l0_flush.rs +++ b/pageserver/src/l0_flush.rs @@ -1,9 +1,7 @@ use std::{num::NonZeroUsize, sync::Arc}; -#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize)] -#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)] +#[derive(Debug, PartialEq, Eq, Clone)] pub enum L0FlushConfig { - #[serde(rename_all = "snake_case")] Direct { max_concurrency: NonZeroUsize }, } @@ -16,6 +14,16 @@ impl Default for L0FlushConfig { } } +impl From for L0FlushConfig { + fn from(config: pageserver_api::models::L0FlushConfig) -> Self { + match config { + pageserver_api::models::L0FlushConfig::Direct { max_concurrency } => { + Self::Direct { max_concurrency } + } + } + } +} + #[derive(Clone)] pub struct L0FlushGlobalState(Arc); diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index dbfc9f3544..7a9cf495c7 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -16,6 +16,7 @@ pub mod l0_flush; use futures::{stream::FuturesUnordered, StreamExt}; pub use pageserver_api::keyspace; use tokio_util::sync::CancellationToken; +mod assert_u64_eq_usize; pub mod aux_file; pub mod metrics; pub mod page_cache; diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 0a1a22b6e8..72229d80be 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -9,7 +9,7 @@ use metrics::{ use once_cell::sync::Lazy; use pageserver_api::shard::TenantShardId; use strum::{EnumCount, VariantNames}; -use strum_macros::{EnumVariantNames, IntoStaticStr}; +use strum_macros::{IntoStaticStr, VariantNames}; use tracing::warn; use utils::id::TimelineId; @@ -27,7 +27,7 @@ const CRITICAL_OP_BUCKETS: &[f64] = &[ ]; // Metrics collected on operations on the storage repository. 
-#[derive(Debug, EnumVariantNames, IntoStaticStr)] +#[derive(Debug, VariantNames, IntoStaticStr)] #[strum(serialize_all = "kebab_case")] pub(crate) enum StorageTimeOperation { #[strum(serialize = "layer flush")] @@ -1552,7 +1552,6 @@ pub(crate) static LIVE_CONNECTIONS: Lazy = Lazy::new(|| { #[derive(Clone, Copy, enum_map::Enum, IntoStaticStr)] pub(crate) enum ComputeCommandKind { PageStreamV2, - PageStream, Basebackup, Fullbackup, LeaseLsn, @@ -1778,7 +1777,7 @@ pub(crate) static SECONDARY_MODE: Lazy = Lazy::new(|| { .expect("failed to define a metric"), upload_heatmap_duration: register_histogram!( "pageserver_secondary_upload_heatmap_duration", - "Time to build and upload a heatmap, including any waiting inside the S3 client" + "Time to build and upload a heatmap, including any waiting inside the remote storage client" ) .expect("failed to define a metric"), download_heatmap: register_int_counter!( @@ -1803,6 +1802,14 @@ pub(crate) static SECONDARY_RESIDENT_PHYSICAL_SIZE: Lazy = Lazy::n .expect("failed to define a metric") }); +pub(crate) static NODE_UTILIZATION_SCORE: Lazy = Lazy::new(|| { + register_uint_gauge!( + "pageserver_utilization_score", + "The utilization score we report to the storage controller for scheduling, where 0 is empty, 1000000 is full, and anything above is considered overloaded", + ) + .expect("failed to define a metric") +}); + pub(crate) static SECONDARY_HEATMAP_TOTAL_SIZE: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_secondary_heatmap_total_size", diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 81294291a9..9261b7481d 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -557,7 +557,7 @@ impl PageServerHandler { pgb: &mut PostgresBackend, tenant_id: TenantId, timeline_id: TimelineId, - protocol_version: PagestreamProtocolVersion, + _protocol_version: PagestreamProtocolVersion, ctx: RequestContext, ) -> Result<(), QueryError> where @@ -601,8 +601,7 @@ impl 
PageServerHandler { fail::fail_point!("ps::handle-pagerequest-message"); // parse request - let neon_fe_msg = - PagestreamFeMessage::parse(&mut copy_data_bytes.reader(), protocol_version)?; + let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader())?; // invoke handler function let (handler_result, span) = match neon_fe_msg { @@ -754,16 +753,21 @@ impl PageServerHandler { } if request_lsn < **latest_gc_cutoff_lsn { - // Check explicitly for INVALID just to get a less scary error message if the - // request is obviously bogus - return Err(if request_lsn == Lsn::INVALID { - PageStreamError::BadRequest("invalid LSN(0) in request".into()) - } else { - PageStreamError::BadRequest(format!( + let gc_info = &timeline.gc_info.read().unwrap(); + if !gc_info.leases.contains_key(&request_lsn) { + // The requested LSN is below gc cutoff and is not guarded by a lease. + + // Check explicitly for INVALID just to get a less scary error message if the + // request is obviously bogus + return Err(if request_lsn == Lsn::INVALID { + PageStreamError::BadRequest("invalid LSN(0) in request".into()) + } else { + PageStreamError::BadRequest(format!( "tried to request a page version that was garbage collected. requested at {} gc cutoff {}", request_lsn, **latest_gc_cutoff_lsn ).into()) - }); + }); + } } // Wait for WAL up to 'not_modified_since' to arrive, if necessary @@ -790,6 +794,8 @@ impl PageServerHandler { } } + /// Handles the lsn lease request. + /// If a lease cannot be obtained, the client will receive NULL. 
#[instrument(skip_all, fields(shard_id, %lsn))] async fn handle_make_lsn_lease( &mut self, @@ -812,19 +818,25 @@ impl PageServerHandler { .await?; set_tracing_field_shard_id(&timeline); - let lease = timeline.make_lsn_lease(lsn, timeline.get_lsn_lease_length(), ctx)?; - let valid_until = lease - .valid_until - .duration_since(SystemTime::UNIX_EPOCH) - .map_err(|e| QueryError::Other(e.into()))?; + let lease = timeline + .make_lsn_lease(lsn, timeline.get_lsn_lease_length(), ctx) + .inspect_err(|e| { + warn!("{e}"); + }) + .ok(); + let valid_until_str = lease.map(|l| { + l.valid_until + .duration_since(SystemTime::UNIX_EPOCH) + .expect("valid_until is earlier than UNIX_EPOCH") + .as_millis() + .to_string() + }); + let bytes = valid_until_str.as_ref().map(|x| x.as_bytes()); pgb.write_message_noflush(&BeMessage::RowDescription(&[RowDescriptor::text_col( b"valid_until", )]))? - .write_message_noflush(&BeMessage::DataRow(&[Some( - &valid_until.as_millis().to_be_bytes(), - )]))? - .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; + .write_message_noflush(&BeMessage::DataRow(&[bytes]))?; Ok(()) } @@ -1187,7 +1199,6 @@ impl PageServerHandler { } } -#[async_trait::async_trait] impl postgres_backend::Handler for PageServerHandler where IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, @@ -1275,35 +1286,6 @@ where ctx, ) .await?; - } else if let Some(params) = parts.strip_prefix(&["pagestream"]) { - if params.len() != 2 { - return Err(QueryError::Other(anyhow::anyhow!( - "invalid param number for pagestream command" - ))); - } - let tenant_id = TenantId::from_str(params[0]) - .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?; - let timeline_id = TimelineId::from_str(params[1]) - .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?; - - tracing::Span::current() - .record("tenant_id", field::display(tenant_id)) - .record("timeline_id", field::display(timeline_id)); - - self.check_permission(Some(tenant_id))?; - - 
COMPUTE_COMMANDS_COUNTERS - .for_command(ComputeCommandKind::PageStream) - .inc(); - - self.handle_pagerequests( - pgb, - tenant_id, - timeline_id, - PagestreamProtocolVersion::V1, - ctx, - ) - .await?; } else if let Some(params) = parts.strip_prefix(&["basebackup"]) { if params.len() < 2 { return Err(QueryError::Other(anyhow::anyhow!( diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 4f7eb1a00c..5f8766ca2c 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -15,12 +15,11 @@ use crate::{aux_file, repository::*}; use anyhow::{ensure, Context}; use bytes::{Buf, Bytes, BytesMut}; use enum_map::Enum; -use itertools::Itertools; use pageserver_api::key::{ dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key, relmap_file_key, repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key, slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range, - AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY, + CompactKey, AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY, }; use pageserver_api::keyspace::SparseKeySpace; use pageserver_api::models::AuxFilePolicy; @@ -37,7 +36,6 @@ use tokio_util::sync::CancellationToken; use tracing::{debug, info, trace, warn}; use utils::bin_ser::DeserializeError; use utils::pausable_failpoint; -use utils::vec_map::{VecMap, VecMapOrdering}; use utils::{bin_ser::BeSer, lsn::Lsn}; /// Max delta records appended to the AUX_FILES_KEY (for aux v1). The write path will write a full image once this threshold is reached. 
@@ -170,10 +168,13 @@ impl Timeline { DatadirModification { tline: self, pending_lsns: Vec::new(), - pending_updates: HashMap::new(), + pending_metadata_pages: HashMap::new(), + pending_data_pages: Vec::new(), + pending_zero_data_pages: Default::default(), pending_deletions: Vec::new(), pending_nblocks: 0, pending_directory_entries: Vec::new(), + pending_bytes: 0, lsn, } } @@ -632,7 +633,7 @@ impl Timeline { pub(crate) async fn get_twophase_file( &self, - xid: TransactionId, + xid: u64, lsn: Lsn, ctx: &RequestContext, ) -> Result { @@ -645,11 +646,19 @@ impl Timeline { &self, lsn: Lsn, ctx: &RequestContext, - ) -> Result, PageReconstructError> { + ) -> Result, PageReconstructError> { // fetch directory entry let buf = self.get(TWOPHASEDIR_KEY, lsn, ctx).await?; - Ok(TwoPhaseDirectory::des(&buf)?.xids) + if self.pg_version >= 17 { + Ok(TwoPhaseDirectoryV17::des(&buf)?.xids) + } else { + Ok(TwoPhaseDirectory::des(&buf)? + .xids + .iter() + .map(|x| u64::from(*x)) + .collect()) + } } pub(crate) async fn get_control_file( @@ -727,7 +736,21 @@ impl Timeline { ) -> Result, PageReconstructError> { let current_policy = self.last_aux_file_policy.load(); match current_policy { - Some(AuxFilePolicy::V1) | None => self.list_aux_files_v1(lsn, ctx).await, + Some(AuxFilePolicy::V1) => { + let res = self.list_aux_files_v1(lsn, ctx).await?; + let empty_str = if res.is_empty() { ", empty" } else { "" }; + warn!( + "this timeline is using deprecated aux file policy V1 (policy=v1{empty_str})" + ); + Ok(res) + } + None => { + let res = self.list_aux_files_v1(lsn, ctx).await?; + if !res.is_empty() { + warn!("this timeline is using deprecated aux file policy V1 (policy=None)"); + } + Ok(res) + } Some(AuxFilePolicy::V2) => self.list_aux_files_v2(lsn, ctx).await, Some(AuxFilePolicy::CrossValidation) => { let v1_result = self.list_aux_files_v1(lsn, ctx).await; @@ -887,9 +910,13 @@ impl Timeline { // Then pg_twophase result.add_key(TWOPHASEDIR_KEY); - let buf = self.get(TWOPHASEDIR_KEY, lsn, 
ctx).await?; - let twophase_dir = TwoPhaseDirectory::des(&buf)?; - let mut xids: Vec = twophase_dir.xids.iter().cloned().collect(); + + let mut xids: Vec = self + .list_twophase_files(lsn, ctx) + .await? + .iter() + .cloned() + .collect(); xids.sort_unstable(); for xid in xids { result.add_key(twophase_file_key(xid)); @@ -1006,9 +1033,10 @@ impl Timeline { } /// DatadirModification represents an operation to ingest an atomic set of -/// updates to the repository. It is created by the 'begin_record' -/// function. It is called for each WAL record, so that all the modifications -/// by a one WAL record appear atomic. +/// updates to the repository. +/// +/// It is created by the 'begin_record' function. It is called for each WAL +/// record, so that all the modifications by a one WAL record appear atomic. pub struct DatadirModification<'a> { /// The timeline this modification applies to. You can access this to /// read the state, but note that any pending updates are *not* reflected @@ -1022,21 +1050,51 @@ pub struct DatadirModification<'a> { // The put-functions add the modifications here, and they are flushed to the // underlying key-value store by the 'finish' function. pending_lsns: Vec, - pending_updates: HashMap>, pending_deletions: Vec<(Range, Lsn)>, pending_nblocks: i64, + /// Metadata writes, indexed by key so that they can be read from not-yet-committed modifications + /// while ingesting subsequent records. See [`Self::is_data_key`] for the definition of 'metadata'. + pending_metadata_pages: HashMap>, + + /// Data writes, ready to be flushed into an ephemeral layer. See [`Self::is_data_key`] for + /// which keys are stored here. + pending_data_pages: Vec<(CompactKey, Lsn, usize, Value)>, + + // Sometimes during ingest, for example when extending a relation, we would like to write a zero page. However, + // if we encounter a write from postgres in the same wal record, we will drop this entry. 
+ // + // Unlike other 'pending' fields, this does not last until the next call to commit(): it is flushed + // at the end of each wal record, and all these writes implicitly are at lsn Self::lsn + pending_zero_data_pages: HashSet, + /// For special "directory" keys that store key-value maps, track the size of the map /// if it was updated in this modification. pending_directory_entries: Vec<(DirectoryKind, usize)>, + + /// An **approximation** of how large our EphemeralFile write will be when committed. + pending_bytes: usize, } impl<'a> DatadirModification<'a> { + // When a DatadirModification is committed, we do a monolithic serialization of all its contents. WAL records can + // contain multiple pages, so the pageserver's record-based batch size isn't sufficient to bound this allocation: we + // additionally specify a limit on how much payload a DatadirModification may contain before it should be committed. + pub(crate) const MAX_PENDING_BYTES: usize = 8 * 1024 * 1024; + /// Get the current lsn pub(crate) fn get_lsn(&self) -> Lsn { self.lsn } + pub(crate) fn approx_pending_bytes(&self) -> usize { + self.pending_bytes + } + + pub(crate) fn has_dirty_data_pages(&self) -> bool { + (!self.pending_data_pages.is_empty()) || (!self.pending_zero_data_pages.is_empty()) + } + /// Set the current lsn pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> { ensure!( @@ -1045,6 +1103,10 @@ impl<'a> DatadirModification<'a> { lsn, self.lsn ); + + // If we are advancing LSN, then state from previous wal record should have been flushed. + assert!(self.pending_zero_data_pages.is_empty()); + if lsn > self.lsn { self.pending_lsns.push(self.lsn); self.lsn = lsn; @@ -1052,6 +1114,17 @@ impl<'a> DatadirModification<'a> { Ok(()) } + /// In this context, 'metadata' means keys that are only read by the pageserver internally, and 'data' means + /// keys that represent literal blocks that postgres can read. 
So data includes relation blocks and + /// SLRU blocks, which are read directly by postgres, and everything else is considered metadata. + /// + /// The distinction is important because data keys are handled on a fast path where dirty writes are + /// not readable until this modification is committed, whereas metadata keys are visible for read + /// via [`Self::get`] as soon as their record has been ingested. + fn is_data_key(key: &Key) -> bool { + key.is_rel_block_key() || key.is_slru_block_key() + } + /// Initialize a completely new repository. /// /// This inserts the directory metadata entries that are assumed to @@ -1066,9 +1139,15 @@ impl<'a> DatadirModification<'a> { // Create AuxFilesDirectory self.init_aux_dir()?; - let buf = TwoPhaseDirectory::ser(&TwoPhaseDirectory { - xids: HashSet::new(), - })?; + let buf = if self.tline.pg_version >= 17 { + TwoPhaseDirectoryV17::ser(&TwoPhaseDirectoryV17 { + xids: HashSet::new(), + }) + } else { + TwoPhaseDirectory::ser(&TwoPhaseDirectory { + xids: HashSet::new(), + }) + }?; self.pending_directory_entries .push((DirectoryKind::TwoPhase, 0)); self.put(TWOPHASEDIR_KEY, Value::Image(buf.into())); @@ -1144,6 +1223,13 @@ impl<'a> DatadirModification<'a> { img: Bytes, ) -> anyhow::Result<()> { anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode); + let key = rel_block_to_key(rel, blknum); + if !key.is_valid_key_on_write_path() { + anyhow::bail!( + "the request contains data not supported by pageserver at {}", + key + ); + } self.put(rel_block_to_key(rel, blknum), Value::Image(img)); Ok(()) } @@ -1155,10 +1241,63 @@ impl<'a> DatadirModification<'a> { blknum: BlockNumber, img: Bytes, ) -> anyhow::Result<()> { - self.put(slru_block_to_key(kind, segno, blknum), Value::Image(img)); + let key = slru_block_to_key(kind, segno, blknum); + if !key.is_valid_key_on_write_path() { + anyhow::bail!( + "the request contains data not supported by pageserver at {}", + key + ); + } + self.put(key, Value::Image(img)); Ok(()) } + 
pub(crate) fn put_rel_page_image_zero( + &mut self, + rel: RelTag, + blknum: BlockNumber, + ) -> anyhow::Result<()> { + anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode); + let key = rel_block_to_key(rel, blknum); + if !key.is_valid_key_on_write_path() { + anyhow::bail!( + "the request contains data not supported by pageserver: {} @ {}", + key, + self.lsn + ); + } + self.pending_zero_data_pages.insert(key.to_compact()); + self.pending_bytes += ZERO_PAGE.len(); + Ok(()) + } + + pub(crate) fn put_slru_page_image_zero( + &mut self, + kind: SlruKind, + segno: u32, + blknum: BlockNumber, + ) -> anyhow::Result<()> { + let key = slru_block_to_key(kind, segno, blknum); + if !key.is_valid_key_on_write_path() { + anyhow::bail!( + "the request contains data not supported by pageserver: {} @ {}", + key, + self.lsn + ); + } + self.pending_zero_data_pages.insert(key.to_compact()); + self.pending_bytes += ZERO_PAGE.len(); + Ok(()) + } + + /// Call this at the end of each WAL record. + pub(crate) fn on_record_end(&mut self) { + let pending_zero_data_pages = std::mem::take(&mut self.pending_zero_data_pages); + for key in pending_zero_data_pages { + self.put_data(key, Value::Image(ZERO_PAGE.clone())); + } + } + /// Store a relmapper file (pg_filenode.map) in the repository pub async fn put_relmap_file( &mut self, @@ -1200,22 +1339,31 @@ impl<'a> DatadirModification<'a> { pub async fn put_twophase_file( &mut self, - xid: TransactionId, + xid: u64, img: Bytes, ctx: &RequestContext, ) -> anyhow::Result<()> { // Add it to the directory entry - let buf = self.get(TWOPHASEDIR_KEY, ctx).await?; - let mut dir = TwoPhaseDirectory::des(&buf)?; - if !dir.xids.insert(xid) { - anyhow::bail!("twophase file for xid {} already exists", xid); - } - self.pending_directory_entries - .push((DirectoryKind::TwoPhase, dir.xids.len())); - self.put( - TWOPHASEDIR_KEY, - Value::Image(Bytes::from(TwoPhaseDirectory::ser(&dir)?)), - ); + let dirbuf = self.get(TWOPHASEDIR_KEY, ctx).await?; + let 
newdirbuf = if self.tline.pg_version >= 17 { + let mut dir = TwoPhaseDirectoryV17::des(&dirbuf)?; + if !dir.xids.insert(xid) { + anyhow::bail!("twophase file for xid {} already exists", xid); + } + self.pending_directory_entries + .push((DirectoryKind::TwoPhase, dir.xids.len())); + Bytes::from(TwoPhaseDirectoryV17::ser(&dir)?) + } else { + let xid = xid as u32; + let mut dir = TwoPhaseDirectory::des(&dirbuf)?; + if !dir.xids.insert(xid) { + anyhow::bail!("twophase file for xid {} already exists", xid); + } + self.pending_directory_entries + .push((DirectoryKind::TwoPhase, dir.xids.len())); + Bytes::from(TwoPhaseDirectory::ser(&dir)?) + }; + self.put(TWOPHASEDIR_KEY, Value::Image(newdirbuf)); self.put(twophase_file_key(xid), Value::Image(img)); Ok(()) @@ -1518,22 +1666,32 @@ impl<'a> DatadirModification<'a> { /// This method is used for marking truncated SLRU files pub async fn drop_twophase_file( &mut self, - xid: TransactionId, + xid: u64, ctx: &RequestContext, ) -> anyhow::Result<()> { // Remove it from the directory entry let buf = self.get(TWOPHASEDIR_KEY, ctx).await?; - let mut dir = TwoPhaseDirectory::des(&buf)?; + let newdirbuf = if self.tline.pg_version >= 17 { + let mut dir = TwoPhaseDirectoryV17::des(&buf)?; - if !dir.xids.remove(&xid) { - warn!("twophase file for xid {} does not exist", xid); - } - self.pending_directory_entries - .push((DirectoryKind::TwoPhase, dir.xids.len())); - self.put( - TWOPHASEDIR_KEY, - Value::Image(Bytes::from(TwoPhaseDirectory::ser(&dir)?)), - ); + if !dir.xids.remove(&xid) { + warn!("twophase file for xid {} does not exist", xid); + } + self.pending_directory_entries + .push((DirectoryKind::TwoPhase, dir.xids.len())); + Bytes::from(TwoPhaseDirectoryV17::ser(&dir)?) 
+ } else { + let xid: u32 = u32::try_from(xid)?; + let mut dir = TwoPhaseDirectory::des(&buf)?; + + if !dir.xids.remove(&xid) { + warn!("twophase file for xid {} does not exist", xid); + } + self.pending_directory_entries + .push((DirectoryKind::TwoPhase, dir.xids.len())); + Bytes::from(TwoPhaseDirectory::ser(&dir)?) + }; + self.put(TWOPHASEDIR_KEY, Value::Image(newdirbuf)); // Delete it self.delete(twophase_key_range(xid)); @@ -1576,6 +1734,7 @@ impl<'a> DatadirModification<'a> { if aux_files_key_v1.is_empty() { None } else { + warn!("this timeline is using deprecated aux file policy V1 (detected existing v1 files)"); self.tline.do_switch_aux_policy(AuxFilePolicy::V1)?; Some(AuxFilePolicy::V1) } @@ -1756,7 +1915,7 @@ impl<'a> DatadirModification<'a> { /// retains all the metadata, but data pages are flushed. That's again OK /// for bulk import, where you are just loading data pages and won't try to /// modify the same pages twice. - pub async fn flush(&mut self, ctx: &RequestContext) -> anyhow::Result<()> { + pub(crate) async fn flush(&mut self, ctx: &RequestContext) -> anyhow::Result<()> { // Unless we have accumulated a decent amount of changes, it's not worth it // to scan through the pending_updates list. let pending_nblocks = self.pending_nblocks; @@ -1767,23 +1926,12 @@ impl<'a> DatadirModification<'a> { let mut writer = self.tline.writer().await; // Flush relation and SLRU data blocks, keep metadata. - let mut retained_pending_updates = HashMap::<_, Vec<_>>::new(); - for (key, values) in self.pending_updates.drain() { - for (lsn, value) in values { - if key.is_rel_block_key() || key.is_slru_block_key() { - // This bails out on first error without modifying pending_updates. - // That's Ok, cf this function's doc comment. 
- writer.put(key, lsn, &value, ctx).await?; - } else { - retained_pending_updates - .entry(key) - .or_default() - .push((lsn, value)); - } - } - } + let pending_data_pages = std::mem::take(&mut self.pending_data_pages); - self.pending_updates = retained_pending_updates; + // This bails out on first error without modifying pending_updates. + // That's Ok, cf this function's doc comment. + writer.put_batch(pending_data_pages, ctx).await?; + self.pending_bytes = 0; if pending_nblocks != 0 { writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ)); @@ -1803,23 +1951,31 @@ impl<'a> DatadirModification<'a> { /// All the modifications in this atomic update are stamped by the specified LSN. /// pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> { + // Commit should never be called mid-wal-record + assert!(self.pending_zero_data_pages.is_empty()); + let mut writer = self.tline.writer().await; let pending_nblocks = self.pending_nblocks; self.pending_nblocks = 0; - if !self.pending_updates.is_empty() { - // The put_batch call below expects expects the inputs to be sorted by Lsn, - // so we do that first. - let lsn_ordered_batch: VecMap = VecMap::from_iter( - self.pending_updates - .drain() - .map(|(key, vals)| vals.into_iter().map(move |(lsn, val)| (lsn, (key, val)))) - .kmerge_by(|lhs, rhs| lhs.0 < rhs.0), - VecMapOrdering::GreaterOrEqual, - ); + // Ordering: the items in this batch do not need to be in any global order, but values for + // a particular Key must be in Lsn order relative to one another. InMemoryLayer relies on + // this to do efficient updates to its index. 
+ let mut write_batch = std::mem::take(&mut self.pending_data_pages); - writer.put_batch(lsn_ordered_batch, ctx).await?; + write_batch.extend( + self.pending_metadata_pages + .drain() + .flat_map(|(key, values)| { + values + .into_iter() + .map(move |(lsn, value_size, value)| (key, lsn, value_size, value)) + }), + ); + + if !write_batch.is_empty() { + writer.put_batch(write_batch, ctx).await?; } if !self.pending_deletions.is_empty() { @@ -1844,37 +2000,64 @@ impl<'a> DatadirModification<'a> { writer.update_directory_entries_count(kind, count as u64); } + self.pending_bytes = 0; + Ok(()) } pub(crate) fn len(&self) -> usize { - self.pending_updates.len() + self.pending_deletions.len() + self.pending_metadata_pages.len() + + self.pending_data_pages.len() + + self.pending_deletions.len() } - // Internal helper functions to batch the modifications - + /// Read a page from the Timeline we are writing to. For metadata pages, this passes through + /// a cache in Self, which makes writes earlier in this modification visible to WAL records later + /// in the modification. + /// + /// For data pages, reads pass directly to the owning Timeline: any ingest code which reads a data + /// page must ensure that the pages they read are already committed in Timeline, for example + /// DB create operations are always preceded by a call to commit(). This is special cased because + /// it's rare: all the 'normal' WAL operations will only read metadata pages such as relation sizes, + /// and not data pages. async fn get(&self, key: Key, ctx: &RequestContext) -> Result { - // Have we already updated the same key? Read the latest pending updated - // version in that case. - // - // Note: we don't check pending_deletions. It is an error to request a - // value that has been removed, deletion only avoids leaking storage. 
- if let Some(values) = self.pending_updates.get(&key) { - if let Some((_, value)) = values.last() { - return if let Value::Image(img) = value { - Ok(img.clone()) - } else { - // Currently, we never need to read back a WAL record that we - // inserted in the same "transaction". All the metadata updates - // work directly with Images, and we never need to read actual - // data pages. We could handle this if we had to, by calling - // the walredo manager, but let's keep it simple for now. - Err(PageReconstructError::Other(anyhow::anyhow!( - "unexpected pending WAL record" - ))) - }; + if !Self::is_data_key(&key) { + // Have we already updated the same key? Read the latest pending updated + // version in that case. + // + // Note: we don't check pending_deletions. It is an error to request a + // value that has been removed, deletion only avoids leaking storage. + if let Some(values) = self.pending_metadata_pages.get(&key.to_compact()) { + if let Some((_, _, value)) = values.last() { + return if let Value::Image(img) = value { + Ok(img.clone()) + } else { + // Currently, we never need to read back a WAL record that we + // inserted in the same "transaction". All the metadata updates + // work directly with Images, and we never need to read actual + // data pages. We could handle this if we had to, by calling + // the walredo manager, but let's keep it simple for now. + Err(PageReconstructError::Other(anyhow::anyhow!( + "unexpected pending WAL record" + ))) + }; + } + } + } else { + // This is an expensive check, so we only do it in debug mode. If reading a data key, + // this key should never be present in pending_data_pages. We ensure this by committing + // modifications before ingesting DB create operations, which are the only kind that reads + // data pages during ingest. 
+ if cfg!(debug_assertions) { + for (dirty_key, _, _, _) in &self.pending_data_pages { + debug_assert!(&key.to_compact() != dirty_key); + } + + debug_assert!(!self.pending_zero_data_pages.contains(&key.to_compact())) } } + + // Metadata page cache miss, or we're reading a data page. let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn); self.tline.get(key, lsn, ctx).await } @@ -1886,15 +2069,48 @@ impl<'a> DatadirModification<'a> { } fn put(&mut self, key: Key, val: Value) { - let values = self.pending_updates.entry(key).or_default(); + if Self::is_data_key(&key) { + self.put_data(key.to_compact(), val) + } else { + self.put_metadata(key.to_compact(), val) + } + } + + fn put_data(&mut self, key: CompactKey, val: Value) { + let val_serialized_size = val.serialized_size().unwrap() as usize; + + // If this page was previously zero'd in the same WalRecord, then drop the previous zero page write. This + // is an optimization that avoids persisting both the zero page generated by us (e.g. 
during a relation extend), + // and the subsequent postgres-originating write + if self.pending_zero_data_pages.remove(&key) { + self.pending_bytes -= ZERO_PAGE.len(); + } + + self.pending_bytes += val_serialized_size; + self.pending_data_pages + .push((key, self.lsn, val_serialized_size, val)) + } + + fn put_metadata(&mut self, key: CompactKey, val: Value) { + let values = self.pending_metadata_pages.entry(key).or_default(); // Replace the previous value if it exists at the same lsn - if let Some((last_lsn, last_value)) = values.last_mut() { + if let Some((last_lsn, last_value_ser_size, last_value)) = values.last_mut() { if *last_lsn == self.lsn { + // Update the pending_bytes contribution from this entry, and update the serialized size in place + self.pending_bytes -= *last_value_ser_size; + *last_value_ser_size = val.serialized_size().unwrap() as usize; + self.pending_bytes += *last_value_ser_size; + + // Use the latest value, this replaces any earlier write to the same (key,lsn), such as might + // have been generated by synthesized zero page writes prior to the first real write to a page. *last_value = val; return; } } - values.push((self.lsn, val)); + + let val_serialized_size = val.serialized_size().unwrap() as usize; + self.pending_bytes += val_serialized_size; + values.push((self.lsn, val_serialized_size, val)); } fn delete(&mut self, key_range: Range) { @@ -1905,6 +2121,7 @@ impl<'a> DatadirModification<'a> { /// This struct facilitates accessing either a committed key from the timeline at a /// specific LSN, or the latest uncommitted key from a pending modification. +/// /// During WAL ingestion, the records from multiple LSNs may be batched in the same /// modification before being flushed to the timeline.
Hence, the routines in WalIngest /// need to look up the keys in the modification first before looking them up in the @@ -1944,11 +2161,21 @@ struct DbDirectory { dbdirs: HashMap<(Oid, Oid), bool>, } +// The format of TwoPhaseDirectory changed in PostgreSQL v17, because the filenames of +// pg_twophase files were expanded from 32-bit XIDs to 64-bit XIDs. Previously, the files +// were named like "pg_twophase/000002E5", now they're like +// "pg_twophase/0000000A000002E4". + #[derive(Debug, Serialize, Deserialize)] struct TwoPhaseDirectory { xids: HashSet, } +#[derive(Debug, Serialize, Deserialize)] +struct TwoPhaseDirectoryV17 { + xids: HashSet, +} + #[derive(Debug, Serialize, Deserialize, Default)] struct RelDirectory { // Set of relations that exist. (relfilenode, forknum) @@ -2024,7 +2251,7 @@ mod tests { let (tenant, ctx) = harness.load().await; let tline = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx) + .create_empty_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await?; let tline = tline.raw_timeline().unwrap(); diff --git a/pageserver/src/statvfs.rs b/pageserver/src/statvfs.rs index ede1791afa..5a6f6e5176 100644 --- a/pageserver/src/statvfs.rs +++ b/pageserver/src/statvfs.rs @@ -60,32 +60,7 @@ pub mod mock { use regex::Regex; use tracing::log::info; - #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] - #[serde(tag = "type")] - pub enum Behavior { - Success { - blocksize: u64, - total_blocks: u64, - name_filter: Option, - }, - Failure { - mocked_error: MockedError, - }, - } - - #[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] - #[allow(clippy::upper_case_acronyms)] - pub enum MockedError { - EIO, - } - - impl From for nix::Error { - fn from(e: MockedError) -> Self { - match e { - MockedError::EIO => nix::Error::EIO, - } - } - } + pub use pageserver_api::config::statvfs::mock::Behavior; pub fn get(tenants_dir: &Utf8Path, behavior: &Behavior) -> nix::Result
{ info!("running mocked statvfs"); @@ -116,6 +91,7 @@ pub mod mock { block_size: *blocksize, }) } + #[cfg(feature = "testing")] Behavior::Failure { mocked_error } => Err((*mocked_error).into()), } } diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index ed9e001fd2..6a4e90dd55 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -146,6 +146,12 @@ impl FromStr for TokioRuntimeMode { } } +static TOKIO_THREAD_STACK_SIZE: Lazy = Lazy::new(|| { + env::var("NEON_PAGESERVER_TOKIO_THREAD_STACK_SIZE") + // the default 2MiB is insufficient, especially in debug mode + .unwrap_or_else(|| NonZeroUsize::new(4 * 1024 * 1024).unwrap()) +}); + static ONE_RUNTIME: Lazy> = Lazy::new(|| { let thread_name = "pageserver-tokio"; let Some(mode) = env::var("NEON_PAGESERVER_USE_ONE_RUNTIME") else { @@ -164,6 +170,7 @@ static ONE_RUNTIME: Lazy> = Lazy::new(|| { tokio::runtime::Builder::new_current_thread() .thread_name(thread_name) .enable_all() + .thread_stack_size(TOKIO_THREAD_STACK_SIZE.get()) .build() .expect("failed to create one single runtime") } @@ -173,6 +180,7 @@ static ONE_RUNTIME: Lazy> = Lazy::new(|| { .thread_name(thread_name) .enable_all() .worker_threads(num_workers.get()) + .thread_stack_size(TOKIO_THREAD_STACK_SIZE.get()) .build() .expect("failed to create one multi-threaded runtime") } @@ -199,6 +207,7 @@ macro_rules! pageserver_runtime { .thread_name($name) .worker_threads(TOKIO_WORKER_THREADS.get()) .enable_all() + .thread_stack_size(TOKIO_THREAD_STACK_SIZE.get()) .build() .expect(std::concat!("Failed to create runtime ", $name)) }); diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 65a7504b74..c6f0e48101 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1,8 +1,9 @@ +//! Timeline repository implementation that keeps old data in layer files, and +//! the recent changes in ephemeral files. //! -//! Timeline repository implementation that keeps old data in files on disk, and -//!
the recent changes in memory. See tenant/*_layer.rs files. -//! The functions here are responsible for locating the correct layer for the -//! get/put call, walking back the timeline branching history as needed. +//! See tenant/*_layer.rs files. The functions here are responsible for locating +//! the correct layer for the get/put call, walking back the timeline branching +//! history as needed. //! //! The files are stored in the .neon/tenants//timelines/ //! directory. See docs/pageserver-storage.md for how the files are managed. @@ -501,6 +502,42 @@ impl Debug for DeleteTimelineError { } } +#[derive(thiserror::Error)] +pub enum TimelineArchivalError { + #[error("NotFound")] + NotFound, + + #[error("Timeout")] + Timeout, + + #[error("ancestor is archived: {}", .0)] + HasArchivedParent(TimelineId), + + #[error("HasUnarchivedChildren")] + HasUnarchivedChildren(Vec), + + #[error("Timeline archival is already in progress")] + AlreadyInProgress, + + #[error(transparent)] + Other(#[from] anyhow::Error), +} + +impl Debug for TimelineArchivalError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::NotFound => write!(f, "NotFound"), + Self::Timeout => write!(f, "Timeout"), + Self::HasArchivedParent(p) => f.debug_tuple("HasArchivedParent").field(p).finish(), + Self::HasUnarchivedChildren(c) => { + f.debug_tuple("HasUnarchivedChildren").field(c).finish() + } + Self::AlreadyInProgress => f.debug_tuple("AlreadyInProgress").finish(), + Self::Other(e) => f.debug_tuple("Other").field(e).finish(), + } + } +} + pub enum SetStoppingError { AlreadyStopping(completion::Barrier), Broken, @@ -845,6 +882,12 @@ impl Tenant { }); }; + // TODO: should also be rejecting tenant conf changes that violate this check. 
+ if let Err(e) = crate::tenant::storage_layer::inmemory_layer::IndexEntry::validate_checkpoint_distance(tenant_clone.get_checkpoint_distance()) { + make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error); + return Ok(()); + } + let mut init_order = init_order; // take the completion because initial tenant loading will complete when all of // these tasks complete. @@ -1326,24 +1369,59 @@ impl Tenant { &self, timeline_id: TimelineId, state: TimelineArchivalState, - ) -> anyhow::Result<()> { - let timeline = self - .get_timeline(timeline_id, false) - .context("Cannot apply timeline archival config to inexistent timeline")?; + ) -> Result<(), TimelineArchivalError> { + info!("setting timeline archival config"); + let timeline = { + let timelines = self.timelines.lock().unwrap(); + + let Some(timeline) = timelines.get(&timeline_id) else { + return Err(TimelineArchivalError::NotFound); + }; + + if state == TimelineArchivalState::Unarchived { + if let Some(ancestor_timeline) = timeline.ancestor_timeline() { + if ancestor_timeline.is_archived() == Some(true) { + return Err(TimelineArchivalError::HasArchivedParent( + ancestor_timeline.timeline_id, + )); + } + } + } + + // Ensure that there are no non-archived child timelines + let children: Vec = timelines + .iter() + .filter_map(|(id, entry)| { + if entry.get_ancestor_timeline_id() != Some(timeline_id) { + return None; + } + if entry.is_archived() == Some(true) { + return None; + } + Some(*id) + }) + .collect(); + + if !children.is_empty() && state == TimelineArchivalState::Archived { + return Err(TimelineArchivalError::HasUnarchivedChildren(children)); + } + Arc::clone(timeline) + }; let upload_needed = timeline .remote_client .schedule_index_upload_for_timeline_archival_state(state)?; if upload_needed { + info!("Uploading new state"); const MAX_WAIT: Duration = Duration::from_secs(10); let Ok(v) = tokio::time::timeout(MAX_WAIT, timeline.remote_client.wait_completion()).await else { tracing::warn!("reached 
timeout for waiting on upload queue"); - bail!("reached timeout for upload queue flush"); + return Err(TimelineArchivalError::Timeout); }; - v?; + v.map_err(|e| TimelineArchivalError::Other(anyhow::anyhow!(e)))?; } Ok(()) } @@ -3741,13 +3819,21 @@ impl Tenant { /// less than this (via eviction and on-demand downloads), but this function enables /// the Tenant to advertise how much storage it would prefer to have to provide fast I/O /// by keeping important things on local disk. + /// + /// This is a heuristic, not a guarantee: tenants that are long-idle will actually use less + /// than they report here, due to layer eviction. Tenants with many active branches may + /// actually use more than they report here. pub(crate) fn local_storage_wanted(&self) -> u64 { - let mut wanted = 0; let timelines = self.timelines.lock().unwrap(); - for timeline in timelines.values() { - wanted += timeline.metrics.visible_physical_size_gauge.get(); - } - wanted + + // Heuristic: we use the max() of the timelines' visible sizes, rather than the sum. This + // reflects the observation that on tenants with multiple large branches, typically only one + // of them is used actively enough to occupy space on disk. 
+ timelines + .values() + .map(|t| t.metrics.visible_physical_size_gauge.get()) + .max() + .unwrap_or(0) } } @@ -5932,10 +6018,10 @@ mod tests { .await .unwrap(); - // the default aux file policy to switch is v1 if not set by the admins + // the default aux file policy to switch is v2 if not set by the admins assert_eq!( harness.tenant_conf.switch_aux_file_policy, - AuxFilePolicy::V1 + AuxFilePolicy::default_tenant_config() ); let (tenant, ctx) = harness.load().await; @@ -5979,8 +6065,8 @@ mod tests { ); assert_eq!( tline.last_aux_file_policy.load(), - Some(AuxFilePolicy::V1), - "aux file is written with switch_aux_file_policy unset (which is v1), so we should keep v1" + Some(AuxFilePolicy::V2), + "aux file is written with switch_aux_file_policy unset (which is v2), so we should use v2 there" ); // we can read everything from the storage @@ -6002,8 +6088,8 @@ mod tests { assert_eq!( tline.last_aux_file_policy.load(), - Some(AuxFilePolicy::V1), - "keep v1 storage format when new files are written" + Some(AuxFilePolicy::V2), + "keep v2 storage format when new files are written" ); let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); @@ -6019,7 +6105,7 @@ mod tests { // child copies the last flag even if that is not on remote storage yet assert_eq!(child.get_switch_aux_file_policy(), AuxFilePolicy::V2); - assert_eq!(child.last_aux_file_policy.load(), Some(AuxFilePolicy::V1)); + assert_eq!(child.last_aux_file_policy.load(), Some(AuxFilePolicy::V2)); let files = child.list_aux_files(lsn, &ctx).await.unwrap(); assert_eq!(files.get("pg_logical/mappings/test1"), None); @@ -7005,18 +7091,14 @@ mod tests { vec![ // Image layer at GC horizon PersistentLayerKey { - key_range: { - let mut key = Key::MAX; - key.field6 -= 1; - Key::MIN..key - }, + key_range: Key::MIN..Key::MAX, lsn_range: Lsn(0x30)..Lsn(0x31), is_delta: false }, - // The delta layer that is cut in the middle + // The delta layer below the horizon PersistentLayerKey { key_range: get_key(3)..get_key(4), - 
lsn_range: Lsn(0x30)..Lsn(0x41), + lsn_range: Lsn(0x30)..Lsn(0x48), is_delta: true }, // The delta3 layer that should not be picked for the compaction @@ -7996,6 +8078,214 @@ mod tests { Ok(()) } + #[tokio::test] + async fn test_simple_bottom_most_compaction_with_retain_lsns_single_key() -> anyhow::Result<()> + { + let harness = + TenantHarness::create("test_simple_bottom_most_compaction_with_retain_lsns_single_key") + .await?; + let (tenant, ctx) = harness.load().await; + + fn get_key(id: u32) -> Key { + // using aux key here b/c they are guaranteed to be inside `collect_keyspace`. + let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + + let img_layer = (0..10) + .map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10")))) + .collect_vec(); + + let delta1 = vec![ + ( + get_key(1), + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append("@0x20")), + ), + ( + get_key(1), + Lsn(0x28), + Value::WalRecord(NeonWalRecord::wal_append("@0x28")), + ), + ]; + let delta2 = vec![ + ( + get_key(1), + Lsn(0x30), + Value::WalRecord(NeonWalRecord::wal_append("@0x30")), + ), + ( + get_key(1), + Lsn(0x38), + Value::WalRecord(NeonWalRecord::wal_append("@0x38")), + ), + ]; + let delta3 = vec![ + ( + get_key(8), + Lsn(0x48), + Value::WalRecord(NeonWalRecord::wal_append("@0x48")), + ), + ( + get_key(9), + Lsn(0x48), + Value::WalRecord(NeonWalRecord::wal_append("@0x48")), + ), + ]; + + let tline = tenant + .create_test_timeline_with_layers( + TIMELINE_ID, + Lsn(0x10), + DEFAULT_PG_VERSION, + &ctx, + vec![ + // delta1 and delta 2 only contain a single key but multiple updates + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x30), delta1), + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x30)..Lsn(0x50), delta2), + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x50), delta3), + ], // delta layers + vec![(Lsn(0x10), img_layer)], // image layers + Lsn(0x50), + ) + .await?; + { + // Update 
GC info + let mut guard = tline.gc_info.write().unwrap(); + *guard = GcInfo { + retain_lsns: vec![ + (Lsn(0x10), tline.timeline_id), + (Lsn(0x20), tline.timeline_id), + ], + cutoffs: GcCutoffs { + time: Lsn(0x30), + space: Lsn(0x30), + }, + leases: Default::default(), + within_ancestor_pitr: false, + }; + } + + let expected_result = [ + Bytes::from_static(b"value 0@0x10"), + Bytes::from_static(b"value 1@0x10@0x20@0x28@0x30@0x38"), + Bytes::from_static(b"value 2@0x10"), + Bytes::from_static(b"value 3@0x10"), + Bytes::from_static(b"value 4@0x10"), + Bytes::from_static(b"value 5@0x10"), + Bytes::from_static(b"value 6@0x10"), + Bytes::from_static(b"value 7@0x10"), + Bytes::from_static(b"value 8@0x10@0x48"), + Bytes::from_static(b"value 9@0x10@0x48"), + ]; + + let expected_result_at_gc_horizon = [ + Bytes::from_static(b"value 0@0x10"), + Bytes::from_static(b"value 1@0x10@0x20@0x28@0x30"), + Bytes::from_static(b"value 2@0x10"), + Bytes::from_static(b"value 3@0x10"), + Bytes::from_static(b"value 4@0x10"), + Bytes::from_static(b"value 5@0x10"), + Bytes::from_static(b"value 6@0x10"), + Bytes::from_static(b"value 7@0x10"), + Bytes::from_static(b"value 8@0x10"), + Bytes::from_static(b"value 9@0x10"), + ]; + + let expected_result_at_lsn_20 = [ + Bytes::from_static(b"value 0@0x10"), + Bytes::from_static(b"value 1@0x10@0x20"), + Bytes::from_static(b"value 2@0x10"), + Bytes::from_static(b"value 3@0x10"), + Bytes::from_static(b"value 4@0x10"), + Bytes::from_static(b"value 5@0x10"), + Bytes::from_static(b"value 6@0x10"), + Bytes::from_static(b"value 7@0x10"), + Bytes::from_static(b"value 8@0x10"), + Bytes::from_static(b"value 9@0x10"), + ]; + + let expected_result_at_lsn_10 = [ + Bytes::from_static(b"value 0@0x10"), + Bytes::from_static(b"value 1@0x10"), + Bytes::from_static(b"value 2@0x10"), + Bytes::from_static(b"value 3@0x10"), + Bytes::from_static(b"value 4@0x10"), + Bytes::from_static(b"value 5@0x10"), + Bytes::from_static(b"value 6@0x10"), + Bytes::from_static(b"value 
7@0x10"), + Bytes::from_static(b"value 8@0x10"), + Bytes::from_static(b"value 9@0x10"), + ]; + + let verify_result = || async { + let gc_horizon = { + let gc_info = tline.gc_info.read().unwrap(); + gc_info.cutoffs.time + }; + for idx in 0..10 { + assert_eq!( + tline + .get(get_key(idx as u32), Lsn(0x50), &ctx) + .await + .unwrap(), + &expected_result[idx] + ); + assert_eq!( + tline + .get(get_key(idx as u32), gc_horizon, &ctx) + .await + .unwrap(), + &expected_result_at_gc_horizon[idx] + ); + assert_eq!( + tline + .get(get_key(idx as u32), Lsn(0x20), &ctx) + .await + .unwrap(), + &expected_result_at_lsn_20[idx] + ); + assert_eq!( + tline + .get(get_key(idx as u32), Lsn(0x10), &ctx) + .await + .unwrap(), + &expected_result_at_lsn_10[idx] + ); + } + }; + + verify_result().await; + + let cancel = CancellationToken::new(); + let mut dryrun_flags = EnumSet::new(); + dryrun_flags.insert(CompactFlags::DryRun); + + tline + .compact_with_gc(&cancel, dryrun_flags, &ctx) + .await + .unwrap(); + // We expect layer map to be the same b/c the dry run flag, but we don't know whether there will be other background jobs + // cleaning things up, and therefore, we don't do sanity checks on the layer map during unit tests. + verify_result().await; + + tline + .compact_with_gc(&cancel, EnumSet::new(), &ctx) + .await + .unwrap(); + verify_result().await; + + // compact again + tline + .compact_with_gc(&cancel, EnumSet::new(), &ctx) + .await + .unwrap(); + verify_result().await; + + Ok(()) + } + #[tokio::test] async fn test_simple_bottom_most_compaction_on_branch() -> anyhow::Result<()> { let harness = TenantHarness::create("test_simple_bottom_most_compaction_on_branch").await?; diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index a245c99a88..dd70f6bbff 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -148,7 +148,7 @@ pub(super) const LEN_COMPRESSION_BIT_MASK: u8 = 0xf0; /// The maximum size of blobs we support. 
The highest few bits /// are reserved for compression and other further uses. -const MAX_SUPPORTED_LEN: usize = 0x0fff_ffff; +pub(crate) const MAX_SUPPORTED_BLOB_LEN: usize = 0x0fff_ffff; pub(super) const BYTE_UNCOMPRESSED: u8 = 0x80; pub(super) const BYTE_ZSTD: u8 = BYTE_UNCOMPRESSED | 0x10; @@ -326,7 +326,7 @@ impl BlobWriter { (self.write_all(io_buf.slice_len(), ctx).await, srcbuf) } else { // Write a 4-byte length header - if len > MAX_SUPPORTED_LEN { + if len > MAX_SUPPORTED_BLOB_LEN { return ( ( io_buf.slice_len(), diff --git a/pageserver/src/tenant/block_io.rs b/pageserver/src/tenant/block_io.rs index 601b095155..3afa3a86b9 100644 --- a/pageserver/src/tenant/block_io.rs +++ b/pageserver/src/tenant/block_io.rs @@ -2,7 +2,6 @@ //! Low-level Block-oriented I/O functions //! -use super::ephemeral_file::EphemeralFile; use super::storage_layer::delta_layer::{Adapter, DeltaLayerInner}; use crate::context::RequestContext; use crate::page_cache::{self, FileId, PageReadGuard, PageWriteGuard, ReadBufResult, PAGE_SZ}; @@ -81,9 +80,7 @@ impl<'a> Deref for BlockLease<'a> { /// Unlike traits, we also support the read function to be async though. 
pub(crate) enum BlockReaderRef<'a> { FileBlockReader(&'a FileBlockReader<'a>), - EphemeralFile(&'a EphemeralFile), Adapter(Adapter<&'a DeltaLayerInner>), - Slice(&'a [u8]), #[cfg(test)] TestDisk(&'a super::disk_btree::tests::TestDisk), #[cfg(test)] @@ -100,9 +97,7 @@ impl<'a> BlockReaderRef<'a> { use BlockReaderRef::*; match self { FileBlockReader(r) => r.read_blk(blknum, ctx).await, - EphemeralFile(r) => r.read_blk(blknum, ctx).await, Adapter(r) => r.read_blk(blknum, ctx).await, - Slice(s) => Self::read_blk_slice(s, blknum), #[cfg(test)] TestDisk(r) => r.read_blk(blknum), #[cfg(test)] @@ -111,24 +106,6 @@ impl<'a> BlockReaderRef<'a> { } } -impl<'a> BlockReaderRef<'a> { - fn read_blk_slice(slice: &[u8], blknum: u32) -> std::io::Result { - let start = (blknum as usize).checked_mul(PAGE_SZ).unwrap(); - let end = start.checked_add(PAGE_SZ).unwrap(); - if end > slice.len() { - return Err(std::io::Error::new( - std::io::ErrorKind::UnexpectedEof, - format!("slice too short, len={} end={}", slice.len(), end), - )); - } - let slice = &slice[start..end]; - let page_sized: &[u8; PAGE_SZ] = slice - .try_into() - .expect("we add PAGE_SZ to start, so the slice must have PAGE_SZ"); - Ok(BlockLease::Slice(page_sized)) - } -} - /// /// A "cursor" for efficiently reading multiple pages from a BlockReader /// diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 48ff17db94..547b43a399 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -9,11 +9,10 @@ //! may lead to a data loss. //! 
use anyhow::bail; +pub(crate) use pageserver_api::config::TenantConfigToml as TenantConf; use pageserver_api::models::AuxFilePolicy; -use pageserver_api::models::CompactionAlgorithm; use pageserver_api::models::CompactionAlgorithmSettings; use pageserver_api::models::EvictionPolicy; -use pageserver_api::models::LsnLease; use pageserver_api::models::{self, ThrottleConfig}; use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize}; use serde::de::IntoDeserializer; @@ -23,50 +22,6 @@ use std::num::NonZeroU64; use std::time::Duration; use utils::generation::Generation; -pub mod defaults { - - // FIXME: This current value is very low. I would imagine something like 1 GB or 10 GB - // would be more appropriate. But a low value forces the code to be exercised more, - // which is good for now to trigger bugs. - // This parameter actually determines L0 layer file size. - pub const DEFAULT_CHECKPOINT_DISTANCE: u64 = 256 * 1024 * 1024; - pub const DEFAULT_CHECKPOINT_TIMEOUT: &str = "10 m"; - - // FIXME the below configs are only used by legacy algorithm. The new algorithm - // has different parameters. - - // Target file size, when creating image and delta layers. - // This parameter determines L1 layer file size. - pub const DEFAULT_COMPACTION_TARGET_SIZE: u64 = 128 * 1024 * 1024; - - pub const DEFAULT_COMPACTION_PERIOD: &str = "20 s"; - pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10; - pub const DEFAULT_COMPACTION_ALGORITHM: super::CompactionAlgorithm = - super::CompactionAlgorithm::Legacy; - - pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024; - - // Large DEFAULT_GC_PERIOD is fine as long as PITR_INTERVAL is larger. - // If there's a need to decrease this value, first make sure that GC - // doesn't hold a layer map write lock for non-trivial operations. 
- // Relevant: https://github.com/neondatabase/neon/issues/3394 - pub const DEFAULT_GC_PERIOD: &str = "1 hr"; - pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3; - pub const DEFAULT_PITR_INTERVAL: &str = "7 days"; - pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "10 seconds"; - pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds"; - // The default limit on WAL lag should be set to avoid causing disconnects under high throughput - // scenarios: since the broker stats are updated ~1/s, a value of 1GiB should be sufficient for - // throughputs up to 1GiB/s per timeline. - pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 1024 * 1024 * 1024; - pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour"; - // By default ingest enough WAL for two new L0 layers before checking if new image - // image layers should be created. - pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2; - - pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100; -} - #[derive(Debug, Copy, Clone, Serialize, Deserialize, PartialEq, Eq)] pub(crate) enum AttachmentMode { /// Our generation is current as far as we know, and as far as we know we are the only attached @@ -281,96 +236,20 @@ impl LocationConf { } } -/// A tenant's calcuated configuration, which is the result of merging a -/// tenant's TenantConfOpt with the global TenantConf from PageServerConf. -/// -/// For storing and transmitting individual tenant's configuration, see -/// TenantConfOpt. -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] -pub struct TenantConf { - // Flush out an inmemory layer, if it's holding WAL older than this - // This puts a backstop on how much WAL needs to be re-digested if the - // page server crashes. - // This parameter actually determines L0 layer file size. - pub checkpoint_distance: u64, - // Inmemory layer is also flushed at least once in checkpoint_timeout to - // eventually upload WAL after activity is stopped. 
- #[serde(with = "humantime_serde")] - pub checkpoint_timeout: Duration, - // Target file size, when creating image and delta layers. - // This parameter determines L1 layer file size. - pub compaction_target_size: u64, - // How often to check if there's compaction work to be done. - // Duration::ZERO means automatic compaction is disabled. - #[serde(with = "humantime_serde")] - pub compaction_period: Duration, - // Level0 delta layer threshold for compaction. - pub compaction_threshold: usize, - pub compaction_algorithm: CompactionAlgorithmSettings, - // Determines how much history is retained, to allow - // branching and read replicas at an older point in time. - // The unit is #of bytes of WAL. - // Page versions older than this are garbage collected away. - pub gc_horizon: u64, - // Interval at which garbage collection is triggered. - // Duration::ZERO means automatic GC is disabled - #[serde(with = "humantime_serde")] - pub gc_period: Duration, - // Delta layer churn threshold to create L1 image layers. - pub image_creation_threshold: usize, - // Determines how much history is retained, to allow - // branching and read replicas at an older point in time. - // The unit is time. - // Page versions older than this are garbage collected away. - #[serde(with = "humantime_serde")] - pub pitr_interval: Duration, - /// Maximum amount of time to wait while opening a connection to receive wal, before erroring. - #[serde(with = "humantime_serde")] - pub walreceiver_connect_timeout: Duration, - /// Considers safekeepers stalled after no WAL updates were received longer than this threshold. - /// A stalled safekeeper will be changed to a newer one when it appears. - #[serde(with = "humantime_serde")] - pub lagging_wal_timeout: Duration, - /// Considers safekeepers lagging when their WAL is behind another safekeeper for more than this threshold. 
- /// A lagging safekeeper will be changed after `lagging_wal_timeout` time elapses since the last WAL update, - /// to avoid eager reconnects. - pub max_lsn_wal_lag: NonZeroU64, - pub eviction_policy: EvictionPolicy, - pub min_resident_size_override: Option, - // See the corresponding metric's help string. - #[serde(with = "humantime_serde")] - pub evictions_low_residence_duration_metric_threshold: Duration, - - /// If non-zero, the period between uploads of a heatmap from attached tenants. This - /// may be disabled if a Tenant will not have secondary locations: only secondary - /// locations will use the heatmap uploaded by attached locations. - #[serde(with = "humantime_serde")] - pub heatmap_period: Duration, - - /// If true then SLRU segments are dowloaded on demand, if false SLRU segments are included in basebackup - pub lazy_slru_download: bool, - - pub timeline_get_throttle: pageserver_api::models::ThrottleConfig, - - // How much WAL must be ingested before checking again whether a new image layer is required. - // Expresed in multiples of checkpoint distance. - pub image_layer_creation_check_threshold: u8, - - /// Switch to a new aux file policy. Switching this flag requires the user has not written any aux file into - /// the storage before, and this flag cannot be switched back. Otherwise there will be data corruptions. - /// There is a `last_aux_file_policy` flag which gets persisted in `index_part.json` once the first aux - /// file is written. - pub switch_aux_file_policy: AuxFilePolicy, - - /// The length for an explicit LSN lease request. - /// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval. - #[serde(with = "humantime_serde")] - pub lsn_lease_length: Duration, - - /// The length for an implicit LSN lease granted as part of `get_lsn_by_timestamp` request. - /// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval. 
- #[serde(with = "humantime_serde")] - pub lsn_lease_length_for_ts: Duration, +impl Default for LocationConf { + // TODO: this should be removed once tenant loading can guarantee that we are never + // loading from a directory without a configuration. + // => tech debt since https://github.com/neondatabase/neon/issues/1555 + fn default() -> Self { + Self { + mode: LocationMode::Attached(AttachedLocationConfig { + generation: Generation::none(), + attach_mode: AttachmentMode::Single, + }), + tenant_conf: TenantConfOpt::default(), + shard: ShardIdentity::unsharded(), + } + } } /// Same as TenantConf, but this struct preserves the information about @@ -545,51 +424,6 @@ impl TenantConfOpt { } } -impl Default for TenantConf { - fn default() -> Self { - use defaults::*; - Self { - checkpoint_distance: DEFAULT_CHECKPOINT_DISTANCE, - checkpoint_timeout: humantime::parse_duration(DEFAULT_CHECKPOINT_TIMEOUT) - .expect("cannot parse default checkpoint timeout"), - compaction_target_size: DEFAULT_COMPACTION_TARGET_SIZE, - compaction_period: humantime::parse_duration(DEFAULT_COMPACTION_PERIOD) - .expect("cannot parse default compaction period"), - compaction_threshold: DEFAULT_COMPACTION_THRESHOLD, - compaction_algorithm: CompactionAlgorithmSettings { - kind: DEFAULT_COMPACTION_ALGORITHM, - }, - gc_horizon: DEFAULT_GC_HORIZON, - gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD) - .expect("cannot parse default gc period"), - image_creation_threshold: DEFAULT_IMAGE_CREATION_THRESHOLD, - pitr_interval: humantime::parse_duration(DEFAULT_PITR_INTERVAL) - .expect("cannot parse default PITR interval"), - walreceiver_connect_timeout: humantime::parse_duration( - DEFAULT_WALRECEIVER_CONNECT_TIMEOUT, - ) - .expect("cannot parse default walreceiver connect timeout"), - lagging_wal_timeout: humantime::parse_duration(DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT) - .expect("cannot parse default walreceiver lagging wal timeout"), - max_lsn_wal_lag: 
NonZeroU64::new(DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG) - .expect("cannot parse default max walreceiver Lsn wal lag"), - eviction_policy: EvictionPolicy::NoEviction, - min_resident_size_override: None, - evictions_low_residence_duration_metric_threshold: humantime::parse_duration( - DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD, - ) - .expect("cannot parse default evictions_low_residence_duration_metric_threshold"), - heatmap_period: Duration::ZERO, - lazy_slru_download: false, - timeline_get_throttle: crate::tenant::throttle::Config::disabled(), - image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD, - switch_aux_file_policy: AuxFilePolicy::default_tenant_config(), - lsn_lease_length: LsnLease::DEFAULT_LENGTH, - lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS, - } - } -} - impl TryFrom<&'_ models::TenantConfig> for TenantConfOpt { type Error = anyhow::Error; @@ -618,7 +452,8 @@ impl TryFrom for TenantConfOpt { .map_err(|e| anyhow::anyhow!("{}: {}", e.path(), e.inner().message())); } toml_edit::Item::Table(table) => { - let deserializer = toml_edit::de::Deserializer::new(table.into()); + let deserializer = + toml_edit::de::Deserializer::from(toml_edit::DocumentMut::from(table)); return serde_path_to_error::deserialize(deserializer) .map_err(|e| anyhow::anyhow!("{}: {}", e.path(), e.inner().message())); } diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index 3eb8384d05..5324e1807d 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -1,13 +1,21 @@ //! Implementation of append-only file data structure //! used to keep in-memory layers spilled on disk. 
+use crate::assert_u64_eq_usize::{U64IsUsize, UsizeIsU64}; use crate::config::PageServerConf; use crate::context::RequestContext; use crate::page_cache; -use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReader}; -use crate::virtual_file::{self, VirtualFile}; +use crate::tenant::storage_layer::inmemory_layer::vectored_dio_read::File; +use crate::virtual_file::owned_buffers_io::slice::SliceMutExt; +use crate::virtual_file::owned_buffers_io::util::size_tracking_writer; +use crate::virtual_file::owned_buffers_io::write::Buffer; +use crate::virtual_file::{self, owned_buffers_io, VirtualFile}; +use bytes::BytesMut; use camino::Utf8PathBuf; +use num_traits::Num; use pageserver_api::shard::TenantShardId; +use tokio_epoll_uring::{BoundedBuf, Slice}; +use tracing::error; use std::io; use std::sync::atomic::AtomicU64; @@ -16,12 +24,17 @@ use utils::id::TimelineId; pub struct EphemeralFile { _tenant_shard_id: TenantShardId, _timeline_id: TimelineId, - - rw: page_caching::RW, + page_cache_file_id: page_cache::FileId, + bytes_written: u64, + buffered_writer: owned_buffers_io::write::BufferedWriter< + BytesMut, + size_tracking_writer::Writer, + >, + /// Gate guard is held on as long as we need to do operations in the path (delete on drop) + _gate_guard: utils::sync::gate::GateGuard, } -mod page_caching; -mod zero_padded_read_write; +const TAIL_SZ: usize = 64 * 1024; impl EphemeralFile { pub async fn create( @@ -51,60 +64,178 @@ impl EphemeralFile { ) .await?; + let page_cache_file_id = page_cache::next_file_id(); // XXX get rid, we're not page-caching anymore + Ok(EphemeralFile { _tenant_shard_id: tenant_shard_id, _timeline_id: timeline_id, - rw: page_caching::RW::new(file, gate_guard), + page_cache_file_id, + bytes_written: 0, + buffered_writer: owned_buffers_io::write::BufferedWriter::new( + size_tracking_writer::Writer::new(file), + BytesMut::with_capacity(TAIL_SZ), + ), + _gate_guard: gate_guard, }) } +} +impl Drop for EphemeralFile { + fn drop(&mut self) { + // 
unlink the file + // we are clear to do this, because we have entered a gate + let path = &self.buffered_writer.as_inner().as_inner().path; + let res = std::fs::remove_file(path); + if let Err(e) = res { + if e.kind() != std::io::ErrorKind::NotFound { + // just never log the not found errors, we cannot do anything for them; on detach + // the tenant directory is already gone. + // + // not found files might also be related to https://github.com/neondatabase/neon/issues/2442 + error!("could not remove ephemeral file '{path}': {e}"); + } + } + } +} + +impl EphemeralFile { pub(crate) fn len(&self) -> u64 { - self.rw.bytes_written() + self.bytes_written } pub(crate) fn page_cache_file_id(&self) -> page_cache::FileId { - self.rw.page_cache_file_id() + self.page_cache_file_id } - /// See [`self::page_caching::RW::load_to_vec`]. pub(crate) async fn load_to_vec(&self, ctx: &RequestContext) -> Result, io::Error> { - self.rw.load_to_vec(ctx).await + let size = self.len().into_usize(); + let vec = Vec::with_capacity(size); + let (slice, nread) = self.read_exact_at_eof_ok(0, vec.slice_full(), ctx).await?; + assert_eq!(nread, size); + let vec = slice.into_inner(); + assert_eq!(vec.len(), nread); + assert_eq!(vec.capacity(), size, "we shouldn't be reallocating"); + Ok(vec) } - pub(crate) async fn read_blk( - &self, - blknum: u32, - ctx: &RequestContext, - ) -> Result { - self.rw.read_blk(blknum, ctx).await - } - - pub(crate) async fn write_blob( + /// Returns the offset at which the first byte of the input was written, for use + /// in constructing indices over the written value. + /// + /// Panics if the write is short because there's no way we can recover from that. + /// TODO: make upstack handle this as an error. 
+ pub(crate) async fn write_raw( &mut self, srcbuf: &[u8], ctx: &RequestContext, - ) -> Result { - let pos = self.rw.bytes_written(); + ) -> std::io::Result { + let pos = self.bytes_written; - // Write the length field - if srcbuf.len() < 0x80 { - // short one-byte length header - let len_buf = [srcbuf.len() as u8]; - - self.rw.write_all_borrowed(&len_buf, ctx).await?; - } else { - let mut len_buf = u32::to_be_bytes(srcbuf.len() as u32); - len_buf[0] |= 0x80; - self.rw.write_all_borrowed(&len_buf, ctx).await?; - } + let new_bytes_written = pos.checked_add(srcbuf.len().into_u64()).ok_or_else(|| { + std::io::Error::new( + std::io::ErrorKind::Other, + format!( + "write would grow EphemeralFile beyond u64::MAX: len={pos} writen={srcbuf_len}", + srcbuf_len = srcbuf.len(), + ), + ) + })?; // Write the payload - self.rw.write_all_borrowed(srcbuf, ctx).await?; + let nwritten = self + .buffered_writer + .write_buffered_borrowed(srcbuf, ctx) + .await?; + assert_eq!( + nwritten, + srcbuf.len(), + "buffered writer has no short writes" + ); + + self.bytes_written = new_bytes_written; Ok(pos) } } +impl super::storage_layer::inmemory_layer::vectored_dio_read::File for EphemeralFile { + async fn read_exact_at_eof_ok<'a, 'b, B: tokio_epoll_uring::IoBufMut + Send>( + &'b self, + start: u64, + dst: tokio_epoll_uring::Slice, + ctx: &'a RequestContext, + ) -> std::io::Result<(tokio_epoll_uring::Slice, usize)> { + let file_size_tracking_writer = self.buffered_writer.as_inner(); + let flushed_offset = file_size_tracking_writer.bytes_written(); + + let buffer = self.buffered_writer.inspect_buffer(); + let buffered = &buffer[0..buffer.pending()]; + + let dst_cap = dst.bytes_total().into_u64(); + let end = { + // saturating_add is correct here because the max file size is u64::MAX, so, + // if start + dst.len() > u64::MAX, then we know it will be a short read + let mut end: u64 = start.saturating_add(dst_cap); + if end > self.bytes_written { + end = self.bytes_written; + } + end + }; + + // 
inclusive, exclusive + #[derive(Debug)] + struct Range(N, N); + impl Range { + fn len(&self) -> N { + if self.0 > self.1 { + N::zero() + } else { + self.1 - self.0 + } + } + } + let written_range = Range(start, std::cmp::min(end, flushed_offset)); + let buffered_range = Range(std::cmp::max(start, flushed_offset), end); + + let dst = if written_range.len() > 0 { + let file: &VirtualFile = file_size_tracking_writer.as_inner(); + let bounds = dst.bounds(); + let slice = file + .read_exact_at(dst.slice(0..written_range.len().into_usize()), start, ctx) + .await?; + Slice::from_buf_bounds(Slice::into_inner(slice), bounds) + } else { + dst + }; + + let dst = if buffered_range.len() > 0 { + let offset_in_buffer = buffered_range + .0 + .checked_sub(flushed_offset) + .unwrap() + .into_usize(); + let to_copy = + &buffered[offset_in_buffer..(offset_in_buffer + buffered_range.len().into_usize())]; + let bounds = dst.bounds(); + let mut view = dst.slice({ + let start = written_range.len().into_usize(); + let end = start + .checked_add(buffered_range.len().into_usize()) + .unwrap(); + start..end + }); + view.as_mut_rust_slice_full_zeroed() + .copy_from_slice(to_copy); + Slice::from_buf_bounds(Slice::into_inner(view), bounds) + } else { + dst + }; + + // TODO: in debug mode, randomize the remaining bytes in `dst` to catch bugs + + Ok((dst, (end - start).into_usize())) + } +} + /// Does the given filename look like an ephemeral file? 
pub fn is_ephemeral_file(filename: &str) -> bool { if let Some(rest) = filename.strip_prefix("ephemeral-") { @@ -114,19 +245,13 @@ pub fn is_ephemeral_file(filename: &str) -> bool { } } -impl BlockReader for EphemeralFile { - fn block_cursor(&self) -> super::block_io::BlockCursor<'_> { - BlockCursor::new(super::block_io::BlockReaderRef::EphemeralFile(self)) - } -} - #[cfg(test)] mod tests { + use rand::Rng; + use super::*; use crate::context::DownloadBehavior; use crate::task_mgr::TaskKind; - use crate::tenant::block_io::BlockReaderRef; - use rand::{thread_rng, RngCore}; use std::fs; use std::str::FromStr; @@ -157,69 +282,6 @@ mod tests { Ok((conf, tenant_shard_id, timeline_id, ctx)) } - #[tokio::test] - async fn test_ephemeral_blobs() -> Result<(), io::Error> { - let (conf, tenant_id, timeline_id, ctx) = harness("ephemeral_blobs")?; - - let gate = utils::sync::gate::Gate::default(); - - let entered = gate.enter().unwrap(); - - let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, entered, &ctx).await?; - - let pos_foo = file.write_blob(b"foo", &ctx).await?; - assert_eq!( - b"foo", - file.block_cursor() - .read_blob(pos_foo, &ctx) - .await? - .as_slice() - ); - let pos_bar = file.write_blob(b"bar", &ctx).await?; - assert_eq!( - b"foo", - file.block_cursor() - .read_blob(pos_foo, &ctx) - .await? - .as_slice() - ); - assert_eq!( - b"bar", - file.block_cursor() - .read_blob(pos_bar, &ctx) - .await? 
- .as_slice() - ); - - let mut blobs = Vec::new(); - for i in 0..10000 { - let data = Vec::from(format!("blob{}", i).as_bytes()); - let pos = file.write_blob(&data, &ctx).await?; - blobs.push((pos, data)); - } - // also test with a large blobs - for i in 0..100 { - let data = format!("blob{}", i).as_bytes().repeat(100); - let pos = file.write_blob(&data, &ctx).await?; - blobs.push((pos, data)); - } - - let cursor = BlockCursor::new(BlockReaderRef::EphemeralFile(&file)); - for (pos, expected) in blobs { - let actual = cursor.read_blob(pos, &ctx).await?; - assert_eq!(actual, expected); - } - - // Test a large blob that spans multiple pages - let mut large_data = vec![0; 20000]; - thread_rng().fill_bytes(&mut large_data); - let pos_large = file.write_blob(&large_data, &ctx).await?; - let result = file.block_cursor().read_blob(pos_large, &ctx).await?; - assert_eq!(result, large_data); - - Ok(()) - } - #[tokio::test] async fn ephemeral_file_holds_gate_open() { const FOREVER: std::time::Duration = std::time::Duration::from_secs(5); @@ -253,4 +315,151 @@ mod tests { .expect("closing completes right away") .expect("closing does not panic"); } + + #[tokio::test] + async fn test_ephemeral_file_basics() { + let (conf, tenant_id, timeline_id, ctx) = harness("test_ephemeral_file_basics").unwrap(); + + let gate = utils::sync::gate::Gate::default(); + + let mut file = + EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx) + .await + .unwrap(); + + let cap = file.buffered_writer.inspect_buffer().capacity(); + + let write_nbytes = cap + cap / 2; + + let content: Vec = rand::thread_rng() + .sample_iter(rand::distributions::Standard) + .take(write_nbytes) + .collect(); + + let mut value_offsets = Vec::new(); + for i in 0..write_nbytes { + let off = file.write_raw(&content[i..i + 1], &ctx).await.unwrap(); + value_offsets.push(off); + } + + assert!(file.len() as usize == write_nbytes); + for i in 0..write_nbytes { + assert_eq!(value_offsets[i], 
i.into_u64()); + let buf = Vec::with_capacity(1); + let (buf_slice, nread) = file + .read_exact_at_eof_ok(i.into_u64(), buf.slice_full(), &ctx) + .await + .unwrap(); + let buf = buf_slice.into_inner(); + assert_eq!(nread, 1); + assert_eq!(&buf, &content[i..i + 1]); + } + + let file_contents = + std::fs::read(&file.buffered_writer.as_inner().as_inner().path).unwrap(); + assert_eq!(file_contents, &content[0..cap]); + + let buffer_contents = file.buffered_writer.inspect_buffer(); + assert_eq!(buffer_contents, &content[cap..write_nbytes]); + } + + #[tokio::test] + async fn test_flushes_do_happen() { + let (conf, tenant_id, timeline_id, ctx) = harness("test_flushes_do_happen").unwrap(); + + let gate = utils::sync::gate::Gate::default(); + + let mut file = + EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx) + .await + .unwrap(); + + let cap = file.buffered_writer.inspect_buffer().capacity(); + + let content: Vec = rand::thread_rng() + .sample_iter(rand::distributions::Standard) + .take(cap + cap / 2) + .collect(); + + file.write_raw(&content, &ctx).await.unwrap(); + + // assert the state is as this test expects it to be + assert_eq!( + &file.load_to_vec(&ctx).await.unwrap(), + &content[0..cap + cap / 2] + ); + let md = file + .buffered_writer + .as_inner() + .as_inner() + .path + .metadata() + .unwrap(); + assert_eq!( + md.len(), + cap.into_u64(), + "buffered writer does one write if we write 1.5x buffer capacity" + ); + assert_eq!( + &file.buffered_writer.inspect_buffer()[0..cap / 2], + &content[cap..cap + cap / 2] + ); + } + + #[tokio::test] + async fn test_read_split_across_file_and_buffer() { + // This test exercises the logic on the read path that splits the logical read + // into a read from the flushed part (= the file) and a copy from the buffered writer's buffer. 
+ // + // This test build on the assertions in test_flushes_do_happen + + let (conf, tenant_id, timeline_id, ctx) = + harness("test_read_split_across_file_and_buffer").unwrap(); + + let gate = utils::sync::gate::Gate::default(); + + let mut file = + EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx) + .await + .unwrap(); + + let cap = file.buffered_writer.inspect_buffer().capacity(); + + let content: Vec = rand::thread_rng() + .sample_iter(rand::distributions::Standard) + .take(cap + cap / 2) + .collect(); + + file.write_raw(&content, &ctx).await.unwrap(); + + let test_read = |start: usize, len: usize| { + let file = &file; + let ctx = &ctx; + let content = &content; + async move { + let (buf, nread) = file + .read_exact_at_eof_ok( + start.into_u64(), + Vec::with_capacity(len).slice_full(), + ctx, + ) + .await + .unwrap(); + assert_eq!(nread, len); + assert_eq!(&buf.into_inner(), &content[start..(start + len)]); + } + }; + + // completely within the file range + assert!(20 < cap, "test assumption"); + test_read(10, 10).await; + // border onto edge of file + test_read(cap - 10, 10).await; + // read across file and buffer + test_read(cap - 10, 20).await; + // stay from start of buffer + test_read(cap, 10).await; + // completely within buffer + test_read(cap + 10, 10).await; + } } diff --git a/pageserver/src/tenant/ephemeral_file/page_caching.rs b/pageserver/src/tenant/ephemeral_file/page_caching.rs deleted file mode 100644 index 48926354f1..0000000000 --- a/pageserver/src/tenant/ephemeral_file/page_caching.rs +++ /dev/null @@ -1,153 +0,0 @@ -//! Wrapper around [`super::zero_padded_read_write::RW`] that uses the -//! [`crate::page_cache`] to serve reads that need to go to the underlying [`VirtualFile`]. -//! -//! 
Subject to removal in - -use crate::context::RequestContext; -use crate::page_cache::{self, PAGE_SZ}; -use crate::tenant::block_io::BlockLease; -use crate::virtual_file::owned_buffers_io::util::size_tracking_writer; -use crate::virtual_file::VirtualFile; - -use std::io::{self}; -use tokio_epoll_uring::BoundedBuf; -use tracing::*; - -use super::zero_padded_read_write; - -/// See module-level comment. -pub struct RW { - page_cache_file_id: page_cache::FileId, - rw: super::zero_padded_read_write::RW>, - /// Gate guard is held on as long as we need to do operations in the path (delete on drop). - _gate_guard: utils::sync::gate::GateGuard, -} - -impl RW { - pub fn new(file: VirtualFile, _gate_guard: utils::sync::gate::GateGuard) -> Self { - let page_cache_file_id = page_cache::next_file_id(); - Self { - page_cache_file_id, - rw: super::zero_padded_read_write::RW::new(size_tracking_writer::Writer::new(file)), - _gate_guard, - } - } - - pub fn page_cache_file_id(&self) -> page_cache::FileId { - self.page_cache_file_id - } - - pub(crate) async fn write_all_borrowed( - &mut self, - srcbuf: &[u8], - ctx: &RequestContext, - ) -> Result { - // It doesn't make sense to proactively fill the page cache on the Pageserver write path - // because Compute is unlikely to access recently written data. - self.rw.write_all_borrowed(srcbuf, ctx).await - } - - pub(crate) fn bytes_written(&self) -> u64 { - self.rw.bytes_written() - } - - /// Load all blocks that can be read via [`Self::read_blk`] into a contiguous memory buffer. - /// - /// This includes the blocks that aren't yet flushed to disk by the internal buffered writer. - /// The last block is zero-padded to [`PAGE_SZ`], so, the returned buffer is always a multiple of [`PAGE_SZ`]. 
- pub(super) async fn load_to_vec(&self, ctx: &RequestContext) -> Result, io::Error> { - // round up to the next PAGE_SZ multiple, required by blob_io - let size = { - let s = usize::try_from(self.bytes_written()).unwrap(); - if s % PAGE_SZ == 0 { - s - } else { - s.checked_add(PAGE_SZ - (s % PAGE_SZ)).unwrap() - } - }; - let vec = Vec::with_capacity(size); - - // read from disk what we've already flushed - let file_size_tracking_writer = self.rw.as_writer(); - let flushed_range = 0..usize::try_from(file_size_tracking_writer.bytes_written()).unwrap(); - let mut vec = file_size_tracking_writer - .as_inner() - .read_exact_at( - vec.slice(0..(flushed_range.end - flushed_range.start)), - u64::try_from(flushed_range.start).unwrap(), - ctx, - ) - .await? - .into_inner(); - - // copy from in-memory buffer what we haven't flushed yet but would return when accessed via read_blk - let buffered = self.rw.get_tail_zero_padded(); - vec.extend_from_slice(buffered); - assert_eq!(vec.len(), size); - assert_eq!(vec.len() % PAGE_SZ, 0); - Ok(vec) - } - - pub(crate) async fn read_blk( - &self, - blknum: u32, - ctx: &RequestContext, - ) -> Result { - match self.rw.read_blk(blknum).await? { - zero_padded_read_write::ReadResult::NeedsReadFromWriter { writer } => { - let cache = page_cache::get(); - match cache - .read_immutable_buf(self.page_cache_file_id, blknum, ctx) - .await - .map_err(|e| { - std::io::Error::new( - std::io::ErrorKind::Other, - // order path before error because error is anyhow::Error => might have many contexts - format!( - "ephemeral file: read immutable page #{}: {}: {:#}", - blknum, - self.rw.as_writer().as_inner().path, - e, - ), - ) - })? 
{ - page_cache::ReadBufResult::Found(guard) => { - return Ok(BlockLease::PageReadGuard(guard)) - } - page_cache::ReadBufResult::NotFound(write_guard) => { - let write_guard = writer - .as_inner() - .read_exact_at_page(write_guard, blknum as u64 * PAGE_SZ as u64, ctx) - .await?; - let read_guard = write_guard.mark_valid(); - return Ok(BlockLease::PageReadGuard(read_guard)); - } - } - } - zero_padded_read_write::ReadResult::ServedFromZeroPaddedMutableTail { buffer } => { - Ok(BlockLease::EphemeralFileMutableTail(buffer)) - } - } - } -} - -impl Drop for RW { - fn drop(&mut self) { - // There might still be pages in the [`crate::page_cache`] for this file. - // We leave them there, [`crate::page_cache::PageCache::find_victim`] will evict them when needed. - - // unlink the file - // we are clear to do this, because we have entered a gate - let path = &self.rw.as_writer().as_inner().path; - let res = std::fs::remove_file(path); - if let Err(e) = res { - if e.kind() != std::io::ErrorKind::NotFound { - // just never log the not found errors, we cannot do anything for them; on detach - // the tenant directory is already gone. - // - // not found files might also be related to https://github.com/neondatabase/neon/issues/2442 - error!("could not remove ephemeral file '{path}': {e}"); - } - } - } -} diff --git a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs deleted file mode 100644 index fe310acab8..0000000000 --- a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs +++ /dev/null @@ -1,145 +0,0 @@ -//! The heart of how [`super::EphemeralFile`] does its reads and writes. -//! -//! # Writes -//! -//! [`super::EphemeralFile`] writes small, borrowed buffers using [`RW::write_all_borrowed`]. -//! The [`RW`] batches these into [`TAIL_SZ`] bigger writes, using [`owned_buffers_io::write::BufferedWriter`]. -//! -//! # Reads -//! -//! 
[`super::EphemeralFile`] always reads full [`PAGE_SZ`]ed blocks using [`RW::read_blk`]. -//! -//! The [`RW`] serves these reads either from the buffered writer's in-memory buffer -//! or redirects the caller to read from the underlying [`OwnedAsyncWriter`] -//! if the read is for the prefix that has already been flushed. -//! -//! # Current Usage -//! -//! The current user of this module is [`super::page_caching::RW`]. - -mod zero_padded; - -use crate::{ - context::RequestContext, - page_cache::PAGE_SZ, - virtual_file::owned_buffers_io::{ - self, - write::{Buffer, OwnedAsyncWriter}, - }, -}; - -const TAIL_SZ: usize = 64 * 1024; - -/// See module-level comment. -pub struct RW { - buffered_writer: owned_buffers_io::write::BufferedWriter< - zero_padded::Buffer, - owned_buffers_io::util::size_tracking_writer::Writer, - >, -} - -pub enum ReadResult<'a, W> { - NeedsReadFromWriter { writer: &'a W }, - ServedFromZeroPaddedMutableTail { buffer: &'a [u8; PAGE_SZ] }, -} - -impl RW -where - W: OwnedAsyncWriter, -{ - pub fn new(writer: W) -> Self { - let bytes_flushed_tracker = - owned_buffers_io::util::size_tracking_writer::Writer::new(writer); - let buffered_writer = owned_buffers_io::write::BufferedWriter::new( - bytes_flushed_tracker, - zero_padded::Buffer::default(), - ); - Self { buffered_writer } - } - - pub(crate) fn as_writer(&self) -> &W { - self.buffered_writer.as_inner().as_inner() - } - - pub async fn write_all_borrowed( - &mut self, - buf: &[u8], - ctx: &RequestContext, - ) -> std::io::Result { - self.buffered_writer.write_buffered_borrowed(buf, ctx).await - } - - pub fn bytes_written(&self) -> u64 { - let flushed_offset = self.buffered_writer.as_inner().bytes_written(); - let buffer: &zero_padded::Buffer = self.buffered_writer.inspect_buffer(); - flushed_offset + u64::try_from(buffer.pending()).unwrap() - } - - /// Get a slice of all blocks that [`Self::read_blk`] would return as [`ReadResult::ServedFromZeroPaddedMutableTail`]. 
- pub fn get_tail_zero_padded(&self) -> &[u8] { - let buffer: &zero_padded::Buffer = self.buffered_writer.inspect_buffer(); - let buffer_written_up_to = buffer.pending(); - // pad to next page boundary - let read_up_to = if buffer_written_up_to % PAGE_SZ == 0 { - buffer_written_up_to - } else { - buffer_written_up_to - .checked_add(PAGE_SZ - (buffer_written_up_to % PAGE_SZ)) - .unwrap() - }; - &buffer.as_zero_padded_slice()[0..read_up_to] - } - - pub(crate) async fn read_blk(&self, blknum: u32) -> Result, std::io::Error> { - let flushed_offset = self.buffered_writer.as_inner().bytes_written(); - let buffer: &zero_padded::Buffer = self.buffered_writer.inspect_buffer(); - let buffered_offset = flushed_offset + u64::try_from(buffer.pending()).unwrap(); - let read_offset = (blknum as u64) * (PAGE_SZ as u64); - - // The trailing page ("block") might only be partially filled, - // yet the blob_io code relies on us to return a full PAGE_SZed slice anyway. - // Moreover, it has to be zero-padded, because when we still had - // a write-back page cache, it provided pre-zeroed pages, and blob_io came to rely on it. - // DeltaLayer probably has the same issue, not sure why it needs no special treatment. 
- // => check here that the read doesn't go beyond this potentially trailing - // => the zero-padding is done in the `else` branch below - let blocks_written = if buffered_offset % (PAGE_SZ as u64) == 0 { - buffered_offset / (PAGE_SZ as u64) - } else { - (buffered_offset / (PAGE_SZ as u64)) + 1 - }; - if (blknum as u64) >= blocks_written { - return Err(std::io::Error::new(std::io::ErrorKind::Other, anyhow::anyhow!("read past end of ephemeral_file: read=0x{read_offset:x} buffered=0x{buffered_offset:x} flushed=0x{flushed_offset}"))); - } - - // assertions for the `if-else` below - assert_eq!( - flushed_offset % (TAIL_SZ as u64), 0, - "we only use write_buffered_borrowed to write to the buffered writer, so it's guaranteed that flushes happen buffer.cap()-sized chunks" - ); - assert_eq!( - flushed_offset % (PAGE_SZ as u64), - 0, - "the logic below can't handle if the page is spread across the flushed part and the buffer" - ); - - if read_offset < flushed_offset { - assert!(read_offset + (PAGE_SZ as u64) <= flushed_offset); - Ok(ReadResult::NeedsReadFromWriter { - writer: self.as_writer(), - }) - } else { - let read_offset_in_buffer = read_offset - .checked_sub(flushed_offset) - .expect("would have taken `if` branch instead of this one"); - let read_offset_in_buffer = usize::try_from(read_offset_in_buffer).unwrap(); - let zero_padded_slice = buffer.as_zero_padded_slice(); - let page = &zero_padded_slice[read_offset_in_buffer..(read_offset_in_buffer + PAGE_SZ)]; - Ok(ReadResult::ServedFromZeroPaddedMutableTail { - buffer: page - .try_into() - .expect("the slice above got it as page-size slice"), - }) - } - } -} diff --git a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write/zero_padded.rs b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write/zero_padded.rs deleted file mode 100644 index 2dc0277638..0000000000 --- a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write/zero_padded.rs +++ /dev/null @@ -1,110 +0,0 @@ -//! 
A [`crate::virtual_file::owned_buffers_io::write::Buffer`] whose -//! unwritten range is guaranteed to be zero-initialized. -//! This is used by [`crate::tenant::ephemeral_file::zero_padded_read_write::RW::read_blk`] -//! to serve page-sized reads of the trailing page when the trailing page has only been partially filled. - -use std::mem::MaybeUninit; - -use crate::virtual_file::owned_buffers_io::io_buf_ext::FullSlice; - -/// See module-level comment. -pub struct Buffer { - allocation: Box<[u8; N]>, - written: usize, -} - -impl Default for Buffer { - fn default() -> Self { - Self { - allocation: Box::new( - // SAFETY: zeroed memory is a valid [u8; N] - unsafe { MaybeUninit::zeroed().assume_init() }, - ), - written: 0, - } - } -} - -impl Buffer { - #[inline(always)] - fn invariants(&self) { - // don't check by default, unoptimized is too expensive even for debug mode - if false { - debug_assert!(self.written <= N, "{}", self.written); - debug_assert!(self.allocation[self.written..N].iter().all(|v| *v == 0)); - } - } - - pub fn as_zero_padded_slice(&self) -> &[u8; N] { - &self.allocation - } -} - -impl crate::virtual_file::owned_buffers_io::write::Buffer for Buffer { - type IoBuf = Self; - - fn cap(&self) -> usize { - self.allocation.len() - } - - fn extend_from_slice(&mut self, other: &[u8]) { - self.invariants(); - let remaining = self.allocation.len() - self.written; - if other.len() > remaining { - panic!("calling extend_from_slice() with insufficient remaining capacity"); - } - self.allocation[self.written..(self.written + other.len())].copy_from_slice(other); - self.written += other.len(); - self.invariants(); - } - - fn pending(&self) -> usize { - self.written - } - - fn flush(self) -> FullSlice { - self.invariants(); - let written = self.written; - FullSlice::must_new(tokio_epoll_uring::BoundedBuf::slice(self, 0..written)) - } - - fn reuse_after_flush(iobuf: Self::IoBuf) -> Self { - let Self { - mut allocation, - written, - } = iobuf; - 
allocation[0..written].fill(0); - let new = Self { - allocation, - written: 0, - }; - new.invariants(); - new - } -} - -/// We have this trait impl so that the `flush` method in the `Buffer` impl above can produce a -/// [`tokio_epoll_uring::BoundedBuf::slice`] of the [`Self::written`] range of the data. -/// -/// Remember that bytes_init is generally _not_ a tracker of the amount -/// of valid data in the io buffer; we use `Slice` for that. -/// The `IoBuf` is _only_ for keeping track of uninitialized memory, a bit like MaybeUninit. -/// -/// SAFETY: -/// -/// The [`Self::allocation`] is stable becauses boxes are stable. -/// The memory is zero-initialized, so, bytes_init is always N. -unsafe impl tokio_epoll_uring::IoBuf for Buffer { - fn stable_ptr(&self) -> *const u8 { - self.allocation.as_ptr() - } - - fn bytes_init(&self) -> usize { - // Yes, N, not self.written; Read the full comment of this impl block! - N - } - - fn bytes_total(&self) -> usize { - N - } -} diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 844f117ea2..707233b003 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -464,7 +464,7 @@ impl LayerMap { pub(self) fn insert_historic_noflush(&mut self, layer_desc: PersistentLayerDesc) { // TODO: See #3869, resulting #4088, attempted fix and repro #4094 - if Self::is_l0(&layer_desc.key_range) { + if Self::is_l0(&layer_desc.key_range, layer_desc.is_delta) { self.l0_delta_layers.push(layer_desc.clone().into()); } @@ -483,7 +483,7 @@ impl LayerMap { self.historic .remove(historic_layer_coverage::LayerKey::from(layer_desc)); let layer_key = layer_desc.key(); - if Self::is_l0(&layer_desc.key_range) { + if Self::is_l0(&layer_desc.key_range, layer_desc.is_delta) { let len_before = self.l0_delta_layers.len(); let mut l0_delta_layers = std::mem::take(&mut self.l0_delta_layers); l0_delta_layers.retain(|other| other.key() != layer_key); @@ -600,8 +600,8 @@ impl LayerMap { } /// 
Check if the key range resembles that of an L0 layer. - pub fn is_l0(key_range: &Range) -> bool { - key_range == &(Key::MIN..Key::MAX) + pub fn is_l0(key_range: &Range, is_delta_layer: bool) -> bool { + is_delta_layer && key_range == &(Key::MIN..Key::MAX) } /// This function determines which layers are counted in `count_deltas`: @@ -628,7 +628,7 @@ impl LayerMap { /// than just the current partition_range. pub fn is_reimage_worthy(layer: &PersistentLayerDesc, partition_range: &Range) -> bool { // Case 1 - if !Self::is_l0(&layer.key_range) { + if !Self::is_l0(&layer.key_range, layer.is_delta) { return true; } diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index 190316df42..24440d4b35 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -1,7 +1,8 @@ -//! Describes the legacy now hopefully no longer modified per-timeline metadata stored in -//! `index_part.json` managed by [`remote_timeline_client`]. For many tenants and their timelines, -//! this struct and it's original serialization format is still needed because they were written a -//! long time ago. +//! Describes the legacy now hopefully no longer modified per-timeline metadata. +//! +//! It is stored in `index_part.json` managed by [`remote_timeline_client`]. For many tenants and +//! their timelines, this struct and its original serialization format is still needed because +//! they were written a long time ago. //! //! Instead of changing and adding versioning to this, just change [`IndexPart`] with soft json //! versioning. 
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 4e6ea0c8f9..2104f41531 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -282,9 +282,10 @@ impl BackgroundPurges { static TENANTS: Lazy> = Lazy::new(|| std::sync::RwLock::new(TenantsMap::Initializing)); -/// The TenantManager is responsible for storing and mutating the collection of all tenants -/// that this pageserver process has state for. Every Tenant and SecondaryTenant instance -/// lives inside the TenantManager. +/// Responsible for storing and mutating the collection of all tenants +/// that this pageserver has state for. +/// +/// Every Tenant and SecondaryTenant instance lives inside the TenantManager. /// /// The most important role of the TenantManager is to prevent conflicts: e.g. trying to attach /// the same tenant twice concurrently, or trying to configure the same tenant into secondary @@ -2346,8 +2347,9 @@ pub enum TenantMapError { ShuttingDown, } -/// Guards a particular tenant_id's content in the TenantsMap. While this -/// structure exists, the TenantsMap will contain a [`TenantSlot::InProgress`] +/// Guards a particular tenant_id's content in the TenantsMap. +/// +/// While this structure exists, the TenantsMap will contain a [`TenantSlot::InProgress`] /// for this tenant, which acts as a marker for any operations targeting /// this tenant to retry later, or wait for the InProgress state to end. 
/// diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 71b766e4c7..1f9ae40af5 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -2184,6 +2184,8 @@ pub fn remote_timeline_path( remote_timelines_path(tenant_shard_id).join(Utf8Path::new(&timeline_id.to_string())) } +/// Obtains the path of the given Layer in the remote +/// /// Note that the shard component of a remote layer path is _not_ always the same /// as in the TenantShardId of the caller: tenants may reference layers from a different /// ShardIndex. Use the ShardIndex from the layer's metadata. diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index d9725ad756..9fbe2f0da5 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -548,7 +548,7 @@ pub(crate) async fn download_initdb_tar_zst( cancel, ) .await - .map_err(|e| { + .inspect_err(|_e| { // Do a best-effort attempt at deleting the temporary file upon encountering an error. // We don't have async here nor do we want to pile on any extra errors. if let Err(e) = std::fs::remove_file(&temp_path) { @@ -556,7 +556,6 @@ pub(crate) async fn download_initdb_tar_zst( warn!("error deleting temporary file {temp_path}: {e}"); } } - e })?; Ok((temp_path, file)) diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index 757fb9d032..c51ff54919 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -1,4 +1,5 @@ //! In-memory index to track the tenant files on the remote storage. +//! //! Able to restore itself from the storage index parts, that are located in every timeline's remote directory and contain all data about //! 
remote timeline layers and its metadata. diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 04f89db401..dac6b2f893 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -2,13 +2,12 @@ pub mod delta_layer; pub mod image_layer; -pub(crate) mod inmemory_layer; +pub mod inmemory_layer; pub(crate) mod layer; mod layer_desc; mod layer_name; pub mod merge_iterator; -#[cfg(test)] pub mod split_writer; use crate::context::{AccessStatsBehavior, RequestContext}; @@ -435,10 +434,11 @@ impl ReadableLayer { } } -/// Layers contain a hint indicating whether they are likely to be used for reads. This is a hint rather -/// than an authoritative value, so that we do not have to update it synchronously when changing the visibility -/// of layers (for example when creating a branch that makes some previously covered layers visible). It should -/// be used for cache management but not for correctness-critical checks. +/// Layers contain a hint indicating whether they are likely to be used for reads. +/// +/// This is a hint rather than an authoritative value, so that we do not have to update it synchronously +/// when changing the visibility of layers (for example when creating a branch that makes some previously +/// covered layers visible). It should be used for cache management but not for correctness-critical checks. 
#[derive(Debug, Clone, PartialEq, Eq)] pub enum LayerVisibilityHint { /// A Visible layer might be read while serving a read, because there is not an image layer between it diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 6c2391d72d..34f1b15138 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -36,10 +36,11 @@ use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, Fi use crate::tenant::disk_btree::{ DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection, }; +use crate::tenant::storage_layer::layer::S3_UPLOAD_LIMIT; use crate::tenant::timeline::GetVectoredError; use crate::tenant::vectored_blob_io::{ - BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, - VectoredReadPlanner, + BlobFlag, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, + VectoredReadCoalesceMode, VectoredReadPlanner, }; use crate::tenant::PageReconstructError; use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt}; @@ -51,6 +52,7 @@ use bytes::BytesMut; use camino::{Utf8Path, Utf8PathBuf}; use futures::StreamExt; use itertools::Itertools; +use pageserver_api::config::MaxVectoredReadBytes; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::ImageCompressionAlgorithm; use pageserver_api::shard::TenantShardId; @@ -64,7 +66,7 @@ use std::os::unix::fs::FileExt; use std::str::FromStr; use std::sync::Arc; use tokio::sync::OnceCell; -use tokio_epoll_uring::IoBufMut; +use tokio_epoll_uring::IoBuf; use tracing::*; use utils::{ @@ -134,10 +136,11 @@ impl Summary { // Flag indicating that this version initialize the page const WILL_INIT: u64 = 1; -/// Struct representing reference to BLOB in layers. Reference contains BLOB -/// offset, and for WAL records it also contains `will_init` flag. 
The flag -/// helps to determine the range of records that needs to be applied, without -/// reading/deserializing records themselves. +/// Struct representing reference to BLOB in layers. +/// +/// Reference contains BLOB offset, and for WAL records it also contains +/// `will_init` flag. The flag helps to determine the range of records +/// that needs to be applied, without reading/deserializing records themselves. #[derive(Debug, Serialize, Deserialize, Copy, Clone)] pub struct BlobRef(pub u64); @@ -224,14 +227,24 @@ pub struct DeltaLayerInner { file: VirtualFile, file_id: FileId, - #[allow(dead_code)] layer_key_range: Range, - #[allow(dead_code)] layer_lsn_range: Range, max_vectored_read_bytes: Option, } +impl DeltaLayerInner { + pub(crate) fn layer_dbg_info(&self) -> String { + format!( + "delta {}..{} {}..{}", + self.key_range().start, + self.key_range().end, + self.lsn_range().start, + self.lsn_range().end + ) + } +} + impl std::fmt::Debug for DeltaLayerInner { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("DeltaLayerInner") @@ -458,7 +471,7 @@ impl DeltaLayerWriterInner { ctx: &RequestContext, ) -> (FullSlice, anyhow::Result<()>) where - Buf: IoBufMut + Send, + Buf: IoBuf + Send, { assert!( self.lsn_range.start <= lsn, @@ -556,7 +569,6 @@ impl DeltaLayerWriterInner { // 5GB limit for objects without multipart upload (which we don't want to use) // Make it a little bit below to account for differing GB units // https://docs.aws.amazon.com/AmazonS3/latest/userguide/upload-objects.html - const S3_UPLOAD_LIMIT: u64 = 4_500_000_000; ensure!( metadata.len() <= S3_UPLOAD_LIMIT, "Created delta layer file at {} of size {} above limit {S3_UPLOAD_LIMIT}!", @@ -666,7 +678,7 @@ impl DeltaLayerWriter { ctx: &RequestContext, ) -> (FullSlice, anyhow::Result<()>) where - Buf: IoBufMut + Send, + Buf: IoBuf + Send, { self.inner .as_mut() @@ -690,12 +702,10 @@ impl DeltaLayerWriter { self.inner.take().unwrap().finish(key_end, ctx).await } 
- #[cfg(test)] pub(crate) fn num_keys(&self) -> usize { self.inner.as_ref().unwrap().num_keys } - #[cfg(test)] pub(crate) fn estimated_size(&self) -> u64 { let inner = self.inner.as_ref().unwrap(); inner.blob_writer.size() + inner.tree.borrow_writer().size() + PAGE_SZ as u64 @@ -872,44 +882,6 @@ impl DeltaLayerInner { Ok(()) } - /// Load all key-values in the delta layer, should be replaced by an iterator-based interface in the future. - pub(super) async fn load_key_values( - &self, - ctx: &RequestContext, - ) -> anyhow::Result> { - let block_reader = FileBlockReader::new(&self.file, self.file_id); - let index_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( - self.index_start_blk, - self.index_root_blk, - block_reader, - ); - let mut result = Vec::new(); - let mut stream = - Box::pin(self.stream_index_forwards(index_reader, &[0; DELTA_KEY_SIZE], ctx)); - let block_reader = FileBlockReader::new(&self.file, self.file_id); - let cursor = block_reader.block_cursor(); - let mut buf = Vec::new(); - while let Some(item) = stream.next().await { - let (key, lsn, pos) = item?; - // TODO: dedup code with get_reconstruct_value - // TODO: ctx handling and sharding - cursor - .read_blob_into_buf(pos.pos(), &mut buf, ctx) - .await - .with_context(|| { - format!("Failed to read blob from virtual file {}", self.file.path) - })?; - let val = Value::des(&buf).with_context(|| { - format!( - "Failed to deserialize file blob from virtual file {}", - self.file.path - ) - })?; - result.push((key, lsn, val)); - } - Ok(result) - } - async fn plan_reads( keyspace: &KeySpace, lsn_range: Range, @@ -1195,6 +1167,7 @@ impl DeltaLayerInner { let mut prev: Option<(Key, Lsn, BlobRef)> = None; let mut read_builder: Option = None; + let read_mode = VectoredReadCoalesceMode::get(); let max_read_size = self .max_vectored_read_bytes @@ -1243,6 +1216,7 @@ impl DeltaLayerInner { offsets.end.pos(), meta, max_read_size, + read_mode, )) } } else { @@ -1527,6 +1501,10 @@ pub struct DeltaLayerIterator<'a> 
{ } impl<'a> DeltaLayerIterator<'a> { + pub(crate) fn layer_dbg_info(&self) -> String { + self.delta_layer.layer_dbg_info() + } + /// Retrieve a batch of key-value pairs into the iterator buffer. async fn next_batch(&mut self) -> anyhow::Result<()> { assert!(self.key_values_batch.is_empty()); @@ -2281,7 +2259,7 @@ pub(crate) mod test { // every key should be a batch b/c the value is larger than max_read_size assert_eq!(iter.key_values_batch.len(), 1); } else { - assert_eq!(iter.key_values_batch.len(), batch_size); + assert!(iter.key_values_batch.len() <= batch_size); } if num_items >= N { break; diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 9a19e4e2c7..5de2582ab7 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -1,7 +1,9 @@ //! An ImageLayer represents an image or a snapshot of a key-range at -//! one particular LSN. It contains an image of all key-value pairs -//! in its key-range. Any key that falls into the image layer's range -//! but does not exist in the layer, does not exist. +//! one particular LSN. +//! +//! It contains an image of all key-value pairs in its key-range. Any key +//! that falls into the image layer's range but does not exist in the layer, +//! does not exist. //! //! An image layer is stored in a file on disk. The file is stored in //! timelines/ directory. 
Currently, there are no @@ -28,16 +30,15 @@ use crate::context::{PageContentKind, RequestContext, RequestContextBuilder}; use crate::page_cache::{self, FileId, PAGE_SZ}; use crate::repository::{Key, Value, KEY_SIZE}; use crate::tenant::blob_io::BlobWriter; -use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader}; +use crate::tenant::block_io::{BlockBuf, FileBlockReader}; use crate::tenant::disk_btree::{ DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection, }; use crate::tenant::timeline::GetVectoredError; use crate::tenant::vectored_blob_io::{ - BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, - VectoredReadPlanner, + BlobFlag, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, VectoredReadPlanner, }; -use crate::tenant::{PageReconstructError, Timeline}; +use crate::tenant::PageReconstructError; use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt; use crate::virtual_file::{self, VirtualFile}; use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX}; @@ -46,6 +47,7 @@ use bytes::{Bytes, BytesMut}; use camino::{Utf8Path, Utf8PathBuf}; use hex; use itertools::Itertools; +use pageserver_api::config::MaxVectoredReadBytes; use pageserver_api::keyspace::KeySpace; use pageserver_api::shard::{ShardIdentity, TenantShardId}; use rand::{distributions::Alphanumeric, Rng}; @@ -56,7 +58,6 @@ use std::io::SeekFrom; use std::ops::Range; use std::os::unix::prelude::FileExt; use std::str::FromStr; -use std::sync::Arc; use tokio::sync::OnceCell; use tokio_stream::StreamExt; use tracing::*; @@ -68,9 +69,7 @@ use utils::{ }; use super::layer_name::ImageLayerName; -use super::{ - AsLayerDesc, Layer, LayerName, PersistentLayerDesc, ResidentLayer, ValuesReconstructState, -}; +use super::{AsLayerDesc, LayerName, PersistentLayerDesc, ValuesReconstructState}; /// /// Header stored in the beginning of the file @@ -167,6 +166,17 @@ pub struct ImageLayerInner { 
max_vectored_read_bytes: Option, } +impl ImageLayerInner { + pub(crate) fn layer_dbg_info(&self) -> String { + format!( + "image {}..{} {}", + self.key_range().start, + self.key_range().end, + self.lsn() + ) + } +} + impl std::fmt::Debug for ImageLayerInner { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("ImageLayerInner") @@ -442,33 +452,6 @@ impl ImageLayerInner { Ok(()) } - /// Load all key-values in the delta layer, should be replaced by an iterator-based interface in the future. - pub(super) async fn load_key_values( - &self, - ctx: &RequestContext, - ) -> anyhow::Result> { - let block_reader = FileBlockReader::new(&self.file, self.file_id); - let tree_reader = - DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, &block_reader); - let mut result = Vec::new(); - let mut stream = Box::pin(tree_reader.into_stream(&[0; KEY_SIZE], ctx)); - let block_reader = FileBlockReader::new(&self.file, self.file_id); - let cursor = block_reader.block_cursor(); - while let Some(item) = stream.next().await { - // TODO: dedup code with get_reconstruct_value - let (raw_key, offset) = item?; - let key = Key::from_slice(&raw_key[..KEY_SIZE]); - // TODO: ctx handling and sharding - let blob = cursor - .read_blob(offset, ctx) - .await - .with_context(|| format!("failed to read value from offset {}", offset))?; - let value = Bytes::from(blob); - result.push((key, self.lsn, Value::Image(value))); - } - Ok(result) - } - /// Traverse the layer's index to build read operations on the overlap of the input keyspace /// and the keys in this layer. /// @@ -700,15 +683,11 @@ struct ImageLayerWriterInner { blob_writer: BlobWriter, tree: DiskBtreeBuilder, - #[cfg_attr(not(feature = "testing"), allow(dead_code))] + #[cfg(feature = "testing")] last_written_key: Key, } impl ImageLayerWriterInner { - fn size(&self) -> u64 { - self.tree.borrow_writer().size() + self.blob_writer.size() - } - /// /// Start building a new image layer. 
/// @@ -763,6 +742,7 @@ impl ImageLayerWriterInner { uncompressed_bytes_eligible: 0, uncompressed_bytes_chosen: 0, num_keys: 0, + #[cfg(feature = "testing")] last_written_key: Key::MIN, }; @@ -817,10 +797,9 @@ impl ImageLayerWriterInner { /// async fn finish( self, - timeline: &Arc, ctx: &RequestContext, end_key: Option, - ) -> anyhow::Result { + ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> { let index_start_blk = ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32; @@ -843,13 +822,19 @@ impl ImageLayerWriterInner { res?; } + let final_key_range = if let Some(end_key) = end_key { + self.key_range.start..end_key + } else { + self.key_range.clone() + }; + // Fill in the summary on blk 0 let summary = Summary { magic: IMAGE_FILE_MAGIC, format_version: STORAGE_FORMAT_VERSION, tenant_id: self.tenant_shard_id.tenant_id, timeline_id: self.timeline_id, - key_range: self.key_range.clone(), + key_range: final_key_range.clone(), lsn: self.lsn, index_start_blk, index_root_blk, @@ -870,11 +855,7 @@ impl ImageLayerWriterInner { let desc = PersistentLayerDesc::new_img( self.tenant_shard_id, self.timeline_id, - if let Some(end_key) = end_key { - self.key_range.start..end_key - } else { - self.key_range.clone() - }, + final_key_range, self.lsn, metadata.len(), ); @@ -894,12 +875,9 @@ impl ImageLayerWriterInner { // fsync the file file.sync_all().await?; - // FIXME: why not carry the virtualfile here, it supports renaming? - let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?; + trace!("created image layer {}", self.path); - info!("created image layer {}", layer.local_path()); - - Ok(layer) + Ok((desc, self.path)) } } @@ -963,14 +941,12 @@ impl ImageLayerWriter { self.inner.as_mut().unwrap().put_image(key, img, ctx).await } - #[cfg(test)] /// Estimated size of the image layer. 
pub(crate) fn estimated_size(&self) -> u64 { let inner = self.inner.as_ref().unwrap(); inner.blob_writer.size() + inner.tree.borrow_writer().size() + PAGE_SZ as u64 } - #[cfg(test)] pub(crate) fn num_keys(&self) -> usize { self.inner.as_ref().unwrap().num_keys } @@ -980,29 +956,18 @@ impl ImageLayerWriter { /// pub(crate) async fn finish( mut self, - timeline: &Arc, ctx: &RequestContext, - ) -> anyhow::Result { - self.inner.take().unwrap().finish(timeline, ctx, None).await + ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> { + self.inner.take().unwrap().finish(ctx, None).await } - #[cfg(test)] /// Finish writing the image layer with an end key, used in [`super::split_writer::SplitImageLayerWriter`]. The end key determines the end of the image layer's covered range and is exclusive. pub(super) async fn finish_with_end_key( mut self, - timeline: &Arc, end_key: Key, ctx: &RequestContext, - ) -> anyhow::Result { - self.inner - .take() - .unwrap() - .finish(timeline, ctx, Some(end_key)) - .await - } - - pub(crate) fn size(&self) -> u64 { - self.inner.as_ref().unwrap().size() + ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> { + self.inner.take().unwrap().finish(ctx, Some(end_key)).await } } @@ -1024,6 +989,10 @@ pub struct ImageLayerIterator<'a> { } impl<'a> ImageLayerIterator<'a> { + pub(crate) fn layer_dbg_info(&self) -> String { + self.image_layer.layer_dbg_info() + } + /// Retrieve a batch of key-value pairs into the iterator buffer. 
async fn next_batch(&mut self) -> anyhow::Result<()> { assert!(self.key_values_batch.is_empty()); @@ -1102,7 +1071,7 @@ mod test { tenant::{ config::TenantConf, harness::{TenantHarness, TIMELINE_ID}, - storage_layer::ResidentLayer, + storage_layer::{Layer, ResidentLayer}, vectored_blob_io::StreamingVectoredReadPlanner, Tenant, Timeline, }, @@ -1173,7 +1142,8 @@ mod test { key = key.next(); } - writer.finish(&timeline, &ctx).await.unwrap() + let (desc, path) = writer.finish(&ctx).await.unwrap(); + Layer::finish_creating(tenant.conf, &timeline, desc, &path).unwrap() }; let original_size = resident.metadata().file_size; @@ -1235,7 +1205,9 @@ mod test { .await .unwrap(); let replacement = if wrote_keys > 0 { - Some(filtered_writer.finish(&timeline, &ctx).await.unwrap()) + let (desc, path) = filtered_writer.finish(&ctx).await.unwrap(); + let resident = Layer::finish_creating(tenant.conf, &timeline, desc, &path).unwrap(); + Some(resident) } else { None }; @@ -1308,7 +1280,8 @@ mod test { for (key, img) in images { writer.put_image(key, img, ctx).await?; } - let img_layer = writer.finish(tline, ctx).await?; + let (desc, path) = writer.finish(ctx).await?; + let img_layer = Layer::finish_creating(tenant.conf, tline, desc, &path)?; Ok::<_, anyhow::Error>(img_layer) } @@ -1375,7 +1348,7 @@ mod test { // every key should be a batch b/c the value is larger than max_read_size assert_eq!(iter.key_values_batch.len(), 1); } else { - assert_eq!(iter.key_values_batch.len(), batch_size); + assert!(iter.key_values_batch.len() <= batch_size); } if num_items >= N { break; diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 130d1002a0..e487bee1f2 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -4,23 +4,23 @@ //! held in an ephemeral file, not in memory. The metadata for each page version, i.e. //! 
its position in the file, is kept in memory, though. //! +use crate::assert_u64_eq_usize::{u64_to_usize, U64IsUsize, UsizeIsU64}; use crate::config::PageServerConf; use crate::context::{PageContentKind, RequestContext, RequestContextBuilder}; -use crate::page_cache::PAGE_SZ; use crate::repository::{Key, Value}; -use crate::tenant::block_io::{BlockCursor, BlockReader, BlockReaderRef}; use crate::tenant::ephemeral_file::EphemeralFile; use crate::tenant::timeline::GetVectoredError; use crate::tenant::PageReconstructError; use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt; use crate::{l0_flush, page_cache}; -use anyhow::{anyhow, Result}; +use anyhow::{anyhow, Context, Result}; +use bytes::Bytes; use camino::Utf8PathBuf; use pageserver_api::key::CompactKey; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::InMemoryLayerInfo; use pageserver_api::shard::TenantShardId; -use std::collections::BTreeMap; +use std::collections::{BTreeMap, HashMap}; use std::sync::{Arc, OnceLock}; use std::time::Instant; use tracing::*; @@ -33,12 +33,14 @@ use std::fmt::Write; use std::ops::Range; use std::sync::atomic::Ordering as AtomicOrdering; use std::sync::atomic::{AtomicU64, AtomicUsize}; -use tokio::sync::{RwLock, RwLockWriteGuard}; +use tokio::sync::RwLock; use super::{ DeltaLayerWriter, PersistentLayerDesc, ValueReconstructSituation, ValuesReconstructState, }; +pub(crate) mod vectored_dio_read; + #[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)] pub(crate) struct InMemoryLayerFileId(page_cache::FileId); @@ -78,9 +80,9 @@ impl std::fmt::Debug for InMemoryLayer { pub struct InMemoryLayerInner { /// All versions of all pages in the layer are kept here. Indexed - /// by block number and LSN. The value is an offset into the + /// by block number and LSN. The [`IndexEntry`] is an offset into the /// ephemeral file where the page version is stored. - index: BTreeMap>, + index: BTreeMap>, /// The values are stored in a serialized format in this file. 
/// Each serialized Value is preceded by a 'u32' length field. @@ -90,6 +92,154 @@ pub struct InMemoryLayerInner { resource_units: GlobalResourceUnits, } +/// Support the same max blob length as blob_io, because ultimately +/// all the InMemoryLayer contents end up being written into a delta layer, +/// using the [`crate::tenant::blob_io`]. +const MAX_SUPPORTED_BLOB_LEN: usize = crate::tenant::blob_io::MAX_SUPPORTED_BLOB_LEN; +const MAX_SUPPORTED_BLOB_LEN_BITS: usize = { + let trailing_ones = MAX_SUPPORTED_BLOB_LEN.trailing_ones() as usize; + let leading_zeroes = MAX_SUPPORTED_BLOB_LEN.leading_zeros() as usize; + assert!(trailing_ones + leading_zeroes == std::mem::size_of::() * 8); + trailing_ones +}; + +/// See [`InMemoryLayerInner::index`]. +/// +/// For memory efficiency, the data is packed into a u64. +/// +/// Layout: +/// - 1 bit: `will_init` +/// - [`MAX_SUPPORTED_BLOB_LEN_BITS`]: `len` +/// - [`MAX_SUPPORTED_POS_BITS`]: `pos` +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct IndexEntry(u64); + +impl IndexEntry { + /// See [`Self::MAX_SUPPORTED_POS`]. + const MAX_SUPPORTED_POS_BITS: usize = { + let remainder = 64 - 1 - MAX_SUPPORTED_BLOB_LEN_BITS; + if remainder < 32 { + panic!("pos can be u32 as per type system, support that"); + } + remainder + }; + /// The maximum supported blob offset that can be represented by [`Self`]. + /// See also [`Self::validate_checkpoint_distance`]. + const MAX_SUPPORTED_POS: usize = (1 << Self::MAX_SUPPORTED_POS_BITS) - 1; + + // Layout + const WILL_INIT_RANGE: Range = 0..1; + const LEN_RANGE: Range = + Self::WILL_INIT_RANGE.end..Self::WILL_INIT_RANGE.end + MAX_SUPPORTED_BLOB_LEN_BITS; + const POS_RANGE: Range = + Self::LEN_RANGE.end..Self::LEN_RANGE.end + Self::MAX_SUPPORTED_POS_BITS; + const _ASSERT: () = { + if Self::POS_RANGE.end != 64 { + panic!("we don't want undefined bits for our own sanity") + } + }; + + /// Fails if and only if the offset or length encoded in `arg` is too large to be represented by [`Self`]. 
+ /// + /// The only reason why that can happen in the system is if the [`InMemoryLayer`] grows too long. + /// The [`InMemoryLayer`] size is determined by the checkpoint distance, enforced by [`crate::tenant::Timeline::should_roll`]. + /// + /// Thus, to avoid failure of this function, whenever we start up and/or change checkpoint distance, + /// call [`Self::validate_checkpoint_distance`] with the new checkpoint distance value. + /// + /// TODO: this check should happen ideally at config parsing time (and in the request handler when a change to checkpoint distance is requested) + /// When cleaning this up, also look into the s3 max file size check that is performed in delta layer writer. + #[inline(always)] + fn new(arg: IndexEntryNewArgs) -> anyhow::Result { + let IndexEntryNewArgs { + base_offset, + batch_offset, + len, + will_init, + } = arg; + + let pos = base_offset + .checked_add(batch_offset) + .ok_or_else(|| anyhow::anyhow!("base_offset + batch_offset overflows u64: base_offset={base_offset} batch_offset={batch_offset}"))?; + + if pos.into_usize() > Self::MAX_SUPPORTED_POS { + anyhow::bail!( + "base_offset+batch_offset exceeds the maximum supported value: base_offset={base_offset} batch_offset={batch_offset} (+)={pos} max={max}", + max = Self::MAX_SUPPORTED_POS + ); + } + + if len > MAX_SUPPORTED_BLOB_LEN { + anyhow::bail!( + "len exceeds the maximum supported length: len={len} max={MAX_SUPPORTED_BLOB_LEN}", + ); + } + + let mut data: u64 = 0; + use bit_field::BitField; + data.set_bits(Self::WILL_INIT_RANGE, if will_init { 1 } else { 0 }); + data.set_bits(Self::LEN_RANGE, len.into_u64()); + data.set_bits(Self::POS_RANGE, pos); + + Ok(Self(data)) + } + + #[inline(always)] + fn unpack(&self) -> IndexEntryUnpacked { + use bit_field::BitField; + IndexEntryUnpacked { + will_init: self.0.get_bits(Self::WILL_INIT_RANGE) != 0, + len: self.0.get_bits(Self::LEN_RANGE), + pos: self.0.get_bits(Self::POS_RANGE), + } + } + + /// See [`Self::new`]. 
+ pub(crate) const fn validate_checkpoint_distance( + checkpoint_distance: u64, + ) -> Result<(), &'static str> { + if checkpoint_distance > Self::MAX_SUPPORTED_POS as u64 { + return Err("exceeds the maximum supported value"); + } + let res = u64_to_usize(checkpoint_distance).checked_add(MAX_SUPPORTED_BLOB_LEN); + if res.is_none() { + return Err( + "checkpoint distance + max supported blob len overflows in-memory addition", + ); + } + + // NB: it is ok for the result of the addition to be larger than MAX_SUPPORTED_POS + + Ok(()) + } + + const _ASSERT_DEFAULT_CHECKPOINT_DISTANCE_IS_VALID: () = { + let res = Self::validate_checkpoint_distance( + pageserver_api::config::tenant_conf_defaults::DEFAULT_CHECKPOINT_DISTANCE, + ); + if res.is_err() { + panic!("default checkpoint distance is valid") + } + }; +} + +/// Args to [`IndexEntry::new`]. +#[derive(Clone, Copy)] +struct IndexEntryNewArgs { + base_offset: u64, + batch_offset: u64, + len: usize, + will_init: bool, +} + +/// Unpacked representation of the bitfielded [`IndexEntry`]. 
+#[derive(Clone, Copy, PartialEq, Eq, Debug)] +struct IndexEntryUnpacked { + will_init: bool, + len: u64, + pos: u64, +} + impl std::fmt::Debug for InMemoryLayerInner { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("InMemoryLayerInner").finish() @@ -276,7 +426,12 @@ impl InMemoryLayer { .build(); let inner = self.inner.read().await; - let reader = inner.file.block_cursor(); + + struct ValueRead { + entry_lsn: Lsn, + read: vectored_dio_read::LogicalRead>, + } + let mut reads: HashMap> = HashMap::new(); for range in keyspace.ranges.iter() { for (key, vec_map) in inner @@ -291,24 +446,62 @@ impl InMemoryLayer { let slice = vec_map.slice_range(lsn_range); - for (entry_lsn, pos) in slice.iter().rev() { - // TODO: this uses the page cache => https://github.com/neondatabase/neon/issues/8183 - let buf = reader.read_blob(*pos, &ctx).await; - if let Err(e) = buf { - reconstruct_state.on_key_error(key, PageReconstructError::from(anyhow!(e))); + for (entry_lsn, index_entry) in slice.iter().rev() { + let IndexEntryUnpacked { + pos, + len, + will_init, + } = index_entry.unpack(); + reads.entry(key).or_default().push(ValueRead { + entry_lsn: *entry_lsn, + read: vectored_dio_read::LogicalRead::new( + pos, + Vec::with_capacity(len as usize), + ), + }); + if will_init { break; } + } + } + } - let value = Value::des(&buf.unwrap()); - if let Err(e) = value { + // Execute the reads. 
+ + let f = vectored_dio_read::execute( + &inner.file, + reads + .iter() + .flat_map(|(_, value_reads)| value_reads.iter().map(|v| &v.read)), + &ctx, + ); + send_future::SendFuture::send(f) // https://github.com/rust-lang/rust/issues/96865 + .await; + + // Process results into the reconstruct state + 'next_key: for (key, value_reads) in reads { + for ValueRead { entry_lsn, read } in value_reads { + match read.into_result().expect("we run execute() above") { + Err(e) => { reconstruct_state.on_key_error(key, PageReconstructError::from(anyhow!(e))); - break; + continue 'next_key; } + Ok(value_buf) => { + let value = Value::des(&value_buf); + if let Err(e) = value { + reconstruct_state + .on_key_error(key, PageReconstructError::from(anyhow!(e))); + continue 'next_key; + } - let key_situation = - reconstruct_state.update_key(&key, *entry_lsn, value.unwrap()); - if key_situation == ValueReconstructSituation::Complete { - break; + let key_situation = + reconstruct_state.update_key(&key, entry_lsn, value.unwrap()); + if key_situation == ValueReconstructSituation::Complete { + // TODO: metric to see if we fetched more values than necessary + continue 'next_key; + } + + // process the next value in the next iteration of the loop } } } @@ -320,6 +513,68 @@ impl InMemoryLayer { } } +/// Offset of a particular Value within a serialized batch. +struct SerializedBatchOffset { + key: CompactKey, + lsn: Lsn, + // TODO: separate type when we start serde-serializing this value, to avoid coupling + // in-memory representation to serialization format. + index_entry: IndexEntry, +} + +pub struct SerializedBatch { + /// Blobs serialized in EphemeralFile's native format, ready for passing to [`EphemeralFile::write_raw`]. + pub(crate) raw: Vec, + + /// Index of values in [`Self::raw`], using offsets relative to the start of the buffer. 
+ offsets: Vec, + + /// The highest LSN of any value in the batch + pub(crate) max_lsn: Lsn, +} + +impl SerializedBatch { + pub fn from_values(batch: Vec<(CompactKey, Lsn, usize, Value)>) -> anyhow::Result { + // Pre-allocate a big flat buffer to write into. This should be large but not huge: it is soft-limited in practice by + // [`crate::pgdatadir_mapping::DatadirModification::MAX_PENDING_BYTES`] + let buffer_size = batch.iter().map(|i| i.2).sum::(); + let mut cursor = std::io::Cursor::new(Vec::::with_capacity(buffer_size)); + + let mut offsets: Vec = Vec::with_capacity(batch.len()); + let mut max_lsn: Lsn = Lsn(0); + for (key, lsn, val_ser_size, val) in batch { + let relative_off = cursor.position(); + + val.ser_into(&mut cursor) + .expect("Writing into in-memory buffer is infallible"); + + offsets.push(SerializedBatchOffset { + key, + lsn, + index_entry: IndexEntry::new(IndexEntryNewArgs { + base_offset: 0, + batch_offset: relative_off, + len: val_ser_size, + will_init: val.will_init(), + }) + .context("higher-level code ensures that values are within supported ranges")?, + }); + max_lsn = std::cmp::max(max_lsn, lsn); + } + + let buffer = cursor.into_inner(); + + // Assert that we didn't do any extra allocations while building buffer. + debug_assert!(buffer.len() <= buffer_size); + + Ok(Self { + raw: buffer, + offsets, + max_lsn, + }) + } +} + fn inmem_layer_display(mut f: impl Write, start_lsn: Lsn, end_lsn: Lsn) -> std::fmt::Result { write!(f, "inmem-{:016X}-{:016X}", start_lsn.0, end_lsn.0) } @@ -380,53 +635,74 @@ impl InMemoryLayer { }) } - // Write operations - - /// Common subroutine of the public put_wal_record() and put_page_image() functions. - /// Adds the page version to the in-memory tree - pub async fn put_value( + /// Write path. + /// + /// Errors are not retryable, the [`InMemoryLayer`] must be discarded, and not be read from. + /// The reason why it's not retryable is that the [`EphemeralFile`] writes are not retryable. 
+ /// TODO: it can be made retryable if we aborted the process on EphemeralFile write errors. + pub async fn put_batch( &self, - key: CompactKey, - lsn: Lsn, - buf: &[u8], + serialized_batch: SerializedBatch, ctx: &RequestContext, - ) -> Result<()> { + ) -> anyhow::Result<()> { let mut inner = self.inner.write().await; self.assert_writable(); - self.put_value_locked(&mut inner, key, lsn, buf, ctx).await - } - async fn put_value_locked( - &self, - locked_inner: &mut RwLockWriteGuard<'_, InMemoryLayerInner>, - key: CompactKey, - lsn: Lsn, - buf: &[u8], - ctx: &RequestContext, - ) -> Result<()> { - trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn); + let base_offset = inner.file.len(); - let off = { - locked_inner - .file - .write_blob( - buf, - &RequestContextBuilder::extend(ctx) - .page_content_kind(PageContentKind::InMemoryLayer) - .build(), - ) - .await? - }; + let SerializedBatch { + raw, + mut offsets, + max_lsn: _, + } = serialized_batch; - let vec_map = locked_inner.index.entry(key).or_default(); - let old = vec_map.append_or_update_last(lsn, off).unwrap().0; - if old.is_some() { - // We already had an entry for this LSN. That's odd.. - warn!("Key {} at {} already exists", key, lsn); + // Add the base_offset to the batch's index entries which are relative to the batch start. + for offset in &mut offsets { + let IndexEntryUnpacked { + will_init, + len, + pos, + } = offset.index_entry.unpack(); + offset.index_entry = IndexEntry::new(IndexEntryNewArgs { + base_offset, + batch_offset: pos, + len: len.into_usize(), + will_init, + })?; } - let size = locked_inner.file.len(); - locked_inner.resource_units.maybe_publish_size(size); + // Write the batch to the file + inner.file.write_raw(&raw, ctx).await?; + let new_size = inner.file.len(); + let expected_new_len = base_offset + .checked_add(raw.len().into_u64()) + // write_raw would error if we were to overflow u64. 
+ // also IndexEntry and higher levels in + // the code don't allow the file to grow that large + .unwrap(); + assert_eq!(new_size, expected_new_len); + + // Update the index with the new entries + for SerializedBatchOffset { + key, + lsn, + index_entry, + } in offsets + { + let vec_map = inner.index.entry(key).or_default(); + let old = vec_map.append_or_update_last(lsn, index_entry).unwrap().0; + if old.is_some() { + // This should not break anything, but is unexpected: ingestion code aims to filter out + // multiple writes to the same key at the same LSN. This happens in cases where our + // ingestion code generates some write like an empty page, and we see a write from postgres + // to the same key in the same wal record. If one such write makes it through, we + // index the most recent write, implicitly ignoring the earlier write. We log a warning + // because this case is unexpected, and we would like tests to fail if this happens. + warn!("Key {} at {} written twice at same LSN", key, lsn); + } + } + + inner.resource_units.maybe_publish_size(new_size); Ok(()) } @@ -470,7 +746,7 @@ impl InMemoryLayer { { let inner = self.inner.write().await; for vec_map in inner.index.values() { - for (lsn, _pos) in vec_map.as_slice() { + for (lsn, _) in vec_map.as_slice() { assert!(*lsn < end_lsn); } } @@ -534,36 +810,23 @@ impl InMemoryLayer { match l0_flush_global_state { l0_flush::Inner::Direct { .. 
} => { let file_contents: Vec = inner.file.load_to_vec(ctx).await?; - assert_eq!( - file_contents.len() % PAGE_SZ, - 0, - "needed by BlockReaderRef::Slice" - ); - assert_eq!(file_contents.len(), { - let written = usize::try_from(inner.file.len()).unwrap(); - if written % PAGE_SZ == 0 { - written - } else { - written.checked_add(PAGE_SZ - (written % PAGE_SZ)).unwrap() - } - }); - let cursor = BlockCursor::new(BlockReaderRef::Slice(&file_contents)); - - let mut buf = Vec::new(); + let file_contents = Bytes::from(file_contents); for (key, vec_map) in inner.index.iter() { // Write all page versions - for (lsn, pos) in vec_map.as_slice() { - // TODO: once we have blob lengths in the in-memory index, we can - // 1. get rid of the blob_io / BlockReaderRef::Slice business and - // 2. load the file contents into a Bytes and - // 3. the use `Bytes::slice` to get the `buf` that is our blob - // 4. pass that `buf` into `put_value_bytes` - // => https://github.com/neondatabase/neon/issues/8183 - cursor.read_blob_into_buf(*pos, &mut buf, ctx).await?; - let will_init = Value::des(&buf)?.will_init(); - let (tmp, res) = delta_layer_writer + for (lsn, entry) in vec_map + .as_slice() + .iter() + .map(|(lsn, entry)| (lsn, entry.unpack())) + { + let IndexEntryUnpacked { + pos, + len, + will_init, + } = entry; + let buf = Bytes::slice(&file_contents, pos as usize..(pos + len) as usize); + let (_buf, res) = delta_layer_writer .put_value_bytes( Key::from_compact(*key), *lsn, @@ -573,7 +836,6 @@ impl InMemoryLayer { ) .await; res?; - buf = tmp.into_raw_slice().into_inner(); } } } @@ -595,3 +857,134 @@ impl InMemoryLayer { Ok(Some((desc, path))) } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_index_entry() { + const MAX_SUPPORTED_POS: usize = IndexEntry::MAX_SUPPORTED_POS; + use IndexEntryNewArgs as Args; + use IndexEntryUnpacked as Unpacked; + + let roundtrip = |args, expect: Unpacked| { + let res = IndexEntry::new(args).expect("this tests expects no errors"); + let 
IndexEntryUnpacked { + will_init, + len, + pos, + } = res.unpack(); + assert_eq!(will_init, expect.will_init); + assert_eq!(len, expect.len); + assert_eq!(pos, expect.pos); + }; + + // basic roundtrip + for pos in [0, MAX_SUPPORTED_POS] { + for len in [0, MAX_SUPPORTED_BLOB_LEN] { + for will_init in [true, false] { + let expect = Unpacked { + will_init, + len: len.into_u64(), + pos: pos.into_u64(), + }; + roundtrip( + Args { + will_init, + base_offset: pos.into_u64(), + batch_offset: 0, + len, + }, + expect, + ); + roundtrip( + Args { + will_init, + base_offset: 0, + batch_offset: pos.into_u64(), + len, + }, + expect, + ); + } + } + } + + // too-large len + let too_large = Args { + will_init: false, + len: MAX_SUPPORTED_BLOB_LEN + 1, + base_offset: 0, + batch_offset: 0, + }; + assert!(IndexEntry::new(too_large).is_err()); + + // too-large pos + { + let too_large = Args { + will_init: false, + len: 0, + base_offset: MAX_SUPPORTED_POS.into_u64() + 1, + batch_offset: 0, + }; + assert!(IndexEntry::new(too_large).is_err()); + let too_large = Args { + will_init: false, + len: 0, + base_offset: 0, + batch_offset: MAX_SUPPORTED_POS.into_u64() + 1, + }; + assert!(IndexEntry::new(too_large).is_err()); + } + + // too large (base_offset + batch_offset) + { + let too_large = Args { + will_init: false, + len: 0, + base_offset: MAX_SUPPORTED_POS.into_u64(), + batch_offset: 1, + }; + assert!(IndexEntry::new(too_large).is_err()); + let too_large = Args { + will_init: false, + len: 0, + base_offset: MAX_SUPPORTED_POS.into_u64() - 1, + batch_offset: MAX_SUPPORTED_POS.into_u64() - 1, + }; + assert!(IndexEntry::new(too_large).is_err()); + } + + // valid special cases + // - area past the max supported pos that is accessible by len + for len in [1, MAX_SUPPORTED_BLOB_LEN] { + roundtrip( + Args { + will_init: false, + len, + base_offset: MAX_SUPPORTED_POS.into_u64(), + batch_offset: 0, + }, + Unpacked { + will_init: false, + len: len as u64, + pos: MAX_SUPPORTED_POS.into_u64(), + }, + ); 
+ roundtrip( + Args { + will_init: false, + len, + base_offset: 0, + batch_offset: MAX_SUPPORTED_POS.into_u64(), + }, + Unpacked { + will_init: false, + len: len as u64, + pos: MAX_SUPPORTED_POS.into_u64(), + }, + ); + } + } +} diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs b/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs new file mode 100644 index 0000000000..0683e15659 --- /dev/null +++ b/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs @@ -0,0 +1,937 @@ +use std::{ + collections::BTreeMap, + sync::{Arc, RwLock}, +}; + +use itertools::Itertools; +use tokio_epoll_uring::{BoundedBuf, IoBufMut, Slice}; + +use crate::{ + assert_u64_eq_usize::{U64IsUsize, UsizeIsU64}, + context::RequestContext, +}; + +/// The file interface we require. At runtime, this is a [`crate::tenant::ephemeral_file::EphemeralFile`]. +pub trait File: Send { + /// Attempt to read the bytes in `self` in range `[start,start+dst.bytes_total())` + /// and return the number of bytes read (let's call it `nread`). + /// The bytes read are placed in `dst`, i.e., `&dst[..nread]` will contain the read bytes. + /// + /// The only reason why the read may be short (i.e., `nread != dst.bytes_total()`) + /// is if the file is shorter than `start+dst.len()`. + /// + /// This is unlike [`std::os::unix::fs::FileExt::read_exact_at`] which returns an + /// [`std::io::ErrorKind::UnexpectedEof`] error if the file is shorter than `start+dst.len()`. + /// + /// No guarantees are made about the remaining bytes in `dst` in case of a short read. + async fn read_exact_at_eof_ok<'a, 'b, B: IoBufMut + Send>( + &'b self, + start: u64, + dst: Slice, + ctx: &'a RequestContext, + ) -> std::io::Result<(Slice, usize)>; +} + +/// A logical read from [`File`]. See [`Self::new`]. 
+pub struct LogicalRead { + pos: u64, + state: RwLockRefCell>, +} + +enum LogicalReadState { + NotStarted(B), + Ongoing(B), + Ok(B), + Error(Arc), + Undefined, +} + +impl LogicalRead { + /// Create a new [`LogicalRead`] from [`File`] of the data in the file in range `[ pos, pos + buf.cap() )`. + pub fn new(pos: u64, buf: B) -> Self { + Self { + pos, + state: RwLockRefCell::new(LogicalReadState::NotStarted(buf)), + } + } + pub fn into_result(self) -> Option>> { + match self.state.into_inner() { + LogicalReadState::Ok(buf) => Some(Ok(buf)), + LogicalReadState::Error(e) => Some(Err(e)), + LogicalReadState::NotStarted(_) | LogicalReadState::Ongoing(_) => None, + LogicalReadState::Undefined => unreachable!(), + } + } +} + +/// The buffer into which a [`LogicalRead`] result is placed. +pub trait Buffer: std::ops::Deref { + /// Immutable. + fn cap(&self) -> usize; + /// Changes only through [`Self::extend_from_slice`]. + fn len(&self) -> usize; + /// Panics if the total length would exceed the initialized capacity. + fn extend_from_slice(&mut self, src: &[u8]); +} + +/// The minimum alignment and size requirement for disk offsets and memory buffer size for direct IO. +const DIO_CHUNK_SIZE: usize = 512; + +/// If multiple chunks need to be read, merge adjacent chunk reads into batches of max size `MAX_CHUNK_BATCH_SIZE`. +/// (The unit is the number of chunks.) +const MAX_CHUNK_BATCH_SIZE: usize = { + let desired = 128 * 1024; // 128k + if desired % DIO_CHUNK_SIZE != 0 { + panic!("MAX_CHUNK_BATCH_SIZE must be a multiple of DIO_CHUNK_SIZE") + // compile-time error + } + desired / DIO_CHUNK_SIZE +}; + +/// Execute the given logical `reads` against `file`. +/// The results are placed in the buffers of the [`LogicalRead`]s. +/// Retrieve the results by calling [`LogicalRead::into_result`] on each [`LogicalRead`]. +/// +/// The [`LogicalRead`]s must be freshly created using [`LogicalRead::new`] when calling this function. +/// Otherwise, this function panics. 
+pub async fn execute<'a, I, F, B>(file: &F, reads: I, ctx: &RequestContext) +where + I: IntoIterator>, + F: File, + B: Buffer + IoBufMut + Send, +{ + // Terminology: + // logical read = a request to read an arbitrary range of bytes from `file`; byte-level granularity + // chunk = we conceptually divide up the byte range of `file` into DIO_CHUNK_SIZEs ranges + // interest = a range within a chunk that a logical read is interested in; one logical read gets turned into many interests + // physical read = the read request we're going to issue to the OS; covers a range of chunks; chunk-level granularity + + // Preserve a copy of the logical reads for debug assertions at the end + #[cfg(debug_assertions)] + let (reads, assert_logical_reads) = { + let (reads, assert) = reads.into_iter().tee(); + (reads, Some(Vec::from_iter(assert))) + }; + #[cfg(not(debug_assertions))] + let (reads, assert_logical_reads): (_, Option>>) = (reads, None); + + // Plan which parts of which chunks need to be appended to which buffer + let mut by_chunk: BTreeMap>> = BTreeMap::new(); + struct Interest<'a, B: Buffer> { + logical_read: &'a LogicalRead, + offset_in_chunk: u64, + len: u64, + } + for logical_read in reads { + let LogicalRead { pos, state } = logical_read; + let mut state = state.borrow_mut(); + + // transition from NotStarted to Ongoing + let cur = std::mem::replace(&mut *state, LogicalReadState::Undefined); + let req_len = match cur { + LogicalReadState::NotStarted(buf) => { + if buf.len() != 0 { + panic!("The `LogicalRead`s that are passed in must be freshly created using `LogicalRead::new`"); + } + // buf.cap() == 0 is ok + + // transition into Ongoing state + let req_len = buf.cap(); + *state = LogicalReadState::Ongoing(buf); + req_len + } + x => panic!("must only call with fresh LogicalReads, got another state, leaving Undefined state behind state={x:?}"), + }; + + // plan which chunks we need to read from + let mut remaining = req_len; + let mut chunk_no = *pos / 
(DIO_CHUNK_SIZE.into_u64()); + let mut offset_in_chunk = pos.into_usize() % DIO_CHUNK_SIZE; + while remaining > 0 { + let remaining_in_chunk = std::cmp::min(remaining, DIO_CHUNK_SIZE - offset_in_chunk); + by_chunk.entry(chunk_no).or_default().push(Interest { + logical_read, + offset_in_chunk: offset_in_chunk.into_u64(), + len: remaining_in_chunk.into_u64(), + }); + offset_in_chunk = 0; + chunk_no += 1; + remaining -= remaining_in_chunk; + } + } + + // At this point, we could iterate over by_chunk, in chunk order, + // read each chunk from disk, and fill the buffers. + // However, we can merge adjacent chunks into batches of MAX_CHUNK_BATCH_SIZE + // so we issue fewer IOs = fewer roundtrips = lower overall latency. + struct PhysicalRead<'a, B: Buffer> { + start_chunk_no: u64, + nchunks: usize, + dsts: Vec>, + } + struct PhysicalInterest<'a, B: Buffer> { + logical_read: &'a LogicalRead, + offset_in_physical_read: u64, + len: u64, + } + let mut physical_reads: Vec> = Vec::new(); + let mut by_chunk = by_chunk.into_iter().peekable(); + loop { + let mut last_chunk_no = None; + let to_merge: Vec<(u64, Vec>)> = by_chunk + .peeking_take_while(|(chunk_no, _)| { + if let Some(last_chunk_no) = last_chunk_no { + if *chunk_no != last_chunk_no + 1 { + return false; + } + } + last_chunk_no = Some(*chunk_no); + true + }) + .take(MAX_CHUNK_BATCH_SIZE) + .collect(); // TODO: avoid this .collect() + let Some(start_chunk_no) = to_merge.first().map(|(chunk_no, _)| *chunk_no) else { + break; + }; + let nchunks = to_merge.len(); + let dsts = to_merge + .into_iter() + .enumerate() + .flat_map(|(i, (_, dsts))| { + dsts.into_iter().map( + move |Interest { + logical_read, + offset_in_chunk, + len, + }| { + PhysicalInterest { + logical_read, + offset_in_physical_read: i + .checked_mul(DIO_CHUNK_SIZE) + .unwrap() + .into_u64() + + offset_in_chunk, + len, + } + }, + ) + }) + .collect(); + physical_reads.push(PhysicalRead { + start_chunk_no, + nchunks, + dsts, + }); + } + drop(by_chunk); + + // 
Execute physical reads and fill the logical read buffers + // TODO: pipelined reads; prefetch; + let get_io_buffer = |nchunks| Vec::with_capacity(nchunks * DIO_CHUNK_SIZE); + for PhysicalRead { + start_chunk_no, + nchunks, + dsts, + } in physical_reads + { + let all_done = dsts + .iter() + .all(|PhysicalInterest { logical_read, .. }| logical_read.state.borrow().is_terminal()); + if all_done { + continue; + } + let read_offset = start_chunk_no + .checked_mul(DIO_CHUNK_SIZE.into_u64()) + .expect("we produce chunk_nos by dividing by DIO_CHUNK_SIZE earlier"); + let io_buf = get_io_buffer(nchunks).slice_full(); + let req_len = io_buf.len(); + let (io_buf_slice, nread) = match file.read_exact_at_eof_ok(read_offset, io_buf, ctx).await + { + Ok(t) => t, + Err(e) => { + let e = Arc::new(e); + for PhysicalInterest { logical_read, .. } in dsts { + *logical_read.state.borrow_mut() = LogicalReadState::Error(Arc::clone(&e)); + // this will make later reads for the given LogicalRead short-circuit, see top of loop body + } + continue; + } + }; + let io_buf = io_buf_slice.into_inner(); + assert!( + nread <= io_buf.len(), + "the last chunk in the file can be a short read, so, no ==" + ); + let io_buf = &io_buf[..nread]; + for PhysicalInterest { + logical_read, + offset_in_physical_read, + len, + } in dsts + { + let mut logical_read_state_borrow = logical_read.state.borrow_mut(); + let logical_read_buf = match &mut *logical_read_state_borrow { + LogicalReadState::NotStarted(_) => { + unreachable!("we transition it into Ongoing at function entry") + } + LogicalReadState::Ongoing(buf) => buf, + LogicalReadState::Ok(_) | LogicalReadState::Error(_) => { + continue; + } + LogicalReadState::Undefined => unreachable!(), + }; + let range_in_io_buf = std::ops::Range { + start: offset_in_physical_read as usize, + end: offset_in_physical_read as usize + len as usize, + }; + assert!(range_in_io_buf.end >= range_in_io_buf.start); + if range_in_io_buf.end > nread { + let msg = format!( + "physical 
read returned EOF where this logical read expected more data in the file: offset=0x{read_offset:x} req_len=0x{req_len:x} nread=0x{nread:x} {:?}", + &*logical_read_state_borrow + ); + logical_read_state_borrow.transition_to_terminal(Err(std::io::Error::new( + std::io::ErrorKind::UnexpectedEof, + msg, + ))); + continue; + } + let data = &io_buf[range_in_io_buf]; + + // Copy data from io buffer into the logical read buffer. + // (And in debug mode, validate that the buffer impl adheres to the Buffer trait spec.) + let pre = if cfg!(debug_assertions) { + Some((logical_read_buf.len(), logical_read_buf.cap())) + } else { + None + }; + logical_read_buf.extend_from_slice(data); + let post = if cfg!(debug_assertions) { + Some((logical_read_buf.len(), logical_read_buf.cap())) + } else { + None + }; + match (pre, post) { + (None, None) => {} + (Some(_), None) | (None, Some(_)) => unreachable!(), + (Some((pre_len, pre_cap)), Some((post_len, post_cap))) => { + assert_eq!(pre_len + len as usize, post_len); + assert_eq!(pre_cap, post_cap); + } + } + + if logical_read_buf.len() == logical_read_buf.cap() { + logical_read_state_borrow.transition_to_terminal(Ok(())); + } + } + } + + if let Some(assert_logical_reads) = assert_logical_reads { + for logical_read in assert_logical_reads { + assert!(logical_read.state.borrow().is_terminal()); + } + } +} + +impl LogicalReadState { + fn is_terminal(&self) -> bool { + match self { + LogicalReadState::NotStarted(_) | LogicalReadState::Ongoing(_) => false, + LogicalReadState::Ok(_) | LogicalReadState::Error(_) => true, + LogicalReadState::Undefined => unreachable!(), + } + } + fn transition_to_terminal(&mut self, err: std::io::Result<()>) { + let cur = std::mem::replace(self, LogicalReadState::Undefined); + let buf = match cur { + LogicalReadState::Ongoing(buf) => buf, + x => panic!("must only call in state Ongoing, got {x:?}"), + }; + *self = match err { + Ok(()) => LogicalReadState::Ok(buf), + Err(e) => LogicalReadState::Error(Arc::new(e)), 
+ }; + } +} + +impl std::fmt::Debug for LogicalReadState { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + #[derive(Debug)] + #[allow(unused)] + struct BufferDebug { + len: usize, + cap: usize, + } + impl<'a> From<&'a dyn Buffer> for BufferDebug { + fn from(buf: &'a dyn Buffer) -> Self { + Self { + len: buf.len(), + cap: buf.cap(), + } + } + } + match self { + LogicalReadState::NotStarted(b) => { + write!(f, "NotStarted({:?})", BufferDebug::from(b as &dyn Buffer)) + } + LogicalReadState::Ongoing(b) => { + write!(f, "Ongoing({:?})", BufferDebug::from(b as &dyn Buffer)) + } + LogicalReadState::Ok(b) => write!(f, "Ok({:?})", BufferDebug::from(b as &dyn Buffer)), + LogicalReadState::Error(e) => write!(f, "Error({:?})", e), + LogicalReadState::Undefined => write!(f, "Undefined"), + } + } +} + +#[derive(Debug)] +struct RwLockRefCell(RwLock); +impl RwLockRefCell { + fn new(value: T) -> Self { + Self(RwLock::new(value)) + } + fn borrow(&self) -> impl std::ops::Deref + '_ { + self.0.try_read().unwrap() + } + fn borrow_mut(&self) -> impl std::ops::DerefMut + '_ { + self.0.try_write().unwrap() + } + fn into_inner(self) -> T { + self.0.into_inner().unwrap() + } +} + +impl Buffer for Vec { + fn cap(&self) -> usize { + self.capacity() + } + + fn len(&self) -> usize { + self.len() + } + + fn extend_from_slice(&mut self, src: &[u8]) { + if self.len() + src.len() > self.cap() { + panic!("Buffer capacity exceeded"); + } + Vec::extend_from_slice(self, src); + } +} + +#[cfg(test)] +#[allow(clippy::assertions_on_constants)] +mod tests { + use rand::Rng; + + use crate::{ + context::DownloadBehavior, task_mgr::TaskKind, + virtual_file::owned_buffers_io::slice::SliceMutExt, + }; + + use super::*; + use std::{cell::RefCell, collections::VecDeque}; + + struct InMemoryFile { + content: Vec, + } + + impl InMemoryFile { + fn new_random(len: usize) -> Self { + Self { + content: rand::thread_rng() + .sample_iter(rand::distributions::Standard) + .take(len) + .collect(), + 
} + } + fn test_logical_read(&self, pos: u64, len: usize) -> TestLogicalRead { + let expected_result = if pos as usize + len > self.content.len() { + Err("InMemoryFile short read".to_string()) + } else { + Ok(self.content[pos as usize..pos as usize + len].to_vec()) + }; + TestLogicalRead::new(pos, len, expected_result) + } + } + + #[test] + fn test_in_memory_file() { + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + let file = InMemoryFile::new_random(10); + let test_read = |pos, len| { + let buf = vec![0; len]; + let fut = file.read_exact_at_eof_ok(pos, buf.slice_full(), &ctx); + use futures::FutureExt; + let (slice, nread) = fut + .now_or_never() + .expect("impl never awaits") + .expect("impl never errors"); + let mut buf = slice.into_inner(); + buf.truncate(nread); + buf + }; + assert_eq!(test_read(0, 1), &file.content[0..1]); + assert_eq!(test_read(1, 2), &file.content[1..3]); + assert_eq!(test_read(9, 2), &file.content[9..]); + assert!(test_read(10, 2).is_empty()); + assert!(test_read(11, 2).is_empty()); + } + + impl File for InMemoryFile { + async fn read_exact_at_eof_ok<'a, 'b, B: IoBufMut + Send>( + &'b self, + start: u64, + mut dst: Slice, + _ctx: &'a RequestContext, + ) -> std::io::Result<(Slice, usize)> { + let dst_slice: &mut [u8] = dst.as_mut_rust_slice_full_zeroed(); + let nread = { + let req_len = dst_slice.len(); + let len = std::cmp::min(req_len, self.content.len().saturating_sub(start as usize)); + if start as usize >= self.content.len() { + 0 + } else { + dst_slice[..len] + .copy_from_slice(&self.content[start as usize..start as usize + len]); + len + } + }; + rand::Rng::fill(&mut rand::thread_rng(), &mut dst_slice[nread..]); // to discover bugs + Ok((dst, nread)) + } + } + + #[derive(Clone)] + struct TestLogicalRead { + pos: u64, + len: usize, + expected_result: Result, String>, + } + + impl TestLogicalRead { + fn new(pos: u64, len: usize, expected_result: Result, String>) -> Self { + Self { + pos, + len, + 
expected_result, + } + } + fn make_logical_read(&self) -> LogicalRead> { + LogicalRead::new(self.pos, Vec::with_capacity(self.len)) + } + } + + async fn execute_and_validate_test_logical_reads( + file: &F, + test_logical_reads: I, + ctx: &RequestContext, + ) where + I: IntoIterator, + F: File, + { + let (tmp, test_logical_reads) = test_logical_reads.into_iter().tee(); + let logical_reads = tmp.map(|tr| tr.make_logical_read()).collect::>(); + execute(file, logical_reads.iter(), ctx).await; + for (logical_read, test_logical_read) in logical_reads.into_iter().zip(test_logical_reads) { + let actual = logical_read.into_result().expect("we call execute()"); + match (actual, test_logical_read.expected_result) { + (Ok(actual), Ok(expected)) if actual == expected => {} + (Err(actual), Err(expected)) => { + assert_eq!(actual.to_string(), expected); + } + (actual, expected) => panic!("expected {expected:?}\nactual {actual:?}"), + } + } + } + + #[tokio::test] + async fn test_blackbox() { + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + let cs = DIO_CHUNK_SIZE; + let cs_u64 = cs.into_u64(); + + let file = InMemoryFile::new_random(10 * cs); + + let test_logical_reads = vec![ + file.test_logical_read(0, 1), + // adjacent to logical_read0 + file.test_logical_read(1, 2), + // gap + // spans adjacent chunks + file.test_logical_read(cs_u64 - 1, 2), + // gap + // tail of chunk 3, all of chunk 4, and 2 bytes of chunk 5 + file.test_logical_read(3 * cs_u64 - 1, cs + 2), + // gap + file.test_logical_read(5 * cs_u64, 1), + ]; + let num_test_logical_reads = test_logical_reads.len(); + let test_logical_reads_perms = test_logical_reads + .into_iter() + .permutations(num_test_logical_reads); + + // test all orderings of LogicalReads, the order shouldn't matter for the results + for test_logical_reads in test_logical_reads_perms { + execute_and_validate_test_logical_reads(&file, test_logical_reads, &ctx).await; + } + } + + #[tokio::test] + #[should_panic] + async 
fn test_reusing_logical_reads_panics() { + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + let file = InMemoryFile::new_random(DIO_CHUNK_SIZE); + let a = file.test_logical_read(23, 10); + let logical_reads = vec![a.make_logical_read()]; + execute(&file, &logical_reads, &ctx).await; + // reuse panics + execute(&file, &logical_reads, &ctx).await; + } + + struct RecorderFile<'a> { + recorded: RefCell>, + file: &'a InMemoryFile, + } + + struct RecordedRead { + pos: u64, + req_len: usize, + res: Vec, + } + + impl<'a> RecorderFile<'a> { + fn new(file: &'a InMemoryFile) -> RecorderFile<'a> { + Self { + recorded: Default::default(), + file, + } + } + } + + impl<'x> File for RecorderFile<'x> { + async fn read_exact_at_eof_ok<'a, 'b, B: IoBufMut + Send>( + &'b self, + start: u64, + dst: Slice, + ctx: &'a RequestContext, + ) -> std::io::Result<(Slice, usize)> { + let (dst, nread) = self.file.read_exact_at_eof_ok(start, dst, ctx).await?; + self.recorded.borrow_mut().push(RecordedRead { + pos: start, + req_len: dst.bytes_total(), + res: Vec::from(&dst[..nread]), + }); + Ok((dst, nread)) + } + } + + #[tokio::test] + async fn test_logical_reads_to_same_chunk_are_merged_into_one_chunk_read() { + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + + let file = InMemoryFile::new_random(2 * DIO_CHUNK_SIZE); + + let a = file.test_logical_read(DIO_CHUNK_SIZE.into_u64(), 10); + let b = file.test_logical_read(DIO_CHUNK_SIZE.into_u64() + 30, 20); + + let recorder = RecorderFile::new(&file); + + execute_and_validate_test_logical_reads(&recorder, vec![a, b], &ctx).await; + + let recorded = recorder.recorded.borrow(); + assert_eq!(recorded.len(), 1); + let RecordedRead { pos, req_len, .. 
} = &recorded[0]; + assert_eq!(*pos, DIO_CHUNK_SIZE.into_u64()); + assert_eq!(*req_len, DIO_CHUNK_SIZE); + } + + #[tokio::test] + async fn test_max_chunk_batch_size_is_respected() { + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + + let file = InMemoryFile::new_random(4 * MAX_CHUNK_BATCH_SIZE * DIO_CHUNK_SIZE); + + // read the 10th byte of each chunk 3 .. 3+2*MAX_CHUNK_BATCH_SIZE + assert!(3 < MAX_CHUNK_BATCH_SIZE, "test assumption"); + assert!(10 < DIO_CHUNK_SIZE, "test assumption"); + let mut test_logical_reads = Vec::new(); + for i in 3..3 + MAX_CHUNK_BATCH_SIZE + MAX_CHUNK_BATCH_SIZE / 2 { + test_logical_reads + .push(file.test_logical_read(i.into_u64() * DIO_CHUNK_SIZE.into_u64() + 10, 1)); + } + + let recorder = RecorderFile::new(&file); + + execute_and_validate_test_logical_reads(&recorder, test_logical_reads, &ctx).await; + + let recorded = recorder.recorded.borrow(); + assert_eq!(recorded.len(), 2); + { + let RecordedRead { pos, req_len, .. } = &recorded[0]; + assert_eq!(*pos as usize, 3 * DIO_CHUNK_SIZE); + assert_eq!(*req_len, MAX_CHUNK_BATCH_SIZE * DIO_CHUNK_SIZE); + } + { + let RecordedRead { pos, req_len, .. } = &recorded[1]; + assert_eq!(*pos as usize, (3 + MAX_CHUNK_BATCH_SIZE) * DIO_CHUNK_SIZE); + assert_eq!(*req_len, MAX_CHUNK_BATCH_SIZE / 2 * DIO_CHUNK_SIZE); + } + } + + #[tokio::test] + async fn test_batch_breaks_if_chunk_is_not_interesting() { + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + + assert!(MAX_CHUNK_BATCH_SIZE > 10, "test assumption"); + let file = InMemoryFile::new_random(3 * DIO_CHUNK_SIZE); + + let a = file.test_logical_read(0, 1); // chunk 0 + let b = file.test_logical_read(2 * DIO_CHUNK_SIZE.into_u64(), 1); // chunk 2 + + let recorder = RecorderFile::new(&file); + + execute_and_validate_test_logical_reads(&recorder, vec![a, b], &ctx).await; + + let recorded = recorder.recorded.borrow(); + + assert_eq!(recorded.len(), 2); + { + let RecordedRead { pos, req_len, .. 
} = &recorded[0]; + assert_eq!(*pos, 0); + assert_eq!(*req_len, DIO_CHUNK_SIZE); + } + { + let RecordedRead { pos, req_len, .. } = &recorded[1]; + assert_eq!(*pos, 2 * DIO_CHUNK_SIZE.into_u64()); + assert_eq!(*req_len, DIO_CHUNK_SIZE); + } + } + + struct ExpectedRead { + expect_pos: u64, + expect_len: usize, + respond: Result, String>, + } + + struct MockFile { + expected: RefCell>, + } + + impl Drop for MockFile { + fn drop(&mut self) { + assert!( + self.expected.borrow().is_empty(), + "expected reads not satisfied" + ); + } + } + + macro_rules! mock_file { + ($($pos:expr , $len:expr => $respond:expr),* $(,)?) => {{ + MockFile { + expected: RefCell::new(VecDeque::from(vec![$(ExpectedRead { + expect_pos: $pos, + expect_len: $len, + respond: $respond, + }),*])), + } + }}; + } + + impl File for MockFile { + async fn read_exact_at_eof_ok<'a, 'b, B: IoBufMut + Send>( + &'b self, + start: u64, + mut dst: Slice, + _ctx: &'a RequestContext, + ) -> std::io::Result<(Slice, usize)> { + let ExpectedRead { + expect_pos, + expect_len, + respond, + } = self + .expected + .borrow_mut() + .pop_front() + .expect("unexpected read"); + assert_eq!(start, expect_pos); + assert_eq!(dst.bytes_total(), expect_len); + match respond { + Ok(mocked_bytes) => { + let len = std::cmp::min(dst.bytes_total(), mocked_bytes.len()); + let dst_slice: &mut [u8] = dst.as_mut_rust_slice_full_zeroed(); + dst_slice[..len].copy_from_slice(&mocked_bytes[..len]); + rand::Rng::fill(&mut rand::thread_rng(), &mut dst_slice[len..]); // to discover bugs + Ok((dst, len)) + } + Err(e) => Err(std::io::Error::new(std::io::ErrorKind::Other, e)), + } + } + } + + #[tokio::test] + async fn test_mock_file() { + // Self-test to ensure the relevant features of mock file work as expected. + + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + + let mock_file = mock_file! 
{ + 0 , 512 => Ok(vec![0; 512]), + 512 , 512 => Ok(vec![1; 512]), + 1024 , 512 => Ok(vec![2; 10]), + 2048, 1024 => Err("foo".to_owned()), + }; + + let buf = Vec::with_capacity(512); + let (buf, nread) = mock_file + .read_exact_at_eof_ok(0, buf.slice_full(), &ctx) + .await + .unwrap(); + assert_eq!(nread, 512); + assert_eq!(&buf.into_inner()[..nread], &[0; 512]); + + let buf = Vec::with_capacity(512); + let (buf, nread) = mock_file + .read_exact_at_eof_ok(512, buf.slice_full(), &ctx) + .await + .unwrap(); + assert_eq!(nread, 512); + assert_eq!(&buf.into_inner()[..nread], &[1; 512]); + + let buf = Vec::with_capacity(512); + let (buf, nread) = mock_file + .read_exact_at_eof_ok(1024, buf.slice_full(), &ctx) + .await + .unwrap(); + assert_eq!(nread, 10); + assert_eq!(&buf.into_inner()[..nread], &[2; 10]); + + let buf = Vec::with_capacity(1024); + let err = mock_file + .read_exact_at_eof_ok(2048, buf.slice_full(), &ctx) + .await + .err() + .unwrap(); + assert_eq!(err.to_string(), "foo"); + } + + #[tokio::test] + async fn test_error_on_one_chunk_read_fails_only_dependent_logical_reads() { + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + + let test_logical_reads = vec![ + // read spanning two batches + TestLogicalRead::new( + DIO_CHUNK_SIZE.into_u64() / 2, + MAX_CHUNK_BATCH_SIZE * DIO_CHUNK_SIZE, + Err("foo".to_owned()), + ), + // second read in failing chunk + TestLogicalRead::new( + (MAX_CHUNK_BATCH_SIZE * DIO_CHUNK_SIZE).into_u64() + DIO_CHUNK_SIZE.into_u64() - 10, + 5, + Err("foo".to_owned()), + ), + // read unaffected + TestLogicalRead::new( + (MAX_CHUNK_BATCH_SIZE * DIO_CHUNK_SIZE).into_u64() + + 2 * DIO_CHUNK_SIZE.into_u64() + + 10, + 5, + Ok(vec![1; 5]), + ), + ]; + let (tmp, test_logical_reads) = test_logical_reads.into_iter().tee(); + let test_logical_read_perms = tmp.permutations(test_logical_reads.len()); + + for test_logical_reads in test_logical_read_perms { + let file = mock_file!( + 0, MAX_CHUNK_BATCH_SIZE*DIO_CHUNK_SIZE => 
Ok(vec![0; MAX_CHUNK_BATCH_SIZE*DIO_CHUNK_SIZE]), + (MAX_CHUNK_BATCH_SIZE*DIO_CHUNK_SIZE).into_u64(), DIO_CHUNK_SIZE => Err("foo".to_owned()), + (MAX_CHUNK_BATCH_SIZE*DIO_CHUNK_SIZE + 2*DIO_CHUNK_SIZE).into_u64(), DIO_CHUNK_SIZE => Ok(vec![1; DIO_CHUNK_SIZE]), + ); + execute_and_validate_test_logical_reads(&file, test_logical_reads, &ctx).await; + } + } + + struct TestShortReadsSetup { + ctx: RequestContext, + file: InMemoryFile, + written: u64, + } + fn setup_short_chunk_read_tests() -> TestShortReadsSetup { + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + assert!(DIO_CHUNK_SIZE > 20, "test assumption"); + let written = (2 * DIO_CHUNK_SIZE - 10).into_u64(); + let file = InMemoryFile::new_random(written as usize); + TestShortReadsSetup { ctx, file, written } + } + + #[tokio::test] + async fn test_short_chunk_read_from_written_range() { + // Test what happens if there are logical reads + // that start within the last chunk, and + // the last chunk is not the full chunk length. + // + // The read should succeed despite the short chunk length. + let TestShortReadsSetup { ctx, file, written } = setup_short_chunk_read_tests(); + + let a = file.test_logical_read(written - 10, 5); + let recorder = RecorderFile::new(&file); + + execute_and_validate_test_logical_reads(&recorder, vec![a], &ctx).await; + + let recorded = recorder.recorded.borrow(); + assert_eq!(recorded.len(), 1); + let RecordedRead { pos, req_len, res } = &recorded[0]; + assert_eq!(*pos, DIO_CHUNK_SIZE.into_u64()); + assert_eq!(*req_len, DIO_CHUNK_SIZE); + assert_eq!(res, &file.content[DIO_CHUNK_SIZE..(written as usize)]); + } + + #[tokio::test] + async fn test_short_chunk_read_and_logical_read_from_unwritten_range() { + // Test what happens if there are logical reads + // that start within the last chunk, and + // the last chunk is not the full chunk length, and + // the logical reads end in the unwritten range. 
+ // + // All should fail with UnexpectedEof and have the same IO pattern. + async fn the_impl(offset_delta: i64) { + let TestShortReadsSetup { ctx, file, written } = setup_short_chunk_read_tests(); + + let offset = u64::try_from( + i64::try_from(written) + .unwrap() + .checked_add(offset_delta) + .unwrap(), + ) + .unwrap(); + let a = file.test_logical_read(offset, 5); + let recorder = RecorderFile::new(&file); + let a_vr = a.make_logical_read(); + execute(&recorder, vec![&a_vr], &ctx).await; + + // validate the LogicalRead result + let a_res = a_vr.into_result().unwrap(); + let a_err = a_res.unwrap_err(); + assert_eq!(a_err.kind(), std::io::ErrorKind::UnexpectedEof); + + // validate the IO pattern + let recorded = recorder.recorded.borrow(); + assert_eq!(recorded.len(), 1); + let RecordedRead { pos, req_len, res } = &recorded[0]; + assert_eq!(*pos, DIO_CHUNK_SIZE.into_u64()); + assert_eq!(*req_len, DIO_CHUNK_SIZE); + assert_eq!(res, &file.content[DIO_CHUNK_SIZE..(written as usize)]); + } + + the_impl(-1).await; // start == length - 1 + the_impl(0).await; // start == length + the_impl(1).await; // start == length + 1 + } + + // TODO: mixed: some valid, some UnexpectedEof + + // TODO: same tests but with merges +} diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 774f97e1d9..b15cd4da39 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -13,8 +13,7 @@ use utils::lsn::Lsn; use utils::sync::{gate, heavier_once_cell}; use crate::config::PageServerConf; -use crate::context::{DownloadBehavior, RequestContext}; -use crate::repository::Key; +use crate::context::{DownloadBehavior, RequestContext, RequestContextBuilder}; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::task_mgr::TaskKind; use crate::tenant::timeline::{CompactionError, GetVectoredError}; @@ -35,6 +34,8 @@ mod tests; #[cfg(test)] mod failpoints; +pub const 
S3_UPLOAD_LIMIT: u64 = 4_500_000_000; + /// A Layer contains all data in a "rectangle" consisting of a range of keys and /// range of LSNs. /// @@ -332,23 +333,6 @@ impl Layer { }) } - /// Get all key/values in the layer. Should be replaced with an iterator-based API in the future. - #[allow(dead_code)] - pub(crate) async fn load_key_values( - &self, - ctx: &RequestContext, - ) -> anyhow::Result> { - let layer = self - .0 - .get_or_maybe_download(true, Some(ctx)) - .await - .map_err(|err| match err { - DownloadError::DownloadCancelled => GetVectoredError::Cancelled, - other => GetVectoredError::Other(anyhow::anyhow!(other)), - })?; - layer.load_key_values(&self.0, ctx).await - } - /// Download the layer if evicted. /// /// Will not error when the layer is already downloaded. @@ -1296,7 +1280,10 @@ impl LayerInner { lsn_end: lsn_range.end, remote: !resident, access_stats, - l0: crate::tenant::layer_map::LayerMap::is_l0(&self.layer_desc().key_range), + l0: crate::tenant::layer_map::LayerMap::is_l0( + &self.layer_desc().key_range, + self.layer_desc().is_delta, + ), } } else { let lsn = self.desc.image_layer_lsn(); @@ -1489,8 +1476,9 @@ impl LayerInner { let duration = SystemTime::now().duration_since(local_layer_mtime); match duration { Ok(elapsed) => { - let accessed = self.access_stats.accessed(); - if accessed { + let accessed_and_visible = self.access_stats.accessed() + && self.access_stats.visibility() == LayerVisibilityHint::Visible; + if accessed_and_visible { // Only layers used for reads contribute to our "low residence" metric that is used // to detect thrashing. Layers promoted for other reasons (e.g. compaction) are allowed // to be rapidly evicted without contributing to this metric. 
@@ -1504,7 +1492,7 @@ impl LayerInner { tracing::info!( residence_millis = elapsed.as_millis(), - accessed, + accessed_and_visible, "evicted layer after known residence period" ); } @@ -1690,6 +1678,9 @@ impl DownloadedLayer { ); let res = if owner.desc.is_delta { + let ctx = RequestContextBuilder::extend(ctx) + .page_content_kind(crate::context::PageContentKind::DeltaLayerSummary) + .build(); let summary = Some(delta_layer::Summary::expected( owner.desc.tenant_shard_id.tenant_id, owner.desc.timeline_id, @@ -1700,11 +1691,14 @@ impl DownloadedLayer { &owner.path, summary, Some(owner.conf.max_vectored_read_bytes), - ctx, + &ctx, ) .await .map(LayerKind::Delta) } else { + let ctx = RequestContextBuilder::extend(ctx) + .page_content_kind(crate::context::PageContentKind::ImageLayerSummary) + .build(); let lsn = owner.desc.image_layer_lsn(); let summary = Some(image_layer::Summary::expected( owner.desc.tenant_shard_id.tenant_id, @@ -1717,7 +1711,7 @@ impl DownloadedLayer { lsn, summary, Some(owner.conf.max_vectored_read_bytes), - ctx, + &ctx, ) .await .map(LayerKind::Image) @@ -1771,19 +1765,6 @@ impl DownloadedLayer { } } - async fn load_key_values( - &self, - owner: &Arc, - ctx: &RequestContext, - ) -> anyhow::Result> { - use LayerKind::*; - - match self.get(owner, ctx).await? { - Delta(d) => d.load_key_values(ctx).await, - Image(i) => i.load_key_values(ctx).await, - } - } - async fn dump(&self, owner: &Arc, ctx: &RequestContext) -> anyhow::Result<()> { use LayerKind::*; match self.get(owner, ctx).await? 
{ diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs index bffd2db800..0b9bde4f57 100644 --- a/pageserver/src/tenant/storage_layer/layer/tests.rs +++ b/pageserver/src/tenant/storage_layer/layer/tests.rs @@ -782,7 +782,7 @@ async fn eviction_cancellation_on_drop() { let mut writer = timeline.writer().await; writer .put( - Key::from_i128(5), + crate::repository::Key::from_i128(5), Lsn(0x20), &Value::Image(Bytes::from_static(b"this does not matter either")), &ctx, diff --git a/pageserver/src/tenant/storage_layer/layer_desc.rs b/pageserver/src/tenant/storage_layer/layer_desc.rs index cbd18e650f..e90ff3c4b2 100644 --- a/pageserver/src/tenant/storage_layer/layer_desc.rs +++ b/pageserver/src/tenant/storage_layer/layer_desc.rs @@ -12,8 +12,10 @@ use serde::{Deserialize, Serialize}; #[cfg(test)] use utils::id::TenantId; -/// A unique identifier of a persistent layer. This is different from `LayerDescriptor`, which is only used in the -/// benchmarks. This struct contains all necessary information to find the image / delta layer. It also provides +/// A unique identifier of a persistent layer. +/// +/// This is different from `LayerDescriptor`, which is only used in the benchmarks. +/// This struct contains all necessary information to find the image / delta layer. It also provides /// a unified way to generate layer information like file name. #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Hash)] pub struct PersistentLayerDesc { diff --git a/pageserver/src/tenant/storage_layer/layer_name.rs b/pageserver/src/tenant/storage_layer/layer_name.rs index f33ca076ab..ffe7ca5f3e 100644 --- a/pageserver/src/tenant/storage_layer/layer_name.rs +++ b/pageserver/src/tenant/storage_layer/layer_name.rs @@ -217,8 +217,9 @@ impl fmt::Display for ImageLayerName { } } -/// LayerName is the logical identity of a layer within a LayerMap at a moment in time. 
The -/// LayerName is not a unique filename, as the same LayerName may have multiple physical incarnations +/// LayerName is the logical identity of a layer within a LayerMap at a moment in time. +/// +/// The LayerName is not a unique filename, as the same LayerName may have multiple physical incarnations /// over time (e.g. across shard splits or compression). The physical filenames of layers in local /// storage and object names in remote storage consist of the LayerName plus some extra qualifiers /// that uniquely identify the physical incarnation of a layer (see [crate::tenant::remote_timeline_client::remote_layer_path]) @@ -256,6 +257,10 @@ impl LayerName { LayerName::Delta(layer) => &layer.key_range, } } + + pub fn is_delta(&self) -> bool { + matches!(self, LayerName::Delta(_)) + } } impl fmt::Display for LayerName { diff --git a/pageserver/src/tenant/storage_layer/merge_iterator.rs b/pageserver/src/tenant/storage_layer/merge_iterator.rs index b4bd976033..0831fd9530 100644 --- a/pageserver/src/tenant/storage_layer/merge_iterator.rs +++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs @@ -3,6 +3,7 @@ use std::{ collections::{binary_heap, BinaryHeap}, }; +use anyhow::bail; use pageserver_api::key::Key; use utils::lsn::Lsn; @@ -26,6 +27,13 @@ impl<'a> LayerRef<'a> { Self::Delta(x) => LayerIterRef::Delta(x.iter(ctx)), } } + + fn layer_dbg_info(&self) -> String { + match self { + Self::Image(x) => x.layer_dbg_info(), + Self::Delta(x) => x.layer_dbg_info(), + } + } } enum LayerIterRef<'a> { @@ -40,6 +48,13 @@ impl LayerIterRef<'_> { Self::Image(x) => x.next().await, } } + + fn layer_dbg_info(&self) -> String { + match self { + Self::Image(x) => x.layer_dbg_info(), + Self::Delta(x) => x.layer_dbg_info(), + } + } } /// This type plays several roles at once @@ -75,6 +90,11 @@ impl<'a> PeekableLayerIterRef<'a> { async fn next(&mut self) -> anyhow::Result> { let result = self.peeked.take(); self.peeked = self.iter.next().await?; + if let (Some((k1, l1, _)), 
Some((k2, l2, _))) = (&self.peeked, &result) { + if (k1, l1) < (k2, l2) { + bail!("iterator is not ordered: {}", self.iter.layer_dbg_info()); + } + } Ok(result) } } @@ -178,7 +198,12 @@ impl<'a> IteratorWrapper<'a> { let iter = PeekableLayerIterRef::create(iter).await?; if let Some((k1, l1, _)) = iter.peek() { let (k2, l2) = first_key_lower_bound; - debug_assert!((k1, l1) >= (k2, l2)); + if (k1, l1) < (k2, l2) { + bail!( + "layer key range did not include the first key in the layer: {}", + layer.layer_dbg_info() + ); + } } *self = Self::Loaded { iter }; Ok(()) @@ -201,9 +226,11 @@ impl<'a> IteratorWrapper<'a> { } } -/// A merge iterator over delta/image layer iterators. When duplicated records are -/// found, the iterator will not perform any deduplication, and the caller should handle -/// these situation. By saying duplicated records, there are many possibilities: +/// A merge iterator over delta/image layer iterators. +/// +/// When duplicated records are found, the iterator will not perform any +/// deduplication, and the caller should handle these situations. By saying +/// duplicated records, there are many possibilities: /// /// * Two same delta at the same LSN. /// * Two same image at the same LSN. 
diff --git a/pageserver/src/tenant/storage_layer/split_writer.rs b/pageserver/src/tenant/storage_layer/split_writer.rs index e12e29cd45..b499a0eef4 100644 --- a/pageserver/src/tenant/storage_layer/split_writer.rs +++ b/pageserver/src/tenant/storage_layer/split_writer.rs @@ -1,4 +1,4 @@ -use std::{ops::Range, sync::Arc}; +use std::{future::Future, ops::Range, sync::Arc}; use bytes::Bytes; use pageserver_api::key::{Key, KEY_SIZE}; @@ -7,20 +7,47 @@ use utils::{id::TimelineId, lsn::Lsn, shard::TenantShardId}; use crate::tenant::storage_layer::Layer; use crate::{config::PageServerConf, context::RequestContext, repository::Value, tenant::Timeline}; -use super::{DeltaLayerWriter, ImageLayerWriter, ResidentLayer}; +use super::layer::S3_UPLOAD_LIMIT; +use super::{ + DeltaLayerWriter, ImageLayerWriter, PersistentLayerDesc, PersistentLayerKey, ResidentLayer, +}; -/// An image writer that takes images and produces multiple image layers. The interface does not -/// guarantee atomicity (i.e., if the image layer generation fails, there might be leftover files -/// to be cleaned up) +pub(crate) enum SplitWriterResult { + Produced(ResidentLayer), + Discarded(PersistentLayerKey), +} + +#[cfg(test)] +impl SplitWriterResult { + fn into_resident_layer(self) -> ResidentLayer { + match self { + SplitWriterResult::Produced(layer) => layer, + SplitWriterResult::Discarded(_) => panic!("unexpected discarded layer"), + } + } + + fn into_discarded_layer(self) -> PersistentLayerKey { + match self { + SplitWriterResult::Produced(_) => panic!("unexpected produced layer"), + SplitWriterResult::Discarded(layer) => layer, + } + } +} + +/// An image writer that takes images and produces multiple image layers. 
+/// +/// The interface does not guarantee atomicity (i.e., if the image layer generation +/// fails, there might be leftover files to be cleaned up) #[must_use] pub struct SplitImageLayerWriter { inner: ImageLayerWriter, target_layer_size: u64, - generated_layers: Vec, + generated_layers: Vec, conf: &'static PageServerConf, timeline_id: TimelineId, tenant_shard_id: TenantShardId, lsn: Lsn, + start_key: Key, } impl SplitImageLayerWriter { @@ -49,16 +76,22 @@ impl SplitImageLayerWriter { timeline_id, tenant_shard_id, lsn, + start_key, }) } - pub async fn put_image( + pub async fn put_image_with_discard_fn( &mut self, key: Key, img: Bytes, tline: &Arc, ctx: &RequestContext, - ) -> anyhow::Result<()> { + discard: D, + ) -> anyhow::Result<()> + where + D: FnOnce(&PersistentLayerKey) -> F, + F: Future, + { // The current estimation is an upper bound of the space that the key/image could take // because we did not consider compression in this estimation. The resulting image layer // could be smaller than the target size. 
@@ -76,49 +109,109 @@ impl SplitImageLayerWriter { ) .await?; let prev_image_writer = std::mem::replace(&mut self.inner, next_image_writer); - self.generated_layers.push( - prev_image_writer - .finish_with_end_key(tline, key, ctx) - .await?, - ); + let layer_key = PersistentLayerKey { + key_range: self.start_key..key, + lsn_range: PersistentLayerDesc::image_layer_lsn_range(self.lsn), + is_delta: false, + }; + self.start_key = key; + + if discard(&layer_key).await { + drop(prev_image_writer); + self.generated_layers + .push(SplitWriterResult::Discarded(layer_key)); + } else { + let (desc, path) = prev_image_writer.finish_with_end_key(key, ctx).await?; + + let layer = Layer::finish_creating(self.conf, tline, desc, &path)?; + self.generated_layers + .push(SplitWriterResult::Produced(layer)); + } } self.inner.put_image(key, img, ctx).await } - pub(crate) async fn finish( + #[cfg(test)] + pub async fn put_image( + &mut self, + key: Key, + img: Bytes, + tline: &Arc, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + self.put_image_with_discard_fn(key, img, tline, ctx, |_| async { false }) + .await + } + + pub(crate) async fn finish_with_discard_fn( self, tline: &Arc, ctx: &RequestContext, end_key: Key, - ) -> anyhow::Result> { + discard: D, + ) -> anyhow::Result> + where + D: FnOnce(&PersistentLayerKey) -> F, + F: Future, + { let Self { mut generated_layers, inner, .. 
} = self; - generated_layers.push(inner.finish_with_end_key(tline, end_key, ctx).await?); + if inner.num_keys() == 0 { + return Ok(generated_layers); + } + let layer_key = PersistentLayerKey { + key_range: self.start_key..end_key, + lsn_range: PersistentLayerDesc::image_layer_lsn_range(self.lsn), + is_delta: false, + }; + if discard(&layer_key).await { + generated_layers.push(SplitWriterResult::Discarded(layer_key)); + } else { + let (desc, path) = inner.finish_with_end_key(end_key, ctx).await?; + let layer = Layer::finish_creating(self.conf, tline, desc, &path)?; + generated_layers.push(SplitWriterResult::Produced(layer)); + } Ok(generated_layers) } - /// When split writer fails, the caller should call this function and handle partially generated layers. - #[allow(dead_code)] - pub(crate) async fn take(self) -> anyhow::Result<(Vec, ImageLayerWriter)> { + #[cfg(test)] + pub(crate) async fn finish( + self, + tline: &Arc, + ctx: &RequestContext, + end_key: Key, + ) -> anyhow::Result> { + self.finish_with_discard_fn(tline, ctx, end_key, |_| async { false }) + .await + } + + /// This function will be deprecated with #8841. + pub(crate) fn take(self) -> anyhow::Result<(Vec, ImageLayerWriter)> { Ok((self.generated_layers, self.inner)) } } -/// A delta writer that takes key-lsn-values and produces multiple delta layers. The interface does not -/// guarantee atomicity (i.e., if the delta layer generation fails, there might be leftover files -/// to be cleaned up). +/// A delta writer that takes key-lsn-values and produces multiple delta layers. +/// +/// The interface does not guarantee atomicity (i.e., if the delta layer generation fails, +/// there might be leftover files to be cleaned up). +/// +/// Note that if updates of a single key exceed the target size limit, all of the updates will be batched +/// into a single file. This behavior might change in the future. For reference, the legacy compaction algorithm +/// will split them into multiple files based on size. 
#[must_use] pub struct SplitDeltaLayerWriter { - inner: DeltaLayerWriter, + inner: Option<(Key, DeltaLayerWriter)>, target_layer_size: u64, - generated_layers: Vec, + generated_layers: Vec, conf: &'static PageServerConf, timeline_id: TimelineId, tenant_shard_id: TenantShardId, lsn_range: Range, + last_key_written: Key, } impl SplitDeltaLayerWriter { @@ -126,30 +219,102 @@ impl SplitDeltaLayerWriter { conf: &'static PageServerConf, timeline_id: TimelineId, tenant_shard_id: TenantShardId, - start_key: Key, lsn_range: Range, target_layer_size: u64, - ctx: &RequestContext, ) -> anyhow::Result { Ok(Self { target_layer_size, - inner: DeltaLayerWriter::new( - conf, - timeline_id, - tenant_shard_id, - start_key, - lsn_range.clone(), - ctx, - ) - .await?, + inner: None, generated_layers: Vec::new(), conf, timeline_id, tenant_shard_id, lsn_range, + last_key_written: Key::MIN, }) } + /// Put value into the layer writer. In the case the writer decides to produce a layer, and the discard fn returns true, no layer will be written in the end. + pub async fn put_value_with_discard_fn( + &mut self, + key: Key, + lsn: Lsn, + val: Value, + tline: &Arc, + ctx: &RequestContext, + discard: D, + ) -> anyhow::Result<()> + where + D: FnOnce(&PersistentLayerKey) -> F, + F: Future, + { + // The current estimation is key size plus LSN size plus value size estimation. This is not an accurate + // number, and therefore the final layer size could be a little bit larger or smaller than the target. + // + // Also, keep all updates of a single key in a single file. TODO: split them using the legacy compaction + // strategy. 
https://github.com/neondatabase/neon/issues/8837 + + if self.inner.is_none() { + self.inner = Some(( + key, + DeltaLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_shard_id, + key, + self.lsn_range.clone(), + ctx, + ) + .await?, + )); + } + let (_, inner) = self.inner.as_mut().unwrap(); + + let addition_size_estimation = KEY_SIZE as u64 + 8 /* LSN u64 size */ + 80 /* value size estimation */; + if inner.num_keys() >= 1 + && inner.estimated_size() + addition_size_estimation >= self.target_layer_size + { + if key != self.last_key_written { + let next_delta_writer = DeltaLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_shard_id, + key, + self.lsn_range.clone(), + ctx, + ) + .await?; + let (start_key, prev_delta_writer) = + std::mem::replace(&mut self.inner, Some((key, next_delta_writer))).unwrap(); + let layer_key = PersistentLayerKey { + key_range: start_key..key, + lsn_range: self.lsn_range.clone(), + is_delta: true, + }; + if discard(&layer_key).await { + drop(prev_delta_writer); + self.generated_layers + .push(SplitWriterResult::Discarded(layer_key)); + } else { + let (desc, path) = prev_delta_writer.finish(key, ctx).await?; + let delta_layer = Layer::finish_creating(self.conf, tline, desc, &path)?; + self.generated_layers + .push(SplitWriterResult::Produced(delta_layer)); + } + } else if inner.estimated_size() >= S3_UPLOAD_LIMIT { + // We have to produce a very large file b/c a key is updated too often. + anyhow::bail!( + "a single key is updated too often: key={}, estimated_size={}, and the layer file cannot be produced", + key, + inner.estimated_size() + ); + } + } + self.last_key_written = key; + let (_, inner) = self.inner.as_mut().unwrap(); + inner.put_value(key, lsn, val, ctx).await + } + pub async fn put_value( &mut self, key: Key, @@ -158,56 +323,66 @@ impl SplitDeltaLayerWriter { tline: &Arc, ctx: &RequestContext, ) -> anyhow::Result<()> { - // The current estimation is key size plus LSN size plus value size estimation. 
This is not an accurate - // number, and therefore the final layer size could be a little bit larger or smaller than the target. - let addition_size_estimation = KEY_SIZE as u64 + 8 /* LSN u64 size */ + 80 /* value size estimation */; - if self.inner.num_keys() >= 1 - && self.inner.estimated_size() + addition_size_estimation >= self.target_layer_size - { - let next_delta_writer = DeltaLayerWriter::new( - self.conf, - self.timeline_id, - self.tenant_shard_id, - key, - self.lsn_range.clone(), - ctx, - ) - .await?; - let prev_delta_writer = std::mem::replace(&mut self.inner, next_delta_writer); - let (desc, path) = prev_delta_writer.finish(key, ctx).await?; - let delta_layer = Layer::finish_creating(self.conf, tline, desc, &path)?; - self.generated_layers.push(delta_layer); - } - self.inner.put_value(key, lsn, val, ctx).await + self.put_value_with_discard_fn(key, lsn, val, tline, ctx, |_| async { false }) + .await } - pub(crate) async fn finish( + pub(crate) async fn finish_with_discard_fn( self, tline: &Arc, ctx: &RequestContext, - end_key: Key, - ) -> anyhow::Result> { + discard: D, + ) -> anyhow::Result> + where + D: FnOnce(&PersistentLayerKey) -> F, + F: Future, + { let Self { mut generated_layers, inner, .. 
} = self; - - let (desc, path) = inner.finish(end_key, ctx).await?; - let delta_layer = Layer::finish_creating(self.conf, tline, desc, &path)?; - generated_layers.push(delta_layer); + let Some((start_key, inner)) = inner else { + return Ok(generated_layers); + }; + if inner.num_keys() == 0 { + return Ok(generated_layers); + } + let end_key = self.last_key_written.next(); + let layer_key = PersistentLayerKey { + key_range: start_key..end_key, + lsn_range: self.lsn_range.clone(), + is_delta: true, + }; + if discard(&layer_key).await { + generated_layers.push(SplitWriterResult::Discarded(layer_key)); + } else { + let (desc, path) = inner.finish(end_key, ctx).await?; + let delta_layer = Layer::finish_creating(self.conf, tline, desc, &path)?; + generated_layers.push(SplitWriterResult::Produced(delta_layer)); + } Ok(generated_layers) } - /// When split writer fails, the caller should call this function and handle partially generated layers. - #[allow(dead_code)] - pub(crate) async fn take(self) -> anyhow::Result<(Vec, DeltaLayerWriter)> { - Ok((self.generated_layers, self.inner)) + #[cfg(test)] + pub(crate) async fn finish( + self, + tline: &Arc, + ctx: &RequestContext, + ) -> anyhow::Result> { + self.finish_with_discard_fn(tline, ctx, |_| async { false }) + .await + } + + /// This function will be deprecated with #8841. 
+ pub(crate) fn take(self) -> anyhow::Result<(Vec, Option)> { + Ok((self.generated_layers, self.inner.map(|x| x.1))) } } #[cfg(test)] mod tests { + use itertools::Itertools; use rand::{RngCore, SeedableRng}; use crate::{ @@ -265,10 +440,8 @@ mod tests { tenant.conf, tline.timeline_id, tenant.tenant_shard_id, - get_key(0), Lsn(0x18)..Lsn(0x20), 4 * 1024 * 1024, - &ctx, ) .await .unwrap(); @@ -293,18 +466,36 @@ mod tests { ) .await .unwrap(); - let layers = delta_writer - .finish(&tline, &ctx, get_key(10)) - .await - .unwrap(); + let layers = delta_writer.finish(&tline, &ctx).await.unwrap(); assert_eq!(layers.len(), 1); + assert_eq!( + layers + .into_iter() + .next() + .unwrap() + .into_resident_layer() + .layer_desc() + .key(), + PersistentLayerKey { + key_range: get_key(0)..get_key(1), + lsn_range: Lsn(0x18)..Lsn(0x20), + is_delta: true + } + ); } #[tokio::test] async fn write_split() { - let harness = TenantHarness::create("split_writer_write_split") - .await - .unwrap(); + write_split_helper("split_writer_write_split", false).await; + } + + #[tokio::test] + async fn write_split_discard() { + write_split_helper("split_writer_write_split_discard", false).await; + } + + async fn write_split_helper(harness_name: &'static str, discard: bool) { + let harness = TenantHarness::create(harness_name).await.unwrap(); let (tenant, ctx) = harness.load().await; let tline = tenant @@ -327,10 +518,8 @@ mod tests { tenant.conf, tline.timeline_id, tenant.tenant_shard_id, - get_key(0), Lsn(0x18)..Lsn(0x20), 4 * 1024 * 1024, - &ctx, ) .await .unwrap(); @@ -338,16 +527,19 @@ mod tests { for i in 0..N { let i = i as u32; image_writer - .put_image(get_key(i), get_large_img(), &tline, &ctx) + .put_image_with_discard_fn(get_key(i), get_large_img(), &tline, &ctx, |_| async { + discard + }) .await .unwrap(); delta_writer - .put_value( + .put_value_with_discard_fn( get_key(i), Lsn(0x20), Value::Image(get_large_img()), &tline, &ctx, + |_| async { discard }, ) .await .unwrap(); @@ -356,26 
+548,48 @@ mod tests { .finish(&tline, &ctx, get_key(N as u32)) .await .unwrap(); - let delta_layers = delta_writer - .finish(&tline, &ctx, get_key(N as u32)) - .await - .unwrap(); - assert_eq!(image_layers.len(), N / 512 + 1); - assert_eq!(delta_layers.len(), N / 512 + 1); - for idx in 0..image_layers.len() { - assert_ne!(image_layers[idx].layer_desc().key_range.start, Key::MIN); - assert_ne!(image_layers[idx].layer_desc().key_range.end, Key::MAX); - assert_ne!(delta_layers[idx].layer_desc().key_range.start, Key::MIN); - assert_ne!(delta_layers[idx].layer_desc().key_range.end, Key::MAX); - if idx > 0 { - assert_eq!( - image_layers[idx - 1].layer_desc().key_range.end, - image_layers[idx].layer_desc().key_range.start - ); - assert_eq!( - delta_layers[idx - 1].layer_desc().key_range.end, - delta_layers[idx].layer_desc().key_range.start - ); + let delta_layers = delta_writer.finish(&tline, &ctx).await.unwrap(); + if discard { + for layer in image_layers { + layer.into_discarded_layer(); + } + for layer in delta_layers { + layer.into_discarded_layer(); + } + } else { + let image_layers = image_layers + .into_iter() + .map(|x| x.into_resident_layer()) + .collect_vec(); + let delta_layers = delta_layers + .into_iter() + .map(|x| x.into_resident_layer()) + .collect_vec(); + assert_eq!(image_layers.len(), N / 512 + 1); + assert_eq!(delta_layers.len(), N / 512 + 1); + assert_eq!( + delta_layers.first().unwrap().layer_desc().key_range.start, + get_key(0) + ); + assert_eq!( + delta_layers.last().unwrap().layer_desc().key_range.end, + get_key(N as u32) + ); + for idx in 0..image_layers.len() { + assert_ne!(image_layers[idx].layer_desc().key_range.start, Key::MIN); + assert_ne!(image_layers[idx].layer_desc().key_range.end, Key::MAX); + assert_ne!(delta_layers[idx].layer_desc().key_range.start, Key::MIN); + assert_ne!(delta_layers[idx].layer_desc().key_range.end, Key::MAX); + if idx > 0 { + assert_eq!( + image_layers[idx - 1].layer_desc().key_range.end, + 
image_layers[idx].layer_desc().key_range.start + ); + assert_eq!( + delta_layers[idx - 1].layer_desc().key_range.end, + delta_layers[idx].layer_desc().key_range.start + ); + } } } } @@ -408,10 +622,8 @@ mod tests { tenant.conf, tline.timeline_id, tenant.tenant_shard_id, - get_key(0), Lsn(0x18)..Lsn(0x20), 4 * 1024, - &ctx, ) .await .unwrap(); @@ -450,10 +662,87 @@ mod tests { ) .await .unwrap(); - let layers = delta_writer - .finish(&tline, &ctx, get_key(10)) + let layers = delta_writer.finish(&tline, &ctx).await.unwrap(); + assert_eq!(layers.len(), 2); + let mut layers_iter = layers.into_iter(); + assert_eq!( + layers_iter + .next() + .unwrap() + .into_resident_layer() + .layer_desc() + .key(), + PersistentLayerKey { + key_range: get_key(0)..get_key(1), + lsn_range: Lsn(0x18)..Lsn(0x20), + is_delta: true + } + ); + assert_eq!( + layers_iter + .next() + .unwrap() + .into_resident_layer() + .layer_desc() + .key(), + PersistentLayerKey { + key_range: get_key(1)..get_key(2), + lsn_range: Lsn(0x18)..Lsn(0x20), + is_delta: true + } + ); + } + + #[tokio::test] + async fn write_split_single_key() { + let harness = TenantHarness::create("split_writer_write_split_single_key") .await .unwrap(); - assert_eq!(layers.len(), 2); + let (tenant, ctx) = harness.load().await; + + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + const N: usize = 2000; + let mut delta_writer = SplitDeltaLayerWriter::new( + tenant.conf, + tline.timeline_id, + tenant.tenant_shard_id, + Lsn(0x10)..Lsn(N as u64 * 16 + 0x10), + 4 * 1024 * 1024, + ) + .await + .unwrap(); + + for i in 0..N { + let i = i as u32; + delta_writer + .put_value( + get_key(0), + Lsn(i as u64 * 16 + 0x10), + Value::Image(get_large_img()), + &tline, + &ctx, + ) + .await + .unwrap(); + } + let delta_layers = delta_writer.finish(&tline, &ctx).await.unwrap(); + assert_eq!(delta_layers.len(), 1); + let delta_layer = delta_layers + .into_iter() + .next() + .unwrap() + 
.into_resident_layer(); + assert_eq!( + delta_layer.layer_desc().key(), + PersistentLayerKey { + key_range: get_key(0)..get_key(1), + lsn_range: Lsn(0x10)..Lsn(N as u64 * 16 + 0x10), + is_delta: true + } + ); } } diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 12f080f3c1..478e9bb4f0 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -10,7 +10,6 @@ use crate::context::{DownloadBehavior, RequestContext}; use crate::metrics::TENANT_TASK_EVENTS; use crate::task_mgr; use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME}; -use crate::tenant::config::defaults::DEFAULT_COMPACTION_PERIOD; use crate::tenant::throttle::Stats; use crate::tenant::timeline::CompactionError; use crate::tenant::{Tenant, TenantState}; @@ -192,20 +191,28 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { } } - let started_at = Instant::now(); - let sleep_duration = if period == Duration::ZERO { + + let sleep_duration; + if period == Duration::ZERO { #[cfg(not(feature = "testing"))] info!("automatic compaction is disabled"); // check again in 10 seconds, in case it's been enabled again. 
- Duration::from_secs(10) + sleep_duration = Duration::from_secs(10) } else { + let iteration = Iteration { + started_at: Instant::now(), + period, + kind: BackgroundLoopKind::Compaction, + }; + // Run compaction - match tenant.compaction_iteration(&cancel, &ctx).await { + let IterationResult { output, elapsed } = iteration.run(tenant.compaction_iteration(&cancel, &ctx)).await; + match output { Ok(has_pending_task) => { error_run_count = 0; // schedule the next compaction immediately in case there is a pending compaction task - if has_pending_task { Duration::ZERO } else { period } + sleep_duration = if has_pending_task { Duration::ZERO } else { period }; } Err(e) => { let wait_duration = backoff::exponential_backoff_duration_seconds( @@ -221,16 +228,14 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { &wait_duration, cancel.is_cancelled(), ); - wait_duration + sleep_duration = wait_duration; } } + + // the duration is recorded by performance tests by enabling debug in this function + tracing::debug!(elapsed_ms=elapsed.as_millis(), "compaction iteration complete"); }; - let elapsed = started_at.elapsed(); - warn_when_period_overrun(elapsed, period, BackgroundLoopKind::Compaction); - - // the duration is recorded by performance tests by enabling debug in this function - tracing::debug!(elapsed_ms=elapsed.as_millis(), "compaction iteration complete"); // Perhaps we did no work and the walredo process has been idle for some time: // give it a chance to shut down to avoid leaving walredo process running indefinitely. @@ -368,23 +373,27 @@ async fn gc_loop(tenant: Arc, cancel: CancellationToken) { } } - let started_at = Instant::now(); - let gc_horizon = tenant.get_gc_horizon(); - let sleep_duration = if period == Duration::ZERO || gc_horizon == 0 { + let sleep_duration; + if period == Duration::ZERO || gc_horizon == 0 { #[cfg(not(feature = "testing"))] info!("automatic GC is disabled"); // check again in 10 seconds, in case it's been enabled again. 
- Duration::from_secs(10) + sleep_duration = Duration::from_secs(10); } else { + let iteration = Iteration { + started_at: Instant::now(), + period, + kind: BackgroundLoopKind::Gc, + }; // Run gc - let res = tenant - .gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &cancel, &ctx) + let IterationResult { output, elapsed: _ } = + iteration.run(tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &cancel, &ctx)) .await; - match res { + match output { Ok(_) => { error_run_count = 0; - period + sleep_duration = period; } Err(crate::tenant::GcError::TenantCancelled) => { return; @@ -408,13 +417,11 @@ async fn gc_loop(tenant: Arc, cancel: CancellationToken) { error!("Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}"); } - wait_duration + sleep_duration = wait_duration; } } }; - warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Gc); - if tokio::time::timeout(sleep_duration, cancel.cancelled()) .await .is_ok() @@ -448,9 +455,11 @@ async fn ingest_housekeeping_loop(tenant: Arc, cancel: CancellationToken // If compaction period is set to zero (to disable it), then we will use a reasonable default let period = if period == Duration::ZERO { - humantime::Duration::from_str(DEFAULT_COMPACTION_PERIOD) - .unwrap() - .into() + humantime::Duration::from_str( + pageserver_api::config::tenant_conf_defaults::DEFAULT_COMPACTION_PERIOD, + ) + .unwrap() + .into() } else { period }; @@ -468,14 +477,12 @@ async fn ingest_housekeeping_loop(tenant: Arc, cancel: CancellationToken break; } - let started_at = Instant::now(); - tenant.ingest_housekeeping().await; - - warn_when_period_overrun( - started_at.elapsed(), + let iteration = Iteration { + started_at: Instant::now(), period, - BackgroundLoopKind::IngestHouseKeeping, - ); + kind: BackgroundLoopKind::IngestHouseKeeping, + }; + iteration.run(tenant.ingest_housekeeping()).await; } } .await; @@ -553,6 +560,54 @@ pub(crate) async fn delay_by_lease_length( } } +struct 
Iteration { + started_at: Instant, + period: Duration, + kind: BackgroundLoopKind, +} + +struct IterationResult { + output: O, + elapsed: Duration, +} + +impl Iteration { + #[instrument(skip_all)] + pub(crate) async fn run(self, fut: Fut) -> IterationResult + where + Fut: std::future::Future, + { + let Self { + started_at, + period, + kind, + } = self; + + let mut fut = std::pin::pin!(fut); + + // Wrap `fut` into a future that logs a message every `period` so that we get a + // very obvious breadcrumb in the logs _while_ a slow iteration is happening. + let liveness_logger = async move { + loop { + match tokio::time::timeout(period, &mut fut).await { + Ok(x) => return x, + Err(_) => { + // info level as per the same rationale why warn_when_period_overrun is info + // => https://github.com/neondatabase/neon/pull/5724 + info!("still running"); + } + } + } + }; + + let output = liveness_logger.await; + + let elapsed = started_at.elapsed(); + warn_when_period_overrun(elapsed, period, kind); + + IterationResult { output, elapsed } + } +} /// Attention: the `task` and `period` beocme labels of a pageserver-wide prometheus metric. 
pub(crate) fn warn_when_period_overrun( elapsed: Duration, diff --git a/pageserver/src/tenant/throttle.rs b/pageserver/src/tenant/throttle.rs index f3f3d5e3ae..f222e708e1 100644 --- a/pageserver/src/tenant/throttle.rs +++ b/pageserver/src/tenant/throttle.rs @@ -10,6 +10,7 @@ use std::{ use arc_swap::ArcSwap; use enumset::EnumSet; use tracing::{error, warn}; +use utils::leaky_bucket::{LeakyBucketConfig, RateLimiter}; use crate::{context::RequestContext, task_mgr::TaskKind}; @@ -33,8 +34,7 @@ pub struct Throttle { pub struct Inner { task_kinds: EnumSet, - rate_limiter: Arc, - config: Config, + rate_limiter: Arc, } pub type Config = pageserver_api::models::ThrottleConfig; @@ -77,8 +77,7 @@ where refill_interval, refill_amount, max, - fair, - } = &config; + } = config; let task_kinds: EnumSet = task_kinds .iter() .filter_map(|s| match TaskKind::from_str(s) { @@ -93,18 +92,21 @@ where } }) .collect(); + + // steady rate, we expect `refill_amount` requests per `refill_interval`. + // dividing gives us the rps. + let rps = f64::from(refill_amount.get()) / refill_interval.as_secs_f64(); + let config = LeakyBucketConfig::new(rps, f64::from(max)); + + // initial tracks how many tokens are available to put in the bucket + // we want how many tokens are currently in the bucket + let initial_tokens = max - initial; + + let rate_limiter = RateLimiter::with_initial_tokens(config, f64::from(initial_tokens)); + Inner { task_kinds, - rate_limiter: Arc::new( - leaky_bucket::RateLimiter::builder() - .initial(*initial) - .interval(*refill_interval) - .refill(refill_amount.get()) - .max(*max) - .fair(*fair) - .build(), - ), - config, + rate_limiter: Arc::new(rate_limiter), } } pub fn reconfigure(&self, config: Config) { @@ -127,7 +129,7 @@ where /// See [`Config::steady_rps`]. 
pub fn steady_rps(&self) -> f64 { - self.inner.load().config.steady_rps() + self.inner.load().rate_limiter.steady_rps() } pub async fn throttle(&self, ctx: &RequestContext, key_count: usize) -> Option { @@ -136,18 +138,9 @@ where return None; }; let start = std::time::Instant::now(); - let mut did_throttle = false; - let acquire = inner.rate_limiter.acquire(key_count); - // turn off runtime-induced preemption (aka coop) so our `did_throttle` is accurate - let acquire = tokio::task::unconstrained(acquire); - let mut acquire = std::pin::pin!(acquire); - std::future::poll_fn(|cx| { - use std::future::Future; - let poll = acquire.as_mut().poll(cx); - did_throttle = did_throttle || poll.is_pending(); - poll - }) - .await; + + let did_throttle = inner.rate_limiter.acquire(key_count).await; + self.count_accounted.fetch_add(1, Ordering::Relaxed); if did_throttle { self.count_throttled.fetch_add(1, Ordering::Relaxed); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 80e3843021..f66491d962 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -22,8 +22,8 @@ use handle::ShardTimelineId; use once_cell::sync::Lazy; use pageserver_api::{ key::{ - KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE, - NON_INHERITED_SPARSE_RANGE, + CompactKey, KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, + NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE, }, keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning}, models::{ @@ -44,10 +44,8 @@ use tokio::{ use tokio_util::sync::CancellationToken; use tracing::*; use utils::{ - bin_ser::BeSer, fs_ext, pausable_failpoint, sync::gate::{Gate, GateGuard}, - vec_map::VecMap, }; use std::pin::pin; @@ -68,10 +66,9 @@ use std::{ use crate::{ aux_file::AuxFileSizeEstimator, tenant::{ - config::defaults::DEFAULT_PITR_INTERVAL, layer_map::{LayerMap, SearchResult}, metadata::TimelineMetadata, - 
storage_layer::PersistentLayerDesc, + storage_layer::{inmemory_layer::IndexEntry, PersistentLayerDesc}, }, walredo, }; @@ -104,6 +101,7 @@ use crate::{ pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind}, virtual_file::{MaybeFatalIo, VirtualFile}, }; +use pageserver_api::config::tenant_conf_defaults::DEFAULT_PITR_INTERVAL; use crate::config::PageServerConf; use crate::keyspace::{KeyPartitioning, KeySpace}; @@ -137,7 +135,10 @@ use self::layer_manager::LayerManager; use self::logical_size::LogicalSize; use self::walreceiver::{WalReceiver, WalReceiverConf}; -use super::{config::TenantConf, storage_layer::LayerVisibilityHint, upload_queue::NotInitialized}; +use super::{ + config::TenantConf, storage_layer::inmemory_layer, storage_layer::LayerVisibilityHint, + upload_queue::NotInitialized, +}; use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf}; use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe}; use super::{ @@ -217,7 +218,7 @@ pub(crate) struct RelSizeCache { } pub struct Timeline { - conf: &'static PageServerConf, + pub(crate) conf: &'static PageServerConf, tenant_conf: Arc>, myself: Weak, @@ -866,6 +867,11 @@ impl Timeline { .map(|ancestor| ancestor.timeline_id) } + /// Get the ancestor timeline + pub(crate) fn ancestor_timeline(&self) -> Option<&Arc> { + self.ancestor_timeline.as_ref() + } + /// Get the bytes written since the PITR cutoff on this branch, and /// whether this branch's ancestor_lsn is within its parent's PITR. 
pub(crate) fn get_pitr_history_stats(&self) -> (u64, bool) { @@ -1906,6 +1912,8 @@ impl Timeline { true } else if projected_layer_size >= checkpoint_distance { + // NB: this check is relied upon by: + let _ = IndexEntry::validate_checkpoint_distance; info!( "Will roll layer at {} with layer size {} due to layer size ({})", projected_lsn, layer_size, projected_layer_size @@ -2233,6 +2241,11 @@ impl Timeline { handles: Default::default(), }; + + if aux_file_policy == Some(AuxFilePolicy::V1) { + warn!("this timeline is using deprecated aux file policy V1 (when loading the timeline)"); + } + result.repartition_threshold = result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE; @@ -2996,7 +3009,10 @@ impl Timeline { // - For L1 & image layers, download most recent LSNs first: the older the LSN, the sooner // the layer is likely to be covered by an image layer during compaction. layers.sort_by_key(|(desc, _meta, _atime)| { - std::cmp::Reverse((!LayerMap::is_l0(&desc.key_range), desc.lsn_range.end)) + std::cmp::Reverse(( + !LayerMap::is_l0(&desc.key_range, desc.is_delta), + desc.lsn_range.end, + )) }); let layers = layers @@ -3997,7 +4013,8 @@ impl Timeline { if wrote_keys { // Normal path: we have written some data into the new image layer for this // partition, so flush it to disk. - let image_layer = image_layer_writer.finish(self, ctx).await?; + let (desc, path) = image_layer_writer.finish(ctx).await?; + let image_layer = Layer::finish_creating(self.conf, self, desc, &path)?; Ok(ImageLayerCreationOutcome { image: Some(image_layer), next_start_key: img_range.end, @@ -4085,7 +4102,8 @@ impl Timeline { if wrote_any_image { // Normal path: we have written some data into the new image layer for this // partition, so flush it to disk. 
- let image_layer = image_layer_writer.finish(self, ctx).await?; + let (desc, path) = image_layer_writer.finish(ctx).await?; + let image_layer = Layer::finish_creating(self.conf, self, desc, &path)?; Ok(ImageLayerCreationOutcome { image: Some(image_layer), next_start_key: img_range.end, @@ -4521,7 +4539,6 @@ pub struct DeltaLayerTestDesc { #[cfg(test)] impl DeltaLayerTestDesc { - #[allow(dead_code)] pub fn new(lsn_range: Range, key_range: Range, data: Vec<(Key, Lsn, Value)>) -> Self { Self { lsn_range, @@ -4579,7 +4596,7 @@ impl Timeline { // for compact_level0_phase1 creating an L0, which does not happen in practice // because we have not implemented L0 => L0 compaction. duplicated_layers.insert(l.layer_desc().key()); - } else if LayerMap::is_l0(&l.layer_desc().key_range) { + } else if LayerMap::is_l0(&l.layer_desc().key_range, l.layer_desc().is_delta) { return Err(CompactionError::Other(anyhow::anyhow!("compaction generates a L0 layer file as output, which will cause infinite compaction."))); } else { insert_layers.push(l.clone()); @@ -5388,7 +5405,8 @@ impl Timeline { for (key, img) in images { image_layer_writer.put_image(key, img, ctx).await?; } - let image_layer = image_layer_writer.finish(self, ctx).await?; + let (desc, path) = image_layer_writer.finish(ctx).await?; + let image_layer = Layer::finish_creating(self.conf, self, desc, &path)?; { let mut guard = self.layers.write().await; @@ -5435,12 +5453,17 @@ impl Timeline { !(a.end <= b.start || b.end <= a.start) } - let guard = self.layers.read().await; - for layer in guard.layer_map()?.iter_historic_layers() { - if layer.is_delta() - && overlaps_with(&layer.lsn_range, &deltas.lsn_range) - && layer.lsn_range != deltas.lsn_range - { + if deltas.key_range.start.next() != deltas.key_range.end { + let guard = self.layers.read().await; + let mut invalid_layers = + guard.layer_map()?.iter_historic_layers().filter(|layer| { + layer.is_delta() + && overlaps_with(&layer.lsn_range, &deltas.lsn_range) + && 
layer.lsn_range != deltas.lsn_range + // skip single-key layer files + && layer.key_range.start.next() != layer.key_range.end + }); + if let Some(layer) = invalid_layers.next() { // If a delta layer overlaps with another delta layer AND their LSN range is not the same, panic panic!( "inserted layer violates delta layer LSN invariant: current_lsn_range={}..{}, conflict_lsn_range={}..{}", @@ -5574,44 +5597,6 @@ enum OpenLayerAction { } impl<'a> TimelineWriter<'a> { - /// Put a new page version that can be constructed from a WAL record - /// - /// This will implicitly extend the relation, if the page is beyond the - /// current end-of-file. - pub(crate) async fn put( - &mut self, - key: Key, - lsn: Lsn, - value: &Value, - ctx: &RequestContext, - ) -> anyhow::Result<()> { - // Avoid doing allocations for "small" values. - // In the regression test suite, the limit of 256 avoided allocations in 95% of cases: - // https://github.com/neondatabase/neon/pull/5056#discussion_r1301975061 - let mut buf = smallvec::SmallVec::<[u8; 256]>::new(); - value.ser_into(&mut buf)?; - let buf_size: u64 = buf.len().try_into().expect("oversized value buf"); - - let action = self.get_open_layer_action(lsn, buf_size); - let layer = self.handle_open_layer_action(lsn, action, ctx).await?; - let res = layer.put_value(key.to_compact(), lsn, &buf, ctx).await; - - if res.is_ok() { - // Update the current size only when the entire write was ok. - // In case of failures, we may have had partial writes which - // render the size tracking out of sync. That's ok because - // the checkpoint distance should be significantly smaller - // than the S3 single shot upload limit of 5GiB. 
- let state = self.write_guard.as_mut().unwrap(); - - state.current_size += buf_size; - state.prev_lsn = Some(lsn); - state.max_lsn = std::cmp::max(state.max_lsn, Some(lsn)); - } - - res - } - async fn handle_open_layer_action( &mut self, at: Lsn, @@ -5717,18 +5702,64 @@ impl<'a> TimelineWriter<'a> { } /// Put a batch of keys at the specified Lsns. - /// - /// The batch is sorted by Lsn (enforced by usage of [`utils::vec_map::VecMap`]. pub(crate) async fn put_batch( &mut self, - batch: VecMap, + batch: Vec<(CompactKey, Lsn, usize, Value)>, ctx: &RequestContext, ) -> anyhow::Result<()> { - for (lsn, (key, val)) in batch { - self.put(key, lsn, &val, ctx).await? + if batch.is_empty() { + return Ok(()); } - Ok(()) + let serialized_batch = inmemory_layer::SerializedBatch::from_values(batch)?; + let batch_max_lsn = serialized_batch.max_lsn; + let buf_size: u64 = serialized_batch.raw.len() as u64; + + let action = self.get_open_layer_action(batch_max_lsn, buf_size); + let layer = self + .handle_open_layer_action(batch_max_lsn, action, ctx) + .await?; + + let res = layer.put_batch(serialized_batch, ctx).await; + + if res.is_ok() { + // Update the current size only when the entire write was ok. + // In case of failures, we may have had partial writes which + // render the size tracking out of sync. That's ok because + // the checkpoint distance should be significantly smaller + // than the S3 single shot upload limit of 5GiB. 
+ let state = self.write_guard.as_mut().unwrap(); + + state.current_size += buf_size; + state.prev_lsn = Some(batch_max_lsn); + state.max_lsn = std::cmp::max(state.max_lsn, Some(batch_max_lsn)); + } + + res + } + + #[cfg(test)] + /// Test helper, for tests that would like to poke individual values without composing a batch + pub(crate) async fn put( + &mut self, + key: Key, + lsn: Lsn, + value: &Value, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + use utils::bin_ser::BeSer; + if !key.is_valid_key_on_write_path() { + bail!( + "the request contains data not supported by pageserver at TimelineWriter::put: {}", + key + ); + } + let val_ser_size = value.serialized_size().unwrap() as usize; + self.put_batch( + vec![(key.to_compact(), lsn, val_ser_size, value.clone())], + ctx, + ) + .await } pub(crate) async fn delete_batch( @@ -5869,7 +5900,7 @@ mod tests { }; // Apart from L0s, newest Layers should come first - if !LayerMap::is_l0(layer.name.key_range()) { + if !LayerMap::is_l0(layer.name.key_range(), layer.name.is_delta()) { assert!(layer_lsn <= last_lsn); last_lsn = layer_lsn; } diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 7370ec1386..d1f06e3480 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -14,7 +14,7 @@ use super::{ RecordedDuration, Timeline, }; -use anyhow::{anyhow, Context}; +use anyhow::{anyhow, bail, Context}; use bytes::Bytes; use enumset::EnumSet; use fail::fail_point; @@ -29,9 +29,11 @@ use utils::id::TimelineId; use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder}; use crate::page_cache; -use crate::tenant::config::defaults::{DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD}; use crate::tenant::remote_timeline_client::WaitCompletionError; use crate::tenant::storage_layer::merge_iterator::MergeIterator; +use crate::tenant::storage_layer::split_writer::{ + SplitDeltaLayerWriter, 
SplitImageLayerWriter, SplitWriterResult, +}; use crate::tenant::storage_layer::{ AsLayerDesc, PersistentLayerDesc, PersistentLayerKey, ValueReconstructState, }; @@ -40,6 +42,9 @@ use crate::tenant::timeline::{drop_rlock, DeltaLayerWriter, ImageLayerWriter}; use crate::tenant::timeline::{Layer, ResidentLayer}; use crate::tenant::DeltaLayer; use crate::virtual_file::{MaybeFatalIo, VirtualFile}; +use pageserver_api::config::tenant_conf_defaults::{ + DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD, +}; use crate::keyspace::KeySpace; use crate::repository::{Key, Value}; @@ -71,15 +76,60 @@ pub(crate) struct KeyHistoryRetention { } impl KeyHistoryRetention { + /// Hack: skip delta layer if we need to produce a layer of a same key-lsn. + /// + /// This can happen if we have removed some deltas in "the middle" of some existing layer's key-lsn-range. + /// For example, consider the case where a single delta with range [0x10,0x50) exists. + /// And we have branches at LSN 0x10, 0x20, 0x30. + /// Then we delete branch @ 0x20. + /// Bottom-most compaction may now delete the delta [0x20,0x30). + /// And that wouldnt' change the shape of the layer. + /// + /// Note that bottom-most-gc-compaction never _adds_ new data in that case, only removes. + /// + /// `discard_key` will only be called when the writer reaches its target (instead of for every key), so it's fine to grab a lock inside. + async fn discard_key(key: &PersistentLayerKey, tline: &Arc, dry_run: bool) -> bool { + if dry_run { + return true; + } + let guard = tline.layers.read().await; + if !guard.contains_key(key) { + return false; + } + let layer_generation = guard.get_from_key(key).metadata().generation; + drop(guard); + if layer_generation == tline.generation { + info!( + key=%key, + ?layer_generation, + "discard layer due to duplicated layer key in the same generation", + ); + true + } else { + false + } + } + + /// Pipe a history of a single key to the writers. 
+ /// + /// If `image_writer` is none, the images will be placed into the delta layers. + /// The delta writer will contain all images and deltas (below and above the horizon) except the bottom-most images. + #[allow(clippy::too_many_arguments)] async fn pipe_to( self, key: Key, - delta_writer: &mut Vec<(Key, Lsn, Value)>, - mut image_writer: Option<&mut ImageLayerWriter>, + tline: &Arc, + delta_writer: &mut SplitDeltaLayerWriter, + mut image_writer: Option<&mut SplitImageLayerWriter>, stat: &mut CompactionStatistics, + dry_run: bool, ctx: &RequestContext, ) -> anyhow::Result<()> { let mut first_batch = true; + let discard = |key: &PersistentLayerKey| { + let key = key.clone(); + async move { Self::discard_key(&key, tline, dry_run).await } + }; for (cutoff_lsn, KeyLogAtLsn(logs)) in self.below_horizon { if first_batch { if logs.len() == 1 && logs[0].1.is_image() { @@ -88,28 +138,45 @@ impl KeyHistoryRetention { }; stat.produce_image_key(img); if let Some(image_writer) = image_writer.as_mut() { - image_writer.put_image(key, img.clone(), ctx).await?; + image_writer + .put_image_with_discard_fn(key, img.clone(), tline, ctx, discard) + .await?; } else { - delta_writer.push((key, cutoff_lsn, Value::Image(img.clone()))); + delta_writer + .put_value_with_discard_fn( + key, + cutoff_lsn, + Value::Image(img.clone()), + tline, + ctx, + discard, + ) + .await?; } } else { for (lsn, val) in logs { stat.produce_key(&val); - delta_writer.push((key, lsn, val)); + delta_writer + .put_value_with_discard_fn(key, lsn, val, tline, ctx, discard) + .await?; } } first_batch = false; } else { for (lsn, val) in logs { stat.produce_key(&val); - delta_writer.push((key, lsn, val)); + delta_writer + .put_value_with_discard_fn(key, lsn, val, tline, ctx, discard) + .await?; } } } let KeyLogAtLsn(above_horizon_logs) = self.above_horizon; for (lsn, val) in above_horizon_logs { stat.produce_key(&val); - delta_writer.push((key, lsn, val)); + delta_writer + .put_value_with_discard_fn(key, lsn, val, 
tline, ctx, discard) + .await?; } Ok(()) } @@ -496,10 +563,12 @@ impl Timeline { .await?; if keys_written > 0 { - let new_layer = image_layer_writer - .finish(self, ctx) + let (desc, path) = image_layer_writer + .finish(ctx) .await .map_err(CompactionError::Other)?; + let new_layer = Layer::finish_creating(self.conf, self, desc, &path) + .map_err(CompactionError::Other)?; tracing::info!(layer=%new_layer, "Rewrote layer, {} -> {} bytes", layer.metadata().file_size, new_layer.metadata().file_size); @@ -844,137 +913,13 @@ impl Timeline { // we're compacting, in key, LSN order. // If there's both a Value::Image and Value::WalRecord for the same (key,lsn), // then the Value::Image is ordered before Value::WalRecord. - // - // TODO(https://github.com/neondatabase/neon/issues/8184): remove the page cached blob_io - // option and validation code once we've reached confidence. - enum AllValuesIter<'a> { - PageCachedBlobIo { - all_keys_iter: VecIter<'a>, - }, - StreamingKmergeBypassingPageCache { - merge_iter: MergeIterator<'a>, - }, - ValidatingStreamingKmergeBypassingPageCache { - mode: CompactL0BypassPageCacheValidation, - merge_iter: MergeIterator<'a>, - all_keys_iter: VecIter<'a>, - }, - } - type VecIter<'a> = std::slice::Iter<'a, DeltaEntry<'a>>; // TODO: distinguished lifetimes - impl AllValuesIter<'_> { - async fn next_all_keys_iter( - iter: &mut VecIter<'_>, - ctx: &RequestContext, - ) -> anyhow::Result> { - let Some(DeltaEntry { - key, - lsn, - val: value_ref, - .. 
- }) = iter.next() - else { - return Ok(None); - }; - let value = value_ref.load(ctx).await?; - Ok(Some((*key, *lsn, value))) - } - async fn next( - &mut self, - ctx: &RequestContext, - ) -> anyhow::Result> { - match self { - AllValuesIter::PageCachedBlobIo { all_keys_iter: iter } => { - Self::next_all_keys_iter(iter, ctx).await - } - AllValuesIter::StreamingKmergeBypassingPageCache { merge_iter } => merge_iter.next().await, - AllValuesIter::ValidatingStreamingKmergeBypassingPageCache { mode, merge_iter, all_keys_iter } => async { - // advance both iterators - let all_keys_iter_item = Self::next_all_keys_iter(all_keys_iter, ctx).await; - let merge_iter_item = merge_iter.next().await; - // compare results & log warnings as needed - macro_rules! rate_limited_warn { - ($($arg:tt)*) => {{ - if cfg!(debug_assertions) || cfg!(feature = "testing") { - warn!($($arg)*); - panic!("CompactL0BypassPageCacheValidation failure, check logs"); - } - use once_cell::sync::Lazy; - use utils::rate_limit::RateLimit; - use std::sync::Mutex; - use std::time::Duration; - static LOGGED: Lazy> = - Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10)))); - let mut rate_limit = LOGGED.lock().unwrap(); - rate_limit.call(|| { - warn!($($arg)*); - }); - }} - } - match (&all_keys_iter_item, &merge_iter_item) { - (Err(_), Err(_)) => { - // don't bother asserting equivality of the errors - } - (Err(all_keys), Ok(merge)) => { - rate_limited_warn!(?merge, "all_keys_iter returned an error where merge did not: {all_keys:?}"); - }, - (Ok(all_keys), Err(merge)) => { - rate_limited_warn!(?all_keys, "merge returned an error where all_keys_iter did not: {merge:?}"); - }, - (Ok(None), Ok(None)) => { } - (Ok(Some(all_keys)), Ok(None)) => { - rate_limited_warn!(?all_keys, "merge returned None where all_keys_iter returned Some"); - } - (Ok(None), Ok(Some(merge))) => { - rate_limited_warn!(?merge, "all_keys_iter returned None where merge returned Some"); - } - (Ok(Some((all_keys_key, all_keys_lsn, 
all_keys_value))), Ok(Some((merge_key, merge_lsn, merge_value)))) => { - match mode { - // TODO: in this mode, we still load the value from disk for both iterators, even though we only need the all_keys_iter one - CompactL0BypassPageCacheValidation::KeyLsn => { - let all_keys = (all_keys_key, all_keys_lsn); - let merge = (merge_key, merge_lsn); - if all_keys != merge { - rate_limited_warn!(?all_keys, ?merge, "merge returned a different (Key,LSN) than all_keys_iter"); - } - } - CompactL0BypassPageCacheValidation::KeyLsnValue => { - let all_keys = (all_keys_key, all_keys_lsn, all_keys_value); - let merge = (merge_key, merge_lsn, merge_value); - if all_keys != merge { - rate_limited_warn!(?all_keys, ?merge, "merge returned a different (Key,LSN,Value) than all_keys_iter"); - } - } - } - } - } - // in case of mismatch, trust the legacy all_keys_iter_item - all_keys_iter_item - }.instrument(info_span!("next")).await - } - } - } - let mut all_values_iter = match &self.conf.compact_level0_phase1_value_access { - CompactL0Phase1ValueAccess::PageCachedBlobIo => AllValuesIter::PageCachedBlobIo { - all_keys_iter: all_keys.iter(), - }, - CompactL0Phase1ValueAccess::StreamingKmerge { validate } => { - let merge_iter = { - let mut deltas = Vec::with_capacity(deltas_to_compact.len()); - for l in deltas_to_compact.iter() { - let l = l.get_as_delta(ctx).await.map_err(CompactionError::Other)?; - deltas.push(l); - } - MergeIterator::create(&deltas, &[], ctx) - }; - match validate { - None => AllValuesIter::StreamingKmergeBypassingPageCache { merge_iter }, - Some(validate) => AllValuesIter::ValidatingStreamingKmergeBypassingPageCache { - mode: validate.clone(), - merge_iter, - all_keys_iter: all_keys.iter(), - }, - } + let mut all_values_iter = { + let mut deltas = Vec::with_capacity(deltas_to_compact.len()); + for l in deltas_to_compact.iter() { + let l = l.get_as_delta(ctx).await.map_err(CompactionError::Other)?; + deltas.push(l); } + MergeIterator::create(&deltas, &[], ctx) }; // 
This iterator walks through all keys and is needed to calculate size used by each key @@ -1051,7 +996,7 @@ impl Timeline { let mut keys = 0; while let Some((key, lsn, value)) = all_values_iter - .next(ctx) + .next() .await .map_err(CompactionError::Other)? { @@ -1368,43 +1313,6 @@ impl TryFrom for CompactLevel0Phase1Stats { } } -#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)] -#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)] -pub enum CompactL0Phase1ValueAccess { - /// The old way. - PageCachedBlobIo, - /// The new way. - StreamingKmerge { - /// If set, we run both the old way and the new way, validate that - /// they are identical (=> [`CompactL0BypassPageCacheValidation`]), - /// and if the validation fails, - /// - in tests: fail them with a panic or - /// - in prod, log a rate-limited warning and use the old way's results. - /// - /// If not set, we only run the new way and trust its results. - validate: Option, - }, -} - -/// See [`CompactL0Phase1ValueAccess::StreamingKmerge`]. -#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)] -#[serde(rename_all = "kebab-case")] -pub enum CompactL0BypassPageCacheValidation { - /// Validate that the series of (key, lsn) pairs are the same. - KeyLsn, - /// Validate that the entire output of old and new way is identical. - KeyLsnValue, -} - -impl Default for CompactL0Phase1ValueAccess { - fn default() -> Self { - CompactL0Phase1ValueAccess::StreamingKmerge { - // TODO(https://github.com/neondatabase/neon/issues/8184): change to None once confident - validate: Some(CompactL0BypassPageCacheValidation::KeyLsnValue), - } - } -} - impl Timeline { /// Entry point for new tiered compaction algorithm. /// @@ -1814,11 +1722,27 @@ impl Timeline { } let mut selected_layers = Vec::new(); drop(gc_info); + // Pick all the layers intersect or below the gc_cutoff, get the largest LSN in the selected layers. 
+ let Some(max_layer_lsn) = layers + .iter_historic_layers() + .filter(|desc| desc.get_lsn_range().start <= gc_cutoff) + .map(|desc| desc.get_lsn_range().end) + .max() + else { + info!("no layers to compact with gc"); + return Ok(()); + }; + // Then, pick all the layers that are below the max_layer_lsn. This is to ensure we can pick all single-key + // layers to compact. for desc in layers.iter_historic_layers() { - if desc.get_lsn_range().start <= gc_cutoff { + if desc.get_lsn_range().end <= max_layer_lsn { selected_layers.push(guard.get_from_desc(&desc)); } } + if selected_layers.is_empty() { + info!("no layers to compact with gc"); + return Ok(()); + } retain_lsns_below_horizon.sort(); (selected_layers, gc_cutoff, retain_lsns_below_horizon) }; @@ -1848,27 +1772,52 @@ impl Timeline { lowest_retain_lsn ); // Step 1: (In the future) construct a k-merge iterator over all layers. For now, simply collect all keys + LSNs. - // Also, collect the layer information to decide when to split the new delta layers. - let mut downloaded_layers = Vec::new(); - let mut delta_split_points = BTreeSet::new(); + // Also, verify if the layer map can be split by drawing a horizontal line at every LSN start/end split point. + let mut lsn_split_point = BTreeSet::new(); // TODO: use a better data structure (range tree / range set?) for layer in &layer_selection { - let resident_layer = layer.download_and_keep_resident().await?; - downloaded_layers.push(resident_layer); - let desc = layer.layer_desc(); if desc.is_delta() { - // TODO: is it correct to only record split points for deltas intersecting with the GC horizon? (exclude those below/above the horizon) - // so that we can avoid having too many small delta layers. 
- let key_range = desc.get_key_range(); - delta_split_points.insert(key_range.start); - delta_split_points.insert(key_range.end); + // ignore single-key layer files + if desc.key_range.start.next() != desc.key_range.end { + let lsn_range = &desc.lsn_range; + lsn_split_point.insert(lsn_range.start); + lsn_split_point.insert(lsn_range.end); + } stat.visit_delta_layer(desc.file_size()); } else { stat.visit_image_layer(desc.file_size()); } } + for layer in &layer_selection { + let desc = layer.layer_desc(); + let key_range = &desc.key_range; + if desc.is_delta() && key_range.start.next() != key_range.end { + let lsn_range = desc.lsn_range.clone(); + let intersects = lsn_split_point.range(lsn_range).collect_vec(); + if intersects.len() > 1 { + bail!( + "cannot run gc-compaction because it violates the layer map LSN split assumption: layer {} intersects with LSN [{}]", + desc.key(), + intersects.into_iter().map(|lsn| lsn.to_string()).join(", ") + ); + } + } + } + // The maximum LSN we are processing in this compaction loop + let end_lsn = layer_selection + .iter() + .map(|l| l.layer_desc().lsn_range.end) + .max() + .unwrap(); + // We don't want any of the produced layers to cover the full key range (i.e., MIN..MAX) b/c it will then be recognized + // as an L0 layer. 
let mut delta_layers = Vec::new(); let mut image_layers = Vec::new(); + let mut downloaded_layers = Vec::new(); + for layer in &layer_selection { + let resident_layer = layer.download_and_keep_resident().await?; + downloaded_layers.push(resident_layer); + } for resident_layer in &downloaded_layers { if resident_layer.layer_desc().is_delta() { let layer = resident_layer.get_as_delta(ctx).await?; @@ -1884,138 +1833,17 @@ impl Timeline { let mut accumulated_values = Vec::new(); let mut last_key: Option = None; - enum FlushDeltaResult { - /// Create a new resident layer - CreateResidentLayer(ResidentLayer), - /// Keep an original delta layer - KeepLayer(PersistentLayerKey), - } - - #[allow(clippy::too_many_arguments)] - async fn flush_deltas( - deltas: &mut Vec<(Key, Lsn, crate::repository::Value)>, - last_key: Key, - delta_split_points: &[Key], - current_delta_split_point: &mut usize, - tline: &Arc, - lowest_retain_lsn: Lsn, - ctx: &RequestContext, - stats: &mut CompactionStatistics, - dry_run: bool, - last_batch: bool, - ) -> anyhow::Result> { - // Check if we need to split the delta layer. We split at the original delta layer boundary to avoid - // overlapping layers. - // - // If we have a structure like this: - // - // | Delta 1 | | Delta 4 | - // |---------| Delta 2 |---------| - // | Delta 3 | | Delta 5 | - // - // And we choose to compact delta 2+3+5. We will get an overlapping delta layer with delta 1+4. - // A simple solution here is to split the delta layers using the original boundary, while this - // might produce a lot of small layers. This should be improved and fixed in the future. 
- let mut need_split = false; - while *current_delta_split_point < delta_split_points.len() - && last_key >= delta_split_points[*current_delta_split_point] - { - *current_delta_split_point += 1; - need_split = true; - } - if !need_split && !last_batch { - return Ok(None); - } - let deltas: Vec<(Key, Lsn, Value)> = std::mem::take(deltas); - if deltas.is_empty() { - return Ok(None); - } - let end_lsn = deltas.iter().map(|(_, lsn, _)| lsn).max().copied().unwrap() + 1; - let delta_key = PersistentLayerKey { - key_range: { - let key_start = deltas.first().unwrap().0; - let key_end = deltas.last().unwrap().0.next(); - key_start..key_end - }, - lsn_range: lowest_retain_lsn..end_lsn, - is_delta: true, - }; - { - // Hack: skip delta layer if we need to produce a layer of a same key-lsn. - // - // This can happen if we have removed some deltas in "the middle" of some existing layer's key-lsn-range. - // For example, consider the case where a single delta with range [0x10,0x50) exists. - // And we have branches at LSN 0x10, 0x20, 0x30. - // Then we delete branch @ 0x20. - // Bottom-most compaction may now delete the delta [0x20,0x30). - // And that wouldnt' change the shape of the layer. - // - // Note that bottom-most-gc-compaction never _adds_ new data in that case, only removes. - // That's why it's safe to skip. - let guard = tline.layers.read().await; - - if guard.contains_key(&delta_key) { - let layer_generation = guard.get_from_key(&delta_key).metadata().generation; - drop(guard); - if layer_generation == tline.generation { - stats.discard_delta_layer(); - // TODO: depending on whether we design this compaction process to run along with - // other compactions, there could be layer map modifications after we drop the - // layer guard, and in case it creates duplicated layer key, we will still error - // in the end. 
- info!( - key=%delta_key, - ?layer_generation, - "discard delta layer due to duplicated layer in the same generation" - ); - return Ok(Some(FlushDeltaResult::KeepLayer(delta_key))); - } - } - } - - let mut delta_layer_writer = DeltaLayerWriter::new( - tline.conf, - tline.timeline_id, - tline.tenant_shard_id, - delta_key.key_range.start, - lowest_retain_lsn..end_lsn, - ctx, - ) - .await?; - for (key, lsn, val) in deltas { - delta_layer_writer.put_value(key, lsn, val, ctx).await?; - } - - stats.produce_delta_layer(delta_layer_writer.size()); - if dry_run { - return Ok(None); - } - - let (desc, path) = delta_layer_writer - .finish(delta_key.key_range.end, ctx) - .await?; - let delta_layer = Layer::finish_creating(tline.conf, tline, desc, &path)?; - Ok(Some(FlushDeltaResult::CreateResidentLayer(delta_layer))) - } - - // Hack the key range to be min..(max-1). Otherwise, the image layer will be - // interpreted as an L0 delta layer. - let hack_image_layer_range = { - let mut end_key = Key::MAX; - end_key.field6 -= 1; - Key::MIN..end_key - }; - // Only create image layers when there is no ancestor branches. TODO: create covering image layer // when some condition meet. let mut image_layer_writer = if self.ancestor_timeline.is_none() { Some( - ImageLayerWriter::new( + SplitImageLayerWriter::new( self.conf, self.timeline_id, self.tenant_shard_id, - &hack_image_layer_range, // covers the full key range + Key::MIN, lowest_retain_lsn, + self.get_compaction_target_size(), ctx, ) .await?, @@ -2024,6 +1852,15 @@ impl Timeline { None }; + let mut delta_layer_writer = SplitDeltaLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_shard_id, + lowest_retain_lsn..end_lsn, + self.get_compaction_target_size(), + ) + .await?; + /// Returns None if there is no ancestor branch. Throw an error when the key is not found. 
/// /// Currently, we always get the ancestor image for each key in the child branch no matter whether the image @@ -2044,47 +1881,11 @@ impl Timeline { let img = tline.get(key, tline.ancestor_lsn, ctx).await?; Ok(Some((key, tline.ancestor_lsn, img))) } - let image_layer_key = PersistentLayerKey { - key_range: hack_image_layer_range, - lsn_range: PersistentLayerDesc::image_layer_lsn_range(lowest_retain_lsn), - is_delta: false, - }; - - // Like with delta layers, it can happen that we re-produce an already existing image layer. - // This could happen when a user triggers force compaction and image generation. In this case, - // it's always safe to rewrite the layer. - let discard_image_layer = { - let guard = self.layers.read().await; - if guard.contains_key(&image_layer_key) { - let layer_generation = guard.get_from_key(&image_layer_key).metadata().generation; - drop(guard); - if layer_generation == self.generation { - // TODO: depending on whether we design this compaction process to run along with - // other compactions, there could be layer map modifications after we drop the - // layer guard, and in case it creates duplicated layer key, we will still error - // in the end. - info!( - key=%image_layer_key, - ?layer_generation, - "discard image layer due to duplicated layer key in the same generation", - ); - true - } else { - false - } - } else { - false - } - }; // Actually, we can decide not to write to the image layer at all at this point because // the key and LSN range are determined. However, to keep things simple here, we still // create this writer, and discard the writer in the end. - let mut delta_values = Vec::new(); - let delta_split_points = delta_split_points.into_iter().collect_vec(); - let mut current_delta_split_point = 0; - let mut delta_layers = Vec::new(); while let Some((key, lsn, val)) = merge_iter.next().await? 
{ if cancel.is_cancelled() { return Err(anyhow!("cancelled")); // TODO: refactor to CompactionError and pass cancel error @@ -2115,27 +1916,14 @@ impl Timeline { retention .pipe_to( *last_key, - &mut delta_values, + self, + &mut delta_layer_writer, image_layer_writer.as_mut(), &mut stat, + dry_run, ctx, ) .await?; - delta_layers.extend( - flush_deltas( - &mut delta_values, - *last_key, - &delta_split_points, - &mut current_delta_split_point, - self, - lowest_retain_lsn, - ctx, - &mut stat, - dry_run, - false, - ) - .await?, - ); accumulated_values.clear(); *last_key = key; accumulated_values.push((key, lsn, val)); @@ -2159,43 +1947,75 @@ impl Timeline { retention .pipe_to( last_key, - &mut delta_values, + self, + &mut delta_layer_writer, image_layer_writer.as_mut(), &mut stat, + dry_run, ctx, ) .await?; - delta_layers.extend( - flush_deltas( - &mut delta_values, - last_key, - &delta_split_points, - &mut current_delta_split_point, - self, - lowest_retain_lsn, - ctx, - &mut stat, - dry_run, - true, - ) - .await?, - ); - assert!(delta_values.is_empty(), "unprocessed keys"); - let image_layer = if discard_image_layer { - stat.discard_image_layer(); - None - } else if let Some(writer) = image_layer_writer { - stat.produce_image_layer(writer.size()); + let discard = |key: &PersistentLayerKey| { + let key = key.clone(); + async move { KeyHistoryRetention::discard_key(&key, self, dry_run).await } + }; + + let produced_image_layers = if let Some(writer) = image_layer_writer { if !dry_run { - Some(writer.finish(self, ctx).await?) + writer + .finish_with_discard_fn(self, ctx, Key::MAX, discard) + .await? } else { - None + let (layers, _) = writer.take()?; + assert!(layers.is_empty(), "image layers produced in dry run mode?"); + Vec::new() } } else { - None + Vec::new() }; + let produced_delta_layers = if !dry_run { + delta_layer_writer + .finish_with_discard_fn(self, ctx, discard) + .await? 
+ } else { + let (layers, _) = delta_layer_writer.take()?; + assert!(layers.is_empty(), "delta layers produced in dry run mode?"); + Vec::new() + }; + + let mut compact_to = Vec::new(); + let mut keep_layers = HashSet::new(); + let produced_delta_layers_len = produced_delta_layers.len(); + let produced_image_layers_len = produced_image_layers.len(); + for action in produced_delta_layers { + match action { + SplitWriterResult::Produced(layer) => { + stat.produce_delta_layer(layer.layer_desc().file_size()); + compact_to.push(layer); + } + SplitWriterResult::Discarded(l) => { + keep_layers.insert(l); + stat.discard_delta_layer(); + } + } + } + for action in produced_image_layers { + match action { + SplitWriterResult::Produced(layer) => { + stat.produce_image_layer(layer.layer_desc().file_size()); + compact_to.push(layer); + } + SplitWriterResult::Discarded(l) => { + keep_layers.insert(l); + stat.discard_image_layer(); + } + } + } + let mut layer_selection = layer_selection; + layer_selection.retain(|x| !keep_layers.contains(&x.layer_desc().key())); + info!( "gc-compaction statistics: {}", serde_json::to_string(&stat)? @@ -2206,28 +2026,11 @@ impl Timeline { } info!( - "produced {} delta layers and {} image layers", - delta_layers.len(), - if image_layer.is_some() { 1 } else { 0 } + "produced {} delta layers and {} image layers, {} layers are kept", + produced_delta_layers_len, + produced_image_layers_len, + layer_selection.len() ); - let mut compact_to = Vec::new(); - let mut keep_layers = HashSet::new(); - for action in delta_layers { - match action { - FlushDeltaResult::CreateResidentLayer(layer) => { - compact_to.push(layer); - } - FlushDeltaResult::KeepLayer(l) => { - keep_layers.insert(l); - } - } - } - if discard_image_layer { - keep_layers.insert(image_layer_key); - } - let mut layer_selection = layer_selection; - layer_selection.retain(|x| !keep_layers.contains(&x.layer_desc().key())); - compact_to.extend(image_layer); // Step 3: Place back to the layer map. 
{ diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index b5c577af72..cee259e2e0 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -27,11 +27,11 @@ use super::TaskStateUpdate; use crate::{ context::RequestContext, metrics::{LIVE_CONNECTIONS, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST}, - task_mgr::TaskKind, - task_mgr::WALRECEIVER_RUNTIME, + pgdatadir_mapping::DatadirModification, + task_mgr::{TaskKind, WALRECEIVER_RUNTIME}, tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo}, walingest::WalIngest, - walrecord::DecodedWALRecord, + walrecord::{decode_wal_record, DecodedWALRecord}, }; use postgres_backend::is_expected_io_error; use postgres_connection::PgConnectionConfig; @@ -312,10 +312,25 @@ pub(super) async fn handle_walreceiver_connection( waldecoder.feed_bytes(data); { - let mut decoded = DecodedWALRecord::default(); let mut modification = timeline.begin_modification(startlsn); let mut uncommitted_records = 0; let mut filtered_records = 0; + + async fn commit( + modification: &mut DatadirModification<'_>, + uncommitted: &mut u64, + filtered: &mut u64, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + WAL_INGEST + .records_committed + .inc_by(*uncommitted - *filtered); + modification.commit(ctx).await?; + *uncommitted = 0; + *filtered = 0; + Ok(()) + } + while let Some((lsn, recdata)) = waldecoder.poll_decode()? { // It is important to deal with the aligned records as lsn in getPage@LSN is // aligned and can be several bytes bigger. 
Without this alignment we are @@ -324,9 +339,28 @@ pub(super) async fn handle_walreceiver_connection( return Err(WalReceiverError::Other(anyhow!("LSN not aligned"))); } + // Deserialize WAL record + let mut decoded = DecodedWALRecord::default(); + decode_wal_record(recdata, &mut decoded, modification.tline.pg_version)?; + + if decoded.is_dbase_create_copy(timeline.pg_version) + && uncommitted_records > 0 + { + // Special case: legacy PG database creations operate by reading pages from a 'template' database: + // these are the only kinds of WAL record that require reading data blocks while ingesting. Ensure + // all earlier writes of data blocks are visible by committing any modification in flight. + commit( + &mut modification, + &mut uncommitted_records, + &mut filtered_records, + &ctx, + ) + .await?; + } + // Ingest the records without immediately committing them. let ingested = walingest - .ingest_record(recdata, lsn, &mut modification, &mut decoded, &ctx) + .ingest_record(decoded, lsn, &mut modification, &ctx) .await .with_context(|| format!("could not ingest record at {lsn}"))?; if !ingested { @@ -345,22 +379,29 @@ pub(super) async fn handle_walreceiver_connection( // Commit every ingest_batch_size records. Even if we filtered out // all records, we still need to call commit to advance the LSN. uncommitted_records += 1; - if uncommitted_records >= ingest_batch_size { - WAL_INGEST - .records_committed - .inc_by(uncommitted_records - filtered_records); - modification.commit(&ctx).await?; - uncommitted_records = 0; - filtered_records = 0; + if uncommitted_records >= ingest_batch_size + || modification.approx_pending_bytes() + > DatadirModification::MAX_PENDING_BYTES + { + commit( + &mut modification, + &mut uncommitted_records, + &mut filtered_records, + &ctx, + ) + .await?; } } // Commit the remaining records. 
if uncommitted_records > 0 { - WAL_INGEST - .records_committed - .inc_by(uncommitted_records - filtered_records); - modification.commit(&ctx).await?; + commit( + &mut modification, + &mut uncommitted_records, + &mut filtered_records, + &ctx, + ) + .await?; } } diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs index 54a3ad789b..553edf6d8b 100644 --- a/pageserver/src/tenant/vectored_blob_io.rs +++ b/pageserver/src/tenant/vectored_blob_io.rs @@ -16,7 +16,6 @@ //! Note that the vectored blob api does *not* go through the page cache. use std::collections::BTreeMap; -use std::num::NonZeroUsize; use bytes::BytesMut; use pageserver_api::key::Key; @@ -27,10 +26,7 @@ use utils::vec_map::VecMap; use crate::context::RequestContext; use crate::tenant::blob_io::{BYTE_UNCOMPRESSED, BYTE_ZSTD, LEN_COMPRESSION_BIT_MASK}; -use crate::virtual_file::VirtualFile; - -#[derive(Copy, Clone, Debug, PartialEq, Eq)] -pub struct MaxVectoredReadBytes(pub NonZeroUsize); +use crate::virtual_file::{self, VirtualFile}; /// Metadata bundled with the start and end offset of a blob. #[derive(Copy, Clone, Debug)] @@ -60,7 +56,7 @@ pub struct VectoredBlobsBuf { pub struct VectoredRead { pub start: u64, pub end: u64, - /// Starting offsets and metadata for each blob in this read + /// Start offset and metadata for each blob in this read pub blobs_at: VecMap, } @@ -76,14 +72,109 @@ pub(crate) enum VectoredReadExtended { No, } -pub(crate) struct VectoredReadBuilder { +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +pub enum VectoredReadCoalesceMode { + /// Only coalesce exactly adjacent reads. + AdjacentOnly, + /// In addition to adjacent reads, also consider reads whose corresponding + /// `end` and `start` offsets reside at the same chunk. 
+ Chunked(usize), +} + +impl VectoredReadCoalesceMode { + /// [`AdjacentVectoredReadBuilder`] is used if alignment requirement is 0, + /// whereas [`ChunkedVectoredReadBuilder`] is used for alignment requirement 1 and higher. + pub(crate) fn get() -> Self { + let align = virtual_file::get_io_buffer_alignment_raw(); + if align == 0 { + VectoredReadCoalesceMode::AdjacentOnly + } else { + VectoredReadCoalesceMode::Chunked(align) + } + } +} + +pub(crate) enum VectoredReadBuilder { + Adjacent(AdjacentVectoredReadBuilder), + Chunked(ChunkedVectoredReadBuilder), +} + +impl VectoredReadBuilder { + fn new_impl( + start_offset: u64, + end_offset: u64, + meta: BlobMeta, + max_read_size: Option, + mode: VectoredReadCoalesceMode, + ) -> Self { + match mode { + VectoredReadCoalesceMode::AdjacentOnly => Self::Adjacent( + AdjacentVectoredReadBuilder::new(start_offset, end_offset, meta, max_read_size), + ), + VectoredReadCoalesceMode::Chunked(chunk_size) => { + Self::Chunked(ChunkedVectoredReadBuilder::new( + start_offset, + end_offset, + meta, + max_read_size, + chunk_size, + )) + } + } + } + + pub(crate) fn new( + start_offset: u64, + end_offset: u64, + meta: BlobMeta, + max_read_size: usize, + mode: VectoredReadCoalesceMode, + ) -> Self { + Self::new_impl(start_offset, end_offset, meta, Some(max_read_size), mode) + } + + pub(crate) fn new_streaming( + start_offset: u64, + end_offset: u64, + meta: BlobMeta, + mode: VectoredReadCoalesceMode, + ) -> Self { + Self::new_impl(start_offset, end_offset, meta, None, mode) + } + + pub(crate) fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended { + match self { + VectoredReadBuilder::Adjacent(builder) => builder.extend(start, end, meta), + VectoredReadBuilder::Chunked(builder) => builder.extend(start, end, meta), + } + } + + pub(crate) fn build(self) -> VectoredRead { + match self { + VectoredReadBuilder::Adjacent(builder) => builder.build(), + VectoredReadBuilder::Chunked(builder) => builder.build(), + } + } 
+ + pub(crate) fn size(&self) -> usize { + match self { + VectoredReadBuilder::Adjacent(builder) => builder.size(), + VectoredReadBuilder::Chunked(builder) => builder.size(), + } + } +} + +pub(crate) struct AdjacentVectoredReadBuilder { + /// Start offset of the read. start: u64, + // End offset of the read. end: u64, + /// Start offset and metadata for each blob in this read blobs_at: VecMap, max_read_size: Option, } -impl VectoredReadBuilder { +impl AdjacentVectoredReadBuilder { /// Start building a new vectored read. /// /// Note that by design, this does not check against reading more than `max_read_size` to @@ -93,7 +184,7 @@ impl VectoredReadBuilder { start_offset: u64, end_offset: u64, meta: BlobMeta, - max_read_size: usize, + max_read_size: Option, ) -> Self { let mut blobs_at = VecMap::default(); blobs_at @@ -104,7 +195,7 @@ impl VectoredReadBuilder { start: start_offset, end: end_offset, blobs_at, - max_read_size: Some(max_read_size), + max_read_size, } } /// Attempt to extend the current read with a new blob if the start @@ -113,13 +204,15 @@ impl VectoredReadBuilder { pub(crate) fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended { tracing::trace!(start, end, "trying to extend"); let size = (end - start) as usize; - if self.end == start && { + let not_limited_by_max_read_size = { if let Some(max_read_size) = self.max_read_size { self.size() + size <= max_read_size } else { true } - } { + }; + + if self.end == start && not_limited_by_max_read_size { self.end = end; self.blobs_at .append(start, meta) @@ -144,6 +237,107 @@ impl VectoredReadBuilder { } } +pub(crate) struct ChunkedVectoredReadBuilder { + /// Start block number + start_blk_no: usize, + /// End block number (exclusive). + end_blk_no: usize, + /// Start offset and metadata for each blob in this read + blobs_at: VecMap, + max_read_size: Option, + /// Chunk size reads are coalesced into. + chunk_size: usize, +} + +/// Computes x / d rounded up. 
+fn div_round_up(x: usize, d: usize) -> usize { + (x + (d - 1)) / d +} + +impl ChunkedVectoredReadBuilder { + /// Start building a new vectored read. + /// + /// Note that by design, this does not check against reading more than `max_read_size` to + /// support reading larger blobs than the configuration value. The builder will be single use + /// however after that. + pub(crate) fn new( + start_offset: u64, + end_offset: u64, + meta: BlobMeta, + max_read_size: Option, + chunk_size: usize, + ) -> Self { + let mut blobs_at = VecMap::default(); + blobs_at + .append(start_offset, meta) + .expect("First insertion always succeeds"); + + let start_blk_no = start_offset as usize / chunk_size; + let end_blk_no = div_round_up(end_offset as usize, chunk_size); + Self { + start_blk_no, + end_blk_no, + blobs_at, + max_read_size, + chunk_size, + } + } + + /// Attempts to extend the current read with a new blob if the new blob resides in the same or the immediate next chunk. + /// + /// The resulting size also must be below the max read size. + pub(crate) fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended { + tracing::trace!(start, end, "trying to extend"); + let start_blk_no = start as usize / self.chunk_size; + let end_blk_no = div_round_up(end as usize, self.chunk_size); + + let not_limited_by_max_read_size = { + if let Some(max_read_size) = self.max_read_size { + let coalesced_size = (end_blk_no - self.start_blk_no) * self.chunk_size; + coalesced_size <= max_read_size + } else { + true + } + }; + + // True if the second block starts in the same block or the immediate next block where the first block ended. + // + // Note: This automatically handles the case where two blocks are adjacent to each other, + // whether they starts on chunk size boundary or not. + let is_adjacent_chunk_read = { + // 1. first.end & second.start are in the same block + self.end_blk_no == start_blk_no + 1 || + // 2. 
first.end ends one block before second.start + self.end_blk_no == start_blk_no + }; + + if is_adjacent_chunk_read && not_limited_by_max_read_size { + self.end_blk_no = end_blk_no; + self.blobs_at + .append(start, meta) + .expect("LSNs are ordered within vectored reads"); + + return VectoredReadExtended::Yes; + } + + VectoredReadExtended::No + } + + pub(crate) fn size(&self) -> usize { + (self.end_blk_no - self.start_blk_no) * self.chunk_size + } + + pub(crate) fn build(self) -> VectoredRead { + let start = (self.start_blk_no * self.chunk_size) as u64; + let end = (self.end_blk_no * self.chunk_size) as u64; + VectoredRead { + start, + end, + blobs_at: self.blobs_at, + } + } +} + #[derive(Copy, Clone, Debug)] pub enum BlobFlag { None, @@ -166,14 +360,18 @@ pub struct VectoredReadPlanner { prev: Option<(Key, Lsn, u64, BlobFlag)>, max_read_size: usize, + + mode: VectoredReadCoalesceMode, } impl VectoredReadPlanner { pub fn new(max_read_size: usize) -> Self { + let mode = VectoredReadCoalesceMode::get(); Self { blobs: BTreeMap::new(), prev: None, max_read_size, + mode, } } @@ -252,6 +450,7 @@ impl VectoredReadPlanner { end_offset, BlobMeta { key, lsn }, self.max_read_size, + self.mode, ); let prev_read_builder = current_read_builder.replace(next_read_builder); @@ -303,6 +502,18 @@ impl<'a> VectoredBlobReader<'a> { read.size(), buf.capacity() ); + + if cfg!(debug_assertions) { + let align = virtual_file::get_io_buffer_alignment() as u64; + debug_assert_eq!( + read.start % align, + 0, + "Read start at {} does not satisfy the required io buffer alignment ({} bytes)", + read.start, + align + ); + } + let mut buf = self .file .read_exact_at(buf.slice(0..read.size()), read.start, ctx) @@ -310,27 +521,20 @@ impl<'a> VectoredBlobReader<'a> { .into_inner(); let blobs_at = read.blobs_at.as_slice(); - let start_offset = blobs_at.first().expect("VectoredRead is never empty").0; + + let start_offset = read.start; let mut metas = Vec::with_capacity(blobs_at.len()); - // Blobs in 
`read` only provide their starting offset. The end offset // of a blob is implicit: the start of the next blob if one exists // or the end of the read. - let pairs = blobs_at.iter().zip( - blobs_at - .iter() - .map(Some) - .skip(1) - .chain(std::iter::once(None)), - ); // Some scratch space, put here for reusing the allocation let mut decompressed_vec = Vec::new(); - for ((offset, meta), next) in pairs { - let offset_in_buf = offset - start_offset; - let first_len_byte = buf[offset_in_buf as usize]; + for (blob_start, meta) in blobs_at { + let blob_start_in_buf = blob_start - start_offset; + let first_len_byte = buf[blob_start_in_buf as usize]; // Each blob is prefixed by a header containing its size and compression information. // Extract the size and skip that header to find the start of the data. @@ -340,7 +544,7 @@ impl<'a> VectoredBlobReader<'a> { (1, first_len_byte as u64, BYTE_UNCOMPRESSED) } else { let mut blob_size_buf = [0u8; 4]; - let offset_in_buf = offset_in_buf as usize; + let offset_in_buf = blob_start_in_buf as usize; blob_size_buf.copy_from_slice(&buf[offset_in_buf..offset_in_buf + 4]); blob_size_buf[0] &= !LEN_COMPRESSION_BIT_MASK; @@ -353,12 +557,8 @@ impl<'a> VectoredBlobReader<'a> { ) }; - let start_raw = offset_in_buf + size_length; - let end_raw = match next { - Some((next_blob_start_offset, _)) => next_blob_start_offset - start_offset, - None => start_raw + blob_size, - }; - assert_eq!(end_raw - start_raw, blob_size); + let start_raw = blob_start_in_buf + size_length; + let end_raw = start_raw + blob_size; let (start, end); if compression_bits == BYTE_UNCOMPRESSED { start = start_raw as usize; @@ -393,8 +593,10 @@ impl<'a> VectoredBlobReader<'a> { } } -/// Read planner used in [`crate::tenant::storage_layer::image_layer::ImageLayerIterator`]. It provides a streaming API for -/// getting read blobs. 
It returns a batch when `handle` gets called and when the current key would just exceed the read_size and +/// Read planner used in [`crate::tenant::storage_layer::image_layer::ImageLayerIterator`]. +/// +/// It provides a streaming API for getting read blobs. It returns a batch when +/// `handle` gets called and when the current key would just exceed the read_size and /// max_cnt constraints. pub struct StreamingVectoredReadPlanner { read_builder: Option, @@ -407,18 +609,22 @@ pub struct StreamingVectoredReadPlanner { max_cnt: usize, /// Size of the current batch cnt: usize, + + mode: VectoredReadCoalesceMode, } impl StreamingVectoredReadPlanner { pub fn new(max_read_size: u64, max_cnt: usize) -> Self { assert!(max_cnt > 0); assert!(max_read_size > 0); + let mode = VectoredReadCoalesceMode::get(); Self { read_builder: None, prev: None, max_cnt, max_read_size, cnt: 0, + mode, } } @@ -467,17 +673,12 @@ impl StreamingVectoredReadPlanner { } None => { self.read_builder = { - let mut blobs_at = VecMap::default(); - blobs_at - .append(start_offset, BlobMeta { key, lsn }) - .expect("First insertion always succeeds"); - - Some(VectoredReadBuilder { - start: start_offset, - end: end_offset, - blobs_at, - max_read_size: None, - }) + Some(VectoredReadBuilder::new_streaming( + start_offset, + end_offset, + BlobMeta { key, lsn }, + self.mode, + )) }; } } @@ -511,7 +712,9 @@ mod tests { use super::*; fn validate_read(read: &VectoredRead, offset_range: &[(Key, Lsn, u64, BlobFlag)]) { - assert_eq!(read.start, offset_range.first().unwrap().2); + let align = virtual_file::get_io_buffer_alignment() as u64; + assert_eq!(read.start % align, 0); + assert_eq!(read.start / align, offset_range.first().unwrap().2 / align); let expected_offsets_in_read: Vec<_> = offset_range.iter().map(|o| o.2).collect(); @@ -525,6 +728,68 @@ mod tests { assert_eq!(expected_offsets_in_read, offsets_in_read); } + #[test] + fn planner_chunked_coalesce_all_test() { + use crate::virtual_file; + + let chunk_size 
= virtual_file::get_io_buffer_alignment() as u64; + + // The test explicitly does not check chunk size < 512 + if chunk_size < 512 { + return; + } + + let max_read_size = chunk_size as usize * 8; + let key = Key::MIN; + let lsn = Lsn(0); + + let blob_descriptions = [ + (key, lsn, chunk_size / 8, BlobFlag::None), // Read 1 BEGIN + (key, lsn, chunk_size / 4, BlobFlag::Ignore), // Gap + (key, lsn, chunk_size / 2, BlobFlag::None), + (key, lsn, chunk_size - 2, BlobFlag::Ignore), // Gap + (key, lsn, chunk_size, BlobFlag::None), + (key, lsn, chunk_size * 2 - 1, BlobFlag::None), + (key, lsn, chunk_size * 2 + 1, BlobFlag::Ignore), // Gap + (key, lsn, chunk_size * 3 + 1, BlobFlag::None), + (key, lsn, chunk_size * 5 + 1, BlobFlag::None), + (key, lsn, chunk_size * 6 + 1, BlobFlag::Ignore), // skipped chunk size, but not a chunk: should coalesce. + (key, lsn, chunk_size * 7 + 1, BlobFlag::None), + (key, lsn, chunk_size * 8, BlobFlag::None), // Read 2 BEGIN (b/c max_read_size) + (key, lsn, chunk_size * 9, BlobFlag::Ignore), // ==== skipped a chunk + (key, lsn, chunk_size * 10, BlobFlag::None), // Read 3 BEGIN (cannot coalesce) + ]; + + let ranges = [ + &[ + blob_descriptions[0], + blob_descriptions[2], + blob_descriptions[4], + blob_descriptions[5], + blob_descriptions[7], + blob_descriptions[8], + blob_descriptions[10], + ], + &blob_descriptions[11..12], + &blob_descriptions[13..], + ]; + + let mut planner = VectoredReadPlanner::new(max_read_size); + for (key, lsn, offset, flag) in blob_descriptions { + planner.handle(key, lsn, offset, flag); + } + + planner.handle_range_end(652 * 1024); + + let reads = planner.finish(); + + assert_eq!(reads.len(), ranges.len()); + + for (idx, read) in reads.iter().enumerate() { + validate_read(read, ranges[idx]); + } + } + #[test] fn planner_max_read_size_test() { let max_read_size = 128 * 1024; @@ -571,18 +836,19 @@ mod tests { #[test] fn planner_replacement_test() { - let max_read_size = 128 * 1024; + let chunk_size = 
virtual_file::get_io_buffer_alignment() as u64; + let max_read_size = 128 * chunk_size as usize; let first_key = Key::MIN; let second_key = first_key.next(); let lsn = Lsn(0); let blob_descriptions = vec![ - (first_key, lsn, 0, BlobFlag::None), // First in read 1 - (first_key, lsn, 1024, BlobFlag::None), // Last in read 1 - (second_key, lsn, 2 * 1024, BlobFlag::ReplaceAll), - (second_key, lsn, 3 * 1024, BlobFlag::None), - (second_key, lsn, 4 * 1024, BlobFlag::ReplaceAll), // First in read 2 - (second_key, lsn, 5 * 1024, BlobFlag::None), // Last in read 2 + (first_key, lsn, 0, BlobFlag::None), // First in read 1 + (first_key, lsn, chunk_size, BlobFlag::None), // Last in read 1 + (second_key, lsn, 2 * chunk_size, BlobFlag::ReplaceAll), + (second_key, lsn, 3 * chunk_size, BlobFlag::None), + (second_key, lsn, 4 * chunk_size, BlobFlag::ReplaceAll), // First in read 2 + (second_key, lsn, 5 * chunk_size, BlobFlag::None), // Last in read 2 ]; let ranges = [&blob_descriptions[0..2], &blob_descriptions[4..]]; @@ -592,7 +858,7 @@ mod tests { planner.handle(key, lsn, offset, flag); } - planner.handle_range_end(6 * 1024); + planner.handle_range_end(6 * chunk_size); let reads = planner.finish(); assert_eq!(reads.len(), 2); @@ -737,6 +1003,7 @@ mod tests { let reserved_bytes = blobs.iter().map(|bl| bl.len()).max().unwrap() * 2 + 16; let mut buf = BytesMut::with_capacity(reserved_bytes); + let mode = VectoredReadCoalesceMode::get(); let vectored_blob_reader = VectoredBlobReader::new(&file); let meta = BlobMeta { key: Key::MIN, @@ -748,7 +1015,7 @@ mod tests { if idx + 1 == offsets.len() { continue; } - let read_builder = VectoredReadBuilder::new(*offset, *end, meta, 16 * 4096); + let read_builder = VectoredReadBuilder::new(*offset, *end, meta, 16 * 4096, mode); let read = read_builder.build(); let result = vectored_blob_reader.read_blobs(&read, buf, &ctx).await?; assert_eq!(result.blobs.len(), 1); @@ -784,4 +1051,12 @@ mod tests { round_trip_test_compressed(&blobs, true).await?; 
Ok(()) } + + #[test] + fn test_div_round_up() { + const CHUNK_SIZE: usize = 512; + assert_eq!(1, div_round_up(200, CHUNK_SIZE)); + assert_eq!(1, div_round_up(CHUNK_SIZE, CHUNK_SIZE)); + assert_eq!(2, div_round_up(CHUNK_SIZE + 1, CHUNK_SIZE)); + } } diff --git a/pageserver/src/utilization.rs b/pageserver/src/utilization.rs index 3c48c84598..a0223f3bce 100644 --- a/pageserver/src/utilization.rs +++ b/pageserver/src/utilization.rs @@ -9,7 +9,7 @@ use utils::serde_percent::Percent; use pageserver_api::models::PageserverUtilization; -use crate::{config::PageServerConf, tenant::mgr::TenantManager}; +use crate::{config::PageServerConf, metrics::NODE_UTILIZATION_SCORE, tenant::mgr::TenantManager}; pub(crate) fn regenerate( conf: &PageServerConf, @@ -58,13 +58,13 @@ pub(crate) fn regenerate( disk_usable_pct, shard_count, max_shard_count: MAX_SHARDS, - utilization_score: 0, + utilization_score: None, captured_at: utils::serde_system_time::SystemTime(captured_at), }; - doc.refresh_score(); - - // TODO: make utilization_score into a metric + // Initialize `PageserverUtilization::utilization_score` + let score = doc.cached_score(); + NODE_UTILIZATION_SCORE.set(score); Ok(doc) } diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index c0017280fd..57856eea80 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -1,6 +1,7 @@ -//! //! VirtualFile is like a normal File, but it's not bound directly to -//! a file descriptor. Instead, the file is opened when it's read from, +//! a file descriptor. +//! +//! Instead, the file is opened when it's read from, //! and if too many files are open globally in the system, least-recently //! used ones are closed. //! 
@@ -18,6 +19,7 @@ use crate::tenant::TENANTS_SEGMENT_NAME; use camino::{Utf8Path, Utf8PathBuf}; use once_cell::sync::OnceCell; use owned_buffers_io::io_buf_ext::FullSlice; +use pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT; use pageserver_api::shard::TenantShardId; use std::fs::File; use std::io::{Error, ErrorKind, Seek, SeekFrom}; @@ -1140,10 +1142,13 @@ impl OpenFiles { /// server startup. /// #[cfg(not(test))] -pub fn init(num_slots: usize, engine: IoEngineKind) { +pub fn init(num_slots: usize, engine: IoEngineKind, io_buffer_alignment: usize) { if OPEN_FILES.set(OpenFiles::new(num_slots)).is_err() { panic!("virtual_file::init called twice"); } + if set_io_buffer_alignment(io_buffer_alignment).is_err() { + panic!("IO buffer alignment ({io_buffer_alignment}) is not a power of two"); + } io_engine::init(engine); crate::metrics::virtual_file_descriptor_cache::SIZE_MAX.set(num_slots as u64); } @@ -1167,6 +1172,53 @@ fn get_open_files() -> &'static OpenFiles { } } +static IO_BUFFER_ALIGNMENT: AtomicUsize = AtomicUsize::new(DEFAULT_IO_BUFFER_ALIGNMENT); + +/// Returns true if `x` is zero or a power of two. +fn is_zero_or_power_of_two(x: usize) -> bool { + (x == 0) || ((x & (x - 1)) == 0) +} + +#[allow(unused)] +pub(crate) fn set_io_buffer_alignment(align: usize) -> Result<(), usize> { + if is_zero_or_power_of_two(align) { + IO_BUFFER_ALIGNMENT.store(align, std::sync::atomic::Ordering::Relaxed); + Ok(()) + } else { + Err(align) + } +} + +/// Gets the io buffer alignment requirement. Returns 0 if there is no requirement specified. +/// +/// This function should be used to check the raw config value. 
+pub(crate) fn get_io_buffer_alignment_raw() -> usize { + let align = IO_BUFFER_ALIGNMENT.load(std::sync::atomic::Ordering::Relaxed); + + if cfg!(test) { + let env_var_name = "NEON_PAGESERVER_UNIT_TEST_IO_BUFFER_ALIGNMENT"; + if let Some(test_align) = utils::env::var(env_var_name) { + if is_zero_or_power_of_two(test_align) { + test_align + } else { + panic!("IO buffer alignment ({test_align}) is not a power of two"); + } + } else { + align + } + } else { + align + } +} + +/// Gets the io buffer alignment requirement. Returns 1 if the alignment config is set to zero. +/// +/// This function should be used for getting the actual alignment value to use. +pub(crate) fn get_io_buffer_alignment() -> usize { + let align = get_io_buffer_alignment_raw(); + align.max(1) +} + #[cfg(test)] mod tests { use crate::context::DownloadBehavior; diff --git a/pageserver/src/virtual_file/io_engine.rs b/pageserver/src/virtual_file/io_engine.rs index faef1ba9ff..ccde90ee1a 100644 --- a/pageserver/src/virtual_file/io_engine.rs +++ b/pageserver/src/virtual_file/io_engine.rs @@ -84,9 +84,14 @@ pub(crate) fn get() -> IoEngine { } }, Err(std::env::VarError::NotPresent) => { - crate::config::defaults::DEFAULT_VIRTUAL_FILE_IO_ENGINE - .parse() - .unwrap() + #[cfg(target_os = "linux")] + { + IoEngineKind::TokioEpollUring + } + #[cfg(not(target_os = "linux"))] + { + IoEngineKind::StdFs + } } Err(std::env::VarError::NotUnicode(_)) => { panic!("env var {env_var_name} is not unicode"); diff --git a/pageserver/src/virtual_file/owned_buffers_io/write.rs b/pageserver/src/virtual_file/owned_buffers_io/write.rs index f8f37b17e3..568cf62e56 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs @@ -78,6 +78,7 @@ where .expect("must not use after we returned an error") } + /// Guarantees that if Ok() is returned, all bytes in `chunk` have been accepted. 
#[cfg_attr(target_os = "macos", allow(dead_code))] pub async fn write_buffered( &mut self, diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 8425528740..229c01a681 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -21,19 +21,23 @@ //! redo Postgres process, but some records it can handle directly with //! bespoken Rust code. +use std::time::Duration; +use std::time::SystemTime; + use pageserver_api::shard::ShardIdentity; -use postgres_ffi::v14::nonrelfile_utils::clogpage_precedes; -use postgres_ffi::v14::nonrelfile_utils::slru_may_delete_clogsegment; +use postgres_ffi::{dispatch_pgversion, enum_pgversion, enum_pgversion_dispatch, TimestampTz}; use postgres_ffi::{fsm_logical_to_physical, page_is_new, page_set_lsn}; use anyhow::{bail, Context, Result}; use bytes::{Buf, Bytes, BytesMut}; use tracing::*; use utils::failpoint_support; +use utils::rate_limit::RateLimit; use crate::context::RequestContext; use crate::metrics::WAL_INGEST; use crate::pgdatadir_mapping::{DatadirModification, Version}; +use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::PageReconstructError; use crate::tenant::Timeline; use crate::walrecord::*; @@ -42,17 +46,40 @@ use pageserver_api::key::rel_block_to_key; use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; use postgres_ffi::pg_constants; use postgres_ffi::relfile_utils::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM}; -use postgres_ffi::v14::nonrelfile_utils::mx_offset_to_member_segment; -use postgres_ffi::v14::xlog_utils::*; -use postgres_ffi::v14::CheckPoint; use postgres_ffi::TransactionId; use postgres_ffi::BLCKSZ; +use utils::bin_ser::SerializeError; use utils::lsn::Lsn; +enum_pgversion! 
{CheckPoint, pgv::CheckPoint} + +impl CheckPoint { + fn encode(&self) -> Result { + enum_pgversion_dispatch!(self, CheckPoint, cp, { cp.encode() }) + } + + fn update_next_xid(&mut self, xid: u32) -> bool { + enum_pgversion_dispatch!(self, CheckPoint, cp, { cp.update_next_xid(xid) }) + } + + pub fn update_next_multixid(&mut self, multi_xid: u32, multi_offset: u32) -> bool { + enum_pgversion_dispatch!(self, CheckPoint, cp, { + cp.update_next_multixid(multi_xid, multi_offset) + }) + } +} + pub struct WalIngest { shard: ShardIdentity, checkpoint: CheckPoint, checkpoint_modified: bool, + warn_ingest_lag: WarnIngestLag, +} + +struct WarnIngestLag { + lag_msg_ratelimit: RateLimit, + future_lsn_msg_ratelimit: RateLimit, + timestamp_invalid_msg_ratelimit: RateLimit, } impl WalIngest { @@ -64,13 +91,23 @@ impl WalIngest { // Fetch the latest checkpoint into memory, so that we can compare with it // quickly in `ingest_record` and update it when it changes. let checkpoint_bytes = timeline.get_checkpoint(startpoint, ctx).await?; - let checkpoint = CheckPoint::decode(&checkpoint_bytes)?; - trace!("CheckPoint.nextXid = {}", checkpoint.nextXid.value); + let pgversion = timeline.pg_version; + + let checkpoint = dispatch_pgversion!(pgversion, { + let checkpoint = pgv::CheckPoint::decode(&checkpoint_bytes)?; + trace!("CheckPoint.nextXid = {}", checkpoint.nextXid.value); + >::into(checkpoint) + }); Ok(WalIngest { shard: *timeline.get_shard_identity(), checkpoint, checkpoint_modified: false, + warn_ingest_lag: WarnIngestLag { + lag_msg_ratelimit: RateLimit::new(std::time::Duration::from_secs(10)), + future_lsn_msg_ratelimit: RateLimit::new(std::time::Duration::from_secs(10)), + timestamp_invalid_msg_ratelimit: RateLimit::new(std::time::Duration::from_secs(10)), + }, }) } @@ -86,10 +123,9 @@ impl WalIngest { /// pub async fn ingest_record( &mut self, - recdata: Bytes, + decoded: DecodedWALRecord, lsn: Lsn, modification: &mut DatadirModification<'_>, - decoded: &mut DecodedWALRecord, 
ctx: &RequestContext, ) -> anyhow::Result { WAL_INGEST.records_received.inc(); @@ -97,7 +133,12 @@ impl WalIngest { let prev_len = modification.len(); modification.set_lsn(lsn)?; - decode_wal_record(recdata, decoded, pg_version)?; + + if decoded.is_dbase_create_copy(pg_version) { + // Records of this type should always be preceded by a commit(), as they + // rely on reading data pages back from the Timeline. + assert!(!modification.has_dirty_data_pages()); + } let mut buf = decoded.record.clone(); buf.advance(decoded.main_data_offset); @@ -115,11 +156,11 @@ impl WalIngest { pg_constants::RM_HEAP_ID | pg_constants::RM_HEAP2_ID => { // Heap AM records need some special handling, because they modify VM pages // without registering them with the standard mechanism. - self.ingest_heapam_record(&mut buf, modification, decoded, ctx) + self.ingest_heapam_record(&mut buf, modification, &decoded, ctx) .await?; } pg_constants::RM_NEON_ID => { - self.ingest_neonrmgr_record(&mut buf, modification, decoded, ctx) + self.ingest_neonrmgr_record(&mut buf, modification, &decoded, ctx) .await?; } // Handle other special record types @@ -196,6 +237,26 @@ impl WalIngest { .await?; } } + } else if pg_version == 17 { + if info == postgres_ffi::v17::bindings::XLOG_DBASE_CREATE_WAL_LOG { + debug!("XLOG_DBASE_CREATE_WAL_LOG: noop"); + } else if info == postgres_ffi::v17::bindings::XLOG_DBASE_CREATE_FILE_COPY { + // The XLOG record was renamed between v14 and v15, + // but the record format is the same. + // So we can reuse XlCreateDatabase here. 
+ debug!("XLOG_DBASE_CREATE_FILE_COPY"); + let createdb = XlCreateDatabase::decode(&mut buf); + self.ingest_xlog_dbase_create(modification, &createdb, ctx) + .await?; + } else if info == postgres_ffi::v17::bindings::XLOG_DBASE_DROP { + let dropdb = XlDropDatabase::decode(&mut buf); + for tablespace_id in dropdb.tablespace_ids { + trace!("Drop db {}, {}", tablespace_id, dropdb.db_id); + modification + .drop_dbdir(tablespace_id, dropdb.db_id, ctx) + .await?; + } + } } } pg_constants::RM_TBLSPC_ID => { @@ -205,7 +266,11 @@ impl WalIngest { let info = decoded.xl_info & !pg_constants::XLR_INFO_MASK; if info == pg_constants::CLOG_ZEROPAGE { - let pageno = buf.get_u32_le(); + let pageno = if pg_version < 17 { + buf.get_u32_le() + } else { + buf.get_u64_le() as u32 + }; let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; self.put_slru_page_image( @@ -219,7 +284,7 @@ impl WalIngest { .await?; } else { assert!(info == pg_constants::CLOG_TRUNCATE); - let xlrec = XlClogTruncate::decode(&mut buf); + let xlrec = XlClogTruncate::decode(&mut buf, pg_version); self.ingest_clog_truncate_record(modification, &xlrec, ctx) .await?; } @@ -258,12 +323,21 @@ impl WalIngest { parsed_xact.xid, lsn, ); - modification - .drop_twophase_file(parsed_xact.xid, ctx) - .await?; + + let xid: u64 = if pg_version >= 17 { + self.adjust_to_full_transaction_id(parsed_xact.xid)? + } else { + parsed_xact.xid as u64 + }; + modification.drop_twophase_file(xid, ctx).await?; } else if info == pg_constants::XLOG_XACT_PREPARE { + let xid: u64 = if pg_version >= 17 { + self.adjust_to_full_transaction_id(decoded.xl_xid)? 
+ } else { + decoded.xl_xid as u64 + }; modification - .put_twophase_file(decoded.xl_xid, Bytes::copy_from_slice(&buf[..]), ctx) + .put_twophase_file(xid, Bytes::copy_from_slice(&buf[..]), ctx) .await?; } } @@ -271,7 +345,11 @@ impl WalIngest { let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; if info == pg_constants::XLOG_MULTIXACT_ZERO_OFF_PAGE { - let pageno = buf.get_u32_le(); + let pageno = if pg_version < 17 { + buf.get_u32_le() + } else { + buf.get_u64_le() as u32 + }; let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; self.put_slru_page_image( @@ -284,7 +362,11 @@ impl WalIngest { ) .await?; } else if info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE { - let pageno = buf.get_u32_le(); + let pageno = if pg_version < 17 { + buf.get_u32_le() + } else { + buf.get_u64_le() as u32 + }; let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; self.put_slru_page_image( @@ -307,76 +389,99 @@ impl WalIngest { } pg_constants::RM_RELMAP_ID => { let xlrec = XlRelmapUpdate::decode(&mut buf); - self.ingest_relmap_page(modification, &xlrec, decoded, ctx) + self.ingest_relmap_page(modification, &xlrec, &decoded, ctx) .await?; } pg_constants::RM_XLOG_ID => { let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; - if info == pg_constants::XLOG_NEXTOID { - let next_oid = buf.get_u32_le(); - if self.checkpoint.nextOid != next_oid { - self.checkpoint.nextOid = next_oid; + if info == pg_constants::XLOG_PARAMETER_CHANGE { + if let CheckPoint::V17(cp) = &mut self.checkpoint { + let rec = v17::XlParameterChange::decode(&mut buf); + cp.wal_level = rec.wal_level; self.checkpoint_modified = true; } - } else if info == pg_constants::XLOG_CHECKPOINT_ONLINE - || info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN - { - let mut checkpoint_bytes = [0u8; SIZEOF_CHECKPOINT]; - buf.copy_to_slice(&mut checkpoint_bytes); - let xlog_checkpoint = 
CheckPoint::decode(&checkpoint_bytes)?; - trace!( - "xlog_checkpoint.oldestXid={}, checkpoint.oldestXid={}", - xlog_checkpoint.oldestXid, - self.checkpoint.oldestXid - ); - if (self - .checkpoint - .oldestXid - .wrapping_sub(xlog_checkpoint.oldestXid) as i32) - < 0 - { - self.checkpoint.oldestXid = xlog_checkpoint.oldestXid; + } else if info == pg_constants::XLOG_END_OF_RECOVERY { + if let CheckPoint::V17(cp) = &mut self.checkpoint { + let rec = v17::XlEndOfRecovery::decode(&mut buf); + cp.wal_level = rec.wal_level; + self.checkpoint_modified = true; } - trace!( - "xlog_checkpoint.oldestActiveXid={}, checkpoint.oldestActiveXid={}", - xlog_checkpoint.oldestActiveXid, - self.checkpoint.oldestActiveXid - ); - - // A shutdown checkpoint has `oldestActiveXid == InvalidTransactionid`, - // because at shutdown, all in-progress transactions will implicitly - // end. Postgres startup code knows that, and allows hot standby to start - // immediately from a shutdown checkpoint. - // - // In Neon, Postgres hot standby startup always behaves as if starting from - // an online checkpoint. It needs a valid `oldestActiveXid` value, so - // instead of overwriting self.checkpoint.oldestActiveXid with - // InvalidTransactionid from the checkpoint WAL record, update it to a - // proper value, knowing that there are no in-progress transactions at this - // point, except for prepared transactions. - // - // See also the neon code changes in the InitWalRecovery() function. - if xlog_checkpoint.oldestActiveXid == pg_constants::INVALID_TRANSACTION_ID - && info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN - { - let mut oldest_active_xid = self.checkpoint.nextXid.value as u32; - for xid in modification.tline.list_twophase_files(lsn, ctx).await? 
{ - if (xid.wrapping_sub(oldest_active_xid) as i32) < 0 { - oldest_active_xid = xid; - } - } - self.checkpoint.oldestActiveXid = oldest_active_xid; - } else { - self.checkpoint.oldestActiveXid = xlog_checkpoint.oldestActiveXid; - } - - // Write a new checkpoint key-value pair on every checkpoint record, even - // if nothing really changed. Not strictly required, but it seems nice to - // have some trace of the checkpoint records in the layer files at the same - // LSNs. - self.checkpoint_modified = true; } + + enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, { + if info == pg_constants::XLOG_NEXTOID { + let next_oid = buf.get_u32_le(); + if cp.nextOid != next_oid { + cp.nextOid = next_oid; + self.checkpoint_modified = true; + } + } else if info == pg_constants::XLOG_CHECKPOINT_ONLINE + || info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN + { + let mut checkpoint_bytes = [0u8; pgv::xlog_utils::SIZEOF_CHECKPOINT]; + buf.copy_to_slice(&mut checkpoint_bytes); + let xlog_checkpoint = pgv::CheckPoint::decode(&checkpoint_bytes)?; + trace!( + "xlog_checkpoint.oldestXid={}, checkpoint.oldestXid={}", + xlog_checkpoint.oldestXid, + cp.oldestXid + ); + if (cp.oldestXid.wrapping_sub(xlog_checkpoint.oldestXid) as i32) < 0 { + cp.oldestXid = xlog_checkpoint.oldestXid; + } + trace!( + "xlog_checkpoint.oldestActiveXid={}, checkpoint.oldestActiveXid={}", + xlog_checkpoint.oldestActiveXid, + cp.oldestActiveXid + ); + + // A shutdown checkpoint has `oldestActiveXid == InvalidTransactionid`, + // because at shutdown, all in-progress transactions will implicitly + // end. Postgres startup code knows that, and allows hot standby to start + // immediately from a shutdown checkpoint. + // + // In Neon, Postgres hot standby startup always behaves as if starting from + // an online checkpoint. 
It needs a valid `oldestActiveXid` value, so + // instead of overwriting self.checkpoint.oldestActiveXid with + // InvalidTransactionid from the checkpoint WAL record, update it to a + // proper value, knowing that there are no in-progress transactions at this + // point, except for prepared transactions. + // + // See also the neon code changes in the InitWalRecovery() function. + if xlog_checkpoint.oldestActiveXid == pg_constants::INVALID_TRANSACTION_ID + && info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN + { + let oldest_active_xid = if pg_version >= 17 { + let mut oldest_active_full_xid = cp.nextXid.value; + for xid in modification.tline.list_twophase_files(lsn, ctx).await? { + if xid < oldest_active_full_xid { + oldest_active_full_xid = xid; + } + } + oldest_active_full_xid as u32 + } else { + let mut oldest_active_xid = cp.nextXid.value as u32; + for xid in modification.tline.list_twophase_files(lsn, ctx).await? { + let narrow_xid = xid as u32; + if (narrow_xid.wrapping_sub(oldest_active_xid) as i32) < 0 { + oldest_active_xid = narrow_xid; + } + } + oldest_active_xid + }; + cp.oldestActiveXid = oldest_active_xid; + } else { + cp.oldestActiveXid = xlog_checkpoint.oldestActiveXid; + } + + // Write a new checkpoint key-value pair on every checkpoint record, even + // if nothing really changed. Not strictly required, but it seems nice to + // have some trace of the checkpoint records in the layer files at the same + // LSNs. 
+ self.checkpoint_modified = true; + } + }); } pg_constants::RM_LOGICALMSG_ID => { let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; @@ -400,7 +505,11 @@ impl WalIngest { let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; if info == pg_constants::XLOG_RUNNING_XACTS { let xlrec = crate::walrecord::XlRunningXacts::decode(&mut buf); - self.checkpoint.oldestActiveXid = xlrec.oldest_running_xid; + + enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, { + cp.oldestActiveXid = xlrec.oldest_running_xid; + }); + self.checkpoint_modified = true; } } @@ -452,7 +561,7 @@ impl WalIngest { continue; } - self.ingest_decoded_block(modification, lsn, decoded, blk, ctx) + self.ingest_decoded_block(modification, lsn, &decoded, blk, ctx) .await?; } @@ -468,9 +577,30 @@ impl WalIngest { // until commit() is called to flush the data into the repository and update // the latest LSN. + modification.on_record_end(); + Ok(modification.len() > prev_len) } + /// This is the same as AdjustToFullTransactionId(xid) in PostgreSQL + fn adjust_to_full_transaction_id(&self, xid: TransactionId) -> Result { + let next_full_xid = + enum_pgversion_dispatch!(&self.checkpoint, CheckPoint, cp, { cp.nextXid.value }); + + let next_xid = (next_full_xid) as u32; + let mut epoch = (next_full_xid >> 32) as u32; + + if xid > next_xid { + // Wraparound occurred, must be from a prev epoch. + if epoch == 0 { + bail!("apparent XID wraparound with prepared transaction XID {xid}, nextXid is {next_full_xid}"); + } + epoch -= 1; + } + + Ok((epoch as u64) << 32 | xid as u64) + } + /// Do not store this block, but observe it for the purposes of updating our relation size state. 
async fn observe_decoded_block( &mut self, @@ -513,7 +643,7 @@ impl WalIngest { && blk.has_image && decoded.xl_rmid == pg_constants::RM_XLOG_ID && (decoded.xl_info == pg_constants::XLOG_FPI - || decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT) + || decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT) // compression of WAL is not yet supported: fall back to storing the original WAL record && !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, modification.tline.pg_version) // do not materialize null pages because them most likely be soon replaced with real data @@ -539,6 +669,7 @@ impl WalIngest { page_set_lsn(&mut image, lsn) } assert_eq!(image.len(), BLCKSZ as usize); + self.put_rel_page_image(modification, rel, blk.blkno, image.freeze(), ctx) .await?; } else { @@ -770,6 +901,73 @@ impl WalIngest { bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid); } } + 17 => { + if decoded.xl_rmid == pg_constants::RM_HEAP_ID { + let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; + + if info == pg_constants::XLOG_HEAP_INSERT { + let xlrec = v17::XlHeapInsert::decode(buf); + assert_eq!(0, buf.remaining()); + if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } else if info == pg_constants::XLOG_HEAP_DELETE { + let xlrec = v17::XlHeapDelete::decode(buf); + if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 { + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } else if info == pg_constants::XLOG_HEAP_UPDATE + || info == pg_constants::XLOG_HEAP_HOT_UPDATE + { + let xlrec = v17::XlHeapUpdate::decode(buf); + // the size of tuple data is inferred from the size of the record. + // we can't validate the remaining number of bytes without parsing + // the tuple data. 
+ if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 { + old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno); + } + if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 { + // PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a + // non-HOT update where the new tuple goes to different page than + // the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is + // set. + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } else if info == pg_constants::XLOG_HEAP_LOCK { + let xlrec = v17::XlHeapLock::decode(buf); + if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 { + old_heap_blkno = Some(decoded.blocks[0].blkno); + flags = pg_constants::VISIBILITYMAP_ALL_FROZEN; + } + } + } else if decoded.xl_rmid == pg_constants::RM_HEAP2_ID { + let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; + if info == pg_constants::XLOG_HEAP2_MULTI_INSERT { + let xlrec = v17::XlHeapMultiInsert::decode(buf); + + let offset_array_len = + if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 { + // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set + 0 + } else { + size_of::() * xlrec.ntuples as usize + }; + assert_eq!(offset_array_len, buf.remaining()); + + if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } else if info == pg_constants::XLOG_HEAP2_LOCK_UPDATED { + let xlrec = v17::XlHeapLockUpdated::decode(buf); + if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 { + old_heap_blkno = Some(decoded.blocks[0].blkno); + flags = pg_constants::VISIBILITYMAP_ALL_FROZEN; + } + } + } else { + bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid); + } + } _ => {} } @@ -878,26 +1076,26 @@ impl WalIngest { assert_eq!(decoded.xl_rmid, pg_constants::RM_NEON_ID); match pg_version { - 16 => { + 16 | 17 => { let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; match info { 
pg_constants::XLOG_NEON_HEAP_INSERT => { - let xlrec = v16::rm_neon::XlNeonHeapInsert::decode(buf); + let xlrec = v17::rm_neon::XlNeonHeapInsert::decode(buf); assert_eq!(0, buf.remaining()); if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { new_heap_blkno = Some(decoded.blocks[0].blkno); } } pg_constants::XLOG_NEON_HEAP_DELETE => { - let xlrec = v16::rm_neon::XlNeonHeapDelete::decode(buf); + let xlrec = v17::rm_neon::XlNeonHeapDelete::decode(buf); if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 { new_heap_blkno = Some(decoded.blocks[0].blkno); } } pg_constants::XLOG_NEON_HEAP_UPDATE | pg_constants::XLOG_NEON_HEAP_HOT_UPDATE => { - let xlrec = v16::rm_neon::XlNeonHeapUpdate::decode(buf); + let xlrec = v17::rm_neon::XlNeonHeapUpdate::decode(buf); // the size of tuple data is inferred from the size of the record. // we can't validate the remaining number of bytes without parsing // the tuple data. @@ -913,7 +1111,7 @@ impl WalIngest { } } pg_constants::XLOG_NEON_HEAP_MULTI_INSERT => { - let xlrec = v16::rm_neon::XlNeonHeapMultiInsert::decode(buf); + let xlrec = v17::rm_neon::XlNeonHeapMultiInsert::decode(buf); let offset_array_len = if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 { @@ -929,7 +1127,7 @@ impl WalIngest { } } pg_constants::XLOG_NEON_HEAP_LOCK => { - let xlrec = v16::rm_neon::XlNeonHeapLock::decode(buf); + let xlrec = v17::rm_neon::XlNeonHeapLock::decode(buf); if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 { old_heap_blkno = Some(decoded.blocks[0].blkno); flags = pg_constants::VISIBILITYMAP_ALL_FROZEN; @@ -1177,7 +1375,7 @@ impl WalIngest { if rec.blkno % pg_constants::SLOTS_PER_FSM_PAGE != 0 { // Tail of last remaining FSM page has to be zeroed. // We are not precise here and instead of digging in FSM bitmap format just clear the whole page. 
- modification.put_rel_page_image(rel, fsm_physical_page_no, ZERO_PAGE.clone())?; + modification.put_rel_page_image_zero(rel, fsm_physical_page_no)?; fsm_physical_page_no += 1; } let nblocks = get_relsize(modification, rel, ctx).await?; @@ -1199,7 +1397,7 @@ impl WalIngest { if rec.blkno % pg_constants::VM_HEAPBLOCKS_PER_PAGE != 0 { // Tail of last remaining vm page has to be zeroed. // We are not precise here and instead of digging in VM bitmap format just clear the whole page. - modification.put_rel_page_image(rel, vm_page_no, ZERO_PAGE.clone())?; + modification.put_rel_page_image_zero(rel, vm_page_no)?; vm_page_no += 1; } let nblocks = get_relsize(modification, rel, ctx).await?; @@ -1212,6 +1410,52 @@ impl WalIngest { Ok(()) } + fn warn_on_ingest_lag( + &mut self, + conf: &crate::config::PageServerConf, + wal_timestamp: TimestampTz, + ) { + debug_assert_current_span_has_tenant_and_timeline_id(); + let now = SystemTime::now(); + let rate_limits = &mut self.warn_ingest_lag; + + let ts = enum_pgversion_dispatch!(&self.checkpoint, CheckPoint, _cp, { + pgv::xlog_utils::try_from_pg_timestamp(wal_timestamp) + }); + + match ts { + Ok(ts) => { + match now.duration_since(ts) { + Ok(lag) => { + if lag > conf.wait_lsn_timeout { + rate_limits.lag_msg_ratelimit.call2(|rate_limit_stats| { + let lag = humantime::format_duration(lag); + warn!(%rate_limit_stats, %lag, "ingesting record with timestamp lagging more than wait_lsn_timeout"); + }) + } + } + Err(e) => { + let delta_t = e.duration(); + // determined by prod victoriametrics query: 1000 * (timestamp(node_time_seconds{neon_service="pageserver"}) - node_time_seconds) + // => https://www.robustperception.io/time-metric-from-the-node-exporter/ + const IGNORED_DRIFT: Duration = Duration::from_millis(100); + if delta_t > IGNORED_DRIFT { + let delta_t = humantime::format_duration(delta_t); + rate_limits.future_lsn_msg_ratelimit.call2(|rate_limit_stats| { + warn!(%rate_limit_stats, %delta_t, "ingesting record with timestamp from 
future"); + }) + } + } + }; + } + Err(error) => { + rate_limits.timestamp_invalid_msg_ratelimit.call2(|rate_limit_stats| { + warn!(%rate_limit_stats, %error, "ingesting record with invalid timestamp, cannot calculate lag and will fail find-lsn-for-timestamp type queries"); + }) + } + } + } + /// Subroutine of ingest_record(), to handle an XLOG_XACT_* records. /// async fn ingest_xact_record( @@ -1228,6 +1472,8 @@ impl WalIngest { let mut rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; let mut page_xids: Vec = vec![parsed.xid]; + self.warn_on_ingest_lag(modification.tline.conf, parsed.xact_time); + for subxact in &parsed.subxacts { let subxact_pageno = subxact / pg_constants::CLOG_XACTS_PER_PAGE; if subxact_pageno != pageno { @@ -1308,14 +1554,17 @@ impl WalIngest { // truncated, but a checkpoint record with the updated values isn't written until // later. In Neon, a server can start at any LSN, not just on a checkpoint record, // so we keep the oldestXid and oldestXidDB up-to-date. - self.checkpoint.oldestXid = xlrec.oldest_xid; - self.checkpoint.oldestXidDB = xlrec.oldest_xid_db; + enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, { + cp.oldestXid = xlrec.oldest_xid; + cp.oldestXidDB = xlrec.oldest_xid_db; + }); self.checkpoint_modified = true; // TODO Treat AdvanceOldestClogXid() or write a comment why we don't need it let latest_page_number = - self.checkpoint.nextXid.value as u32 / pg_constants::CLOG_XACTS_PER_PAGE; + enum_pgversion_dispatch!(self.checkpoint, CheckPoint, cp, { cp.nextXid.value }) as u32 + / pg_constants::CLOG_XACTS_PER_PAGE; // Now delete all segments containing pages between xlrec.pageno // and latest_page_number. @@ -1323,7 +1572,9 @@ impl WalIngest { // First, make an important safety check: // the current endpoint page must not be eligible for removal. 
// See SimpleLruTruncate() in slru.c - if clogpage_precedes(latest_page_number, xlrec.pageno) { + if dispatch_pgversion!(modification.tline.pg_version, { + pgv::nonrelfile_utils::clogpage_precedes(latest_page_number, xlrec.pageno) + }) { info!("could not truncate directory pg_xact apparent wraparound"); return Ok(()); } @@ -1340,7 +1591,12 @@ impl WalIngest { .await? { let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT; - if slru_may_delete_clogsegment(segpage, xlrec.pageno) { + + let may_delete = dispatch_pgversion!(modification.tline.pg_version, { + pgv::nonrelfile_utils::slru_may_delete_clogsegment(segpage, xlrec.pageno) + }); + + if may_delete { modification .drop_slru_segment(SlruKind::Clog, segno, ctx) .await?; @@ -1459,14 +1715,23 @@ impl WalIngest { xlrec: &XlMultiXactTruncate, ctx: &RequestContext, ) -> Result<()> { - self.checkpoint.oldestMulti = xlrec.end_trunc_off; - self.checkpoint.oldestMultiDB = xlrec.oldest_multi_db; + let (maxsegment, startsegment, endsegment) = + enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, { + cp.oldestMulti = xlrec.end_trunc_off; + cp.oldestMultiDB = xlrec.oldest_multi_db; + let maxsegment: i32 = pgv::nonrelfile_utils::mx_offset_to_member_segment( + pg_constants::MAX_MULTIXACT_OFFSET, + ); + let startsegment: i32 = + pgv::nonrelfile_utils::mx_offset_to_member_segment(xlrec.start_trunc_memb); + let endsegment: i32 = + pgv::nonrelfile_utils::mx_offset_to_member_segment(xlrec.end_trunc_memb); + (maxsegment, startsegment, endsegment) + }); + self.checkpoint_modified = true; // PerformMembersTruncation - let maxsegment: i32 = mx_offset_to_member_segment(pg_constants::MAX_MULTIXACT_OFFSET); - let startsegment: i32 = mx_offset_to_member_segment(xlrec.start_trunc_memb); - let endsegment: i32 = mx_offset_to_member_segment(xlrec.end_trunc_memb); let mut segment: i32 = startsegment; // Delete all the segments except the last one. 
The last segment can still @@ -1625,7 +1890,7 @@ impl WalIngest { continue; } - modification.put_rel_page_image(rel, gap_blknum, ZERO_PAGE.clone())?; + modification.put_rel_page_image_zero(rel, gap_blknum)?; } } Ok(()) @@ -1691,7 +1956,7 @@ impl WalIngest { // fill the gap with zeros for gap_blknum in old_nblocks..blknum { - modification.put_slru_page_image(kind, segno, gap_blknum, ZERO_PAGE.clone())?; + modification.put_slru_page_image_zero(kind, segno, gap_blknum)?; } } Ok(()) @@ -1740,11 +2005,23 @@ mod tests { // TODO } - static ZERO_CHECKPOINT: Bytes = Bytes::from_static(&[0u8; SIZEOF_CHECKPOINT]); + #[tokio::test] + async fn test_zeroed_checkpoint_decodes_correctly() -> Result<()> { + for i in 14..=16 { + dispatch_pgversion!(i, { + pgv::CheckPoint::decode(&pgv::ZERO_CHECKPOINT)?; + }); + } + + Ok(()) + } async fn init_walingest_test(tline: &Timeline, ctx: &RequestContext) -> Result { let mut m = tline.begin_modification(Lsn(0x10)); - m.put_checkpoint(ZERO_CHECKPOINT.clone())?; + m.put_checkpoint(dispatch_pgversion!( + tline.pg_version, + pgv::ZERO_CHECKPOINT.clone() + ))?; m.put_relmap_file(0, 111, Bytes::from(""), ctx).await?; // dummy relmapper file m.commit(ctx).await?; let walingest = WalIngest::new(tline, Lsn(0x10), ctx).await?; @@ -1765,21 +2042,25 @@ mod tests { walingest .put_rel_page_image(&mut m, TESTREL_A, 0, test_img("foo blk 0 at 2"), &ctx) .await?; + m.on_record_end(); m.commit(&ctx).await?; let mut m = tline.begin_modification(Lsn(0x30)); walingest .put_rel_page_image(&mut m, TESTREL_A, 0, test_img("foo blk 0 at 3"), &ctx) .await?; + m.on_record_end(); m.commit(&ctx).await?; let mut m = tline.begin_modification(Lsn(0x40)); walingest .put_rel_page_image(&mut m, TESTREL_A, 1, test_img("foo blk 1 at 4"), &ctx) .await?; + m.on_record_end(); m.commit(&ctx).await?; let mut m = tline.begin_modification(Lsn(0x50)); walingest .put_rel_page_image(&mut m, TESTREL_A, 2, test_img("foo blk 2 at 5"), &ctx) .await?; + m.on_record_end(); m.commit(&ctx).await?; 
assert_current_logical_size(&tline, Lsn(0x50)); @@ -1921,6 +2202,7 @@ mod tests { walingest .put_rel_page_image(&mut m, TESTREL_A, 1, test_img("foo blk 1"), &ctx) .await?; + m.on_record_end(); m.commit(&ctx).await?; assert_eq!( tline @@ -1946,6 +2228,7 @@ mod tests { walingest .put_rel_page_image(&mut m, TESTREL_A, 1500, test_img("foo blk 1500"), &ctx) .await?; + m.on_record_end(); m.commit(&ctx).await?; assert_eq!( tline @@ -2303,6 +2586,9 @@ mod tests { let _endpoint = Lsn::from_hex("1FFFF98").unwrap(); let harness = TenantHarness::create("test_ingest_real_wal").await.unwrap(); + let span = harness + .span() + .in_scope(|| info_span!("timeline_span", timeline_id=%TIMELINE_ID)); let (tenant, ctx) = harness.load().await; let remote_initdb_path = @@ -2344,7 +2630,6 @@ mod tests { .await .unwrap(); let mut modification = tline.begin_modification(startpoint); - let mut decoded = DecodedWALRecord::default(); println!("decoding {} bytes", bytes.len() - xlogoff); // Decode and ingest wal. We process the wal in chunks because @@ -2352,8 +2637,11 @@ mod tests { for chunk in bytes[xlogoff..].chunks(50) { decoder.feed_bytes(chunk); while let Some((lsn, recdata)) = decoder.poll_decode().unwrap() { + let mut decoded = DecodedWALRecord::default(); + decode_wal_record(recdata, &mut decoded, modification.tline.pg_version).unwrap(); walingest - .ingest_record(recdata, lsn, &mut modification, &mut decoded, &ctx) + .ingest_record(decoded, lsn, &mut modification, &ctx) + .instrument(span.clone()) .await .unwrap(); } diff --git a/pageserver/src/walrecord.rs b/pageserver/src/walrecord.rs index edddcefbe1..dd199e2c55 100644 --- a/pageserver/src/walrecord.rs +++ b/pageserver/src/walrecord.rs @@ -160,6 +160,31 @@ pub struct DecodedWALRecord { pub origin_id: u16, } +impl DecodedWALRecord { + /// Check if this WAL record represents a legacy "copy" database creation, which populates new relations + /// by reading other existing relations' data blocks. 
This is more complex to apply than new-style database + /// creations which simply include all the desired blocks in the WAL, so we need a helper function to detect this case. + pub(crate) fn is_dbase_create_copy(&self, pg_version: u32) -> bool { + if self.xl_rmid == pg_constants::RM_DBASE_ID { + let info = self.xl_info & pg_constants::XLR_RMGR_INFO_MASK; + match pg_version { + 14 => { + // Postgres 14 database creations are always the legacy kind + info == postgres_ffi::v14::bindings::XLOG_DBASE_CREATE + } + 15 => info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY, + 16 => info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY, + 17 => info == postgres_ffi::v17::bindings::XLOG_DBASE_CREATE_FILE_COPY, + _ => { + panic!("Unsupported postgres version {pg_version}") + } + } + } else { + false + } + } +} + #[repr(C)] #[derive(Debug, Clone, Copy)] pub struct RelFileNode { @@ -317,16 +342,47 @@ pub mod v14 { } } } + + #[repr(C)] + #[derive(Debug)] + pub struct XlParameterChange { + pub max_connections: i32, + pub max_worker_processes: i32, + pub max_wal_senders: i32, + pub max_prepared_xacts: i32, + pub max_locks_per_xact: i32, + pub wal_level: i32, + pub wal_log_hints: bool, + pub track_commit_timestamp: bool, + pub _padding: [u8; 2], + } + + impl XlParameterChange { + pub fn decode(buf: &mut Bytes) -> XlParameterChange { + XlParameterChange { + max_connections: buf.get_i32_le(), + max_worker_processes: buf.get_i32_le(), + max_wal_senders: buf.get_i32_le(), + max_prepared_xacts: buf.get_i32_le(), + max_locks_per_xact: buf.get_i32_le(), + wal_level: buf.get_i32_le(), + wal_log_hints: buf.get_u8() != 0, + track_commit_timestamp: buf.get_u8() != 0, + _padding: [buf.get_u8(), buf.get_u8()], + } + } + } } pub mod v15 { pub use super::v14::{ XlHeapDelete, XlHeapInsert, XlHeapLock, XlHeapLockUpdated, XlHeapMultiInsert, XlHeapUpdate, + XlParameterChange, }; } pub mod v16 { - pub use super::v14::{XlHeapInsert, XlHeapLockUpdated, XlHeapMultiInsert}; + pub 
use super::v14::{XlHeapInsert, XlHeapLockUpdated, XlHeapMultiInsert, XlParameterChange}; use bytes::{Buf, Bytes}; use postgres_ffi::{OffsetNumber, TransactionId}; @@ -505,6 +561,37 @@ pub mod v16 { } } +pub mod v17 { + pub use super::v14::XlHeapLockUpdated; + use bytes::{Buf, Bytes}; + pub use postgres_ffi::{TimeLineID, TimestampTz}; + + pub use super::v16::rm_neon; + pub use super::v16::{ + XlHeapDelete, XlHeapInsert, XlHeapLock, XlHeapMultiInsert, XlHeapUpdate, XlParameterChange, + }; + + #[repr(C)] + #[derive(Debug)] + pub struct XlEndOfRecovery { + pub end_time: TimestampTz, + pub this_time_line_id: TimeLineID, + pub prev_time_line_id: TimeLineID, + pub wal_level: i32, + } + + impl XlEndOfRecovery { + pub fn decode(buf: &mut Bytes) -> XlEndOfRecovery { + XlEndOfRecovery { + end_time: buf.get_i64_le(), + this_time_line_id: buf.get_u32_le(), + prev_time_line_id: buf.get_u32_le(), + wal_level: buf.get_i32_le(), + } + } + } +} + #[repr(C)] #[derive(Debug)] pub struct XlSmgrCreate { @@ -722,9 +809,13 @@ pub struct XlClogTruncate { } impl XlClogTruncate { - pub fn decode(buf: &mut Bytes) -> XlClogTruncate { + pub fn decode(buf: &mut Bytes, pg_version: u32) -> XlClogTruncate { XlClogTruncate { - pageno: buf.get_u32_le(), + pageno: if pg_version < 17 { + buf.get_u32_le() + } else { + buf.get_u64_le() as u32 + }, oldest_xid: buf.get_u32_le(), oldest_xid_db: buf.get_u32_le(), } diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 82585f9ed8..0fe7def8b0 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -35,6 +35,7 @@ use anyhow::Context; use bytes::{Bytes, BytesMut}; use pageserver_api::models::{WalRedoManagerProcessStatus, WalRedoManagerStatus}; use pageserver_api::shard::TenantShardId; +use std::future::Future; use std::sync::Arc; use std::time::Duration; use std::time::Instant; @@ -43,13 +44,12 @@ use utils::lsn::Lsn; use utils::sync::gate::GateError; use utils::sync::heavier_once_cell; +/// The real implementation that uses 
a Postgres process to +/// perform WAL replay. /// -/// This is the real implementation that uses a Postgres process to -/// perform WAL replay. Only one thread can use the process at a time, -/// that is controlled by the Mutex. In the future, we might want to -/// launch a pool of processes to allow concurrent replay of multiple -/// records. -/// +/// Only one thread can use the process at a time, that is controlled by the +/// Mutex. In the future, we might want to launch a pool of processes to allow +/// concurrent replay of multiple records. pub struct PostgresRedoManager { tenant_shard_id: TenantShardId, conf: &'static PageServerConf, @@ -297,6 +297,97 @@ impl PostgresRedoManager { } } + async fn do_with_walredo_process< + F: FnOnce(Arc) -> Fut, + Fut: Future>, + O, + >( + &self, + pg_version: u32, + closure: F, + ) -> Result { + let proc: Arc = match self.redo_process.get_or_init_detached().await { + Ok(guard) => match &*guard { + ProcessOnceCell::Spawned(proc) => Arc::clone(proc), + ProcessOnceCell::ManagerShutDown => { + return Err(Error::Cancelled); + } + }, + Err(permit) => { + let start = Instant::now(); + // acquire guard before spawning process, so that we don't spawn new processes + // if the gate is already closed. 
+ let _launched_processes_guard = match self.launched_processes.enter() { + Ok(guard) => guard, + Err(GateError::GateClosed) => unreachable!( + "shutdown sets the once cell to `ManagerShutDown` state before closing the gate" + ), + }; + let proc = Arc::new(Process { + process: process::WalRedoProcess::launch( + self.conf, + self.tenant_shard_id, + pg_version, + ) + .context("launch walredo process")?, + _launched_processes_guard, + }); + let duration = start.elapsed(); + WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64()); + info!( + elapsed_ms = duration.as_millis(), + pid = proc.id(), + "launched walredo process" + ); + self.redo_process + .set(ProcessOnceCell::Spawned(Arc::clone(&proc)), permit); + proc + } + }; + + // async closures are unstable, would support &Process + let result = closure(proc.clone()).await; + + if result.is_err() { + // Avoid concurrent callers hitting the same issue by taking `proc` out of the rotation. + // Note that there may be other tasks concurrent with us that also hold `proc`. + // We have to deal with that here. + // Also read the doc comment on field `self.redo_process`. + // + // NB: there may still be other concurrent threads using `proc`. + // The last one will send SIGKILL when the underlying Arc reaches refcount 0. + // + // NB: the drop impl blocks the dropping thread with a wait() system call for + // the child process. In some ways the blocking is actually good: if we + // deferred the waiting into the background / to tokio if we used `tokio::process`, + // it could happen that if walredo always fails immediately, we spawn processes faster + // than we can SIGKILL & `wait` for them to exit. By doing it the way we do here, + // we limit this risk of run-away to at most $num_runtimes * $num_executor_threads. + // This probably needs revisiting at some later point. 
+ match self.redo_process.get() { + None => (), + Some(guard) => { + match &*guard { + ProcessOnceCell::ManagerShutDown => {} + ProcessOnceCell::Spawned(guard_proc) => { + if Arc::ptr_eq(&proc, guard_proc) { + // We're the first to observe an error from `proc`, it's our job to take it out of rotation. + guard.take_and_deinit(); + } else { + // Another task already spawned another redo process (further up in this method) + // and put it into `redo_process`. Do nothing, our view of the world is behind. + } + } + } + } + } + // The last task that does this `drop()` of `proc` will do a blocking `wait()` syscall. + drop(proc); + } + + result + } + /// /// Process one request for WAL redo using wal-redo postgres /// @@ -320,130 +411,63 @@ impl PostgresRedoManager { const MAX_RETRY_ATTEMPTS: u32 = 1; let mut n_attempts = 0u32; loop { - let proc: Arc = match self.redo_process.get_or_init_detached().await { - Ok(guard) => match &*guard { - ProcessOnceCell::Spawned(proc) => Arc::clone(proc), - ProcessOnceCell::ManagerShutDown => { - return Err(Error::Cancelled); - } - }, - Err(permit) => { - let start = Instant::now(); - // acquire guard before spawning process, so that we don't spawn new processes - // if the gate is already closed. 
- let _launched_processes_guard = match self.launched_processes.enter() { - Ok(guard) => guard, - Err(GateError::GateClosed) => unreachable!( - "shutdown sets the once cell to `ManagerShutDown` state before closing the gate" - ), - }; - let proc = Arc::new(Process { - process: process::WalRedoProcess::launch( - self.conf, - self.tenant_shard_id, - pg_version, - ) - .context("launch walredo process")?, - _launched_processes_guard, - }); - let duration = start.elapsed(); - WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64()); - info!( - duration_ms = duration.as_millis(), - pid = proc.id(), - "launched walredo process" - ); - self.redo_process - .set(ProcessOnceCell::Spawned(Arc::clone(&proc)), permit); - proc - } - }; + let base_img = &base_img; + let closure = |proc: Arc| async move { + let started_at = std::time::Instant::now(); - let started_at = std::time::Instant::now(); + // Relational WAL records are applied using wal-redo-postgres + let result = proc + .apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout) + .await + .context("apply_wal_records"); - // Relational WAL records are applied using wal-redo-postgres - let result = proc - .apply_wal_records(rel, blknum, &base_img, records, wal_redo_timeout) - .await - .context("apply_wal_records"); + let duration = started_at.elapsed(); - let duration = started_at.elapsed(); - - let len = records.len(); - let nbytes = records.iter().fold(0, |acumulator, record| { - acumulator - + match &record.1 { - NeonWalRecord::Postgres { rec, .. 
} => rec.len(), - _ => unreachable!("Only PostgreSQL records are accepted in this batch"), - } - }); - - WAL_REDO_TIME.observe(duration.as_secs_f64()); - WAL_REDO_RECORDS_HISTOGRAM.observe(len as f64); - WAL_REDO_BYTES_HISTOGRAM.observe(nbytes as f64); - - debug!( - "postgres applied {} WAL records ({} bytes) in {} us to reconstruct page image at LSN {}", - len, - nbytes, - duration.as_micros(), - lsn - ); - - // If something went wrong, don't try to reuse the process. Kill it, and - // next request will launch a new one. - if let Err(e) = result.as_ref() { - error!( - "error applying {} WAL records {}..{} ({} bytes) to key {key}, from base image with LSN {} to reconstruct page image at LSN {} n_attempts={}: {:?}", - records.len(), - records.first().map(|p| p.0).unwrap_or(Lsn(0)), - records.last().map(|p| p.0).unwrap_or(Lsn(0)), - nbytes, - base_img_lsn, - lsn, - n_attempts, - e, - ); - // Avoid concurrent callers hitting the same issue by taking `proc` out of the rotation. - // Note that there may be other tasks concurrent with us that also hold `proc`. - // We have to deal with that here. - // Also read the doc comment on field `self.redo_process`. - // - // NB: there may still be other concurrent threads using `proc`. - // The last one will send SIGKILL when the underlying Arc reaches refcount 0. - // - // NB: the drop impl blocks the dropping thread with a wait() system call for - // the child process. In some ways the blocking is actually good: if we - // deferred the waiting into the background / to tokio if we used `tokio::process`, - // it could happen that if walredo always fails immediately, we spawn processes faster - // than we can SIGKILL & `wait` for them to exit. By doing it the way we do here, - // we limit this risk of run-away to at most $num_runtimes * $num_executor_threads. - // This probably needs revisiting at some later point. 
- match self.redo_process.get() { - None => (), - Some(guard) => { - match &*guard { - ProcessOnceCell::ManagerShutDown => {} - ProcessOnceCell::Spawned(guard_proc) => { - if Arc::ptr_eq(&proc, guard_proc) { - // We're the first to observe an error from `proc`, it's our job to take it out of rotation. - guard.take_and_deinit(); - } else { - // Another task already spawned another redo process (further up in this method) - // and put it into `redo_process`. Do nothing, our view of the world is behind. - } - } + let len = records.len(); + let nbytes = records.iter().fold(0, |acumulator, record| { + acumulator + + match &record.1 { + NeonWalRecord::Postgres { rec, .. } => rec.len(), + _ => unreachable!("Only PostgreSQL records are accepted in this batch"), } - } + }); + + WAL_REDO_TIME.observe(duration.as_secs_f64()); + WAL_REDO_RECORDS_HISTOGRAM.observe(len as f64); + WAL_REDO_BYTES_HISTOGRAM.observe(nbytes as f64); + + debug!( + "postgres applied {} WAL records ({} bytes) in {} us to reconstruct page image at LSN {}", + len, + nbytes, + duration.as_micros(), + lsn + ); + + if let Err(e) = result.as_ref() { + error!( + "error applying {} WAL records {}..{} ({} bytes) to key {key}, from base image with LSN {} to reconstruct page image at LSN {} n_attempts={}: {:?}", + records.len(), + records.first().map(|p| p.0).unwrap_or(Lsn(0)), + records.last().map(|p| p.0).unwrap_or(Lsn(0)), + nbytes, + base_img_lsn, + lsn, + n_attempts, + e, + ); } - // The last task that does this `drop()` of `proc` will do a blocking `wait()` syscall. 
- drop(proc); - } else if n_attempts != 0 { + + result.map_err(Error::Other) + }; + let result = self.do_with_walredo_process(pg_version, closure).await; + + if result.is_ok() && n_attempts != 0 { info!(n_attempts, "retried walredo succeeded"); } n_attempts += 1; if n_attempts > MAX_RETRY_ATTEMPTS || result.is_ok() { - return result.map_err(Error::Other); + return result; } } } diff --git a/patches/pg_hintplan.patch b/patches/pg_hint_plan.patch similarity index 55% rename from patches/pg_hintplan.patch rename to patches/pg_hint_plan.patch index 61a5ecbb90..4039a036df 100644 --- a/patches/pg_hintplan.patch +++ b/patches/pg_hint_plan.patch @@ -1,13 +1,7 @@ -commit f7925d4d1406c0f0229e3c691c94b69e381899b1 (HEAD -> master) -Author: Alexey Masterov -Date: Thu Jun 6 08:02:42 2024 +0000 - - Patch expected files to consider Neon's log messages - -diff --git a/ext-src/pg_hint_plan-src/expected/ut-A.out b/ext-src/pg_hint_plan-src/expected/ut-A.out -index da723b8..f8d0102 100644 ---- a/ext-src/pg_hint_plan-src/expected/ut-A.out -+++ b/ext-src/pg_hint_plan-src/expected/ut-A.out +diff --git a/expected/ut-A.out b/expected/ut-A.out +index da723b8..5328114 100644 +--- a/expected/ut-A.out ++++ b/expected/ut-A.out @@ -9,13 +9,16 @@ SET search_path TO public; ---- -- No.A-1-1-3 @@ -25,10 +19,18 @@ index da723b8..f8d0102 100644 DROP SCHEMA other_schema; ---- ---- No. 
A-5-1 comment pattern -diff --git a/ext-src/pg_hint_plan-src/expected/ut-fdw.out b/ext-src/pg_hint_plan-src/expected/ut-fdw.out +@@ -3175,6 +3178,7 @@ SELECT s.query, s.calls + FROM public.pg_stat_statements s + JOIN pg_catalog.pg_database d + ON (s.dbid = d.oid) ++ WHERE s.query LIKE 'SELECT * FROM s1.t1%' OR s.query LIKE '%pg_stat_statements_reset%' + ORDER BY 1; + query | calls + --------------------------------------+------- +diff --git a/expected/ut-fdw.out b/expected/ut-fdw.out index d372459..6282afe 100644 ---- a/ext-src/pg_hint_plan-src/expected/ut-fdw.out -+++ b/ext-src/pg_hint_plan-src/expected/ut-fdw.out +--- a/expected/ut-fdw.out ++++ b/expected/ut-fdw.out @@ -7,6 +7,7 @@ SET pg_hint_plan.debug_print TO on; SET client_min_messages TO LOG; SET pg_hint_plan.enable_hint TO on; @@ -37,3 +39,15 @@ index d372459..6282afe 100644 CREATE SERVER file_server FOREIGN DATA WRAPPER file_fdw; CREATE USER MAPPING FOR PUBLIC SERVER file_server; CREATE FOREIGN TABLE ft1 (id int, val int) SERVER file_server OPTIONS (format 'csv', filename :'filename'); +diff --git a/sql/ut-A.sql b/sql/ut-A.sql +index 7c7d58a..4fd1a07 100644 +--- a/sql/ut-A.sql ++++ b/sql/ut-A.sql +@@ -963,6 +963,7 @@ SELECT s.query, s.calls + FROM public.pg_stat_statements s + JOIN pg_catalog.pg_database d + ON (s.dbid = d.oid) ++ WHERE s.query LIKE 'SELECT * FROM s1.t1%' OR s.query LIKE '%pg_stat_statements_reset%' + ORDER BY 1; + + ---- diff --git a/pgxn/neon/bitmap.h b/pgxn/neon/bitmap.h new file mode 100644 index 0000000000..0a131816ef --- /dev/null +++ b/pgxn/neon/bitmap.h @@ -0,0 +1,12 @@ +#ifndef NEON_BITMAP_H +#define NEON_BITMAP_H + +/* + * Utilities for manipulating bits8* as bitmaps. 
+ */ + +#define BITMAP_ISSET(bm, bit) ((bm)[(bit) >> 3] & (1 << ((bit) & 7))) +#define BITMAP_SET(bm, bit) (bm)[(bit) >> 3] |= (1 << ((bit) & 7)) +#define BITMAP_CLR(bm, bit) (bm)[(bit) >> 3] &= ~(1 << ((bit) & 7)) + +#endif //NEON_BITMAP_H diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index 479209a537..ab6739465b 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -27,6 +27,7 @@ #include "pagestore_client.h" #include "common/hashfn.h" #include "pgstat.h" +#include "port/pg_iovec.h" #include "postmaster/bgworker.h" #include RELFILEINFO_HDR #include "storage/buf_internals.h" @@ -40,6 +41,7 @@ #include "utils/guc.h" #include "hll.h" +#include "bitmap.h" #define CriticalAssert(cond) do if (!(cond)) elog(PANIC, "Assertion %s failed at %s:%d: ", #cond, __FILE__, __LINE__); while (0) @@ -469,6 +471,99 @@ lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno) return found; } +/* + * Check if page is present in the cache. + * Returns true if page is found in local cache. 
+ */ +int +lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, + int nblocks, bits8 *bitmap) +{ + BufferTag tag; + FileCacheEntry *entry; + uint32 chunk_offs; + int found = 0; + uint32 hash; + int i = 0; + + if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */ + return 0; + + CopyNRelFileInfoToBufTag(tag, rinfo); + tag.forkNum = forkNum; + + CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); + + tag.blockNum = (blkno + i) & ~(BLOCKS_PER_CHUNK - 1); + hash = get_hash_value(lfc_hash, &tag); + chunk_offs = (blkno + i) & (BLOCKS_PER_CHUNK - 1); + + LWLockAcquire(lfc_lock, LW_SHARED); + + while (true) + { + int this_chunk = Min(nblocks, BLOCKS_PER_CHUNK - chunk_offs); + if (LFC_ENABLED()) + { + entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL); + + if (entry != NULL) + { + for (; chunk_offs < BLOCKS_PER_CHUNK && i < nblocks; chunk_offs++, i++) + { + if ((entry->bitmap[chunk_offs >> 5] & + (1 << (chunk_offs & 31))) != 0) + { + BITMAP_SET(bitmap, i); + found++; + } + } + } + else + { + i += this_chunk; + } + } + else + { + return found; + } + + /* + * Break out of the iteration before doing expensive stuff for + * a next iteration + */ + if (i + 1 >= nblocks) + break; + + /* + * Prepare for the next iteration. We don't unlock here, as that'd + * probably be more expensive than the gains it'd get us. + */ + tag.blockNum = (blkno + i) & ~(BLOCKS_PER_CHUNK - 1); + hash = get_hash_value(lfc_hash, &tag); + chunk_offs = (blkno + i) & (BLOCKS_PER_CHUNK - 1); + } + + LWLockRelease(lfc_lock); + +#if USE_ASSERT_CHECKING + do { + int count = 0; + + for (int j = 0; j < nblocks; j++) + { + if (BITMAP_ISSET(bitmap, j)) + count++; + } + + Assert(count == found); + } while (false); +#endif + + return found; +} + /* * Evict a page (if present) from the local file cache */ @@ -548,91 +643,171 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno) } /* - * Try to read page from local cache. 
- * Returns true if page is found in local cache. - * In case of error local file cache is disabled (lfc->limit is set to zero). + * Try to read pages from local cache. + * Returns the number of pages read from the local cache, and sets bits in + * 'read' for the pages which were read. This may scribble over buffers not + * marked in 'read', so be careful with operation ordering. + * + * In case of error local file cache is disabled (lfc->limit is set to zero), + * and -1 is returned. Note that 'read' and the buffers may be touched and in + * an otherwise invalid state. + * + * If the mask argument is supplied, bits will be set at the offsets of pages + * that were present and read from the LFC. */ -bool -lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - char *buffer) +int +lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, + void **buffers, BlockNumber nblocks, bits8 *mask) { BufferTag tag; FileCacheEntry *entry; ssize_t rc; - int chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1); bool result = true; uint32 hash; uint64 generation; uint32 entry_offset; + int blocks_read = 0; + int buf_offset = 0; if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */ - return false; + return 0; if (!lfc_ensure_opened()) - return false; + return 0; CopyNRelFileInfoToBufTag(tag, rinfo); tag.forkNum = forkNum; - tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1); CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); - hash = get_hash_value(lfc_hash, &tag); - LWLockAcquire(lfc_lock, LW_EXCLUSIVE); - - if (!LFC_ENABLED()) + /* + * For every chunk that has blocks we're interested in, we + * 1. get the chunk header + * 2. Check if the chunk actually has the blocks we're interested in + * 3. Read the blocks we're looking for (in one preadv), assuming they exist + * 4. Update the statistics for the read call. + * + * If there is an error, we do an early return. 
+ */ + while (nblocks > 0) { + struct iovec iov[PG_IOV_MAX]; + int chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1); + int blocks_in_chunk = Min(nblocks, BLOCKS_PER_CHUNK - (blkno % BLOCKS_PER_CHUNK)); + int iteration_hits = 0; + int iteration_misses = 0; + Assert(blocks_in_chunk > 0); + + for (int i = 0; i < blocks_in_chunk; i++) + { + iov[i].iov_base = buffers[buf_offset + i]; + iov[i].iov_len = BLCKSZ; + } + + tag.blockNum = blkno - chunk_offs; + hash = get_hash_value(lfc_hash, &tag); + + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + + /* We can return the blocks we've read before LFC got disabled; + * assuming we read any. */ + if (!LFC_ENABLED()) + { + LWLockRelease(lfc_lock); + return blocks_read; + } + + entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL); + + /* Approximate working set for the blocks assumed in this entry */ + for (int i = 0; i < blocks_in_chunk; i++) + { + tag.blockNum = blkno + i; + addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag))); + } + + if (entry == NULL) + { + /* Pages are not cached */ + lfc_ctl->misses += blocks_in_chunk; + pgBufferUsage.file_cache.misses += blocks_in_chunk; + LWLockRelease(lfc_lock); + + buf_offset += blocks_in_chunk; + nblocks -= blocks_in_chunk; + blkno += blocks_in_chunk; + + continue; + } + + /* Unlink entry from LRU list to pin it for the duration of IO operation */ + if (entry->access_count++ == 0) + dlist_delete(&entry->list_node); + + generation = lfc_ctl->generation; + entry_offset = entry->offset; + LWLockRelease(lfc_lock); - return false; - } - entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL); + for (int i = 0; i < blocks_in_chunk; i++) + { + /* + * If the page is valid, we consider it "read". 
+ * All other pages will be fetched separately by the next cache + */ + if (entry->bitmap[(chunk_offs + i) / 32] & (1 << ((chunk_offs + i) % 32))) + { + BITMAP_SET(mask, buf_offset + i); + iteration_hits++; + } + else + iteration_misses++; + } - /* Approximate working set */ - tag.blockNum = blkno; - addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag))); + Assert(iteration_hits + iteration_misses > 0); + + if (iteration_hits != 0) + { + rc = preadv(lfc_desc, iov, blocks_in_chunk, + ((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ); + + if (rc != (BLCKSZ * blocks_in_chunk)) + { + lfc_disable("read"); + return -1; + } + } + + /* Place entry to the head of LRU list */ + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + + if (lfc_ctl->generation == generation) + { + CriticalAssert(LFC_ENABLED()); + lfc_ctl->hits += iteration_hits; + lfc_ctl->misses += iteration_misses; + pgBufferUsage.file_cache.hits += iteration_hits; + pgBufferUsage.file_cache.misses += iteration_misses; + CriticalAssert(entry->access_count > 0); + if (--entry->access_count == 0) + dlist_push_tail(&lfc_ctl->lru, &entry->list_node); + } + else + { + /* generation mismatch, assume error condition */ + LWLockRelease(lfc_lock); + return -1; + } - if (entry == NULL || (entry->bitmap[chunk_offs >> 5] & (1 << (chunk_offs & 31))) == 0) - { - /* Page is not cached */ - lfc_ctl->misses += 1; - pgBufferUsage.file_cache.misses += 1; LWLockRelease(lfc_lock); - return false; - } - /* Unlink entry from LRU list to pin it for the duration of IO operation */ - if (entry->access_count++ == 0) - dlist_delete(&entry->list_node); - generation = lfc_ctl->generation; - entry_offset = entry->offset; - LWLockRelease(lfc_lock); - - rc = pread(lfc_desc, buffer, BLCKSZ, ((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ); - if (rc != BLCKSZ) - { - lfc_disable("read"); - return false; + buf_offset += blocks_in_chunk; + nblocks -= blocks_in_chunk; + blkno += blocks_in_chunk; + 
blocks_read += iteration_hits; } - /* Place entry to the head of LRU list */ - LWLockAcquire(lfc_lock, LW_EXCLUSIVE); - - if (lfc_ctl->generation == generation) - { - CriticalAssert(LFC_ENABLED()); - lfc_ctl->hits += 1; - pgBufferUsage.file_cache.hits += 1; - CriticalAssert(entry->access_count > 0); - if (--entry->access_count == 0) - dlist_push_tail(&lfc_ctl->lru, &entry->list_node); - } - else - result = false; - - LWLockRelease(lfc_lock); - - return result; + return blocks_read; } /* @@ -640,20 +815,17 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, * If cache is full then evict some other page. */ void -#if PG_MAJORVERSION_NUM < 16 -lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, char *buffer) -#else -lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void *buffer) -#endif +lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, + const void *const *buffers, BlockNumber nblocks) { BufferTag tag; FileCacheEntry *entry; ssize_t rc; bool found; - int chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1); uint32 hash; uint64 generation; uint32 entry_offset; + int buf_offset = 0; if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */ return; @@ -661,110 +833,142 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void if (!lfc_ensure_opened()) return; - tag.forkNum = forkNum; - tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1); CopyNRelFileInfoToBufTag(tag, rinfo); + tag.forkNum = forkNum; CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); - hash = get_hash_value(lfc_hash, &tag); - LWLockAcquire(lfc_lock, LW_EXCLUSIVE); - - if (!LFC_ENABLED()) + /* + * For every chunk that has blocks we're interested in, we + * 1. get the chunk header + * 2. Check if the chunk actually has the blocks we're interested in + * 3. Read the blocks we're looking for (in one preadv), assuming they exist + * 4. Update the statistics for the read call. 
+ * + * If there is an error, we do an early return. + */ + while (nblocks > 0) { - LWLockRelease(lfc_lock); - return; - } + struct iovec iov[PG_IOV_MAX]; + int chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1); + int blocks_in_chunk = Min(nblocks, BLOCKS_PER_CHUNK - (blkno % BLOCKS_PER_CHUNK)); + Assert(blocks_in_chunk > 0); - entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_ENTER, &found); - - if (found) - { - /* - * Unlink entry from LRU list to pin it for the duration of IO - * operation - */ - if (entry->access_count++ == 0) - dlist_delete(&entry->list_node); - } - else - { - /* - * We have two choices if all cache pages are pinned (i.e. used in IO - * operations): - * - * 1) Wait until some of this operation is completed and pages is - * unpinned. - * - * 2) Allocate one more chunk, so that specified cache size is more - * recommendation than hard limit. - * - * As far as probability of such event (that all pages are pinned) is - * considered to be very very small: there are should be very large - * number of concurrent IO operations and them are limited by - * max_connections, we prefer not to complicate code and use second - * approach. 
- */ - if (lfc_ctl->used >= lfc_ctl->limit && !dlist_is_empty(&lfc_ctl->lru)) + for (int i = 0; i < blocks_in_chunk; i++) { - /* Cache overflow: evict least recently used chunk */ - FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->lru)); - - CriticalAssert(victim->access_count == 0); - entry->offset = victim->offset; /* grab victim's chunk */ - hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL); - neon_log(DEBUG2, "Swap file cache page"); + iov[i].iov_base = unconstify(void *, buffers[buf_offset + i]); + iov[i].iov_len = BLCKSZ; } - else if (!dlist_is_empty(&lfc_ctl->holes)) + + tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1); + hash = get_hash_value(lfc_hash, &tag); + + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + + if (!LFC_ENABLED()) { - /* We can reuse a hole that was left behind when the LFC was shrunk previously */ - FileCacheEntry *hole = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->holes)); - uint32 offset = hole->offset; - bool found; + LWLockRelease(lfc_lock); + return; + } - hash_search_with_hash_value(lfc_hash, &hole->key, hole->hash, HASH_REMOVE, &found); - CriticalAssert(found); + entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_ENTER, &found); - lfc_ctl->used += 1; - entry->offset = offset; /* reuse the hole */ + if (found) + { + /* + * Unlink entry from LRU list to pin it for the duration of IO + * operation + */ + if (entry->access_count++ == 0) + dlist_delete(&entry->list_node); } else { - lfc_ctl->used += 1; - entry->offset = lfc_ctl->size++; /* allocate new chunk at end - * of file */ - } - entry->access_count = 1; - entry->hash = hash; - memset(entry->bitmap, 0, sizeof entry->bitmap); - } - - generation = lfc_ctl->generation; - entry_offset = entry->offset; - lfc_ctl->writes += 1; - LWLockRelease(lfc_lock); - - rc = pwrite(lfc_desc, buffer, BLCKSZ, ((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ); - if (rc != 
BLCKSZ) - { - lfc_disable("write"); - } - else - { - LWLockAcquire(lfc_lock, LW_EXCLUSIVE); - - if (lfc_ctl->generation == generation) - { - CriticalAssert(LFC_ENABLED()); - /* Place entry to the head of LRU list */ - CriticalAssert(entry->access_count > 0); - if (--entry->access_count == 0) - dlist_push_tail(&lfc_ctl->lru, &entry->list_node); - - entry->bitmap[chunk_offs >> 5] |= (1 << (chunk_offs & 31)); + /* + * We have two choices if all cache pages are pinned (i.e. used in IO + * operations): + * + * 1) Wait until some of this operation is completed and pages is + * unpinned. + * + * 2) Allocate one more chunk, so that specified cache size is more + * recommendation than hard limit. + * + * As far as probability of such event (that all pages are pinned) is + * considered to be very very small: there are should be very large + * number of concurrent IO operations and them are limited by + * max_connections, we prefer not to complicate code and use second + * approach. + */ + if (lfc_ctl->used >= lfc_ctl->limit && !dlist_is_empty(&lfc_ctl->lru)) + { + /* Cache overflow: evict least recently used chunk */ + FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->lru)); + + CriticalAssert(victim->access_count == 0); + entry->offset = victim->offset; /* grab victim's chunk */ + hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL); + neon_log(DEBUG2, "Swap file cache page"); + } + else if (!dlist_is_empty(&lfc_ctl->holes)) + { + /* We can reuse a hole that was left behind when the LFC was shrunk previously */ + FileCacheEntry *hole = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->holes)); + uint32 offset = hole->offset; + bool found; + + hash_search_with_hash_value(lfc_hash, &hole->key, hole->hash, HASH_REMOVE, &found); + CriticalAssert(found); + + lfc_ctl->used += 1; + entry->offset = offset; /* reuse the hole */ + } + else + { + lfc_ctl->used += 1; + entry->offset = 
lfc_ctl->size++; /* allocate new chunk at end + * of file */ + } + entry->access_count = 1; + entry->hash = hash; + memset(entry->bitmap, 0, sizeof entry->bitmap); } + generation = lfc_ctl->generation; + entry_offset = entry->offset; + lfc_ctl->writes += blocks_in_chunk; LWLockRelease(lfc_lock); + + rc = pwritev(lfc_desc, iov, blocks_in_chunk, + ((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ); + if (rc != BLCKSZ * blocks_in_chunk) + { + lfc_disable("write"); + } + else + { + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + + if (lfc_ctl->generation == generation) + { + CriticalAssert(LFC_ENABLED()); + /* Place entry to the head of LRU list */ + CriticalAssert(entry->access_count > 0); + if (--entry->access_count == 0) + dlist_push_tail(&lfc_ctl->lru, &entry->list_node); + + for (int i = 0; i < blocks_in_chunk; i++) + { + entry->bitmap[(chunk_offs + i) >> 5] |= + (1 << ((chunk_offs + i) & 31)); + } + } + + LWLockRelease(lfc_lock); + } + blkno += blocks_in_chunk; + buf_offset += blocks_in_chunk; + nblocks -= blocks_in_chunk; } } diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 73a001b6ba..df7000acc0 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -537,7 +537,11 @@ pageserver_connect(shardno_t shard_no, int elevel) /* No more polling needed; connection succeeded */ shard->last_connect_time = GetCurrentTimestamp(); +#if PG_MAJORVERSION_NUM >= 17 + shard->wes_read = CreateWaitEventSet(NULL, 3); +#else shard->wes_read = CreateWaitEventSet(TopMemoryContext, 3); +#endif AddWaitEventToSet(shard->wes_read, WL_LATCH_SET, PGINVALID_SOCKET, MyLatch, NULL); AddWaitEventToSet(shard->wes_read, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, @@ -550,9 +554,6 @@ pageserver_connect(shardno_t shard_no, int elevel) case 2: pagestream_query = psprintf("pagestream_v2 %s %s", neon_tenant, neon_timeline); break; - case 1: - pagestream_query = psprintf("pagestream %s %s", neon_tenant, neon_timeline); - break; default: elog(ERROR, "unexpected 
neon_protocol_version %d", neon_protocol_version); } @@ -1063,7 +1064,7 @@ pg_init_libpagestore(void) NULL, &neon_protocol_version, 2, /* use protocol version 2 */ - 1, /* min */ + 2, /* min */ 2, /* max */ PGC_SU_BACKEND, 0, /* no flags required */ diff --git a/pgxn/neon/neon_pgversioncompat.h b/pgxn/neon/neon_pgversioncompat.h index addb6ccce6..59b97d64fe 100644 --- a/pgxn/neon/neon_pgversioncompat.h +++ b/pgxn/neon/neon_pgversioncompat.h @@ -6,7 +6,11 @@ #ifndef NEON_PGVERSIONCOMPAT_H #define NEON_PGVERSIONCOMPAT_H +#if PG_MAJORVERSION_NUM < 17 #define NRelFileInfoBackendIsTemp(rinfo) (rinfo.backend != InvalidBackendId) +#else +#define NRelFileInfoBackendIsTemp(rinfo) (rinfo.backend != INVALID_PROC_NUMBER) +#endif #define RelFileInfoEquals(a, b) ( \ NInfoGetSpcOid(a) == NInfoGetSpcOid(b) && \ @@ -50,7 +54,7 @@ #define CopyNRelFileInfoToBufTag(tag, rinfo) \ do { \ (tag).rnode = (rinfo); \ - } while (false); + } while (false) #define BufTagGetNRelFileInfo(tag) tag.rnode @@ -98,7 +102,7 @@ (tag).spcOid = (rinfo).spcOid; \ (tag).dbOid = (rinfo).dbOid; \ (tag).relNumber = (rinfo).relNumber; \ - } while (false); + } while (false) #define BufTagGetNRelFileInfo(tag) \ ((RelFileLocator) { \ @@ -113,4 +117,10 @@ #define DropRelationAllLocalBuffers DropRelationAllLocalBuffers #endif +#if PG_MAJORVERSION_NUM < 17 +#define ProcNumber BackendId +#define INVALID_PROC_NUMBER InvalidBackendId +#define AmAutoVacuumWorkerProcess() (IsAutoVacuumWorkerProcess()) +#endif + #endif /* NEON_PGVERSIONCOMPAT_H */ diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index 8951e6607b..4c9e40a063 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -6,8 +6,6 @@ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * contrib/neon/pagestore_client.h - * *------------------------------------------------------------------------- */ #ifndef pageserver_h @@ 
-87,9 +85,8 @@ typedef enum { * can skip traversing through recent layers which we know to not contain any * versions for the requested page. * - * These structs describe the V2 of these requests. The old V1 protocol contained - * just one LSN and a boolean 'latest' flag. If the neon_protocol_version GUC is - * set to 1, we will convert these to the V1 requests before sending. + * These structs describe the V2 of these requests. (The old now-defunct V1 + * protocol contained just one LSN and a boolean 'latest' flag.) */ typedef struct { @@ -188,7 +185,7 @@ extern char *nm_to_string(NeonMessage *msg); * API */ -typedef unsigned shardno_t; +typedef uint16 shardno_t; typedef struct { @@ -212,7 +209,7 @@ extern int neon_protocol_version; extern shardno_t get_shard_number(BufferTag* tag); -extern const f_smgr *smgr_neon(BackendId backend, NRelFileInfo rinfo); +extern const f_smgr *smgr_neon(ProcNumber backend, NRelFileInfo rinfo); extern void smgr_init_neon(void); extern void readahead_buffer_resize(int newsize, void *extra); @@ -234,8 +231,13 @@ extern void neon_zeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nbuffers, bool skipFsync); #endif +#if PG_MAJORVERSION_NUM >=17 +extern bool neon_prefetch(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, int nblocks); +#else extern bool neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum); +#endif /* * LSN values associated with each request to the pageserver @@ -270,19 +272,11 @@ typedef struct } neon_request_lsns; #if PG_MAJORVERSION_NUM < 16 -extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - char *buffer); extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno, neon_request_lsns request_lsns, char *buffer); -extern void neon_write(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, char *buffer, bool skipFsync); #else -extern void neon_read(SMgrRelation reln, ForkNumber 
forknum, BlockNumber blocknum, - void *buffer); extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno, neon_request_lsns request_lsns, void *buffer); -extern void neon_write(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, const void *buffer, bool skipFsync); #endif extern void neon_writeback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks); @@ -300,17 +294,34 @@ extern void update_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockN extern void forget_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum); /* functions for local file cache */ -#if PG_MAJORVERSION_NUM < 16 -extern void lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - char *buffer); -#else -extern void lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - const void *buffer); -#endif -extern bool lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, char *buffer); -extern bool lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno); +extern void lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, + BlockNumber blkno, const void *const *buffers, + BlockNumber nblocks); +/* returns number of blocks read, with one bit set in *read for each */ +extern int lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, + BlockNumber blkno, void **buffers, + BlockNumber nblocks, bits8 *mask); + +extern bool lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, + BlockNumber blkno); +extern int lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, + BlockNumber blkno, int nblocks, bits8 *bitmap); extern void lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno); extern void lfc_init(void); +static inline bool +lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, + void *buffer) +{ + bits8 rv = 0; + return lfc_readv_select(rinfo, forkNum, blkno, &buffer, 1, &rv) == 1; +} + +static inline void 
+lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, + const void *buffer) +{ + return lfc_writev(rinfo, forkNum, blkno, &buffer, 1); +} #endif diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 8edaf65639..36538ea5e2 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -58,6 +58,7 @@ #include "pgstat.h" #include "postmaster/autovacuum.h" #include "postmaster/interrupt.h" +#include "port/pg_iovec.h" #include "replication/walsender.h" #include "storage/bufmgr.h" #include "storage/buf_internals.h" @@ -66,6 +67,7 @@ #include "storage/smgr.h" #include "pagestore_client.h" +#include "bitmap.h" #if PG_VERSION_NUM >= 150000 #include "access/xlogrecovery.h" @@ -170,16 +172,28 @@ typedef enum PrefetchStatus * valid */ } PrefetchStatus; +/* must fit in uint8; bits 0x1 are used */ +typedef enum { + PRFSF_NONE = 0x0, + PRFSF_SEQ = 0x1, +} PrefetchRequestFlags; + typedef struct PrefetchRequest { BufferTag buftag; /* must be first entry in the struct */ + shardno_t shard_no; + uint8 status; /* see PrefetchStatus for valid values */ + uint8 flags; /* see PrefetchRequestFlags */ neon_request_lsns request_lsns; NeonResponse *response; /* may be null */ - PrefetchStatus status; - shardno_t shard_no; uint64 my_ring_index; } PrefetchRequest; +StaticAssertDecl(sizeof(PrefetchRequest) == 64, + "We prefer to have a power-of-2 size for this struct. 
Please" + " try to find an alternative solution before reaching to" + " increase the expected size here"); + /* prefetch buffer lookup hash table */ typedef struct PrfHashEntry @@ -251,17 +265,17 @@ typedef struct PrefetchState PrefetchRequest prf_buffer[]; /* prefetch buffers */ } PrefetchState; -#define BITMAP_ISSET(bm, bit) ((bm)[(bit) >> 3] & (1 << ((bit) & 7))) -#define BITMAP_SET(bm, bit) (bm)[(bit) >> 3] |= (1 << ((bit) & 7)) -#define BITMAP_CLR(bm, bit) (bm)[(bit) >> 3] &= ~(1 << ((bit) & 7)) - static PrefetchState *MyPState; +#define GetPrfSlotNoCheck(ring_index) ( \ + &MyPState->prf_buffer[((ring_index) % readahead_buffer_size)] \ +) + #define GetPrfSlot(ring_index) ( \ ( \ AssertMacro((ring_index) < MyPState->ring_unused && \ (ring_index) >= MyPState->ring_last), \ - &MyPState->prf_buffer[((ring_index) % readahead_buffer_size)] \ + GetPrfSlotNoCheck(ring_index) \ ) \ ) @@ -281,9 +295,17 @@ static void prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_ static bool prefetch_wait_for(uint64 ring_index); static void prefetch_cleanup_trailing_unused(void); static inline void prefetch_set_unused(uint64 ring_index); +#if PG_MAJORVERSION_NUM < 17 +static void +GetLastWrittenLSNv(NRelFileInfo relfilenode, ForkNumber forknum, + BlockNumber blkno, int nblocks, XLogRecPtr *lsns); +#endif -static neon_request_lsns neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno); -static bool neon_prefetch_response_usable(neon_request_lsns request_lsns, +static void +neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, + BlockNumber blkno, neon_request_lsns *output, + BlockNumber nblocks, const bits8 *mask); +static bool neon_prefetch_response_usable(neon_request_lsns *request_lsns, PrefetchRequest *slot); static bool @@ -729,9 +751,9 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns if (force_request_lsns) slot->request_lsns = *force_request_lsns; else - slot->request_lsns = 
neon_get_request_lsns(BufTagGetNRelFileInfo(slot->buftag), - slot->buftag.forkNum, - slot->buftag.blockNum); + neon_get_request_lsns(BufTagGetNRelFileInfo(slot->buftag), + slot->buftag.forkNum, slot->buftag.blockNum, + &slot->request_lsns, 1, NULL); request.req.lsn = slot->request_lsns.request_lsn; request.req.not_modified_since = slot->request_lsns.not_modified_since; @@ -771,141 +793,194 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns */ static uint64 -prefetch_register_buffer(BufferTag tag, neon_request_lsns *force_request_lsns) +prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns, + BlockNumber nblocks, const bits8 *mask) { - uint64 ring_index; + uint64 min_ring_index; PrefetchRequest req; - PrefetchRequest *slot; - PrfHashEntry *entry; +#if USE_ASSERT_CHECKING + bool any_hits = false; +#endif + /* We will never read further ahead than our buffer can store. */ + nblocks = Max(1, Min(nblocks, readahead_buffer_size)); /* use an intermediate PrefetchRequest struct to ensure correct alignment */ req.buftag = tag; + Retry: - entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &req); - - if (entry != NULL) + min_ring_index = UINT64_MAX; + for (int i = 0; i < nblocks; i++) { - slot = entry->slot; - ring_index = slot->my_ring_index; - Assert(slot == GetPrfSlot(ring_index)); + PrefetchRequest *slot = NULL; + PrfHashEntry *entry = NULL; + uint64 ring_index; + neon_request_lsns *lsns; + if (PointerIsValid(mask) && !BITMAP_ISSET(mask, i)) + continue; - Assert(slot->status != PRFS_UNUSED); - Assert(MyPState->ring_last <= ring_index && - ring_index < MyPState->ring_unused); - Assert(BUFFERTAGS_EQUAL(slot->buftag, tag)); + if (frlsns) + lsns = &frlsns[i]; + else + lsns = NULL; - /* - * If the caller specified a request LSN to use, only accept prefetch - * responses that satisfy that request. 
- */ - if (force_request_lsns) - { - if (!neon_prefetch_response_usable(*force_request_lsns, slot)) - { - /* Wait for the old request to finish and discard it */ - if (!prefetch_wait_for(ring_index)) - goto Retry; - prefetch_set_unused(ring_index); - entry = NULL; - } - } +#if USE_ASSERT_CHECKING + any_hits = true; +#endif + + slot = NULL; + entry = NULL; + + req.buftag.blockNum = tag.blockNum + i; + entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &req); if (entry != NULL) { + slot = entry->slot; + ring_index = slot->my_ring_index; + Assert(slot == GetPrfSlot(ring_index)); + + Assert(slot->status != PRFS_UNUSED); + Assert(MyPState->ring_last <= ring_index && + ring_index < MyPState->ring_unused); + Assert(BUFFERTAGS_EQUAL(slot->buftag, req.buftag)); + /* - * We received a prefetch for a page that was recently read and - * removed from the buffers. Remove that request from the buffers. + * If the caller specified a request LSN to use, only accept + * prefetch responses that satisfy that request. */ - if (slot->status == PRFS_TAG_REMAINS) + if (lsns) { - prefetch_set_unused(ring_index); - entry = NULL; + if (!neon_prefetch_response_usable(lsns, slot)) + { + /* Wait for the old request to finish and discard it */ + if (!prefetch_wait_for(ring_index)) + goto Retry; + prefetch_set_unused(ring_index); + entry = NULL; + slot = NULL; + } + } + + if (entry != NULL) + { + /* + * We received a prefetch for a page that was recently read + * and removed from the buffers. Remove that request from the + * buffers. 
+ */ + if (slot->status == PRFS_TAG_REMAINS) + { + prefetch_set_unused(ring_index); + entry = NULL; + slot = NULL; + } + else + { + min_ring_index = Min(min_ring_index, ring_index); + /* The buffered request is good enough, return that index */ + pgBufferUsage.prefetch.duplicates++; + continue; + } + } + } + + /* + * We can only leave the block above by finding that there's + * no entry that can satisfy this request, either because there + * was no entry, or because the entry was invalid or didn't satisfy + * the LSNs provided. + * + * The code should've made sure to clear up the data. + */ + Assert(entry == NULL); + Assert(slot == NULL); + + /* + * If the prefetch queue is full, we need to make room by clearing the + * oldest slot. If the oldest slot holds a buffer that was already + * received, we can just throw it away; we fetched the page + * unnecessarily in that case. If the oldest slot holds a request that + * we haven't received a response for yet, we have to wait for the + * response to that before we can continue. We might not have even + * flushed the request to the pageserver yet, it might be just sitting + * in the output buffer. In that case, we flush it and wait for the + * response. (We could decide not to send it, but it's hard to abort + * when the request is already in the output buffer, and 'not sending' + * a prefetch request kind of goes against the principles of + * prefetching) + */ + if (MyPState->ring_last + readahead_buffer_size - 1 == MyPState->ring_unused) + { + uint64 cleanup_index = MyPState->ring_last; + + slot = GetPrfSlot(cleanup_index); + + Assert(slot->status != PRFS_UNUSED); + + /* + * If there is good reason to run compaction on the prefetch buffers, + * try to do that. 
+ */ + if (ReceiveBufferNeedsCompaction() && compact_prefetch_buffers()) + { + Assert(slot->status == PRFS_UNUSED); } else { - /* The buffered request is good enough, return that index */ - pgBufferUsage.prefetch.duplicates++; - return ring_index; + /* + * We have the slot for ring_last, so that must still be in + * progress + */ + switch (slot->status) + { + case PRFS_REQUESTED: + Assert(MyPState->ring_receive == cleanup_index); + if (!prefetch_wait_for(cleanup_index)) + goto Retry; + prefetch_set_unused(cleanup_index); + break; + case PRFS_RECEIVED: + case PRFS_TAG_REMAINS: + prefetch_set_unused(cleanup_index); + break; + default: + pg_unreachable(); + } } } - } - - /* - * If the prefetch queue is full, we need to make room by clearing the - * oldest slot. If the oldest slot holds a buffer that was already - * received, we can just throw it away; we fetched the page unnecessarily - * in that case. If the oldest slot holds a request that we haven't - * received a response for yet, we have to wait for the response to that - * before we can continue. We might not have even flushed the request to - * the pageserver yet, it might be just sitting in the output buffer. In - * that case, we flush it and wait for the response. (We could decide not - * to send it, but it's hard to abort when the request is already in the - * output buffer, and 'not sending' a prefetch request kind of goes - * against the principles of prefetching) - */ - if (MyPState->ring_last + readahead_buffer_size - 1 == MyPState->ring_unused) - { - uint64 cleanup_index = MyPState->ring_last; - - slot = GetPrfSlot(cleanup_index); - - Assert(slot->status != PRFS_UNUSED); /* - * If there is good reason to run compaction on the prefetch buffers, - * try to do that. + * The next buffer pointed to by `ring_unused` is now definitely empty, so + * we can insert the new request to it. 
*/ - if (ReceiveBufferNeedsCompaction() && compact_prefetch_buffers()) - { - Assert(slot->status == PRFS_UNUSED); - } - else - { - /* - * We have the slot for ring_last, so that must still be in - * progress - */ - switch (slot->status) - { - case PRFS_REQUESTED: - Assert(MyPState->ring_receive == cleanup_index); - if (!prefetch_wait_for(cleanup_index)) - goto Retry; - prefetch_set_unused(cleanup_index); - break; - case PRFS_RECEIVED: - case PRFS_TAG_REMAINS: - prefetch_set_unused(cleanup_index); - break; - default: - pg_unreachable(); - } - } + ring_index = MyPState->ring_unused; + + Assert(MyPState->ring_last <= ring_index && + ring_index <= MyPState->ring_unused); + + slot = GetPrfSlotNoCheck(ring_index); + + Assert(slot->status == PRFS_UNUSED); + + /* + * We must update the slot data before insertion, because the hash + * function reads the buffer tag from the slot. + */ + slot->buftag = req.buftag; + slot->shard_no = get_shard_number(&tag); + slot->my_ring_index = ring_index; + + min_ring_index = Min(min_ring_index, ring_index); + + prefetch_do_request(slot, lsns); } - /* - * The next buffer pointed to by `ring_unused` is now definitely empty, so - * we can insert the new request to it. - */ - ring_index = MyPState->ring_unused; - slot = &MyPState->prf_buffer[((ring_index) % readahead_buffer_size)]; + Assert(any_hits); - Assert(MyPState->ring_last <= ring_index); - - Assert(slot->status == PRFS_UNUSED); - - /* - * We must update the slot data before insertion, because the hash - * function reads the buffer tag from the slot. 
- */ - slot->buftag = tag; - slot->shard_no = get_shard_number(&tag); - slot->my_ring_index = ring_index; - - prefetch_do_request(slot, force_request_lsns); - Assert(slot->status == PRFS_REQUESTED); - Assert(MyPState->ring_last <= ring_index && - ring_index < MyPState->ring_unused); + Assert(GetPrfSlot(min_ring_index)->status == PRFS_REQUESTED || + GetPrfSlot(min_ring_index)->status == PRFS_RECEIVED); + Assert(MyPState->ring_last <= min_ring_index && + min_ring_index < MyPState->ring_unused); if (flush_every_n_requests > 0 && MyPState->ring_unused - MyPState->ring_flush >= flush_every_n_requests) @@ -921,9 +996,17 @@ Retry: MyPState->ring_flush = MyPState->ring_unused; } - return ring_index; + return min_ring_index; } + +static uint64 +prefetch_register_buffer(BufferTag tag, neon_request_lsns *force_request_lsns) +{ + return prefetch_register_bufferv(tag, force_request_lsns, 1, NULL); +} + + /* * Note: this function can get canceled and use a long jump to the next catch * context. Take care. @@ -1001,51 +1084,10 @@ nm_pack_request(NeonRequest *msg) initStringInfo(&s); - if (neon_protocol_version >= 2) - { - pq_sendbyte(&s, msg->tag); - pq_sendint64(&s, msg->lsn); - pq_sendint64(&s, msg->not_modified_since); - } - else - { - bool latest; - XLogRecPtr lsn; + pq_sendbyte(&s, msg->tag); + pq_sendint64(&s, msg->lsn); + pq_sendint64(&s, msg->not_modified_since); - /* - * In primary, we always request the latest page version. - */ - if (!RecoveryInProgress()) - { - latest = true; - lsn = msg->not_modified_since; - } - else - { - /* - * In the protocol V1, we cannot represent that we want to read - * page at LSN X, and we know that it hasn't been modified since - * Y. We can either use 'not_modified_lsn' as the request LSN, and - * risk getting an error if that LSN is too old and has already - * fallen out of the pageserver's GC horizon, or we can send - * 'request_lsn', causing the pageserver to possibly wait for the - * recent WAL to arrive unnecessarily. 
Or something in between. We - * choose to use the old LSN and risk GC errors, because that's - * what we've done historically. - */ - latest = false; - lsn = msg->not_modified_since; - } - - pq_sendbyte(&s, msg->tag); - pq_sendbyte(&s, latest); - pq_sendint64(&s, lsn); - } - - /* - * The rest of the request messages are the same between protocol V1 and - * V2 - */ switch (messageTag(msg)) { /* pagestore_client -> pagestore */ @@ -1389,6 +1431,50 @@ log_newpage_copy(NRelFileInfo * rinfo, ForkNumber forkNum, BlockNumber blkno, return log_newpage(rinfo, forkNum, blkno, copied_buffer.data, page_std); } +#if PG_MAJORVERSION_NUM >= 17 +/* + * Wrapper around log_newpages() that makes a temporary copy of the block and + * WAL-logs that. This makes it safe to use while holding only a shared lock + * on the page, see XLogSaveBufferForHint. We don't use XLogSaveBufferForHint + * directly because it skips the logging if the LSN is new enough. + */ +static XLogRecPtr +log_newpages_copy(NRelFileInfo * rinfo, ForkNumber forkNum, BlockNumber blkno, + BlockNumber nblocks, Page *pages, bool page_std) +{ + PGAlignedBlock copied_buffer[XLR_MAX_BLOCK_ID]; + BlockNumber blknos[XLR_MAX_BLOCK_ID]; + Page pageptrs[XLR_MAX_BLOCK_ID]; + int nregistered = 0; + XLogRecPtr result = 0; + + for (int i = 0; i < nblocks; i++) + { + Page page = copied_buffer[nregistered].data; + memcpy(page, pages[i], BLCKSZ); + pageptrs[nregistered] = page; + blknos[nregistered] = blkno + i; + + ++nregistered; + + if (nregistered >= XLR_MAX_BLOCK_ID) + { + log_newpages(rinfo, forkNum, nregistered, blknos, pageptrs, + page_std); + nregistered = 0; + } + } + + if (nregistered != 0) + { + log_newpages(rinfo, forkNum, nregistered, blknos, pageptrs, + page_std); + } + + return ProcLastRecPtr; +} +#endif /* PG_MAJORVERSION_NUM >= 17 */ + /* * Is 'buffer' identical to a freshly initialized empty heap page? 
*/ @@ -1402,14 +1488,160 @@ PageIsEmptyHeapPage(char *buffer) return memcmp(buffer, empty_page.data, BLCKSZ) == 0; } +#if PG_MAJORVERSION_NUM >= 17 +static void +neon_wallog_pagev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + BlockNumber nblocks, const char **buffers, bool force) +{ +#define BLOCK_BATCH_SIZE 16 + bool log_pages; + BlockNumber batch_blockno = blocknum; + XLogRecPtr lsns[BLOCK_BATCH_SIZE]; + int batch_size = 0; + + /* + * Whenever a VM or FSM page is evicted, WAL-log it. FSM and (some) VM + * changes are not WAL-logged when the changes are made, so this is our + * last chance to log them, otherwise they're lost. That's OK for + * correctness, the non-logged updates are not critical. But we want to + * have a reasonably up-to-date VM and FSM in the page server. + */ + log_pages = false; + if (force) + { + Assert(XLogInsertAllowed()); + log_pages = true; + } + else if (XLogInsertAllowed() && + !ShutdownRequestPending && + (forknum == FSM_FORKNUM || forknum == VISIBILITYMAP_FORKNUM)) + { + log_pages = true; + } + + if (log_pages) + { + XLogRecPtr recptr; + recptr = log_newpages_copy(&InfoFromSMgrRel(reln), forknum, blocknum, + nblocks, (Page *) buffers, false); + + for (int i = 0; i < nblocks; i++) + PageSetLSN(unconstify(char *, buffers[i]), recptr); + + ereport(SmgrTrace, + (errmsg(NEON_TAG "Page %u through %u of relation %u/%u/%u.%u " + "were force logged, lsn=%X/%X", + blocknum, blocknum + nblocks, + RelFileInfoFmt(InfoFromSMgrRel(reln)), + forknum, LSN_FORMAT_ARGS(recptr)))); + } + + for (int i = 0; i < nblocks; i++) + { + Page page = (Page) buffers[i]; + BlockNumber blkno = blocknum + i; + XLogRecPtr lsn = PageGetLSN(page); + + if (lsn == InvalidXLogRecPtr) + { + /* + * When PostgreSQL extends a relation, it calls smgrextend() with an + * all-zeros pages, and we can just ignore that in Neon. We do need to + * remember the new size, though, so that smgrnblocks() returns the + * right answer after the rel has been extended. 
We rely on the + * relsize cache for that. + * + * A completely empty heap page doesn't need to be WAL-logged, either. + * The heapam can leave such a page behind, if e.g. an insert errors + * out after initializing the page, but before it has inserted the + * tuple and WAL-logged the change. When we read the page from the + * page server, it will come back as all-zeros. That's OK, the heapam + * will initialize an all-zeros page on first use. + * + * In other scenarios, evicting a dirty page with no LSN is a bad + * sign: it implies that the page was not WAL-logged, and its contents + * will be lost when it's evicted. + */ + if (PageIsNew(page)) + { + ereport(SmgrTrace, + (errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is all-zeros", + blkno, + RelFileInfoFmt(InfoFromSMgrRel(reln)), + forknum))); + } + else if (PageIsEmptyHeapPage(page)) + { + ereport(SmgrTrace, + (errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is an empty heap page with no LSN", + blkno, + RelFileInfoFmt(InfoFromSMgrRel(reln)), + forknum))); + } + else if (forknum != FSM_FORKNUM && forknum != VISIBILITYMAP_FORKNUM) + { + /* + * It's a bad sign if there is a page with zero LSN in the buffer + * cache in a standby, too. However, PANICing seems like a cure + * worse than the disease, as the damage has likely already been + * done in the primary. So in a standby, make this an assertion, + * and in a release build just LOG the error and soldier on. We + * update the last-written LSN of the page with a conservative + * value in that case, which is the last replayed LSN. + */ + ereport(RecoveryInProgress() ?
LOG : PANIC, + (errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is evicted with zero LSN", + blkno, + RelFileInfoFmt(InfoFromSMgrRel(reln)), + forknum))); + Assert(false); + + lsn = GetXLogReplayRecPtr(NULL); /* in standby mode, soldier on */ + } + } + else + { + ereport(SmgrTrace, + (errmsg(NEON_TAG "Evicting page %u of relation %u/%u/%u.%u with lsn=%X/%X", + blkno, + RelFileInfoFmt(InfoFromSMgrRel(reln)), + forknum, LSN_FORMAT_ARGS(lsn)))); + } + + /* + * Remember the LSN on this page. When we read the page again, we must + * read the same or newer version of it. + */ + lsns[batch_size++] = lsn; + + if (batch_size >= BLOCK_BATCH_SIZE) + { + SetLastWrittenLSNForBlockv(lsns, InfoFromSMgrRel(reln), forknum, + batch_blockno, + batch_size); + batch_blockno += batch_size; + batch_size = 0; + } + } + + if (batch_size != 0) + { + SetLastWrittenLSNForBlockv(lsns, InfoFromSMgrRel(reln), forknum, + batch_blockno, + batch_size); + } +} +#endif + /* * A page is being evicted from the shared buffer cache. Update the * last-written LSN of the page, and WAL-log it if needed. 
*/ -static void #if PG_MAJORVERSION_NUM < 16 +static void neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool force) #else +static void neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const char *buffer, bool force) #endif { @@ -1589,18 +1821,39 @@ nm_adjust_lsn(XLogRecPtr lsn) return lsn; } + +/* + * Since PG17 we use vectorized version, + * so add compatibility function for older versions + */ +#if PG_MAJORVERSION_NUM < 17 +static void +GetLastWrittenLSNv(NRelFileInfo relfilenode, ForkNumber forknum, + BlockNumber blkno, int nblocks, XLogRecPtr *lsns) +{ + lsns[0] = GetLastWrittenLSN(relfilenode, forknum, blkno); +} +#endif + /* * Return LSN for requesting pages and number of blocks from page server */ -static neon_request_lsns -neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno) +static void +neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, + neon_request_lsns *output, BlockNumber nblocks, + const bits8 *mask) { - XLogRecPtr last_written_lsn; - neon_request_lsns result; + XLogRecPtr last_written_lsns[PG_IOV_MAX]; - last_written_lsn = GetLastWrittenLSN(rinfo, forknum, blkno); - last_written_lsn = nm_adjust_lsn(last_written_lsn); - Assert(last_written_lsn != InvalidXLogRecPtr); + Assert(nblocks <= PG_IOV_MAX); + + GetLastWrittenLSNv(rinfo, forknum, blkno, (int) nblocks, last_written_lsns); + + for (int i = 0; i < nblocks; i++) + { + last_written_lsns[i] = nm_adjust_lsn(last_written_lsns[i]); + Assert(last_written_lsns[i] != InvalidXLogRecPtr); + } if (RecoveryInProgress()) { @@ -1671,95 +1924,111 @@ neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno) /* Request the page at the end of the last fully replayed LSN.
*/ XLogRecPtr replay_lsn = GetXLogReplayRecPtr(NULL); - if (last_written_lsn > replay_lsn) + for (int i = 0; i < nblocks; i++) { - /* GetCurrentReplayRecPtr was introduced in v15 */ + neon_request_lsns *result = &output[i]; + XLogRecPtr last_written_lsn = last_written_lsns[i]; + + if (PointerIsValid(mask) && !BITMAP_ISSET(mask, i)) + continue; + + if (last_written_lsn > replay_lsn) + { + /* GetCurrentReplayRecPtr was introduced in v15 */ #if PG_VERSION_NUM >= 150000 - Assert(last_written_lsn == GetCurrentReplayRecPtr(NULL)); + Assert(last_written_lsn == GetCurrentReplayRecPtr(NULL)); #endif - /* - * Cases 2 and 4. If this is a backend (case 4), the - * neon_read_at_lsn() call later will wait for the WAL record to be - * fully replayed. - */ - result.request_lsn = last_written_lsn; - } - else - { - /* cases 1 and 3 */ - result.request_lsn = replay_lsn; - } - result.not_modified_since = last_written_lsn; - result.effective_request_lsn = result.request_lsn; - Assert(last_written_lsn <= result.request_lsn); + /* + * Cases 2 and 4. If this is a backend (case 4), the + * neon_read_at_lsn() call later will wait for the WAL record to be + * fully replayed. + */ + result->request_lsn = last_written_lsn; + } + else + { + /* cases 1 and 3 */ + result->request_lsn = replay_lsn; + } - neon_log(DEBUG1, "neon_get_request_lsns request lsn %X/%X, not_modified_since %X/%X", - LSN_FORMAT_ARGS(result.request_lsn), LSN_FORMAT_ARGS(result.not_modified_since)); + result->not_modified_since = last_written_lsn; + result->effective_request_lsn = result->request_lsn; + Assert(last_written_lsn <= result->request_lsn); + + neon_log(DEBUG1, "neon_get_request_lsns request lsn %X/%X, not_modified_since %X/%X", + LSN_FORMAT_ARGS(result->request_lsn), LSN_FORMAT_ARGS(result->not_modified_since)); + } } else { XLogRecPtr flushlsn; - - /* - * Use the latest LSN that was evicted from the buffer cache as the - * 'not_modified_since' hint. 
Any pages modified by later WAL records - * must still in the buffer cache, so our request cannot concern - * those. - */ - neon_log(DEBUG1, "neon_get_request_lsns GetLastWrittenLSN lsn %X/%X", - LSN_FORMAT_ARGS(last_written_lsn)); - - /* - * Is it possible that the last-written LSN is ahead of last flush - * LSN? Generally not, we shouldn't evict a page from the buffer cache - * before all its modifications have been safely flushed. That's the - * "WAL before data" rule. However, such case does exist at index - * building, _bt_blwritepage logs the full page without flushing WAL - * before smgrextend (files are fsynced before build ends). - */ #if PG_VERSION_NUM >= 150000 flushlsn = GetFlushRecPtr(NULL); #else flushlsn = GetFlushRecPtr(); #endif - if (last_written_lsn > flushlsn) + + for (int i = 0; i < nblocks; i++) { - neon_log(DEBUG5, "last-written LSN %X/%X is ahead of last flushed LSN %X/%X", - LSN_FORMAT_ARGS(last_written_lsn), - LSN_FORMAT_ARGS(flushlsn)); - XLogFlush(last_written_lsn); - flushlsn = last_written_lsn; + neon_request_lsns *result = &output[i]; + XLogRecPtr last_written_lsn = last_written_lsns[i]; + + if (PointerIsValid(mask) && !BITMAP_ISSET(mask, i)) + continue; + /* + * Use the latest LSN that was evicted from the buffer cache as the + * 'not_modified_since' hint. Any pages modified by later WAL records + * must still be in the buffer cache, so our request cannot concern + * those. + */ + neon_log(DEBUG1, "neon_get_request_lsns GetLastWrittenLSN lsn %X/%X", + LSN_FORMAT_ARGS(last_written_lsn)); + + /* + * Is it possible that the last-written LSN is ahead of last flush + * LSN? Generally not, we shouldn't evict a page from the buffer cache + * before all its modifications have been safely flushed. That's the + * "WAL before data" rule. However, such case does exist at index + * building, _bt_blwritepage logs the full page without flushing WAL + * before smgrextend (files are fsynced before build ends).
+ */ + if (last_written_lsn > flushlsn) + { + neon_log(DEBUG5, "last-written LSN %X/%X is ahead of last flushed LSN %X/%X", + LSN_FORMAT_ARGS(last_written_lsn), + LSN_FORMAT_ARGS(flushlsn)); + XLogFlush(last_written_lsn); + flushlsn = last_written_lsn; + } + + /* + * Request the very latest version of the page. In principle we + * want to read the page at the current insert LSN, and we could + * use that value in the request. However, there's a corner case + * with pageserver's garbage collection. If the GC horizon is + * set to a very small value, it's possible that by the time + * that the pageserver processes our request, the GC horizon has + * already moved past the LSN we calculate here. Standby servers + * always have that problem as they can always lag behind the + * primary, but for the primary we can avoid it by always + * requesting the latest page, by setting request LSN to + * UINT64_MAX. + * + * Remember the current LSN, however, so that we can later + * correctly determine if the response to the request is still + * valid. The most up-to-date LSN we could use for that purpose + * would be the current insert LSN, but to avoid the overhead of + * looking it up, use 'flushlsn' instead. This relies on the + * assumption that if the page was modified since the last WAL + * flush, it should still be in the buffer cache, and we + * wouldn't be requesting it. + */ + result->request_lsn = UINT64_MAX; + result->not_modified_since = last_written_lsn; + result->effective_request_lsn = flushlsn; } - - /* - * Request the very latest version of the page. In principle we - * want to read the page at the current insert LSN, and we could - * use that value in the request. However, there's a corner case - * with pageserver's garbage collection. If the GC horizon is - * set to a very small value, it's possible that by the time - * that the pageserver processes our request, the GC horizon has - * already moved past the LSN we calculate here.
Standby servers - * always have that problem as the can always lag behind the - * primary, but for the primary we can avoid it by always - * requesting the latest page, by setting request LSN to - * UINT64_MAX. - * - * Remember the current LSN, however, so that we can later - * correctly determine if the response to the request is still - * valid. The most up-to-date LSN we could use for that purpose - * would be the current insert LSN, but to avoid the overhead of - * looking it up, use 'flushlsn' instead. This relies on the - * assumption that if the page was modified since the last WAL - * flush, it should still be in the buffer cache, and we - * wouldn't be requesting it. - */ - result.request_lsn = UINT64_MAX; - result.not_modified_since = last_written_lsn; - result.effective_request_lsn = flushlsn; } - - return result; } /* @@ -1769,13 +2038,13 @@ neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno) * satisfy a page read now. */ static bool -neon_prefetch_response_usable(neon_request_lsns request_lsns, +neon_prefetch_response_usable(neon_request_lsns *request_lsns, PrefetchRequest *slot) { /* sanity check the LSN's on the old and the new request */ - Assert(request_lsns.request_lsn >= request_lsns.not_modified_since); - Assert(request_lsns.effective_request_lsn >= request_lsns.not_modified_since); - Assert(request_lsns.effective_request_lsn <= request_lsns.request_lsn); + Assert(request_lsns->request_lsn >= request_lsns->not_modified_since); + Assert(request_lsns->effective_request_lsn >= request_lsns->not_modified_since); + Assert(request_lsns->effective_request_lsn <= request_lsns->request_lsn); Assert(slot->request_lsns.request_lsn >= slot->request_lsns.not_modified_since); Assert(slot->request_lsns.effective_request_lsn >= slot->request_lsns.not_modified_since); Assert(slot->request_lsns.effective_request_lsn <= slot->request_lsns.request_lsn); @@ -1796,15 +2065,15 @@ neon_prefetch_response_usable(neon_request_lsns 
request_lsns, * calculate LSNs "out of order" with each other, but the prefetch queue * is backend-private at the moment.) */ - if (request_lsns.effective_request_lsn < slot->request_lsns.effective_request_lsn || - request_lsns.not_modified_since < slot->request_lsns.not_modified_since) + if (request_lsns->effective_request_lsn < slot->request_lsns.effective_request_lsn || + request_lsns->not_modified_since < slot->request_lsns.not_modified_since) { ereport(LOG, (errcode(ERRCODE_IO_ERROR), errmsg(NEON_TAG "request with unexpected LSN after prefetch"), errdetail("Request %X/%X not_modified_since %X/%X, prefetch %X/%X not_modified_since %X/%X)", - LSN_FORMAT_ARGS(request_lsns.effective_request_lsn), - LSN_FORMAT_ARGS(request_lsns.not_modified_since), + LSN_FORMAT_ARGS(request_lsns->effective_request_lsn), + LSN_FORMAT_ARGS(request_lsns->not_modified_since), LSN_FORMAT_ARGS(slot->request_lsns.effective_request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since)))); return false; @@ -1858,9 +2127,9 @@ neon_prefetch_response_usable(neon_request_lsns request_lsns, */ /* this follows from the checks above */ - Assert(request_lsns.effective_request_lsn >= slot->request_lsns.not_modified_since); + Assert(request_lsns->effective_request_lsn >= slot->request_lsns.not_modified_since); - return request_lsns.not_modified_since <= slot->request_lsns.effective_request_lsn; + return request_lsns->not_modified_since <= slot->request_lsns.effective_request_lsn; } /* @@ -1927,7 +2196,8 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum) return false; } - request_lsns = neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, REL_METADATA_PSEUDO_BLOCKNO); + neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, + REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1, NULL); { NeonExistsRequest request = { .req.tag = T_NeonExistsRequest, @@ -2109,7 +2379,7 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, */ if (max_cluster_size > 0 && reln->smgr_relpersistence == 
RELPERSISTENCE_PERMANENT && - !IsAutoVacuumWorkerProcess()) + !AmAutoVacuumWorkerProcess()) { uint64 current_size = GetNeonCurrentClusterSize(); @@ -2190,7 +2460,7 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum, if (max_cluster_size > 0 && reln->smgr_relpersistence == RELPERSISTENCE_PERMANENT && - !IsAutoVacuumWorkerProcess()) + !AmAutoVacuumWorkerProcess()) { uint64 current_size = GetNeonCurrentClusterSize(); @@ -2288,6 +2558,73 @@ neon_close(SMgrRelation reln, ForkNumber forknum) } +#if PG_MAJORVERSION_NUM >= 17 +/* + * neon_prefetch() -- Initiate asynchronous read of the specified block of a relation + */ +bool +neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + int nblocks) +{ + uint64 ring_index PG_USED_FOR_ASSERTS_ONLY; + BufferTag tag; + bool io_initiated = false; + + switch (reln->smgr_relpersistence) + { + case 0: /* probably shouldn't happen, but ignore it */ + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + return mdprefetch(reln, forknum, blocknum, nblocks); + + default: + neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + + tag.spcOid = reln->smgr_rlocator.locator.spcOid; + tag.dbOid = reln->smgr_rlocator.locator.dbOid; + tag.relNumber = reln->smgr_rlocator.locator.relNumber; + tag.forkNum = forknum; + + while (nblocks > 0) + { + int iterblocks = Min(nblocks, PG_IOV_MAX); + int seqlen = 0; + bits8 lfc_present[PG_IOV_MAX / 8]; + memset(lfc_present, 0, sizeof(lfc_present)); + + if (lfc_cache_containsv(InfoFromSMgrRel(reln), forknum, blocknum, + iterblocks, lfc_present) == iterblocks) + { + nblocks -= iterblocks; + blocknum += iterblocks; + continue; + } + + io_initiated = true; + + tag.blockNum = blocknum; + + for (int i = 0; i < PG_IOV_MAX / 8; i++) + lfc_present[i] = ~(lfc_present[i]); + + ring_index = prefetch_register_bufferv(tag, NULL, iterblocks, + lfc_present); + nblocks -= iterblocks; + blocknum += 
iterblocks; + + Assert(ring_index < MyPState->ring_unused && + MyPState->ring_last <= ring_index); + } + + return false; +} + + +#else /* PG_MAJORVERSION_NUM >= 17 */ /* * neon_prefetch() -- Initiate asynchronous read of the specified block of a relation */ @@ -2326,6 +2663,8 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) return false; } +#endif /* PG_MAJORVERSION_NUM < 17 */ + /* * neon_writeback() -- Tell the kernel to write pages back to storage. @@ -2356,7 +2695,12 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum, neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } - /* not implemented */ + /* + * TODO: WAL sync up to lwLsn for the indicated blocks + * Without that sync, writeback doesn't actually guarantee the data is + * persistently written, which does seem to be one of the assumed + * properties of this smgr API call. + */ neon_log(SmgrTrace, "writeback noop"); #ifdef DEBUG_COMPARE_LOCAL @@ -2365,30 +2709,27 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum, #endif } -/* - * While function is defined in the neon extension it's used within neon_test_utils directly. - * To avoid breaking tests in the runtime please keep function signature in sync. 
- */ -void +static void #if PG_MAJORVERSION_NUM < 16 -neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - neon_request_lsns request_lsns, char *buffer) +neon_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber base_blockno, neon_request_lsns *request_lsns, + char **buffers, BlockNumber nblocks, const bits8 *mask) #else -neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - neon_request_lsns request_lsns, void *buffer) +neon_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber base_blockno, neon_request_lsns *request_lsns, + void **buffers, BlockNumber nblocks, const bits8 *mask) #endif { NeonResponse *resp; uint64 ring_index; PrfHashEntry *entry; PrefetchRequest *slot; - BufferTag buftag = - { - .forkNum = forkNum, - .blockNum = blkno, - }; + BufferTag buftag = {0}; + + Assert(PointerIsValid(request_lsns)); + Assert(nblocks >= 1); CopyNRelFileInfoToBufTag(buftag, rinfo); + buftag.forkNum = forkNum; + buftag.blockNum = base_blockno; /* * The redo process does not lock pages that it needs to replay but are @@ -2406,115 +2747,147 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, * weren't for the behaviour of the LwLsn cache that uses the highest * value of the LwLsn cache when the entry is not found. */ - if (RecoveryInProgress() && !(MyBackendType == B_STARTUP)) - XLogWaitForReplayOf(request_lsns.request_lsn); + prefetch_register_bufferv(buftag, request_lsns, nblocks, mask); - /* - * Try to find prefetched page in the list of received pages. - */ + for (int i = 0; i < nblocks; i++) + { + void *buffer = buffers[i]; + BlockNumber blockno = base_blockno + i; + neon_request_lsns *reqlsns = &request_lsns[i]; + + if (PointerIsValid(mask) && !BITMAP_ISSET(mask, i)) + continue; + + if (RecoveryInProgress() && MyBackendType != B_STARTUP) + XLogWaitForReplayOf(reqlsns[0].request_lsn); + + /* + * Try to find prefetched page in the list of received pages. 
+ */ Retry: - entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &buftag); + buftag.blockNum = blockno; + entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &buftag); - if (entry != NULL) - { - slot = entry->slot; - if (neon_prefetch_response_usable(request_lsns, slot)) + if (entry != NULL) { - ring_index = slot->my_ring_index; - pgBufferUsage.prefetch.hits += 1; - } - else - { - /* - * Cannot use this prefetch, discard it - * - * We can't drop cache for not-yet-received requested items. It is - * unlikely this happens, but it can happen if prefetch distance - * is large enough and a backend didn't consume all prefetch - * requests. - */ - if (slot->status == PRFS_REQUESTED) + slot = entry->slot; + if (neon_prefetch_response_usable(reqlsns, slot)) { - if (!prefetch_wait_for(slot->my_ring_index)) - goto Retry; + ring_index = slot->my_ring_index; + pgBufferUsage.prefetch.hits += 1; + } + else + { + /* + * Cannot use this prefetch, discard it + * + * We can't drop cache for not-yet-received requested items. It is + * unlikely this happens, but it can happen if prefetch distance + * is large enough and a backend didn't consume all prefetch + * requests. 
+ */ + if (slot->status == PRFS_REQUESTED) + { + if (!prefetch_wait_for(slot->my_ring_index)) + goto Retry; + } + /* drop caches */ + prefetch_set_unused(slot->my_ring_index); + pgBufferUsage.prefetch.expired += 1; + /* make it look like a prefetch cache miss */ + entry = NULL; } - /* drop caches */ - prefetch_set_unused(slot->my_ring_index); - pgBufferUsage.prefetch.expired += 1; - /* make it look like a prefetch cache miss */ - entry = NULL; } - } - do - { - if (entry == NULL) + do { - pgBufferUsage.prefetch.misses += 1; + if (entry == NULL) + { + pgBufferUsage.prefetch.misses += 1; - ring_index = prefetch_register_buffer(buftag, &request_lsns); - slot = GetPrfSlot(ring_index); - } - else + ring_index = prefetch_register_bufferv(buftag, reqlsns, 1, NULL); + Assert(ring_index != UINT64_MAX); + slot = GetPrfSlot(ring_index); + } + else + { + /* + * Empty our reference to the prefetch buffer's hash entry. When + * we wait for prefetches, the entry reference is invalidated by + * potential updates to the hash, and when we reconnect to the + * pageserver the prefetch we're waiting for may be dropped, in + * which case we need to retry and take the branch above. + */ + entry = NULL; + } + + Assert(slot->my_ring_index == ring_index); + Assert(MyPState->ring_last <= ring_index && + MyPState->ring_unused > ring_index); + Assert(slot->status != PRFS_UNUSED); + Assert(GetPrfSlot(ring_index) == slot); + + } while (!prefetch_wait_for(ring_index)); + + Assert(slot->status == PRFS_RECEIVED); + Assert(memcmp(&buftag, &slot->buftag, sizeof(BufferTag)) == 0); + Assert(buftag.blockNum == base_blockno + i); + + resp = slot->response; + + switch (resp->tag) { - /* - * Empty our reference to the prefetch buffer's hash entry. When - * we wait for prefetches, the entry reference is invalidated by - * potential updates to the hash, and when we reconnect to the - * pageserver the prefetch we're waiting for may be dropped, in - * which case we need to retry and take the branch above. 
- */ - entry = NULL; + case T_NeonGetPageResponse: + memcpy(buffer, ((NeonGetPageResponse *) resp)->page, BLCKSZ); + lfc_write(rinfo, forkNum, blockno, buffer); + break; + + case T_NeonErrorResponse: + ereport(ERROR, + (errcode(ERRCODE_IO_ERROR), + errmsg(NEON_TAG "[shard %d] could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X", + slot->shard_no, blockno, RelFileInfoFmt(rinfo), + forkNum, LSN_FORMAT_ARGS(reqlsns->effective_request_lsn)), + errdetail("page server returned error: %s", + ((NeonErrorResponse *) resp)->message))); + break; + default: + NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC, + "Expected GetPage (0x%02x) or Error (0x%02x) response to GetPageRequest, but got 0x%02x", + T_NeonGetPageResponse, T_NeonErrorResponse, resp->tag); } - Assert(slot->my_ring_index == ring_index); - Assert(MyPState->ring_last <= ring_index && - MyPState->ring_unused > ring_index); - Assert(slot->status != PRFS_UNUSED); - Assert(GetPrfSlot(ring_index) == slot); - - } while (!prefetch_wait_for(ring_index)); - - Assert(slot->status == PRFS_RECEIVED); - - resp = slot->response; - - switch (resp->tag) - { - case T_NeonGetPageResponse: - memcpy(buffer, ((NeonGetPageResponse *) resp)->page, BLCKSZ); - lfc_write(rinfo, forkNum, blkno, buffer); - break; - - case T_NeonErrorResponse: - ereport(ERROR, - (errcode(ERRCODE_IO_ERROR), - errmsg(NEON_TAG "[shard %d] could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X", - slot->shard_no, blkno, - RelFileInfoFmt(rinfo), - forkNum, - LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)), - errdetail("page server returned error: %s", - ((NeonErrorResponse *) resp)->message))); - break; - default: - NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC, - "Expected GetPage (0x%02x) or Error (0x%02x) response to GetPageRequest, but got 0x%02x", - T_NeonGetPageResponse, T_NeonErrorResponse, resp->tag); + /* buffer was used, clean up for later reuse */ + prefetch_set_unused(ring_index); + 
prefetch_cleanup_trailing_unused(); } - - /* buffer was used, clean up for later reuse */ - prefetch_set_unused(ring_index); - prefetch_cleanup_trailing_unused(); } /* - * neon_read() -- Read the specified block from a relation. + * While function is defined in the neon extension it's used within neon_test_utils directly. + * To avoid breaking tests in the runtime please keep function signature in sync. */ void #if PG_MAJORVERSION_NUM < 16 +neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, + neon_request_lsns request_lsns, char *buffer) +#else +neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, + neon_request_lsns request_lsns, void *buffer) +#endif +{ + neon_read_at_lsnv(rinfo, forkNum, blkno, &request_lsns, &buffer, 1, NULL); +} + +#if PG_MAJORVERSION_NUM < 17 +/* + * neon_read() -- Read the specified block from a relation. + */ +#if PG_MAJORVERSION_NUM < 16 +void neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, char *buffer) #else +void neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer) #endif { @@ -2543,7 +2916,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer return; } - request_lsns = neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno); + neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1, NULL); neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsns, buffer); #ifdef DEBUG_COMPARE_LOCAL @@ -2619,6 +2992,148 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer } #endif } +#endif /* PG_MAJORVERSION_NUM <= 16 */ + +#if PG_MAJORVERSION_NUM >= 17 +void +neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + void **buffers, BlockNumber nblocks) +{ + bits8 read[PG_IOV_MAX / 8]; + neon_request_lsns request_lsns[PG_IOV_MAX]; + int lfc_result; + + switch (reln->smgr_relpersistence) + { + case 0: + neon_log(ERROR, "cannot call smgrread() 
on rel with unknown persistence"); + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + mdreadv(reln, forknum, blocknum, buffers, nblocks); + return; + + default: + neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + + if (nblocks > PG_IOV_MAX) + neon_log(ERROR, "Read request too large: %d is larger than max %d", + nblocks, PG_IOV_MAX); + + memset(read, 0, sizeof(read)); + + /* Try to read from local file cache */ + lfc_result = lfc_readv_select(InfoFromSMgrRel(reln), forknum, blocknum, buffers, + nblocks, read); + + /* Read all blocks from LFC, so we're done */ + if (lfc_result == nblocks) + return; + + if (lfc_result == -1) + { + /* can't use the LFC result, so read all blocks from PS */ + for (int i = 0; i < PG_IOV_MAX / 8; i++) + read[i] = 0xFF; + } + else + { + /* invert the result: exclude blocks read from lfc */ + for (int i = 0; i < PG_IOV_MAX / 8; i++) + read[i] = ~(read[i]); + } + + neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, blocknum, + request_lsns, nblocks, read); + + neon_read_at_lsnv(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns, + buffers, nblocks, read); + +#ifdef DEBUG_COMPARE_LOCAL + if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) + { + char pageserver_masked[BLCKSZ]; + char mdbuf[BLCKSZ]; + char mdbuf_masked[BLCKSZ]; + + for (int i = 0; i < nblocks; i++) + { +#if PG_MAJORVERSION_NUM >= 17 + mdreadv(reln, forkNum, blkno + i, &mdbuf, 1); +#else + mdread(reln, forkNum, blkno + i, mdbuf); +#endif + + memcpy(pageserver_masked, buffer, BLCKSZ); + memcpy(mdbuf_masked, mdbuf, BLCKSZ); + + if (PageIsNew((Page) mdbuf)) + { + if (!PageIsNew((Page) pageserver_masked)) + { + neon_log(PANIC, "page is new in MD but not in Page Server at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n", + blkno, + RelFileInfoFmt(InfoFromSMgrRel(reln)), + forkNum, + (uint32) (request_lsn >> 32), (uint32) request_lsn, + hexdump_page(buffer)); + } + } + else if 
(PageIsNew((Page) buffer)) + { + neon_log(PANIC, "page is new in Page Server but not in MD at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n", + blkno, + RelFileInfoFmt(InfoFromSMgrRel(reln)), + forkNum, + (uint32) (request_lsn >> 32), (uint32) request_lsn, + hexdump_page(mdbuf)); + } + else if (PageGetSpecialSize(mdbuf) == 0) + { + /* assume heap */ + RmgrTable[RM_HEAP_ID].rm_mask(mdbuf_masked, blkno); + RmgrTable[RM_HEAP_ID].rm_mask(pageserver_masked, blkno); + + if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0) + { + neon_log(PANIC, "heap buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n", + blkno, + RelFileInfoFmt(InfoFromSMgrRel(reln)), + forkNum, + (uint32) (request_lsn >> 32), (uint32) request_lsn, + hexdump_page(mdbuf_masked), + hexdump_page(pageserver_masked)); + } + } + else if (PageGetSpecialSize(mdbuf) == MAXALIGN(sizeof(BTPageOpaqueData))) + { + if (((BTPageOpaqueData *) PageGetSpecialPointer(mdbuf))->btpo_cycleid < MAX_BT_CYCLE_ID) + { + /* assume btree */ + RmgrTable[RM_BTREE_ID].rm_mask(mdbuf_masked, blkno); + RmgrTable[RM_BTREE_ID].rm_mask(pageserver_masked, blkno); + + if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0) + { + neon_log(PANIC, "btree buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n", + blkno, + RelFileInfoFmt(InfoFromSMgrRel(reln)), + forkNum, + (uint32) (request_lsn >> 32), (uint32) request_lsn, + hexdump_page(mdbuf_masked), + hexdump_page(pageserver_masked)); + } + } + } + } + } +#endif +} +#endif #ifdef DEBUG_COMPARE_LOCAL static char * @@ -2664,7 +3179,72 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo if (mdexists(reln, forknum)) { /* It exists locally. Guess it's unlogged then. 
*/ +#if PG_MAJORVERSION_NUM >= 17 + mdwritev(reln, forknum, blocknum, &buffer, 1, skipFsync); +#else mdwrite(reln, forknum, blocknum, buffer, skipFsync); +#endif + /* + * We could set relpersistence now that we have determined + * that it's local. But we don't dare to do it, because that + * would immediately allow reads as well, which shouldn't + * happen. We could cache it with a different 'relpersistence' + * value, but this isn't performance critical. + */ + return; + } + break; + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + #if PG_MAJORVERSION_NUM >= 17 + mdwritev(reln, forknum, blocknum, &buffer, 1, skipFsync); + #else + mdwrite(reln, forknum, blocknum, buffer, skipFsync); + #endif + return; + default: + neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + + neon_wallog_page(reln, forknum, blocknum, buffer, false); + + lsn = PageGetLSN((Page) buffer); + neon_log(SmgrTrace, "smgrwrite called for %u/%u/%u.%u blk %u, page LSN: %X/%08X", + RelFileInfoFmt(InfoFromSMgrRel(reln)), + forknum, blocknum, + (uint32) (lsn >> 32), (uint32) lsn); + + lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer); + +#ifdef DEBUG_COMPARE_LOCAL + if (IS_LOCAL_REL(reln)) + #if PG_MAJORVERSION_NUM >= 17 + mdwritev(reln, forknum, blocknum, &buffer, 1, skipFsync); + #else + mdwrite(reln, forknum, blocknum, buffer, skipFsync); + #endif +#endif +} + + + +#if PG_MAJORVERSION_NUM >= 17 +void +neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, + const void **buffers, BlockNumber nblocks, bool skipFsync) +{ + switch (reln->smgr_relpersistence) + { + case 0: + /* This is a bit tricky. Check if the relation exists locally */ + if (mdexists(reln, forknum)) + { + /* It exists locally. Guess it's unlogged then. 
*/ + mdwritev(reln, forknum, blkno, buffers, nblocks, skipFsync); /* * We could set relpersistence now that we have determined @@ -2682,29 +3262,24 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo case RELPERSISTENCE_TEMP: case RELPERSISTENCE_UNLOGGED: - mdwrite(reln, forknum, blocknum, buffer, skipFsync); + mdwritev(reln, forknum, blkno, buffers, nblocks, skipFsync); return; - default: neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } - neon_wallog_page(reln, forknum, blocknum, buffer, false); + neon_wallog_pagev(reln, forknum, blkno, nblocks, (const char **) buffers, false); - lsn = PageGetLSN((Page) buffer); - neon_log(SmgrTrace, "smgrwrite called for %u/%u/%u.%u blk %u, page LSN: %X/%08X", - RelFileInfoFmt(InfoFromSMgrRel(reln)), - forknum, blocknum, - (uint32) (lsn >> 32), (uint32) lsn); - - lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer); + lfc_writev(InfoFromSMgrRel(reln), forknum, blkno, buffers, nblocks); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) - mdwrite(reln, forknum, blocknum, buffer, skipFsync); + mdwritev(reln, forknum, blocknum, &buffer, 1, skipFsync); #endif } +#endif + /* * neon_nblocks() -- Get the number of blocks stored in a relation. 
*/ @@ -2740,7 +3315,9 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum) return n_blocks; } - request_lsns = neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, REL_METADATA_PSEUDO_BLOCKNO); + neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, + REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1, NULL); + { NeonNblocksRequest request = { .req.tag = T_NeonNblocksRequest, @@ -2798,7 +3375,9 @@ neon_dbsize(Oid dbNode) neon_request_lsns request_lsns; NRelFileInfo dummy_node = {0}; - request_lsns = neon_get_request_lsns(dummy_node, MAIN_FORKNUM, REL_METADATA_PSEUDO_BLOCKNO); + neon_get_request_lsns(dummy_node, MAIN_FORKNUM, + REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1, NULL); + { NeonDbSizeRequest request = { .req.tag = T_NeonDbSizeRequest, @@ -2939,6 +3518,38 @@ neon_immedsync(SMgrRelation reln, ForkNumber forknum) #endif } +#if PG_MAJORVERSION_NUM >= 17 +void +neon_regisersync(SMgrRelation reln, ForkNumber forknum) +{ + switch (reln->smgr_relpersistence) + { + case 0: + neon_log(ERROR, "cannot call smgrregistersync() on rel with unknown persistence"); + break; + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + mdregistersync(reln, forknum); + return; + + default: + neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + + neon_log(SmgrTrace, "[NEON_SMGR] registersync noop"); + +#ifdef DEBUG_COMPARE_LOCAL + if (IS_LOCAL_REL(reln)) + mdimmedsync(reln, forknum); +#endif +} +#endif + + /* * neon_start_unlogged_build() -- Starting build operation on a rel. 
* @@ -3088,8 +3699,11 @@ neon_end_unlogged_build(SMgrRelation reln) static int neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buffer) { - XLogRecPtr request_lsn, - not_modified_since; + XLogRecPtr request_lsn, + not_modified_since; + SlruKind kind; + int n_blocks; + shardno_t shard_no = 0; /* All SLRUs are at shard 0 */ /* * Compute a request LSN to use, similar to neon_get_request_lsns() but the @@ -3119,32 +3733,30 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf */ not_modified_since = nm_adjust_lsn(GetRedoStartLsn()); - SlruKind kind; - - if (STRPREFIX(path, "pg_xact")) - kind = SLRU_CLOG; - else if (STRPREFIX(path, "pg_multixact/members")) - kind = SLRU_MULTIXACT_MEMBERS; - else if (STRPREFIX(path, "pg_multixact/offsets")) - kind = SLRU_MULTIXACT_OFFSETS; - else - return -1; + if (STRPREFIX(path, "pg_xact")) + kind = SLRU_CLOG; + else if (STRPREFIX(path, "pg_multixact/members")) + kind = SLRU_MULTIXACT_MEMBERS; + else if (STRPREFIX(path, "pg_multixact/offsets")) + kind = SLRU_MULTIXACT_OFFSETS; + else + return -1; NeonResponse *resp; NeonGetSlruSegmentRequest request = { .req.tag = T_NeonGetSlruSegmentRequest, .req.lsn = request_lsn, .req.not_modified_since = not_modified_since, - .kind = kind, .segno = segno }; - int n_blocks; - shardno_t shard_no = 0; /* All SLRUs are at shard 0 */ + do { while (!page_server->send(shard_no, &request.req) || !page_server->flush(shard_no)); + consume_prefetch_responses(); + resp = page_server->receive(shard_no); } while (resp == NULL); @@ -3223,14 +3835,23 @@ static const struct f_smgr neon_smgr = #if PG_MAJORVERSION_NUM >= 16 .smgr_zeroextend = neon_zeroextend, #endif +#if PG_MAJORVERSION_NUM >= 17 + .smgr_prefetch = neon_prefetch, + .smgr_readv = neon_readv, + .smgr_writev = neon_writev, +#else .smgr_prefetch = neon_prefetch, .smgr_read = neon_read, .smgr_write = neon_write, +#endif + .smgr_writeback = neon_writeback, .smgr_nblocks = neon_nblocks, 
.smgr_truncate = neon_truncate, .smgr_immedsync = neon_immedsync, - +#if PG_MAJORVERSION_NUM >= 17 + .smgr_registersync = neon_regisersync, +#endif .smgr_start_unlogged_build = neon_start_unlogged_build, .smgr_finish_unlogged_build_phase_1 = neon_finish_unlogged_build_phase_1, .smgr_end_unlogged_build = neon_end_unlogged_build, @@ -3239,11 +3860,11 @@ static const struct f_smgr neon_smgr = }; const f_smgr * -smgr_neon(BackendId backend, NRelFileInfo rinfo) +smgr_neon(ProcNumber backend, NRelFileInfo rinfo) { /* Don't use page server for temp relations */ - if (backend != InvalidBackendId) + if (backend != INVALID_PROC_NUMBER) return smgr_standard(backend, rinfo); else return &neon_smgr; diff --git a/pgxn/neon/relsize_cache.c b/pgxn/neon/relsize_cache.c index cc7ac2c394..2a4c2dc799 100644 --- a/pgxn/neon/relsize_cache.c +++ b/pgxn/neon/relsize_cache.c @@ -110,7 +110,8 @@ get_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber *size) tag.rinfo = rinfo; tag.forknum = forknum; - LWLockAcquire(relsize_lock, LW_SHARED); + /* We need exclusive lock here because of LRU list manipulation */ + LWLockAcquire(relsize_lock, LW_EXCLUSIVE); entry = hash_search(relsize_hash, &tag, HASH_FIND, NULL); if (entry != NULL) { diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index c53257923a..c1914421ec 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -1038,9 +1038,12 @@ DetermineEpochStartLsn(WalProposer *wp) if (SkipXLogPageHeader(wp, wp->propEpochStartLsn) != wp->api.get_redo_start_lsn(wp)) { /* - * However, allow to proceed if previously elected leader was me; - * plain restart of walproposer not intervened by concurrent - * compute (who could generate WAL) is ok. + * However, allow to proceed if last_log_term on the node which gave + * the highest vote (i.e. point where we are going to start writing) + * actually had been won by me; plain restart of walproposer not + * intervened by concurrent compute which wrote WAL is ok. 
+ * + * This avoids compute crash after manual term_bump. */ if (!((dth->n_entries >= 1) && (dth->entries[dth->n_entries - 1].term == pg_atomic_read_u64(&walprop_shared->mineLastElectedTerm)))) @@ -1442,12 +1445,17 @@ RecvAppendResponses(Safekeeper *sk) if (sk->appendResponse.term > wp->propTerm) { /* - * Another compute with higher term is running. Panic to restart - * PG as we likely need to retake basebackup. However, don't dump - * core as this is kinda expected scenario. + * + * Term has changed to higher one, probably another compute is + * running. If this is the case we could PANIC as well because + * likely it inserted some data and our basebackup is unsuitable + * anymore. However, we also bump term manually (term_bump endpoint) + * on safekeepers for migration purposes, in this case we do want + * compute to stay alive. So restart walproposer with FATAL instead + * of panicking; if basebackup is spoiled next election will notice + * this. */ - disable_core_dump(); - wp_log(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT ", meaning another compute is running at the same time, and it conflicts with us", + wp_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT ", meaning another compute is running at the same time, and it conflicts with us", sk->host, sk->port, sk->appendResponse.term, wp->propTerm); } diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index f3ddc64061..4d0d06e6de 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -81,6 +81,7 @@ static void nwp_register_gucs(void); static void assign_neon_safekeepers(const char *newval, void *extra); static void nwp_prepare_shmem(void); static uint64 backpressure_lag_impl(void); +static uint64 startup_backpressure_wrap(void); static bool backpressure_throttling_impl(void); static void walprop_register_bgworker(void); @@ -90,7 +91,7 @@ static void 
walprop_pg_init_bgworker(void); static TimestampTz walprop_pg_get_current_timestamp(WalProposer *wp); static void walprop_pg_load_libpqwalreceiver(void); -static process_interrupts_callback_t PrevProcessInterruptsCallback; +static process_interrupts_callback_t PrevProcessInterruptsCallback = NULL; static shmem_startup_hook_type prev_shmem_startup_hook_type; #if PG_VERSION_NUM >= 150000 static shmem_request_hook_type prev_shmem_request_hook = NULL; @@ -178,7 +179,7 @@ pg_init_walproposer(void) nwp_prepare_shmem(); - delay_backend_us = &backpressure_lag_impl; + delay_backend_us = &startup_backpressure_wrap; PrevProcessInterruptsCallback = ProcessInterruptsCallback; ProcessInterruptsCallback = backpressure_throttling_impl; @@ -220,6 +221,64 @@ nwp_register_gucs(void) NULL, NULL, NULL); } + +static int +split_safekeepers_list(char *safekeepers_list, char *safekeepers[]) +{ + int n_safekeepers = 0; + char *curr_sk = safekeepers_list; + + for (char *coma = safekeepers_list; coma != NULL && *coma != '\0'; curr_sk = coma) + { + if (++n_safekeepers >= MAX_SAFEKEEPERS) { + wpg_log(FATAL, "too many safekeepers"); + } + + coma = strchr(coma, ','); + safekeepers[n_safekeepers-1] = curr_sk; + + if (coma != NULL) { + *coma++ = '\0'; + } + } + + return n_safekeepers; +} + +/* + * Accept two coma-separated strings with list of safekeeper host:port addresses. + * Split them into arrays and return false if two sets do not match, ignoring the order. 
+ */ +static bool +safekeepers_cmp(char *old, char *new) +{ + char *safekeepers_old[MAX_SAFEKEEPERS]; + char *safekeepers_new[MAX_SAFEKEEPERS]; + int len_old = 0; + int len_new = 0; + + len_old = split_safekeepers_list(old, safekeepers_old); + len_new = split_safekeepers_list(new, safekeepers_new); + + if (len_old != len_new) + { + return false; + } + + qsort(&safekeepers_old, len_old, sizeof(char *), pg_qsort_strcmp); + qsort(&safekeepers_new, len_new, sizeof(char *), pg_qsort_strcmp); + + for (int i = 0; i < len_new; i++) + { + if (strcmp(safekeepers_old[i], safekeepers_new[i]) != 0) + { + return false; + } + } + + return true; +} + /* * GUC assign_hook for neon.safekeepers. Restarts walproposer through FATAL if * the list changed. @@ -235,19 +294,26 @@ assign_neon_safekeepers(const char *newval, void *extra) wpg_log(FATAL, "neon.safekeepers is empty"); } + /* Copy values because we will modify them in split_safekeepers_list() */ + char *newval_copy = pstrdup(newval); + char *oldval = pstrdup(wal_acceptors_list); + /* * TODO: restarting through FATAL is stupid and introduces 1s delay before * next bgw start. We should refactor walproposer to allow graceful exit and * thus remove this delay. + * XXX: If you change anything here, sync with test_safekeepers_reconfigure_reorder. */ - if (strcmp(wal_acceptors_list, newval) != 0) + if (!safekeepers_cmp(oldval, newval_copy)) { wpg_log(FATAL, "restarting walproposer to change safekeeper list from %s to %s", wal_acceptors_list, newval); } + pfree(newval_copy); + pfree(oldval); } -/* Check if we need to suspend inserts because of lagging replication. */ +/* Check if we need to suspend inserts because of lagging replication. 
*/ static uint64 backpressure_lag_impl(void) { @@ -287,6 +353,22 @@ backpressure_lag_impl(void) return 0; } +/* + * We don't apply backpressure when we're the postmaster, or the startup + * process, because in postmaster we can't apply backpressure, and in + * the startup process we can't afford to slow down. + */ +static uint64 +startup_backpressure_wrap(void) +{ + if (AmStartupProcess() || !IsUnderPostmaster) + return 0; + + delay_backend_us = &backpressure_lag_impl; + + return backpressure_lag_impl(); +} + /* * WalproposerShmemSize --- report amount of shared memory space needed */ @@ -336,12 +418,13 @@ WalproposerShmemInit_SyncSafekeeper(void) static bool backpressure_throttling_impl(void) { - int64 lag; + uint64 lag; TimestampTz start, stop; - bool retry = PrevProcessInterruptsCallback - ? PrevProcessInterruptsCallback() - : false; + bool retry = false; + + if (PointerIsValid(PrevProcessInterruptsCallback)) + retry = PrevProcessInterruptsCallback(); /* * Don't throttle read only transactions or wal sender. 
Do throttle CREATE @@ -537,7 +620,12 @@ walprop_pg_init_walsender(void) /* Create replication slot for WAL proposer if not exists */ if (SearchNamedReplicationSlot(WAL_PROPOSER_SLOT_NAME, false) == NULL) { +#if PG_MAJORVERSION_NUM >= 17 + ReplicationSlotCreate(WAL_PROPOSER_SLOT_NAME, false, RS_PERSISTENT, + false, false, false); +#else ReplicationSlotCreate(WAL_PROPOSER_SLOT_NAME, false, RS_PERSISTENT, false); +#endif ReplicationSlotReserveWal(); /* Write this slot to disk */ ReplicationSlotMarkDirty(); @@ -1444,7 +1532,11 @@ walprop_pg_init_event_set(WalProposer *wp) wpg_log(FATAL, "double-initialization of event set"); /* for each sk, we have socket plus potentially socket for neon walreader */ +#if PG_MAJORVERSION_NUM >= 17 + waitEvents = CreateWaitEventSet(NULL, 2 + 2 * wp->n_safekeepers); +#else waitEvents = CreateWaitEventSet(TopMemoryContext, 2 + 2 * wp->n_safekeepers); +#endif AddWaitEventToSet(waitEvents, WL_LATCH_SET, PGINVALID_SOCKET, MyLatch, NULL); AddWaitEventToSet(waitEvents, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, diff --git a/pgxn/neon_rmgr/neon_rmgr_decode.c b/pgxn/neon_rmgr/neon_rmgr_decode.c index f327e132e9..66032c88f6 100644 --- a/pgxn/neon_rmgr/neon_rmgr_decode.c +++ b/pgxn/neon_rmgr/neon_rmgr_decode.c @@ -1,6 +1,7 @@ #include "postgres.h" #if PG_MAJORVERSION_NUM >= 16 + #include "access/heapam_xlog.h" #include "access/neon_xlog.h" #include "replication/decode.h" @@ -9,6 +10,10 @@ #include "neon_rmgr.h" +#endif /* PG >= 16 */ + +#if PG_MAJORVERSION_NUM == 16 + /* individual record(group)'s handlers */ static void DecodeNeonInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); static void DecodeNeonUpdate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); @@ -399,6 +404,398 @@ DecodeXLogTuple(char *data, Size len, ReorderBufferTupleBuf *tuple) header->t_infomask2 = xlhdr.t_infomask2; header->t_hoff = xlhdr.t_hoff; } +#endif + +#if PG_MAJORVERSION_NUM == 17 + +/* individual record(group)'s handlers */ +static void 
DecodeNeonInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); +static void DecodeNeonUpdate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); +static void DecodeNeonDelete(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); +static void DecodeNeonMultiInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); + +/* common function to decode tuples */ +static void DecodeXLogTuple(char *data, Size len, HeapTuple tuple); -#endif \ No newline at end of file +void +neon_rm_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + uint8 info = XLogRecGetInfo(buf->record) & XLOG_NEON_OPMASK; + TransactionId xid = XLogRecGetXid(buf->record); + SnapBuild *builder = ctx->snapshot_builder; + + ReorderBufferProcessXid(ctx->reorder, xid, buf->origptr); + + /* + * If we don't have snapshot or we are just fast-forwarding, there is no + * point in decoding data changes. + */ + if (SnapBuildCurrentState(builder) < SNAPBUILD_FULL_SNAPSHOT || + ctx->fast_forward) + return; + + switch (info) + { + case XLOG_NEON_HEAP_INSERT: + if (SnapBuildProcessChange(builder, xid, buf->origptr)) + DecodeNeonInsert(ctx, buf); + break; + case XLOG_NEON_HEAP_DELETE: + if (SnapBuildProcessChange(builder, xid, buf->origptr)) + DecodeNeonDelete(ctx, buf); + break; + case XLOG_NEON_HEAP_UPDATE: + case XLOG_NEON_HEAP_HOT_UPDATE: + if (SnapBuildProcessChange(builder, xid, buf->origptr)) + DecodeNeonUpdate(ctx, buf); + break; + case XLOG_NEON_HEAP_LOCK: + break; + case XLOG_NEON_HEAP_MULTI_INSERT: + if (SnapBuildProcessChange(builder, xid, buf->origptr)) + DecodeNeonMultiInsert(ctx, buf); + break; + default: + elog(ERROR, "unexpected RM_HEAP_ID record type: %u", info); + break; + } +} + +static inline bool +FilterByOrigin(LogicalDecodingContext *ctx, RepOriginId origin_id) +{ + if (ctx->callbacks.filter_by_origin_cb == NULL) + return false; + + return filter_by_origin_cb_wrapper(ctx, origin_id); +} + +/* + * Parse XLOG_HEAP_INSERT (not MULTI_INSERT!) records into tuplebufs. 
+ * + * Inserts can contain the new tuple. + */ +static void +DecodeNeonInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + Size datalen; + char *tupledata; + Size tuplelen; + XLogReaderState *r = buf->record; + xl_neon_heap_insert *xlrec; + ReorderBufferChange *change; + RelFileLocator target_locator; + + xlrec = (xl_neon_heap_insert *) XLogRecGetData(r); + + /* + * Ignore insert records without new tuples (this does happen when + * raw_heap_insert marks the TOAST record as HEAP_INSERT_NO_LOGICAL). + */ + if (!(xlrec->flags & XLH_INSERT_CONTAINS_NEW_TUPLE)) + return; + + /* only interested in our database */ + XLogRecGetBlockTag(r, 0, &target_locator, NULL, NULL); + if (target_locator.dbOid != ctx->slot->data.database) + return; + + /* output plugin doesn't look for this origin, no need to queue */ + if (FilterByOrigin(ctx, XLogRecGetOrigin(r))) + return; + + change = ReorderBufferGetChange(ctx->reorder); + if (!(xlrec->flags & XLH_INSERT_IS_SPECULATIVE)) + change->action = REORDER_BUFFER_CHANGE_INSERT; + else + change->action = REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT; + change->origin_id = XLogRecGetOrigin(r); + + memcpy(&change->data.tp.rlocator, &target_locator, sizeof(RelFileLocator)); + + tupledata = XLogRecGetBlockData(r, 0, &datalen); + /* + * DecodeXLogTuple() strips SizeOfNeonHeapHeader bytes from the block + * data, so size the tuple buffer with the same header size. + */ + tuplelen = datalen - SizeOfNeonHeapHeader; + + change->data.tp.newtuple = + ReorderBufferGetTupleBuf(ctx->reorder, tuplelen); + + DecodeXLogTuple(tupledata, datalen, change->data.tp.newtuple); + + change->data.tp.clear_toast_afterwards = true; + + ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr, + change, + xlrec->flags & XLH_INSERT_ON_TOAST_RELATION); +} + +/* + * Parse XLOG_HEAP_DELETE from wal into proper tuplebufs. + * + * Deletes can possibly contain the old primary key.
+ */ +static void +DecodeNeonDelete(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + XLogReaderState *r = buf->record; + xl_neon_heap_delete *xlrec; + ReorderBufferChange *change; + RelFileLocator target_locator; + + xlrec = (xl_neon_heap_delete *) XLogRecGetData(r); + + /* only interested in our database */ + XLogRecGetBlockTag(r, 0, &target_locator, NULL, NULL); + if (target_locator.dbOid != ctx->slot->data.database) + return; + + /* output plugin doesn't look for this origin, no need to queue */ + if (FilterByOrigin(ctx, XLogRecGetOrigin(r))) + return; + + change = ReorderBufferGetChange(ctx->reorder); + + if (xlrec->flags & XLH_DELETE_IS_SUPER) + change->action = REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT; + else + change->action = REORDER_BUFFER_CHANGE_DELETE; + + change->origin_id = XLogRecGetOrigin(r); + + memcpy(&change->data.tp.rlocator, &target_locator, sizeof(RelFileLocator)); + + /* old primary key stored */ + if (xlrec->flags & XLH_DELETE_CONTAINS_OLD) + { + /* + * The old tuple is read from xlrec + SizeOfNeonHeapDelete below, so + * the remaining payload is the record length minus that struct size; + * DecodeXLogTuple() strips the xl_neon_heap_header itself. + */ + Size datalen = XLogRecGetDataLen(r) - SizeOfNeonHeapDelete; + Size tuplelen = datalen - SizeOfNeonHeapHeader; + + Assert(XLogRecGetDataLen(r) > (SizeOfNeonHeapDelete + SizeOfNeonHeapHeader)); + + change->data.tp.oldtuple = + ReorderBufferGetTupleBuf(ctx->reorder, tuplelen); + + DecodeXLogTuple((char *) xlrec + SizeOfNeonHeapDelete, + datalen, change->data.tp.oldtuple); + } + + change->data.tp.clear_toast_afterwards = true; + + ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr, + change, false); +} + +/* + * Parse XLOG_HEAP_UPDATE and XLOG_HEAP_HOT_UPDATE, which have the same layout + * in the record, from wal into proper tuplebufs. + * + * Updates can possibly contain a new tuple and the old primary key.
+ */ +static void +DecodeNeonUpdate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + XLogReaderState *r = buf->record; + xl_neon_heap_update *xlrec; + ReorderBufferChange *change; + char *data; + RelFileLocator target_locator; + + xlrec = (xl_neon_heap_update *) XLogRecGetData(r); + + /* only interested in our database */ + XLogRecGetBlockTag(r, 0, &target_locator, NULL, NULL); + if (target_locator.dbOid != ctx->slot->data.database) + return; + + /* output plugin doesn't look for this origin, no need to queue */ + if (FilterByOrigin(ctx, XLogRecGetOrigin(r))) + return; + + change = ReorderBufferGetChange(ctx->reorder); + change->action = REORDER_BUFFER_CHANGE_UPDATE; + change->origin_id = XLogRecGetOrigin(r); + memcpy(&change->data.tp.rlocator, &target_locator, sizeof(RelFileLocator)); + + if (xlrec->flags & XLH_UPDATE_CONTAINS_NEW_TUPLE) + { + Size datalen; + Size tuplelen; + + data = XLogRecGetBlockData(r, 0, &datalen); + + tuplelen = datalen - SizeOfNeonHeapHeader; + + change->data.tp.newtuple = + ReorderBufferGetTupleBuf(ctx->reorder, tuplelen); + + DecodeXLogTuple(data, datalen, change->data.tp.newtuple); + } + + if (xlrec->flags & XLH_UPDATE_CONTAINS_OLD) + { + Size datalen; + Size tuplelen; + + /* caution, remaining data in record is not aligned */ + data = XLogRecGetData(r) + SizeOfNeonHeapUpdate; + datalen = XLogRecGetDataLen(r) - SizeOfNeonHeapUpdate; + tuplelen = datalen - SizeOfNeonHeapHeader; + + change->data.tp.oldtuple = + ReorderBufferGetTupleBuf(ctx->reorder, tuplelen); + + DecodeXLogTuple(data, datalen, change->data.tp.oldtuple); + } + + change->data.tp.clear_toast_afterwards = true; + + ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr, + change, false); +} + +/* + * Decode XLOG_HEAP2_MULTI_INSERT_insert record into multiple tuplebufs. + * + * Currently MULTI_INSERT will always contain the full tuples. 
+ */ +static void +DecodeNeonMultiInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + XLogReaderState *r = buf->record; + xl_neon_heap_multi_insert *xlrec; + int i; + char *data; + char *tupledata; + Size tuplelen; + RelFileLocator rlocator; + + xlrec = (xl_neon_heap_multi_insert *) XLogRecGetData(r); + + /* + * Ignore insert records without new tuples. This happens when a + * multi_insert is done on a catalog or on a non-persistent relation. + */ + if (!(xlrec->flags & XLH_INSERT_CONTAINS_NEW_TUPLE)) + return; + + /* only interested in our database */ + XLogRecGetBlockTag(r, 0, &rlocator, NULL, NULL); + if (rlocator.dbOid != ctx->slot->data.database) + return; + + /* output plugin doesn't look for this origin, no need to queue */ + if (FilterByOrigin(ctx, XLogRecGetOrigin(r))) + return; + + /* + * We know that this multi_insert isn't for a catalog, so the block should + * always have data even if a full-page write of it is taken. + */ + tupledata = XLogRecGetBlockData(r, 0, &tuplelen); + Assert(tupledata != NULL); + + data = tupledata; + for (i = 0; i < xlrec->ntuples; i++) + { + ReorderBufferChange *change; + xl_neon_multi_insert_tuple *xlhdr; + int datalen; + HeapTuple tuple; + HeapTupleHeader header; + + change = ReorderBufferGetChange(ctx->reorder); + change->action = REORDER_BUFFER_CHANGE_INSERT; + change->origin_id = XLogRecGetOrigin(r); + + memcpy(&change->data.tp.rlocator, &rlocator, sizeof(RelFileLocator)); + + xlhdr = (xl_neon_multi_insert_tuple *) SHORTALIGN(data); + data = ((char *) xlhdr) + SizeOfNeonMultiInsertTuple; + datalen = xlhdr->datalen; + + change->data.tp.newtuple = + ReorderBufferGetTupleBuf(ctx->reorder, datalen); + + tuple = change->data.tp.newtuple; + header = tuple->t_data; + + /* not a disk based tuple */ + ItemPointerSetInvalid(&tuple->t_self); + + /* + * We can only figure this out after reassembling the transactions. 
+ */ + tuple->t_tableOid = InvalidOid; + + tuple->t_len = datalen + SizeofHeapTupleHeader; + + memset(header, 0, SizeofHeapTupleHeader); + + memcpy((char *) tuple->t_data + SizeofHeapTupleHeader, + (char *) data, + datalen); + header->t_infomask = xlhdr->t_infomask; + header->t_infomask2 = xlhdr->t_infomask2; + header->t_hoff = xlhdr->t_hoff; + + /* + * Reset toast reassembly state only after the last row in the last + * xl_multi_insert_tuple record emitted by one heap_multi_insert() + * call. + */ + if (xlrec->flags & XLH_INSERT_LAST_IN_MULTI && + (i + 1) == xlrec->ntuples) + change->data.tp.clear_toast_afterwards = true; + else + change->data.tp.clear_toast_afterwards = false; + + ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), + buf->origptr, change, false); + + /* move to the next xl_neon_multi_insert_tuple entry */ + data += datalen; + } + Assert(data == tupledata + tuplelen); +} + +/* + * Read a HeapTuple as WAL logged by heap_insert, heap_update and heap_delete + * (but not by heap_multi_insert) into a tuplebuf. + * + * The size 'len' and the pointer 'data' in the record need to be + * computed outside as they are record specific. 
+ */ +static void +DecodeXLogTuple(char *data, Size len, HeapTuple tuple) +{ + xl_neon_heap_header xlhdr; + int datalen = len - SizeOfNeonHeapHeader; + HeapTupleHeader header; + + Assert(datalen >= 0); + + tuple->t_len = datalen + SizeofHeapTupleHeader; + header = tuple->t_data; + + /* not a disk based tuple */ + ItemPointerSetInvalid(&tuple->t_self); + + /* we can only figure this out after reassembling the transactions */ + tuple->t_tableOid = InvalidOid; + + /* data is not stored aligned, copy to aligned storage */ + memcpy((char *) &xlhdr, + data, + SizeOfNeonHeapHeader); + + memset(header, 0, SizeofHeapTupleHeader); + + memcpy(((char *) tuple->t_data) + SizeofHeapTupleHeader, + data + SizeOfNeonHeapHeader, + datalen); + + header->t_infomask = xlhdr.t_infomask; + header->t_infomask2 = xlhdr.t_infomask2; + header->t_hoff = xlhdr.t_hoff; +} +#endif diff --git a/pgxn/neon_walredo/inmem_smgr.c b/pgxn/neon_walredo/inmem_smgr.c index 4e604a710c..a45e8f5c4a 100644 --- a/pgxn/neon_walredo/inmem_smgr.c +++ b/pgxn/neon_walredo/inmem_smgr.c @@ -68,8 +68,13 @@ static void inmem_close(SMgrRelation reln, ForkNumber forknum); static void inmem_create(SMgrRelation reln, ForkNumber forknum, bool isRedo); static bool inmem_exists(SMgrRelation reln, ForkNumber forknum); static void inmem_unlink(NRelFileInfoBackend rinfo, ForkNumber forknum, bool isRedo); +#if PG_MAJORVERSION_NUM >= 17 +static bool inmem_prefetch(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, int nblocks); +#else static bool inmem_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum); +#endif #if PG_MAJORVERSION_NUM < 16 static void inmem_extend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync); @@ -93,7 +98,9 @@ static BlockNumber inmem_nblocks(SMgrRelation reln, ForkNumber forknum); static void inmem_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks); static void inmem_immedsync(SMgrRelation reln, ForkNumber forknum); - 
+#if PG_MAJORVERSION_NUM >= 17 +static void inmem_registersync(SMgrRelation reln, ForkNumber forknum); +#endif /* * inmem_init() -- Initialize private state @@ -190,6 +197,14 @@ inmem_close(SMgrRelation reln, ForkNumber forknum) { } +#if PG_MAJORVERSION_NUM >= 17 +static bool +inmem_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + int nblocks) +{ + return true; +} +#else /* * inmem_prefetch() -- Initiate asynchronous read of the specified block of a relation */ @@ -198,6 +213,7 @@ inmem_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) { return true; } +#endif /* * inmem_writeback() -- Tell the kernel to write pages back to storage. @@ -211,11 +227,13 @@ inmem_writeback(SMgrRelation reln, ForkNumber forknum, /* * inmem_read() -- Read the specified block from a relation. */ +#if PG_MAJORVERSION_NUM < 16 static void inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, -#if PG_MAJORVERSION_NUM < 16 char *buffer) #else +static void +inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, void *buffer) #endif { @@ -228,6 +246,18 @@ inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, memcpy(buffer, page_body[pg], BLCKSZ); } +#if PG_MAJORVERSION_NUM >= 17 +/* + * inmem_readv() -- Read nblocks consecutive blocks starting at blkno, + * one block per entry of buffers, by calling inmem_read() for each. + */ +static void +inmem_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, + void **buffers, BlockNumber nblocks) +{ + for (int i = 0; i < nblocks; i++) + { + /* buffers[i] receives block blkno + i, not blkno every time */ + inmem_read(reln, forknum, blkno + i, buffers[i]); + } +} +#endif + /* * inmem_write() -- Write the supplied block at the appropriate location.
* @@ -280,6 +310,18 @@ inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, memcpy(page_body[pg], buffer, BLCKSZ); } +#if PG_MAJORVERSION_NUM >= 17 +/* + * inmem_writev() -- Write nblocks consecutive blocks starting at blkno, + * one block per entry of buffers, by calling inmem_write() for each. + */ +static void +inmem_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, + const void **buffers, BlockNumber nblocks, bool skipFsync) +{ + for (int i = 0; i < nblocks; i++) + { + /* buffers[i] holds block blkno + i, not blkno every time */ + inmem_write(reln, forknum, blkno + i, buffers[i], skipFsync); + } +} +#endif + /* * inmem_nblocks() -- Get the number of blocks stored in a relation. */ @@ -315,6 +357,13 @@ inmem_immedsync(SMgrRelation reln, ForkNumber forknum) { } +#if PG_MAJORVERSION_NUM >= 17 +static void +inmem_registersync(SMgrRelation reln, ForkNumber forknum) +{ +} +#endif + static const struct f_smgr inmem_smgr = { .smgr_init = inmem_init, @@ -328,23 +377,39 @@ static const struct f_smgr inmem_smgr = #if PG_MAJORVERSION_NUM >= 16 .smgr_zeroextend = inmem_zeroextend, #endif +#if PG_MAJORVERSION_NUM >= 17 + .smgr_prefetch = inmem_prefetch, + .smgr_readv = inmem_readv, + .smgr_writev = inmem_writev, +#else .smgr_prefetch = inmem_prefetch, .smgr_read = inmem_read, .smgr_write = inmem_write, +#endif .smgr_writeback = inmem_writeback, .smgr_nblocks = inmem_nblocks, .smgr_truncate = inmem_truncate, .smgr_immedsync = inmem_immedsync, + +#if PG_MAJORVERSION_NUM >= 17 + .smgr_registersync = inmem_registersync, +#endif + + .smgr_start_unlogged_build = NULL, + .smgr_finish_unlogged_build_phase_1 = NULL, + .smgr_end_unlogged_build = NULL, + .smgr_read_slru_segment = NULL, }; const f_smgr * -smgr_inmem(BackendId backend, NRelFileInfo rinfo) +smgr_inmem(ProcNumber backend, NRelFileInfo rinfo) { Assert(InRecovery); - if (backend != InvalidBackendId) - return smgr_standard(backend, rinfo); - else - return &inmem_smgr; + // // What does this code do?
+ // if (backend != INVALID_PROC_NUMBER) + // return smgr_standard(backend, rinfo); + // else + return &inmem_smgr; } void diff --git a/pgxn/neon_walredo/inmem_smgr.h b/pgxn/neon_walredo/inmem_smgr.h index 58b98b8e6a..91f1c80965 100644 --- a/pgxn/neon_walredo/inmem_smgr.h +++ b/pgxn/neon_walredo/inmem_smgr.h @@ -11,7 +11,7 @@ #ifndef INMEM_SMGR_H #define INMEM_SMGR_H -extern const f_smgr *smgr_inmem(BackendId backend, NRelFileInfo rinfo); +extern const f_smgr *smgr_inmem(ProcNumber backend, NRelFileInfo rinfo); extern void smgr_init_inmem(void); #endif /* INMEM_SMGR_H */ diff --git a/pgxn/neon_walredo/walredoproc.c b/pgxn/neon_walredo/walredoproc.c index cc545393f5..219ca85207 100644 --- a/pgxn/neon_walredo/walredoproc.c +++ b/pgxn/neon_walredo/walredoproc.c @@ -100,6 +100,9 @@ #include "storage/buf_internals.h" #include "storage/bufmgr.h" #include "storage/dsm.h" +#if PG_MAJORVERSION_NUM >= 17 +#include "storage/dsm_registry.h" +#endif #include "storage/ipc.h" #include "storage/pg_shmem.h" #include "storage/pmsignal.h" @@ -137,7 +140,7 @@ static BufferTag target_redo_tag; static XLogReaderState *reader_state; -#define TRACE DEBUG5 +#define TRACE LOG #ifdef HAVE_LIBSECCOMP @@ -517,6 +520,10 @@ CreateFakeSharedMemoryAndSemaphores() /* * Set up xlog, clog, and buffers */ +#if PG_MAJORVERSION_NUM >= 17 + DSMRegistryShmemInit(); + VarsupShmemInit(); +#endif XLOGShmemInit(); CLOGShmemInit(); CommitTsShmemInit(); @@ -566,7 +573,10 @@ CreateFakeSharedMemoryAndSemaphores() /* * Set up other modules that need some shared memory space */ +#if PG_MAJORVERSION_NUM < 17 + /* "snapshot too old" was removed in PG17, and with it the SnapMgr */ SnapMgrInit(); +#endif BTreeShmemInit(); SyncScanShmemInit(); /* Skip due to the 'pg_notify' directory check */ @@ -742,7 +752,7 @@ BeginRedoForBlock(StringInfo input_message) target_redo_tag.forkNum, target_redo_tag.blockNum); - reln = smgropen(rinfo, InvalidBackendId, RELPERSISTENCE_PERMANENT); + reln = smgropen(rinfo, INVALID_PROC_NUMBER, 
RELPERSISTENCE_PERMANENT); if (reln->smgr_cached_nblocks[forknum] == InvalidBlockNumber || reln->smgr_cached_nblocks[forknum] < blknum + 1) { diff --git a/poetry.lock b/poetry.lock index 7db91e51f7..48943a73e9 100644 --- a/poetry.lock +++ b/poetry.lock @@ -985,43 +985,38 @@ files = [ [[package]] name = "cryptography" -version = "42.0.4" +version = "43.0.1" description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." optional = false python-versions = ">=3.7" files = [ - {file = "cryptography-42.0.4-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:ffc73996c4fca3d2b6c1c8c12bfd3ad00def8621da24f547626bf06441400449"}, - {file = "cryptography-42.0.4-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:db4b65b02f59035037fde0998974d84244a64c3265bdef32a827ab9b63d61b18"}, - {file = "cryptography-42.0.4-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dad9c385ba8ee025bb0d856714f71d7840020fe176ae0229de618f14dae7a6e2"}, - {file = "cryptography-42.0.4-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:69b22ab6506a3fe483d67d1ed878e1602bdd5912a134e6202c1ec672233241c1"}, - {file = "cryptography-42.0.4-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:e09469a2cec88fb7b078e16d4adec594414397e8879a4341c6ace96013463d5b"}, - {file = "cryptography-42.0.4-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:3e970a2119507d0b104f0a8e281521ad28fc26f2820687b3436b8c9a5fcf20d1"}, - {file = "cryptography-42.0.4-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:e53dc41cda40b248ebc40b83b31516487f7db95ab8ceac1f042626bc43a2f992"}, - {file = "cryptography-42.0.4-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:c3a5cbc620e1e17009f30dd34cb0d85c987afd21c41a74352d1719be33380885"}, - {file = "cryptography-42.0.4-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:6bfadd884e7280df24d26f2186e4e07556a05d37393b0f220a840b083dc6a824"}, - {file = 
"cryptography-42.0.4-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:01911714117642a3f1792c7f376db572aadadbafcd8d75bb527166009c9f1d1b"}, - {file = "cryptography-42.0.4-cp37-abi3-win32.whl", hash = "sha256:fb0cef872d8193e487fc6bdb08559c3aa41b659a7d9be48b2e10747f47863925"}, - {file = "cryptography-42.0.4-cp37-abi3-win_amd64.whl", hash = "sha256:c1f25b252d2c87088abc8bbc4f1ecbf7c919e05508a7e8628e6875c40bc70923"}, - {file = "cryptography-42.0.4-cp39-abi3-macosx_10_12_universal2.whl", hash = "sha256:15a1fb843c48b4a604663fa30af60818cd28f895572386e5f9b8a665874c26e7"}, - {file = "cryptography-42.0.4-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a1327f280c824ff7885bdeef8578f74690e9079267c1c8bd7dc5cc5aa065ae52"}, - {file = "cryptography-42.0.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6ffb03d419edcab93b4b19c22ee80c007fb2d708429cecebf1dd3258956a563a"}, - {file = "cryptography-42.0.4-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:1df6fcbf60560d2113b5ed90f072dc0b108d64750d4cbd46a21ec882c7aefce9"}, - {file = "cryptography-42.0.4-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:44a64043f743485925d3bcac548d05df0f9bb445c5fcca6681889c7c3ab12764"}, - {file = "cryptography-42.0.4-cp39-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:3c6048f217533d89f2f8f4f0fe3044bf0b2090453b7b73d0b77db47b80af8dff"}, - {file = "cryptography-42.0.4-cp39-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:6d0fbe73728c44ca3a241eff9aefe6496ab2656d6e7a4ea2459865f2e8613257"}, - {file = "cryptography-42.0.4-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:887623fe0d70f48ab3f5e4dbf234986b1329a64c066d719432d0698522749929"}, - {file = "cryptography-42.0.4-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:ce8613beaffc7c14f091497346ef117c1798c202b01153a8cc7b8e2ebaaf41c0"}, - {file = "cryptography-42.0.4-cp39-abi3-win32.whl", hash = "sha256:810bcf151caefc03e51a3d61e53335cd5c7316c0a105cc695f0959f2c638b129"}, - {file = 
"cryptography-42.0.4-cp39-abi3-win_amd64.whl", hash = "sha256:a0298bdc6e98ca21382afe914c642620370ce0470a01e1bef6dd9b5354c36854"}, - {file = "cryptography-42.0.4-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:5f8907fcf57392cd917892ae83708761c6ff3c37a8e835d7246ff0ad251d9298"}, - {file = "cryptography-42.0.4-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:12d341bd42cdb7d4937b0cabbdf2a94f949413ac4504904d0cdbdce4a22cbf88"}, - {file = "cryptography-42.0.4-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:1cdcdbd117681c88d717437ada72bdd5be9de117f96e3f4d50dab3f59fd9ab20"}, - {file = "cryptography-42.0.4-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:0e89f7b84f421c56e7ff69f11c441ebda73b8a8e6488d322ef71746224c20fce"}, - {file = "cryptography-42.0.4-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:f1e85a178384bf19e36779d91ff35c7617c885da487d689b05c1366f9933ad74"}, - {file = "cryptography-42.0.4-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:d2a27aca5597c8a71abbe10209184e1a8e91c1fd470b5070a2ea60cafec35bcd"}, - {file = "cryptography-42.0.4-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:4e36685cb634af55e0677d435d425043967ac2f3790ec652b2b88ad03b85c27b"}, - {file = "cryptography-42.0.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:f47be41843200f7faec0683ad751e5ef11b9a56a220d57f300376cd8aba81660"}, - {file = "cryptography-42.0.4.tar.gz", hash = "sha256:831a4b37accef30cccd34fcb916a5d7b5be3cbbe27268a02832c3e450aea39cb"}, + {file = "cryptography-43.0.1-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:8385d98f6a3bf8bb2d65a73e17ed87a3ba84f6991c155691c51112075f9ffc5d"}, + {file = "cryptography-43.0.1-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:27e613d7077ac613e399270253259d9d53872aaf657471473ebfc9a52935c062"}, + {file = "cryptography-43.0.1-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:68aaecc4178e90719e95298515979814bda0cbada1256a4485414860bd7ab962"}, + {file = "cryptography-43.0.1-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:de41fd81a41e53267cb020bb3a7212861da53a7d39f863585d13ea11049cf277"}, + {file = "cryptography-43.0.1-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:f98bf604c82c416bc829e490c700ca1553eafdf2912a91e23a79d97d9801372a"}, + {file = "cryptography-43.0.1-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:61ec41068b7b74268fa86e3e9e12b9f0c21fcf65434571dbb13d954bceb08042"}, + {file = "cryptography-43.0.1-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:014f58110f53237ace6a408b5beb6c427b64e084eb451ef25a28308270086494"}, + {file = "cryptography-43.0.1-cp37-abi3-win32.whl", hash = "sha256:2bd51274dcd59f09dd952afb696bf9c61a7a49dfc764c04dd33ef7a6b502a1e2"}, + {file = "cryptography-43.0.1-cp37-abi3-win_amd64.whl", hash = "sha256:666ae11966643886c2987b3b721899d250855718d6d9ce41b521252a17985f4d"}, + {file = "cryptography-43.0.1-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:ac119bb76b9faa00f48128b7f5679e1d8d437365c5d26f1c2c3f0da4ce1b553d"}, + {file = "cryptography-43.0.1-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1bbcce1a551e262dfbafb6e6252f1ae36a248e615ca44ba302df077a846a8806"}, + {file = "cryptography-43.0.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:58d4e9129985185a06d849aa6df265bdd5a74ca6e1b736a77959b498e0505b85"}, + {file = "cryptography-43.0.1-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:d03a475165f3134f773d1388aeb19c2d25ba88b6a9733c5c590b9ff7bbfa2e0c"}, + {file = "cryptography-43.0.1-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:511f4273808ab590912a93ddb4e3914dfd8a388fed883361b02dea3791f292e1"}, + {file = "cryptography-43.0.1-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:80eda8b3e173f0f247f711eef62be51b599b5d425c429b5d4ca6a05e9e856baa"}, + {file = "cryptography-43.0.1-cp39-abi3-musllinux_1_2_x86_64.whl", hash = 
"sha256:38926c50cff6f533f8a2dae3d7f19541432610d114a70808f0926d5aaa7121e4"}, + {file = "cryptography-43.0.1-cp39-abi3-win32.whl", hash = "sha256:a575913fb06e05e6b4b814d7f7468c2c660e8bb16d8d5a1faf9b33ccc569dd47"}, + {file = "cryptography-43.0.1-cp39-abi3-win_amd64.whl", hash = "sha256:d75601ad10b059ec832e78823b348bfa1a59f6b8d545db3a24fd44362a1564cb"}, + {file = "cryptography-43.0.1-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:ea25acb556320250756e53f9e20a4177515f012c9eaea17eb7587a8c4d8ae034"}, + {file = "cryptography-43.0.1-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:c1332724be35d23a854994ff0b66530119500b6053d0bd3363265f7e5e77288d"}, + {file = "cryptography-43.0.1-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:fba1007b3ef89946dbbb515aeeb41e30203b004f0b4b00e5e16078b518563289"}, + {file = "cryptography-43.0.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:5b43d1ea6b378b54a1dc99dd8a2b5be47658fe9a7ce0a58ff0b55f4b43ef2b84"}, + {file = "cryptography-43.0.1-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:88cce104c36870d70c49c7c8fd22885875d950d9ee6ab54df2745f83ba0dc365"}, + {file = "cryptography-43.0.1-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:9d3cdb25fa98afdd3d0892d132b8d7139e2c087da1712041f6b762e4f807cc96"}, + {file = "cryptography-43.0.1-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:e710bf40870f4db63c3d7d929aa9e09e4e7ee219e703f949ec4073b4294f6172"}, + {file = "cryptography-43.0.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:7c05650fe8023c5ed0d46793d4b7d7e6cd9c04e68eabe5b0aeea836e37bdcec2"}, + {file = "cryptography-43.0.1.tar.gz", hash = "sha256:203e92a75716d8cfb491dc47c79e17d0d9207ccffcbcb35f598fbe463ae3444d"}, ] [package.dependencies] @@ -1034,7 +1029,7 @@ nox = ["nox"] pep8test = ["check-sdist", "click", "mypy", "ruff"] sdist = ["build"] ssh = ["bcrypt (>=3.1.5)"] -test = ["certifi", "pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-xdist"] +test = 
["certifi", "cryptography-vectors (==43.0.1)", "pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-xdist"] test-randomorder = ["pytest-randomly"] [[package]] @@ -1110,13 +1105,13 @@ dotenv = ["python-dotenv"] [[package]] name = "flask-cors" -version = "4.0.1" +version = "5.0.0" description = "A Flask extension adding a decorator for CORS support" optional = false python-versions = "*" files = [ - {file = "Flask_Cors-4.0.1-py2.py3-none-any.whl", hash = "sha256:f2a704e4458665580c074b714c4627dd5a306b333deb9074d0b1794dfa2fb677"}, - {file = "flask_cors-4.0.1.tar.gz", hash = "sha256:eeb69b342142fdbf4766ad99357a7f3876a2ceb77689dc10ff912aac06c389e4"}, + {file = "Flask_Cors-5.0.0-py2.py3-none-any.whl", hash = "sha256:b9e307d082a9261c100d8fb0ba909eec6a228ed1b60a8315fd85f783d61910bc"}, + {file = "flask_cors-5.0.0.tar.gz", hash = "sha256:5aadb4b950c4e93745034594d9f3ea6591f734bb3662e16e255ffbf5e89c88ef"}, ] [package.dependencies] diff --git a/pre-commit.py b/pre-commit.py index c5ed63ac44..ae432e8225 100755 --- a/pre-commit.py +++ b/pre-commit.py @@ -2,6 +2,7 @@ import argparse import enum +import os import subprocess import sys from typing import List @@ -93,7 +94,7 @@ if __name__ == "__main__": "--no-color", action="store_true", help="disable colored output", - default=not sys.stdout.isatty(), + default=not sys.stdout.isatty() or os.getenv("TERM") == "dumb", ) args = parser.parse_args() diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 21d92abb20..6703eb06eb 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -18,7 +18,6 @@ atomic-take.workspace = true aws-config.workspace = true aws-sdk-iam.workspace = true aws-sigv4.workspace = true -aws-types.workspace = true base64.workspace = true bstr.workspace = true bytes = { workspace = true, features = ["serde"] } @@ -26,7 +25,6 @@ camino.workspace = true chrono.workspace = true clap.workspace = true consumption_metrics.workspace = true -crossbeam-deque.workspace = true dashmap.workspace = true 
env_logger.workspace = true framed-websockets.workspace = true @@ -48,11 +46,9 @@ indexmap.workspace = true ipnet.workspace = true itertools.workspace = true lasso = { workspace = true, features = ["multi-threaded"] } -md5.workspace = true measured = { workspace = true, features = ["lasso"] } metrics.workspace = true once_cell.workspace = true -opentelemetry.workspace = true parking_lot.workspace = true parquet.workspace = true parquet_derive.workspace = true @@ -67,7 +63,6 @@ reqwest.workspace = true reqwest-middleware = { workspace = true, features = ["json"] } reqwest-retry.workspace = true reqwest-tracing.workspace = true -routerify.workspace = true rustc-hash.workspace = true rustls-pemfile.workspace = true rustls.workspace = true @@ -79,7 +74,6 @@ smol_str.workspace = true smallvec.workspace = true socket2.workspace = true subtle.workspace = true -task-local-extensions.workspace = true thiserror.workspace = true tikv-jemallocator.workspace = true tikv-jemalloc-ctl = { workspace = true, features = ["use_std"] } @@ -88,7 +82,6 @@ tokio-postgres-rustls.workspace = true tokio-rustls.workspace = true tokio-util.workspace = true tokio = { workspace = true, features = ["signal"] } -tower-service.workspace = true tracing-opentelemetry.workspace = true tracing-subscriber.workspace = true tracing-utils.workspace = true diff --git a/proxy/README.md b/proxy/README.md index d1f2e3f27b..8d850737be 100644 --- a/proxy/README.md +++ b/proxy/README.md @@ -6,7 +6,7 @@ Proxy binary accepts `--auth-backend` CLI option, which determines auth scheme a new SCRAM-based console API; uses SNI info to select the destination project (endpoint soon) * postgres uses postgres to select auth secrets of existing roles. 
Useful for local testing -* link +* web (or link) sends login link for all usernames Also proxy can expose following services to the external world: @@ -36,7 +36,7 @@ To play with it locally one may start proxy over a local postgres installation ``` If both postgres and proxy are running you may send a SQL query: -```json +```console curl -k -X POST 'https://proxy.localtest.me:4444/sql' \ -H 'Neon-Connection-String: postgres://stas:pass@proxy.localtest.me:4444/postgres' \ -H 'Content-Type: application/json' \ @@ -44,7 +44,8 @@ curl -k -X POST 'https://proxy.localtest.me:4444/sql' \ "query":"SELECT $1::int[] as arr, $2::jsonb as obj, 42 as num", "params":[ "{{1,2},{\"3\",4}}", {"key":"val", "ikey":4242}] }' | jq - +``` +```json { "command": "SELECT", "fields": [ diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs index 3b3c571129..7c408f817c 100644 --- a/proxy/src/auth.rs +++ b/proxy/src/auth.rs @@ -1,20 +1,20 @@ //! Client authentication mechanisms. pub mod backend; -pub use backend::BackendType; +pub use backend::Backend; mod credentials; -pub use credentials::{ +pub(crate) use credentials::{ check_peer_addr_is_in_list, endpoint_sni, ComputeUserInfoMaybeEndpoint, ComputeUserInfoParseError, IpPattern, }; mod password_hack; -pub use password_hack::parse_endpoint_param; +pub(crate) use password_hack::parse_endpoint_param; use password_hack::PasswordHackPayload; mod flow; -pub use flow::*; +pub(crate) use flow::*; use tokio::time::error::Elapsed; use crate::{ @@ -25,13 +25,13 @@ use std::{io, net::IpAddr}; use thiserror::Error; /// Convenience wrapper for the authentication error. -pub type Result = std::result::Result; +pub(crate) type Result = std::result::Result; /// Common authentication error. 
#[derive(Debug, Error)] -pub enum AuthErrorImpl { +pub(crate) enum AuthErrorImpl { #[error(transparent)] - Link(#[from] backend::LinkAuthError), + Web(#[from] backend::WebAuthError), #[error(transparent)] GetAuthInfo(#[from] console::errors::GetAuthInfoError), @@ -77,30 +77,30 @@ pub enum AuthErrorImpl { #[derive(Debug, Error)] #[error(transparent)] -pub struct AuthError(Box); +pub(crate) struct AuthError(Box); impl AuthError { - pub fn bad_auth_method(name: impl Into>) -> Self { + pub(crate) fn bad_auth_method(name: impl Into>) -> Self { AuthErrorImpl::BadAuthMethod(name.into()).into() } - pub fn auth_failed(user: impl Into>) -> Self { + pub(crate) fn auth_failed(user: impl Into>) -> Self { AuthErrorImpl::AuthFailed(user.into()).into() } - pub fn ip_address_not_allowed(ip: IpAddr) -> Self { + pub(crate) fn ip_address_not_allowed(ip: IpAddr) -> Self { AuthErrorImpl::IpAddressNotAllowed(ip).into() } - pub fn too_many_connections() -> Self { + pub(crate) fn too_many_connections() -> Self { AuthErrorImpl::TooManyConnections.into() } - pub fn is_auth_failed(&self) -> bool { + pub(crate) fn is_auth_failed(&self) -> bool { matches!(self.0.as_ref(), AuthErrorImpl::AuthFailed(_)) } - pub fn user_timeout(elapsed: Elapsed) -> Self { + pub(crate) fn user_timeout(elapsed: Elapsed) -> Self { AuthErrorImpl::UserTimeout(elapsed).into() } } @@ -114,7 +114,7 @@ impl> From for AuthError { impl UserFacingError for AuthError { fn to_string_client(&self) -> String { match self.0.as_ref() { - AuthErrorImpl::Link(e) => e.to_string_client(), + AuthErrorImpl::Web(e) => e.to_string_client(), AuthErrorImpl::GetAuthInfo(e) => e.to_string_client(), AuthErrorImpl::Sasl(e) => e.to_string_client(), AuthErrorImpl::AuthFailed(_) => self.to_string(), @@ -132,7 +132,7 @@ impl UserFacingError for AuthError { impl ReportableError for AuthError { fn get_error_kind(&self) -> crate::error::ErrorKind { match self.0.as_ref() { - AuthErrorImpl::Link(e) => e.get_error_kind(), + AuthErrorImpl::Web(e) => 
e.get_error_kind(), AuthErrorImpl::GetAuthInfo(e) => e.get_error_kind(), AuthErrorImpl::Sasl(e) => e.get_error_kind(), AuthErrorImpl::AuthFailed(_) => crate::error::ErrorKind::User, diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 7592d076ec..5561c9c56d 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -1,17 +1,19 @@ mod classic; mod hacks; pub mod jwt; -mod link; +pub mod local; +mod web; use std::net::IpAddr; use std::sync::Arc; use std::time::Duration; use ipnet::{Ipv4Net, Ipv6Net}; -pub use link::LinkAuthError; +use local::LocalBackend; use tokio::io::{AsyncRead, AsyncWrite}; use tokio_postgres::config::AuthKeys; use tracing::{info, warn}; +pub(crate) use web::WebAuthError; use crate::auth::credentials::check_peer_addr_is_in_list; use crate::auth::{validate_password_and_exchange, AuthError}; @@ -63,25 +65,27 @@ impl std::ops::Deref for MaybeOwned<'_, T> { /// * However, when we substitute `T` with [`ComputeUserInfoMaybeEndpoint`], /// this helps us provide the credentials only to those auth /// backends which require them for the authentication process. -pub enum BackendType<'a, T, D> { +pub enum Backend<'a, T, D> { /// Cloud API (V2). Console(MaybeOwned<'a, ConsoleBackend>, T), /// Authentication via a web browser. 
- Link(MaybeOwned<'a, url::ApiUrl>, D), + Web(MaybeOwned<'a, url::ApiUrl>, D), + /// Local proxy uses configured auth credentials and does not wake compute + Local(MaybeOwned<'a, LocalBackend>), } -pub trait TestBackend: Send + Sync + 'static { +#[cfg(test)] +pub(crate) trait TestBackend: Send + Sync + 'static { fn wake_compute(&self) -> Result; fn get_allowed_ips_and_secret( &self, ) -> Result<(CachedAllowedIps, Option), console::errors::GetAuthInfoError>; - fn get_role_secret(&self) -> Result; } -impl std::fmt::Display for BackendType<'_, (), ()> { +impl std::fmt::Display for Backend<'_, (), ()> { fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { - Self::Console(api, _) => match &**api { + Self::Console(api, ()) => match &**api { ConsoleBackend::Console(endpoint) => { fmt.debug_tuple("Console").field(&endpoint.url()).finish() } @@ -92,71 +96,76 @@ impl std::fmt::Display for BackendType<'_, (), ()> { #[cfg(test)] ConsoleBackend::Test(_) => fmt.debug_tuple("Test").finish(), }, - Self::Link(url, _) => fmt.debug_tuple("Link").field(&url.as_str()).finish(), + Self::Web(url, ()) => fmt.debug_tuple("Web").field(&url.as_str()).finish(), + Self::Local(_) => fmt.debug_tuple("Local").finish(), } } } -impl BackendType<'_, T, D> { +impl Backend<'_, T, D> { /// Very similar to [`std::option::Option::as_ref`]. /// This helps us pass structured config to async tasks. 
- pub fn as_ref(&self) -> BackendType<'_, &T, &D> { + pub(crate) fn as_ref(&self) -> Backend<'_, &T, &D> { match self { - Self::Console(c, x) => BackendType::Console(MaybeOwned::Borrowed(c), x), - Self::Link(c, x) => BackendType::Link(MaybeOwned::Borrowed(c), x), + Self::Console(c, x) => Backend::Console(MaybeOwned::Borrowed(c), x), + Self::Web(c, x) => Backend::Web(MaybeOwned::Borrowed(c), x), + Self::Local(l) => Backend::Local(MaybeOwned::Borrowed(l)), } } } -impl<'a, T, D> BackendType<'a, T, D> { +impl<'a, T, D> Backend<'a, T, D> { /// Very similar to [`std::option::Option::map`]. - /// Maps [`BackendType`] to [`BackendType`] by applying + /// Maps [`Backend`] to [`Backend`] by applying /// a function to a contained value. - pub fn map(self, f: impl FnOnce(T) -> R) -> BackendType<'a, R, D> { + pub(crate) fn map(self, f: impl FnOnce(T) -> R) -> Backend<'a, R, D> { match self { - Self::Console(c, x) => BackendType::Console(c, f(x)), - Self::Link(c, x) => BackendType::Link(c, x), + Self::Console(c, x) => Backend::Console(c, f(x)), + Self::Web(c, x) => Backend::Web(c, x), + Self::Local(l) => Backend::Local(l), } } } -impl<'a, T, D, E> BackendType<'a, Result, D> { +impl<'a, T, D, E> Backend<'a, Result, D> { /// Very similar to [`std::option::Option::transpose`]. /// This is most useful for error handling. 
- pub fn transpose(self) -> Result, E> { + pub(crate) fn transpose(self) -> Result, E> { match self { - Self::Console(c, x) => x.map(|x| BackendType::Console(c, x)), - Self::Link(c, x) => Ok(BackendType::Link(c, x)), + Self::Console(c, x) => x.map(|x| Backend::Console(c, x)), + Self::Web(c, x) => Ok(Backend::Web(c, x)), + Self::Local(l) => Ok(Backend::Local(l)), } } } -pub struct ComputeCredentials { - pub info: ComputeUserInfo, - pub keys: ComputeCredentialKeys, +pub(crate) struct ComputeCredentials { + pub(crate) info: ComputeUserInfo, + pub(crate) keys: ComputeCredentialKeys, } #[derive(Debug, Clone)] -pub struct ComputeUserInfoNoEndpoint { - pub user: RoleName, - pub options: NeonOptions, +pub(crate) struct ComputeUserInfoNoEndpoint { + pub(crate) user: RoleName, + pub(crate) options: NeonOptions, } #[derive(Debug, Clone)] -pub struct ComputeUserInfo { - pub endpoint: EndpointId, - pub user: RoleName, - pub options: NeonOptions, +pub(crate) struct ComputeUserInfo { + pub(crate) endpoint: EndpointId, + pub(crate) user: RoleName, + pub(crate) options: NeonOptions, } impl ComputeUserInfo { - pub fn endpoint_cache_key(&self) -> EndpointCacheKey { + pub(crate) fn endpoint_cache_key(&self) -> EndpointCacheKey { self.options.get_cache_key(&self.endpoint) } } -pub enum ComputeCredentialKeys { +pub(crate) enum ComputeCredentialKeys { Password(Vec), AuthKeys(AuthKeys), + None, } impl TryFrom for ComputeUserInfo { @@ -213,7 +222,7 @@ impl RateBucketInfo { } impl AuthenticationConfig { - pub fn check_rate_limit( + pub(crate) fn check_rate_limit( &self, ctx: &RequestMonitoring, config: &AuthenticationConfig, @@ -289,7 +298,7 @@ async fn auth_quirks( ctx.set_endpoint_id(res.info.endpoint.clone()); let password = match res.keys { ComputeCredentialKeys::Password(p) => p, - ComputeCredentialKeys::AuthKeys(_) => { + ComputeCredentialKeys::AuthKeys(_) | ComputeCredentialKeys::None => { unreachable!("password hack should return a password") } }; @@ -302,7 +311,9 @@ async fn 
auth_quirks( let (allowed_ips, maybe_secret) = api.get_allowed_ips_and_secret(ctx, &info).await?; // check allowed list - if !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips) { + if config.ip_allowlist_check_enabled + && !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips) + { return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr())); } @@ -315,21 +326,20 @@ async fn auth_quirks( }; let (cached_entry, secret) = cached_secret.take_value(); - let secret = match secret { - Some(secret) => config.check_rate_limit( + let secret = if let Some(secret) = secret { + config.check_rate_limit( ctx, config, secret, &info.endpoint, unauthenticated_password.is_some() || allow_cleartext, - )?, - None => { - // If we don't have an authentication secret, we mock one to - // prevent malicious probing (possible due to missing protocol steps). - // This mocked secret will never lead to successful authentication. - info!("authentication info not found, mocking it"); - AuthSecret::Scram(scram::ServerSecret::mock(rand::random())) - } + )? + } else { + // If we don't have an authentication secret, we mock one to + // prevent malicious probing (possible due to missing protocol steps). + // This mocked secret will never lead to successful authentication. + info!("authentication info not found, mocking it"); + AuthSecret::Scram(scram::ServerSecret::mock(rand::random())) }; match authenticate_with_secret( @@ -395,33 +405,26 @@ async fn authenticate_with_secret( classic::authenticate(ctx, info, client, config, secret).await } -impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> { - /// Get compute endpoint name from the credentials. - pub fn get_endpoint(&self) -> Option { - match self { - Self::Console(_, user_info) => user_info.endpoint_id.clone(), - Self::Link(_, _) => Some("link".into()), - } - } - +impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint, &()> { /// Get username from the credentials. 
- pub fn get_user(&self) -> &str { + pub(crate) fn get_user(&self) -> &str { match self { Self::Console(_, user_info) => &user_info.user, - Self::Link(_, _) => "link", + Self::Web(_, ()) => "web", + Self::Local(_) => "local", } } /// Authenticate the client via the requested backend, possibly using credentials. #[tracing::instrument(fields(allow_cleartext = allow_cleartext), skip_all)] - pub async fn authenticate( + pub(crate) async fn authenticate( self, ctx: &RequestMonitoring, client: &mut stream::PqStream>, allow_cleartext: bool, config: &'static AuthenticationConfig, endpoint_rate_limiter: Arc, - ) -> auth::Result> { + ) -> auth::Result> { let res = match self { Self::Console(api, user_info) => { info!( @@ -440,15 +443,18 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> { endpoint_rate_limiter, ) .await?; - BackendType::Console(api, credentials) + Backend::Console(api, credentials) } // NOTE: this auth backend doesn't use client credentials. - Self::Link(url, _) => { - info!("performing link authentication"); + Self::Web(url, ()) => { + info!("performing web authentication"); - let info = link::authenticate(ctx, &url, client).await?; + let info = web::authenticate(ctx, &url, client).await?; - BackendType::Link(url, info) + Backend::Web(url, info) + } + Self::Local(_) => { + return Err(auth::AuthError::bad_auth_method("invalid for local proxy")) } }; @@ -457,64 +463,72 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> { } } -impl BackendType<'_, ComputeUserInfo, &()> { - pub async fn get_role_secret( +impl Backend<'_, ComputeUserInfo, &()> { + pub(crate) async fn get_role_secret( &self, ctx: &RequestMonitoring, ) -> Result { match self { Self::Console(api, user_info) => api.get_role_secret(ctx, user_info).await, - Self::Link(_, _) => Ok(Cached::new_uncached(None)), + Self::Web(_, ()) => Ok(Cached::new_uncached(None)), + Self::Local(_) => Ok(Cached::new_uncached(None)), } } - pub async fn get_allowed_ips_and_secret( + pub(crate) async 
fn get_allowed_ips_and_secret( &self, ctx: &RequestMonitoring, ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { match self { Self::Console(api, user_info) => api.get_allowed_ips_and_secret(ctx, user_info).await, - Self::Link(_, _) => Ok((Cached::new_uncached(Arc::new(vec![])), None)), + Self::Web(_, ()) => Ok((Cached::new_uncached(Arc::new(vec![])), None)), + Self::Local(_) => Ok((Cached::new_uncached(Arc::new(vec![])), None)), } } } #[async_trait::async_trait] -impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, NodeInfo> { +impl ComputeConnectBackend for Backend<'_, ComputeCredentials, NodeInfo> { async fn wake_compute( &self, ctx: &RequestMonitoring, ) -> Result { match self { Self::Console(api, creds) => api.wake_compute(ctx, &creds.info).await, - Self::Link(_, info) => Ok(Cached::new_uncached(info.clone())), + Self::Web(_, info) => Ok(Cached::new_uncached(info.clone())), + Self::Local(local) => Ok(Cached::new_uncached(local.node_info.clone())), } } - fn get_keys(&self) -> Option<&ComputeCredentialKeys> { + fn get_keys(&self) -> &ComputeCredentialKeys { match self { - Self::Console(_, creds) => Some(&creds.keys), - Self::Link(_, _) => None, + Self::Console(_, creds) => &creds.keys, + Self::Web(_, _) => &ComputeCredentialKeys::None, + Self::Local(_) => &ComputeCredentialKeys::None, } } } #[async_trait::async_trait] -impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, &()> { +impl ComputeConnectBackend for Backend<'_, ComputeCredentials, &()> { async fn wake_compute( &self, ctx: &RequestMonitoring, ) -> Result { match self { Self::Console(api, creds) => api.wake_compute(ctx, &creds.info).await, - Self::Link(_, _) => unreachable!("link auth flow doesn't support waking the compute"), + Self::Web(_, ()) => { + unreachable!("web auth flow doesn't support waking the compute") + } + Self::Local(local) => Ok(Cached::new_uncached(local.node_info.clone())), } } - fn get_keys(&self) -> Option<&ComputeCredentialKeys> { + fn 
get_keys(&self) -> &ComputeCredentialKeys { match self { - Self::Console(_, creds) => Some(&creds.keys), - Self::Link(_, _) => None, + Self::Console(_, creds) => &creds.keys, + Self::Web(_, ()) => &ComputeCredentialKeys::None, + Self::Local(_) => &ComputeCredentialKeys::None, } } } @@ -591,6 +605,7 @@ mod tests { rate_limiter_enabled: true, rate_limiter: AuthRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET), rate_limit_ip_subnet: 64, + ip_allowlist_check_enabled: true, }); async fn read_message(r: &mut (impl AsyncRead + Unpin), b: &mut BytesMut) -> PgMessage { diff --git a/proxy/src/auth/backend/hacks.rs b/proxy/src/auth/backend/hacks.rs index 56921dd949..e9019ce2cf 100644 --- a/proxy/src/auth/backend/hacks.rs +++ b/proxy/src/auth/backend/hacks.rs @@ -17,7 +17,7 @@ use tracing::{info, warn}; /// one round trip and *expensive* computations (>= 4096 HMAC iterations). /// These properties are benefical for serverless JS workers, so we /// use this mechanism for websocket connections. -pub async fn authenticate_cleartext( +pub(crate) async fn authenticate_cleartext( ctx: &RequestMonitoring, info: ComputeUserInfo, client: &mut stream::PqStream>, @@ -59,7 +59,7 @@ pub async fn authenticate_cleartext( /// Workaround for clients which don't provide an endpoint (project) name. /// Similar to [`authenticate_cleartext`], but there's a specific password format, /// and passwords are not yet validated (we don't know how to validate them!) 
-pub async fn password_hack_no_authentication( +pub(crate) async fn password_hack_no_authentication( ctx: &RequestMonitoring, info: ComputeUserInfoNoEndpoint, client: &mut stream::PqStream>, diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs index e021a7e23f..1f44e4af5d 100644 --- a/proxy/src/auth/backend/jwt.rs +++ b/proxy/src/auth/backend/jwt.rs @@ -1,49 +1,81 @@ -use std::{future::Future, sync::Arc, time::Duration}; +use std::{ + future::Future, + sync::Arc, + time::{Duration, SystemTime}, +}; use anyhow::{bail, ensure, Context}; use arc_swap::ArcSwapOption; use dashmap::DashMap; use jose_jwk::crypto::KeyInfo; +use serde::{Deserialize, Deserializer}; use signature::Verifier; use tokio::time::Instant; -use crate::{http::parse_json_body_with_limit, intern::EndpointIdInt}; +use crate::{context::RequestMonitoring, http::parse_json_body_with_limit, EndpointId, RoleName}; // TODO(conrad): make these configurable. +const CLOCK_SKEW_LEEWAY: Duration = Duration::from_secs(30); const MIN_RENEW: Duration = Duration::from_secs(30); const AUTO_RENEW: Duration = Duration::from_secs(300); const MAX_RENEW: Duration = Duration::from_secs(3600); const MAX_JWK_BODY_SIZE: usize = 64 * 1024; /// How to get the JWT auth rules -pub trait FetchAuthRules: Clone + Send + Sync + 'static { - fn fetch_auth_rules(&self) -> impl Future> + Send; +pub(crate) trait FetchAuthRules: Clone + Send + Sync + 'static { + fn fetch_auth_rules( + &self, + role_name: RoleName, + ) -> impl Future>> + Send; } -#[derive(Clone)] -struct FetchAuthRulesFromCplane { - #[allow(dead_code)] - endpoint: EndpointIdInt, -} - -impl FetchAuthRules for FetchAuthRulesFromCplane { - async fn fetch_auth_rules(&self) -> anyhow::Result { - Err(anyhow::anyhow!("not yet implemented")) - } -} - -pub struct AuthRules { - jwks_urls: Vec, +pub(crate) struct AuthRule { + pub(crate) id: String, + pub(crate) jwks_url: url::Url, + pub(crate) audience: Option, } #[derive(Default)] -pub struct JwkCache { 
+pub(crate) struct JwkCache { client: reqwest::Client, - map: DashMap>, + map: DashMap<(EndpointId, RoleName), Arc>, } -pub struct JwkCacheEntryLock { +pub(crate) struct JwkCacheEntry { + /// Should refetch at least every hour to verify when old keys have been removed. + /// Should refetch when new key IDs are seen only every 5 minutes or so + last_retrieved: Instant, + + /// cplane will return multiple JWKs urls that we need to scrape. + key_sets: ahash::HashMap, +} + +impl JwkCacheEntry { + fn find_jwk_and_audience(&self, key_id: &str) -> Option<(&jose_jwk::Jwk, Option<&str>)> { + self.key_sets.values().find_map(|key_set| { + key_set + .find_key(key_id) + .map(|jwk| (jwk, key_set.audience.as_deref())) + }) + } +} + +struct KeySet { + jwks: jose_jwk::JwkSet, + audience: Option, +} + +impl KeySet { + fn find_key(&self, key_id: &str) -> Option<&jose_jwk::Jwk> { + self.jwks + .keys + .iter() + .find(|jwk| jwk.prm.kid.as_deref() == Some(key_id)) + } +} + +pub(crate) struct JwkCacheEntryLock { cached: ArcSwapOption, lookup: tokio::sync::Semaphore, } @@ -57,15 +89,6 @@ impl Default for JwkCacheEntryLock { } } -pub struct JwkCacheEntry { - /// Should refetch at least every hour to verify when old keys have been removed. - /// Should refetch when new key IDs are seen only every 5 minutes or so - last_retrieved: Instant, - - /// cplane will return multiple JWKs urls that we need to scrape. - key_sets: ahash::HashMap, -} - impl JwkCacheEntryLock { async fn acquire_permit<'a>(self: &'a Arc) -> JwkRenewalPermit<'a> { JwkRenewalPermit::acquire_permit(self).await @@ -79,6 +102,7 @@ impl JwkCacheEntryLock { &self, _permit: JwkRenewalPermit<'_>, client: &reqwest::Client, + role_name: RoleName, auth_rules: &F, ) -> anyhow::Result> { // double check that no one beat us to updating the cache. 
@@ -91,20 +115,19 @@ impl JwkCacheEntryLock { } } - let rules = auth_rules.fetch_auth_rules().await?; - let mut key_sets = ahash::HashMap::with_capacity_and_hasher( - rules.jwks_urls.len(), - ahash::RandomState::new(), - ); + let rules = auth_rules.fetch_auth_rules(role_name).await?; + let mut key_sets = + ahash::HashMap::with_capacity_and_hasher(rules.len(), ahash::RandomState::new()); // TODO(conrad): run concurrently // TODO(conrad): strip the JWKs urls (should be checked by cplane as well - cloud#16284) - for url in rules.jwks_urls { - let req = client.get(url.clone()); + for rule in rules { + let req = client.get(rule.jwks_url.clone()); // TODO(conrad): eventually switch to using reqwest_middleware/`new_client_with_timeout`. + // TODO(conrad): We need to filter out URLs that point to local resources. Public internet only. match req.send().await.and_then(|r| r.error_for_status()) { // todo: should we re-insert JWKs if we want to keep this JWKs URL? // I expect these failures would be quite sparse. 
- Err(e) => tracing::warn!(?url, error=?e, "could not fetch JWKs"), + Err(e) => tracing::warn!(url=?rule.jwks_url, error=?e, "could not fetch JWKs"), Ok(r) => { let resp: http::Response = r.into(); match parse_json_body_with_limit::( @@ -113,9 +136,17 @@ impl JwkCacheEntryLock { ) .await { - Err(e) => tracing::warn!(?url, error=?e, "could not decode JWKs"), + Err(e) => { + tracing::warn!(url=?rule.jwks_url, error=?e, "could not decode JWKs"); + } Ok(jwks) => { - key_sets.insert(url, jwks); + key_sets.insert( + rule.id, + KeySet { + jwks, + audience: rule.audience, + }, + ); } } } @@ -133,7 +164,9 @@ impl JwkCacheEntryLock { async fn get_or_update_jwk_cache( self: &Arc, + ctx: &RequestMonitoring, client: &reqwest::Client, + role_name: RoleName, fetch: &F, ) -> Result, anyhow::Error> { let now = Instant::now(); @@ -141,18 +174,20 @@ impl JwkCacheEntryLock { // if we have no cached JWKs, try and get some let Some(cached) = guard else { + let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); let permit = self.acquire_permit().await; - return self.renew_jwks(permit, client, fetch).await; + return self.renew_jwks(permit, client, role_name, fetch).await; }; let last_update = now.duration_since(cached.last_retrieved); // check if the cached JWKs need updating. if last_update > MAX_RENEW { + let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); let permit = self.acquire_permit().await; // it's been too long since we checked the keys. wait for them to update. - return self.renew_jwks(permit, client, fetch).await; + return self.renew_jwks(permit, client, role_name, fetch).await; } // every 5 minutes we should spawn a job to eagerly update the token. 
@@ -164,7 +199,7 @@ impl JwkCacheEntryLock { let client = client.clone(); let fetch = fetch.clone(); tokio::spawn(async move { - if let Err(e) = entry.renew_jwks(permit, &client, &fetch).await { + if let Err(e) = entry.renew_jwks(permit, &client, role_name, &fetch).await { tracing::warn!(error=?e, "could not fetch JWKs in background job"); } }); @@ -178,8 +213,10 @@ impl JwkCacheEntryLock { async fn check_jwt( self: &Arc, - jwt: String, + ctx: &RequestMonitoring, + jwt: &str, client: &reqwest::Client, + role_name: RoleName, fetch: &F, ) -> Result<(), anyhow::Error> { // JWT compact form is defined to be @@ -187,38 +224,38 @@ impl JwkCacheEntryLock { // where Signature = alg( || . || ); let (header_payload, signature) = jwt - .rsplit_once(".") + .rsplit_once('.') .context("Provided authentication token is not a valid JWT encoding")?; - let (header, _payload) = header_payload - .split_once(".") + let (header, payload) = header_payload + .split_once('.') .context("Provided authentication token is not a valid JWT encoding")?; let header = base64::decode_config(header, base64::URL_SAFE_NO_PAD) .context("Provided authentication token is not a valid JWT encoding")?; - let header = serde_json::from_slice::>(&header) + let header = serde_json::from_slice::>(&header) .context("Provided authentication token is not a valid JWT encoding")?; let sig = base64::decode_config(signature, base64::URL_SAFE_NO_PAD) .context("Provided authentication token is not a valid JWT encoding")?; ensure!(header.typ == "JWT"); - let kid = header.kid.context("missing key id")?; + let kid = header.key_id.context("missing key id")?; - let mut guard = self.get_or_update_jwk_cache(client, fetch).await?; + let mut guard = self + .get_or_update_jwk_cache(ctx, client, role_name.clone(), fetch) + .await?; // get the key from the JWKs if possible. If not, wait for the keys to update. 
- let jwk = loop { - let jwk = guard - .key_sets - .values() - .flat_map(|jwks| &jwks.keys) - .find(|jwk| jwk.prm.kid.as_deref() == Some(kid)); - - match jwk { + let (jwk, expected_audience) = loop { + match guard.find_jwk_and_audience(kid) { Some(jwk) => break jwk, None if guard.last_retrieved.elapsed() > MIN_RENEW => { + let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); + let permit = self.acquire_permit().await; - guard = self.renew_jwks(permit, client, fetch).await?; + guard = self + .renew_jwks(permit, client, role_name.clone(), fetch) + .await?; } _ => { bail!("jwk not found"); @@ -227,7 +264,7 @@ impl JwkCacheEntryLock { }; ensure!( - jwk.is_supported(&header.alg), + jwk.is_supported(&header.algorithm), "signature algorithm not supported" ); @@ -241,31 +278,57 @@ impl JwkCacheEntryLock { key => bail!("unsupported key type {key:?}"), }; - // TODO(conrad): verify iss, exp, nbf, etc... + let payload = base64::decode_config(payload, base64::URL_SAFE_NO_PAD) + .context("Provided authentication token is not a valid JWT encoding")?; + let payload = serde_json::from_slice::>(&payload) + .context("Provided authentication token is not a valid JWT encoding")?; + + tracing::debug!(?payload, "JWT signature valid with claims"); + + match (expected_audience, payload.audience) { + // check the audience matches + (Some(aud1), Some(aud2)) => ensure!(aud1 == aud2, "invalid JWT token audience"), + // the audience is expected but is missing + (Some(_), None) => bail!("invalid JWT token audience"), + // we don't care for the audience field + (None, _) => {} + } + + let now = SystemTime::now(); + + if let Some(exp) = payload.expiration { + ensure!(now < exp + CLOCK_SKEW_LEEWAY); + } + + if let Some(nbf) = payload.not_before { + ensure!(nbf < now + CLOCK_SKEW_LEEWAY); + } Ok(()) } } impl JwkCache { - pub async fn check_jwt( + pub(crate) async fn check_jwt( &self, - endpoint: EndpointIdInt, - jwt: String, + ctx: &RequestMonitoring, + endpoint: EndpointId, + 
role_name: RoleName, + fetch: &F, + jwt: &str, ) -> Result<(), anyhow::Error> { // try with just a read lock first - let entry = self.map.get(&endpoint).as_deref().map(Arc::clone); - let entry = match entry { - Some(entry) => entry, - None => { - // acquire a write lock after to insert. - let entry = self.map.entry(endpoint).or_default(); - Arc::clone(&*entry) - } - }; + let key = (endpoint, role_name.clone()); + let entry = self.map.get(&key).as_deref().map(Arc::clone); + let entry = entry.unwrap_or_else(|| { + // acquire a write lock after to insert. + let entry = self.map.entry(key).or_default(); + Arc::clone(&*entry) + }); - let fetch = FetchAuthRulesFromCplane { endpoint }; - entry.check_jwt(jwt, &self.client, &fetch).await + entry + .check_jwt(ctx, jwt, &self.client, role_name, fetch) + .await } } @@ -315,13 +378,49 @@ fn verify_rsa_signature( /// #[derive(serde::Deserialize, serde::Serialize)] -struct JWTHeader<'a> { +struct JwtHeader<'a> { /// must be "JWT" + #[serde(rename = "typ")] typ: &'a str, /// must be a supported alg - alg: jose_jwa::Algorithm, + #[serde(rename = "alg")] + algorithm: jose_jwa::Algorithm, /// key id, must be provided for our usecase - kid: Option<&'a str>, + #[serde(rename = "kid")] + key_id: Option<&'a str>, +} + +/// +#[derive(serde::Deserialize, serde::Serialize, Debug)] +struct JwtPayload<'a> { + /// Audience - Recipient for which the JWT is intended + #[serde(rename = "aud")] + audience: Option<&'a str>, + /// Expiration - Time after which the JWT expires + #[serde(deserialize_with = "numeric_date_opt", rename = "exp", default)] + expiration: Option, + /// Not before - Time after which the JWT expires + #[serde(deserialize_with = "numeric_date_opt", rename = "nbf", default)] + not_before: Option, + + // the following entries are only extracted for the sake of debug logging. 
+ /// Issuer of the JWT + #[serde(rename = "iss")] + issuer: Option<&'a str>, + /// Subject of the JWT (the user) + #[serde(rename = "sub")] + subject: Option<&'a str>, + /// Unique token identifier + #[serde(rename = "jti")] + jwt_id: Option<&'a str>, + /// Unique session identifier + #[serde(rename = "sid")] + session_id: Option<&'a str>, +} + +fn numeric_date_opt<'de, D: Deserializer<'de>>(d: D) -> Result, D::Error> { + let d = >::deserialize(d)?; + Ok(d.map(|n| SystemTime::UNIX_EPOCH + Duration::from_secs(n))) } struct JwkRenewalPermit<'a> { @@ -388,6 +487,8 @@ impl Drop for JwkRenewalPermit<'_> { #[cfg(test)] mod tests { + use crate::RoleName; + use super::*; use std::{future::IntoFuture, net::SocketAddr, time::SystemTime}; @@ -399,6 +500,7 @@ mod tests { use hyper1::service::service_fn; use hyper_util::rt::TokioIo; use rand::rngs::OsRng; + use rsa::pkcs8::DecodePrivateKey; use signature::Signer; use tokio::net::TcpListener; @@ -416,8 +518,8 @@ mod tests { (sk, jwk) } - fn new_rsa_jwk(kid: String) -> (rsa::RsaPrivateKey, jose_jwk::Jwk) { - let sk = rsa::RsaPrivateKey::new(&mut OsRng, 2048).unwrap(); + fn new_rsa_jwk(key: &str, kid: String) -> (rsa::RsaPrivateKey, jose_jwk::Jwk) { + let sk = rsa::RsaPrivateKey::from_pkcs8_pem(key).unwrap(); let pk = sk.to_public_key().into(); let jwk = jose_jwk::Jwk { key: jose_jwk::Key::Rsa(pk), @@ -431,10 +533,10 @@ mod tests { } fn build_jwt_payload(kid: String, sig: jose_jwa::Signing) -> String { - let header = JWTHeader { + let header = JwtHeader { typ: "JWT", - alg: jose_jwa::Algorithm::Signing(sig), - kid: Some(&kid), + algorithm: jose_jwa::Algorithm::Signing(sig), + key_id: Some(&kid), }; let body = typed_json::json! {{ "exp": SystemTime::now().duration_since(SystemTime::UNIX_EPOCH).unwrap().as_secs() + 3600, @@ -468,10 +570,70 @@ mod tests { format!("{payload}.{sig}") } + // RSA key gen is slow.... 
+ const RS1: &str = "-----BEGIN PRIVATE KEY----- +MIIEvwIBADANBgkqhkiG9w0BAQEFAASCBKkwggSlAgEAAoIBAQDNuWBIWTlo+54Y +aifpGInIrpv6LlsbI/2/2CC81Arlx4RsABORklgA9XSGwaCbHTshHsfd1S916JwA +SpjyPQYWfqo6iAV8a4MhjIeJIkRr74prDCSzOGZvIc6VaGeCIb9clf3HSrPHm3hA +cfLMB8/p5MgoxERPDOIn3XYoS9SEEuP7l0LkmEZMerg6W6lDjQRDny0Lb50Jky9X +mDqnYXBhs99ranbwL5vjy0ba6OIeCWFJme5u+rv5C/P0BOYrJfGxIcEoKa8Ukw5s +PlM+qrz9ope1eOuXMNNdyFDReNBUyaM1AwBAayU5rz57crer7K/UIofaJ42T4cMM +nx/SWfBNAgMBAAECggEACqdpBxYn1PoC6/zDaFzu9celKEWyTiuE/qRwvZa1ocS9 +ZOJ0IPvVNud/S2NHsADJiSOQ8joSJScQvSsf1Ju4bv3MTw+wSQtAVUJz2nQ92uEi +5/xPAkEPfP3hNvebNLAOuvrBk8qYmOPCTIQaMNrOt6wzeXkAmJ9wLuRXNCsJLHW+ +KLpf2WdgTYxqK06ZiJERFgJ2r1MsC2IgTydzjOAdEIrtMarerTLqqCpwFrk/l0cz +1O2OAb17ZxmhuzMhjNMin81c8F2fZAGMeOjn92Jl5kUsYw/pG+0S8QKlbveR/fdP +We2tJsgXw2zD0q7OJpp8NXS2yddrZGyysYsof983wQKBgQD2McqNJqo+eWL5zony +UbL19loYw0M15EjhzIuzW1Jk0rPj65yQyzpJ6pqicRuWr34MvzCx+ZHM2b3jSiNu +GES2fnC7xLIKyeRxfqsXF71xz+6UStEGRQX27r1YWEtyQVuBhvlqB+AGWP3PYAC+ +HecZecnZ+vcihJ2K3+l5O3paVQKBgQDV6vKH5h2SY9vgO8obx0P7XSS+djHhmPuU +f8C/Fq6AuRbIA1g04pzuLU2WS9T26eIjgM173uVNg2TuqJveWzz+CAAp6nCR6l24 +DBg49lMGCWrMo4FqPG46QkUqvK8uSj42GkX/e5Rut1Gyu0209emeM6h2d2K15SvY +9563tYSmGQKBgQDwcH5WTi20KA7e07TroJi8GKWzS3gneNUpGQBS4VxdtV4UuXXF +/4TkzafJ/9cm2iurvUmMd6XKP9lw0mY5zp/E70WgTCBp4vUlVsU3H2tYbO+filYL +3ntNx6nKTykX4/a/UJfj0t8as+zli+gNxNx/h+734V9dKdFG4Rl+2fTLpQKBgQCE +qJkTEe+Q0wCOBEYICADupwqcWqwAXWDW7IrZdfVtulqYWwqecVIkmk+dPxWosc4d +ekjz4nyNH0i+gC15LVebqdaAJ/T7aD4KXuW+nXNLMRfcJCGjgipRUruWD0EMEdqW +rqBuGXMpXeH6VxGPgVkJVLvKC6tZZe9VM+pnvteuMQKBgQC8GaL+Lz+al4biyZBf +JE8ekWrIotq/gfUBLP7x70+PB9bNtXtlgmTvjgYg4jiu3KR/ZIYYQ8vfVgkb6tDI +rWGZw86Pzuoi1ppg/pYhKk9qrmCIT4HPEXbHl7ATahu2BOCIU3hybjTh2lB6LbX9 +8LMFlz1QPqSZYN/A/kOcLBfa3A== +-----END PRIVATE KEY----- +"; + const RS2: &str = "-----BEGIN PRIVATE KEY----- +MIIEvgIBADANBgkqhkiG9w0BAQEFAASCBKgwggSkAgEAAoIBAQDipm6FIKSRab3J +HwmK18t7hp+pohllxIDUSPi7S5mIhN/JG2Plq2Lp746E/fuT8dcBF2R4sJlG2L0J 
+zmxOvBU/i/sQF9s1i4CEfg05k2//gKENIEsF3pMMmrH+mcZi0TTD6rezHpdVxPHk +qWxSyOCtIJV29X+wxPwAB59kQFHzy2ooPB1isZcpE8tO0KthAM+oZ3KuCwE0++cO +IWLeq9aPwyKhtip/xjTMxd1kzdKh592mGSyzr9D0QSWOYFGvgJXANDdiPdhSSOLt +ECWPNPlm2FQvGGvYYBafUqz7VumKHE6x8J6lKdYa2J0ZdDzCIo2IHzlxe+RZNgwy +uAD2jhVxAgMBAAECggEAbsZHWBu3MzcKQiVARbLoygvnN0J5xUqAaMDtiKUPejDv +K1yOu67DXnDuKEP2VL2rhuYG/hHaKE1AP227c9PrUq6424m9YvM2sgrlrdFIuQkG +LeMtp8W7+zoUasp/ssZrUqICfLIj5xCl5UuFHQT/Ar7dLlIYwa3VOLKBDb9+Dnfe +QH5/So4uMXG6vw34JN9jf+eAc8Yt0PeIz62ycvRwdpTJQ0MxZN9ZKpCAQp+VTuXT +zlzNvDMilabEdqUvAyGyz8lBLNl0wdaVrqPqAEWM5U45QXsdFZknWammP7/tijeX +0z+Bi0J0uSEU5X502zm7GArj/NNIiWMcjmDjwUUhwQKBgQD9C2GoqxOxuVPYqwYR ++Jz7f2qMjlSP8adA5Lzuh8UKXDp8JCEQC8ryweLzaOKS9C5MAw+W4W2wd4nJoQI1 +P1dgGvBlfvEeRHMgqWtq7FuTsjSe7e0uSEkC4ngDb4sc0QOpv15cMuEz+4+aFLPL +x29EcHWAaBX+rkid3zpQHFU4eQKBgQDlTCEqRuXwwa3V+Sq+mNWzD9QIGtD87TH/ +FPO/Ij/cK2+GISgFDqhetiGTH4qrvPL0psPT+iH5zGFYcoFmTtwLdWQJdxhxz0bg +iX/AceyX5e1Bm+ThT36sU83NrxKPkrdk6jNmr2iUF1OTzTwUKOYdHOPZqdMPfF4M +4XAaWVT2uQKBgQD4nKcNdU+7LE9Rr+4d1/o8Klp/0BMK/ayK2HE7lc8kt6qKb2DA +iCWUTqPw7Fq3cQrPia5WWhNP7pJEtFkcAaiR9sW7onW5fBz0uR+dhK0QtmR2xWJj +N4fsOp8ZGQ0/eae0rh1CTobucLkM9EwV6VLLlgYL67e4anlUCo8bSEr+WQKBgQCB +uf6RgqcY/RqyklPCnYlZ0zyskS9nyXKd1GbK3j+u+swP4LZZlh9f5j88k33LCA2U +qLzmMwAB6cWxWqcnELqhqPq9+ClWSmTZKDGk2U936NfAZMirSGRsbsVi9wfTPriP +WYlXMSpDjqb0WgsBhNob4npubQxCGKTFOM5Jufy90QKBgB0Lte1jX144uaXx6dtB +rjXNuWNir0Jy31wHnQuCA+XnfUgPcrKmRLm8taMbXgZwxkNvgFkpUWU8aPEK08Ne +X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL +5JiconnI5aLek0QVPoFaVXFa +-----END PRIVATE KEY----- +"; + #[tokio::test] async fn renew() { - let (rs1, jwk1) = new_rsa_jwk("1".into()); - let (rs2, jwk2) = new_rsa_jwk("2".into()); + let (rs1, jwk1) = new_rsa_jwk(RS1, "1".into()); + let (rs2, jwk2) = new_rsa_jwk(RS2, "2".into()); let (ec1, jwk3) = new_ec_jwk("3".into()); let (ec2, jwk4) = new_ec_jwk("4".into()); @@ -524,33 +686,40 @@ mod tests { struct Fetch(SocketAddr); impl FetchAuthRules for Fetch { - async fn 
fetch_auth_rules(&self) -> anyhow::Result { - Ok(AuthRules { - jwks_urls: vec![ - format!("http://{}/foo", self.0).parse().unwrap(), - format!("http://{}/bar", self.0).parse().unwrap(), - ], - }) + async fn fetch_auth_rules( + &self, + _role_name: RoleName, + ) -> anyhow::Result> { + Ok(vec![ + AuthRule { + id: "foo".to_owned(), + jwks_url: format!("http://{}/foo", self.0).parse().unwrap(), + audience: None, + }, + AuthRule { + id: "bar".to_owned(), + jwks_url: format!("http://{}/bar", self.0).parse().unwrap(), + audience: None, + }, + ]) } } + let role_name = RoleName::from("user"); + let jwk_cache = Arc::new(JwkCacheEntryLock::default()); - jwk_cache - .check_jwt(jwt1, &client, &Fetch(addr)) - .await - .unwrap(); - jwk_cache - .check_jwt(jwt2, &client, &Fetch(addr)) - .await - .unwrap(); - jwk_cache - .check_jwt(jwt3, &client, &Fetch(addr)) - .await - .unwrap(); - jwk_cache - .check_jwt(jwt4, &client, &Fetch(addr)) - .await - .unwrap(); + for token in [jwt1, jwt2, jwt3, jwt4] { + jwk_cache + .check_jwt( + &RequestMonitoring::test(), + &token, + &client, + role_name.clone(), + &Fetch(addr), + ) + .await + .unwrap(); + } } } diff --git a/proxy/src/auth/backend/local.rs b/proxy/src/auth/backend/local.rs new file mode 100644 index 0000000000..8124f568cf --- /dev/null +++ b/proxy/src/auth/backend/local.rs @@ -0,0 +1,77 @@ +use std::{collections::HashMap, net::SocketAddr}; + +use anyhow::Context; +use arc_swap::ArcSwapOption; + +use crate::{ + compute::ConnCfg, + console::{ + messages::{ColdStartInfo, EndpointJwksResponse, MetricsAuxInfo}, + NodeInfo, + }, + intern::{BranchIdInt, BranchIdTag, EndpointIdTag, InternId, ProjectIdInt, ProjectIdTag}, + RoleName, +}; + +use super::jwt::{AuthRule, FetchAuthRules, JwkCache}; + +pub struct LocalBackend { + pub(crate) jwks_cache: JwkCache, + pub(crate) node_info: NodeInfo, +} + +impl LocalBackend { + pub fn new(postgres_addr: SocketAddr) -> Self { + LocalBackend { + jwks_cache: JwkCache::default(), + node_info: NodeInfo { + 
config: { + let mut cfg = ConnCfg::new(); + cfg.host(&postgres_addr.ip().to_string()); + cfg.port(postgres_addr.port()); + cfg + }, + // TODO(conrad): make this better reflect compute info rather than endpoint info. + aux: MetricsAuxInfo { + endpoint_id: EndpointIdTag::get_interner().get_or_intern("local"), + project_id: ProjectIdTag::get_interner().get_or_intern("local"), + branch_id: BranchIdTag::get_interner().get_or_intern("local"), + cold_start_info: ColdStartInfo::WarmCached, + }, + allow_self_signed_compute: false, + }, + } + } +} + +#[derive(Clone, Copy)] +pub(crate) struct StaticAuthRules; + +pub static JWKS_ROLE_MAP: ArcSwapOption = ArcSwapOption::const_empty(); + +#[derive(Debug, Clone)] +pub struct JwksRoleSettings { + pub roles: HashMap, + pub project_id: ProjectIdInt, + pub branch_id: BranchIdInt, +} + +impl FetchAuthRules for StaticAuthRules { + async fn fetch_auth_rules(&self, role_name: RoleName) -> anyhow::Result> { + let mappings = JWKS_ROLE_MAP.load(); + let role_mappings = mappings + .as_deref() + .and_then(|m| m.roles.get(&role_name)) + .context("JWKs settings for this role were not configured")?; + let mut rules = vec![]; + for setting in &role_mappings.jwks { + rules.push(AuthRule { + id: setting.id.clone(), + jwks_url: setting.jwks_url.clone(), + audience: setting.jwt_audience.clone(), + }); + } + + Ok(rules) + } +} diff --git a/proxy/src/auth/backend/link.rs b/proxy/src/auth/backend/web.rs similarity index 86% rename from proxy/src/auth/backend/link.rs rename to proxy/src/auth/backend/web.rs index 95f4614736..58a4bef62e 100644 --- a/proxy/src/auth/backend/link.rs +++ b/proxy/src/auth/backend/web.rs @@ -13,7 +13,7 @@ use tokio_postgres::config::SslMode; use tracing::{info, info_span}; #[derive(Debug, Error)] -pub enum LinkAuthError { +pub(crate) enum WebAuthError { #[error(transparent)] WaiterRegister(#[from] waiters::RegisterError), @@ -24,18 +24,18 @@ pub enum LinkAuthError { Io(#[from] std::io::Error), } -impl UserFacingError for 
LinkAuthError { +impl UserFacingError for WebAuthError { fn to_string_client(&self) -> String { "Internal error".to_string() } } -impl ReportableError for LinkAuthError { +impl ReportableError for WebAuthError { fn get_error_kind(&self) -> crate::error::ErrorKind { match self { - LinkAuthError::WaiterRegister(_) => crate::error::ErrorKind::Service, - LinkAuthError::WaiterWait(_) => crate::error::ErrorKind::Service, - LinkAuthError::Io(_) => crate::error::ErrorKind::ClientDisconnect, + Self::WaiterRegister(_) => crate::error::ErrorKind::Service, + Self::WaiterWait(_) => crate::error::ErrorKind::Service, + Self::Io(_) => crate::error::ErrorKind::ClientDisconnect, } } } @@ -52,7 +52,7 @@ fn hello_message(redirect_uri: &reqwest::Url, session_id: &str) -> String { ) } -pub fn new_psql_session_id() -> String { +pub(crate) fn new_psql_session_id() -> String { hex::encode(rand::random::<[u8; 8]>()) } @@ -74,7 +74,7 @@ pub(super) async fn authenticate( } }; - let span = info_span!("link", psql_session_id = &psql_session_id); + let span = info_span!("web", psql_session_id = &psql_session_id); let greeting = hello_message(link_uri, &psql_session_id); // Give user a URL to spawn a new database. @@ -87,7 +87,7 @@ pub(super) async fn authenticate( // Wait for web console response (see `mgmt`). 
info!(parent: &span, "waiting for console's reply..."); - let db_info = waiter.await.map_err(LinkAuthError::from)?; + let db_info = waiter.await.map_err(WebAuthError::from)?; client.write_message_noflush(&Be::NoticeResponse("Connecting to database."))?; diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index 849e7d65e8..cba8601d14 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -16,7 +16,7 @@ use thiserror::Error; use tracing::{info, warn}; #[derive(Debug, Error, PartialEq, Eq, Clone)] -pub enum ComputeUserInfoParseError { +pub(crate) enum ComputeUserInfoParseError { #[error("Parameter '{0}' is missing in startup packet.")] MissingKey(&'static str), @@ -51,20 +51,20 @@ impl ReportableError for ComputeUserInfoParseError { /// Various client credentials which we use for authentication. /// Note that we don't store any kind of client key or password here. #[derive(Debug, Clone, PartialEq, Eq)] -pub struct ComputeUserInfoMaybeEndpoint { - pub user: RoleName, - pub endpoint_id: Option, - pub options: NeonOptions, +pub(crate) struct ComputeUserInfoMaybeEndpoint { + pub(crate) user: RoleName, + pub(crate) endpoint_id: Option, + pub(crate) options: NeonOptions, } impl ComputeUserInfoMaybeEndpoint { #[inline] - pub fn endpoint(&self) -> Option<&str> { + pub(crate) fn endpoint(&self) -> Option<&str> { self.endpoint_id.as_deref() } } -pub fn endpoint_sni( +pub(crate) fn endpoint_sni( sni: &str, common_names: &HashSet, ) -> Result, ComputeUserInfoParseError> { @@ -83,7 +83,7 @@ pub fn endpoint_sni( } impl ComputeUserInfoMaybeEndpoint { - pub fn parse( + pub(crate) fn parse( ctx: &RequestMonitoring, params: &StartupMessageParams, sni: Option<&str>, @@ -130,9 +130,12 @@ impl ComputeUserInfoMaybeEndpoint { })) } // Invariant: project name may not contain certain characters. 
- (a, b) => a.or(b).map(|name| match project_name_valid(name.as_ref()) { - false => Err(ComputeUserInfoParseError::MalformedProjectName(name)), - true => Ok(name), + (a, b) => a.or(b).map(|name| { + if project_name_valid(name.as_ref()) { + Ok(name) + } else { + Err(ComputeUserInfoParseError::MalformedProjectName(name)) + } }), } .transpose()?; @@ -170,12 +173,12 @@ impl ComputeUserInfoMaybeEndpoint { } } -pub fn check_peer_addr_is_in_list(peer_addr: &IpAddr, ip_list: &[IpPattern]) -> bool { +pub(crate) fn check_peer_addr_is_in_list(peer_addr: &IpAddr, ip_list: &[IpPattern]) -> bool { ip_list.is_empty() || ip_list.iter().any(|pattern| check_ip(peer_addr, pattern)) } #[derive(Debug, Clone, Eq, PartialEq)] -pub enum IpPattern { +pub(crate) enum IpPattern { Subnet(ipnet::IpNet), Range(IpAddr, IpAddr), Single(IpAddr), @@ -535,4 +538,17 @@ mod tests { )); Ok(()) } + + #[test] + fn test_connection_blocker() { + fn check(v: serde_json::Value) -> bool { + let peer_addr = IpAddr::from([127, 0, 0, 1]); + let ip_list: Vec = serde_json::from_value(v).unwrap(); + check_peer_addr_is_in_list(&peer_addr, &ip_list) + } + + assert!(check(json!([]))); + assert!(check(json!(["127.0.0.1"]))); + assert!(!check(json!(["255.255.255.255"]))); + } } diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs index acf7b4f6b6..f7e2b5296e 100644 --- a/proxy/src/auth/flow.rs +++ b/proxy/src/auth/flow.rs @@ -17,17 +17,20 @@ use tokio::io::{AsyncRead, AsyncWrite}; use tracing::info; /// Every authentication selector is supposed to implement this trait. -pub trait AuthMethod { +pub(crate) trait AuthMethod { /// Any authentication selector should provide initial backend message /// containing auth method name and parameters, e.g. md5 salt. fn first_message(&self, channel_binding: bool) -> BeMessage<'_>; } /// Initial state of [`AuthFlow`]. -pub struct Begin; +pub(crate) struct Begin; /// Use [SCRAM](crate::scram)-based auth in [`AuthFlow`]. 
-pub struct Scram<'a>(pub &'a scram::ServerSecret, pub &'a RequestMonitoring); +pub(crate) struct Scram<'a>( + pub(crate) &'a scram::ServerSecret, + pub(crate) &'a RequestMonitoring, +); impl AuthMethod for Scram<'_> { #[inline(always)] @@ -44,7 +47,7 @@ impl AuthMethod for Scram<'_> { /// Use an ad hoc auth flow (for clients which don't support SNI) proposed in /// . -pub struct PasswordHack; +pub(crate) struct PasswordHack; impl AuthMethod for PasswordHack { #[inline(always)] @@ -55,10 +58,10 @@ impl AuthMethod for PasswordHack { /// Use clear-text password auth called `password` in docs /// -pub struct CleartextPassword { - pub pool: Arc, - pub endpoint: EndpointIdInt, - pub secret: AuthSecret, +pub(crate) struct CleartextPassword { + pub(crate) pool: Arc, + pub(crate) endpoint: EndpointIdInt, + pub(crate) secret: AuthSecret, } impl AuthMethod for CleartextPassword { @@ -70,7 +73,7 @@ impl AuthMethod for CleartextPassword { /// This wrapper for [`PqStream`] performs client authentication. #[must_use] -pub struct AuthFlow<'a, S, State> { +pub(crate) struct AuthFlow<'a, S, State> { /// The underlying stream which implements libpq's protocol. stream: &'a mut PqStream>, /// State might contain ancillary data (see [`Self::begin`]). @@ -81,7 +84,7 @@ pub struct AuthFlow<'a, S, State> { /// Initial state of the stream wrapper. impl<'a, S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'a, S, Begin> { /// Create a new wrapper for client authentication. - pub fn new(stream: &'a mut PqStream>) -> Self { + pub(crate) fn new(stream: &'a mut PqStream>) -> Self { let tls_server_end_point = stream.get_ref().tls_server_end_point(); Self { @@ -92,7 +95,7 @@ impl<'a, S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'a, S, Begin> { } /// Move to the next step by sending auth method's name & params to client. 
- pub async fn begin(self, method: M) -> io::Result> { + pub(crate) async fn begin(self, method: M) -> io::Result> { self.stream .write_message(&method.first_message(self.tls_server_end_point.supported())) .await?; @@ -107,7 +110,7 @@ impl<'a, S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'a, S, Begin> { impl AuthFlow<'_, S, PasswordHack> { /// Perform user authentication. Raise an error in case authentication failed. - pub async fn get_password(self) -> super::Result { + pub(crate) async fn get_password(self) -> super::Result { let msg = self.stream.read_password_message().await?; let password = msg .strip_suffix(&[0]) @@ -126,7 +129,7 @@ impl AuthFlow<'_, S, PasswordHack> { impl AuthFlow<'_, S, CleartextPassword> { /// Perform user authentication. Raise an error in case authentication failed. - pub async fn authenticate(self) -> super::Result> { + pub(crate) async fn authenticate(self) -> super::Result> { let msg = self.stream.read_password_message().await?; let password = msg .strip_suffix(&[0]) @@ -151,7 +154,7 @@ impl AuthFlow<'_, S, CleartextPassword> { /// Stream wrapper for handling [SCRAM](crate::scram) auth. impl AuthFlow<'_, S, Scram<'_>> { /// Perform user authentication. Raise an error in case authentication failed. - pub async fn authenticate(self) -> super::Result> { + pub(crate) async fn authenticate(self) -> super::Result> { let Scram(secret, ctx) = self.state; // pause the timer while we communicate with the client diff --git a/proxy/src/auth/password_hack.rs b/proxy/src/auth/password_hack.rs index 2ddf46fe25..8585b8ff48 100644 --- a/proxy/src/auth/password_hack.rs +++ b/proxy/src/auth/password_hack.rs @@ -1,5 +1,5 @@ //! Payload for ad hoc authentication method for clients that don't support SNI. -//! See the `impl` for [`super::backend::BackendType`]. +//! See the `impl` for [`super::backend::Backend`]. //! Read more: . //! UPDATE (Mon Aug 8 13:20:34 UTC 2022): the payload format has been simplified. 
@@ -7,13 +7,13 @@ use bstr::ByteSlice; use crate::EndpointId; -pub struct PasswordHackPayload { - pub endpoint: EndpointId, - pub password: Vec, +pub(crate) struct PasswordHackPayload { + pub(crate) endpoint: EndpointId, + pub(crate) password: Vec, } impl PasswordHackPayload { - pub fn parse(bytes: &[u8]) -> Option { + pub(crate) fn parse(bytes: &[u8]) -> Option { // The format is `project=;` or `project=$`. let separators = [";", "$"]; for sep in separators { @@ -30,7 +30,7 @@ impl PasswordHackPayload { } } -pub fn parse_endpoint_param(bytes: &str) -> Option<&str> { +pub(crate) fn parse_endpoint_param(bytes: &str) -> Option<&str> { bytes .strip_prefix("project=") .or_else(|| bytes.strip_prefix("endpoint=")) diff --git a/proxy/src/bin/local_proxy.rs b/proxy/src/bin/local_proxy.rs new file mode 100644 index 0000000000..94365ddf05 --- /dev/null +++ b/proxy/src/bin/local_proxy.rs @@ -0,0 +1,325 @@ +use std::{ + net::SocketAddr, + path::{Path, PathBuf}, + pin::pin, + sync::Arc, + time::Duration, +}; + +use anyhow::{bail, ensure}; +use dashmap::DashMap; +use futures::{future::Either, FutureExt}; +use proxy::{ + auth::backend::local::{JwksRoleSettings, LocalBackend, JWKS_ROLE_MAP}, + cancellation::CancellationHandlerMain, + config::{self, AuthenticationConfig, HttpConfig, ProxyConfig, RetryConfig}, + console::{locks::ApiLocks, messages::JwksRoleMapping}, + http::health_server::AppMetrics, + metrics::{Metrics, ThreadPoolMetrics}, + rate_limiter::{BucketRateLimiter, EndpointRateLimiter, LeakyBucketConfig, RateBucketInfo}, + scram::threadpool::ThreadPool, + serverless::{self, cancel_set::CancelSet, GlobalConnPoolOptions}, +}; + +project_git_version!(GIT_VERSION); +project_build_tag!(BUILD_TAG); + +use clap::Parser; +use tokio::{net::TcpListener, task::JoinSet}; +use tokio_util::sync::CancellationToken; +use tracing::{error, info, warn}; +use utils::{project_build_tag, project_git_version, sentry_init::init_sentry}; + +#[global_allocator] +static GLOBAL: 
tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; + +/// Neon proxy/router +#[derive(Parser)] +#[command(version = GIT_VERSION, about)] +struct LocalProxyCliArgs { + /// listen for incoming metrics connections on ip:port + #[clap(long, default_value = "127.0.0.1:7001")] + metrics: String, + /// listen for incoming http connections on ip:port + #[clap(long)] + http: String, + /// timeout for the TLS handshake + #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] + handshake_timeout: tokio::time::Duration, + /// lock for `connect_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable). + #[clap(long, default_value = config::ConcurrencyLockOptions::DEFAULT_OPTIONS_CONNECT_COMPUTE_LOCK)] + connect_compute_lock: String, + #[clap(flatten)] + sql_over_http: SqlOverHttpArgs, + /// User rate limiter max number of requests per second. + /// + /// Provided in the form `@`. + /// Can be given multiple times for different bucket sizes. + #[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)] + user_rps_limit: Vec, + /// Whether the auth rate limiter actually takes effect (for testing) + #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] + auth_rate_limit_enabled: bool, + /// Authentication rate limiter max number of hashes per second. + #[clap(long, default_values_t = RateBucketInfo::DEFAULT_AUTH_SET)] + auth_rate_limit: Vec, + /// The IP subnet to use when considering whether two IP addresses are considered the same. 
+ #[clap(long, default_value_t = 64)] + auth_rate_limit_ip_subnet: u8, + /// Whether to retry the connection to the compute node + #[clap(long, default_value = config::RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES)] + connect_to_compute_retry: String, + /// Address of the postgres server + #[clap(long, default_value = "127.0.0.1:5432")] + compute: SocketAddr, + /// File address of the local proxy config file + #[clap(long, default_value = "./localproxy.json")] + config_path: PathBuf, +} + +#[derive(clap::Args, Clone, Copy, Debug)] +struct SqlOverHttpArgs { + /// How many connections to pool for each endpoint. Excess connections are discarded + #[clap(long, default_value_t = 200)] + sql_over_http_pool_max_total_conns: usize, + + /// How long pooled connections should remain idle for before closing + #[clap(long, default_value = "5m", value_parser = humantime::parse_duration)] + sql_over_http_idle_timeout: tokio::time::Duration, + + #[clap(long, default_value_t = 100)] + sql_over_http_client_conn_threshold: u64, + + #[clap(long, default_value_t = 16)] + sql_over_http_cancel_set_shards: usize, + + #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB + sql_over_http_max_request_size_bytes: u64, + + #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB + sql_over_http_max_response_size_bytes: usize, +} + +#[tokio::main] +async fn main() -> anyhow::Result<()> { + let _logging_guard = proxy::logging::init().await?; + let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook(); + let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); + + Metrics::install(Arc::new(ThreadPoolMetrics::new(0))); + + info!("Version: {GIT_VERSION}"); + info!("Build_tag: {BUILD_TAG}"); + let neon_metrics = ::metrics::NeonMetrics::new(::metrics::BuildInfo { + revision: GIT_VERSION, + build_tag: BUILD_TAG, + }); + + let jemalloc = match proxy::jemalloc::MetricRecorder::new() { + Ok(t) => Some(t), + Err(e) => { + tracing::error!(error = ?e, "could not 
start jemalloc metrics loop"); + None + } + }; + + let args = LocalProxyCliArgs::parse(); + let config = build_config(&args)?; + + let metrics_listener = TcpListener::bind(args.metrics).await?.into_std()?; + let http_listener = TcpListener::bind(args.http).await?; + let shutdown = CancellationToken::new(); + + // todo: should scale with CU + let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards( + LeakyBucketConfig { + rps: 10.0, + max: 100.0, + }, + 16, + )); + + refresh_config(args.config_path.clone()).await; + + let mut maintenance_tasks = JoinSet::new(); + maintenance_tasks.spawn(proxy::handle_signals(shutdown.clone(), move || { + refresh_config(args.config_path.clone()).map(Ok) + })); + maintenance_tasks.spawn(proxy::http::health_server::task_main( + metrics_listener, + AppMetrics { + jemalloc, + neon_metrics, + proxy: proxy::metrics::Metrics::get(), + }, + )); + + let task = serverless::task_main( + config, + http_listener, + shutdown.clone(), + Arc::new(CancellationHandlerMain::new( + Arc::new(DashMap::new()), + None, + proxy::metrics::CancellationSource::Local, + )), + endpoint_rate_limiter, + ); + + match futures::future::select(pin!(maintenance_tasks.join_next()), pin!(task)).await { + // exit immediately on maintenance task completion + Either::Left((Some(res), _)) => match proxy::flatten_err(res)? {}, + // exit with error immediately if all maintenance tasks have ceased (should be caught by branch above) + Either::Left((None, _)) => bail!("no maintenance tasks running. invalid state"), + // exit immediately on client task error + Either::Right((res, _)) => res?, + } + + Ok(()) +} + +/// ProxyConfig is created at proxy startup, and lives forever. 
+fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { + let config::ConcurrencyLockOptions { + shards, + limiter, + epoch, + timeout, + } = args.connect_compute_lock.parse()?; + info!( + ?limiter, + shards, + ?epoch, + "Using NodeLocks (connect_compute)" + ); + let connect_compute_locks = ApiLocks::new( + "connect_compute_lock", + limiter, + shards, + timeout, + epoch, + &Metrics::get().proxy.connect_compute_lock, + )?; + + let http_config = HttpConfig { + accept_websockets: false, + pool_options: GlobalConnPoolOptions { + gc_epoch: Duration::from_secs(60), + pool_shards: 2, + idle_timeout: args.sql_over_http.sql_over_http_idle_timeout, + opt_in: false, + + max_conns_per_endpoint: args.sql_over_http.sql_over_http_pool_max_total_conns, + max_total_conns: args.sql_over_http.sql_over_http_pool_max_total_conns, + }, + cancel_set: CancelSet::new(args.sql_over_http.sql_over_http_cancel_set_shards), + client_conn_threshold: args.sql_over_http.sql_over_http_client_conn_threshold, + max_request_size_bytes: args.sql_over_http.sql_over_http_max_request_size_bytes, + max_response_size_bytes: args.sql_over_http.sql_over_http_max_response_size_bytes, + }; + + Ok(Box::leak(Box::new(ProxyConfig { + tls_config: None, + auth_backend: proxy::auth::Backend::Local(proxy::auth::backend::MaybeOwned::Owned( + LocalBackend::new(args.compute), + )), + metric_collection: None, + allow_self_signed_compute: false, + http_config, + authentication_config: AuthenticationConfig { + thread_pool: ThreadPool::new(0), + scram_protocol_timeout: Duration::from_secs(10), + rate_limiter_enabled: false, + rate_limiter: BucketRateLimiter::new(vec![]), + rate_limit_ip_subnet: 64, + ip_allowlist_check_enabled: true, + }, + require_client_ip: false, + handshake_timeout: Duration::from_secs(10), + region: "local".into(), + wake_compute_retry_config: RetryConfig::parse(RetryConfig::WAKE_COMPUTE_DEFAULT_VALUES)?, + connect_compute_locks, + connect_to_compute_retry_config: 
RetryConfig::parse( + RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES, + )?, + }))) +} + +async fn refresh_config(path: PathBuf) { + match refresh_config_inner(&path).await { + Ok(()) => {} + Err(e) => { + error!(error=?e, ?path, "could not read config file"); + } + } +} + +async fn refresh_config_inner(path: &Path) -> anyhow::Result<()> { + let bytes = tokio::fs::read(&path).await?; + let mut data: JwksRoleMapping = serde_json::from_slice(&bytes)?; + + let mut settings = None; + + for mapping in data.roles.values_mut() { + for jwks in &mut mapping.jwks { + ensure!( + jwks.jwks_url.has_authority() + && (jwks.jwks_url.scheme() == "http" || jwks.jwks_url.scheme() == "https"), + "Invalid JWKS url. Must be HTTP", + ); + + ensure!( + jwks.jwks_url + .host() + .is_some_and(|h| h != url::Host::Domain("")), + "Invalid JWKS url. No domain listed", + ); + + // clear username, password and ports + jwks.jwks_url.set_username("").expect( + "url can be a base and has a valid host and is not a file. should not error", + ); + jwks.jwks_url.set_password(None).expect( + "url can be a base and has a valid host and is not a file. should not error", + ); + // local testing is hard if we need to have a specific restricted port + if cfg!(not(feature = "testing")) { + jwks.jwks_url.set_port(None).expect( + "url can be a base and has a valid host and is not a file. should not error", + ); + } + + // clear query params + jwks.jwks_url.set_fragment(None); + jwks.jwks_url.query_pairs_mut().clear().finish(); + + if jwks.jwks_url.scheme() != "https" { + // local testing is hard if we need to set up https support. 
+ if cfg!(not(feature = "testing")) { + jwks.jwks_url + .set_scheme("https") + .expect("should not error to set the scheme to https if it was http"); + } else { + warn!(scheme = jwks.jwks_url.scheme(), "JWKS url is not HTTPS"); + } + } + + let (pr, br) = settings.get_or_insert((jwks.project_id, jwks.branch_id)); + ensure!( + *pr == jwks.project_id, + "inconsistent project IDs configured" + ); + ensure!(*br == jwks.branch_id, "inconsistent branch IDs configured"); + } + } + + if let Some((project_id, branch_id)) = settings { + JWKS_ROLE_MAP.store(Some(Arc::new(JwksRoleSettings { + roles: data.roles, + project_id, + branch_id, + }))); + } + + Ok(()) +} diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index 1038fa5116..20d2d3df9a 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -133,7 +133,9 @@ async fn main() -> anyhow::Result<()> { proxy_listener, cancellation_token.clone(), )); - let signals_task = tokio::spawn(proxy::handle_signals(cancellation_token)); + let signals_task = tokio::spawn(proxy::handle_signals(cancellation_token, || async { + Ok(()) + })); // the signal task cant ever succeed. // the main task can error, or can succeed on cancellation. diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index d83a1f3bcf..2ac66ffe8c 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -60,11 +60,15 @@ use clap::{Parser, ValueEnum}; static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; #[derive(Clone, Debug, ValueEnum)] -enum AuthBackend { +enum AuthBackendType { Console, + // clap only shows the name, not the alias, in usage text. 
+ // TODO: swap name/alias and deprecate "link" + #[value(name("link"), alias("web"))] + Web, + #[cfg(feature = "testing")] Postgres, - Link, } /// Neon proxy/router @@ -77,8 +81,8 @@ struct ProxyCliArgs { /// listen for incoming client connections on ip:port #[clap(short, long, default_value = "127.0.0.1:4432")] proxy: String, - #[clap(value_enum, long, default_value_t = AuthBackend::Link)] - auth_backend: AuthBackend, + #[clap(value_enum, long, default_value_t = AuthBackendType::Web)] + auth_backend: AuthBackendType, /// listen for management callback connection on ip:port #[clap(short, long, default_value = "127.0.0.1:7000")] mgmt: String, @@ -88,7 +92,7 @@ struct ProxyCliArgs { /// listen for incoming wss connections on ip:port #[clap(long)] wss: Option, - /// redirect unauthenticated users to the given uri in case of link auth + /// redirect unauthenticated users to the given uri in case of web auth #[clap(short, long, default_value = "http://localhost:3000/psql_session/")] uri: String, /// cloud API endpoint for authenticating users @@ -148,7 +152,7 @@ struct ProxyCliArgs { disable_dynamic_rate_limiter: bool, /// Endpoint rate limiter max number of requests per second. /// - /// Provided in the form '@'. + /// Provided in the form `@`. /// Can be given multiple times for different bucket sizes. 
#[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)] endpoint_rps_limit: Vec, @@ -221,6 +225,10 @@ struct ProxyCliArgs { /// Whether to retry the wake_compute request #[clap(long, default_value = config::RetryConfig::WAKE_COMPUTE_DEFAULT_VALUES)] wake_compute_retry: String, + + /// Configure if this is a private access proxy for the POC: In that case the proxy will ignore the IP allowlist + #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] + is_private_access_proxy: bool, } #[derive(clap::Args, Clone, Copy, Debug)] @@ -261,6 +269,12 @@ struct SqlOverHttpArgs { #[clap(long, default_value_t = 64)] sql_over_http_cancel_set_shards: usize, + + #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB + sql_over_http_max_request_size_bytes: u64, + + #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB + sql_over_http_max_response_size_bytes: usize, } #[tokio::main] @@ -447,7 +461,10 @@ async fn main() -> anyhow::Result<()> { // maintenance tasks. 
these never return unless there's an error let mut maintenance_tasks = JoinSet::new(); - maintenance_tasks.spawn(proxy::handle_signals(cancellation_token.clone())); + maintenance_tasks.spawn(proxy::handle_signals( + cancellation_token.clone(), + || async { Ok(()) }, + )); maintenance_tasks.spawn(http::health_server::task_main( http_listener, AppMetrics { @@ -467,7 +484,7 @@ async fn main() -> anyhow::Result<()> { )); } - if let auth::BackendType::Console(api, _) = &config.auth_backend { + if let auth::Backend::Console(api, _) = &config.auth_backend { if let proxy::console::provider::ConsoleBackend::Console(api) = &**api { match (redis_notifications_client, regional_redis_client.clone()) { (None, None) => {} @@ -572,7 +589,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { } let auth_backend = match &args.auth_backend { - AuthBackend::Console => { + AuthBackendType::Console => { let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; let project_info_cache_config: ProjectInfoCacheOptions = args.project_info_cache.parse()?; @@ -621,18 +638,20 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { wake_compute_endpoint_rate_limiter, ); let api = console::provider::ConsoleBackend::Console(api); - auth::BackendType::Console(MaybeOwned::Owned(api), ()) + auth::Backend::Console(MaybeOwned::Owned(api), ()) } - #[cfg(feature = "testing")] - AuthBackend::Postgres => { - let url = args.auth_endpoint.parse()?; - let api = console::provider::mock::Api::new(url); - let api = console::provider::ConsoleBackend::Postgres(api); - auth::BackendType::Console(MaybeOwned::Owned(api), ()) - } - AuthBackend::Link => { + + AuthBackendType::Web => { let url = args.uri.parse()?; - auth::BackendType::Link(MaybeOwned::Owned(url), ()) + auth::Backend::Web(MaybeOwned::Owned(url), ()) + } + + #[cfg(feature = "testing")] + AuthBackendType::Postgres => { + let url = args.auth_endpoint.parse()?; + let api = 
console::provider::mock::Api::new(url, !args.is_private_access_proxy); + let api = console::provider::ConsoleBackend::Postgres(api); + auth::Backend::Console(MaybeOwned::Owned(api), ()) } }; @@ -669,6 +688,8 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { }, cancel_set: CancelSet::new(args.sql_over_http.sql_over_http_cancel_set_shards), client_conn_threshold: args.sql_over_http.sql_over_http_client_conn_threshold, + max_request_size_bytes: args.sql_over_http.sql_over_http_max_request_size_bytes, + max_response_size_bytes: args.sql_over_http.sql_over_http_max_response_size_bytes, }; let authentication_config = AuthenticationConfig { thread_pool, @@ -676,6 +697,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { rate_limiter_enabled: args.auth_rate_limit_enabled, rate_limiter: AuthRateLimiter::new(args.auth_rate_limit.clone()), rate_limit_ip_subnet: args.auth_rate_limit_ip_subnet, + ip_allowlist_check_enabled: !args.is_private_access_proxy, }; let config = Box::leak(Box::new(ProxyConfig { diff --git a/proxy/src/cache.rs b/proxy/src/cache.rs index d1d4087241..6c168144a7 100644 --- a/proxy/src/cache.rs +++ b/proxy/src/cache.rs @@ -1,7 +1,7 @@ -pub mod common; -pub mod endpoints; -pub mod project_info; +pub(crate) mod common; +pub(crate) mod endpoints; +pub(crate) mod project_info; mod timed_lru; -pub use common::{Cache, Cached}; -pub use timed_lru::TimedLru; +pub(crate) use common::{Cache, Cached}; +pub(crate) use timed_lru::TimedLru; diff --git a/proxy/src/cache/common.rs b/proxy/src/cache/common.rs index 82c78e3eb2..b5caf94788 100644 --- a/proxy/src/cache/common.rs +++ b/proxy/src/cache/common.rs @@ -3,7 +3,7 @@ use std::ops::{Deref, DerefMut}; /// A generic trait which exposes types of cache's key and value, /// as well as the notion of cache entry invalidation. /// This is useful for [`Cached`]. -pub trait Cache { +pub(crate) trait Cache { /// Entry's key. 
type Key; @@ -29,21 +29,21 @@ impl Cache for &C { } /// Wrapper for convenient entry invalidation. -pub struct Cached::Value> { +pub(crate) struct Cached::Value> { /// Cache + lookup info. - pub token: Option<(C, C::LookupInfo)>, + pub(crate) token: Option<(C, C::LookupInfo)>, /// The value itself. - pub value: V, + pub(crate) value: V, } impl Cached { /// Place any entry into this wrapper; invalidation will be a no-op. - pub fn new_uncached(value: V) -> Self { + pub(crate) fn new_uncached(value: V) -> Self { Self { token: None, value } } - pub fn take_value(self) -> (Cached, V) { + pub(crate) fn take_value(self) -> (Cached, V) { ( Cached { token: self.token, @@ -53,7 +53,7 @@ impl Cached { ) } - pub fn map(self, f: impl FnOnce(V) -> U) -> Cached { + pub(crate) fn map(self, f: impl FnOnce(V) -> U) -> Cached { Cached { token: self.token, value: f(self.value), @@ -61,7 +61,7 @@ impl Cached { } /// Drop this entry from a cache if it's still there. - pub fn invalidate(self) -> V { + pub(crate) fn invalidate(self) -> V { if let Some((cache, info)) = &self.token { cache.invalidate(info); } @@ -69,7 +69,7 @@ impl Cached { } /// Tell if this entry is actually cached. 
- pub fn cached(&self) -> bool { + pub(crate) fn cached(&self) -> bool { self.token.is_some() } } diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs index 8c851790c2..27121ce89e 100644 --- a/proxy/src/cache/endpoints.rs +++ b/proxy/src/cache/endpoints.rs @@ -28,7 +28,7 @@ use crate::{ }; #[derive(Deserialize, Debug, Clone)] -pub struct ControlPlaneEventKey { +pub(crate) struct ControlPlaneEventKey { endpoint_created: Option, branch_created: Option, project_created: Option, @@ -56,7 +56,7 @@ pub struct EndpointsCache { } impl EndpointsCache { - pub fn new(config: EndpointCacheConfig) -> Self { + pub(crate) fn new(config: EndpointCacheConfig) -> Self { Self { limiter: Arc::new(Mutex::new(GlobalRateLimiter::new( config.limiter_info.clone(), @@ -68,7 +68,7 @@ impl EndpointsCache { ready: AtomicBool::new(false), } } - pub async fn is_valid(&self, ctx: &RequestMonitoring, endpoint: &EndpointId) -> bool { + pub(crate) async fn is_valid(&self, ctx: &RequestMonitoring, endpoint: &EndpointId) -> bool { if !self.ready.load(Ordering::Acquire) { return true; } @@ -242,6 +242,6 @@ mod tests { #[test] fn test() { let s = "{\"branch_created\":null,\"endpoint_created\":{\"endpoint_id\":\"ep-rapid-thunder-w0qqw2q9\"},\"project_created\":null,\"type\":\"endpoint_created\"}"; - let _: ControlPlaneEventKey = serde_json::from_str(s).unwrap(); + serde_json::from_str::(s).unwrap(); } } diff --git a/proxy/src/cache/project_info.rs b/proxy/src/cache/project_info.rs index 10cc4ceee1..ceae74a9a0 100644 --- a/proxy/src/cache/project_info.rs +++ b/proxy/src/cache/project_info.rs @@ -24,7 +24,7 @@ use crate::{ use super::{Cache, Cached}; #[async_trait] -pub trait ProjectInfoCache { +pub(crate) trait ProjectInfoCache { fn invalidate_allowed_ips_for_project(&self, project_id: ProjectIdInt); fn invalidate_role_secret_for_project(&self, project_id: ProjectIdInt, role_name: RoleNameInt); async fn decrement_active_listeners(&self); @@ -37,7 +37,7 @@ struct Entry { } impl Entry { 
- pub fn new(value: T) -> Self { + pub(crate) fn new(value: T) -> Self { Self { created_at: Instant::now(), value, @@ -64,7 +64,7 @@ impl EndpointInfo { Some(t) => t < created_at, } } - pub fn get_role_secret( + pub(crate) fn get_role_secret( &self, role_name: RoleNameInt, valid_since: Instant, @@ -81,7 +81,7 @@ impl EndpointInfo { None } - pub fn get_allowed_ips( + pub(crate) fn get_allowed_ips( &self, valid_since: Instant, ignore_cache_since: Option, @@ -96,10 +96,10 @@ impl EndpointInfo { } None } - pub fn invalidate_allowed_ips(&mut self) { + pub(crate) fn invalidate_allowed_ips(&mut self) { self.allowed_ips = None; } - pub fn invalidate_role_secret(&mut self, role_name: RoleNameInt) { + pub(crate) fn invalidate_role_secret(&mut self, role_name: RoleNameInt) { self.secret.remove(&role_name); } } @@ -178,7 +178,7 @@ impl ProjectInfoCache for ProjectInfoCacheImpl { } impl ProjectInfoCacheImpl { - pub fn new(config: ProjectInfoCacheOptions) -> Self { + pub(crate) fn new(config: ProjectInfoCacheOptions) -> Self { Self { cache: DashMap::new(), project2ep: DashMap::new(), @@ -189,7 +189,7 @@ impl ProjectInfoCacheImpl { } } - pub fn get_role_secret( + pub(crate) fn get_role_secret( &self, endpoint_id: &EndpointId, role_name: &RoleName, @@ -212,7 +212,7 @@ impl ProjectInfoCacheImpl { } Some(Cached::new_uncached(value)) } - pub fn get_allowed_ips( + pub(crate) fn get_allowed_ips( &self, endpoint_id: &EndpointId, ) -> Option>>> { @@ -230,7 +230,7 @@ impl ProjectInfoCacheImpl { } Some(Cached::new_uncached(value)) } - pub fn insert_role_secret( + pub(crate) fn insert_role_secret( &self, project_id: ProjectIdInt, endpoint_id: EndpointIdInt, @@ -247,7 +247,7 @@ impl ProjectInfoCacheImpl { entry.secret.insert(role_name, secret.into()); } } - pub fn insert_allowed_ips( + pub(crate) fn insert_allowed_ips( &self, project_id: ProjectIdInt, endpoint_id: EndpointIdInt, @@ -274,13 +274,13 @@ impl ProjectInfoCacheImpl { let ttl_disabled_since_us = self .ttl_disabled_since_us 
.load(std::sync::atomic::Ordering::Relaxed); - let ignore_cache_since = if ttl_disabled_since_us != u64::MAX { + let ignore_cache_since = if ttl_disabled_since_us == u64::MAX { + None + } else { let ignore_cache_since = self.start_time + Duration::from_micros(ttl_disabled_since_us); // We are fine if entry is not older than ttl or was added before we are getting notifications. valid_since = valid_since.min(ignore_cache_since); Some(ignore_cache_since) - } else { - None }; (valid_since, ignore_cache_since) } @@ -306,7 +306,7 @@ impl ProjectInfoCacheImpl { let mut removed = 0; let shard = self.project2ep.shards()[shard].write(); for (_, endpoints) in shard.iter() { - for endpoint in endpoints.get().iter() { + for endpoint in endpoints.get() { self.cache.remove(endpoint); removed += 1; } @@ -319,7 +319,7 @@ impl ProjectInfoCacheImpl { /// Lookup info for project info cache. /// This is used to invalidate cache entries. -pub struct CachedLookupInfo { +pub(crate) struct CachedLookupInfo { /// Search by this key. endpoint_id: EndpointIdInt, lookup_type: LookupType, diff --git a/proxy/src/cache/timed_lru.rs b/proxy/src/cache/timed_lru.rs index 07fad56643..5b08d74696 100644 --- a/proxy/src/cache/timed_lru.rs +++ b/proxy/src/cache/timed_lru.rs @@ -16,7 +16,7 @@ use tracing::debug; // On the other hand, `hashlink` has good download stats and appears to be maintained. use hashlink::{linked_hash_map::RawEntryMut, LruCache}; -use super::{common::Cached, *}; +use super::{common::Cached, timed_lru, Cache}; /// An implementation of timed LRU cache with fixed capacity. /// Key properties: @@ -39,7 +39,7 @@ use super::{common::Cached, *}; /// /// * It's possible for an entry that has not yet expired entry to be evicted /// before expired items. That's a bit wasteful, but probably fine in practice. -pub struct TimedLru { +pub(crate) struct TimedLru { /// Cache's name for tracing. 
name: &'static str, @@ -72,7 +72,7 @@ struct Entry { impl TimedLru { /// Construct a new LRU cache with timed entries. - pub fn new( + pub(crate) fn new( name: &'static str, capacity: usize, ttl: Duration, @@ -207,11 +207,11 @@ impl TimedLru { } impl TimedLru { - pub fn insert_ttl(&self, key: K, value: V, ttl: Duration) { + pub(crate) fn insert_ttl(&self, key: K, value: V, ttl: Duration) { self.insert_raw_ttl(key, value, ttl, false); } - pub fn insert_unit(&self, key: K, value: V) -> (Option, Cached<&Self, ()>) { + pub(crate) fn insert_unit(&self, key: K, value: V) -> (Option, Cached<&Self, ()>) { let (created_at, old) = self.insert_raw(key.clone(), value); let cached = Cached { @@ -221,22 +221,11 @@ impl TimedLru { (old, cached) } - - pub fn insert(&self, key: K, value: V) -> (Option, Cached<&Self>) { - let (created_at, old) = self.insert_raw(key.clone(), value.clone()); - - let cached = Cached { - token: Some((self, LookupInfo { created_at, key })), - value, - }; - - (old, cached) - } } impl TimedLru { /// Retrieve a cached entry in convenient wrapper. - pub fn get(&self, key: &Q) -> Option> + pub(crate) fn get(&self, key: &Q) -> Option> where K: Borrow + Clone, Q: Hash + Eq + ?Sized, @@ -253,32 +242,10 @@ impl TimedLru { } }) } - - /// Retrieve a cached entry in convenient wrapper, ignoring its TTL. - pub fn get_ignoring_ttl(&self, key: &Q) -> Option> - where - K: Borrow, - Q: Hash + Eq + ?Sized, - { - let mut cache = self.cache.lock(); - cache - .get(key) - .map(|entry| Cached::new_uncached(entry.value.clone())) - } - - /// Remove an entry from the cache. - pub fn remove(&self, key: &Q) -> Option - where - K: Borrow + Clone, - Q: Hash + Eq + ?Sized, - { - let mut cache = self.cache.lock(); - cache.remove(key).map(|entry| entry.value) - } } /// Lookup information for key invalidation. -pub struct LookupInfo { +pub(crate) struct LookupInfo { /// Time of creation of a cache [`Entry`]. 
/// We use this during invalidation lookups to prevent eviction of a newer /// entry sharing the same key (it might've been inserted by a different diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index 34512e9f5b..71a2a16af8 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -18,7 +18,7 @@ use crate::{ pub type CancelMap = Arc>>; pub type CancellationHandlerMain = CancellationHandler>>>; -pub type CancellationHandlerMainInternal = Option>>; +pub(crate) type CancellationHandlerMainInternal = Option>>; /// Enables serving `CancelRequest`s. /// @@ -32,7 +32,7 @@ pub struct CancellationHandler

{ } #[derive(Debug, Error)] -pub enum CancelError { +pub(crate) enum CancelError { #[error("{0}")] IO(#[from] std::io::Error), #[error("{0}")] @@ -53,7 +53,7 @@ impl ReportableError for CancelError { impl CancellationHandler

{ /// Run async action within an ephemeral session identified by [`CancelKeyData`]. - pub fn get_session(self: Arc) -> Session

{ + pub(crate) fn get_session(self: Arc) -> Session

{ // HACK: We'd rather get the real backend_pid but tokio_postgres doesn't // expose it and we don't want to do another roundtrip to query // for it. The client will be able to notice that this is not the @@ -81,7 +81,7 @@ impl CancellationHandler

{ } /// Try to cancel a running query for the corresponding connection. /// If the cancellation key is not found, it will be published to Redis. - pub async fn cancel_session( + pub(crate) async fn cancel_session( &self, key: CancelKeyData, session_id: Uuid, @@ -155,14 +155,14 @@ pub struct CancelClosure { } impl CancelClosure { - pub fn new(socket_addr: SocketAddr, cancel_token: CancelToken) -> Self { + pub(crate) fn new(socket_addr: SocketAddr, cancel_token: CancelToken) -> Self { Self { socket_addr, cancel_token, } } /// Cancels the query running on user's compute node. - pub async fn try_cancel_query(self) -> Result<(), CancelError> { + pub(crate) async fn try_cancel_query(self) -> Result<(), CancelError> { let socket = TcpStream::connect(self.socket_addr).await?; self.cancel_token.cancel_query_raw(socket, NoTls).await?; info!("query was cancelled"); @@ -171,7 +171,7 @@ impl CancelClosure { } /// Helper for registering query cancellation tokens. -pub struct Session

{ +pub(crate) struct Session

{ /// The user-facing key identifying this session. key: CancelKeyData, /// The [`CancelMap`] this session belongs to. @@ -181,7 +181,7 @@ pub struct Session

{ impl

Session

{ /// Store the cancel token for the given session. /// This enables query cancellation in `crate::proxy::prepare_client_connection`. - pub fn enable_query_cancellation(&self, cancel_closure: CancelClosure) -> CancelKeyData { + pub(crate) fn enable_query_cancellation(&self, cancel_closure: CancelClosure) -> CancelKeyData { info!("enabling query cancellation for this session"); self.cancellation_handler .map @@ -220,7 +220,8 @@ mod tests { #[tokio::test] async fn cancel_session_noop_regression() { - let handler = CancellationHandler::<()>::new(Default::default(), CancellationSource::Local); + let handler = + CancellationHandler::<()>::new(CancelMap::default(), CancellationSource::Local); handler .cancel_session( CancelKeyData { diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index c071a59d58..8d3cb8ee3c 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -23,7 +23,7 @@ use tracing::{error, info, warn}; const COULD_NOT_CONNECT: &str = "Couldn't connect to compute node"; #[derive(Debug, Error)] -pub enum ConnectionError { +pub(crate) enum ConnectionError { /// This error doesn't seem to reveal any secrets; for instance, /// `tokio_postgres::error::Kind` doesn't contain ip addresses and such. #[error("{COULD_NOT_CONNECT}: {0}")] @@ -86,22 +86,22 @@ impl ReportableError for ConnectionError { } /// A pair of `ClientKey` & `ServerKey` for `SCRAM-SHA-256`. -pub type ScramKeys = tokio_postgres::config::ScramKeys<32>; +pub(crate) type ScramKeys = tokio_postgres::config::ScramKeys<32>; /// A config for establishing a connection to compute node. /// Eventually, `tokio_postgres` will be replaced with something better. /// Newtype allows us to implement methods on top of it. #[derive(Clone, Default)] -pub struct ConnCfg(Box); +pub(crate) struct ConnCfg(Box); /// Creation and initialization routines. impl ConnCfg { - pub fn new() -> Self { + pub(crate) fn new() -> Self { Self::default() } /// Reuse password or auth keys from the other config. 
- pub fn reuse_password(&mut self, other: Self) { + pub(crate) fn reuse_password(&mut self, other: Self) { if let Some(password) = other.get_password() { self.password(password); } @@ -111,7 +111,7 @@ impl ConnCfg { } } - pub fn get_host(&self) -> Result { + pub(crate) fn get_host(&self) -> Result { match self.0.get_hosts() { [tokio_postgres::config::Host::Tcp(s)] => Ok(s.into()), // we should not have multiple address or unix addresses. @@ -122,15 +122,15 @@ impl ConnCfg { } /// Apply startup message params to the connection config. - pub fn set_startup_params(&mut self, params: &StartupMessageParams) { + pub(crate) fn set_startup_params(&mut self, params: &StartupMessageParams) { // Only set `user` if it's not present in the config. - // Link auth flow takes username from the console's response. + // Web auth flow takes username from the console's response. if let (None, Some(user)) = (self.get_user(), params.get("user")) { self.user(user); } // Only set `dbname` if it's not present in the config. - // Link auth flow takes dbname from the console's response. + // Web auth flow takes dbname from the console's response. if let (None, Some(dbname)) = (self.get_dbname(), params.get("database")) { self.dbname(dbname); } @@ -255,25 +255,25 @@ impl ConnCfg { } } -pub struct PostgresConnection { +pub(crate) struct PostgresConnection { /// Socket connected to a compute node. - pub stream: tokio_postgres::maybe_tls_stream::MaybeTlsStream< + pub(crate) stream: tokio_postgres::maybe_tls_stream::MaybeTlsStream< tokio::net::TcpStream, tokio_postgres_rustls::RustlsStream, >, /// PostgreSQL connection parameters. - pub params: std::collections::HashMap, + pub(crate) params: std::collections::HashMap, /// Query cancellation token. - pub cancel_closure: CancelClosure, + pub(crate) cancel_closure: CancelClosure, /// Labels for proxy's metrics. 
- pub aux: MetricsAuxInfo, + pub(crate) aux: MetricsAuxInfo, _guage: NumDbConnectionsGuard<'static>, } impl ConnCfg { /// Connect to a corresponding compute node. - pub async fn connect( + pub(crate) async fn connect( &self, ctx: &RequestMonitoring, allow_self_signed_compute: bool, @@ -286,7 +286,7 @@ impl ConnCfg { let client_config = if allow_self_signed_compute { // Allow all certificates for creating the connection - let verifier = Arc::new(AcceptEverythingVerifier) as Arc; + let verifier = Arc::new(AcceptEverythingVerifier); rustls::ClientConfig::builder() .dangerous() .with_custom_certificate_verifier(verifier) diff --git a/proxy/src/config.rs b/proxy/src/config.rs index a280aa88ce..373e4cf650 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -25,7 +25,7 @@ use x509_parser::oid_registry; pub struct ProxyConfig { pub tls_config: Option, - pub auth_backend: auth::BackendType<'static, (), ()>, + pub auth_backend: auth::Backend<'static, (), ()>, pub metric_collection: Option, pub allow_self_signed_compute: bool, pub http_config: HttpConfig, @@ -56,6 +56,8 @@ pub struct HttpConfig { pub pool_options: GlobalConnPoolOptions, pub cancel_set: CancelSet, pub client_conn_threshold: u64, + pub max_request_size_bytes: u64, + pub max_response_size_bytes: usize, } pub struct AuthenticationConfig { @@ -64,6 +66,7 @@ pub struct AuthenticationConfig { pub rate_limiter_enabled: bool, pub rate_limiter: AuthRateLimiter, pub rate_limit_ip_subnet: u8, + pub ip_allowlist_check_enabled: bool, } impl TlsConfig { @@ -247,7 +250,7 @@ impl CertResolver { let common_name = pem.subject().to_string(); - // We only use non-wildcard certificates in link proxy so it seems okay to treat them the same as + // We only use non-wildcard certificates in web auth proxy so it seems okay to treat them the same as // wildcard ones as we don't use SNI there. That treatment only affects certificate selection, so // verify-full will still check wildcard match. 
Old coding here just ignored non-wildcard common names // and passed None instead, which blows up number of cases downstream code should handle. Proper coding @@ -318,7 +321,7 @@ impl CertResolver { // a) Instead of multi-cert approach use single cert with extra // domains listed in Subject Alternative Name (SAN). // b) Deploy separate proxy instances for extra domains. - self.default.as_ref().cloned() + self.default.clone() } } } diff --git a/proxy/src/console.rs b/proxy/src/console.rs index ea95e83437..87d8e781aa 100644 --- a/proxy/src/console.rs +++ b/proxy/src/console.rs @@ -10,7 +10,7 @@ pub(crate) use provider::{errors, Api, AuthSecret, CachedNodeInfo, NodeInfo}; /// Various cache-related types. pub mod caches { - pub use super::provider::{ApiCaches, NodeInfoCache}; + pub use super::provider::ApiCaches; } /// Various cache-related types. diff --git a/proxy/src/console/messages.rs b/proxy/src/console/messages.rs index ac66e116d0..9b66333cd4 100644 --- a/proxy/src/console/messages.rs +++ b/proxy/src/console/messages.rs @@ -1,31 +1,33 @@ use measured::FixedCardinalityLabel; use serde::{Deserialize, Serialize}; +use std::collections::HashMap; use std::fmt::{self, Display}; use crate::auth::IpPattern; use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt}; use crate::proxy::retry::CouldRetry; +use crate::RoleName; /// Generic error response with human-readable description. /// Note that we can't always present it to user as is. 
#[derive(Debug, Deserialize, Clone)] -pub struct ConsoleError { - pub error: Box, +pub(crate) struct ConsoleError { + pub(crate) error: Box, #[serde(skip)] - pub http_status_code: http::StatusCode, - pub status: Option, + pub(crate) http_status_code: http::StatusCode, + pub(crate) status: Option, } impl ConsoleError { - pub fn get_reason(&self) -> Reason { + pub(crate) fn get_reason(&self) -> Reason { self.status .as_ref() .and_then(|s| s.details.error_info.as_ref()) .map_or(Reason::Unknown, |e| e.reason) } - pub fn get_user_facing_message(&self) -> String { + pub(crate) fn get_user_facing_message(&self) -> String { use super::provider::errors::REQUEST_FAILED; self.status .as_ref() @@ -86,27 +88,28 @@ impl CouldRetry for ConsoleError { } #[derive(Debug, Deserialize, Clone)] -pub struct Status { - pub code: Box, - pub message: Box, - pub details: Details, +#[allow(dead_code)] +pub(crate) struct Status { + pub(crate) code: Box, + pub(crate) message: Box, + pub(crate) details: Details, } #[derive(Debug, Deserialize, Clone)] -pub struct Details { - pub error_info: Option, - pub retry_info: Option, - pub user_facing_message: Option, +pub(crate) struct Details { + pub(crate) error_info: Option, + pub(crate) retry_info: Option, + pub(crate) user_facing_message: Option, } #[derive(Copy, Clone, Debug, Deserialize)] -pub struct ErrorInfo { - pub reason: Reason, +pub(crate) struct ErrorInfo { + pub(crate) reason: Reason, // Schema could also have `metadata` field, but it's not structured. Skip it for now. } #[derive(Clone, Copy, Debug, Deserialize, Default)] -pub enum Reason { +pub(crate) enum Reason { /// RoleProtected indicates that the role is protected and the attempted operation is not permitted on protected roles. 
#[serde(rename = "ROLE_PROTECTED")] RoleProtected, @@ -166,7 +169,7 @@ pub enum Reason { } impl Reason { - pub fn is_not_found(&self) -> bool { + pub(crate) fn is_not_found(self) -> bool { matches!( self, Reason::ResourceNotFound @@ -176,7 +179,7 @@ impl Reason { ) } - pub fn can_retry(&self) -> bool { + pub(crate) fn can_retry(self) -> bool { match self { // do not retry role protected errors // not a transitive error @@ -206,22 +209,23 @@ impl Reason { } #[derive(Copy, Clone, Debug, Deserialize)] -pub struct RetryInfo { - pub retry_delay_ms: u64, +#[allow(dead_code)] +pub(crate) struct RetryInfo { + pub(crate) retry_delay_ms: u64, } #[derive(Debug, Deserialize, Clone)] -pub struct UserFacingMessage { - pub message: Box, +pub(crate) struct UserFacingMessage { + pub(crate) message: Box, } /// Response which holds client's auth secret, e.g. [`crate::scram::ServerSecret`]. /// Returned by the `/proxy_get_role_secret` API method. #[derive(Deserialize)] -pub struct GetRoleSecret { - pub role_secret: Box, - pub allowed_ips: Option>, - pub project_id: Option, +pub(crate) struct GetRoleSecret { + pub(crate) role_secret: Box, + pub(crate) allowed_ips: Option>, + pub(crate) project_id: Option, } // Manually implement debug to omit sensitive info. @@ -234,21 +238,21 @@ impl fmt::Debug for GetRoleSecret { /// Response which holds compute node's `host:port` pair. /// Returned by the `/proxy_wake_compute` API method. #[derive(Debug, Deserialize)] -pub struct WakeCompute { - pub address: Box, - pub aux: MetricsAuxInfo, +pub(crate) struct WakeCompute { + pub(crate) address: Box, + pub(crate) aux: MetricsAuxInfo, } -/// Async response which concludes the link auth flow. +/// Async response which concludes the web auth flow. /// Also known as `kickResponse` in the console. #[derive(Debug, Deserialize)] -pub struct KickSession<'a> { +pub(crate) struct KickSession<'a> { /// Session ID is assigned by the proxy. 
- pub session_id: &'a str, + pub(crate) session_id: &'a str, /// Compute node connection params. #[serde(deserialize_with = "KickSession::parse_db_info")] - pub result: DatabaseInfo, + pub(crate) result: DatabaseInfo, } impl KickSession<'_> { @@ -271,15 +275,15 @@ impl KickSession<'_> { /// Compute node connection params. #[derive(Deserialize)] -pub struct DatabaseInfo { - pub host: Box, - pub port: u16, - pub dbname: Box, - pub user: Box, +pub(crate) struct DatabaseInfo { + pub(crate) host: Box, + pub(crate) port: u16, + pub(crate) dbname: Box, + pub(crate) user: Box, /// Console always provides a password, but it might /// be inconvenient for debug with local PG instance. - pub password: Option>, - pub aux: MetricsAuxInfo, + pub(crate) password: Option>, + pub(crate) aux: MetricsAuxInfo, } // Manually implement debug to omit sensitive info. @@ -297,12 +301,12 @@ impl fmt::Debug for DatabaseInfo { /// Various labels for prometheus metrics. /// Also known as `ProxyMetricsAuxInfo` in the console. 
#[derive(Debug, Deserialize, Clone)] -pub struct MetricsAuxInfo { - pub endpoint_id: EndpointIdInt, - pub project_id: ProjectIdInt, - pub branch_id: BranchIdInt, +pub(crate) struct MetricsAuxInfo { + pub(crate) endpoint_id: EndpointIdInt, + pub(crate) project_id: ProjectIdInt, + pub(crate) branch_id: BranchIdInt, #[serde(default)] - pub cold_start_info: ColdStartInfo, + pub(crate) cold_start_info: ColdStartInfo, } #[derive(Debug, Default, Serialize, Deserialize, Clone, Copy, FixedCardinalityLabel)] @@ -329,7 +333,7 @@ pub enum ColdStartInfo { } impl ColdStartInfo { - pub fn as_str(&self) -> &'static str { + pub(crate) fn as_str(self) -> &'static str { match self { ColdStartInfo::Unknown => "unknown", ColdStartInfo::Warm => "warm", @@ -341,6 +345,26 @@ impl ColdStartInfo { } } +#[derive(Debug, Deserialize, Clone)] +pub struct JwksRoleMapping { + pub roles: HashMap, +} + +#[derive(Debug, Deserialize, Clone)] +pub struct EndpointJwksResponse { + pub jwks: Vec, +} + +#[derive(Debug, Deserialize, Clone)] +pub struct JwksSettings { + pub id: String, + pub project_id: ProjectIdInt, + pub branch_id: BranchIdInt, + pub jwks_url: url::Url, + pub provider_name: String, + pub jwt_audience: Option, +} + #[cfg(test)] mod tests { use super::*; @@ -371,7 +395,7 @@ mod tests { } } }); - let _: KickSession<'_> = serde_json::from_str(&json.to_string())?; + serde_json::from_str::>(&json.to_string())?; Ok(()) } @@ -379,7 +403,7 @@ mod tests { #[test] fn parse_db_info() -> anyhow::Result<()> { // with password - let _: DatabaseInfo = serde_json::from_value(json!({ + serde_json::from_value::(json!({ "host": "localhost", "port": 5432, "dbname": "postgres", @@ -389,7 +413,7 @@ mod tests { }))?; // without password - let _: DatabaseInfo = serde_json::from_value(json!({ + serde_json::from_value::(json!({ "host": "localhost", "port": 5432, "dbname": "postgres", @@ -398,7 +422,7 @@ mod tests { }))?; // new field (forward compatibility) - let _: DatabaseInfo = serde_json::from_value(json!({ + 
serde_json::from_value::(json!({ "host": "localhost", "port": 5432, "dbname": "postgres", @@ -417,7 +441,7 @@ mod tests { "address": "0.0.0.0", "aux": dummy_aux(), }); - let _: WakeCompute = serde_json::from_str(&json.to_string())?; + serde_json::from_str::(&json.to_string())?; Ok(()) } @@ -427,18 +451,18 @@ mod tests { let json = json!({ "role_secret": "secret", }); - let _: GetRoleSecret = serde_json::from_str(&json.to_string())?; + serde_json::from_str::(&json.to_string())?; let json = json!({ "role_secret": "secret", "allowed_ips": ["8.8.8.8"], }); - let _: GetRoleSecret = serde_json::from_str(&json.to_string())?; + serde_json::from_str::(&json.to_string())?; let json = json!({ "role_secret": "secret", "allowed_ips": ["8.8.8.8"], "project_id": "project", }); - let _: GetRoleSecret = serde_json::from_str(&json.to_string())?; + serde_json::from_str::(&json.to_string())?; Ok(()) } diff --git a/proxy/src/console/mgmt.rs b/proxy/src/console/mgmt.rs index 82d5033aab..ee5f83ee76 100644 --- a/proxy/src/console/mgmt.rs +++ b/proxy/src/console/mgmt.rs @@ -14,18 +14,18 @@ use tracing::{error, info, info_span, Instrument}; static CPLANE_WAITERS: Lazy> = Lazy::new(Default::default); /// Give caller an opportunity to wait for the cloud's reply. -pub fn get_waiter( +pub(crate) fn get_waiter( psql_session_id: impl Into, ) -> Result, waiters::RegisterError> { CPLANE_WAITERS.register(psql_session_id.into()) } -pub fn notify(psql_session_id: &str, msg: ComputeReady) -> Result<(), waiters::NotifyError> { +pub(crate) fn notify(psql_session_id: &str, msg: ComputeReady) -> Result<(), waiters::NotifyError> { CPLANE_WAITERS.notify(psql_session_id, msg) } /// Console management API listener task. -/// It spawns console response handlers needed for the link auth. +/// It spawns console response handlers needed for the web auth. pub async fn task_main(listener: TcpListener) -> anyhow::Result { scopeguard::defer! 
{ info!("mgmt has shut down"); @@ -74,11 +74,11 @@ async fn handle_connection(socket: TcpStream) -> Result<(), QueryError> { } /// A message received by `mgmt` when a compute node is ready. -pub type ComputeReady = DatabaseInfo; +pub(crate) type ComputeReady = DatabaseInfo; // TODO: replace with an http-based protocol. struct MgmtHandler; -#[async_trait::async_trait] + impl postgres_backend::Handler for MgmtHandler { async fn process_query( &mut self, diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index cc2ee10062..12a6e2f12a 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -23,7 +23,7 @@ use std::{hash::Hash, sync::Arc, time::Duration}; use tokio::time::Instant; use tracing::info; -pub mod errors { +pub(crate) mod errors { use crate::{ console::messages::{self, ConsoleError, Reason}, error::{io_error, ErrorKind, ReportableError, UserFacingError}, @@ -34,11 +34,11 @@ pub mod errors { use super::ApiLockError; /// A go-to error message which doesn't leak any detail. - pub const REQUEST_FAILED: &str = "Console request failed"; + pub(crate) const REQUEST_FAILED: &str = "Console request failed"; /// Common console API error. #[derive(Debug, Error)] - pub enum ApiError { + pub(crate) enum ApiError { /// Error returned by the console itself. #[error("{REQUEST_FAILED} with {0}")] Console(ConsoleError), @@ -50,7 +50,7 @@ pub mod errors { impl ApiError { /// Returns HTTP status code if it's the reason for failure. - pub fn get_reason(&self) -> messages::Reason { + pub(crate) fn get_reason(&self) -> messages::Reason { match self { ApiError::Console(e) => e.get_reason(), ApiError::Transport(_) => messages::Reason::Unknown, @@ -146,7 +146,7 @@ pub mod errors { } #[derive(Debug, Error)] - pub enum GetAuthInfoError { + pub(crate) enum GetAuthInfoError { // We shouldn't include the actual secret here. 
#[error("Console responded with a malformed auth secret")] BadSecret, @@ -183,7 +183,7 @@ pub mod errors { } #[derive(Debug, Error)] - pub enum WakeComputeError { + pub(crate) enum WakeComputeError { #[error("Console responded with a malformed compute address: {0}")] BadComputeAddress(Box), @@ -247,7 +247,7 @@ pub mod errors { /// Auth secret which is managed by the cloud. #[derive(Clone, Eq, PartialEq, Debug)] -pub enum AuthSecret { +pub(crate) enum AuthSecret { #[cfg(any(test, feature = "testing"))] /// Md5 hash of user's password. Md5([u8; 16]), @@ -257,32 +257,32 @@ pub enum AuthSecret { } #[derive(Default)] -pub struct AuthInfo { - pub secret: Option, +pub(crate) struct AuthInfo { + pub(crate) secret: Option, /// List of IP addresses allowed for the autorization. - pub allowed_ips: Vec, + pub(crate) allowed_ips: Vec, /// Project ID. This is used for cache invalidation. - pub project_id: Option, + pub(crate) project_id: Option, } /// Info for establishing a connection to a compute node. /// This is what we get after auth succeeded, but not before! #[derive(Clone)] -pub struct NodeInfo { +pub(crate) struct NodeInfo { /// Compute node connection params. /// It's sad that we have to clone this, but this will improve /// once we migrate to a bespoke connection logic. - pub config: compute::ConnCfg, + pub(crate) config: compute::ConnCfg, /// Labels for proxy's metrics. 
- pub aux: MetricsAuxInfo, + pub(crate) aux: MetricsAuxInfo, /// Whether we should accept self-signed certificates (for testing) - pub allow_self_signed_compute: bool, + pub(crate) allow_self_signed_compute: bool, } impl NodeInfo { - pub async fn connect( + pub(crate) async fn connect( &self, ctx: &RequestMonitoring, timeout: Duration, @@ -296,23 +296,24 @@ impl NodeInfo { ) .await } - pub fn reuse_settings(&mut self, other: Self) { + pub(crate) fn reuse_settings(&mut self, other: Self) { self.allow_self_signed_compute = other.allow_self_signed_compute; self.config.reuse_password(other.config); } - pub fn set_keys(&mut self, keys: &ComputeCredentialKeys) { + pub(crate) fn set_keys(&mut self, keys: &ComputeCredentialKeys) { match keys { ComputeCredentialKeys::Password(password) => self.config.password(password), ComputeCredentialKeys::AuthKeys(auth_keys) => self.config.auth_keys(*auth_keys), + ComputeCredentialKeys::None => &mut self.config, }; } } -pub type NodeInfoCache = TimedLru>>; -pub type CachedNodeInfo = Cached<&'static NodeInfoCache, NodeInfo>; -pub type CachedRoleSecret = Cached<&'static ProjectInfoCacheImpl, Option>; -pub type CachedAllowedIps = Cached<&'static ProjectInfoCacheImpl, Arc>>; +pub(crate) type NodeInfoCache = TimedLru>>; +pub(crate) type CachedNodeInfo = Cached<&'static NodeInfoCache, NodeInfo>; +pub(crate) type CachedRoleSecret = Cached<&'static ProjectInfoCacheImpl, Option>; +pub(crate) type CachedAllowedIps = Cached<&'static ProjectInfoCacheImpl, Arc>>; /// This will allocate per each call, but the http requests alone /// already require a few allocations, so it should be fine. @@ -349,6 +350,7 @@ pub enum ConsoleBackend { Postgres(mock::Api), /// Internal testing #[cfg(test)] + #[allow(private_interfaces)] Test(Box), } @@ -401,7 +403,7 @@ impl Api for ConsoleBackend { /// Various caches for [`console`](super). pub struct ApiCaches { /// Cache for the `wake_compute` API method. 
- pub node_info: NodeInfoCache, + pub(crate) node_info: NodeInfoCache, /// Cache which stores project_id -> endpoint_ids mapping. pub project_info: Arc, /// List of all valid endpoints. @@ -438,7 +440,7 @@ pub struct ApiLocks { } #[derive(Debug, thiserror::Error)] -pub enum ApiLockError { +pub(crate) enum ApiLockError { #[error("timeout acquiring resource permit")] TimeoutError(#[from] tokio::time::error::Elapsed), } @@ -470,7 +472,7 @@ impl ApiLocks { }) } - pub async fn get_permit(&self, key: &K) -> Result { + pub(crate) async fn get_permit(&self, key: &K) -> Result { if self.config.initial_limit == 0 { return Ok(WakeComputePermit { permit: Token::disabled(), @@ -530,18 +532,18 @@ impl ApiLocks { } } -pub struct WakeComputePermit { +pub(crate) struct WakeComputePermit { permit: Token, } impl WakeComputePermit { - pub fn should_check_cache(&self) -> bool { + pub(crate) fn should_check_cache(&self) -> bool { !self.permit.is_disabled() } - pub fn release(self, outcome: Outcome) { + pub(crate) fn release(self, outcome: Outcome) { self.permit.release(outcome); } - pub fn release_result(self, res: Result) -> Result { + pub(crate) fn release_result(self, res: Result) -> Result { match res { Ok(_) => self.release(Outcome::Success), Err(_) => self.release(Outcome::Overload), diff --git a/proxy/src/console/provider/mock.rs b/proxy/src/console/provider/mock.rs index 2093da7562..1b77418de6 100644 --- a/proxy/src/console/provider/mock.rs +++ b/proxy/src/console/provider/mock.rs @@ -41,14 +41,18 @@ impl From for ApiError { #[derive(Clone)] pub struct Api { endpoint: ApiUrl, + ip_allowlist_check_enabled: bool, } impl Api { - pub fn new(endpoint: ApiUrl) -> Self { - Self { endpoint } + pub fn new(endpoint: ApiUrl, ip_allowlist_check_enabled: bool) -> Self { + Self { + endpoint, + ip_allowlist_check_enabled, + } } - pub fn url(&self) -> &str { + pub(crate) fn url(&self) -> &str { self.endpoint.as_str() } @@ -64,7 +68,8 @@ impl Api { tokio_postgres::connect(self.endpoint.as_str(), 
tokio_postgres::NoTls).await?; tokio::spawn(connection); - let secret = match get_execute_postgres_query( + + let secret = if let Some(entry) = get_execute_postgres_query( &client, "select rolpassword from pg_catalog.pg_authid where rolname = $1", &[&&*user_info.user], @@ -72,31 +77,33 @@ impl Api { ) .await? { - Some(entry) => { - info!("got a secret: {entry}"); // safe since it's not a prod scenario - let secret = scram::ServerSecret::parse(&entry).map(AuthSecret::Scram); - secret.or_else(|| parse_md5(&entry).map(AuthSecret::Md5)) - } - None => { - warn!("user '{}' does not exist", user_info.user); - None - } + info!("got a secret: {entry}"); // safe since it's not a prod scenario + let secret = scram::ServerSecret::parse(&entry).map(AuthSecret::Scram); + secret.or_else(|| parse_md5(&entry).map(AuthSecret::Md5)) + } else { + warn!("user '{}' does not exist", user_info.user); + None }; - let allowed_ips = match get_execute_postgres_query( - &client, - "select allowed_ips from neon_control_plane.endpoints where endpoint_id = $1", - &[&user_info.endpoint.as_str()], - "allowed_ips", - ) - .await? - { - Some(s) => { - info!("got allowed_ips: {s}"); - s.split(',') - .map(|s| IpPattern::from_str(s).unwrap()) - .collect() + + let allowed_ips = if self.ip_allowlist_check_enabled { + match get_execute_postgres_query( + &client, + "select allowed_ips from neon_control_plane.endpoints where endpoint_id = $1", + &[&user_info.endpoint.as_str()], + "allowed_ips", + ) + .await? + { + Some(s) => { + info!("got allowed_ips: {s}"); + s.split(',') + .map(|s| IpPattern::from_str(s).unwrap()) + .collect() + } + None => vec![], } - None => vec![], + } else { + vec![] }; Ok((secret, allowed_ips)) @@ -142,12 +149,11 @@ async fn get_execute_postgres_query( let rows = client.query(query, params).await?; // We can get at most one row, because `rolname` is unique. 
- let row = match rows.first() { - Some(row) => row, + let Some(row) = rows.first() else { // This means that the user doesn't exist, so there can be no secret. // However, this is still a *valid* outcome which is very similar // to getting `404 Not found` from the Neon console. - None => return Ok(None), + return Ok(None); }; let entry = row.try_get(idx).map_err(MockApiError::PasswordNotSet)?; diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 7eda238b66..b004bf4ecf 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -25,8 +25,8 @@ use tracing::{debug, error, info, info_span, warn, Instrument}; pub struct Api { endpoint: http::Endpoint, pub caches: &'static ApiCaches, - pub locks: &'static ApiLocks, - pub wake_compute_endpoint_rate_limiter: Arc, + pub(crate) locks: &'static ApiLocks, + pub(crate) wake_compute_endpoint_rate_limiter: Arc, jwt: String, } @@ -38,10 +38,7 @@ impl Api { locks: &'static ApiLocks, wake_compute_endpoint_rate_limiter: Arc, ) -> Self { - let jwt: String = match std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN") { - Ok(v) => v, - Err(_) => "".to_string(), - }; + let jwt = std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN").unwrap_or_default(); Self { endpoint, caches, @@ -51,7 +48,7 @@ impl Api { } } - pub fn url(&self) -> &str { + pub(crate) fn url(&self) -> &str { self.endpoint.url().as_str() } @@ -96,10 +93,10 @@ impl Api { // Error 404 is special: it's ok not to have a secret. 
// TODO(anna): retry Err(e) => { - if e.get_reason().is_not_found() { - return Ok(AuthInfo::default()); + return if e.get_reason().is_not_found() { + Ok(AuthInfo::default()) } else { - return Err(e.into()); + Err(e.into()) } } }; diff --git a/proxy/src/context.rs b/proxy/src/context.rs index cafbdedc15..c013218ad9 100644 --- a/proxy/src/context.rs +++ b/proxy/src/context.rs @@ -6,7 +6,7 @@ use pq_proto::StartupMessageParams; use smol_str::SmolStr; use std::net::IpAddr; use tokio::sync::mpsc; -use tracing::{field::display, info, info_span, Span}; +use tracing::{debug, field::display, info, info_span, Span}; use try_lock::TryLock; use uuid::Uuid; @@ -22,8 +22,9 @@ use self::parquet::RequestData; pub mod parquet; -pub static LOG_CHAN: OnceCell> = OnceCell::new(); -pub static LOG_CHAN_DISCONNECT: OnceCell> = OnceCell::new(); +pub(crate) static LOG_CHAN: OnceCell> = OnceCell::new(); +pub(crate) static LOG_CHAN_DISCONNECT: OnceCell> = + OnceCell::new(); /// Context data for a single request to connect to a database. /// @@ -38,12 +39,12 @@ pub struct RequestMonitoring( ); struct RequestMonitoringInner { - pub peer_addr: IpAddr, - pub session_id: Uuid, - pub protocol: Protocol, + pub(crate) peer_addr: IpAddr, + pub(crate) session_id: Uuid, + pub(crate) protocol: Protocol, first_packet: chrono::DateTime, region: &'static str, - pub span: Span, + pub(crate) span: Span, // filled in as they are discovered project: Option, @@ -63,15 +64,15 @@ struct RequestMonitoringInner { sender: Option>, // This sender is only used to log the length of session in case of success. disconnect_sender: Option>, - pub latency_timer: LatencyTimer, + pub(crate) latency_timer: LatencyTimer, // Whether proxy decided that it's not a valid endpoint end rejected it before going to cplane. 
rejected: Option, disconnect_timestamp: Option>, } #[derive(Clone, Debug)] -pub enum AuthMethod { - // aka link aka passwordless +pub(crate) enum AuthMethod { + // aka passwordless, fka link Web, ScramSha256, ScramSha256Plus, @@ -125,11 +126,11 @@ impl RequestMonitoring { } #[cfg(test)] - pub fn test() -> Self { + pub(crate) fn test() -> Self { RequestMonitoring::new(Uuid::now_v7(), [127, 0, 0, 1].into(), Protocol::Tcp, "test") } - pub fn console_application_name(&self) -> String { + pub(crate) fn console_application_name(&self) -> String { let this = self.0.try_lock().expect("should not deadlock"); format!( "{}/{}", @@ -138,19 +139,19 @@ impl RequestMonitoring { ) } - pub fn set_rejected(&self, rejected: bool) { + pub(crate) fn set_rejected(&self, rejected: bool) { let mut this = self.0.try_lock().expect("should not deadlock"); this.rejected = Some(rejected); } - pub fn set_cold_start_info(&self, info: ColdStartInfo) { + pub(crate) fn set_cold_start_info(&self, info: ColdStartInfo) { self.0 .try_lock() .expect("should not deadlock") .set_cold_start_info(info); } - pub fn set_db_options(&self, options: StartupMessageParams) { + pub(crate) fn set_db_options(&self, options: StartupMessageParams) { let mut this = self.0.try_lock().expect("should not deadlock"); this.set_application(options.get("application_name").map(SmolStr::from)); if let Some(user) = options.get("user") { @@ -163,7 +164,7 @@ impl RequestMonitoring { this.pg_options = Some(options); } - pub fn set_project(&self, x: MetricsAuxInfo) { + pub(crate) fn set_project(&self, x: MetricsAuxInfo) { let mut this = self.0.try_lock().expect("should not deadlock"); if this.endpoint_id.is_none() { this.set_endpoint_id(x.endpoint_id.as_str().into()); @@ -173,33 +174,33 @@ impl RequestMonitoring { this.set_cold_start_info(x.cold_start_info); } - pub fn set_project_id(&self, project_id: ProjectIdInt) { + pub(crate) fn set_project_id(&self, project_id: ProjectIdInt) { let mut this = self.0.try_lock().expect("should not 
deadlock"); this.project = Some(project_id); } - pub fn set_endpoint_id(&self, endpoint_id: EndpointId) { + pub(crate) fn set_endpoint_id(&self, endpoint_id: EndpointId) { self.0 .try_lock() .expect("should not deadlock") .set_endpoint_id(endpoint_id); } - pub fn set_dbname(&self, dbname: DbName) { + pub(crate) fn set_dbname(&self, dbname: DbName) { self.0 .try_lock() .expect("should not deadlock") .set_dbname(dbname); } - pub fn set_user(&self, user: RoleName) { + pub(crate) fn set_user(&self, user: RoleName) { self.0 .try_lock() .expect("should not deadlock") .set_user(user); } - pub fn set_auth_method(&self, auth_method: AuthMethod) { + pub(crate) fn set_auth_method(&self, auth_method: AuthMethod) { let mut this = self.0.try_lock().expect("should not deadlock"); this.auth_method = Some(auth_method); } @@ -211,7 +212,7 @@ impl RequestMonitoring { .has_private_peer_addr() } - pub fn set_error_kind(&self, kind: ErrorKind) { + pub(crate) fn set_error_kind(&self, kind: ErrorKind) { let mut this = self.0.try_lock().expect("should not deadlock"); // Do not record errors from the private address to metrics. 
if !this.has_private_peer_addr() { @@ -237,30 +238,30 @@ impl RequestMonitoring { .log_connect(); } - pub fn protocol(&self) -> Protocol { + pub(crate) fn protocol(&self) -> Protocol { self.0.try_lock().expect("should not deadlock").protocol } - pub fn span(&self) -> Span { + pub(crate) fn span(&self) -> Span { self.0.try_lock().expect("should not deadlock").span.clone() } - pub fn session_id(&self) -> Uuid { + pub(crate) fn session_id(&self) -> Uuid { self.0.try_lock().expect("should not deadlock").session_id } - pub fn peer_addr(&self) -> IpAddr { + pub(crate) fn peer_addr(&self) -> IpAddr { self.0.try_lock().expect("should not deadlock").peer_addr } - pub fn cold_start_info(&self) -> ColdStartInfo { + pub(crate) fn cold_start_info(&self) -> ColdStartInfo { self.0 .try_lock() .expect("should not deadlock") .cold_start_info } - pub fn latency_timer_pause(&self, waiting_for: Waiting) -> LatencyTimerPause<'_> { + pub(crate) fn latency_timer_pause(&self, waiting_for: Waiting) -> LatencyTimerPause<'_> { LatencyTimerPause { ctx: self, start: tokio::time::Instant::now(), @@ -268,7 +269,7 @@ impl RequestMonitoring { } } - pub fn success(&self) { + pub(crate) fn success(&self) { self.0 .try_lock() .expect("should not deadlock") @@ -277,7 +278,7 @@ impl RequestMonitoring { } } -pub struct LatencyTimerPause<'a> { +pub(crate) struct LatencyTimerPause<'a> { ctx: &'a RequestMonitoring, start: tokio::time::Instant, waiting_for: Waiting, @@ -361,7 +362,9 @@ impl RequestMonitoringInner { }); } if let Some(tx) = self.sender.take() { - let _: Result<(), _> = tx.send(RequestData::from(&*self)); + tx.send(RequestData::from(&*self)) + .inspect_err(|e| debug!("tx send failed: {e}")) + .ok(); } } @@ -370,7 +373,9 @@ impl RequestMonitoringInner { // Here we log the length of the session. 
self.disconnect_timestamp = Some(Utc::now()); if let Some(tx) = self.disconnect_sender.take() { - let _: Result<(), _> = tx.send(RequestData::from(&*self)); + tx.send(RequestData::from(&*self)) + .inspect_err(|e| debug!("tx send failed: {e}")) + .ok(); } } } diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index e5962b35fa..9f6f83022e 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -62,8 +62,8 @@ pub struct ParquetUploadArgs { // But after FAILED_UPLOAD_WARN_THRESHOLD retries, we start to log it at WARN // level instead, as repeated failures can mean a more serious problem. If it // fails more than FAILED_UPLOAD_RETRIES times, we give up -pub const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3; -pub const FAILED_UPLOAD_MAX_RETRIES: u32 = 10; +pub(crate) const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3; +pub(crate) const FAILED_UPLOAD_MAX_RETRIES: u32 = 10; // the parquet crate leaves a lot to be desired... // what follows is an attempt to write parquet files with minimal allocs. @@ -73,7 +73,7 @@ pub const FAILED_UPLOAD_MAX_RETRIES: u32 = 10; // * after each rowgroup write, we check the length of the file and upload to s3 if large enough #[derive(parquet_derive::ParquetRecordWriter)] -pub struct RequestData { +pub(crate) struct RequestData { region: &'static str, protocol: &'static str, /// Must be UTC. 
The derive macro doesn't like the timezones @@ -290,7 +290,7 @@ async fn worker_inner( } if !w.flushed_row_groups().is_empty() { - let _: Writer = upload_parquet(w, len, &storage).await?; + let _rtchk: Writer = upload_parquet(w, len, &storage).await?; } Ok(()) @@ -598,49 +598,15 @@ mod tests { assert_eq!( file_stats, [ - (1315874, 3, 6000), - (1315867, 3, 6000), - (1315927, 3, 6000), - (1315884, 3, 6000), - (1316014, 3, 6000), - (1315856, 3, 6000), - (1315648, 3, 6000), - (1315884, 3, 6000), - (438913, 1, 2000) - ] - ); - - tmpdir.close().unwrap(); - } - - #[tokio::test] - async fn verify_parquet_min_compression() { - let tmpdir = camino_tempfile::tempdir().unwrap(); - - let config = ParquetConfig { - propeties: Arc::new( - WriterProperties::builder() - .set_compression(parquet::basic::Compression::ZSTD(ZstdLevel::default())) - .build(), - ), - rows_per_group: 2_000, - file_size: 1_000_000, - max_duration: time::Duration::from_secs(20 * 60), - test_remote_failures: 0, - }; - - let rx = random_stream(50_000); - let file_stats = run_test(tmpdir.path(), config, rx).await; - - // with compression, there are fewer files with more rows per file - assert_eq!( - file_stats, - [ - (1223214, 5, 10000), - (1229364, 5, 10000), - (1231158, 5, 10000), - (1230520, 5, 10000), - (1221798, 5, 10000) + (1312632, 3, 6000), + (1312621, 3, 6000), + (1312680, 3, 6000), + (1312637, 3, 6000), + (1312773, 3, 6000), + (1312610, 3, 6000), + (1312404, 3, 6000), + (1312639, 3, 6000), + (437848, 1, 2000) ] ); @@ -672,11 +638,11 @@ mod tests { assert_eq!( file_stats, [ - (1208861, 5, 10000), - (1208592, 5, 10000), - (1208885, 5, 10000), - (1208873, 5, 10000), - (1209128, 5, 10000) + (1203465, 5, 10000), + (1203189, 5, 10000), + (1203490, 5, 10000), + (1203475, 5, 10000), + (1203729, 5, 10000) ] ); @@ -701,15 +667,15 @@ mod tests { assert_eq!( file_stats, [ - (1315874, 3, 6000), - (1315867, 3, 6000), - (1315927, 3, 6000), - (1315884, 3, 6000), - (1316014, 3, 6000), - (1315856, 3, 6000), - 
(1315648, 3, 6000), - (1315884, 3, 6000), - (438913, 1, 2000) + (1312632, 3, 6000), + (1312621, 3, 6000), + (1312680, 3, 6000), + (1312637, 3, 6000), + (1312773, 3, 6000), + (1312610, 3, 6000), + (1312404, 3, 6000), + (1312639, 3, 6000), + (437848, 1, 2000) ] ); @@ -746,7 +712,7 @@ mod tests { // files are smaller than the size threshold, but they took too long to fill so were flushed early assert_eq!( file_stats, - [(659836, 2, 3001), (659550, 2, 3000), (659346, 2, 2999)] + [(657696, 2, 3001), (657410, 2, 3000), (657206, 2, 2999)] ); tmpdir.close().unwrap(); diff --git a/proxy/src/error.rs b/proxy/src/error.rs index fdfe50a494..53f9f75c5b 100644 --- a/proxy/src/error.rs +++ b/proxy/src/error.rs @@ -3,12 +3,12 @@ use std::{error::Error as StdError, fmt, io}; use measured::FixedCardinalityLabel; /// Upcast (almost) any error into an opaque [`io::Error`]. -pub fn io_error(e: impl Into>) -> io::Error { +pub(crate) fn io_error(e: impl Into>) -> io::Error { io::Error::new(io::ErrorKind::Other, e) } /// A small combinator for pluggable error logging. -pub fn log_error(e: E) -> E { +pub(crate) fn log_error(e: E) -> E { tracing::error!("{e}"); e } @@ -19,7 +19,7 @@ pub fn log_error(e: E) -> E { /// NOTE: This trait should not be implemented for [`anyhow::Error`], since it /// is way too convenient and tends to proliferate all across the codebase, /// ultimately leading to accidental leaks of sensitive data. -pub trait UserFacingError: ReportableError { +pub(crate) trait UserFacingError: ReportableError { /// Format the error for client, stripping all sensitive info. 
/// /// Although this might be a no-op for many types, it's highly @@ -64,7 +64,7 @@ pub enum ErrorKind { } impl ErrorKind { - pub fn to_metric_label(&self) -> &'static str { + pub(crate) fn to_metric_label(self) -> &'static str { match self { ErrorKind::User => "user", ErrorKind::ClientDisconnect => "clientdisconnect", @@ -78,7 +78,7 @@ impl ErrorKind { } } -pub trait ReportableError: fmt::Display + Send + 'static { +pub(crate) trait ReportableError: fmt::Display + Send + 'static { fn get_error_kind(&self) -> ErrorKind; } diff --git a/proxy/src/http.rs b/proxy/src/http.rs index 1f1dd8c415..c77d95f47d 100644 --- a/proxy/src/http.rs +++ b/proxy/src/http.rs @@ -12,9 +12,9 @@ use http_body_util::BodyExt; use hyper1::body::Body; use serde::de::DeserializeOwned; -pub use reqwest::{Request, Response, StatusCode}; -pub use reqwest_middleware::{ClientWithMiddleware, Error}; -pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware}; +pub(crate) use reqwest::{Request, Response}; +pub(crate) use reqwest_middleware::{ClientWithMiddleware, Error}; +pub(crate) use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware}; use crate::{ metrics::{ConsoleRequest, Metrics}, @@ -35,14 +35,17 @@ pub fn new_client() -> ClientWithMiddleware { .build() } -pub fn new_client_with_timeout(default_timout: Duration) -> ClientWithMiddleware { +pub(crate) fn new_client_with_timeout( + request_timeout: Duration, + total_retry_duration: Duration, +) -> ClientWithMiddleware { let timeout_client = reqwest::ClientBuilder::new() - .timeout(default_timout) + .timeout(request_timeout) .build() .expect("Failed to create http client with timeout"); let retry_policy = - ExponentialBackoff::builder().build_with_total_retry_duration(default_timout); + ExponentialBackoff::builder().build_with_total_retry_duration(total_retry_duration); reqwest_middleware::ClientBuilder::new(timeout_client) .with(reqwest_tracing::TracingMiddleware::default()) @@ -77,20 +80,20 @@ impl 
Endpoint { } #[inline(always)] - pub fn url(&self) -> &ApiUrl { + pub(crate) fn url(&self) -> &ApiUrl { &self.endpoint } /// Return a [builder](RequestBuilder) for a `GET` request, /// appending a single `path` segment to the base endpoint URL. - pub fn get(&self, path: &str) -> RequestBuilder { + pub(crate) fn get(&self, path: &str) -> RequestBuilder { let mut url = self.endpoint.clone(); url.path_segments_mut().push(path); self.client.get(url.into_inner()) } /// Execute a [request](reqwest::Request). - pub async fn execute(&self, request: Request) -> Result { + pub(crate) async fn execute(&self, request: Request) -> Result { let _timer = Metrics::get() .proxy .console_request_latency @@ -102,7 +105,7 @@ impl Endpoint { } } -pub async fn parse_json_body_with_limit( +pub(crate) async fn parse_json_body_with_limit( mut b: impl Body + Unpin, limit: usize, ) -> anyhow::Result { diff --git a/proxy/src/intern.rs b/proxy/src/intern.rs index d418caa511..e5144cfe2e 100644 --- a/proxy/src/intern.rs +++ b/proxy/src/intern.rs @@ -29,10 +29,10 @@ impl std::fmt::Display for InternedString { } impl InternedString { - pub fn as_str(&self) -> &'static str { + pub(crate) fn as_str(&self) -> &'static str { Id::get_interner().inner.resolve(&self.inner) } - pub fn get(s: &str) -> Option { + pub(crate) fn get(s: &str) -> Option { Id::get_interner().get(s) } } @@ -78,7 +78,7 @@ impl serde::Serialize for InternedString { } impl StringInterner { - pub fn new() -> Self { + pub(crate) fn new() -> Self { StringInterner { inner: ThreadedRodeo::with_capacity_memory_limits_and_hasher( Capacity::new(2500, NonZeroUsize::new(1 << 16).unwrap()), @@ -90,26 +90,24 @@ impl StringInterner { } } - pub fn is_empty(&self) -> bool { - self.inner.is_empty() - } - - pub fn len(&self) -> usize { + #[cfg(test)] + fn len(&self) -> usize { self.inner.len() } - pub fn current_memory_usage(&self) -> usize { + #[cfg(test)] + fn current_memory_usage(&self) -> usize { self.inner.current_memory_usage() } - pub fn 
get_or_intern(&self, s: &str) -> InternedString { + pub(crate) fn get_or_intern(&self, s: &str) -> InternedString { InternedString { inner: self.inner.get_or_intern(s), _id: PhantomData, } } - pub fn get(&self, s: &str) -> Option> { + pub(crate) fn get(&self, s: &str) -> Option> { Some(InternedString { inner: self.inner.get(s)?, _id: PhantomData, @@ -132,14 +130,14 @@ impl Default for StringInterner { } #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] -pub struct RoleNameTag; +pub(crate) struct RoleNameTag; impl InternId for RoleNameTag { fn get_interner() -> &'static StringInterner { - pub static ROLE_NAMES: OnceLock> = OnceLock::new(); + static ROLE_NAMES: OnceLock> = OnceLock::new(); ROLE_NAMES.get_or_init(Default::default) } } -pub type RoleNameInt = InternedString; +pub(crate) type RoleNameInt = InternedString; impl From<&RoleName> for RoleNameInt { fn from(value: &RoleName) -> Self { RoleNameTag::get_interner().get_or_intern(value) @@ -150,7 +148,7 @@ impl From<&RoleName> for RoleNameInt { pub struct EndpointIdTag; impl InternId for EndpointIdTag { fn get_interner() -> &'static StringInterner { - pub static ROLE_NAMES: OnceLock> = OnceLock::new(); + static ROLE_NAMES: OnceLock> = OnceLock::new(); ROLE_NAMES.get_or_init(Default::default) } } @@ -170,7 +168,7 @@ impl From for EndpointIdInt { pub struct BranchIdTag; impl InternId for BranchIdTag { fn get_interner() -> &'static StringInterner { - pub static ROLE_NAMES: OnceLock> = OnceLock::new(); + static ROLE_NAMES: OnceLock> = OnceLock::new(); ROLE_NAMES.get_or_init(Default::default) } } @@ -190,7 +188,7 @@ impl From for BranchIdInt { pub struct ProjectIdTag; impl InternId for ProjectIdTag { fn get_interner() -> &'static StringInterner { - pub static ROLE_NAMES: OnceLock> = OnceLock::new(); + static ROLE_NAMES: OnceLock> = OnceLock::new(); ROLE_NAMES.get_or_init(Default::default) } } @@ -217,7 +215,7 @@ mod tests { struct MyId; impl InternId for MyId { fn get_interner() -> &'static StringInterner { - pub 
static ROLE_NAMES: OnceLock> = OnceLock::new(); + pub(crate) static ROLE_NAMES: OnceLock> = OnceLock::new(); ROLE_NAMES.get_or_init(Default::default) } } diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index ea92eaaa55..0070839aa8 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -1,6 +1,88 @@ -#![deny(clippy::undocumented_unsafe_blocks)] +// rustc lints/lint groups +// https://doc.rust-lang.org/rustc/lints/groups.html +#![deny( + deprecated, + future_incompatible, + let_underscore, + nonstandard_style, + rust_2024_compatibility +)] +#![warn(clippy::all, clippy::pedantic, clippy::cargo)] +// List of denied lints from the clippy::restriction group. +// https://rust-lang.github.io/rust-clippy/master/index.html#?groups=restriction +#![warn( + clippy::undocumented_unsafe_blocks, + // TODO: Enable once all individual checks are enabled. + //clippy::as_conversions, + clippy::dbg_macro, + clippy::empty_enum_variants_with_brackets, + clippy::exit, + clippy::float_cmp_const, + clippy::lossy_float_literal, + clippy::macro_use_imports, + clippy::manual_ok_or, + // TODO: consider clippy::map_err_ignore + // TODO: consider clippy::mem_forget + clippy::rc_mutex, + clippy::rest_pat_in_fully_bound_structs, + clippy::string_add, + clippy::string_to_string, + clippy::todo, + // TODO: consider clippy::unimplemented + // TODO: consider clippy::unwrap_used +)] +// List of permanently allowed lints. +#![allow( + // It's ok to cast bool to u8, etc. + clippy::cast_lossless, + // Seems unavoidable. + clippy::multiple_crate_versions, + // While #[must_use] is a great feature this check is too noisy. + clippy::must_use_candidate, + // Inline consts, structs, fns, imports, etc. are ok if they're used by + // the following statement(s). + clippy::items_after_statements, +)] +// List of temporarily allowed lints. +// TODO: fix code and reduce list or move to permanent list above. 
+#![expect( + clippy::cargo_common_metadata, + clippy::cast_possible_truncation, + clippy::cast_possible_wrap, + clippy::cast_precision_loss, + clippy::cast_sign_loss, + clippy::doc_markdown, + clippy::inline_always, + clippy::match_same_arms, + clippy::match_wild_err_arm, + clippy::missing_errors_doc, + clippy::missing_panics_doc, + clippy::module_name_repetitions, + clippy::needless_pass_by_value, + clippy::redundant_closure_for_method_calls, + clippy::similar_names, + clippy::single_match_else, + clippy::struct_excessive_bools, + clippy::struct_field_names, + clippy::too_many_lines, + clippy::unused_self +)] +#![cfg_attr( + any(test, feature = "testing"), + allow( + clippy::needless_raw_string_hashes, + clippy::unreadable_literal, + clippy::unused_async, + ) +)] +// List of temporarily allowed lints to unblock beta/nightly. +#![allow( + unknown_lints, + // TODO: 1.82: Add `use` where necessary and remove from this list. + impl_trait_overcaptures, +)] -use std::convert::Infallible; +use std::{convert::Infallible, future::Future}; use anyhow::{bail, Context}; use intern::{EndpointIdInt, EndpointIdTag, InternId}; @@ -35,7 +117,14 @@ pub mod usage_metrics; pub mod waiters; /// Handle unix signals appropriately. -pub async fn handle_signals(token: CancellationToken) -> anyhow::Result { +pub async fn handle_signals( + token: CancellationToken, + mut refresh_config: F, +) -> anyhow::Result +where + F: FnMut() -> Fut, + Fut: Future>, +{ use tokio::signal::unix::{signal, SignalKind}; let mut hangup = signal(SignalKind::hangup())?; @@ -46,7 +135,8 @@ pub async fn handle_signals(token: CancellationToken) -> anyhow::Result { - warn!("received SIGHUP; config reload is not supported"); + warn!("received SIGHUP"); + refresh_config().await?; } // Shut down the whole application. _ = interrupt.recv() => { @@ -72,7 +162,8 @@ macro_rules! 
smol_str_wrapper { pub struct $name(smol_str::SmolStr); impl $name { - pub fn as_str(&self) -> &str { + #[allow(unused)] + pub(crate) fn as_str(&self) -> &str { self.0.as_str() } } @@ -167,19 +258,19 @@ smol_str_wrapper!(Host); // Endpoints are a bit tricky. Rare they might be branches or projects. impl EndpointId { - pub fn is_endpoint(&self) -> bool { + pub(crate) fn is_endpoint(&self) -> bool { self.0.starts_with("ep-") } - pub fn is_branch(&self) -> bool { + pub(crate) fn is_branch(&self) -> bool { self.0.starts_with("br-") } - pub fn is_project(&self) -> bool { - !self.is_endpoint() && !self.is_branch() - } - pub fn as_branch(&self) -> BranchId { + // pub(crate) fn is_project(&self) -> bool { + // !self.is_endpoint() && !self.is_branch() + // } + pub(crate) fn as_branch(&self) -> BranchId { BranchId(self.0.clone()) } - pub fn as_project(&self) -> ProjectId { + pub(crate) fn as_project(&self) -> ProjectId { ProjectId(self.0.clone()) } } diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index ccef88231b..2da7eac580 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -4,8 +4,8 @@ use lasso::ThreadedRodeo; use measured::{ label::{FixedCardinalitySet, LabelGroupSet, LabelName, LabelSet, LabelValue, StaticLabelSet}, metric::{histogram::Thresholds, name::MetricName}, - Counter, CounterVec, FixedCardinalityLabel, Gauge, GaugeVec, Histogram, HistogramVec, - LabelGroup, MetricGroup, + Counter, CounterVec, FixedCardinalityLabel, Gauge, Histogram, HistogramVec, LabelGroup, + MetricGroup, }; use metrics::{CounterPairAssoc, CounterPairVec, HyperLogLog, HyperLogLogVec}; @@ -548,6 +548,7 @@ pub enum RedisEventsCount { } pub struct ThreadPoolWorkers(usize); +#[derive(Copy, Clone)] pub struct ThreadPoolWorkerId(pub usize); impl LabelValue for ThreadPoolWorkerId { @@ -613,9 +614,6 @@ impl FixedCardinalitySet for ThreadPoolWorkers { #[derive(MetricGroup)] #[metric(new(workers: usize))] pub struct ThreadPoolMetrics { - pub injector_queue_depth: Gauge, - 
#[metric(init = GaugeVec::with_label_set(ThreadPoolWorkers(workers)))] - pub worker_queue_depth: GaugeVec, #[metric(init = CounterVec::with_label_set(ThreadPoolWorkers(workers)))] pub worker_task_turns_total: CounterVec, #[metric(init = CounterVec::with_label_set(ThreadPoolWorkers(workers)))] diff --git a/proxy/src/parse.rs b/proxy/src/parse.rs index 0d03574901..8c0f251066 100644 --- a/proxy/src/parse.rs +++ b/proxy/src/parse.rs @@ -2,14 +2,14 @@ use std::ffi::CStr; -pub fn split_cstr(bytes: &[u8]) -> Option<(&CStr, &[u8])> { +pub(crate) fn split_cstr(bytes: &[u8]) -> Option<(&CStr, &[u8])> { let cstr = CStr::from_bytes_until_nul(bytes).ok()?; let (_, other) = bytes.split_at(cstr.to_bytes_with_nul().len()); Some((cstr, other)) } /// See . -pub fn split_at_const(bytes: &[u8]) -> Option<(&[u8; N], &[u8])> { +pub(crate) fn split_at_const(bytes: &[u8]) -> Option<(&[u8; N], &[u8])> { (bytes.len() >= N).then(|| { let (head, tail) = bytes.split_at(N); (head.try_into().unwrap(), tail) diff --git a/proxy/src/protocol2.rs b/proxy/src/protocol2.rs index 1dd4563514..17764f78d1 100644 --- a/proxy/src/protocol2.rs +++ b/proxy/src/protocol2.rs @@ -13,9 +13,9 @@ use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, ReadBuf}; pin_project! 
{ /// A chained [`AsyncRead`] with [`AsyncWrite`] passthrough - pub struct ChainRW { + pub(crate) struct ChainRW { #[pin] - pub inner: T, + pub(crate) inner: T, buf: BytesMut, } } @@ -60,7 +60,7 @@ const HEADER: [u8; 12] = [ 0x0D, 0x0A, 0x0D, 0x0A, 0x00, 0x0D, 0x0A, 0x51, 0x55, 0x49, 0x54, 0x0A, ]; -pub async fn read_proxy_protocol( +pub(crate) async fn read_proxy_protocol( mut read: T, ) -> std::io::Result<(ChainRW, Option)> { let mut buf = BytesMut::with_capacity(128); diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 2182f38fe7..ff199ac701 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -1,12 +1,12 @@ #[cfg(test)] mod tests; -pub mod connect_compute; +pub(crate) mod connect_compute; mod copy_bidirectional; -pub mod handshake; -pub mod passthrough; -pub mod retry; -pub mod wake_compute; +pub(crate) mod handshake; +pub(crate) mod passthrough; +pub(crate) mod retry; +pub(crate) mod wake_compute; pub use copy_bidirectional::copy_bidirectional_client_compute; pub use copy_bidirectional::ErrorSource; @@ -170,21 +170,21 @@ pub async fn task_main( Ok(()) } -pub enum ClientMode { +pub(crate) enum ClientMode { Tcp, Websockets { hostname: Option }, } /// Abstracts the logic of handling TCP vs WS clients impl ClientMode { - pub fn allow_cleartext(&self) -> bool { + pub(crate) fn allow_cleartext(&self) -> bool { match self { ClientMode::Tcp => false, ClientMode::Websockets { .. } => true, } } - pub fn allow_self_signed_compute(&self, config: &ProxyConfig) -> bool { + pub(crate) fn allow_self_signed_compute(&self, config: &ProxyConfig) -> bool { match self { ClientMode::Tcp => config.allow_self_signed_compute, ClientMode::Websockets { .. } => false, @@ -213,7 +213,7 @@ impl ClientMode { // 2. Handshake: handshake reports errors if it can, otherwise if the handshake fails due to protocol violation, // we cannot be sure the client even understands our error message // 3. PrepareClient: The client disconnected, so we can't tell them anyway... 
-pub enum ClientRequestError { +pub(crate) enum ClientRequestError { #[error("{0}")] Cancellation(#[from] cancellation::CancelError), #[error("{0}")] @@ -238,7 +238,7 @@ impl ReportableError for ClientRequestError { } } -pub async fn handle_client( +pub(crate) async fn handle_client( config: &'static ProxyConfig, ctx: &RequestMonitoring, cancellation_handler: Arc, @@ -254,7 +254,7 @@ pub async fn handle_client( let metrics = &Metrics::get().proxy; let proto = ctx.protocol(); - let _request_gauge = metrics.connection_requests.guard(proto); + let request_gauge = metrics.connection_requests.guard(proto); let tls = config.tls_config.as_ref(); @@ -283,7 +283,7 @@ pub async fn handle_client( let result = config .auth_backend .as_ref() - .map(|_| auth::ComputeUserInfoMaybeEndpoint::parse(ctx, ¶ms, hostname, common_names)) + .map(|()| auth::ComputeUserInfoMaybeEndpoint::parse(ctx, ¶ms, hostname, common_names)) .transpose(); let user_info = match result { @@ -340,9 +340,9 @@ pub async fn handle_client( client: stream, aux: node.aux.clone(), compute: node, - req: _request_gauge, - conn: conn_gauge, - cancel: session, + _req: request_gauge, + _conn: conn_gauge, + _cancel: session, })) } @@ -377,20 +377,20 @@ async fn prepare_client_connection

( } #[derive(Debug, Clone, PartialEq, Eq, Default)] -pub struct NeonOptions(Vec<(SmolStr, SmolStr)>); +pub(crate) struct NeonOptions(Vec<(SmolStr, SmolStr)>); impl NeonOptions { - pub fn parse_params(params: &StartupMessageParams) -> Self { + pub(crate) fn parse_params(params: &StartupMessageParams) -> Self { params .options_raw() .map(Self::parse_from_iter) .unwrap_or_default() } - pub fn parse_options_raw(options: &str) -> Self { + pub(crate) fn parse_options_raw(options: &str) -> Self { Self::parse_from_iter(StartupMessageParams::parse_options_raw(options)) } - pub fn is_ephemeral(&self) -> bool { + pub(crate) fn is_ephemeral(&self) -> bool { // Currently, neon endpoint options are all reserved for ephemeral endpoints. !self.0.is_empty() } @@ -404,7 +404,7 @@ impl NeonOptions { Self(options) } - pub fn get_cache_key(&self, prefix: &str) -> EndpointCacheKey { + pub(crate) fn get_cache_key(&self, prefix: &str) -> EndpointCacheKey { // prefix + format!(" {k}:{v}") // kinda jank because SmolStr is immutable std::iter::once(prefix) @@ -415,7 +415,7 @@ impl NeonOptions { /// DeepObject format /// `paramName[prop1]=value1¶mName[prop2]=value2&...` - pub fn to_deep_object(&self) -> Vec<(SmolStr, SmolStr)> { + pub(crate) fn to_deep_object(&self) -> Vec<(SmolStr, SmolStr)> { self.0 .iter() .map(|(k, v)| (format_smolstr!("options[{}]", k), v.clone())) @@ -423,7 +423,7 @@ impl NeonOptions { } } -pub fn neon_option(bytes: &str) -> Option<(&str, &str)> { +pub(crate) fn neon_option(bytes: &str) -> Option<(&str, &str)> { static RE: OnceCell = OnceCell::new(); let re = RE.get_or_init(|| Regex::new(r"^neon_(\w+):(.+)").unwrap()); diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index f38e43ba5a..613548d4a0 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -25,14 +25,15 @@ const CONNECT_TIMEOUT: time::Duration = time::Duration::from_secs(2); /// (e.g. 
the compute node's address might've changed at the wrong time). /// Invalidate the cache entry (if any) to prevent subsequent errors. #[tracing::instrument(name = "invalidate_cache", skip_all)] -pub fn invalidate_cache(node_info: console::CachedNodeInfo) -> NodeInfo { +pub(crate) fn invalidate_cache(node_info: console::CachedNodeInfo) -> NodeInfo { let is_cached = node_info.cached(); if is_cached { warn!("invalidating stalled compute node info cache entry"); } - let label = match is_cached { - true => ConnectionFailureKind::ComputeCached, - false => ConnectionFailureKind::ComputeUncached, + let label = if is_cached { + ConnectionFailureKind::ComputeCached + } else { + ConnectionFailureKind::ComputeUncached }; Metrics::get().proxy.connection_failures_total.inc(label); @@ -40,7 +41,7 @@ pub fn invalidate_cache(node_info: console::CachedNodeInfo) -> NodeInfo { } #[async_trait] -pub trait ConnectMechanism { +pub(crate) trait ConnectMechanism { type Connection; type ConnectError: ReportableError; type Error: From; @@ -55,21 +56,21 @@ pub trait ConnectMechanism { } #[async_trait] -pub trait ComputeConnectBackend { +pub(crate) trait ComputeConnectBackend { async fn wake_compute( &self, ctx: &RequestMonitoring, ) -> Result; - fn get_keys(&self) -> Option<&ComputeCredentialKeys>; + fn get_keys(&self) -> &ComputeCredentialKeys; } -pub struct TcpMechanism<'a> { +pub(crate) struct TcpMechanism<'a> { /// KV-dictionary with PostgreSQL connection params. - pub params: &'a StartupMessageParams, + pub(crate) params: &'a StartupMessageParams, /// connect_to_compute concurrency lock - pub locks: &'static ApiLocks, + pub(crate) locks: &'static ApiLocks, } #[async_trait] @@ -97,7 +98,7 @@ impl ConnectMechanism for TcpMechanism<'_> { /// Try to connect to the compute node, retrying if necessary. 
#[tracing::instrument(skip_all)] -pub async fn connect_to_compute( +pub(crate) async fn connect_to_compute( ctx: &RequestMonitoring, mechanism: &M, user_info: &B, @@ -112,9 +113,8 @@ where let mut num_retries = 0; let mut node_info = wake_compute(&mut num_retries, ctx, user_info, wake_compute_retry_config).await?; - if let Some(keys) = user_info.get_keys() { - node_info.set_keys(keys); - } + + node_info.set_keys(user_info.get_keys()); node_info.allow_self_signed_compute = allow_self_signed_compute; // let mut node_info = credentials.get_node_info(ctx, user_info).await?; mechanism.update_connect_config(&mut node_info.config); diff --git a/proxy/src/proxy/copy_bidirectional.rs b/proxy/src/proxy/copy_bidirectional.rs index 048523f69c..4ebda013ac 100644 --- a/proxy/src/proxy/copy_bidirectional.rs +++ b/proxy/src/proxy/copy_bidirectional.rs @@ -14,7 +14,7 @@ enum TransferState { } #[derive(Debug)] -pub enum ErrorDirection { +pub(crate) enum ErrorDirection { Read(io::Error), Write(io::Error), } @@ -230,11 +230,10 @@ impl CopyBuffer { io::ErrorKind::WriteZero, "write zero byte into writer", )))); - } else { - self.pos += i; - self.amt += i as u64; - self.need_flush = true; } + self.pos += i; + self.amt += i as u64; + self.need_flush = true; } // If pos larger than cap, this loop will never stop. 
diff --git a/proxy/src/proxy/handshake.rs b/proxy/src/proxy/handshake.rs index 27a72f8072..5996b11c11 100644 --- a/proxy/src/proxy/handshake.rs +++ b/proxy/src/proxy/handshake.rs @@ -18,7 +18,7 @@ use crate::{ }; #[derive(Error, Debug)] -pub enum HandshakeError { +pub(crate) enum HandshakeError { #[error("data is sent before server replied with EncryptionResponse")] EarlyData, @@ -57,7 +57,7 @@ impl ReportableError for HandshakeError { } } -pub enum HandshakeData { +pub(crate) enum HandshakeData { Startup(PqStream>, StartupMessageParams), Cancel(CancelKeyData), } @@ -67,7 +67,7 @@ pub enum HandshakeData { /// It's easier to work with owned `stream` here as we need to upgrade it to TLS; /// we also take an extra care of propagating only the select handshake errors to client. #[tracing::instrument(skip_all)] -pub async fn handshake( +pub(crate) async fn handshake( ctx: &RequestMonitoring, stream: S, mut tls: Option<&TlsConfig>, diff --git a/proxy/src/proxy/passthrough.rs b/proxy/src/proxy/passthrough.rs index 9942fac383..c17108de0a 100644 --- a/proxy/src/proxy/passthrough.rs +++ b/proxy/src/proxy/passthrough.rs @@ -14,7 +14,7 @@ use super::copy_bidirectional::ErrorSource; /// Forward bytes in both directions (client <-> compute). #[tracing::instrument(skip_all)] -pub async fn proxy_pass( +pub(crate) async fn proxy_pass( client: impl AsyncRead + AsyncWrite + Unpin, compute: impl AsyncRead + AsyncWrite + Unpin, aux: MetricsAuxInfo, @@ -57,18 +57,18 @@ pub async fn proxy_pass( Ok(()) } -pub struct ProxyPassthrough { - pub client: Stream, - pub compute: PostgresConnection, - pub aux: MetricsAuxInfo, +pub(crate) struct ProxyPassthrough { + pub(crate) client: Stream, + pub(crate) compute: PostgresConnection, + pub(crate) aux: MetricsAuxInfo, - pub req: NumConnectionRequestsGuard<'static>, - pub conn: NumClientConnectionsGuard<'static>, - pub cancel: cancellation::Session

, + pub(crate) _req: NumConnectionRequestsGuard<'static>, + pub(crate) _conn: NumClientConnectionsGuard<'static>, + pub(crate) _cancel: cancellation::Session

, } impl ProxyPassthrough { - pub async fn proxy_pass(self) -> Result<(), ErrorSource> { + pub(crate) async fn proxy_pass(self) -> Result<(), ErrorSource> { let res = proxy_pass(self.client, self.compute.stream, self.aux).await; if let Err(err) = self.compute.cancel_closure.try_cancel_query().await { tracing::error!(?err, "could not cancel the query in the database"); diff --git a/proxy/src/proxy/retry.rs b/proxy/src/proxy/retry.rs index 644b183a91..15895d37e6 100644 --- a/proxy/src/proxy/retry.rs +++ b/proxy/src/proxy/retry.rs @@ -2,18 +2,18 @@ use crate::{compute, config::RetryConfig}; use std::{error::Error, io}; use tokio::time; -pub trait CouldRetry { +pub(crate) trait CouldRetry { /// Returns true if the error could be retried fn could_retry(&self) -> bool; } -pub trait ShouldRetryWakeCompute { +pub(crate) trait ShouldRetryWakeCompute { /// Returns true if we need to invalidate the cache for this node. /// If false, we can continue retrying with the current node cache. fn should_retry_wake_compute(&self) -> bool; } -pub fn should_retry(err: &impl CouldRetry, num_retries: u32, config: RetryConfig) -> bool { +pub(crate) fn should_retry(err: &impl CouldRetry, num_retries: u32, config: RetryConfig) -> bool { num_retries < config.max_retries && err.could_retry() } @@ -101,7 +101,7 @@ impl ShouldRetryWakeCompute for compute::ConnectionError { } } -pub fn retry_after(num_retries: u32, config: RetryConfig) -> time::Duration { +pub(crate) fn retry_after(num_retries: u32, config: RetryConfig) -> time::Duration { config .base_delay .mul_f64(config.backoff_factor.powi((num_retries as i32) - 1)) diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index d8308c4f2a..752d982726 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -11,14 +11,14 @@ use crate::auth::backend::{ ComputeCredentialKeys, ComputeCredentials, ComputeUserInfo, MaybeOwned, TestBackend, }; use crate::config::{CertResolver, RetryConfig}; -use 
crate::console::caches::NodeInfoCache; use crate::console::messages::{ConsoleError, Details, MetricsAuxInfo, Status}; -use crate::console::provider::{CachedAllowedIps, CachedRoleSecret, ConsoleBackend}; +use crate::console::provider::{CachedAllowedIps, CachedRoleSecret, ConsoleBackend, NodeInfoCache}; use crate::console::{self, CachedNodeInfo, NodeInfo}; use crate::error::ErrorKind; -use crate::{http, sasl, scram, BranchId, EndpointId, ProjectId}; +use crate::{sasl, scram, BranchId, EndpointId, ProjectId}; use anyhow::{bail, Context}; use async_trait::async_trait; +use http::StatusCode; use retry::{retry_after, ShouldRetryWakeCompute}; use rstest::rstest; use rustls::pki_types; @@ -268,7 +268,7 @@ async fn keepalive_is_inherited() -> anyhow::Result<()> { anyhow::Ok(keepalive) }); - let _ = TcpStream::connect(("127.0.0.1", port)).await?; + TcpStream::connect(("127.0.0.1", port)).await?; assert!(t.await??, "keepalive should be inherited"); Ok(()) @@ -433,7 +433,7 @@ impl ReportableError for TestConnectError { impl std::fmt::Display for TestConnectError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{:?}", self) + write!(f, "{self:?}") } } @@ -475,7 +475,7 @@ impl ConnectMechanism for TestConnectMechanism { retryable: false, kind: ErrorKind::Compute, }), - x => panic!("expecting action {:?}, connect is called instead", x), + x => panic!("expecting action {x:?}, connect is called instead"), } } @@ -491,7 +491,7 @@ impl TestBackend for TestConnectMechanism { ConnectAction::Wake => Ok(helper_create_cached_node_info(self.cache)), ConnectAction::WakeFail => { let err = console::errors::ApiError::Console(ConsoleError { - http_status_code: http::StatusCode::BAD_REQUEST, + http_status_code: StatusCode::BAD_REQUEST, error: "TEST".into(), status: None, }); @@ -500,7 +500,7 @@ impl TestBackend for TestConnectMechanism { } ConnectAction::WakeRetry => { let err = console::errors::ApiError::Console(ConsoleError { - http_status_code: 
http::StatusCode::BAD_REQUEST, + http_status_code: StatusCode::BAD_REQUEST, error: "TEST".into(), status: Some(Status { code: "error".into(), @@ -515,7 +515,7 @@ impl TestBackend for TestConnectMechanism { assert!(err.could_retry()); Err(console::errors::WakeComputeError::ApiError(err)) } - x => panic!("expecting action {:?}, wake_compute is called instead", x), + x => panic!("expecting action {x:?}, wake_compute is called instead"), } } @@ -525,9 +525,6 @@ impl TestBackend for TestConnectMechanism { { unimplemented!("not used in tests") } - fn get_role_secret(&self) -> Result { - unimplemented!("not used in tests") - } } fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeInfo { @@ -547,8 +544,8 @@ fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeIn fn helper_create_connect_info( mechanism: &TestConnectMechanism, -) -> auth::BackendType<'static, ComputeCredentials, &()> { - let user_info = auth::BackendType::Console( +) -> auth::Backend<'static, ComputeCredentials, &()> { + let user_info = auth::Backend::Console( MaybeOwned::Owned(ConsoleBackend::Test(Box::new(mechanism.clone()))), ComputeCredentials { info: ComputeUserInfo { diff --git a/proxy/src/proxy/tests/mitm.rs b/proxy/src/proxy/tests/mitm.rs index 2d752b9183..33a2162bc7 100644 --- a/proxy/src/proxy/tests/mitm.rs +++ b/proxy/src/proxy/tests/mitm.rs @@ -102,7 +102,7 @@ async fn proxy_mitm( } /// taken from tokio-postgres -pub async fn connect_tls(mut stream: S, tls: T) -> T::Stream +pub(crate) async fn connect_tls(mut stream: S, tls: T) -> T::Stream where S: AsyncRead + AsyncWrite + Unpin, T: TlsConnect, @@ -115,9 +115,7 @@ where let mut buf = [0]; stream.read_exact(&mut buf).await.unwrap(); - if buf[0] != b'S' { - panic!("ssl not supported by server"); - } + assert!(buf[0] == b'S', "ssl not supported by server"); tls.connect(stream).await.unwrap() } diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs index 
5b06e8f054..9b8ac6d29d 100644 --- a/proxy/src/proxy/wake_compute.rs +++ b/proxy/src/proxy/wake_compute.rs @@ -12,7 +12,7 @@ use tracing::{error, info, warn}; use super::connect_compute::ComputeConnectBackend; -pub async fn wake_compute( +pub(crate) async fn wake_compute( num_retries: &mut u32, ctx: &RequestMonitoring, api: &B, diff --git a/proxy/src/rate_limiter.rs b/proxy/src/rate_limiter.rs index 222cd431d2..6e38f89458 100644 --- a/proxy/src/rate_limiter.rs +++ b/proxy/src/rate_limiter.rs @@ -1,10 +1,14 @@ +mod leaky_bucket; mod limit_algorithm; mod limiter; -pub use limit_algorithm::{ - aimd::Aimd, DynamicLimiter, Outcome, RateLimitAlgorithm, RateLimiterConfig, Token, -}; -pub use limiter::{BucketRateLimiter, GlobalRateLimiter, RateBucketInfo, WakeComputeRateLimiter}; -mod leaky_bucket; -pub use leaky_bucket::{ - EndpointRateLimiter, LeakyBucketConfig, LeakyBucketRateLimiter, LeakyBucketState, + +#[cfg(test)] +pub(crate) use limit_algorithm::aimd::Aimd; + +pub(crate) use limit_algorithm::{ + DynamicLimiter, Outcome, RateLimitAlgorithm, RateLimiterConfig, Token, }; +pub(crate) use limiter::GlobalRateLimiter; + +pub use leaky_bucket::{EndpointRateLimiter, LeakyBucketConfig, LeakyBucketRateLimiter}; +pub use limiter::{BucketRateLimiter, RateBucketInfo, WakeComputeRateLimiter}; diff --git a/proxy/src/rate_limiter/leaky_bucket.rs b/proxy/src/rate_limiter/leaky_bucket.rs index 2d5e056540..bf4d85f2e4 100644 --- a/proxy/src/rate_limiter/leaky_bucket.rs +++ b/proxy/src/rate_limiter/leaky_bucket.rs @@ -8,6 +8,7 @@ use dashmap::DashMap; use rand::{thread_rng, Rng}; use tokio::time::Instant; use tracing::info; +use utils::leaky_bucket::LeakyBucketState; use crate::intern::EndpointIdInt; @@ -16,7 +17,7 @@ pub type EndpointRateLimiter = LeakyBucketRateLimiter; pub struct LeakyBucketRateLimiter { map: DashMap, - config: LeakyBucketConfig, + config: utils::leaky_bucket::LeakyBucketConfig, access_count: AtomicUsize, } @@ -29,25 +30,25 @@ impl LeakyBucketRateLimiter { pub fn 
new_with_shards(config: LeakyBucketConfig, shards: usize) -> Self { Self { map: DashMap::with_hasher_and_shard_amount(RandomState::new(), shards), - config, + config: config.into(), access_count: AtomicUsize::new(0), } } /// Check that number of connections to the endpoint is below `max_rps` rps. - pub fn check(&self, key: K, n: u32) -> bool { + pub(crate) fn check(&self, key: K, n: u32) -> bool { let now = Instant::now(); if self.access_count.fetch_add(1, Ordering::AcqRel) % 2048 == 0 { self.do_gc(now); } - let mut entry = self.map.entry(key).or_insert_with(|| LeakyBucketState { - time: now, - filled: 0.0, - }); + let mut entry = self + .map + .entry(key) + .or_insert_with(|| LeakyBucketState { empty_at: now }); - entry.check(&self.config, now, n as f64) + entry.add_tokens(&self.config, now, n as f64).is_ok() } fn do_gc(&self, now: Instant) { @@ -59,7 +60,7 @@ impl LeakyBucketRateLimiter { let shard = thread_rng().gen_range(0..n); self.map.shards()[shard] .write() - .retain(|_, value| !value.get_mut().update(&self.config, now)); + .retain(|_, value| !value.get().bucket_is_empty(now)); } } @@ -68,103 +69,76 @@ pub struct LeakyBucketConfig { pub max: f64, } -pub struct LeakyBucketState { - filled: f64, - time: Instant, -} - +#[cfg(test)] impl LeakyBucketConfig { - pub fn new(rps: f64, max: f64) -> Self { + pub(crate) fn new(rps: f64, max: f64) -> Self { assert!(rps > 0.0, "rps must be positive"); assert!(max > 0.0, "max must be positive"); Self { rps, max } } } -impl LeakyBucketState { - pub fn new() -> Self { - Self { - filled: 0.0, - time: Instant::now(), - } - } - - /// updates the timer and returns true if the bucket is empty - fn update(&mut self, info: &LeakyBucketConfig, now: Instant) -> bool { - let drain = now.duration_since(self.time); - let drain = drain.as_secs_f64() * info.rps; - - self.filled = (self.filled - drain).clamp(0.0, info.max); - self.time = now; - - self.filled == 0.0 - } - - pub fn check(&mut self, info: &LeakyBucketConfig, now: Instant, n: 
f64) -> bool { - self.update(info, now); - - if self.filled + n > info.max { - return false; - } - self.filled += n; - - true - } -} - -impl Default for LeakyBucketState { - fn default() -> Self { - Self::new() +impl From for utils::leaky_bucket::LeakyBucketConfig { + fn from(config: LeakyBucketConfig) -> Self { + utils::leaky_bucket::LeakyBucketConfig::new(config.rps, config.max) } } #[cfg(test)] +#[allow(clippy::float_cmp)] mod tests { use std::time::Duration; use tokio::time::Instant; + use utils::leaky_bucket::LeakyBucketState; - use super::{LeakyBucketConfig, LeakyBucketState}; + use super::LeakyBucketConfig; #[tokio::test(start_paused = true)] async fn check() { - let info = LeakyBucketConfig::new(500.0, 2000.0); - let mut bucket = LeakyBucketState::new(); + let config: utils::leaky_bucket::LeakyBucketConfig = + LeakyBucketConfig::new(500.0, 2000.0).into(); + assert_eq!(config.cost, Duration::from_millis(2)); + assert_eq!(config.bucket_width, Duration::from_secs(4)); + + let mut bucket = LeakyBucketState { + empty_at: Instant::now(), + }; // should work for 2000 requests this second for _ in 0..2000 { - assert!(bucket.check(&info, Instant::now(), 1.0)); + bucket.add_tokens(&config, Instant::now(), 1.0).unwrap(); } - assert!(!bucket.check(&info, Instant::now(), 1.0)); - assert_eq!(bucket.filled, 2000.0); + bucket.add_tokens(&config, Instant::now(), 1.0).unwrap_err(); + assert_eq!(bucket.empty_at - Instant::now(), config.bucket_width); // in 1ms we should drain 0.5 tokens. 
// make sure we don't lose any tokens tokio::time::advance(Duration::from_millis(1)).await; - assert!(!bucket.check(&info, Instant::now(), 1.0)); + bucket.add_tokens(&config, Instant::now(), 1.0).unwrap_err(); tokio::time::advance(Duration::from_millis(1)).await; - assert!(bucket.check(&info, Instant::now(), 1.0)); + bucket.add_tokens(&config, Instant::now(), 1.0).unwrap(); // in 10ms we should drain 5 tokens tokio::time::advance(Duration::from_millis(10)).await; for _ in 0..5 { - assert!(bucket.check(&info, Instant::now(), 1.0)); + bucket.add_tokens(&config, Instant::now(), 1.0).unwrap(); } - assert!(!bucket.check(&info, Instant::now(), 1.0)); + bucket.add_tokens(&config, Instant::now(), 1.0).unwrap_err(); // in 10s we should drain 5000 tokens // but cap is only 2000 tokio::time::advance(Duration::from_secs(10)).await; for _ in 0..2000 { - assert!(bucket.check(&info, Instant::now(), 1.0)); + bucket.add_tokens(&config, Instant::now(), 1.0).unwrap(); } - assert!(!bucket.check(&info, Instant::now(), 1.0)); + bucket.add_tokens(&config, Instant::now(), 1.0).unwrap_err(); // should sustain 500rps for _ in 0..2000 { tokio::time::advance(Duration::from_millis(10)).await; for _ in 0..5 { - assert!(bucket.check(&info, Instant::now(), 1.0)); + bucket.add_tokens(&config, Instant::now(), 1.0).unwrap(); } } } diff --git a/proxy/src/rate_limiter/limit_algorithm.rs b/proxy/src/rate_limiter/limit_algorithm.rs index 80a62b2a76..25607b7e10 100644 --- a/proxy/src/rate_limiter/limit_algorithm.rs +++ b/proxy/src/rate_limiter/limit_algorithm.rs @@ -8,13 +8,13 @@ use tokio::{ use self::aimd::Aimd; -pub mod aimd; +pub(crate) mod aimd; /// Whether a job succeeded or failed as a result of congestion/overload. /// /// Errors not considered to be caused by overload should be ignored. #[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum Outcome { +pub(crate) enum Outcome { /// The job succeeded, or failed in a way unrelated to overload. Success, /// The job failed because of overload, e.g. 
it timed out or an explicit backpressure signal @@ -23,14 +23,14 @@ pub enum Outcome { } /// An algorithm for controlling a concurrency limit. -pub trait LimitAlgorithm: Send + Sync + 'static { +pub(crate) trait LimitAlgorithm: Send + Sync + 'static { /// Update the concurrency limit in response to a new job completion. fn update(&self, old_limit: usize, sample: Sample) -> usize; } /// The result of a job (or jobs), including the [`Outcome`] (loss) and latency (delay). #[derive(Debug, Clone, PartialEq, Eq, Copy)] -pub struct Sample { +pub(crate) struct Sample { pub(crate) latency: Duration, /// Jobs in flight when the sample was taken. pub(crate) in_flight: usize, @@ -39,7 +39,7 @@ pub struct Sample { #[derive(Clone, Copy, Debug, Default, serde::Deserialize, PartialEq)] #[serde(rename_all = "snake_case")] -pub enum RateLimitAlgorithm { +pub(crate) enum RateLimitAlgorithm { #[default] Fixed, Aimd { @@ -48,7 +48,7 @@ pub enum RateLimitAlgorithm { }, } -pub struct Fixed; +pub(crate) struct Fixed; impl LimitAlgorithm for Fixed { fn update(&self, old_limit: usize, _sample: Sample) -> usize { @@ -59,12 +59,12 @@ impl LimitAlgorithm for Fixed { #[derive(Clone, Copy, Debug, serde::Deserialize, PartialEq)] pub struct RateLimiterConfig { #[serde(flatten)] - pub algorithm: RateLimitAlgorithm, - pub initial_limit: usize, + pub(crate) algorithm: RateLimitAlgorithm, + pub(crate) initial_limit: usize, } impl RateLimiterConfig { - pub fn create_rate_limit_algorithm(self) -> Box { + pub(crate) fn create_rate_limit_algorithm(self) -> Box { match self.algorithm { RateLimitAlgorithm::Fixed => Box::new(Fixed), RateLimitAlgorithm::Aimd { conf } => Box::new(conf), @@ -72,7 +72,7 @@ impl RateLimiterConfig { } } -pub struct LimiterInner { +pub(crate) struct LimiterInner { alg: Box, available: usize, limit: usize, @@ -114,7 +114,7 @@ impl LimiterInner { /// /// The limit will be automatically adjusted based on observed latency (delay) and/or failures /// caused by overload (loss). 
-pub struct DynamicLimiter { +pub(crate) struct DynamicLimiter { config: RateLimiterConfig, inner: Mutex, // to notify when a token is available @@ -124,7 +124,7 @@ pub struct DynamicLimiter { /// A concurrency token, required to run a job. /// /// Release the token back to the [`DynamicLimiter`] after the job is complete. -pub struct Token { +pub(crate) struct Token { start: Instant, limiter: Option>, } @@ -133,14 +133,14 @@ pub struct Token { /// /// Not guaranteed to be consistent under high concurrency. #[derive(Debug, Clone, Copy)] -pub struct LimiterState { +#[cfg(test)] +struct LimiterState { limit: usize, - in_flight: usize, } impl DynamicLimiter { /// Create a limiter with a given limit control algorithm. - pub fn new(config: RateLimiterConfig) -> Arc { + pub(crate) fn new(config: RateLimiterConfig) -> Arc { let ready = Notify::new(); ready.notify_one(); @@ -157,7 +157,10 @@ impl DynamicLimiter { } /// Try to acquire a concurrency [Token], waiting for `duration` if there are none available. - pub async fn acquire_timeout(self: &Arc, duration: Duration) -> Result { + pub(crate) async fn acquire_timeout( + self: &Arc, + duration: Duration, + ) -> Result { tokio::time::timeout(duration, self.acquire()).await? } @@ -174,9 +177,8 @@ impl DynamicLimiter { let mut inner = self.inner.lock(); if inner.take(&self.ready).is_some() { break Ok(Token::new(self.clone())); - } else { - notified.set(self.ready.notified()); } + notified.set(self.ready.notified()); } notified.as_mut().await; ready = true; @@ -209,12 +211,10 @@ impl DynamicLimiter { } /// The current state of the limiter. 
- pub fn state(&self) -> LimiterState { + #[cfg(test)] + fn state(&self) -> LimiterState { let inner = self.inner.lock(); - LimiterState { - limit: inner.limit, - in_flight: inner.in_flight, - } + LimiterState { limit: inner.limit } } } @@ -225,22 +225,22 @@ impl Token { limiter: Some(limiter), } } - pub fn disabled() -> Self { + pub(crate) fn disabled() -> Self { Self { start: Instant::now(), limiter: None, } } - pub fn is_disabled(&self) -> bool { + pub(crate) fn is_disabled(&self) -> bool { self.limiter.is_none() } - pub fn release(mut self, outcome: Outcome) { + pub(crate) fn release(mut self, outcome: Outcome) { self.release_mut(Some(outcome)); } - pub fn release_mut(&mut self, outcome: Option) { + pub(crate) fn release_mut(&mut self, outcome: Option) { if let Some(limiter) = self.limiter.take() { limiter.release_inner(self.start, outcome); } @@ -253,13 +253,10 @@ impl Drop for Token { } } +#[cfg(test)] impl LimiterState { /// The current concurrency limit. - pub fn limit(&self) -> usize { + fn limit(self) -> usize { self.limit } - /// The number of jobs in flight. - pub fn in_flight(&self) -> usize { - self.in_flight - } } diff --git a/proxy/src/rate_limiter/limit_algorithm/aimd.rs b/proxy/src/rate_limiter/limit_algorithm/aimd.rs index d669492fa6..86b56e38fb 100644 --- a/proxy/src/rate_limiter/limit_algorithm/aimd.rs +++ b/proxy/src/rate_limiter/limit_algorithm/aimd.rs @@ -10,17 +10,17 @@ use super::{LimitAlgorithm, Outcome, Sample}; /// /// Reduces available concurrency by a factor when load-based errors are detected. #[derive(Clone, Copy, Debug, serde::Deserialize, PartialEq)] -pub struct Aimd { +pub(crate) struct Aimd { /// Minimum limit for AIMD algorithm. - pub min: usize, + pub(crate) min: usize, /// Maximum limit for AIMD algorithm. - pub max: usize, + pub(crate) max: usize, /// Decrease AIMD decrease by value in case of error. - pub dec: f32, + pub(crate) dec: f32, /// Increase AIMD increase by value in case of success. 
- pub inc: usize, + pub(crate) inc: usize, /// A threshold below which the limit won't be increased. - pub utilisation: f32, + pub(crate) utilisation: f32, } impl LimitAlgorithm for Aimd { diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index 5db4efed37..be529f174d 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -17,13 +17,13 @@ use tracing::info; use crate::intern::EndpointIdInt; -pub struct GlobalRateLimiter { +pub(crate) struct GlobalRateLimiter { data: Vec, info: Vec, } impl GlobalRateLimiter { - pub fn new(info: Vec) -> Self { + pub(crate) fn new(info: Vec) -> Self { Self { data: vec![ RateBucket { @@ -37,7 +37,7 @@ impl GlobalRateLimiter { } /// Check that number of connections is below `max_rps` rps. - pub fn check(&mut self) -> bool { + pub(crate) fn check(&mut self) -> bool { let now = Instant::now(); let should_allow_request = self @@ -96,9 +96,9 @@ impl RateBucket { #[derive(Clone, Copy, PartialEq)] pub struct RateBucketInfo { - pub interval: Duration, + pub(crate) interval: Duration, // requests per interval - pub max_rpi: u32, + pub(crate) max_rpi: u32, } impl std::fmt::Display for RateBucketInfo { @@ -192,7 +192,7 @@ impl BucketRateLimiter { } /// Check that number of connections to the endpoint is below `max_rps` rps. - pub fn check(&self, key: K, n: u32) -> bool { + pub(crate) fn check(&self, key: K, n: u32) -> bool { // do a partial GC every 2k requests. This cleans up ~ 1/64th of the map. // worst case memory usage is about: // = 2 * 2048 * 64 * (48B + 72B) @@ -228,7 +228,7 @@ impl BucketRateLimiter { /// Clean the map. Simple strategy: remove all entries in a random shard. /// At worst, we'll double the effective max_rps during the cleanup. /// But that way deletion does not aquire mutex on each entry access. 
- pub fn do_gc(&self) { + pub(crate) fn do_gc(&self) { info!( "cleaning up bucket rate limiter, current size = {}", self.map.len() diff --git a/proxy/src/redis/cancellation_publisher.rs b/proxy/src/redis/cancellation_publisher.rs index c9a946fa4a..95bdfc0965 100644 --- a/proxy/src/redis/cancellation_publisher.rs +++ b/proxy/src/redis/cancellation_publisher.rs @@ -109,7 +109,7 @@ impl RedisPublisherClient { let _: () = self.client.publish(PROXY_CHANNEL_NAME, payload).await?; Ok(()) } - pub async fn try_connect(&mut self) -> anyhow::Result<()> { + pub(crate) async fn try_connect(&mut self) -> anyhow::Result<()> { match self.client.connect().await { Ok(()) => {} Err(e) => { diff --git a/proxy/src/redis/connection_with_credentials_provider.rs b/proxy/src/redis/connection_with_credentials_provider.rs index c78ee166f1..2de66b58b1 100644 --- a/proxy/src/redis/connection_with_credentials_provider.rs +++ b/proxy/src/redis/connection_with_credentials_provider.rs @@ -6,7 +6,7 @@ use redis::{ ConnectionInfo, IntoConnectionInfo, RedisConnectionInfo, RedisResult, }; use tokio::task::JoinHandle; -use tracing::{error, info}; +use tracing::{debug, error, info}; use super::elasticache::CredentialsProvider; @@ -81,7 +81,7 @@ impl ConnectionWithCredentialsProvider { redis::cmd("PING").query_async(con).await } - pub async fn connect(&mut self) -> anyhow::Result<()> { + pub(crate) async fn connect(&mut self) -> anyhow::Result<()> { let _guard = self.mutex.lock().await; if let Some(con) = self.con.as_mut() { match Self::ping(con).await { @@ -109,7 +109,10 @@ impl ConnectionWithCredentialsProvider { let credentials_provider = credentials_provider.clone(); let con2 = con.clone(); let f = tokio::spawn(async move { - let _ = Self::keep_connection(con2, credentials_provider).await; + Self::keep_connection(con2, credentials_provider) + .await + .inspect_err(|e| debug!("keep_connection failed: {e}")) + .ok(); }); self.refresh_token_task = Some(f); } @@ -149,7 +152,7 @@ impl 
ConnectionWithCredentialsProvider { // PubSub does not support credentials refresh. // Requires manual reconnection every 12h. - pub async fn get_async_pubsub(&self) -> anyhow::Result { + pub(crate) async fn get_async_pubsub(&self) -> anyhow::Result { Ok(self.get_client().await?.get_async_pubsub().await?) } @@ -187,7 +190,10 @@ impl ConnectionWithCredentialsProvider { } /// Sends an already encoded (packed) command into the TCP socket and /// reads the single response from it. - pub async fn send_packed_command(&mut self, cmd: &redis::Cmd) -> RedisResult { + pub(crate) async fn send_packed_command( + &mut self, + cmd: &redis::Cmd, + ) -> RedisResult { // Clone connection to avoid having to lock the ArcSwap in write mode let con = self.con.as_mut().ok_or(redis::RedisError::from(( redis::ErrorKind::IoError, @@ -199,7 +205,7 @@ impl ConnectionWithCredentialsProvider { /// Sends multiple already encoded (packed) command into the TCP socket /// and reads `count` responses from it. This is used to implement /// pipelining. 
- pub async fn send_packed_commands( + pub(crate) async fn send_packed_commands( &mut self, cmd: &redis::Pipeline, offset: usize, diff --git a/proxy/src/redis/elasticache.rs b/proxy/src/redis/elasticache.rs index eded8250af..d118c8f412 100644 --- a/proxy/src/redis/elasticache.rs +++ b/proxy/src/redis/elasticache.rs @@ -51,7 +51,7 @@ impl CredentialsProvider { credentials_provider, } } - pub async fn provide_credentials(&self) -> anyhow::Result<(String, String)> { + pub(crate) async fn provide_credentials(&self) -> anyhow::Result<(String, String)> { let aws_credentials = self .credentials_provider .provide_credentials() diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index ad69246443..36a3443603 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -58,9 +58,9 @@ pub(crate) struct PasswordUpdate { } #[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] pub(crate) struct CancelSession { - pub region_id: Option, - pub cancel_key_data: CancelKeyData, - pub session_id: Uuid, + pub(crate) region_id: Option, + pub(crate) cancel_key_data: CancelKeyData, + pub(crate) session_id: Uuid, } fn deserialize_json_string<'de, D, T>(deserializer: D) -> Result @@ -89,7 +89,7 @@ impl Clone for MessageHandler { } impl MessageHandler { - pub fn new( + pub(crate) fn new( cache: Arc, cancellation_handler: Arc>, region_id: String, @@ -100,10 +100,10 @@ impl MessageHandler { region_id, } } - pub async fn increment_active_listeners(&self) { + pub(crate) async fn increment_active_listeners(&self) { self.cache.increment_active_listeners().await; } - pub async fn decrement_active_listeners(&self) { + pub(crate) async fn decrement_active_listeners(&self) { self.cache.decrement_active_listeners().await; } #[tracing::instrument(skip(self, msg), fields(session_id = tracing::field::Empty))] @@ -150,7 +150,7 @@ impl MessageHandler { } } } - _ => { + Notification::AllowedIpsUpdate { .. } | Notification::PasswordUpdate { .. 
} => { invalidate_cache(self.cache.clone(), msg.clone()); if matches!(msg, Notification::AllowedIpsUpdate { .. }) { Metrics::get() diff --git a/proxy/src/sasl.rs b/proxy/src/sasl.rs index 60207fc824..0a36694359 100644 --- a/proxy/src/sasl.rs +++ b/proxy/src/sasl.rs @@ -14,13 +14,13 @@ use crate::error::{ReportableError, UserFacingError}; use std::io; use thiserror::Error; -pub use channel_binding::ChannelBinding; -pub use messages::FirstMessage; -pub use stream::{Outcome, SaslStream}; +pub(crate) use channel_binding::ChannelBinding; +pub(crate) use messages::FirstMessage; +pub(crate) use stream::{Outcome, SaslStream}; /// Fine-grained auth errors help in writing tests. #[derive(Error, Debug)] -pub enum Error { +pub(crate) enum Error { #[error("Channel binding failed: {0}")] ChannelBindingFailed(&'static str), @@ -64,11 +64,11 @@ impl ReportableError for Error { } /// A convenient result type for SASL exchange. -pub type Result = std::result::Result; +pub(crate) type Result = std::result::Result; /// A result of one SASL exchange. #[must_use] -pub enum Step { +pub(crate) enum Step { /// We should continue exchanging messages. Continue(T, String), /// The client has been authenticated successfully. @@ -78,7 +78,7 @@ pub enum Step { } /// Every SASL mechanism (e.g. [SCRAM](crate::scram)) is expected to implement this trait. -pub trait Mechanism: Sized { +pub(crate) trait Mechanism: Sized { /// What's produced as a result of successful authentication. type Output; diff --git a/proxy/src/sasl/channel_binding.rs b/proxy/src/sasl/channel_binding.rs index 6e2d3057ce..fdd011448e 100644 --- a/proxy/src/sasl/channel_binding.rs +++ b/proxy/src/sasl/channel_binding.rs @@ -2,7 +2,7 @@ /// Channel binding flag (possibly with params). #[derive(Debug, PartialEq, Eq)] -pub enum ChannelBinding { +pub(crate) enum ChannelBinding { /// Client doesn't support channel binding. NotSupportedClient, /// Client thinks server doesn't support channel binding. 
@@ -12,7 +12,10 @@ pub enum ChannelBinding { } impl ChannelBinding { - pub fn and_then(self, f: impl FnOnce(T) -> Result) -> Result, E> { + pub(crate) fn and_then( + self, + f: impl FnOnce(T) -> Result, + ) -> Result, E> { Ok(match self { Self::NotSupportedClient => ChannelBinding::NotSupportedClient, Self::NotSupportedServer => ChannelBinding::NotSupportedServer, @@ -23,7 +26,7 @@ impl ChannelBinding { impl<'a> ChannelBinding<&'a str> { // NB: FromStr doesn't work with lifetimes - pub fn parse(input: &'a str) -> Option { + pub(crate) fn parse(input: &'a str) -> Option { Some(match input { "n" => Self::NotSupportedClient, "y" => Self::NotSupportedServer, @@ -34,7 +37,7 @@ impl<'a> ChannelBinding<&'a str> { impl ChannelBinding { /// Encode channel binding data as base64 for subsequent checks. - pub fn encode<'a, E>( + pub(crate) fn encode<'a, E>( &self, get_cbind_data: impl FnOnce(&T) -> Result<&'a [u8], E>, ) -> Result, E> { diff --git a/proxy/src/sasl/messages.rs b/proxy/src/sasl/messages.rs index 2b5ae1785d..6c9a42b2db 100644 --- a/proxy/src/sasl/messages.rs +++ b/proxy/src/sasl/messages.rs @@ -5,16 +5,16 @@ use pq_proto::{BeAuthenticationSaslMessage, BeMessage}; /// SASL-specific payload of [`PasswordMessage`](pq_proto::FeMessage::PasswordMessage). #[derive(Debug)] -pub struct FirstMessage<'a> { +pub(crate) struct FirstMessage<'a> { /// Authentication method, e.g. `"SCRAM-SHA-256"`. - pub method: &'a str, + pub(crate) method: &'a str, /// Initial client message. 
- pub message: &'a str, + pub(crate) message: &'a str, } impl<'a> FirstMessage<'a> { // NB: FromStr doesn't work with lifetimes - pub fn parse(bytes: &'a [u8]) -> Option { + pub(crate) fn parse(bytes: &'a [u8]) -> Option { let (method_cstr, tail) = split_cstr(bytes)?; let method = method_cstr.to_str().ok()?; diff --git a/proxy/src/sasl/stream.rs b/proxy/src/sasl/stream.rs index 9115b0f61a..b6becd28e1 100644 --- a/proxy/src/sasl/stream.rs +++ b/proxy/src/sasl/stream.rs @@ -7,7 +7,7 @@ use tokio::io::{AsyncRead, AsyncWrite}; use tracing::info; /// Abstracts away all peculiarities of the libpq's protocol. -pub struct SaslStream<'a, S> { +pub(crate) struct SaslStream<'a, S> { /// The underlying stream. stream: &'a mut PqStream, /// Current password message we received from client. @@ -17,7 +17,7 @@ pub struct SaslStream<'a, S> { } impl<'a, S> SaslStream<'a, S> { - pub fn new(stream: &'a mut PqStream, first: &'a str) -> Self { + pub(crate) fn new(stream: &'a mut PqStream, first: &'a str) -> Self { Self { stream, current: bytes::Bytes::new(), @@ -53,7 +53,7 @@ impl SaslStream<'_, S> { /// It's much easier to match on those two variants /// than to peek into a noisy protocol error type. #[must_use = "caller must explicitly check for success"] -pub enum Outcome { +pub(crate) enum Outcome { /// Authentication succeeded and produced some value. Success(R), /// Authentication failed (reason attached). @@ -63,7 +63,7 @@ pub enum Outcome { impl SaslStream<'_, S> { /// Perform SASL message exchange according to the underlying algorithm /// until user is either authenticated or denied access. 
- pub async fn authenticate( + pub(crate) async fn authenticate( mut self, mut mechanism: M, ) -> super::Result> { diff --git a/proxy/src/scram.rs b/proxy/src/scram.rs index 145e727a74..d058f1c3f8 100644 --- a/proxy/src/scram.rs +++ b/proxy/src/scram.rs @@ -15,9 +15,9 @@ mod secret; mod signature; pub mod threadpool; -pub use exchange::{exchange, Exchange}; -pub use key::ScramKey; -pub use secret::ServerSecret; +pub(crate) use exchange::{exchange, Exchange}; +pub(crate) use key::ScramKey; +pub(crate) use secret::ServerSecret; use hmac::{Hmac, Mac}; use sha2::{Digest, Sha256}; @@ -26,8 +26,8 @@ const SCRAM_SHA_256: &str = "SCRAM-SHA-256"; const SCRAM_SHA_256_PLUS: &str = "SCRAM-SHA-256-PLUS"; /// A list of supported SCRAM methods. -pub const METHODS: &[&str] = &[SCRAM_SHA_256_PLUS, SCRAM_SHA_256]; -pub const METHODS_WITHOUT_PLUS: &[&str] = &[SCRAM_SHA_256]; +pub(crate) const METHODS: &[&str] = &[SCRAM_SHA_256_PLUS, SCRAM_SHA_256]; +pub(crate) const METHODS_WITHOUT_PLUS: &[&str] = &[SCRAM_SHA_256]; /// Decode base64 into array without any heap allocations fn base64_decode_array(input: impl AsRef<[u8]>) -> Option<[u8; N]> { diff --git a/proxy/src/scram/countmin.rs b/proxy/src/scram/countmin.rs index 944bb3c83e..64ee0135e1 100644 --- a/proxy/src/scram/countmin.rs +++ b/proxy/src/scram/countmin.rs @@ -2,7 +2,7 @@ use std::hash::Hash; /// estimator of hash jobs per second. 
/// -pub struct CountMinSketch { +pub(crate) struct CountMinSketch { // one for each depth hashers: Vec, width: usize, @@ -20,7 +20,7 @@ impl CountMinSketch { /// actual <= estimate /// estimate <= actual + ε * N with probability 1 - δ /// where N is the cardinality of the stream - pub fn with_params(epsilon: f64, delta: f64) -> Self { + pub(crate) fn with_params(epsilon: f64, delta: f64) -> Self { CountMinSketch::new( (std::f64::consts::E / epsilon).ceil() as usize, (1.0_f64 / delta).ln().ceil() as usize, @@ -49,7 +49,7 @@ impl CountMinSketch { } } - pub fn inc_and_return(&mut self, t: &T, x: u32) -> u32 { + pub(crate) fn inc_and_return(&mut self, t: &T, x: u32) -> u32 { let mut min = u32::MAX; for row in 0..self.depth { let col = (self.hashers[row].hash_one(t) as usize) % self.width; @@ -61,7 +61,7 @@ impl CountMinSketch { min } - pub fn reset(&mut self) { + pub(crate) fn reset(&mut self) { self.buckets.clear(); self.buckets.resize(self.width * self.depth, 0); } @@ -83,10 +83,10 @@ mod tests { let mut ids = vec![]; for _ in 0..n { - // number of insert operations - let n = rng.gen_range(1..100); // number to insert at once - let m = rng.gen_range(1..4096); + let n = rng.gen_range(1..4096); + // number of insert operations + let m = rng.gen_range(1..100); let id = uuid::Builder::from_random_bytes(rng.gen()).into_uuid(); ids.push((id, n, m)); @@ -102,17 +102,11 @@ mod tests { let mut ids2 = ids.clone(); while !ids2.is_empty() { ids2.shuffle(&mut rng); - - let mut i = 0; - while i < ids2.len() { - sketch.inc_and_return(&ids2[i].0, ids2[i].1); - ids2[i].2 -= 1; - if ids2[i].2 == 0 { - ids2.remove(i); - } else { - i += 1; - } - } + ids2.retain_mut(|id| { + sketch.inc_and_return(&id.0, id.1); + id.2 -= 1; + id.2 > 0 + }); } let mut within_p = 0; @@ -144,8 +138,8 @@ mod tests { // probably numbers are too small to truly represent the probabilities. 
assert_eq!(eval_precision(100, 4096.0, 0.90), 100); assert_eq!(eval_precision(1000, 4096.0, 0.90), 1000); - assert_eq!(eval_precision(100, 4096.0, 0.1), 98); - assert_eq!(eval_precision(1000, 4096.0, 0.1), 991); + assert_eq!(eval_precision(100, 4096.0, 0.1), 96); + assert_eq!(eval_precision(1000, 4096.0, 0.1), 988); } // returns memory usage in bytes, and the time complexity per insert. diff --git a/proxy/src/scram/exchange.rs b/proxy/src/scram/exchange.rs index f2494379a5..afb5604666 100644 --- a/proxy/src/scram/exchange.rs +++ b/proxy/src/scram/exchange.rs @@ -56,14 +56,14 @@ enum ExchangeState { } /// Server's side of SCRAM auth algorithm. -pub struct Exchange<'a> { +pub(crate) struct Exchange<'a> { state: ExchangeState, secret: &'a ServerSecret, tls_server_end_point: config::TlsServerEndPoint, } impl<'a> Exchange<'a> { - pub fn new( + pub(crate) fn new( secret: &'a ServerSecret, nonce: fn() -> [u8; SCRAM_RAW_NONCE_LEN], tls_server_end_point: config::TlsServerEndPoint, @@ -86,8 +86,7 @@ async fn derive_client_key( ) -> ScramKey { let salted_password = pool .spawn_job(endpoint, Pbkdf2::start(password, salt, iterations)) - .await - .expect("job should not be cancelled"); + .await; let make_key = |name| { let key = Hmac::::new_from_slice(&salted_password) @@ -101,7 +100,7 @@ async fn derive_client_key( make_key(b"Client Key").into() } -pub async fn exchange( +pub(crate) async fn exchange( pool: &ThreadPool, endpoint: EndpointIdInt, secret: &ServerSecret, @@ -218,6 +217,7 @@ impl sasl::Mechanism for Exchange<'_> { self.state = ExchangeState::SaltSent(sent); Ok(Step::Continue(self, msg)) } + #[allow(unreachable_patterns)] // TODO: 1.82: simply drop this match Step::Success(x, _) => match x {}, Step::Failure(msg) => Ok(Step::Failure(msg)), } @@ -225,6 +225,7 @@ impl sasl::Mechanism for Exchange<'_> { ExchangeState::SaltSent(sent) => { match sent.transition(self.secret, &self.tls_server_end_point, input)? 
{ Step::Success(keys, msg) => Ok(Step::Success(keys, msg)), + #[allow(unreachable_patterns)] // TODO: 1.82: simply drop this match Step::Continue(x, _) => match x {}, Step::Failure(msg) => Ok(Step::Failure(msg)), } diff --git a/proxy/src/scram/key.rs b/proxy/src/scram/key.rs index 32a3dbd203..fe55ff493b 100644 --- a/proxy/src/scram/key.rs +++ b/proxy/src/scram/key.rs @@ -3,14 +3,14 @@ use subtle::ConstantTimeEq; /// Faithfully taken from PostgreSQL. -pub const SCRAM_KEY_LEN: usize = 32; +pub(crate) const SCRAM_KEY_LEN: usize = 32; /// One of the keys derived from the user's password. /// We use the same structure for all keys, i.e. /// `ClientKey`, `StoredKey`, and `ServerKey`. #[derive(Clone, Default, Eq, Debug)] #[repr(transparent)] -pub struct ScramKey { +pub(crate) struct ScramKey { bytes: [u8; SCRAM_KEY_LEN], } @@ -27,11 +27,11 @@ impl ConstantTimeEq for ScramKey { } impl ScramKey { - pub fn sha256(&self) -> Self { + pub(crate) fn sha256(&self) -> Self { super::sha256([self.as_ref()]).into() } - pub fn as_bytes(&self) -> [u8; SCRAM_KEY_LEN] { + pub(crate) fn as_bytes(&self) -> [u8; SCRAM_KEY_LEN] { self.bytes } } diff --git a/proxy/src/scram/messages.rs b/proxy/src/scram/messages.rs index 5ecbbf7004..fd9e77764c 100644 --- a/proxy/src/scram/messages.rs +++ b/proxy/src/scram/messages.rs @@ -8,7 +8,7 @@ use std::fmt; use std::ops::Range; /// Faithfully taken from PostgreSQL. -pub const SCRAM_RAW_NONCE_LEN: usize = 18; +pub(crate) const SCRAM_RAW_NONCE_LEN: usize = 18; /// Although we ignore all extensions, we still have to validate the message. fn validate_sasl_extensions<'a>(parts: impl Iterator) -> Option<()> { @@ -27,18 +27,18 @@ fn validate_sasl_extensions<'a>(parts: impl Iterator) -> Option< } #[derive(Debug)] -pub struct ClientFirstMessage<'a> { +pub(crate) struct ClientFirstMessage<'a> { /// `client-first-message-bare`. - pub bare: &'a str, + pub(crate) bare: &'a str, /// Channel binding mode. 
- pub cbind_flag: ChannelBinding<&'a str>, + pub(crate) cbind_flag: ChannelBinding<&'a str>, /// Client nonce. - pub nonce: &'a str, + pub(crate) nonce: &'a str, } impl<'a> ClientFirstMessage<'a> { // NB: FromStr doesn't work with lifetimes - pub fn parse(input: &'a str) -> Option { + pub(crate) fn parse(input: &'a str) -> Option { let mut parts = input.split(','); let cbind_flag = ChannelBinding::parse(parts.next()?)?; @@ -77,7 +77,7 @@ impl<'a> ClientFirstMessage<'a> { } /// Build a response to [`ClientFirstMessage`]. - pub fn build_server_first_message( + pub(crate) fn build_server_first_message( &self, nonce: &[u8; SCRAM_RAW_NONCE_LEN], salt_base64: &str, @@ -89,7 +89,7 @@ impl<'a> ClientFirstMessage<'a> { write!(&mut message, "r={}", self.nonce).unwrap(); base64::encode_config_buf(nonce, base64::STANDARD, &mut message); let combined_nonce = 2..message.len(); - write!(&mut message, ",s={},i={}", salt_base64, iterations).unwrap(); + write!(&mut message, ",s={salt_base64},i={iterations}").unwrap(); // This design guarantees that it's impossible to create a // server-first-message without receiving a client-first-message @@ -101,20 +101,20 @@ impl<'a> ClientFirstMessage<'a> { } #[derive(Debug)] -pub struct ClientFinalMessage<'a> { +pub(crate) struct ClientFinalMessage<'a> { /// `client-final-message-without-proof`. - pub without_proof: &'a str, + pub(crate) without_proof: &'a str, /// Channel binding data (base64). - pub channel_binding: &'a str, + pub(crate) channel_binding: &'a str, /// Combined client & server nonce. - pub nonce: &'a str, + pub(crate) nonce: &'a str, /// Client auth proof. 
- pub proof: [u8; SCRAM_KEY_LEN], + pub(crate) proof: [u8; SCRAM_KEY_LEN], } impl<'a> ClientFinalMessage<'a> { // NB: FromStr doesn't work with lifetimes - pub fn parse(input: &'a str) -> Option { + pub(crate) fn parse(input: &'a str) -> Option { let (without_proof, proof) = input.rsplit_once(',')?; let mut parts = without_proof.split(','); @@ -135,7 +135,7 @@ impl<'a> ClientFinalMessage<'a> { } /// Build a response to [`ClientFinalMessage`]. - pub fn build_server_final_message( + pub(crate) fn build_server_final_message( &self, signature_builder: SignatureBuilder<'_>, server_key: &ScramKey, @@ -153,7 +153,7 @@ impl<'a> ClientFinalMessage<'a> { /// We need to keep a convenient representation of this /// message for the next authentication step. -pub struct OwnedServerFirstMessage { +pub(crate) struct OwnedServerFirstMessage { /// Owned `server-first-message`. message: String, /// Slice into `message`. @@ -163,13 +163,13 @@ pub struct OwnedServerFirstMessage { impl OwnedServerFirstMessage { /// Extract combined nonce from the message. #[inline(always)] - pub fn nonce(&self) -> &str { + pub(crate) fn nonce(&self) -> &str { &self.message[self.nonce.clone()] } /// Get reference to a text representation of the message. 
#[inline(always)] - pub fn as_str(&self) -> &str { + pub(crate) fn as_str(&self) -> &str { &self.message } } diff --git a/proxy/src/scram/pbkdf2.rs b/proxy/src/scram/pbkdf2.rs index f690cc7738..4cf76c8452 100644 --- a/proxy/src/scram/pbkdf2.rs +++ b/proxy/src/scram/pbkdf2.rs @@ -4,7 +4,7 @@ use hmac::{ }; use sha2::Sha256; -pub struct Pbkdf2 { +pub(crate) struct Pbkdf2 { hmac: Hmac, prev: GenericArray, hi: GenericArray, @@ -13,7 +13,7 @@ pub struct Pbkdf2 { // inspired from impl Pbkdf2 { - pub fn start(str: &[u8], salt: &[u8], iterations: u32) -> Self { + pub(crate) fn start(str: &[u8], salt: &[u8], iterations: u32) -> Self { let hmac = Hmac::::new_from_slice(str).expect("HMAC is able to accept all key sizes"); @@ -33,11 +33,11 @@ impl Pbkdf2 { } } - pub fn cost(&self) -> u32 { + pub(crate) fn cost(&self) -> u32 { (self.iterations).clamp(0, 4096) } - pub fn turn(&mut self) -> std::task::Poll<[u8; 32]> { + pub(crate) fn turn(&mut self) -> std::task::Poll<[u8; 32]> { let Self { hmac, prev, @@ -75,7 +75,7 @@ mod tests { let salt = b"sodium chloride"; let pass = b"Ne0n_!5_50_C007"; - let mut job = Pbkdf2::start(pass, salt, 600000); + let mut job = Pbkdf2::start(pass, salt, 60000); let hash = loop { let std::task::Poll::Ready(hash) = job.turn() else { continue; @@ -83,7 +83,7 @@ mod tests { break hash; }; - let expected = pbkdf2_hmac_array::(pass, salt, 600000); + let expected = pbkdf2_hmac_array::(pass, salt, 60000); assert_eq!(hash, expected); } } diff --git a/proxy/src/scram/secret.rs b/proxy/src/scram/secret.rs index 44c4f9e44a..8c6a08d432 100644 --- a/proxy/src/scram/secret.rs +++ b/proxy/src/scram/secret.rs @@ -8,22 +8,22 @@ use super::key::ScramKey; /// Server secret is produced from user's password, /// and is used throughout the authentication process. #[derive(Clone, Eq, PartialEq, Debug)] -pub struct ServerSecret { +pub(crate) struct ServerSecret { /// Number of iterations for `PBKDF2` function. 
- pub iterations: u32, + pub(crate) iterations: u32, /// Salt used to hash user's password. - pub salt_base64: String, + pub(crate) salt_base64: String, /// Hashed `ClientKey`. - pub stored_key: ScramKey, + pub(crate) stored_key: ScramKey, /// Used by client to verify server's signature. - pub server_key: ScramKey, + pub(crate) server_key: ScramKey, /// Should auth fail no matter what? /// This is exactly the case for mocked secrets. - pub doomed: bool, + pub(crate) doomed: bool, } impl ServerSecret { - pub fn parse(input: &str) -> Option { + pub(crate) fn parse(input: &str) -> Option { // SCRAM-SHA-256$:$: let s = input.strip_prefix("SCRAM-SHA-256$")?; let (params, keys) = s.split_once('$')?; @@ -42,7 +42,7 @@ impl ServerSecret { Some(secret) } - pub fn is_password_invalid(&self, client_key: &ScramKey) -> Choice { + pub(crate) fn is_password_invalid(&self, client_key: &ScramKey) -> Choice { // constant time to not leak partial key match client_key.sha256().ct_ne(&self.stored_key) | Choice::from(self.doomed as u8) } @@ -50,7 +50,7 @@ impl ServerSecret { /// To avoid revealing information to an attacker, we use a /// mocked server secret even if the user doesn't exist. /// See `auth-scram.c : mock_scram_secret` for details. - pub fn mock(nonce: [u8; 32]) -> Self { + pub(crate) fn mock(nonce: [u8; 32]) -> Self { Self { // this doesn't reveal much information as we're going to use // iteration count 1 for our generated passwords going forward. @@ -66,7 +66,7 @@ impl ServerSecret { /// Build a new server secret from the prerequisites. /// XXX: We only use this function in tests. 
#[cfg(test)] - pub async fn build(password: &str) -> Option { + pub(crate) async fn build(password: &str) -> Option { Self::parse(&postgres_protocol::password::scram_sha_256(password.as_bytes()).await) } } @@ -82,13 +82,7 @@ mod tests { let stored_key = "D5h6KTMBlUvDJk2Y8ELfC1Sjtc6k9YHjRyuRZyBNJns="; let server_key = "Pi3QHbcluX//NDfVkKlFl88GGzlJ5LkyPwcdlN/QBvI="; - let secret = format!( - "SCRAM-SHA-256${iterations}:{salt}${stored_key}:{server_key}", - iterations = iterations, - salt = salt, - stored_key = stored_key, - server_key = server_key, - ); + let secret = format!("SCRAM-SHA-256${iterations}:{salt}${stored_key}:{server_key}"); let parsed = ServerSecret::parse(&secret).unwrap(); assert_eq!(parsed.iterations, iterations); diff --git a/proxy/src/scram/signature.rs b/proxy/src/scram/signature.rs index 1c2811d757..d3255cf2ca 100644 --- a/proxy/src/scram/signature.rs +++ b/proxy/src/scram/signature.rs @@ -4,14 +4,14 @@ use super::key::{ScramKey, SCRAM_KEY_LEN}; /// A collection of message parts needed to derive the client's signature. #[derive(Debug)] -pub struct SignatureBuilder<'a> { - pub client_first_message_bare: &'a str, - pub server_first_message: &'a str, - pub client_final_message_without_proof: &'a str, +pub(crate) struct SignatureBuilder<'a> { + pub(crate) client_first_message_bare: &'a str, + pub(crate) server_first_message: &'a str, + pub(crate) client_final_message_without_proof: &'a str, } impl SignatureBuilder<'_> { - pub fn build(&self, key: &ScramKey) -> Signature { + pub(crate) fn build(&self, key: &ScramKey) -> Signature { let parts = [ self.client_first_message_bare.as_bytes(), b",", @@ -28,13 +28,13 @@ impl SignatureBuilder<'_> { /// produces `ClientKey` that we need for authentication. #[derive(Debug)] #[repr(transparent)] -pub struct Signature { +pub(crate) struct Signature { bytes: [u8; SCRAM_KEY_LEN], } impl Signature { /// Derive `ClientKey` from client's signature and proof. 
- pub fn derive_client_key(&self, proof: &[u8; SCRAM_KEY_LEN]) -> ScramKey { + pub(crate) fn derive_client_key(&self, proof: &[u8; SCRAM_KEY_LEN]) -> ScramKey { // This is how the proof is calculated: // // 1. sha256(ClientKey) -> StoredKey diff --git a/proxy/src/scram/threadpool.rs b/proxy/src/scram/threadpool.rs index fa3d3ccca2..2702aeebfe 100644 --- a/proxy/src/scram/threadpool.rs +++ b/proxy/src/scram/threadpool.rs @@ -4,17 +4,20 @@ //! 1. Fairness per endpoint. //! 2. Yield support for high iteration counts. -use std::sync::{ - atomic::{AtomicU64, Ordering}, - Arc, +use std::{ + cell::RefCell, + future::Future, + pin::Pin, + sync::{ + atomic::{AtomicUsize, Ordering}, + Arc, Weak, + }, + task::{Context, Poll}, }; -use crossbeam_deque::{Injector, Stealer, Worker}; -use itertools::Itertools; -use parking_lot::{Condvar, Mutex}; +use futures::FutureExt; use rand::Rng; use rand::{rngs::SmallRng, SeedableRng}; -use tokio::sync::oneshot; use crate::{ intern::EndpointIdInt, @@ -25,274 +28,164 @@ use crate::{ use super::pbkdf2::Pbkdf2; pub struct ThreadPool { - queue: Injector, - stealers: Vec>, - parkers: Vec<(Condvar, Mutex)>, - /// bitpacked representation. - /// lower 8 bits = number of sleeping threads - /// next 8 bits = number of idle threads (searching for work) - counters: AtomicU64, - + runtime: Option, pub metrics: Arc, } -#[derive(PartialEq)] -enum ThreadState { - Parked, - Active, +/// How often to reset the sketch values +const SKETCH_RESET_INTERVAL: u64 = 1021; + +thread_local! { + static STATE: RefCell> = const { RefCell::new(None) }; } impl ThreadPool { pub fn new(n_workers: u8) -> Arc { - let workers = (0..n_workers).map(|_| Worker::new_fifo()).collect_vec(); - let stealers = workers.iter().map(|w| w.stealer()).collect_vec(); + // rayon would be nice here, but yielding in rayon does not work well afaict. 
- let parkers = (0..n_workers) - .map(|_| (Condvar::new(), Mutex::new(ThreadState::Active))) - .collect_vec(); + Arc::new_cyclic(|pool| { + let pool = pool.clone(); + let worker_id = AtomicUsize::new(0); - let pool = Arc::new(Self { - queue: Injector::new(), - stealers, - parkers, - // threads start searching for work - counters: AtomicU64::new((n_workers as u64) << 8), - metrics: Arc::new(ThreadPoolMetrics::new(n_workers as usize)), - }); + let runtime = tokio::runtime::Builder::new_multi_thread() + .worker_threads(n_workers as usize) + .on_thread_start(move || { + STATE.with_borrow_mut(|state| { + *state = Some(ThreadRt { + pool: pool.clone(), + id: ThreadPoolWorkerId(worker_id.fetch_add(1, Ordering::Relaxed)), + rng: SmallRng::from_entropy(), + // used to determine whether we should temporarily skip tasks for fairness. + // 99% of estimates will overcount by no more than 4096 samples + countmin: CountMinSketch::with_params( + 1.0 / (SKETCH_RESET_INTERVAL as f64), + 0.01, + ), + tick: 0, + }); + }); + }) + .build() + .unwrap(); - for (i, worker) in workers.into_iter().enumerate() { - let pool = Arc::clone(&pool); - std::thread::spawn(move || thread_rt(pool, worker, i)); - } - - pool + Self { + runtime: Some(runtime), + metrics: Arc::new(ThreadPoolMetrics::new(n_workers as usize)), + } + }) } - pub fn spawn_job( - &self, - endpoint: EndpointIdInt, - pbkdf2: Pbkdf2, - ) -> oneshot::Receiver<[u8; 32]> { - let (tx, rx) = oneshot::channel(); - - let queue_was_empty = self.queue.is_empty(); - - self.metrics.injector_queue_depth.inc(); - self.queue.push(JobSpec { - response: tx, - pbkdf2, - endpoint, - }); - - // inspired from - let counts = self.counters.load(Ordering::SeqCst); - let num_awake_but_idle = (counts >> 8) & 0xff; - let num_sleepers = counts & 0xff; - - // If the queue is non-empty, then we always wake up a worker - // -- clearly the existing idle jobs aren't enough. Otherwise, - // check to see if we have enough idle workers. 
- if !queue_was_empty || num_awake_but_idle == 0 { - let num_to_wake = Ord::min(1, num_sleepers); - self.wake_any_threads(num_to_wake); - } - - rx - } - - #[cold] - fn wake_any_threads(&self, mut num_to_wake: u64) { - if num_to_wake > 0 { - for i in 0..self.parkers.len() { - if self.wake_specific_thread(i) { - num_to_wake -= 1; - if num_to_wake == 0 { - return; - } - } - } - } - } - - fn wake_specific_thread(&self, index: usize) -> bool { - let (condvar, lock) = &self.parkers[index]; - - let mut state = lock.lock(); - if *state == ThreadState::Parked { - condvar.notify_one(); - - // When the thread went to sleep, it will have incremented - // this value. When we wake it, its our job to decrement - // it. We could have the thread do it, but that would - // introduce a delay between when the thread was - // *notified* and when this counter was decremented. That - // might mislead people with new work into thinking that - // there are sleeping threads that they should try to - // wake, when in fact there is nothing left for them to - // do. 
- self.counters.fetch_sub(1, Ordering::SeqCst); - *state = ThreadState::Active; - - true - } else { - false - } - } - - fn steal(&self, rng: &mut impl Rng, skip: usize, worker: &Worker) -> Option { - // announce thread as idle - self.counters.fetch_add(256, Ordering::SeqCst); - - // try steal from the global queue - loop { - match self.queue.steal_batch_and_pop(worker) { - crossbeam_deque::Steal::Success(job) => { - self.metrics - .injector_queue_depth - .set(self.queue.len() as i64); - // no longer idle - self.counters.fetch_sub(256, Ordering::SeqCst); - return Some(job); - } - crossbeam_deque::Steal::Retry => continue, - crossbeam_deque::Steal::Empty => break, - } - } - - // try steal from our neighbours - loop { - let mut retry = false; - let start = rng.gen_range(0..self.stealers.len()); - let job = (start..self.stealers.len()) - .chain(0..start) - .filter(|i| *i != skip) - .find_map( - |victim| match self.stealers[victim].steal_batch_and_pop(worker) { - crossbeam_deque::Steal::Success(job) => Some(job), - crossbeam_deque::Steal::Empty => None, - crossbeam_deque::Steal::Retry => { - retry = true; - None - } - }, - ); - if job.is_some() { - // no longer idle - self.counters.fetch_sub(256, Ordering::SeqCst); - return job; - } - if !retry { - return None; - } - } + pub(crate) fn spawn_job(&self, endpoint: EndpointIdInt, pbkdf2: Pbkdf2) -> JobHandle { + JobHandle( + self.runtime + .as_ref() + .unwrap() + .spawn(JobSpec { pbkdf2, endpoint }), + ) } } -fn thread_rt(pool: Arc, worker: Worker, index: usize) { - /// interval when we should steal from the global queue - /// so that tail latencies are managed appropriately - const STEAL_INTERVAL: usize = 61; +impl Drop for ThreadPool { + fn drop(&mut self) { + self.runtime.take().unwrap().shutdown_background(); + } +} - /// How often to reset the sketch values - const SKETCH_RESET_INTERVAL: usize = 1021; +struct ThreadRt { + pool: Weak, + id: ThreadPoolWorkerId, + rng: SmallRng, + countmin: CountMinSketch, + tick: u64, +} 
- let mut rng = SmallRng::from_entropy(); +impl ThreadRt { + fn should_run(&mut self, job: &JobSpec) -> bool { + let rate = self + .countmin + .inc_and_return(&job.endpoint, job.pbkdf2.cost()); - // used to determine whether we should temporarily skip tasks for fairness. - // 99% of estimates will overcount by no more than 4096 samples - let mut sketch = CountMinSketch::with_params(1.0 / (SKETCH_RESET_INTERVAL as f64), 0.01); - - let (condvar, lock) = &pool.parkers[index]; - - 'wait: loop { - // wait for notification of work - { - let mut lock = lock.lock(); - - // queue is empty - pool.metrics - .worker_queue_depth - .set(ThreadPoolWorkerId(index), 0); - - // subtract 1 from idle count, add 1 to sleeping count. - pool.counters.fetch_sub(255, Ordering::SeqCst); - - *lock = ThreadState::Parked; - condvar.wait(&mut lock); - } - - for i in 0.. { - let mut job = match worker - .pop() - .or_else(|| pool.steal(&mut rng, index, &worker)) - { - Some(job) => job, - None => continue 'wait, - }; - - pool.metrics - .worker_queue_depth - .set(ThreadPoolWorkerId(index), worker.len() as i64); - - // receiver is closed, cancel the task - if !job.response.is_closed() { - let rate = sketch.inc_and_return(&job.endpoint, job.pbkdf2.cost()); - - const P: f64 = 2000.0; - // probability decreases as rate increases. - // lower probability, higher chance of being skipped - // - // estimates (rate in terms of 4096 rounds): - // rate = 0 => probability = 100% - // rate = 10 => probability = 71.3% - // rate = 50 => probability = 62.1% - // rate = 500 => probability = 52.3% - // rate = 1021 => probability = 49.8% - // - // My expectation is that the pool queue will only begin backing up at ~1000rps - // in which case the SKETCH_RESET_INTERVAL represents 1 second. Thus, the rates above - // are in requests per second. 
- let probability = P.ln() / (P + rate as f64).ln(); - if pool.queue.len() > 32 || rng.gen_bool(probability) { - pool.metrics - .worker_task_turns_total - .inc(ThreadPoolWorkerId(index)); - - match job.pbkdf2.turn() { - std::task::Poll::Ready(result) => { - let _ = job.response.send(result); - } - std::task::Poll::Pending => worker.push(job), - } - } else { - pool.metrics - .worker_task_skips_total - .inc(ThreadPoolWorkerId(index)); - - // skip for now - worker.push(job); - } - } - - // if we get stuck with a few long lived jobs in the queue - // it's better to try and steal from the queue too for fairness - if i % STEAL_INTERVAL == 0 { - let _ = pool.queue.steal_batch(&worker); - } - - if i % SKETCH_RESET_INTERVAL == 0 { - sketch.reset(); - } - } + const P: f64 = 2000.0; + // probability decreases as rate increases. + // lower probability, higher chance of being skipped + // + // estimates (rate in terms of 4096 rounds): + // rate = 0 => probability = 100% + // rate = 10 => probability = 71.3% + // rate = 50 => probability = 62.1% + // rate = 500 => probability = 52.3% + // rate = 1021 => probability = 49.8% + // + // My expectation is that the pool queue will only begin backing up at ~1000rps + // in which case the SKETCH_RESET_INTERVAL represents 1 second. Thus, the rates above + // are in requests per second. 
+ let probability = P.ln() / (P + rate as f64).ln(); + self.rng.gen_bool(probability) } } struct JobSpec { - response: oneshot::Sender<[u8; 32]>, pbkdf2: Pbkdf2, endpoint: EndpointIdInt, } +impl Future for JobSpec { + type Output = [u8; 32]; + + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + STATE.with_borrow_mut(|state| { + let state = state.as_mut().expect("should be set on thread startup"); + + state.tick = state.tick.wrapping_add(1); + if state.tick % SKETCH_RESET_INTERVAL == 0 { + state.countmin.reset(); + } + + if state.should_run(&self) { + if let Some(pool) = state.pool.upgrade() { + pool.metrics.worker_task_turns_total.inc(state.id); + } + + match self.pbkdf2.turn() { + Poll::Ready(result) => Poll::Ready(result), + // more to do, we shall requeue + Poll::Pending => { + cx.waker().wake_by_ref(); + Poll::Pending + } + } + } else { + if let Some(pool) = state.pool.upgrade() { + pool.metrics.worker_task_skips_total.inc(state.id); + } + + cx.waker().wake_by_ref(); + Poll::Pending + } + }) + } +} + +pub(crate) struct JobHandle(tokio::task::JoinHandle<[u8; 32]>); + +impl Future for JobHandle { + type Output = [u8; 32]; + + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + match self.0.poll_unpin(cx) { + Poll::Ready(Ok(ok)) => Poll::Ready(ok), + Poll::Ready(Err(err)) => std::panic::resume_unwind(err.into_panic()), + Poll::Pending => Poll::Pending, + } + } +} + +impl Drop for JobHandle { + fn drop(&mut self) { + self.0.abort(); + } +} + #[cfg(test)] mod tests { use crate::EndpointId; @@ -309,8 +202,7 @@ mod tests { let salt = [0x55; 32]; let actual = pool .spawn_job(ep, Pbkdf2::start(b"password", &salt, 4096)) - .await - .unwrap(); + .await; let expected = [ 10, 114, 73, 188, 140, 222, 196, 156, 214, 184, 79, 157, 119, 242, 16, 31, 53, 242, diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs index b2bf93dc6d..84f98cb8ad 100644 --- a/proxy/src/serverless.rs +++ b/proxy/src/serverless.rs @@ -25,8 +25,6 @@ use 
hyper_util::rt::TokioExecutor; use hyper_util::server::conn::auto::Builder; use rand::rngs::StdRng; use rand::SeedableRng; -pub use reqwest_middleware::{ClientWithMiddleware, Error}; -pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware}; use tokio::io::{AsyncRead, AsyncWrite}; use tokio::time::timeout; use tokio_rustls::TlsAcceptor; @@ -50,7 +48,7 @@ use tokio_util::sync::CancellationToken; use tracing::{error, info, warn, Instrument}; use utils::http::error::ApiError; -pub const SERVERLESS_DRIVER_SNI: &str = "api"; +pub(crate) const SERVERLESS_DRIVER_SNI: &str = "api"; pub async fn task_main( config: &'static ProxyConfig, @@ -93,11 +91,11 @@ pub async fn task_main( let mut tls_server_config = rustls::ServerConfig::clone(&config.to_server_config()); // prefer http2, but support http/1.1 tls_server_config.alpn_protocols = vec![b"h2".to_vec(), b"http/1.1".to_vec()]; - Arc::new(tls_server_config) as Arc<_> + Arc::new(tls_server_config) } None => { warn!("TLS config is missing"); - Arc::new(NoTls) as Arc<_> + Arc::new(NoTls) } }; @@ -178,9 +176,9 @@ pub async fn task_main( Ok(()) } -pub trait AsyncReadWrite: AsyncRead + AsyncWrite + Send + 'static {} +pub(crate) trait AsyncReadWrite: AsyncRead + AsyncWrite + Send + 'static {} impl AsyncReadWrite for T {} -pub type AsyncRW = Pin>; +pub(crate) type AsyncRW = Pin>; #[async_trait] trait MaybeTlsAcceptor: Send + Sync + 'static { @@ -407,7 +405,7 @@ async fn request_handler( .header("Access-Control-Allow-Origin", "*") .header( "Access-Control-Allow-Headers", - "Neon-Connection-String, Neon-Raw-Text-Output, Neon-Array-Mode, Neon-Pool-Opt-In, Neon-Batch-Read-Only, Neon-Batch-Isolation-Level", + "Authorization, Neon-Connection-String, Neon-Raw-Text-Output, Neon-Array-Mode, Neon-Pool-Opt-In, Neon-Batch-Read-Only, Neon-Batch-Isolation-Level", ) .header("Access-Control-Max-Age", "86400" /* 24 hours */) .status(StatusCode::OK) // 204 is also valid, but see: 
https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods/OPTIONS#status_code diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 295ea1a1c7..d163878528 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -4,7 +4,10 @@ use async_trait::async_trait; use tracing::{field::display, info}; use crate::{ - auth::{backend::ComputeCredentials, check_peer_addr_is_in_list, AuthError}, + auth::{ + backend::{local::StaticAuthRules, ComputeCredentials, ComputeUserInfo}, + check_peer_addr_is_in_list, AuthError, + }, compute, config::{AuthenticationConfig, ProxyConfig}, console::{ @@ -24,30 +27,37 @@ use crate::{ Host, }; -use super::conn_pool::{poll_client, Client, ConnInfo, GlobalConnPool}; +use super::conn_pool::{poll_client, AuthData, Client, ConnInfo, GlobalConnPool}; -pub struct PoolingBackend { - pub pool: Arc>, - pub config: &'static ProxyConfig, - pub endpoint_rate_limiter: Arc, +pub(crate) struct PoolingBackend { + pub(crate) pool: Arc>, + pub(crate) config: &'static ProxyConfig, + pub(crate) endpoint_rate_limiter: Arc, } impl PoolingBackend { - pub async fn authenticate( + pub(crate) async fn authenticate_with_password( &self, ctx: &RequestMonitoring, config: &AuthenticationConfig, - conn_info: &ConnInfo, + user_info: &ComputeUserInfo, + password: &[u8], ) -> Result { - let user_info = conn_info.user_info.clone(); - let backend = self.config.auth_backend.as_ref().map(|_| user_info.clone()); + let user_info = user_info.clone(); + let backend = self + .config + .auth_backend + .as_ref() + .map(|()| user_info.clone()); let (allowed_ips, maybe_secret) = backend.get_allowed_ips_and_secret(ctx).await?; - if !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips) { + if config.ip_allowlist_check_enabled + && !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips) + { return Err(AuthError::ip_address_not_allowed(ctx.peer_addr())); } if !self .endpoint_rate_limiter - 
.check(conn_info.user_info.endpoint.clone().into(), 1) + .check(user_info.endpoint.clone().into(), 1) { return Err(AuthError::too_many_connections()); } @@ -70,14 +80,10 @@ impl PoolingBackend { return Err(AuthError::auth_failed(&*user_info.user)); } }; - let ep = EndpointIdInt::from(&conn_info.user_info.endpoint); - let auth_outcome = crate::auth::validate_password_and_exchange( - &config.thread_pool, - ep, - &conn_info.password, - secret, - ) - .await?; + let ep = EndpointIdInt::from(&user_info.endpoint); + let auth_outcome = + crate::auth::validate_password_and_exchange(&config.thread_pool, ep, password, secret) + .await?; let res = match auth_outcome { crate::sasl::Outcome::Success(key) => { info!("user successfully authenticated"); @@ -85,7 +91,7 @@ impl PoolingBackend { } crate::sasl::Outcome::Failure(reason) => { info!("auth backend failed with an error: {reason}"); - Err(AuthError::auth_failed(&*conn_info.user_info.user)) + Err(AuthError::auth_failed(&*user_info.user)) } }; res.map(|key| ComputeCredentials { @@ -94,23 +100,56 @@ impl PoolingBackend { }) } + pub(crate) async fn authenticate_with_jwt( + &self, + ctx: &RequestMonitoring, + user_info: &ComputeUserInfo, + jwt: &str, + ) -> Result { + match &self.config.auth_backend { + crate::auth::Backend::Console(_, ()) => { + Err(AuthError::auth_failed("JWT login is not yet supported")) + } + crate::auth::Backend::Web(_, ()) => Err(AuthError::auth_failed( + "JWT login over web auth proxy is not supported", + )), + crate::auth::Backend::Local(cache) => { + cache + .jwks_cache + .check_jwt( + ctx, + user_info.endpoint.clone(), + user_info.user.clone(), + &StaticAuthRules, + jwt, + ) + .await + .map_err(|e| AuthError::auth_failed(e.to_string()))?; + Ok(ComputeCredentials { + info: user_info.clone(), + keys: crate::auth::backend::ComputeCredentialKeys::None, + }) + } + } + } + // Wake up the destination if needed. 
Code here is a bit involved because // we reuse the code from the usual proxy and we need to prepare few structures // that this code expects. #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)] - pub async fn connect_to_compute( + pub(crate) async fn connect_to_compute( &self, ctx: &RequestMonitoring, conn_info: ConnInfo, keys: ComputeCredentials, force_new: bool, ) -> Result, HttpConnError> { - let maybe_client = if !force_new { - info!("pool: looking for an existing connection"); - self.pool.get(ctx, &conn_info)? - } else { + let maybe_client = if force_new { info!("pool: pool is disabled"); None + } else { + info!("pool: looking for an existing connection"); + self.pool.get(ctx, &conn_info)? }; if let Some(client) = maybe_client { @@ -119,7 +158,7 @@ impl PoolingBackend { let conn_id = uuid::Uuid::new_v4(); tracing::Span::current().record("conn_id", display(conn_id)); info!(%conn_id, "pool: opening a new connection '{conn_info}'"); - let backend = self.config.auth_backend.as_ref().map(|_| keys); + let backend = self.config.auth_backend.as_ref().map(|()| keys); crate::proxy::connect_compute::connect_to_compute( ctx, &TokioMechanism { @@ -138,7 +177,7 @@ impl PoolingBackend { } #[derive(Debug, thiserror::Error)] -pub enum HttpConnError { +pub(crate) enum HttpConnError { #[error("pooled connection closed at inconsistent state")] ConnectionClosedAbruptly(#[from] tokio::sync::watch::error::SendError), #[error("could not connection to compute")] @@ -232,10 +271,16 @@ impl ConnectMechanism for TokioMechanism { let mut config = (*node_info.config).clone(); let config = config .user(&self.conn_info.user_info.user) - .password(&*self.conn_info.password) .dbname(&self.conn_info.dbname) .connect_timeout(timeout); + match &self.conn_info.auth { + AuthData::Jwt(_) => {} + AuthData::Password(pw) => { + config.password(pw); + } + } + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); let res = config.connect(tokio_postgres::NoTls).await; 
drop(pause); diff --git a/proxy/src/serverless/cancel_set.rs b/proxy/src/serverless/cancel_set.rs index 390df7f4f7..7659745473 100644 --- a/proxy/src/serverless/cancel_set.rs +++ b/proxy/src/serverless/cancel_set.rs @@ -22,7 +22,7 @@ pub struct CancelSet { hasher: Hasher, } -pub struct CancelShard { +pub(crate) struct CancelShard { tokens: IndexMap, } @@ -40,7 +40,7 @@ impl CancelSet { } } - pub fn take(&self) -> Option { + pub(crate) fn take(&self) -> Option { for _ in 0..4 { if let Some(token) = self.take_raw(thread_rng().gen()) { return Some(token); @@ -50,12 +50,12 @@ impl CancelSet { None } - pub fn take_raw(&self, rng: usize) -> Option { + pub(crate) fn take_raw(&self, rng: usize) -> Option { NonZeroUsize::new(self.shards.len()) .and_then(|len| self.shards[rng % len].lock().take(rng / len)) } - pub fn insert(&self, id: uuid::Uuid, token: CancellationToken) -> CancelGuard<'_> { + pub(crate) fn insert(&self, id: uuid::Uuid, token: CancellationToken) -> CancelGuard<'_> { let shard = NonZeroUsize::new(self.shards.len()).map(|len| { let hash = self.hasher.hash_one(id) as usize; let shard = &self.shards[hash % len]; @@ -88,7 +88,7 @@ impl CancelShard { } } -pub struct CancelGuard<'a> { +pub(crate) struct CancelGuard<'a> { shard: Option<&'a Mutex>, id: Uuid, } diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index 3478787995..6c32d5df0e 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -30,19 +30,25 @@ use tracing::{info, info_span, Instrument}; use super::backend::HttpConnError; #[derive(Debug, Clone)] -pub struct ConnInfo { - pub user_info: ComputeUserInfo, - pub dbname: DbName, - pub password: SmallVec<[u8; 16]>, +pub(crate) struct ConnInfo { + pub(crate) user_info: ComputeUserInfo, + pub(crate) dbname: DbName, + pub(crate) auth: AuthData, +} + +#[derive(Debug, Clone)] +pub(crate) enum AuthData { + Password(SmallVec<[u8; 16]>), + Jwt(String), } impl ConnInfo { // hm, change to hasher to 
avoid cloning? - pub fn db_and_user(&self) -> (DbName, RoleName) { + pub(crate) fn db_and_user(&self) -> (DbName, RoleName) { (self.dbname.clone(), self.user_info.user.clone()) } - pub fn endpoint_cache_key(&self) -> Option { + pub(crate) fn endpoint_cache_key(&self) -> Option { // We don't want to cache http connections for ephemeral endpoints. if self.user_info.options.is_ephemeral() { None @@ -73,7 +79,7 @@ struct ConnPoolEntry { // Per-endpoint connection pool, (dbname, username) -> DbUserConnPool // Number of open connections is limited by the `max_conns_per_endpoint`. -pub struct EndpointConnPool { +pub(crate) struct EndpointConnPool { pools: HashMap<(DbName, RoleName), DbUserConnPool>, total_conns: usize, max_conns: usize, @@ -192,7 +198,7 @@ impl Drop for EndpointConnPool { } } -pub struct DbUserConnPool { +pub(crate) struct DbUserConnPool { conns: Vec>, } @@ -235,7 +241,7 @@ impl DbUserConnPool { } } -pub struct GlobalConnPool { +pub(crate) struct GlobalConnPool { // endpoint -> per-endpoint connection pool // // That should be a fairly conteded map, so return reference to the per-endpoint @@ -276,7 +282,7 @@ pub struct GlobalConnPoolOptions { } impl GlobalConnPool { - pub fn new(config: &'static crate::config::HttpConfig) -> Arc { + pub(crate) fn new(config: &'static crate::config::HttpConfig) -> Arc { let shards = config.pool_options.pool_shards; Arc::new(Self { global_pool: DashMap::with_shard_amount(shards), @@ -287,21 +293,21 @@ impl GlobalConnPool { } #[cfg(test)] - pub fn get_global_connections_count(&self) -> usize { + pub(crate) fn get_global_connections_count(&self) -> usize { self.global_connections_count .load(atomic::Ordering::Relaxed) } - pub fn get_idle_timeout(&self) -> Duration { + pub(crate) fn get_idle_timeout(&self) -> Duration { self.config.pool_options.idle_timeout } - pub fn shutdown(&self) { + pub(crate) fn shutdown(&self) { // drops all strong references to endpoint-pools self.global_pool.clear(); } - pub async fn gc_worker(&self, 
mut rng: impl Rng) { + pub(crate) async fn gc_worker(&self, mut rng: impl Rng) { let epoch = self.config.pool_options.gc_epoch; let mut interval = tokio::time::interval(epoch / (self.global_pool.shards().len()) as u32); loop { @@ -333,9 +339,9 @@ impl GlobalConnPool { } = pool.get_mut(); // ensure that closed clients are removed - pools.iter_mut().for_each(|(_, db_pool)| { + for db_pool in pools.values_mut() { clients_removed += db_pool.clear_closed_clients(total_conns); - }); + } // we only remove this pool if it has no active connections if *total_conns == 0 { @@ -375,7 +381,7 @@ impl GlobalConnPool { } } - pub fn get( + pub(crate) fn get( self: &Arc, ctx: &RequestMonitoring, conn_info: &ConnInfo, @@ -399,21 +405,20 @@ impl GlobalConnPool { if client.is_closed() { info!("pool: cached connection '{conn_info}' is closed, opening a new one"); return Ok(None); - } else { - tracing::Span::current().record("conn_id", tracing::field::display(client.conn_id)); - tracing::Span::current().record( - "pid", - tracing::field::display(client.inner.get_process_id()), - ); - info!( - cold_start_info = ColdStartInfo::HttpPoolHit.as_str(), - "pool: reusing connection '{conn_info}'" - ); - client.session.send(ctx.session_id())?; - ctx.set_cold_start_info(ColdStartInfo::HttpPoolHit); - ctx.success(); - return Ok(Some(Client::new(client, conn_info.clone(), endpoint_pool))); } + tracing::Span::current().record("conn_id", tracing::field::display(client.conn_id)); + tracing::Span::current().record( + "pid", + tracing::field::display(client.inner.get_process_id()), + ); + info!( + cold_start_info = ColdStartInfo::HttpPoolHit.as_str(), + "pool: reusing connection '{conn_info}'" + ); + client.session.send(ctx.session_id())?; + ctx.set_cold_start_info(ColdStartInfo::HttpPoolHit); + ctx.success(); + return Ok(Some(Client::new(client, conn_info.clone(), endpoint_pool))); } Ok(None) } @@ -463,7 +468,7 @@ impl GlobalConnPool { } } -pub fn poll_client( +pub(crate) fn poll_client( global_pool: 
Arc>, ctx: &RequestMonitoring, conn_info: ConnInfo, @@ -591,7 +596,7 @@ impl Drop for ClientInner { } } -pub trait ClientInnerExt: Sync + Send + 'static { +pub(crate) trait ClientInnerExt: Sync + Send + 'static { fn is_closed(&self) -> bool; fn get_process_id(&self) -> i32; } @@ -606,13 +611,13 @@ impl ClientInnerExt for tokio_postgres::Client { } impl ClientInner { - pub fn is_closed(&self) -> bool { + pub(crate) fn is_closed(&self) -> bool { self.inner.is_closed() } } impl Client { - pub fn metrics(&self) -> Arc { + pub(crate) fn metrics(&self) -> Arc { let aux = &self.inner.as_ref().unwrap().aux; USAGE_METRICS.register(Ids { endpoint_id: aux.endpoint_id, @@ -621,14 +626,14 @@ impl Client { } } -pub struct Client { +pub(crate) struct Client { span: Span, inner: Option>, conn_info: ConnInfo, pool: Weak>>, } -pub struct Discard<'a, C: ClientInnerExt> { +pub(crate) struct Discard<'a, C: ClientInnerExt> { conn_info: &'a ConnInfo, pool: &'a mut Weak>>, } @@ -646,7 +651,7 @@ impl Client { pool, } } - pub fn inner(&mut self) -> (&mut C, Discard<'_, C>) { + pub(crate) fn inner(&mut self) -> (&mut C, Discard<'_, C>) { let Self { inner, pool, @@ -654,18 +659,18 @@ impl Client { span: _, } = self; let inner = inner.as_mut().expect("client inner should not be removed"); - (&mut inner.inner, Discard { pool, conn_info }) + (&mut inner.inner, Discard { conn_info, pool }) } } impl Discard<'_, C> { - pub fn check_idle(&mut self, status: ReadyForQueryStatus) { + pub(crate) fn check_idle(&mut self, status: ReadyForQueryStatus) { let conn_info = &self.conn_info; if status != ReadyForQueryStatus::Idle && std::mem::take(self.pool).strong_count() > 0 { info!("pool: throwing away connection '{conn_info}' because connection is not idle"); } } - pub fn discard(&mut self) { + pub(crate) fn discard(&mut self) { let conn_info = &self.conn_info; if std::mem::take(self.pool).strong_count() > 0 { info!("pool: throwing away connection '{conn_info}' because connection is potentially in a broken 
state"); @@ -716,7 +721,9 @@ impl Drop for Client { mod tests { use std::{mem, sync::atomic::AtomicBool}; - use crate::{serverless::cancel_set::CancelSet, BranchId, EndpointId, ProjectId}; + use crate::{ + proxy::NeonOptions, serverless::cancel_set::CancelSet, BranchId, EndpointId, ProjectId, + }; use super::*; @@ -769,16 +776,18 @@ mod tests { }, cancel_set: CancelSet::new(0), client_conn_threshold: u64::MAX, + max_request_size_bytes: u64::MAX, + max_response_size_bytes: usize::MAX, })); let pool = GlobalConnPool::new(config); let conn_info = ConnInfo { user_info: ComputeUserInfo { user: "user".into(), endpoint: "endpoint".into(), - options: Default::default(), + options: NeonOptions::default(), }, dbname: "dbname".into(), - password: "password".as_bytes().into(), + auth: AuthData::Password("password".as_bytes().into()), }; let ep_pool = Arc::downgrade( &pool.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key().unwrap()), @@ -833,10 +842,10 @@ mod tests { user_info: ComputeUserInfo { user: "user".into(), endpoint: "endpoint-2".into(), - options: Default::default(), + options: NeonOptions::default(), }, dbname: "dbname".into(), - password: "password".as_bytes().into(), + auth: AuthData::Password("password".as_bytes().into()), }; let ep_pool = Arc::downgrade( &pool.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key().unwrap()), diff --git a/proxy/src/serverless/http_util.rs b/proxy/src/serverless/http_util.rs index 701ab58f63..abf0ffe290 100644 --- a/proxy/src/serverless/http_util.rs +++ b/proxy/src/serverless/http_util.rs @@ -11,7 +11,7 @@ use serde::Serialize; use utils::http::error::ApiError; /// Like [`ApiError::into_response`] -pub fn api_error_into_response(this: ApiError) -> Response> { +pub(crate) fn api_error_into_response(this: ApiError) -> Response> { match this { ApiError::BadRequest(err) => HttpErrorBody::response_from_msg_and_status( format!("{err:#?}"), // use debug printing so that we give the cause @@ -59,7 +59,7 @@ pub fn 
api_error_into_response(this: ApiError) -> Response> { /// Same as [`utils::http::error::HttpErrorBody`] #[derive(Serialize)] struct HttpErrorBody { - pub msg: String, + pub(crate) msg: String, } impl HttpErrorBody { @@ -80,7 +80,7 @@ impl HttpErrorBody { } /// Same as [`utils::http::json::json_response`] -pub fn json_response( +pub(crate) fn json_response( status: StatusCode, data: T, ) -> Result>, ApiError> { diff --git a/proxy/src/serverless/json.rs b/proxy/src/serverless/json.rs index c22c63e85b..9f328a0e1d 100644 --- a/proxy/src/serverless/json.rs +++ b/proxy/src/serverless/json.rs @@ -8,7 +8,7 @@ use tokio_postgres::Row; // Convert json non-string types to strings, so that they can be passed to Postgres // as parameters. // -pub fn json_to_pg_text(json: Vec) -> Vec> { +pub(crate) fn json_to_pg_text(json: Vec) -> Vec> { json.iter().map(json_value_to_pg_text).collect() } @@ -55,13 +55,13 @@ fn json_array_to_pg_array(value: &Value) -> Option { .collect::>() .join(","); - Some(format!("{{{}}}", vals)) + Some(format!("{{{vals}}}")) } } } #[derive(Debug, thiserror::Error)] -pub enum JsonConversionError { +pub(crate) enum JsonConversionError { #[error("internal error compute returned invalid data: {0}")] AsTextError(tokio_postgres::Error), #[error("parse int error: {0}")] @@ -77,7 +77,7 @@ pub enum JsonConversionError { // // Convert postgres row with text-encoded values to JSON object // -pub fn pg_text_row_to_json( +pub(crate) fn pg_text_row_to_json( row: &Row, columns: &[Type], raw_output: bool, diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index bbfed90f39..06e540d149 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -7,6 +7,7 @@ use futures::future::try_join; use futures::future::Either; use futures::StreamExt; use futures::TryFutureExt; +use http::header::AUTHORIZATION; use http_body_util::BodyExt; use http_body_util::Full; use hyper1::body::Body; @@ -56,6 +57,7 @@ use 
crate::DbName; use crate::RoleName; use super::backend::PoolingBackend; +use super::conn_pool::AuthData; use super::conn_pool::Client; use super::conn_pool::ConnInfo; use super::http_util::json_response; @@ -85,9 +87,7 @@ enum Payload { Batch(BatchQueryData), } -const MAX_RESPONSE_SIZE: usize = 10 * 1024 * 1024; // 10 MiB -const MAX_REQUEST_SIZE: u64 = 10 * 1024 * 1024; // 10 MiB - +static CONN_STRING: HeaderName = HeaderName::from_static("neon-connection-string"); static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output"); static ARRAY_MODE: HeaderName = HeaderName::from_static("neon-array-mode"); static ALLOW_POOL: HeaderName = HeaderName::from_static("neon-pool-opt-in"); @@ -107,9 +107,9 @@ where } #[derive(Debug, thiserror::Error)] -pub enum ConnInfoError { +pub(crate) enum ConnInfoError { #[error("invalid header: {0}")] - InvalidHeader(&'static str), + InvalidHeader(&'static HeaderName), #[error("invalid connection string: {0}")] UrlParseError(#[from] url::ParseError), #[error("incorrect scheme")] @@ -153,10 +153,10 @@ fn get_conn_info( ctx.set_auth_method(crate::context::AuthMethod::Cleartext); let connection_string = headers - .get("Neon-Connection-String") - .ok_or(ConnInfoError::InvalidHeader("Neon-Connection-String"))? + .get(&CONN_STRING) + .ok_or(ConnInfoError::InvalidHeader(&CONN_STRING))? 
.to_str() - .map_err(|_| ConnInfoError::InvalidHeader("Neon-Connection-String"))?; + .map_err(|_| ConnInfoError::InvalidHeader(&CONN_STRING))?; let connection_url = Url::parse(connection_string)?; @@ -179,10 +179,23 @@ fn get_conn_info( } ctx.set_user(username.clone()); - let password = connection_url - .password() - .ok_or(ConnInfoError::MissingPassword)?; - let password = urlencoding::decode_binary(password.as_bytes()); + let auth = if let Some(auth) = headers.get(&AUTHORIZATION) { + let auth = auth + .to_str() + .map_err(|_| ConnInfoError::InvalidHeader(&AUTHORIZATION))?; + AuthData::Jwt( + auth.strip_prefix("Bearer ") + .ok_or(ConnInfoError::MissingPassword)? + .into(), + ) + } else if let Some(pass) = connection_url.password() { + AuthData::Password(match urlencoding::decode_binary(pass.as_bytes()) { + std::borrow::Cow::Borrowed(b) => b.into(), + std::borrow::Cow::Owned(b) => b.into(), + }) + } else { + return Err(ConnInfoError::MissingPassword); + }; let endpoint = match connection_url.host() { Some(url::Host::Domain(hostname)) => { @@ -191,12 +204,12 @@ fn get_conn_info( .ok_or(ConnInfoError::MalformedEndpoint)? 
} else { hostname - .split_once(".") + .split_once('.') .map_or(hostname, |(prefix, _)| prefix) .into() } } - Some(url::Host::Ipv4(_)) | Some(url::Host::Ipv6(_)) | None => { + Some(url::Host::Ipv4(_) | url::Host::Ipv6(_)) | None => { return Err(ConnInfoError::MissingHostname) } }; @@ -225,15 +238,12 @@ fn get_conn_info( Ok(ConnInfo { user_info, dbname, - password: match password { - std::borrow::Cow::Borrowed(b) => b.into(), - std::borrow::Cow::Owned(b) => b.into(), - }, + auth, }) } // TODO: return different http error codes -pub async fn handle( +pub(crate) async fn handle( config: &'static ProxyConfig, ctx: RequestMonitoring, request: Request, @@ -346,17 +356,17 @@ pub async fn handle( } #[derive(Debug, thiserror::Error)] -pub enum SqlOverHttpError { +pub(crate) enum SqlOverHttpError { #[error("{0}")] ReadPayload(#[from] ReadPayloadError), #[error("{0}")] ConnectCompute(#[from] HttpConnError), #[error("{0}")] ConnInfo(#[from] ConnInfoError), - #[error("request is too large (max is {MAX_REQUEST_SIZE} bytes)")] - RequestTooLarge, - #[error("response is too large (max is {MAX_RESPONSE_SIZE} bytes)")] - ResponseTooLarge, + #[error("request is too large (max is {0} bytes)")] + RequestTooLarge(u64), + #[error("response is too large (max is {0} bytes)")] + ResponseTooLarge(usize), #[error("invalid isolation level")] InvalidIsolationLevel, #[error("{0}")] @@ -373,8 +383,8 @@ impl ReportableError for SqlOverHttpError { SqlOverHttpError::ReadPayload(e) => e.get_error_kind(), SqlOverHttpError::ConnectCompute(e) => e.get_error_kind(), SqlOverHttpError::ConnInfo(e) => e.get_error_kind(), - SqlOverHttpError::RequestTooLarge => ErrorKind::User, - SqlOverHttpError::ResponseTooLarge => ErrorKind::User, + SqlOverHttpError::RequestTooLarge(_) => ErrorKind::User, + SqlOverHttpError::ResponseTooLarge(_) => ErrorKind::User, SqlOverHttpError::InvalidIsolationLevel => ErrorKind::User, SqlOverHttpError::Postgres(p) => p.get_error_kind(), SqlOverHttpError::JsonConversion(_) => 
ErrorKind::Postgres, @@ -389,8 +399,8 @@ impl UserFacingError for SqlOverHttpError { SqlOverHttpError::ReadPayload(p) => p.to_string(), SqlOverHttpError::ConnectCompute(c) => c.to_string_client(), SqlOverHttpError::ConnInfo(c) => c.to_string_client(), - SqlOverHttpError::RequestTooLarge => self.to_string(), - SqlOverHttpError::ResponseTooLarge => self.to_string(), + SqlOverHttpError::RequestTooLarge(_) => self.to_string(), + SqlOverHttpError::ResponseTooLarge(_) => self.to_string(), SqlOverHttpError::InvalidIsolationLevel => self.to_string(), SqlOverHttpError::Postgres(p) => p.to_string(), SqlOverHttpError::JsonConversion(_) => "could not parse postgres response".to_string(), @@ -400,7 +410,7 @@ impl UserFacingError for SqlOverHttpError { } #[derive(Debug, thiserror::Error)] -pub enum ReadPayloadError { +pub(crate) enum ReadPayloadError { #[error("could not read the HTTP request body: {0}")] Read(#[from] hyper1::Error), #[error("could not parse the HTTP request body: {0}")] @@ -417,7 +427,7 @@ impl ReportableError for ReadPayloadError { } #[derive(Debug, thiserror::Error)] -pub enum SqlOverHttpCancel { +pub(crate) enum SqlOverHttpCancel { #[error("query was cancelled")] Postgres, #[error("query was cancelled while stuck trying to connect to the database")] @@ -524,7 +534,7 @@ async fn handle_inner( let request_content_length = match request.body().size_hint().upper() { Some(v) => v, - None => MAX_REQUEST_SIZE + 1, + None => config.http_config.max_request_size_bytes + 1, }; info!(request_content_length, "request size in bytes"); Metrics::get() @@ -534,8 +544,10 @@ async fn handle_inner( // we don't have a streaming request support yet so this is to prevent OOM // from a malicious user sending an extremely large request body - if request_content_length > MAX_REQUEST_SIZE { - return Err(SqlOverHttpError::RequestTooLarge); + if request_content_length > config.http_config.max_request_size_bytes { + return Err(SqlOverHttpError::RequestTooLarge( + 
config.http_config.max_request_size_bytes, + )); } let fetch_and_process_request = Box::pin( @@ -550,9 +562,24 @@ async fn handle_inner( let authenticate_and_connect = Box::pin( async { - let keys = backend - .authenticate(ctx, &config.authentication_config, &conn_info) - .await?; + let keys = match &conn_info.auth { + AuthData::Password(pw) => { + backend + .authenticate_with_password( + ctx, + &config.authentication_config, + &conn_info.user_info, + pw, + ) + .await? + } + AuthData::Jwt(jwt) => { + backend + .authenticate_with_jwt(ctx, &conn_info.user_info, jwt) + .await? + } + }; + let client = backend .connect_to_compute(ctx, conn_info, keys, !allow_pool) .await?; @@ -584,7 +611,10 @@ async fn handle_inner( // Now execute the query and return the result. let json_output = match payload { - Payload::Single(stmt) => stmt.process(cancel, &mut client, parsed_headers).await?, + Payload::Single(stmt) => { + stmt.process(config, cancel, &mut client, parsed_headers) + .await? + } Payload::Batch(statements) => { if parsed_headers.txn_read_only { response = response.header(TXN_READ_ONLY.clone(), &HEADER_VALUE_TRUE); @@ -600,7 +630,7 @@ async fn handle_inner( } statements - .process(cancel, &mut client, parsed_headers) + .process(config, cancel, &mut client, parsed_headers) .await? 
} }; @@ -628,6 +658,7 @@ async fn handle_inner( impl QueryData { async fn process( self, + config: &'static ProxyConfig, cancel: CancellationToken, client: &mut Client, parsed_headers: HttpHeaders, @@ -636,7 +667,7 @@ impl QueryData { let cancel_token = inner.cancel_token(); let res = match select( - pin!(query_to_json(&*inner, self, &mut 0, parsed_headers)), + pin!(query_to_json(config, &*inner, self, &mut 0, parsed_headers)), pin!(cancel.cancelled()), ) .await @@ -699,6 +730,7 @@ impl QueryData { impl BatchQueryData { async fn process( self, + config: &'static ProxyConfig, cancel: CancellationToken, client: &mut Client, parsed_headers: HttpHeaders, @@ -717,53 +749,58 @@ impl BatchQueryData { builder = builder.deferrable(true); } - let transaction = builder.start().await.map_err(|e| { + let transaction = builder.start().await.inspect_err(|_| { // if we cannot start a transaction, we should return immediately // and not return to the pool. connection is clearly broken discard.discard(); - e })?; - let json_output = - match query_batch(cancel.child_token(), &transaction, self, parsed_headers).await { - Ok(json_output) => { - info!("commit"); - let status = transaction.commit().await.map_err(|e| { - // if we cannot commit - for now don't return connection to pool - // TODO: get a query status from the error - discard.discard(); - e - })?; - discard.check_idle(status); - json_output - } - Err(SqlOverHttpError::Cancelled(_)) => { - if let Err(err) = cancel_token.cancel_query(NoTls).await { - tracing::error!(?err, "could not cancel query"); - } - // TODO: after cancelling, wait to see if we can get a status. maybe the connection is still safe. 
+ let json_output = match query_batch( + config, + cancel.child_token(), + &transaction, + self, + parsed_headers, + ) + .await + { + Ok(json_output) => { + info!("commit"); + let status = transaction.commit().await.inspect_err(|_| { + // if we cannot commit - for now don't return connection to pool + // TODO: get a query status from the error discard.discard(); + })?; + discard.check_idle(status); + json_output + } + Err(SqlOverHttpError::Cancelled(_)) => { + if let Err(err) = cancel_token.cancel_query(NoTls).await { + tracing::error!(?err, "could not cancel query"); + } + // TODO: after cancelling, wait to see if we can get a status. maybe the connection is still safe. + discard.discard(); - return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres)); - } - Err(err) => { - info!("rollback"); - let status = transaction.rollback().await.map_err(|e| { - // if we cannot rollback - for now don't return connection to pool - // TODO: get a query status from the error - discard.discard(); - e - })?; - discard.check_idle(status); - return Err(err); - } - }; + return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres)); + } + Err(err) => { + info!("rollback"); + let status = transaction.rollback().await.inspect_err(|_| { + // if we cannot rollback - for now don't return connection to pool + // TODO: get a query status from the error + discard.discard(); + })?; + discard.check_idle(status); + return Err(err); + } + }; Ok(json_output) } } async fn query_batch( + config: &'static ProxyConfig, cancel: CancellationToken, transaction: &Transaction<'_>, queries: BatchQueryData, @@ -773,6 +810,7 @@ async fn query_batch( let mut current_size = 0; for stmt in queries.queries { let query = pin!(query_to_json( + config, transaction, stmt, &mut current_size, @@ -801,6 +839,7 @@ async fn query_batch( } async fn query_to_json( + config: &'static ProxyConfig, client: &T, data: QueryData, current_size: &mut usize, @@ -821,8 +860,10 @@ async fn query_to_json( 
rows.push(row); // we don't have a streaming response support yet so this is to prevent OOM // from a malicious query (eg a cross join) - if *current_size > MAX_RESPONSE_SIZE { - return Err(SqlOverHttpError::ResponseTooLarge); + if *current_size > config.http_config.max_response_size_bytes { + return Err(SqlOverHttpError::ResponseTooLarge( + config.http_config.max_response_size_bytes, + )); } } diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index 4fba4d141c..3d257223b8 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -27,7 +27,7 @@ use tracing::warn; pin_project! { /// This is a wrapper around a [`WebSocketStream`] that /// implements [`AsyncRead`] and [`AsyncWrite`]. - pub struct WebSocketRw { + pub(crate) struct WebSocketRw { #[pin] stream: WebSocketServer, recv: Bytes, @@ -36,7 +36,7 @@ pin_project! { } impl WebSocketRw { - pub fn new(stream: WebSocketServer) -> Self { + pub(crate) fn new(stream: WebSocketServer) -> Self { Self { stream, recv: Bytes::new(), @@ -127,7 +127,7 @@ impl AsyncBufRead for WebSocketRw { } } -pub async fn serve_websocket( +pub(crate) async fn serve_websocket( config: &'static ProxyConfig, ctx: RequestMonitoring, websocket: OnUpgrade, diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs index 7809d2e574..e2fc73235e 100644 --- a/proxy/src/stream.rs +++ b/proxy/src/stream.rs @@ -12,8 +12,10 @@ use std::{io, task}; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; use tokio_rustls::server::TlsStream; +use tracing::debug; /// Stream wrapper which implements libpq's protocol. +/// /// NOTE: This object deliberately doesn't implement [`AsyncRead`] /// or [`AsyncWrite`] to prevent subtle errors (e.g. trying /// to pass random malformed bytes through the connection). @@ -35,7 +37,7 @@ impl PqStream { } /// Get a shared reference to the underlying stream. 
- pub fn get_ref(&self) -> &S { + pub(crate) fn get_ref(&self) -> &S { self.framed.get_ref() } } @@ -62,12 +64,12 @@ impl PqStream { .ok_or_else(err_connection) } - pub async fn read_password_message(&mut self) -> io::Result { + pub(crate) async fn read_password_message(&mut self) -> io::Result { match self.read_message().await? { FeMessage::PasswordMessage(msg) => Ok(msg), bad => Err(io::Error::new( io::ErrorKind::InvalidData, - format!("unexpected message type: {:?}", bad), + format!("unexpected message type: {bad:?}"), )), } } @@ -99,7 +101,10 @@ impl ReportableError for ReportedError { impl PqStream { /// Write the message into an internal buffer, but don't flush the underlying stream. - pub fn write_message_noflush(&mut self, message: &BeMessage<'_>) -> io::Result<&mut Self> { + pub(crate) fn write_message_noflush( + &mut self, + message: &BeMessage<'_>, + ) -> io::Result<&mut Self> { self.framed .write_message(message) .map_err(ProtocolError::into_io_error)?; @@ -114,7 +119,7 @@ impl PqStream { } /// Flush the output buffer into the underlying stream. - pub async fn flush(&mut self) -> io::Result<&mut Self> { + pub(crate) async fn flush(&mut self) -> io::Result<&mut Self> { self.framed.flush().await?; Ok(self) } @@ -134,9 +139,10 @@ impl PqStream { ); // already error case, ignore client IO error - let _: Result<_, std::io::Error> = self - .write_message(&BeMessage::ErrorResponse(msg, None)) - .await; + self.write_message(&BeMessage::ErrorResponse(msg, None)) + .await + .inspect_err(|e| debug!("write_message failed: {e}")) + .ok(); Err(ReportedError { source: anyhow::anyhow!(msg), @@ -146,7 +152,7 @@ impl PqStream { /// Write the error message using [`Self::write_message`], then re-throw it. /// Trait [`UserFacingError`] acts as an allowlist for error types. 
- pub async fn throw_error(&mut self, error: E) -> Result + pub(crate) async fn throw_error(&mut self, error: E) -> Result where E: UserFacingError + Into, { @@ -160,9 +166,10 @@ impl PqStream { ); // already error case, ignore client IO error - let _: Result<_, std::io::Error> = self - .write_message(&BeMessage::ErrorResponse(&msg, None)) - .await; + self.write_message(&BeMessage::ErrorResponse(&msg, None)) + .await + .inspect_err(|e| debug!("write_message failed: {e}")) + .ok(); Err(ReportedError { source: anyhow::anyhow!(error), @@ -200,7 +207,7 @@ impl Stream { } } - pub fn tls_server_end_point(&self) -> TlsServerEndPoint { + pub(crate) fn tls_server_end_point(&self) -> TlsServerEndPoint { match self { Stream::Raw { .. } => TlsServerEndPoint::Undefined, Stream::Tls { diff --git a/proxy/src/url.rs b/proxy/src/url.rs index 202fe8de1f..270cd7c24d 100644 --- a/proxy/src/url.rs +++ b/proxy/src/url.rs @@ -7,12 +7,12 @@ pub struct ApiUrl(url::Url); impl ApiUrl { /// Consume the wrapper and return inner [url](url::Url). - pub fn into_inner(self) -> url::Url { + pub(crate) fn into_inner(self) -> url::Url { self.0 } /// See [`url::Url::path_segments_mut`]. - pub fn path_segments_mut(&mut self) -> url::PathSegmentsMut<'_> { + pub(crate) fn path_segments_mut(&mut self) -> url::PathSegmentsMut<'_> { // We've already verified that it works during construction. 
self.0.path_segments_mut().expect("bad API url") } @@ -57,7 +57,7 @@ mod tests { fn bad_url() { let url = "test:foobar"; url.parse::().expect("unexpected parsing failure"); - let _ = url.parse::().expect_err("should not parse"); + url.parse::().expect_err("should not parse"); } #[test] diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs index a8735fe0bb..fd8599bcb3 100644 --- a/proxy/src/usage_metrics.rs +++ b/proxy/src/usage_metrics.rs @@ -33,7 +33,8 @@ use uuid::{NoContext, Timestamp}; const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client"; -const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60); +const HTTP_REPORTING_REQUEST_TIMEOUT: Duration = Duration::from_secs(10); +const HTTP_REPORTING_RETRY_DURATION: Duration = Duration::from_secs(60); /// Key that uniquely identifies the object, this metric describes. /// Currently, endpoint_id is enough, but this may change later, @@ -43,12 +44,12 @@ const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60); /// so while the project-id is unique across regions the whole pipeline will work correctly /// because we enrich the event with project_id in the control-plane endpoint. 
#[derive(Eq, Hash, PartialEq, Serialize, Deserialize, Debug, Clone)] -pub struct Ids { - pub endpoint_id: EndpointIdInt, - pub branch_id: BranchIdInt, +pub(crate) struct Ids { + pub(crate) endpoint_id: EndpointIdInt, + pub(crate) branch_id: BranchIdInt, } -pub trait MetricCounterRecorder { +pub(crate) trait MetricCounterRecorder { /// Record that some bytes were sent from the proxy to the client fn record_egress(&self, bytes: u64); /// Record that some connections were opened @@ -92,7 +93,7 @@ impl MetricCounterReporter for MetricBackupCounter { } #[derive(Debug)] -pub struct MetricCounter { +pub(crate) struct MetricCounter { transmitted: AtomicU64, opened_connections: AtomicUsize, backup: Arc, @@ -173,14 +174,14 @@ impl Clearable for C { type FastHasher = std::hash::BuildHasherDefault; #[derive(Default)] -pub struct Metrics { +pub(crate) struct Metrics { endpoints: DashMap, FastHasher>, backup_endpoints: DashMap, FastHasher>, } impl Metrics { /// Register a new byte metrics counter for this endpoint - pub fn register(&self, ids: Ids) -> Arc { + pub(crate) fn register(&self, ids: Ids) -> Arc { let backup = if let Some(entry) = self.backup_endpoints.get(&ids) { entry.clone() } else { @@ -215,7 +216,7 @@ impl Metrics { } } -pub static USAGE_METRICS: Lazy = Lazy::new(Metrics::default); +pub(crate) static USAGE_METRICS: Lazy = Lazy::new(Metrics::default); pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result { info!("metrics collector config: {config:?}"); @@ -223,7 +224,10 @@ pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result anyhow::Result<()> { - let storage = match storage { - Some(storage) => storage, - None => { - error!("no remote storage configured"); - return Ok(()); - } + let Some(storage) = storage else { + error!("no remote storage configured"); + return Ok(()); }; let data = serde_json::to_vec(&chunk).context("serialize metrics")?; let mut encoder = GzipEncoder::new(Vec::new()); diff --git a/proxy/src/waiters.rs 
b/proxy/src/waiters.rs index 3bd8f4c8ef..86d0f9e8b2 100644 --- a/proxy/src/waiters.rs +++ b/proxy/src/waiters.rs @@ -7,13 +7,13 @@ use thiserror::Error; use tokio::sync::oneshot; #[derive(Debug, Error)] -pub enum RegisterError { +pub(crate) enum RegisterError { #[error("Waiter `{0}` already registered")] Occupied(String), } #[derive(Debug, Error)] -pub enum NotifyError { +pub(crate) enum NotifyError { #[error("Notify failed: waiter `{0}` not registered")] NotFound(String), @@ -22,21 +22,21 @@ pub enum NotifyError { } #[derive(Debug, Error)] -pub enum WaitError { +pub(crate) enum WaitError { #[error("Wait failed: channel hangup")] Hangup, } -pub struct Waiters(pub(self) Mutex>>); +pub(crate) struct Waiters(pub(self) Mutex>>); impl Default for Waiters { fn default() -> Self { - Waiters(Default::default()) + Waiters(Mutex::default()) } } impl Waiters { - pub fn register(&self, key: String) -> Result, RegisterError> { + pub(crate) fn register(&self, key: String) -> Result, RegisterError> { let (tx, rx) = oneshot::channel(); self.0 @@ -53,7 +53,7 @@ impl Waiters { }) } - pub fn notify(&self, key: &str, value: T) -> Result<(), NotifyError> + pub(crate) fn notify(&self, key: &str, value: T) -> Result<(), NotifyError> where T: Send + Sync, { @@ -79,7 +79,7 @@ impl<'a, T> Drop for DropKey<'a, T> { } pin_project! { - pub struct Waiter<'a, T> { + pub(crate) struct Waiter<'a, T> { #[pin] receiver: oneshot::Receiver, guard: DropKey<'a, T>, diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 368b8d300a..3c5d0b12a6 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,7 +1,7 @@ [toolchain] -channel = "1.80.1" +channel = "1.81.0" profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. 
# https://rust-lang.github.io/rustup/concepts/profiles.html -# but we also need `llvm-tools-preview` for coverage data merges on CI -components = ["llvm-tools-preview", "rustfmt", "clippy"] +# but we also need `llvm-tools` for coverage data merges on CI +components = ["llvm-tools", "rustfmt", "clippy"] diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index 0fdb3147bf..daf21c70b0 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -13,14 +13,12 @@ testing = ["fail/failpoints"] [dependencies] async-stream.workspace = true anyhow.workspace = true -async-trait.workspace = true byteorder.workspace = true bytes.workspace = true camino.workspace = true camino-tempfile.workspace = true chrono.workspace = true clap = { workspace = true, features = ["derive"] } -const_format.workspace = true crc32c.workspace = true fail.workspace = true git-version.workspace = true @@ -38,8 +36,6 @@ scopeguard.workspace = true reqwest = { workspace = true, features = ["json"] } serde.workspace = true serde_json.workspace = true -serde_with.workspace = true -signal-hook.workspace = true strum.workspace = true strum_macros.workspace = true thiserror.workspace = true @@ -48,7 +44,6 @@ tokio-util = { workspace = true } tokio-io-timeout.workspace = true tokio-postgres.workspace = true tokio-tar.workspace = true -toml_edit.workspace = true tracing.workspace = true url.workspace = true metrics.workspace = true diff --git a/safekeeper/src/auth.rs b/safekeeper/src/auth.rs index b8bc3f3e06..c5c9393c00 100644 --- a/safekeeper/src/auth.rs +++ b/safekeeper/src/auth.rs @@ -1,6 +1,9 @@ use utils::auth::{AuthError, Claims, Scope}; use utils::id::TenantId; +/// If tenant_id is provided, allow if token (claims) is for this tenant or +/// whole safekeeper scope (SafekeeperData). Else, allow only if token is +/// SafekeeperData. 
pub fn check_permission(claims: &Claims, tenant_id: Option) -> Result<(), AuthError> { match (&claims.scope, tenant_id) { (Scope::Tenant, None) => Err(AuthError( diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 41c2d3fe08..5270934f5e 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -19,7 +19,7 @@ use std::fs::{self, File}; use std::io::{ErrorKind, Write}; use std::str::FromStr; use std::sync::Arc; -use std::time::Duration; +use std::time::{Duration, Instant}; use storage_broker::Uri; use tracing::*; @@ -261,6 +261,15 @@ async fn main() -> anyhow::Result<()> { // Change into the data directory. std::env::set_current_dir(&workdir)?; + // Prevent running multiple safekeepers on the same directory + let lock_file_path = workdir.join(PID_FILE_NAME); + let lock_file = + pid_file::claim_for_current_process(&lock_file_path).context("claim pid file")?; + info!("claimed pid file at {lock_file_path:?}"); + // ensure that the lock file is held even if the main thread of the process is panics + // we need to release the lock file only when the current process is gone + std::mem::forget(lock_file); + // Set or read our ID. let id = set_id(&workdir, args.id.map(NodeId))?; if args.init { @@ -364,15 +373,15 @@ async fn main() -> anyhow::Result<()> { type JoinTaskRes = Result, JoinError>; async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { - // Prevent running multiple safekeepers on the same directory - let lock_file_path = conf.workdir.join(PID_FILE_NAME); - let lock_file = - pid_file::claim_for_current_process(&lock_file_path).context("claim pid file")?; - info!("claimed pid file at {lock_file_path:?}"); - - // ensure that the lock file is held even if the main thread of the process is panics - // we need to release the lock file only when the current process is gone - std::mem::forget(lock_file); + // fsync the datadir to make sure we have a consistent state on disk. 
+ let dfd = File::open(&conf.workdir).context("open datadir for syncfs")?; + let started = Instant::now(); + utils::crashsafe::syncfs(dfd)?; + let elapsed = started.elapsed(); + info!( + elapsed_ms = elapsed.as_millis(), + "syncfs data directory done" + ); info!("starting safekeeper WAL service on {}", conf.listen_pg_addr); let pg_listener = tcp_listener::bind(conf.listen_pg_addr.clone()).map_err(|e| { diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs index 7cc2142291..485816408f 100644 --- a/safekeeper/src/broker.rs +++ b/safekeeper/src/broker.rs @@ -86,7 +86,7 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> { } /// Subscribe and fetch all the interesting data from the broker. -#[instrument(name = "broker pull", skip_all)] +#[instrument(name = "broker_pull", skip_all)] async fn pull_loop(conf: SafeKeeperConf, stats: Arc) -> Result<()> { let mut client = storage_broker::connect(conf.broker_endpoint, conf.broker_keepalive_interval)?; diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs index c551cd3122..8b252b4ab4 100644 --- a/safekeeper/src/control_file.rs +++ b/safekeeper/src/control_file.rs @@ -7,6 +7,7 @@ use tokio::fs::File; use tokio::io::AsyncWriteExt; use utils::crashsafe::durable_rename; +use std::future::Future; use std::io::Read; use std::ops::Deref; use std::path::Path; @@ -31,10 +32,9 @@ pub const CHECKSUM_SIZE: usize = size_of::(); /// Storage should keep actual state inside of it. It should implement Deref /// trait to access state fields and have persist method for updating that state. -#[async_trait::async_trait] pub trait Storage: Deref { /// Persist safekeeper state on disk and update internal state. - async fn persist(&mut self, s: &TimelinePersistentState) -> Result<()>; + fn persist(&mut self, s: &TimelinePersistentState) -> impl Future> + Send; /// Timestamp of last persist. 
fn last_persist_at(&self) -> Instant; @@ -188,7 +188,6 @@ impl TimelinePersistentState { } } -#[async_trait::async_trait] impl Storage for FileStorage { /// Persists state durably to the underlying storage. /// diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index 2c519433ef..3f00b69cde 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -2,6 +2,7 @@ //! protocol commands. use anyhow::Context; +use std::future::Future; use std::str::{self, FromStr}; use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite}; @@ -95,7 +96,6 @@ fn cmd_to_string(cmd: &SafekeeperPostgresCommand) -> &str { } } -#[async_trait::async_trait] impl postgres_backend::Handler for SafekeeperPostgresHandler { @@ -197,49 +197,51 @@ impl postgres_backend::Handler Ok(()) } - async fn process_query( + fn process_query( &mut self, pgb: &mut PostgresBackend, query_string: &str, - ) -> Result<(), QueryError> { - if query_string - .to_ascii_lowercase() - .starts_with("set datestyle to ") - { - // important for debug because psycopg2 executes "SET datestyle TO 'ISO'" on connect - pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; - return Ok(()); - } - - let cmd = parse_cmd(query_string)?; - let cmd_str = cmd_to_string(&cmd); - - let _guard = PG_QUERIES_GAUGE.with_label_values(&[cmd_str]).guard(); - - info!("got query {:?}", query_string); - - let tenant_id = self.tenant_id.context("tenantid is required")?; - let timeline_id = self.timeline_id.context("timelineid is required")?; - self.check_permission(Some(tenant_id))?; - self.ttid = TenantTimelineId::new(tenant_id, timeline_id); - - match cmd { - SafekeeperPostgresCommand::StartWalPush => { - self.handle_start_wal_push(pgb) - .instrument(info_span!("WAL receiver")) - .await + ) -> impl Future> { + Box::pin(async move { + if query_string + .to_ascii_lowercase() + .starts_with("set datestyle to ") + { + // important for debug because psycopg2 executes "SET datestyle TO 'ISO'" on connect + 
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; + return Ok(()); } - SafekeeperPostgresCommand::StartReplication { start_lsn, term } => { - self.handle_start_replication(pgb, start_lsn, term) - .instrument(info_span!("WAL sender")) - .await + + let cmd = parse_cmd(query_string)?; + let cmd_str = cmd_to_string(&cmd); + + let _guard = PG_QUERIES_GAUGE.with_label_values(&[cmd_str]).guard(); + + info!("got query {:?}", query_string); + + let tenant_id = self.tenant_id.context("tenantid is required")?; + let timeline_id = self.timeline_id.context("timelineid is required")?; + self.check_permission(Some(tenant_id))?; + self.ttid = TenantTimelineId::new(tenant_id, timeline_id); + + match cmd { + SafekeeperPostgresCommand::StartWalPush => { + self.handle_start_wal_push(pgb) + .instrument(info_span!("WAL receiver")) + .await + } + SafekeeperPostgresCommand::StartReplication { start_lsn, term } => { + self.handle_start_replication(pgb, start_lsn, term) + .instrument(info_span!("WAL sender")) + .await + } + SafekeeperPostgresCommand::IdentifySystem => self.handle_identify_system(pgb).await, + SafekeeperPostgresCommand::TimelineStatus => self.handle_timeline_status(pgb).await, + SafekeeperPostgresCommand::JSONCtrl { ref cmd } => { + handle_json_ctrl(self, pgb, cmd).await + } } - SafekeeperPostgresCommand::IdentifySystem => self.handle_identify_system(pgb).await, - SafekeeperPostgresCommand::TimelineStatus => self.handle_timeline_status(pgb).await, - SafekeeperPostgresCommand::JSONCtrl { ref cmd } => { - handle_json_ctrl(self, pgb, cmd).await - } - } + }) } } diff --git a/safekeeper/src/http/openapi_spec.yaml b/safekeeper/src/http/openapi_spec.yaml index a617e0310c..70999853c2 100644 --- a/safekeeper/src/http/openapi_spec.yaml +++ b/safekeeper/src/http/openapi_spec.yaml @@ -86,42 +86,6 @@ paths: default: $ref: "#/components/responses/GenericError" - /v1/tenant/{tenant_id}/timeline/{source_timeline_id}/copy: - parameters: - - name: tenant_id - in: path - 
required: true - schema: - type: string - format: hex - - name: source_timeline_id - in: path - required: true - schema: - type: string - format: hex - - post: - tags: - - "Timeline" - summary: Register new timeline as copy of existing timeline - description: "" - operationId: v1CopyTenantTimeline - requestBody: - content: - application/json: - schema: - $ref: "#/components/schemas/TimelineCopyRequest" - responses: - "201": - description: Timeline created - # TODO: return timeline info? - "403": - $ref: "#/components/responses/ForbiddenError" - default: - $ref: "#/components/responses/GenericError" - - /v1/tenant/{tenant_id}/timeline/{timeline_id}: parameters: - name: tenant_id @@ -179,6 +143,40 @@ paths: default: $ref: "#/components/responses/GenericError" + /v1/tenant/{tenant_id}/timeline/{source_timeline_id}/copy: + parameters: + - name: tenant_id + in: path + required: true + schema: + type: string + format: hex + - name: source_timeline_id + in: path + required: true + schema: + type: string + format: hex + + post: + tags: + - "Timeline" + summary: Register new timeline as copy of existing timeline + description: "" + operationId: v1CopyTenantTimeline + requestBody: + content: + application/json: + schema: + $ref: "#/components/schemas/TimelineCopyRequest" + responses: + "201": + description: Timeline created + # TODO: return timeline info? 
+ "403": + $ref: "#/components/responses/ForbiddenError" + default: + $ref: "#/components/responses/GenericError" /v1/record_safekeeper_info/{tenant_id}/{timeline_id}: parameters: diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index d11815f6ef..e482edea55 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -18,8 +18,8 @@ use utils::http::endpoint::{prometheus_metrics_handler, request_span, ChannelWri use utils::http::request::parse_query_param; use postgres_ffi::WAL_SEGMENT_SIZE; -use safekeeper_api::models::TimelineCreateRequest; use safekeeper_api::models::{SkTimelineInfo, TimelineCopyRequest}; +use safekeeper_api::models::{TimelineCreateRequest, TimelineTermBumpRequest}; use utils::{ auth::SwappableJwtAuth, http::{ @@ -114,7 +114,55 @@ fn check_permission(request: &Request, tenant_id: Option) -> Res }) } +/// Deactivates all timelines for the tenant and removes its data directory. +/// See `timeline_delete_handler`. +async fn tenant_delete_handler(mut request: Request) -> Result, ApiError> { + let tenant_id = parse_request_param(&request, "tenant_id")?; + let only_local = parse_query_param(&request, "only_local")?.unwrap_or(false); + check_permission(&request, Some(tenant_id))?; + ensure_no_body(&mut request).await?; + // FIXME: `delete_force_all_for_tenant` can return an error for multiple different reasons; + // Using an `InternalServerError` should be fixed when the types support it + let delete_info = GlobalTimelines::delete_force_all_for_tenant(&tenant_id, only_local) + .await + .map_err(ApiError::InternalServerError)?; + json_response( + StatusCode::OK, + delete_info + .iter() + .map(|(ttid, resp)| (format!("{}", ttid.timeline_id), *resp)) + .collect::>(), + ) +} + +async fn timeline_create_handler(mut request: Request) -> Result, ApiError> { + let request_data: TimelineCreateRequest = json_request(&mut request).await?; + + let ttid = TenantTimelineId { + tenant_id: request_data.tenant_id, + 
timeline_id: request_data.timeline_id, + }; + check_permission(&request, Some(ttid.tenant_id))?; + + let server_info = ServerInfo { + pg_version: request_data.pg_version, + system_id: request_data.system_id.unwrap_or(0), + wal_seg_size: request_data.wal_seg_size.unwrap_or(WAL_SEGMENT_SIZE as u32), + }; + let local_start_lsn = request_data.local_start_lsn.unwrap_or_else(|| { + request_data + .commit_lsn + .segment_lsn(server_info.wal_seg_size as usize) + }); + GlobalTimelines::create(ttid, server_info, request_data.commit_lsn, local_start_lsn) + .await + .map_err(ApiError::InternalServerError)?; + + json_response(StatusCode::OK, ()) +} + /// List all (not deleted) timelines. +/// Note: it is possible to do the same with debug_dump. async fn timeline_list_handler(request: Request) -> Result, ApiError> { check_permission(&request, None)?; let res: Vec = GlobalTimelines::get_all() @@ -174,30 +222,21 @@ async fn timeline_status_handler(request: Request) -> Result) -> Result, ApiError> { - let request_data: TimelineCreateRequest = json_request(&mut request).await?; - - let ttid = TenantTimelineId { - tenant_id: request_data.tenant_id, - timeline_id: request_data.timeline_id, - }; +/// Deactivates the timeline and removes its data directory. 
+async fn timeline_delete_handler(mut request: Request) -> Result, ApiError> { + let ttid = TenantTimelineId::new( + parse_request_param(&request, "tenant_id")?, + parse_request_param(&request, "timeline_id")?, + ); + let only_local = parse_query_param(&request, "only_local")?.unwrap_or(false); check_permission(&request, Some(ttid.tenant_id))?; - - let server_info = ServerInfo { - pg_version: request_data.pg_version, - system_id: request_data.system_id.unwrap_or(0), - wal_seg_size: request_data.wal_seg_size.unwrap_or(WAL_SEGMENT_SIZE as u32), - }; - let local_start_lsn = request_data.local_start_lsn.unwrap_or_else(|| { - request_data - .commit_lsn - .segment_lsn(server_info.wal_seg_size as usize) - }); - GlobalTimelines::create(ttid, server_info, request_data.commit_lsn, local_start_lsn) + ensure_no_body(&mut request).await?; + // FIXME: `delete_force` can fail from both internal errors and bad requests. Add better + // error handling here when we're able to. + let resp = GlobalTimelines::delete(&ttid, only_local) .await .map_err(ApiError::InternalServerError)?; - - json_response(StatusCode::OK, ()) + json_response(StatusCode::OK, resp) } /// Pull timeline from peer safekeeper instances. @@ -279,6 +318,46 @@ async fn timeline_copy_handler(mut request: Request) -> Result, +) -> Result, ApiError> { + check_permission(&request, None)?; + + let ttid = TenantTimelineId::new( + parse_request_param(&request, "tenant_id")?, + parse_request_param(&request, "timeline_id")?, + ); + + let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?; + + let patch_request: patch_control_file::Request = json_request(&mut request).await?; + let response = patch_control_file::handle_request(tli, patch_request) + .await + .map_err(ApiError::InternalServerError)?; + + json_response(StatusCode::OK, response) +} + +/// Force persist control file. 
+async fn timeline_checkpoint_handler(request: Request) -> Result, ApiError> { + check_permission(&request, None)?; + + let ttid = TenantTimelineId::new( + parse_request_param(&request, "tenant_id")?, + parse_request_param(&request, "timeline_id")?, + ); + + let tli = GlobalTimelines::get(ttid)?; + tli.write_shared_state() + .await + .sk + .state_mut() + .flush() + .await + .map_err(ApiError::InternalServerError)?; + json_response(StatusCode::OK, ()) +} + async fn timeline_digest_handler(request: Request) -> Result, ApiError> { let ttid = TenantTimelineId::new( parse_request_param(&request, "tenant_id")?, @@ -310,62 +389,45 @@ async fn timeline_digest_handler(request: Request) -> Result) -> Result, ApiError> { - check_permission(&request, None)?; - +/// Unevict timeline and remove uploaded partial segment(s) from the remote storage. +/// Successfull response returns list of segments existed before the deletion. +/// Aimed for one-off usage not normally needed. +async fn timeline_backup_partial_reset(request: Request) -> Result, ApiError> { let ttid = TenantTimelineId::new( parse_request_param(&request, "tenant_id")?, parse_request_param(&request, "timeline_id")?, ); - - let tli = GlobalTimelines::get(ttid)?; - tli.write_shared_state() - .await - .sk - .state_mut() - .flush() - .await - .map_err(ApiError::InternalServerError)?; - json_response(StatusCode::OK, ()) -} - -/// Deactivates the timeline and removes its data directory. -async fn timeline_delete_handler(mut request: Request) -> Result, ApiError> { - let ttid = TenantTimelineId::new( - parse_request_param(&request, "tenant_id")?, - parse_request_param(&request, "timeline_id")?, - ); - let only_local = parse_query_param(&request, "only_local")?.unwrap_or(false); check_permission(&request, Some(ttid.tenant_id))?; - ensure_no_body(&mut request).await?; - // FIXME: `delete_force` can fail from both internal errors and bad requests. Add better - // error handling here when we're able to. 
- let resp = GlobalTimelines::delete(&ttid, only_local) + + let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?; + + let response = tli + .backup_partial_reset() .await .map_err(ApiError::InternalServerError)?; - json_response(StatusCode::OK, resp) + json_response(StatusCode::OK, response) } -/// Deactivates all timelines for the tenant and removes its data directory. -/// See `timeline_delete_handler`. -async fn tenant_delete_handler(mut request: Request) -> Result, ApiError> { - let tenant_id = parse_request_param(&request, "tenant_id")?; - let only_local = parse_query_param(&request, "only_local")?.unwrap_or(false); - check_permission(&request, Some(tenant_id))?; - ensure_no_body(&mut request).await?; - // FIXME: `delete_force_all_for_tenant` can return an error for multiple different reasons; - // Using an `InternalServerError` should be fixed when the types support it - let delete_info = GlobalTimelines::delete_force_all_for_tenant(&tenant_id, only_local) +/// Make term at least as high as one in request. If one in request is None, +/// increment current one. +async fn timeline_term_bump_handler( + mut request: Request, +) -> Result, ApiError> { + let ttid = TenantTimelineId::new( + parse_request_param(&request, "tenant_id")?, + parse_request_param(&request, "timeline_id")?, + ); + check_permission(&request, Some(ttid.tenant_id))?; + + let request_data: TimelineTermBumpRequest = json_request(&mut request).await?; + + let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?; + let response = tli + .term_bump(request_data.term) .await .map_err(ApiError::InternalServerError)?; - json_response( - StatusCode::OK, - delete_info - .iter() - .map(|(ttid, resp)| (format!("{}", ttid.timeline_id), *resp)) - .collect::>(), - ) + + json_response(StatusCode::OK, response) } /// Used only in tests to hand craft required data. 
@@ -509,26 +571,6 @@ async fn dump_debug_handler(mut request: Request) -> Result Ok(response) } -async fn patch_control_file_handler( - mut request: Request, -) -> Result, ApiError> { - check_permission(&request, None)?; - - let ttid = TenantTimelineId::new( - parse_request_param(&request, "tenant_id")?, - parse_request_param(&request, "timeline_id")?, - ); - - let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?; - - let patch_request: patch_control_file::Request = json_request(&mut request).await?; - let response = patch_control_file::handle_request(tli, patch_request) - .await - .map_err(ApiError::InternalServerError)?; - - json_response(StatusCode::OK, response) -} - /// Safekeeper http router. pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder { let mut router = endpoint::make_router(); @@ -568,6 +610,9 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder failpoints_handler(r, cancel).await }) }) + .delete("/v1/tenant/:tenant_id", |r| { + request_span(r, tenant_delete_handler) + }) // Will be used in the future instead of implicit timeline creation .post("/v1/tenant/timeline", |r| { request_span(r, timeline_create_handler) @@ -581,16 +626,13 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder .delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| { request_span(r, timeline_delete_handler) }) - .delete("/v1/tenant/:tenant_id", |r| { - request_span(r, tenant_delete_handler) + .post("/v1/pull_timeline", |r| { + request_span(r, timeline_pull_handler) }) .get( "/v1/tenant/:tenant_id/timeline/:timeline_id/snapshot/:destination_id", |r| request_span(r, timeline_snapshot_handler), ) - .post("/v1/pull_timeline", |r| { - request_span(r, timeline_pull_handler) - }) .post( "/v1/tenant/:tenant_id/timeline/:source_timeline_id/copy", |r| request_span(r, timeline_copy_handler), @@ -603,14 +645,21 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder "/v1/tenant/:tenant_id/timeline/:timeline_id/checkpoint", |r| request_span(r, 
timeline_checkpoint_handler), ) - // for tests + .get("/v1/tenant/:tenant_id/timeline/:timeline_id/digest", |r| { + request_span(r, timeline_digest_handler) + }) + .post( + "/v1/tenant/:tenant_id/timeline/:timeline_id/backup_partial_reset", + |r| request_span(r, timeline_backup_partial_reset), + ) + .post( + "/v1/tenant/:tenant_id/timeline/:timeline_id/term_bump", + |r| request_span(r, timeline_term_bump_handler), + ) .post("/v1/record_safekeeper_info/:tenant_id/:timeline_id", |r| { request_span(r, record_safekeeper_info) }) .get("/v1/debug_dump", |r| request_span(r, dump_debug_handler)) - .get("/v1/tenant/:tenant_id/timeline/:timeline_id/digest", |r| { - request_span(r, timeline_digest_handler) - }) } #[cfg(test)] diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs index 1eacec9981..64585f5edc 100644 --- a/safekeeper/src/pull_timeline.rs +++ b/safekeeper/src/pull_timeline.rs @@ -183,10 +183,10 @@ impl WalResidentTimeline { "Replacing uploaded partial segment in in-mem control file: {replace:?}" ); - let remote_timeline_path = wal_backup::remote_timeline_path(&self.tli.ttid)?; + let remote_timeline_path = &self.tli.remote_path; wal_backup::copy_partial_segment( - &replace.previous.remote_path(&remote_timeline_path), - &replace.current.remote_path(&remote_timeline_path), + &replace.previous.remote_path(remote_timeline_path), + &replace.current.remote_path(remote_timeline_path), ) .await?; } @@ -484,6 +484,7 @@ pub async fn validate_temp_timeline( } /// Move timeline from a temp directory to the main storage, and load it to the global map. +/// /// This operation is done under a lock to prevent bugs if several concurrent requests are /// trying to load the same timeline. Note that it doesn't guard against creating the /// timeline with the same ttid, but no one should be doing this anyway. 
diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index ab8c76dc17..e35f806e90 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -448,8 +448,10 @@ async fn network_write( const KEEPALIVE_INTERVAL: Duration = Duration::from_secs(1); /// Encapsulates a task which takes messages from msg_rx, processes and pushes -/// replies to reply_tx; reading from socket and writing to disk in parallel is -/// beneficial for performance, this struct provides writing to disk part. +/// replies to reply_tx. +/// +/// Reading from socket and writing to disk in parallel is beneficial for +/// performance, this struct provides the writing to disk part. pub struct WalAcceptor { tli: WalResidentTimeline, msg_rx: Receiver, diff --git a/safekeeper/src/recovery.rs b/safekeeper/src/recovery.rs index a59ff07b96..9c4149d8f1 100644 --- a/safekeeper/src/recovery.rs +++ b/safekeeper/src/recovery.rs @@ -35,7 +35,7 @@ use crate::{ /// Entrypoint for per timeline task which always runs, checking whether /// recovery for this safekeeper is needed and starting it if so. -#[instrument(name = "recovery task", skip_all, fields(ttid = %tli.ttid))] +#[instrument(name = "recovery", skip_all, fields(ttid = %tli.ttid))] pub async fn recovery_main(tli: WalResidentTimeline, conf: SafeKeeperConf) { info!("started"); diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 0814d9ba67..b3e006ab05 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -875,6 +875,29 @@ where return Ok(Some(AcceptorProposerMessage::AppendResponse(resp))); } + // Disallow any non-sequential writes, which can result in gaps or + // overwrites. If we need to move the pointer, ProposerElected message + // should have truncated WAL first accordingly. 
Note that the first + // condition (WAL rewrite) is quite expected in real world; it happens + // when walproposer reconnects to safekeeper and writes some more data + // while first connection still gets some packets later. It might be + // better to not log this as error! above. + let write_lsn = self.wal_store.write_lsn(); + if write_lsn > msg.h.begin_lsn { + bail!( + "append request rewrites WAL written before, write_lsn={}, msg lsn={}", + write_lsn, + msg.h.begin_lsn + ); + } + if write_lsn < msg.h.begin_lsn && write_lsn != Lsn(0) { + bail!( + "append request creates gap in written WAL, write_lsn={}, msg lsn={}", + write_lsn, + msg.h.begin_lsn, + ); + } + // Now we know that we are in the same term as the proposer, // processing the message. @@ -915,8 +938,9 @@ where } trace!( - "processed AppendRequest of len {}, end_lsn={:?}, commit_lsn={:?}, truncate_lsn={:?}, flushed={:?}", + "processed AppendRequest of len {}, begin_lsn={}, end_lsn={:?}, commit_lsn={:?}, truncate_lsn={:?}, flushed={:?}", msg.wal_data.len(), + msg.h.begin_lsn, msg.h.end_lsn, msg.h.commit_lsn, msg.h.truncate_lsn, @@ -960,10 +984,7 @@ mod tests { use postgres_ffi::{XLogSegNo, WAL_SEGMENT_SIZE}; use super::*; - use crate::{ - state::{EvictionState, PersistedPeers, TimelinePersistentState}, - wal_storage::Storage, - }; + use crate::state::{EvictionState, PersistedPeers, TimelinePersistentState}; use std::{ops::Deref, str::FromStr, time::Instant}; // fake storage for tests @@ -971,7 +992,6 @@ mod tests { persisted_state: TimelinePersistentState, } - #[async_trait::async_trait] impl control_file::Storage for InMemoryState { async fn persist(&mut self, s: &TimelinePersistentState) -> Result<()> { self.persisted_state = s.clone(); @@ -1003,8 +1023,11 @@ mod tests { lsn: Lsn, } - #[async_trait::async_trait] impl wal_storage::Storage for DummyWalStore { + fn write_lsn(&self) -> Lsn { + self.lsn + } + fn flush_lsn(&self) -> Lsn { self.lsn } @@ -1078,7 +1101,7 @@ mod tests { let mut sk = 
SafeKeeper::new(TimelineState::new(storage), wal_store, NodeId(0)).unwrap(); let mut ar_hdr = AppendRequestHeader { - term: 1, + term: 2, term_start_lsn: Lsn(3), begin_lsn: Lsn(1), end_lsn: Lsn(2), @@ -1092,24 +1115,29 @@ mod tests { }; let pem = ProposerElected { - term: 1, - start_streaming_at: Lsn(3), - term_history: TermHistory(vec![TermLsn { - term: 1, - lsn: Lsn(3), - }]), - timeline_start_lsn: Lsn(0), + term: 2, + start_streaming_at: Lsn(1), + term_history: TermHistory(vec![ + TermLsn { + term: 1, + lsn: Lsn(1), + }, + TermLsn { + term: 2, + lsn: Lsn(3), + }, + ]), + timeline_start_lsn: Lsn(1), }; sk.process_msg(&ProposerAcceptorMessage::Elected(pem)) .await .unwrap(); // check that AppendRequest before term_start_lsn doesn't switch last_log_term. - let resp = sk - .process_msg(&ProposerAcceptorMessage::AppendRequest(append_request)) - .await; - assert!(resp.is_ok()); - assert_eq!(sk.get_last_log_term(), 0); + sk.process_msg(&ProposerAcceptorMessage::AppendRequest(append_request)) + .await + .unwrap(); + assert_eq!(sk.get_last_log_term(), 1); // but record at term_start_lsn does the switch ar_hdr.begin_lsn = Lsn(2); @@ -1118,12 +1146,63 @@ mod tests { h: ar_hdr, wal_data: Bytes::from_static(b"b"), }; - let resp = sk - .process_msg(&ProposerAcceptorMessage::AppendRequest(append_request)) - .await; - assert!(resp.is_ok()); - sk.wal_store.truncate_wal(Lsn(3)).await.unwrap(); // imitate the complete record at 3 %) - assert_eq!(sk.get_last_log_term(), 1); + sk.process_msg(&ProposerAcceptorMessage::AppendRequest(append_request)) + .await + .unwrap(); + assert_eq!(sk.get_last_log_term(), 2); + } + + #[tokio::test] + async fn test_non_consecutive_write() { + let storage = InMemoryState { + persisted_state: test_sk_state(), + }; + let wal_store = DummyWalStore { lsn: Lsn(0) }; + + let mut sk = SafeKeeper::new(TimelineState::new(storage), wal_store, NodeId(0)).unwrap(); + + let pem = ProposerElected { + term: 1, + start_streaming_at: Lsn(1), + term_history: 
TermHistory(vec![TermLsn { + term: 1, + lsn: Lsn(1), + }]), + timeline_start_lsn: Lsn(1), + }; + sk.process_msg(&ProposerAcceptorMessage::Elected(pem)) + .await + .unwrap(); + + let ar_hdr = AppendRequestHeader { + term: 1, + term_start_lsn: Lsn(3), + begin_lsn: Lsn(1), + end_lsn: Lsn(2), + commit_lsn: Lsn(0), + truncate_lsn: Lsn(0), + proposer_uuid: [0; 16], + }; + let append_request = AppendRequest { + h: ar_hdr.clone(), + wal_data: Bytes::from_static(b"b"), + }; + + // do write ending at 2, it should be ok + sk.process_msg(&ProposerAcceptorMessage::AppendRequest(append_request)) + .await + .unwrap(); + let mut ar_hrd2 = ar_hdr.clone(); + ar_hrd2.begin_lsn = Lsn(4); + ar_hrd2.end_lsn = Lsn(5); + let append_request = AppendRequest { + h: ar_hdr, + wal_data: Bytes::from_static(b"b"), + }; + // and now starting at 4, it must fail + sk.process_msg(&ProposerAcceptorMessage::AppendRequest(append_request)) + .await + .unwrap_err(); } #[test] diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 90b1604adb..6d677f405a 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -758,9 +758,8 @@ impl ReplyReader { // pq_sendint32(&reply_message, xmin); // pq_sendint32(&reply_message, xmin_epoch); // So it is two big endian 32-bit words in low endian order! - hs_feedback.xmin = (hs_feedback.xmin >> 32) | (hs_feedback.xmin << 32); - hs_feedback.catalog_xmin = - (hs_feedback.catalog_xmin >> 32) | (hs_feedback.catalog_xmin << 32); + hs_feedback.xmin = hs_feedback.xmin.rotate_left(32); + hs_feedback.catalog_xmin = hs_feedback.catalog_xmin.rotate_left(32); self.ws_guard .walsenders .record_hs_feedback(self.ws_guard.id, &hs_feedback); diff --git a/safekeeper/src/state.rs b/safekeeper/src/state.rs index dca6414082..8ae749ded5 100644 --- a/safekeeper/src/state.rs +++ b/safekeeper/src/state.rs @@ -1,9 +1,10 @@ //! Defines per timeline data stored persistently (SafeKeeperPersistentState) //! and its wrapper with in memory layer (SafekeeperState). 
-use std::ops::Deref; +use std::{cmp::max, ops::Deref}; use anyhow::Result; +use safekeeper_api::models::TimelineTermBumpResponse; use serde::{Deserialize, Serialize}; use utils::{ id::{NodeId, TenantId, TenantTimelineId, TimelineId}, @@ -12,7 +13,7 @@ use utils::{ use crate::{ control_file, - safekeeper::{AcceptorState, PersistedPeerInfo, PgUuid, ServerInfo, TermHistory}, + safekeeper::{AcceptorState, PersistedPeerInfo, PgUuid, ServerInfo, Term, TermHistory}, wal_backup_partial::{self}, }; @@ -147,9 +148,11 @@ pub struct TimelineMemState { pub proposer_uuid: PgUuid, } -/// Safekeeper persistent state plus in memory layer, to avoid frequent fsyncs -/// when we update fields like commit_lsn which don't need immediate -/// persistence. Provides transactional like API to atomically update the state. +/// Safekeeper persistent state plus in memory layer. +/// +/// Allows us to avoid frequent fsyncs when we update fields like commit_lsn +/// which don't need immediate persistence. Provides transactional like API +/// to atomically update the state. /// /// Implements Deref into *persistent* part. pub struct TimelineState { @@ -209,6 +212,27 @@ where let s = self.start_change(); self.finish_change(&s).await } + + /// Make term at least as `to`. If `to` is None, increment current one. This + /// is not in safekeeper.rs because we want to be able to do it even if + /// timeline is offloaded. 
+ pub async fn term_bump(&mut self, to: Option) -> Result { + let before = self.acceptor_state.term; + let mut state = self.start_change(); + let new = match to { + Some(to) => max(state.acceptor_state.term, to), + None => state.acceptor_state.term + 1, + }; + if new > state.acceptor_state.term { + state.acceptor_state.term = new; + self.finish_change(&state).await?; + } + let after = self.acceptor_state.term; + Ok(TimelineTermBumpResponse { + previous_term: before, + current_term: after, + }) + } } impl Deref for TimelineState diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 57935d879f..fb98534768 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -3,6 +3,8 @@ use anyhow::{anyhow, bail, Result}; use camino::Utf8PathBuf; +use remote_storage::RemotePath; +use safekeeper_api::models::TimelineTermBumpResponse; use serde::{Deserialize, Serialize}; use tokio::fs::{self}; use tokio_util::sync::CancellationToken; @@ -36,7 +38,7 @@ use crate::state::{EvictionState, TimelineMemState, TimelinePersistentState, Tim use crate::timeline_guard::ResidenceGuard; use crate::timeline_manager::{AtomicStatus, ManagerCtl}; use crate::timelines_set::TimelinesSet; -use crate::wal_backup::{self}; +use crate::wal_backup::{self, remote_timeline_path}; use crate::wal_backup_partial::PartialRemoteSegment; use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION}; @@ -168,6 +170,7 @@ impl<'a> Drop for WriteGuardSharedState<'a> { } /// This structure is stored in shared state and represents the state of the timeline. +/// /// Usually it holds SafeKeeper, but it also supports offloaded timeline state. In this /// case, SafeKeeper is not available (because WAL is not present on disk) and all /// operations can be done only with control file. 
@@ -213,6 +216,10 @@ impl StateSK { .get_last_log_term(self.flush_lsn()) } + pub async fn term_bump(&mut self, to: Option) -> Result { + self.state_mut().term_bump(to).await + } + /// Close open WAL files to release FDs. fn close_wal_store(&mut self) { if let StateSK::Loaded(sk) = self { @@ -469,6 +476,7 @@ impl From for ApiError { /// It also holds SharedState and provides mutually exclusive access to it. pub struct Timeline { pub ttid: TenantTimelineId, + pub remote_path: RemotePath, /// Used to broadcast commit_lsn updates to all background jobs. commit_lsn_watch_tx: watch::Sender, @@ -519,8 +527,10 @@ impl Timeline { let (shared_state_version_tx, shared_state_version_rx) = watch::channel(0); let walreceivers = WalReceivers::new(); + let remote_path = remote_timeline_path(&ttid)?; Ok(Timeline { ttid, + remote_path, commit_lsn_watch_tx, commit_lsn_watch_rx, term_flush_lsn_watch_tx, @@ -557,8 +567,10 @@ impl Timeline { TimelinePersistentState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn); let walreceivers = WalReceivers::new(); + let remote_path = remote_timeline_path(&ttid)?; Ok(Timeline { ttid, + remote_path, commit_lsn_watch_tx, commit_lsn_watch_rx, term_flush_lsn_watch_tx, @@ -847,6 +859,11 @@ impl Timeline { Ok(res) } + pub async fn term_bump(self: &Arc, to: Option) -> Result { + let mut state = self.write_shared_state().await; + state.sk.term_bump(to).await + } + /// Get the timeline guard for reading/writing WAL files. /// If WAL files are not present on disk (evicted), they will be automatically /// downloaded from remote storage. This is done in the manager task, which is @@ -902,6 +919,10 @@ impl Timeline { Ok(WalResidentTimeline::new(self.clone(), guard)) } + + pub async fn backup_partial_reset(self: &Arc) -> Result> { + self.manager_ctl.backup_partial_reset().await + } } /// This is a guard that allows to read/write disk timeline state. 
diff --git a/safekeeper/src/timeline_eviction.rs b/safekeeper/src/timeline_eviction.rs index ae6f3f4b7e..5aa4921a92 100644 --- a/safekeeper/src/timeline_eviction.rs +++ b/safekeeper/src/timeline_eviction.rs @@ -1,6 +1,8 @@ -//! Code related to evicting WAL files to remote storage. The actual upload is done by the -//! partial WAL backup code. This file has code to delete and re-download WAL files, -//! cross-validate with partial WAL backup if local file is still present. +//! Code related to evicting WAL files to remote storage. +//! +//! The actual upload is done by the partial WAL backup code. This file has +//! code to delete and re-download WAL files, cross-validate with partial WAL +//! backup if local file is still present. use anyhow::Context; use camino::Utf8PathBuf; @@ -28,28 +30,38 @@ impl Manager { /// - control file is flushed (no next event scheduled) /// - no WAL residence guards /// - no pushes to the broker - /// - partial WAL backup is uploaded + /// - last partial WAL segment is uploaded + /// - all local segments before the uploaded partial are committed and uploaded pub(crate) fn ready_for_eviction( &self, next_event: &Option, state: &StateSnapshot, ) -> bool { - self.backup_task.is_none() + let ready = self.backup_task.is_none() && self.recovery_task.is_none() && self.wal_removal_task.is_none() && self.partial_backup_task.is_none() - && self.partial_backup_uploaded.is_some() && next_event.is_none() && self.access_service.is_empty() && !self.tli_broker_active.get() + // Partial segment of current flush_lsn is uploaded up to this flush_lsn. && !wal_backup_partial::needs_uploading(state, &self.partial_backup_uploaded) + // And it is the next one after the last removed. Given that local + // WAL is removed only after it is uploaded to s3 (and pageserver + // advancing remote_consistent_lsn) which happens only after WAL is + // committed, true means all this is done. 
+ // + // This also works for the first segment despite last_removed_segno + // being 0 on init because this 0 triggers run of wal_removal_task + // on success of which manager updates the horizon. && self .partial_backup_uploaded .as_ref() .unwrap() .flush_lsn .segment_number(self.wal_seg_size) - == self.last_removed_segno + 1 + == self.last_removed_segno + 1; + ready } /// Evict the timeline to remote storage. @@ -83,7 +95,8 @@ impl Manager { info!("successfully evicted timeline"); } - /// Restore evicted timeline from remote storage. + /// Attempt to restore evicted timeline from remote storage; it must be + /// offloaded. #[instrument(name = "unevict_timeline", skip_all)] pub(crate) async fn unevict_timeline(&mut self) { assert!(self.is_offloaded); @@ -167,7 +180,7 @@ async fn redownload_partial_segment( partial: &PartialRemoteSegment, ) -> anyhow::Result<()> { let tmp_file = mgr.tli.timeline_dir().join("remote_partial.tmp"); - let remote_segfile = remote_segment_path(mgr, partial)?; + let remote_segfile = remote_segment_path(mgr, partial); debug!( "redownloading partial segment: {} -> {}", @@ -252,7 +265,7 @@ async fn do_validation( ); } - let remote_segfile = remote_segment_path(mgr, partial)?; + let remote_segfile = remote_segment_path(mgr, partial); let mut remote_reader: std::pin::Pin> = wal_backup::read_object(&remote_segfile, 0).await?; @@ -279,12 +292,8 @@ fn local_segment_path(mgr: &Manager, partial: &PartialRemoteSegment) -> Utf8Path local_partial_segfile } -fn remote_segment_path( - mgr: &Manager, - partial: &PartialRemoteSegment, -) -> anyhow::Result { - let remote_timeline_path = wal_backup::remote_timeline_path(&mgr.tli.ttid)?; - Ok(partial.remote_path(&remote_timeline_path)) +fn remote_segment_path(mgr: &Manager, partial: &PartialRemoteSegment) -> RemotePath { + partial.remote_path(&mgr.tli.remote_path) } /// Compare first `n` bytes of two readers. If the bytes differ, return an error. 
diff --git a/safekeeper/src/timeline_guard.rs b/safekeeper/src/timeline_guard.rs index dbdf46412d..1ddac573d2 100644 --- a/safekeeper/src/timeline_guard.rs +++ b/safekeeper/src/timeline_guard.rs @@ -1,4 +1,6 @@ -//! Timeline residence guard is needed to ensure that WAL segments are present on disk, +//! Timeline residence guard +//! +//! It is needed to ensure that WAL segments are present on disk, //! as long as the code is holding the guard. This file implements guard logic, to issue //! and drop guards, and to notify the manager when the guard is dropped. diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs index 482614fac7..6be75479db 100644 --- a/safekeeper/src/timeline_manager.rs +++ b/safekeeper/src/timeline_manager.rs @@ -1,4 +1,5 @@ //! The timeline manager task is responsible for managing the timeline's background tasks. +//! //! It is spawned alongside each timeline and exits when the timeline is deleted. //! It watches for changes in the timeline state and decides when to spawn or kill background tasks. //! It also can manage some reactive state, like should the timeline be active for broker pushes or not. @@ -11,12 +12,14 @@ use std::{ time::Duration, }; +use futures::channel::oneshot; use postgres_ffi::XLogSegNo; use serde::{Deserialize, Serialize}; use tokio::{ task::{JoinError, JoinHandle}, time::Instant, }; +use tokio_util::sync::CancellationToken; use tracing::{debug, info, info_span, instrument, warn, Instrument}; use utils::lsn::Lsn; @@ -33,7 +36,7 @@ use crate::{ timeline_guard::{AccessService, GuardId, ResidenceGuard}, timelines_set::{TimelineSetGuard, TimelinesSet}, wal_backup::{self, WalBackupTaskHandle}, - wal_backup_partial::{self, PartialRemoteSegment}, + wal_backup_partial::{self, PartialBackup, PartialRemoteSegment}, SafeKeeperConf, }; @@ -96,6 +99,8 @@ pub enum ManagerCtlMessage { GuardRequest(tokio::sync::oneshot::Sender>), /// Request to drop the guard. 
GuardDrop(GuardId), + /// Request to reset uploaded partial backup state. + BackupPartialReset(oneshot::Sender>>), } impl std::fmt::Debug for ManagerCtlMessage { @@ -103,6 +108,7 @@ impl std::fmt::Debug for ManagerCtlMessage { match self { ManagerCtlMessage::GuardRequest(_) => write!(f, "GuardRequest"), ManagerCtlMessage::GuardDrop(id) => write!(f, "GuardDrop({:?})", id), + ManagerCtlMessage::BackupPartialReset(_) => write!(f, "BackupPartialReset"), } } } @@ -143,6 +149,19 @@ impl ManagerCtl { .and_then(std::convert::identity) } + /// Request timeline manager to reset uploaded partial segment state and + /// wait for the result. + pub async fn backup_partial_reset(&self) -> anyhow::Result> { + let (tx, rx) = oneshot::channel(); + self.manager_tx + .send(ManagerCtlMessage::BackupPartialReset(tx)) + .expect("manager task is not running"); + match rx.await { + Ok(res) => res, + Err(_) => anyhow::bail!("timeline manager is gone"), + } + } + /// Must be called exactly once to bootstrap the manager. pub fn bootstrap_manager( &self, @@ -181,7 +200,8 @@ pub(crate) struct Manager { pub(crate) wal_removal_task: Option>>, // partial backup - pub(crate) partial_backup_task: Option>>, + pub(crate) partial_backup_task: + Option<(JoinHandle>, CancellationToken)>, pub(crate) partial_backup_uploaded: Option, // misc @@ -302,12 +322,12 @@ pub async fn main_task( _ = sleep_until(&next_event) => { // we were waiting for some event (e.g. 
cfile save) } - res = await_task_finish(&mut mgr.wal_removal_task) => { + res = await_task_finish(mgr.wal_removal_task.as_mut()) => { // WAL removal task finished mgr.wal_removal_task = None; mgr.update_wal_removal_end(res); } - res = await_task_finish(&mut mgr.partial_backup_task) => { + res = await_task_finish(mgr.partial_backup_task.as_mut().map(|(handle, _)| handle)) => { // partial backup task finished mgr.partial_backup_task = None; mgr.update_partial_backup_end(res); @@ -335,8 +355,9 @@ pub async fn main_task( } } - if let Some(partial_backup_task) = &mut mgr.partial_backup_task { - if let Err(e) = partial_backup_task.await { + if let Some((handle, cancel)) = &mut mgr.partial_backup_task { + cancel.cancel(); + if let Err(e) = handle.await { warn!("partial backup task failed: {:?}", e); } } @@ -560,11 +581,14 @@ impl Manager { } // Get WalResidentTimeline and start partial backup task. - self.partial_backup_task = Some(tokio::spawn(wal_backup_partial::main_task( + let cancel = CancellationToken::new(); + let handle = tokio::spawn(wal_backup_partial::main_task( self.wal_resident_timeline(), self.conf.clone(), self.global_rate_limiter.clone(), - ))); + cancel.clone(), + )); + self.partial_backup_task = Some((handle, cancel)); } /// Update the state after partial WAL backup task finished. @@ -579,6 +603,39 @@ impl Manager { } } + /// Reset partial backup state and remove its remote storage data. Since the + /// task might be concurrently uploading something, cancel it first. + async fn backup_partial_reset(&mut self) -> anyhow::Result> { + info!("resetting partial backup state"); + // Force unevict timeline if it is evicted before erasing partial backup + // state. The intended use of this function is to drop corrupted remote + // state; we haven't enabled local files deletion yet anywhere, + // so direct switch is safe. 
+ if self.is_offloaded { + self.tli.switch_to_present().await?; + // switch manager state as soon as possible + self.is_offloaded = false; + } + + if let Some((handle, cancel)) = &mut self.partial_backup_task { + cancel.cancel(); + info!("cancelled partial backup task, awaiting it"); + // we're going to reset .partial_backup_uploaded to None anyway, so ignore the result + handle.await.ok(); + self.partial_backup_task = None; + } + + let tli = self.wal_resident_timeline(); + let mut partial_backup = PartialBackup::new(tli, self.conf.clone()).await; + // Reset might fail e.g. when cfile is already reset but s3 removal + // failed, so set manager state to None beforehand. In any case caller + // is expected to retry until success. + self.partial_backup_uploaded = None; + let res = partial_backup.reset().await?; + info!("reset is done"); + Ok(res) + } + /// Handle message arrived from ManagerCtl. async fn handle_message(&mut self, msg: Option) { debug!("received manager message: {:?}", msg); @@ -602,6 +659,16 @@ impl Manager { Some(ManagerCtlMessage::GuardDrop(guard_id)) => { self.access_service.drop_guard(guard_id); } + Some(ManagerCtlMessage::BackupPartialReset(tx)) => { + info!("resetting uploaded partial backup state"); + let res = self.backup_partial_reset().await; + if let Err(ref e) = res { + warn!("failed to reset partial backup state: {:?}", e); + } + if tx.send(res).is_err() { + warn!("failed to send partial backup reset result, receiver dropped"); + } + } None => { // can't happen, we're holding the sender unreachable!(); @@ -619,7 +686,11 @@ async fn sleep_until(option: &Option) { } } -async fn await_task_finish(option: &mut Option>) -> Result { +/// Future that resolves when the task is finished or never if the task is None. +/// +/// Note: it accepts Option<&mut> instead of &mut Option<> because mapping the +/// option to get the latter is hard. 
+async fn await_task_finish(option: Option<&mut JoinHandle>) -> Result { if let Some(task) = option { task.await } else { diff --git a/safekeeper/src/timelines_set.rs b/safekeeper/src/timelines_set.rs index d6eea79f82..096e348295 100644 --- a/safekeeper/src/timelines_set.rs +++ b/safekeeper/src/timelines_set.rs @@ -60,7 +60,8 @@ impl TimelinesSet { } } -/// Guard is used to add or remove timeline from the set. +/// Guard is used to add or remove timelines from the set. +/// /// If the timeline present in set, it will be removed from it on drop. /// Note: do not use more than one guard for the same timeline, it caches the presence state. /// It is designed to be used in the manager task only. diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index aa1a6696a1..ef26ac99c5 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -203,7 +203,7 @@ struct WalBackupTask { } /// Offload single timeline. -#[instrument(name = "WAL backup", skip_all, fields(ttid = %tli.ttid))] +#[instrument(name = "wal_backup", skip_all, fields(ttid = %tli.ttid))] async fn backup_task_main( tli: WalResidentTimeline, parallel_jobs: usize, @@ -315,7 +315,7 @@ async fn backup_lsn_range( anyhow::bail!("parallel_jobs must be >= 1"); } - let remote_timeline_path = remote_timeline_path(&timeline.ttid)?; + let remote_timeline_path = &timeline.remote_path; let start_lsn = *backup_lsn; let segments = get_segments(start_lsn, end_lsn, wal_seg_size); @@ -328,11 +328,7 @@ async fn backup_lsn_range( loop { let added_task = match iter.next() { Some(s) => { - uploads.push_back(backup_single_segment( - s, - timeline_dir, - &remote_timeline_path, - )); + uploads.push_back(backup_single_segment(s, timeline_dir, remote_timeline_path)); true } None => false, diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs index 675a051887..bddfca50e4 100644 --- a/safekeeper/src/wal_backup_partial.rs +++ b/safekeeper/src/wal_backup_partial.rs @@ 
-1,6 +1,8 @@ //! Safekeeper timeline has a background task which is subscribed to `commit_lsn` -//! and `flush_lsn` updates. After the partial segment was updated (`flush_lsn` -//! was changed), the segment will be uploaded to S3 in about 15 minutes. +//! and `flush_lsn` updates. +//! +//! After the partial segment was updated (`flush_lsn` was changed), the segment +//! will be uploaded to S3 within the configured `partial_backup_timeout`. //! //! The filename format for partial segments is //! `Segment_Term_Flush_Commit_skNN.partial`, where: @@ -22,6 +24,7 @@ use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI}; use remote_storage::RemotePath; use serde::{Deserialize, Serialize}; +use tokio_util::sync::CancellationToken; use tracing::{debug, error, info, instrument, warn}; use utils::{id::NodeId, lsn::Lsn}; @@ -31,7 +34,7 @@ use crate::{ safekeeper::Term, timeline::WalResidentTimeline, timeline_manager::StateSnapshot, - wal_backup::{self, remote_timeline_path}, + wal_backup::{self}, SafeKeeperConf, }; @@ -145,7 +148,7 @@ impl State { } } -struct PartialBackup { +pub struct PartialBackup { wal_seg_size: usize, tli: WalResidentTimeline, conf: SafeKeeperConf, @@ -155,8 +158,25 @@ struct PartialBackup { state: State, } -// Read-only methods for getting segment names impl PartialBackup { + pub async fn new(tli: WalResidentTimeline, conf: SafeKeeperConf) -> PartialBackup { + let (_, persistent_state) = tli.get_state().await; + let wal_seg_size = tli.get_wal_seg_size().await; + + let local_prefix = tli.get_timeline_dir(); + let remote_timeline_path = tli.remote_path.clone(); + + PartialBackup { + wal_seg_size, + tli, + state: persistent_state.partial_backup, + conf, + local_prefix, + remote_timeline_path, + } + } + + // Read-only methods for getting segment names fn segno(&self, lsn: Lsn) -> XLogSegNo { lsn.segment_number(self.wal_seg_size) } @@ -297,6 +317,18 @@ impl PartialBackup { Ok(()) } + // Prepend to the given segments remote prefix and delete them from the + // 
remote storage. + async fn delete_segments(&self, segments_to_delete: &Vec) -> anyhow::Result<()> { + info!("deleting objects: {:?}", segments_to_delete); + let mut objects_to_delete = vec![]; + for seg in segments_to_delete.iter() { + let remote_path = self.remote_timeline_path.join(seg); + objects_to_delete.push(remote_path); + } + wal_backup::delete_objects(&objects_to_delete).await + } + /// Delete all non-Uploaded segments from the remote storage. There should be only one /// Uploaded segment at a time. #[instrument(name = "gc", skip_all)] @@ -329,15 +361,8 @@ impl PartialBackup { ); } - info!("deleting objects: {:?}", segments_to_delete); - let mut objects_to_delete = vec![]; - for seg in segments_to_delete.iter() { - let remote_path = self.remote_timeline_path.join(seg); - objects_to_delete.push(remote_path); - } - - // removing segments from remote storage - wal_backup::delete_objects(&objects_to_delete).await?; + // execute the deletion + self.delete_segments(&segments_to_delete).await?; // now we can update the state on disk let new_state = { @@ -349,6 +374,27 @@ impl PartialBackup { Ok(()) } + + /// Remove uploaded segment(s) from the state and remote storage. Intended for + /// manual intervention, not normally needed. + /// Returns list of segments which potentially existed in the remote storage. + pub async fn reset(&mut self) -> anyhow::Result> { + let segments_to_delete = self + .state + .segments + .iter() + .map(|seg| seg.name.clone()) + .collect(); + + // First reset cfile state, and only then objects themselves. If the + // latter fails we might leave some garbage behind; that's ok for this + // one-off usage. + let new_state = State { segments: vec![] }; + self.commit_state(new_state).await?; + + self.delete_segments(&segments_to_delete).await?; + Ok(segments_to_delete) + } } /// Check if everything is uploaded and partial backup task doesn't need to run. 
@@ -372,38 +418,21 @@ pub(crate) fn needs_uploading( /// /// When there is nothing more to do and the last segment was successfully uploaded, the task /// returns PartialRemoteSegment, to signal readiness for offloading the timeline. -#[instrument(name = "Partial backup", skip_all, fields(ttid = %tli.ttid))] +#[instrument(name = "partial_backup", skip_all, fields(ttid = %tli.ttid))] pub async fn main_task( tli: WalResidentTimeline, conf: SafeKeeperConf, limiter: RateLimiter, + cancel: CancellationToken, ) -> Option { debug!("started"); let await_duration = conf.partial_backup_timeout; let mut first_iteration = true; - let (_, persistent_state) = tli.get_state().await; let mut commit_lsn_rx = tli.get_commit_lsn_watch_rx(); let mut flush_lsn_rx = tli.get_term_flush_lsn_watch_rx(); - let wal_seg_size = tli.get_wal_seg_size().await; - let local_prefix = tli.get_timeline_dir(); - let remote_timeline_path = match remote_timeline_path(&tli.ttid) { - Ok(path) => path, - Err(e) => { - error!("failed to create remote path: {:?}", e); - return None; - } - }; - - let mut backup = PartialBackup { - wal_seg_size, - tli, - state: persistent_state.partial_backup, - conf, - local_prefix, - remote_timeline_path, - }; + let mut backup = PartialBackup::new(tli, conf).await; debug!("state: {:?}", backup.state); @@ -433,6 +462,10 @@ pub async fn main_task( && flush_lsn_rx.borrow().term == seg.term { // we have nothing to do, the last segment is already uploaded + debug!( + "exiting, uploaded up to term={} flush_lsn={} commit_lsn={}", + seg.term, seg.flush_lsn, seg.commit_lsn + ); return Some(seg.clone()); } } @@ -444,6 +477,10 @@ pub async fn main_task( info!("timeline canceled"); return None; } + _ = cancel.cancelled() => { + info!("task canceled"); + return None; + } _ = flush_lsn_rx.changed() => {} } } @@ -470,6 +507,10 @@ pub async fn main_task( info!("timeline canceled"); return None; } + _ = cancel.cancelled() => { + info!("task canceled"); + return None; + } _ = 
commit_lsn_rx.changed() => {} _ = flush_lsn_rx.changed() => { let segno = backup.segno(flush_lsn_rx.borrow().lsn); @@ -492,7 +533,13 @@ pub async fn main_task( } // limit concurrent uploads - let _upload_permit = limiter.acquire_partial_backup().await; + let _upload_permit = tokio::select! { + acq = limiter.acquire_partial_backup() => acq, + _ = cancel.cancelled() => { + info!("task canceled"); + return None; + } + }; let prepared = backup.prepare_upload().await; if let Some(seg) = &uploaded_segment { diff --git a/safekeeper/src/wal_service.rs b/safekeeper/src/wal_service.rs index 16f7748eb4..1ab54d4cce 100644 --- a/safekeeper/src/wal_service.rs +++ b/safekeeper/src/wal_service.rs @@ -17,6 +17,7 @@ use crate::SafeKeeperConf; use postgres_backend::{AuthType, PostgresBackend}; /// Accept incoming TCP connections and spawn them into a background thread. +/// /// allowed_auth_scope is either SafekeeperData (wide JWT tokens giving access /// to any tenant are allowed) or Tenant (only tokens giving access to specific /// tenant are allowed). Doesn't matter if auth is disabled in conf. diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index ded8571a3e..46c260901d 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -15,6 +15,7 @@ use postgres_ffi::v14::xlog_utils::{IsPartialXLogFileName, IsXLogFileName, XLogF use postgres_ffi::{dispatch_pgversion, XLogSegNo, PG_TLI}; use remote_storage::RemotePath; use std::cmp::{max, min}; +use std::future::Future; use std::io::{self, SeekFrom}; use std::pin::Pin; use tokio::fs::{self, remove_file, File, OpenOptions}; @@ -35,8 +36,9 @@ use postgres_ffi::XLOG_BLCKSZ; use pq_proto::SystemId; use utils::{id::TenantTimelineId, lsn::Lsn}; -#[async_trait::async_trait] pub trait Storage { + // Last written LSN. + fn write_lsn(&self) -> Lsn; /// LSN of last durably stored WAL record. 
fn flush_lsn(&self) -> Lsn; @@ -44,16 +46,19 @@ pub trait Storage { /// the segment and short header at the page of given LSN. This is only used /// for timeline initialization because compute will stream data only since /// init_lsn. Other segment headers are included in compute stream. - async fn initialize_first_segment(&mut self, init_lsn: Lsn) -> Result<()>; + fn initialize_first_segment( + &mut self, + init_lsn: Lsn, + ) -> impl Future> + Send; /// Write piece of WAL from buf to disk, but not necessarily sync it. - async fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()>; + fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> impl Future> + Send; /// Truncate WAL at specified LSN, which must be the end of WAL record. - async fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()>; + fn truncate_wal(&mut self, end_pos: Lsn) -> impl Future> + Send; /// Durably store WAL on disk, up to the last written WAL record. - async fn flush_wal(&mut self) -> Result<()>; + fn flush_wal(&mut self) -> impl Future> + Send; /// Remove all segments <= given segno. Returns function doing that as we /// want to perform it without timeline lock. @@ -93,7 +98,19 @@ pub struct PhysicalStorage { /// Also can be ahead of record_lsn, if happen to be in the middle of a WAL record. write_lsn: Lsn, - /// The LSN of the last WAL record written to disk. Still can be not fully flushed. + /// The LSN of the last WAL record written to disk. Still can be not fully + /// flushed. + /// + /// Note: Normally it (and flush_record_lsn) is <= write_lsn, but after xlog + /// switch ingest the reverse is true because we don't bump write_lsn up to + /// the next segment: WAL stream from the compute doesn't have the gap and + /// for simplicity / as a sanity check we disallow any non-sequential + /// writes, so write zeros as is. 
+ /// + /// Similar effect is in theory possible due to LSN alignment: if record + /// ends at *2, decoder will report end lsn as *8 even though we haven't + /// written these zeros yet. In practice compute likely never sends + /// non-aligned chunks of data. write_record_lsn: Lsn, /// The LSN of the last WAL record flushed to disk. @@ -162,8 +179,7 @@ impl PhysicalStorage { ) }; - // TODO: do we really know that write_lsn is fully flushed to disk? - // If not, maybe it's better to call fsync() here to be sure? + // note: this assumes we fsync'ed whole datadir on start. let flush_lsn = write_lsn; debug!( @@ -325,8 +341,11 @@ impl PhysicalStorage { } } -#[async_trait::async_trait] impl Storage for PhysicalStorage { + // Last written LSN. + fn write_lsn(&self) -> Lsn { + self.write_lsn + } /// flush_lsn returns LSN of last durably stored WAL record. fn flush_lsn(&self) -> Lsn { self.flush_record_lsn @@ -432,11 +451,12 @@ impl Storage for PhysicalStorage { .with_label_values(&["truncate_wal"]) .start_timer(); - // Streaming must not create a hole, so truncate cannot be called on non-written lsn - if self.write_lsn != Lsn(0) && end_pos > self.write_lsn { + // Streaming must not create a hole, so truncate cannot be called on + // non-written lsn. 
+ if self.write_record_lsn != Lsn(0) && end_pos > self.write_record_lsn { bail!( - "truncate_wal called on non-written WAL, write_lsn={}, end_pos={}", - self.write_lsn, + "truncate_wal called on non-written WAL, write_record_lsn={}, end_pos={}", + self.write_record_lsn, end_pos ); } diff --git a/safekeeper/tests/walproposer_sim/safekeeper_disk.rs b/safekeeper/tests/walproposer_sim/safekeeper_disk.rs index c2db9de78a..b854754ecf 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper_disk.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper_disk.rs @@ -83,7 +83,6 @@ impl DiskStateStorage { } } -#[async_trait::async_trait] impl control_file::Storage for DiskStateStorage { /// Persist safekeeper state on disk and update internal state. async fn persist(&mut self, s: &TimelinePersistentState) -> Result<()> { @@ -175,8 +174,11 @@ impl DiskWALStorage { } } -#[async_trait::async_trait] impl wal_storage::Storage for DiskWALStorage { + // Last written LSN. + fn write_lsn(&self) -> Lsn { + self.write_lsn + } /// LSN of last durably stored WAL record. 
fn flush_lsn(&self) -> Lsn { self.flush_record_lsn diff --git a/scripts/comment-test-report.js b/scripts/comment-test-report.js index f42262cf48..e8e0b3c23a 100755 --- a/scripts/comment-test-report.js +++ b/scripts/comment-test-report.js @@ -68,16 +68,29 @@ const parseReportJson = async ({ reportJsonUrl, fetch }) => { console.info(`Cannot get BUILD_TYPE and Postgres Version from test name: "${test.name}", defaulting to "release" and "14"`) buildType = "release" - pgVersion = "14" + pgVersion = "16" } pgVersions.add(pgVersion) + // We use `arch` as it is returned by GitHub Actions + // (RUNNER_ARCH env var): X86, X64, ARM, or ARM64 + // Ref https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/store-information-in-variables#default-environment-variables + let arch = "" + if (test.parameters.includes("'X64'")) { + arch = "x86-64" + } else if (test.parameters.includes("'ARM64'")) { + arch = "arm64" + } else { + arch = "unknown" + } + // Removing build type and PostgreSQL version from the test name to make it shorter const testName = test.name.replace(new RegExp(`${buildType}-pg${pgVersion}-?`), "").replace("[]", "") test.pytestName = `${parentSuite.name.replace(".", "/")}/${suite.name}.py::${testName}` test.pgVersion = pgVersion test.buildType = buildType + test.arch = arch if (test.status === "passed") { passedTests[pgVersion][testName].push(test) @@ -144,7 +157,7 @@ const reportSummary = async (params) => { const links = [] for (const test of tests) { const allureLink = `${reportUrl}#suites/${test.parentUid}/${test.uid}` - links.push(`[${test.buildType}](${allureLink})`) + links.push(`[${test.buildType}-${test.arch}](${allureLink})`) } summary += `- \`${testName}\`: ${links.join(", ")}\n` } @@ -175,7 +188,7 @@ const reportSummary = async (params) => { const links = [] for (const test of tests) { const allureLink = `${reportUrl}#suites/${test.parentUid}/${test.uid}/retries` - links.push(`[${test.buildType}](${allureLink})`) + 
links.push(`[${test.buildType}-${test.arch}](${allureLink})`) } summary += `- \`${testName}\`: ${links.join(", ")}\n` } diff --git a/scripts/coverage b/scripts/coverage index 52a69c93b9..482dc58ff6 100755 --- a/scripts/coverage +++ b/scripts/coverage @@ -134,7 +134,7 @@ class LLVM: # Show a user-friendly warning raise Exception(' '.join([ f"It appears that you don't have `{name}` installed.", - "Please execute `rustup component add llvm-tools-preview`,", + "Please execute `rustup component add llvm-tools`,", "or install it via your package manager of choice.", "LLVM tools should be the same version as LLVM in `rustc --version --verbose`.", ])) @@ -518,7 +518,7 @@ def main() -> None: example = f""" prerequisites: # alternatively, install a system package for `llvm-tools` - rustup component add llvm-tools-preview + rustup component add llvm-tools self-contained example: {app} run make diff --git a/storage_broker/Cargo.toml b/storage_broker/Cargo.toml index ac4b00669e..82ec0aa272 100644 --- a/storage_broker/Cargo.toml +++ b/storage_broker/Cargo.toml @@ -10,7 +10,6 @@ bench = [] [dependencies] anyhow.workspace = true async-stream.workspace = true -bytes.workspace = true clap = { workspace = true, features = ["derive"] } const_format.workspace = true futures.workspace = true @@ -24,7 +23,6 @@ parking_lot.workspace = true prost.workspace = true tonic.workspace = true tokio = { workspace = true, features = ["rt-multi-thread"] } -tokio-stream.workspace = true tracing.workspace = true metrics.workspace = true utils.workspace = true diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml index ecaac04915..a96d64e096 100644 --- a/storage_controller/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -15,9 +15,7 @@ testing = [] [dependencies] anyhow.workspace = true -aws-config.workspace = true bytes.workspace = true -camino.workspace = true chrono.workspace = true clap.workspace = true fail.workspace = true diff --git a/storage_controller/client/Cargo.toml 
b/storage_controller/client/Cargo.toml index c3bfe2bfd2..9fa89176af 100644 --- a/storage_controller/client/Cargo.toml +++ b/storage_controller/client/Cargo.toml @@ -5,19 +5,7 @@ edition.workspace = true license.workspace = true [dependencies] -pageserver_api.workspace = true pageserver_client.workspace = true -thiserror.workspace = true -async-trait.workspace = true reqwest.workspace = true -utils.workspace = true serde.workspace = true workspace_hack = { version = "0.1", path = "../../workspace_hack" } -tokio-postgres.workspace = true -tokio-stream.workspace = true -tokio.workspace = true -futures.workspace = true -tokio-util.workspace = true -anyhow.workspace = true -postgres.workspace = true -bytes.workspace = true diff --git a/storage_controller/migrations/2024-08-23-102952_safekeepers/down.sql b/storage_controller/migrations/2024-08-23-102952_safekeepers/down.sql new file mode 100644 index 0000000000..9dfc750586 --- /dev/null +++ b/storage_controller/migrations/2024-08-23-102952_safekeepers/down.sql @@ -0,0 +1,2 @@ +-- This file should undo anything in `up.sql` +DROP TABLE safekeepers; diff --git a/storage_controller/migrations/2024-08-23-102952_safekeepers/up.sql b/storage_controller/migrations/2024-08-23-102952_safekeepers/up.sql new file mode 100644 index 0000000000..c78716660f --- /dev/null +++ b/storage_controller/migrations/2024-08-23-102952_safekeepers/up.sql @@ -0,0 +1,15 @@ +-- started out as a copy of cplane schema, removed the unnecessary columns. 
+CREATE TABLE safekeepers ( + -- the surrogate identifier defined by control plane database sequence + id BIGINT PRIMARY KEY, + region_id TEXT NOT NULL, + version BIGINT NOT NULL, + -- the natural id on whatever cloud platform, not needed in storage controller + -- instance_id TEXT UNIQUE NOT NULL, + host TEXT NOT NULL, + port INTEGER NOT NULL, + active BOOLEAN NOT NULL DEFAULT false, + -- projects_count INTEGER NOT NULL DEFAULT 0, + http_port INTEGER NOT NULL, + availability_zone_id TEXT NOT NULL +); diff --git a/storage_controller/migrations/2024-08-23-170149_tenant_id_index/down.sql b/storage_controller/migrations/2024-08-23-170149_tenant_id_index/down.sql new file mode 100644 index 0000000000..518c747100 --- /dev/null +++ b/storage_controller/migrations/2024-08-23-170149_tenant_id_index/down.sql @@ -0,0 +1,2 @@ +-- This file should undo anything in `up.sql` +DROP INDEX tenant_shards_tenant_id; \ No newline at end of file diff --git a/storage_controller/migrations/2024-08-23-170149_tenant_id_index/up.sql b/storage_controller/migrations/2024-08-23-170149_tenant_id_index/up.sql new file mode 100644 index 0000000000..dd6b37781a --- /dev/null +++ b/storage_controller/migrations/2024-08-23-170149_tenant_id_index/up.sql @@ -0,0 +1,2 @@ +-- Your SQL goes here +CREATE INDEX tenant_shards_tenant_id ON tenant_shards (tenant_id); \ No newline at end of file diff --git a/storage_controller/migrations/2024-08-27-184400_pageserver_az/down.sql b/storage_controller/migrations/2024-08-27-184400_pageserver_az/down.sql new file mode 100644 index 0000000000..22df81c83c --- /dev/null +++ b/storage_controller/migrations/2024-08-27-184400_pageserver_az/down.sql @@ -0,0 +1 @@ +ALTER TABLE nodes DROP availability_zone_id; diff --git a/storage_controller/migrations/2024-08-27-184400_pageserver_az/up.sql b/storage_controller/migrations/2024-08-27-184400_pageserver_az/up.sql new file mode 100644 index 0000000000..7112f92bf2 --- /dev/null +++ 
b/storage_controller/migrations/2024-08-27-184400_pageserver_az/up.sql @@ -0,0 +1 @@ +ALTER TABLE nodes ADD availability_zone_id VARCHAR; diff --git a/storage_controller/migrations/2024-08-28-150530_pageserver_az_not_null/down.sql b/storage_controller/migrations/2024-08-28-150530_pageserver_az_not_null/down.sql new file mode 100644 index 0000000000..4fcb928533 --- /dev/null +++ b/storage_controller/migrations/2024-08-28-150530_pageserver_az_not_null/down.sql @@ -0,0 +1 @@ +ALTER TABLE nodes ALTER availability_zone_id DROP NOT NULL; diff --git a/storage_controller/migrations/2024-08-28-150530_pageserver_az_not_null/up.sql b/storage_controller/migrations/2024-08-28-150530_pageserver_az_not_null/up.sql new file mode 100644 index 0000000000..c5b4534087 --- /dev/null +++ b/storage_controller/migrations/2024-08-28-150530_pageserver_az_not_null/up.sql @@ -0,0 +1 @@ +ALTER TABLE nodes ALTER availability_zone_id SET NOT NULL; diff --git a/storage_controller/migrations/2024-09-05-104500_tenant_shard_preferred_az/down.sql b/storage_controller/migrations/2024-09-05-104500_tenant_shard_preferred_az/down.sql new file mode 100644 index 0000000000..127972a2e4 --- /dev/null +++ b/storage_controller/migrations/2024-09-05-104500_tenant_shard_preferred_az/down.sql @@ -0,0 +1 @@ +ALTER TABLE tenant_shards DROP preferred_az_id; diff --git a/storage_controller/migrations/2024-09-05-104500_tenant_shard_preferred_az/up.sql b/storage_controller/migrations/2024-09-05-104500_tenant_shard_preferred_az/up.sql new file mode 100644 index 0000000000..641a54feb2 --- /dev/null +++ b/storage_controller/migrations/2024-09-05-104500_tenant_shard_preferred_az/up.sql @@ -0,0 +1 @@ +ALTER TABLE tenant_shards ADD preferred_az_id VARCHAR; diff --git a/storage_controller/src/heartbeater.rs b/storage_controller/src/heartbeater.rs index c0e27bafdb..b7e66d33eb 100644 --- a/storage_controller/src/heartbeater.rs +++ b/storage_controller/src/heartbeater.rs @@ -6,10 +6,7 @@ use std::{ }; use 
tokio_util::sync::CancellationToken; -use pageserver_api::{ - controller_api::{NodeAvailability, UtilizationScore}, - models::PageserverUtilization, -}; +use pageserver_api::{controller_api::NodeAvailability, models::PageserverUtilization}; use thiserror::Error; use utils::id::NodeId; @@ -147,7 +144,8 @@ impl HeartbeaterTask { // goes through to the pageserver even when the node is marked offline. // This doesn't impact the availability observed by [`crate::service::Service`]. let mut node_clone = node.clone(); - node_clone.set_availability(NodeAvailability::Active(UtilizationScore::worst())); + node_clone + .set_availability(NodeAvailability::Active(PageserverUtilization::full())); async move { let response = node_clone @@ -179,7 +177,7 @@ impl HeartbeaterTask { node.get_availability() { PageserverState::WarmingUp { - started_at: last_seen_at, + started_at: *last_seen_at, } } else { PageserverState::Offline diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 7bbd1541cf..1745bf5575 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -1,9 +1,11 @@ +use crate::http; use crate::metrics::{ HttpRequestLatencyLabelGroup, HttpRequestStatusLabelGroup, PageserverRequestLabelGroup, METRICS_REGISTRY, }; +use crate::persistence::SafekeeperPersistence; use crate::reconciler::ReconcileError; -use crate::service::{LeadershipStatus, Service, STARTUP_RECONCILE_TIMEOUT}; +use crate::service::{LeadershipStatus, Service, RECONCILE_TIMEOUT, STARTUP_RECONCILE_TIMEOUT}; use anyhow::Context; use futures::Future; use hyper::header::CONTENT_TYPE; @@ -13,14 +15,15 @@ use metrics::{BuildInfo, NeonMetrics}; use pageserver_api::controller_api::{ MetadataHealthListOutdatedRequest, MetadataHealthListOutdatedResponse, MetadataHealthListUnhealthyResponse, MetadataHealthUpdateRequest, MetadataHealthUpdateResponse, - TenantCreateRequest, + ShardsPreferredAzsRequest, TenantCreateRequest, }; use pageserver_api::models::{ 
TenantConfigRequest, TenantLocationConfigRequest, TenantShardSplitRequest, - TenantTimeTravelRequest, TimelineCreateRequest, + TenantTimeTravelRequest, TimelineArchivalConfigRequest, TimelineCreateRequest, }; use pageserver_api::shard::TenantShardId; -use pageserver_client::mgmt_api; +use pageserver_client::{mgmt_api, BlockUnblock}; +use std::str::FromStr; use std::sync::Arc; use std::time::{Duration, Instant}; use tokio_util::sync::CancellationToken; @@ -86,9 +89,16 @@ fn get_state(request: &Request) -> &HttpState { } /// Pageserver calls into this on startup, to learn which tenants it should attach -async fn handle_re_attach(mut req: Request) -> Result, ApiError> { +async fn handle_re_attach(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::GenerationsApi)?; + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let reattach_req = json_request::(&mut req).await?; let state = get_state(&req); json_response(StatusCode::OK, state.service.re_attach(reattach_req).await?) @@ -96,20 +106,34 @@ async fn handle_re_attach(mut req: Request) -> Result, ApiE /// Pageserver calls into this before doing deletions, to confirm that it still /// holds the latest generation for the tenants with deletions enqueued -async fn handle_validate(mut req: Request) -> Result, ApiError> { +async fn handle_validate(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::GenerationsApi)?; + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let validate_req = json_request::(&mut req).await?; let state = get_state(&req); - json_response(StatusCode::OK, state.service.validate(validate_req)) + json_response(StatusCode::OK, state.service.validate(validate_req).await?) 
} /// Call into this before attaching a tenant to a pageserver, to acquire a generation number /// (in the real control plane this is unnecessary, because the same program is managing /// generation numbers and doing attachments). -async fn handle_attach_hook(mut req: Request) -> Result, ApiError> { +async fn handle_attach_hook(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let attach_req = json_request::(&mut req).await?; let state = get_state(&req); @@ -123,9 +147,16 @@ async fn handle_attach_hook(mut req: Request) -> Result, Ap ) } -async fn handle_inspect(mut req: Request) -> Result, ApiError> { +async fn handle_inspect(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let inspect_req = json_request::(&mut req).await?; let state = get_state(&req); @@ -135,10 +166,17 @@ async fn handle_inspect(mut req: Request) -> Result, ApiErr async fn handle_tenant_create( service: Arc, - mut req: Request, + req: Request, ) -> Result, ApiError> { check_permissions(&req, Scope::PageServerApi)?; + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let create_req = json_request::(&mut req).await?; json_response( @@ -149,11 +187,18 @@ async fn handle_tenant_create( async fn handle_tenant_location_config( service: Arc, - mut req: Request, + req: Request, ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?; check_permissions(&req, Scope::PageServerApi)?; + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + 
ForwardOutcome::NotForwarded(req) => req, + }; + let config_req = json_request::(&mut req).await?; json_response( StatusCode::OK, @@ -165,10 +210,17 @@ async fn handle_tenant_location_config( async fn handle_tenant_config_set( service: Arc, - mut req: Request, + req: Request, ) -> Result, ApiError> { check_permissions(&req, Scope::PageServerApi)?; + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let config_req = json_request::(&mut req).await?; json_response(StatusCode::OK, service.tenant_config_set(config_req).await?) @@ -181,16 +233,30 @@ async fn handle_tenant_config_get( let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; check_permissions(&req, Scope::PageServerApi)?; + match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(_req) => {} + }; + json_response(StatusCode::OK, service.tenant_config_get(tenant_id)?) 
} async fn handle_tenant_time_travel_remote_storage( service: Arc, - mut req: Request, + req: Request, ) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; check_permissions(&req, Scope::PageServerApi)?; + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let time_travel_req = json_request::(&mut req).await?; let timestamp_raw = must_get_query_param(&req, "travel_to")?; @@ -231,6 +297,13 @@ async fn handle_tenant_secondary_download( let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; let wait = parse_query_param(&req, "wait_ms")?.map(Duration::from_millis); + match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(_req) => {} + }; + let (status, progress) = service.tenant_secondary_download(tenant_id, wait).await?; json_response(map_reqwest_hyper_status(status)?, progress) } @@ -242,6 +315,13 @@ async fn handle_tenant_delete( let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; check_permissions(&req, Scope::PageServerApi)?; + match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(_req) => {} + }; + let status_code = service .tenant_delete(tenant_id) .await @@ -257,11 +337,18 @@ async fn handle_tenant_delete( async fn handle_tenant_timeline_create( service: Arc, - mut req: Request, + req: Request, ) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; check_permissions(&req, Scope::PageServerApi)?; + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let create_req = json_request::(&mut req).await?; json_response( StatusCode::CREATED, @@ -276,9 +363,16 @@ async fn handle_tenant_timeline_delete( req: Request, ) -> Result, ApiError> { 
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; + check_permissions(&req, Scope::PageServerApi)?; - let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; + match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(_req) => {} + }; // For timeline deletions, which both implement an "initially return 202, then 404 once // we're done" semantic, we wrap with a retry loop to expose a simpler API upstream. @@ -334,14 +428,46 @@ async fn handle_tenant_timeline_delete( .await } +async fn handle_tenant_timeline_archival_config( + service: Arc, + req: Request, +) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; + + check_permissions(&req, Scope::PageServerApi)?; + + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + + let create_req = json_request::(&mut req).await?; + + service + .tenant_timeline_archival_config(tenant_id, timeline_id, create_req) + .await?; + + json_response(StatusCode::OK, ()) +} + async fn handle_tenant_timeline_detach_ancestor( service: Arc, req: Request, ) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; + check_permissions(&req, Scope::PageServerApi)?; - let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; + match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(_req) => {} + }; let res = service .tenant_timeline_detach_ancestor(tenant_id, timeline_id) @@ -350,6 +476,23 @@ async fn handle_tenant_timeline_detach_ancestor( json_response(StatusCode::OK, res) } +async 
fn handle_tenant_timeline_block_unblock_gc( + service: Arc, + req: Request, + dir: BlockUnblock, +) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + check_permissions(&req, Scope::PageServerApi)?; + + let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; + + service + .tenant_timeline_block_unblock_gc(tenant_id, timeline_id, dir) + .await?; + + json_response(StatusCode::OK, ()) +} + async fn handle_tenant_timeline_passthrough( service: Arc, req: Request, @@ -357,6 +500,13 @@ async fn handle_tenant_timeline_passthrough( let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; check_permissions(&req, Scope::PageServerApi)?; + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let Some(path) = req.uri().path_and_query() else { // This should never happen, our request router only calls us if there is a path return Err(ApiError::BadRequest(anyhow::anyhow!("Missing path"))); @@ -424,9 +574,17 @@ async fn handle_tenant_locate( service: Arc, req: Request, ) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + check_permissions(&req, Scope::Admin)?; - let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(_req) => {} + }; + json_response(StatusCode::OK, service.tenant_locate(tenant_id)?) } @@ -437,6 +595,14 @@ async fn handle_tenant_describe( check_permissions(&req, Scope::Scrubber)?; let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + + match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(_req) => {} + }; + json_response(StatusCode::OK, service.tenant_describe(tenant_id)?) 
} @@ -446,12 +612,26 @@ async fn handle_tenant_list( ) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(_req) => {} + }; + json_response(StatusCode::OK, service.tenant_list()) } -async fn handle_node_register(mut req: Request) -> Result, ApiError> { +async fn handle_node_register(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let register_req = json_request::(&mut req).await?; let state = get_state(&req); state.service.node_register(register_req).await?; @@ -461,6 +641,13 @@ async fn handle_node_register(mut req: Request) -> Result, async fn handle_node_list(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); let nodes = state.service.node_list().await?; let api_nodes = nodes.into_iter().map(|n| n.describe()).collect::>(); @@ -471,6 +658,13 @@ async fn handle_node_list(req: Request) -> Result, ApiError async fn handle_node_drop(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); let node_id: NodeId = parse_request_param(&req, "node_id")?; json_response(StatusCode::OK, state.service.node_drop(node_id).await?) 
@@ -479,14 +673,28 @@ async fn handle_node_drop(req: Request) -> Result, ApiError async fn handle_node_delete(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); let node_id: NodeId = parse_request_param(&req, "node_id")?; json_response(StatusCode::OK, state.service.node_delete(node_id).await?) } -async fn handle_node_configure(mut req: Request) -> Result, ApiError> { +async fn handle_node_configure(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let node_id: NodeId = parse_request_param(&req, "node_id")?; let config_req = json_request::(&mut req).await?; if node_id != config_req.node_id { @@ -512,6 +720,13 @@ async fn handle_node_configure(mut req: Request) -> Result, async fn handle_node_status(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); let node_id: NodeId = parse_request_param(&req, "node_id")?; @@ -520,9 +735,27 @@ async fn handle_node_status(req: Request) -> Result, ApiErr json_response(StatusCode::OK, node_status) } +async fn handle_node_shards(req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let state = get_state(&req); + let node_id: NodeId = parse_request_param(&req, "node_id")?; + + let node_status = state.service.get_node_shards(node_id).await?; + + json_response(StatusCode::OK, node_status) +} + async fn handle_get_leader(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let req = match 
maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); let leader = state.service.get_leader().await.map_err(|err| { ApiError::InternalServerError(anyhow::anyhow!( @@ -536,6 +769,13 @@ async fn handle_get_leader(req: Request) -> Result, ApiErro async fn handle_node_drain(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); let node_id: NodeId = parse_request_param(&req, "node_id")?; @@ -547,6 +787,13 @@ async fn handle_node_drain(req: Request) -> Result, ApiErro async fn handle_cancel_node_drain(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); let node_id: NodeId = parse_request_param(&req, "node_id")?; @@ -558,6 +805,13 @@ async fn handle_cancel_node_drain(req: Request) -> Result, async fn handle_node_fill(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); let node_id: NodeId = parse_request_param(&req, "node_id")?; @@ -569,6 +823,13 @@ async fn handle_node_fill(req: Request) -> Result, ApiError async fn handle_cancel_node_fill(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); let node_id: NodeId = parse_request_param(&req, "node_id")?; @@ 
-577,9 +838,16 @@ async fn handle_cancel_node_fill(req: Request) -> Result, A json_response(StatusCode::ACCEPTED, ()) } -async fn handle_metadata_health_update(mut req: Request) -> Result, ApiError> { +async fn handle_metadata_health_update(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Scrubber)?; + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let update_req = json_request::(&mut req).await?; let state = get_state(&req); @@ -593,6 +861,13 @@ async fn handle_metadata_health_list_unhealthy( ) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); let unhealthy_tenant_shards = state.service.metadata_health_list_unhealthy().await?; @@ -605,10 +880,17 @@ async fn handle_metadata_health_list_unhealthy( } async fn handle_metadata_health_list_outdated( - mut req: Request, + req: Request, ) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let list_outdated_req = json_request::(&mut req).await?; let state = get_state(&req); let health_records = state @@ -624,10 +906,17 @@ async fn handle_metadata_health_list_outdated( async fn handle_tenant_shard_split( service: Arc, - mut req: Request, + req: Request, ) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; let split_req = json_request::(&mut req).await?; @@ -639,10 +928,17 @@ async fn handle_tenant_shard_split( 
async fn handle_tenant_shard_migrate( service: Arc, - mut req: Request, + req: Request, ) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?; let migrate_req = json_request::(&mut req).await?; json_response( @@ -653,9 +949,16 @@ async fn handle_tenant_shard_migrate( ) } -async fn handle_tenant_update_policy(mut req: Request) -> Result, ApiError> { +async fn handle_tenant_update_policy(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; let update_req = json_request::(&mut req).await?; let state = get_state(&req); @@ -669,26 +972,68 @@ async fn handle_tenant_update_policy(mut req: Request) -> Result) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + + let azs_req = json_request::(&mut req).await?; + let state = get_state(&req); + + json_response( + StatusCode::OK, + state.service.update_shards_preferred_azs(azs_req).await?, + ) +} + async fn handle_step_down(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); json_response(StatusCode::OK, state.service.step_down().await) } async fn handle_tenant_drop(req: Request) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; 
check_permissions(&req, Scope::PageServerApi)?; + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); json_response(StatusCode::OK, state.service.tenant_drop(tenant_id).await?) } async fn handle_tenant_import(req: Request) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; check_permissions(&req, Scope::PageServerApi)?; + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); json_response( @@ -700,6 +1045,13 @@ async fn handle_tenant_import(req: Request) -> Result, ApiE async fn handle_tenants_dump(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); state.service.tenants_dump() } @@ -707,6 +1059,13 @@ async fn handle_tenants_dump(req: Request) -> Result, ApiEr async fn handle_scheduler_dump(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); state.service.scheduler_dump() } @@ -714,6 +1073,13 @@ async fn handle_scheduler_dump(req: Request) -> Result, Api async fn handle_consistency_check(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = 
get_state(&req); json_response(StatusCode::OK, state.service.consistency_check().await?) @@ -722,19 +1088,40 @@ async fn handle_consistency_check(req: Request) -> Result, async fn handle_reconcile_all(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); json_response(StatusCode::OK, state.service.reconcile_all_now().await?) } /// Status endpoint is just used for checking that our HTTP listener is up -async fn handle_status(_req: Request) -> Result, ApiError> { +async fn handle_status(req: Request) -> Result, ApiError> { + match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(_req) => {} + }; + json_response(StatusCode::OK, ()) } /// Readiness endpoint indicates when we're done doing startup I/O (e.g. reconciling /// with remote pageserver nodes). This is intended for use as a kubernetes readiness probe. async fn handle_ready(req: Request) -> Result, ApiError> { + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); if state.service.startup_complete.is_ready() { json_response(StatusCode::OK, ()) @@ -749,6 +1136,69 @@ impl From for ApiError { } } +/// Return the safekeeper record by instance id, or 404. +/// +/// Not used by anything except manual testing. 
+async fn handle_get_safekeeper(req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let id = parse_request_param::(&req, "id")?; + + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + + let state = get_state(&req); + + let res = state.service.get_safekeeper(id).await; + + match res { + Ok(b) => json_response(StatusCode::OK, b), + Err(crate::persistence::DatabaseError::Query(diesel::result::Error::NotFound)) => { + Err(ApiError::NotFound("unknown instance_id".into())) + } + Err(other) => Err(other.into()), + } +} + +/// Used as part of deployment scripts. +/// +/// Assumes information is only relayed to storage controller after first selecting an unique id on +/// control plane database, which means we have an id field in the request and payload. +async fn handle_upsert_safekeeper(mut req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let body = json_request::(&mut req).await?; + let id = parse_request_param::(&req, "id")?; + + if id != body.id { + // it should be repeated + return Err(ApiError::BadRequest(anyhow::anyhow!( + "id mismatch: url={id:?}, body={:?}", + body.id + ))); + } + + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + + let state = get_state(&req); + + state.service.upsert_safekeeper(body).await?; + + Ok(Response::builder() + .status(StatusCode::NO_CONTENT) + .body(Body::empty()) + .unwrap()) +} + /// Common wrapper for request handlers that call into Service and will operate on tenants: they must only /// be allowed to run if Service has finished its initial reconciliation. 
async fn tenant_service_handler( @@ -817,10 +1267,7 @@ pub fn prologue_leadership_status_check_middleware< let allowed_routes = match leadership_status { LeadershipStatus::Leader => AllowedRoutes::All, - LeadershipStatus::SteppedDown => { - // TODO: does it make sense to allow /status here? - AllowedRoutes::Some(["/control/v1/step_down", "/status", "/metrics"].to_vec()) - } + LeadershipStatus::SteppedDown => AllowedRoutes::All, LeadershipStatus::Candidate => { AllowedRoutes::Some(["/ready", "/status", "/metrics"].to_vec()) } @@ -897,6 +1344,13 @@ fn epilogue_metrics_middleware pub async fn measured_metrics_handler(req: Request) -> Result, ApiError> { pub const TEXT_FORMAT: &str = "text/plain; version=0.0.4"; + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); let payload = crate::metrics::METRICS_REGISTRY.encode(&state.neon_metrics); let response = Response::builder() @@ -924,6 +1378,220 @@ where request_span(request, handler).await } +enum ForwardOutcome { + Forwarded(Result, ApiError>), + NotForwarded(Request), +} + +/// Potentially forward the request to the current storage controler leader. +/// More specifically we forward when: +/// 1. Request is not one of ["/control/v1/step_down", "/status", "/ready", "/metrics"] +/// 2. Current instance is in [`LeadershipStatus::SteppedDown`] state +/// 3. There is a leader in the database to forward to +/// 4. Leader from step (3) is not the current instance +/// +/// Why forward? +/// It turns out that we can't rely on external orchestration to promptly route trafic to the +/// new leader. This is downtime inducing. Forwarding provides a safe way out. +/// +/// Why is it safe? +/// If a storcon instance is persisted in the database, then we know that it is the current leader. +/// There's one exception: time between handling step-down request and the new leader updating the +/// database. 
+/// +/// Let's treat the happy case first. The stepped down node does not produce any side effects, +/// since all request handling happens on the leader. +/// +/// As for the edge case, we are guaranteed to always have a maximum of two running instances. +/// Hence, if we are in the edge case scenario the leader persisted in the database is the +/// stepped down instance that received the request. Condition (4) above covers this scenario. +async fn maybe_forward(req: Request) -> ForwardOutcome { + const NOT_FOR_FORWARD: [&str; 4] = ["/control/v1/step_down", "/status", "/ready", "/metrics"]; + + let uri = req.uri().to_string(); + let uri_for_forward = !NOT_FOR_FORWARD.contains(&uri.as_str()); + + let state = get_state(&req); + let leadership_status = state.service.get_leadership_status(); + + if leadership_status != LeadershipStatus::SteppedDown || !uri_for_forward { + return ForwardOutcome::NotForwarded(req); + } + + let leader = state.service.get_leader().await; + let leader = { + match leader { + Ok(Some(leader)) => leader, + Ok(None) => { + return ForwardOutcome::Forwarded(Err(ApiError::ResourceUnavailable( + "No leader to forward to while in stepped down state".into(), + ))); + } + Err(err) => { + return ForwardOutcome::Forwarded(Err(ApiError::InternalServerError( + anyhow::anyhow!( + "Failed to get leader for forwarding while in stepped down state: {err}" + ), + ))); + } + } + }; + + let cfg = state.service.get_config(); + if let Some(ref self_addr) = cfg.address_for_peers { + let leader_addr = match Uri::from_str(leader.address.as_str()) { + Ok(uri) => uri, + Err(err) => { + return ForwardOutcome::Forwarded(Err(ApiError::InternalServerError( + anyhow::anyhow!( + "Failed to parse leader uri for forwarding while in stepped down state: {err}" + ), + ))); + } + }; + + if *self_addr == leader_addr { + return ForwardOutcome::Forwarded(Err(ApiError::InternalServerError(anyhow::anyhow!( + "Leader is stepped down instance" + )))); + } + } + + 
tracing::info!("Forwarding {} to leader at {}", uri, leader.address); + + // Use [`RECONCILE_TIMEOUT`] as the max amount of time a request should block for and + // include some leeway to get the timeout for proxied requests. + const PROXIED_REQUEST_TIMEOUT: Duration = Duration::from_secs(RECONCILE_TIMEOUT.as_secs() + 10); + let client = reqwest::ClientBuilder::new() + .timeout(PROXIED_REQUEST_TIMEOUT) + .build(); + let client = match client { + Ok(client) => client, + Err(err) => { + return ForwardOutcome::Forwarded(Err(ApiError::InternalServerError(anyhow::anyhow!( + "Failed to build leader client for forwarding while in stepped down state: {err}" + )))); + } + }; + + let request: reqwest::Request = match convert_request(req, &client, leader.address).await { + Ok(r) => r, + Err(err) => { + return ForwardOutcome::Forwarded(Err(ApiError::InternalServerError(anyhow::anyhow!( + "Failed to convert request for forwarding while in stepped down state: {err}" + )))); + } + }; + + let response = match client.execute(request).await { + Ok(r) => r, + Err(err) => { + return ForwardOutcome::Forwarded(Err(ApiError::InternalServerError(anyhow::anyhow!( + "Failed to forward while in stepped down state: {err}" + )))); + } + }; + + ForwardOutcome::Forwarded(convert_response(response).await) +} + +/// Convert a [`reqwest::Response`] to a [hyper::Response`] by passing through +/// a stable representation (string, bytes or integer) +/// +/// Ideally, we would not have to do this since both types use the http crate +/// under the hood. However, they use different versions of the crate and keeping +/// second order dependencies in sync is difficult. 
+async fn convert_response(resp: reqwest::Response) -> Result, ApiError> { + use std::str::FromStr; + + let mut builder = hyper::Response::builder().status(resp.status().as_u16()); + for (key, value) in resp.headers().into_iter() { + let key = hyper::header::HeaderName::from_str(key.as_str()).map_err(|err| { + ApiError::InternalServerError(anyhow::anyhow!("Response conversion failed: {err}")) + })?; + + let value = hyper::header::HeaderValue::from_bytes(value.as_bytes()).map_err(|err| { + ApiError::InternalServerError(anyhow::anyhow!("Response conversion failed: {err}")) + })?; + + builder = builder.header(key, value); + } + + let body = http::Body::wrap_stream(resp.bytes_stream()); + + builder.body(body).map_err(|err| { + ApiError::InternalServerError(anyhow::anyhow!("Response conversion failed: {err}")) + }) +} + +/// Convert a [`reqwest::Request`] to a [hyper::Request`] by passing through +/// a stable representation (string, bytes or integer) +/// +/// See [`convert_response`] for why we are doing it this way. 
+async fn convert_request( + req: hyper::Request, + client: &reqwest::Client, + to_address: String, +) -> Result { + use std::str::FromStr; + + let (parts, body) = req.into_parts(); + let method = reqwest::Method::from_str(parts.method.as_str()).map_err(|err| { + ApiError::InternalServerError(anyhow::anyhow!("Request conversion failed: {err}")) + })?; + + let path_and_query = parts.uri.path_and_query().ok_or_else(|| { + ApiError::InternalServerError(anyhow::anyhow!( + "Request conversion failed: no path and query" + )) + })?; + + let uri = reqwest::Url::from_str( + format!( + "{}{}", + to_address.trim_end_matches("/"), + path_and_query.as_str() + ) + .as_str(), + ) + .map_err(|err| { + ApiError::InternalServerError(anyhow::anyhow!("Request conversion failed: {err}")) + })?; + + let mut headers = reqwest::header::HeaderMap::new(); + for (key, value) in parts.headers.into_iter() { + let key = match key { + Some(k) => k, + None => { + continue; + } + }; + + let key = reqwest::header::HeaderName::from_str(key.as_str()).map_err(|err| { + ApiError::InternalServerError(anyhow::anyhow!("Request conversion failed: {err}")) + })?; + + let value = reqwest::header::HeaderValue::from_bytes(value.as_bytes()).map_err(|err| { + ApiError::InternalServerError(anyhow::anyhow!("Request conversion failed: {err}")) + })?; + + headers.insert(key, value); + } + + let body = hyper::body::to_bytes(body).await.map_err(|err| { + ApiError::InternalServerError(anyhow::anyhow!("Request conversion failed: {err}")) + })?; + + client + .request(method, uri) + .headers(headers) + .body(body) + .build() + .map_err(|err| { + ApiError::InternalServerError(anyhow::anyhow!("Request conversion failed: {err}")) + }) +} + pub fn make_router( service: Arc, auth: Option>, @@ -1029,6 +1697,13 @@ pub fn make_router( .get("/control/v1/node/:node_id", |r| { named_request_span(r, handle_node_status, RequestName("control_v1_node_status")) }) + .get("/control/v1/node/:node_id/shards", |r| { + named_request_span( + 
r, + handle_node_shards, + RequestName("control_v1_node_describe"), + ) + }) .get("/control/v1/leader", |r| { named_request_span(r, handle_get_leader, RequestName("control_v1_get_leader")) }) @@ -1074,7 +1749,6 @@ pub fn make_router( RequestName("control_v1_metadata_health_list_outdated"), ) }) - // TODO(vlad): endpoint for cancelling drain and fill // Tenant Shard operations .put("/control/v1/tenant/:tenant_shard_id/migrate", |r| { tenant_service_handler( @@ -1107,9 +1781,23 @@ pub fn make_router( RequestName("control_v1_tenant_policy"), ) }) + .put("/control/v1/preferred_azs", |r| { + named_request_span( + r, + handle_update_preferred_azs, + RequestName("control_v1_preferred_azs"), + ) + }) .put("/control/v1/step_down", |r| { named_request_span(r, handle_step_down, RequestName("control_v1_step_down")) }) + .get("/control/v1/safekeeper/:id", |r| { + named_request_span(r, handle_get_safekeeper, RequestName("v1_safekeeper")) + }) + .post("/control/v1/safekeeper/:id", |r| { + // id is in the body + named_request_span(r, handle_upsert_safekeeper, RequestName("v1_safekeeper")) + }) // Tenant operations // The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into // this service to manage tenants that actually consist of many tenant shards, as if they are a single entity. 
@@ -1161,6 +1849,16 @@ pub fn make_router( RequestName("v1_tenant_timeline"), ) }) + .post( + "/v1/tenant/:tenant_id/timeline/:timeline_id/archival_config", + |r| { + tenant_service_handler( + r, + handle_tenant_timeline_archival_config, + RequestName("v1_tenant_timeline_archival_config"), + ) + }, + ) .put( "/v1/tenant/:tenant_id/timeline/:timeline_id/detach_ancestor", |r| { @@ -1171,6 +1869,26 @@ pub fn make_router( ) }, ) + .post( + "/v1/tenant/:tenant_id/timeline/:timeline_id/block_gc", + |r| { + tenant_service_handler( + r, + |s, r| handle_tenant_timeline_block_unblock_gc(s, r, BlockUnblock::Block), + RequestName("v1_tenant_timeline_block_unblock_gc"), + ) + }, + ) + .post( + "/v1/tenant/:tenant_id/timeline/:timeline_id/unblock_gc", + |r| { + tenant_service_handler( + r, + |s, r| handle_tenant_timeline_block_unblock_gc(s, r, BlockUnblock::Unblock), + RequestName("v1_tenant_timeline_block_unblock_gc"), + ) + }, + ) // Tenant detail GET passthrough to shard zero: .get("/v1/tenant/:tenant_id", |r| { tenant_service_handler( diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index e3f29b84e7..00e90f4467 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -11,8 +11,8 @@ use storage_controller::metrics::preinitialize_metrics; use storage_controller::persistence::Persistence; use storage_controller::service::chaos_injector::ChaosInjector; use storage_controller::service::{ - Config, Service, MAX_OFFLINE_INTERVAL_DEFAULT, MAX_WARMING_UP_INTERVAL_DEFAULT, - RECONCILER_CONCURRENCY_DEFAULT, + Config, Service, HEARTBEAT_INTERVAL_DEFAULT, MAX_OFFLINE_INTERVAL_DEFAULT, + MAX_WARMING_UP_INTERVAL_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT, }; use tokio::signal::unix::SignalKind; use tokio_util::sync::CancellationToken; @@ -104,6 +104,10 @@ struct Cli { // a pageserver #[arg(long)] max_secondary_lag_bytes: Option, + + // Period with which to send heartbeats to registered nodes + #[arg(long)] + heartbeat_interval: Option, } 
enum StrictMode { @@ -285,6 +289,10 @@ async fn async_main() -> anyhow::Result<()> { split_threshold: args.split_threshold, neon_local_repo_dir: args.neon_local_repo_dir, max_secondary_lag_bytes: args.max_secondary_lag_bytes, + heartbeat_interval: args + .heartbeat_interval + .map(humantime::Duration::into) + .unwrap_or(HEARTBEAT_INTERVAL_DEFAULT), address_for_peers: args.address_for_peers, start_as_candidate: args.start_as_candidate, http_service_port: args.listen.port() as i32, diff --git a/storage_controller/src/node.rs b/storage_controller/src/node.rs index ea765ca123..cb9ce10d23 100644 --- a/storage_controller/src/node.rs +++ b/storage_controller/src/node.rs @@ -36,6 +36,8 @@ pub(crate) struct Node { listen_pg_addr: String, listen_pg_port: u16, + availability_zone_id: String, + // This cancellation token means "stop any RPCs in flight to this node, and don't start // any more". It is not related to process shutdown. #[serde(skip)] @@ -61,6 +63,11 @@ impl Node { self.id } + #[allow(unused)] + pub(crate) fn get_availability_zone_id(&self) -> &str { + self.availability_zone_id.as_str() + } + pub(crate) fn get_scheduling(&self) -> NodeSchedulingPolicy { self.scheduling } @@ -77,6 +84,7 @@ impl Node { && self.listen_http_port == register_req.listen_http_port && self.listen_pg_addr == register_req.listen_pg_addr && self.listen_pg_port == register_req.listen_pg_port + && self.availability_zone_id == register_req.availability_zone_id } /// For a shard located on this node, populate a response object @@ -92,15 +100,15 @@ impl Node { } } - pub(crate) fn get_availability(&self) -> NodeAvailability { - self.availability + pub(crate) fn get_availability(&self) -> &NodeAvailability { + &self.availability } pub(crate) fn set_availability(&mut self, availability: NodeAvailability) { use AvailabilityTransition::*; use NodeAvailability::WarmingUp; - match self.get_availability_transition(availability) { + match self.get_availability_transition(&availability) { ToActive => { // 
Give the node a new cancellation token, effectively resetting it to un-cancelled. Any // users of previously-cloned copies of the node will still see the old cancellation @@ -115,8 +123,8 @@ impl Node { Unchanged | ToWarmingUpFromOffline => {} } - if let (WarmingUp(crnt), WarmingUp(proposed)) = (self.availability, availability) { - self.availability = WarmingUp(std::cmp::max(crnt, proposed)); + if let (WarmingUp(crnt), WarmingUp(proposed)) = (&self.availability, &availability) { + self.availability = WarmingUp(std::cmp::max(*crnt, *proposed)); } else { self.availability = availability; } @@ -126,12 +134,12 @@ impl Node { /// into a description of the transition. pub(crate) fn get_availability_transition( &self, - availability: NodeAvailability, + availability: &NodeAvailability, ) -> AvailabilityTransition { use AvailabilityTransition::*; use NodeAvailability::*; - match (self.availability, availability) { + match (&self.availability, availability) { (Offline, Active(_)) => ToActive, (Active(_), Offline) => ToOffline, (Active(_), WarmingUp(_)) => ToWarmingUpFromActive, @@ -153,15 +161,15 @@ impl Node { /// Is this node elegible to have work scheduled onto it? 
pub(crate) fn may_schedule(&self) -> MaySchedule { - let score = match self.availability { - NodeAvailability::Active(score) => score, + let utilization = match &self.availability { + NodeAvailability::Active(u) => u.clone(), NodeAvailability::Offline | NodeAvailability::WarmingUp(_) => return MaySchedule::No, }; match self.scheduling { - NodeSchedulingPolicy::Active => MaySchedule::Yes(score), + NodeSchedulingPolicy::Active => MaySchedule::Yes(utilization), NodeSchedulingPolicy::Draining => MaySchedule::No, - NodeSchedulingPolicy::Filling => MaySchedule::Yes(score), + NodeSchedulingPolicy::Filling => MaySchedule::Yes(utilization), NodeSchedulingPolicy::Pause => MaySchedule::No, NodeSchedulingPolicy::PauseForRestart => MaySchedule::No, } @@ -173,6 +181,7 @@ impl Node { listen_http_port: u16, listen_pg_addr: String, listen_pg_port: u16, + availability_zone_id: String, ) -> Self { Self { id, @@ -182,6 +191,7 @@ impl Node { listen_pg_port, scheduling: NodeSchedulingPolicy::Active, availability: NodeAvailability::Offline, + availability_zone_id, cancel: CancellationToken::new(), } } @@ -194,6 +204,7 @@ impl Node { listen_http_port: self.listen_http_port as i32, listen_pg_addr: self.listen_pg_addr.clone(), listen_pg_port: self.listen_pg_port as i32, + availability_zone_id: self.availability_zone_id.clone(), } } @@ -208,6 +219,7 @@ impl Node { listen_http_port: np.listen_http_port as u16, listen_pg_addr: np.listen_pg_addr, listen_pg_port: np.listen_pg_port as u16, + availability_zone_id: np.availability_zone_id, cancel: CancellationToken::new(), } } @@ -285,7 +297,7 @@ impl Node { pub(crate) fn describe(&self) -> NodeDescribeResponse { NodeDescribeResponse { id: self.id, - availability: self.availability.into(), + availability: self.availability.clone().into(), scheduling: self.scheduling, listen_http_addr: self.listen_http_addr.clone(), listen_http_port: self.listen_http_port, diff --git a/storage_controller/src/pageserver_client.rs 
b/storage_controller/src/pageserver_client.rs index 8d64201cd9..961a1f78dd 100644 --- a/storage_controller/src/pageserver_client.rs +++ b/storage_controller/src/pageserver_client.rs @@ -2,12 +2,15 @@ use pageserver_api::{ models::{ detach_ancestor::AncestorDetached, LocationConfig, LocationConfigListResponse, PageserverUtilization, SecondaryProgress, TenantScanRemoteStorageResponse, - TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo, - TopTenantShardsRequest, TopTenantShardsResponse, + TenantShardSplitRequest, TenantShardSplitResponse, TimelineArchivalConfigRequest, + TimelineCreateRequest, TimelineInfo, TopTenantShardsRequest, TopTenantShardsResponse, }, shard::TenantShardId, }; -use pageserver_client::mgmt_api::{Client, Result}; +use pageserver_client::{ + mgmt_api::{Client, Result}, + BlockUnblock, +}; use reqwest::StatusCode; use utils::id::{NodeId, TenantId, TimelineId}; @@ -227,6 +230,22 @@ impl PageserverClient { ) } + pub(crate) async fn timeline_archival_config( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + req: &TimelineArchivalConfigRequest, + ) -> Result<()> { + measured_request!( + "timeline_archival_config", + crate::metrics::Method::Post, + &self.node_id_label, + self.inner + .timeline_archival_config(tenant_shard_id, timeline_id, req) + .await + ) + } + pub(crate) async fn timeline_detach_ancestor( &self, tenant_shard_id: TenantShardId, @@ -242,6 +261,24 @@ impl PageserverClient { ) } + pub(crate) async fn timeline_block_unblock_gc( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + dir: BlockUnblock, + ) -> Result<()> { + // measuring these makes no sense because we synchronize with the gc loop and remote + // storage on block_gc so there should be huge outliers + measured_request!( + "timeline_block_unblock_gc", + crate::metrics::Method::Post, + &self.node_id_label, + self.inner + .timeline_block_unblock_gc(tenant_shard_id, timeline_id, dir) + .await + ) + } + 
pub(crate) async fn get_utilization(&self) -> Result { measured_request!( "utilization", diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 16df19026c..1dc1040d96 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -8,6 +8,7 @@ use self::split_state::SplitState; use diesel::pg::PgConnection; use diesel::prelude::*; use diesel::Connection; +use itertools::Itertools; use pageserver_api::controller_api::MetadataHealthRecord; use pageserver_api::controller_api::ShardSchedulingPolicy; use pageserver_api::controller_api::{NodeSchedulingPolicy, PlacementPolicy}; @@ -91,6 +92,8 @@ pub(crate) enum DatabaseOperation { Detach, ReAttach, IncrementGeneration, + TenantGenerations, + ShardGenerations, ListTenantShards, InsertTenantShards, UpdateTenantShard, @@ -102,6 +105,7 @@ pub(crate) enum DatabaseOperation { ListMetadataHealthOutdated, GetLeader, UpdateLeader, + SetPreferredAzs, } #[must_use] @@ -120,6 +124,13 @@ pub(crate) enum TenantFilter { Shard(TenantShardId), } +/// Represents the results of looking up generation+pageserver for the shards of a tenant +pub(crate) struct ShardGenerationState { + pub(crate) tenant_shard_id: TenantShardId, + pub(crate) generation: Option, + pub(crate) generation_pageserver: Option, +} + impl Persistence { // The default postgres connection limit is 100. We use up to 99, to leave one free for a human admin under // normal circumstances. This assumes we have exclusive use of the database cluster to which we connect. @@ -502,6 +513,100 @@ impl Persistence { Ok(Generation::new(g as u32)) } + /// When we want to call out to the running shards for a tenant, e.g. 
during timeline CRUD operations, + /// we need to know where the shard is attached, _and_ the generation, so that we can re-check the generation + /// afterwards to confirm that our timeline CRUD operation is truly persistent (it must have happened in the + /// latest generation) + /// + /// If the tenant doesn't exist, an empty vector is returned. + /// + /// Output is sorted by shard number + pub(crate) async fn tenant_generations( + &self, + filter_tenant_id: TenantId, + ) -> Result, DatabaseError> { + use crate::schema::tenant_shards::dsl::*; + let rows = self + .with_measured_conn(DatabaseOperation::TenantGenerations, move |conn| { + let result = tenant_shards + .filter(tenant_id.eq(filter_tenant_id.to_string())) + .select(TenantShardPersistence::as_select()) + .order(shard_number) + .load(conn)?; + Ok(result) + }) + .await?; + + Ok(rows + .into_iter() + .map(|p| ShardGenerationState { + tenant_shard_id: p + .get_tenant_shard_id() + .expect("Corrupt tenant shard id in database"), + generation: p.generation.map(|g| Generation::new(g as u32)), + generation_pageserver: p.generation_pageserver.map(|n| NodeId(n as u64)), + }) + .collect()) + } + + /// Read the generation number of specific tenant shards + /// + /// Output is unsorted. Output may not include values for all inputs, if they are missing in the database. + pub(crate) async fn shard_generations( + &self, + mut tenant_shard_ids: impl Iterator, + ) -> Result)>, DatabaseError> { + let mut rows = Vec::with_capacity(tenant_shard_ids.size_hint().0); + + // We will chunk our input to avoid composing arbitrarily long `IN` clauses. Typically we are + // called with a single digit number of IDs, but in principle we could be called with tens + // of thousands (all the shards on one pageserver) from the generation validation API. + loop { + // A modest hardcoded chunk size to handle typical cases in a single query but never generate particularly + // large query strings. 
+ let chunk_ids = tenant_shard_ids.by_ref().take(32); + + // Compose a comma separated list of tuples for matching on (tenant_id, shard_number, shard_count) + let in_clause = chunk_ids + .map(|tsid| { + format!( + "('{}', {}, {})", + tsid.tenant_id, tsid.shard_number.0, tsid.shard_count.0 + ) + }) + .join(","); + + // We are done when our iterator gives us nothing to filter on + if in_clause.is_empty() { + break; + } + + let chunk_rows = self + .with_measured_conn(DatabaseOperation::ShardGenerations, move |conn| { + // diesel doesn't support multi-column IN queries, so we compose raw SQL. No escaping is required because + // the inputs are strongly typed and cannot carry any user-supplied raw string content. + let result : Vec = diesel::sql_query( + format!("SELECT * from tenant_shards where (tenant_id, shard_number, shard_count) in ({in_clause});").as_str() + ).load(conn)?; + + Ok(result) + }) + .await?; + rows.extend(chunk_rows.into_iter()) + } + + Ok(rows + .into_iter() + .map(|tsp| { + ( + tsp.get_tenant_shard_id() + .expect("Bad tenant ID in database"), + tsp.generation.map(|g| Generation::new(g as u32)), + ) + }) + .collect()) + } + #[allow(non_local_definitions)] /// For use when updating a persistent property of a tenant, such as its config or placement_policy. 
/// @@ -560,6 +665,33 @@ impl Persistence { Ok(()) } + pub(crate) async fn set_tenant_shard_preferred_azs( + &self, + preferred_azs: Vec<(TenantShardId, String)>, + ) -> DatabaseResult> { + use crate::schema::tenant_shards::dsl::*; + + self.with_measured_conn(DatabaseOperation::SetPreferredAzs, move |conn| { + let mut shards_updated = Vec::default(); + + for (tenant_shard_id, preferred_az) in preferred_azs.iter() { + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) + .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) + .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) + .set(preferred_az_id.eq(preferred_az)) + .execute(conn)?; + + if updated == 1 { + shards_updated.push((*tenant_shard_id, preferred_az.clone())); + } + } + + Ok(shards_updated) + }) + .await + } + pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> { use crate::schema::tenant_shards::dsl::*; self.with_measured_conn(DatabaseOperation::Detach, move |conn| { @@ -868,10 +1000,54 @@ impl Persistence { Ok(()) } + + pub(crate) async fn safekeeper_get( + &self, + id: i64, + ) -> Result { + use crate::schema::safekeepers::dsl::{id as id_column, safekeepers}; + self.with_conn(move |conn| -> DatabaseResult { + Ok(safekeepers + .filter(id_column.eq(&id)) + .select(SafekeeperPersistence::as_select()) + .get_result(conn)?) 
+ }) + .await + } + + pub(crate) async fn safekeeper_upsert( + &self, + record: SafekeeperPersistence, + ) -> Result<(), DatabaseError> { + use crate::schema::safekeepers::dsl::*; + + self.with_conn(move |conn| -> DatabaseResult<()> { + let bind = record.as_insert_or_update(); + + let inserted_updated = diesel::insert_into(safekeepers) + .values(&bind) + .on_conflict(id) + .do_update() + .set(&bind) + .execute(conn)?; + + if inserted_updated != 1 { + return Err(DatabaseError::Logical(format!( + "unexpected number of rows ({})", + inserted_updated + ))); + } + + Ok(()) + }) + .await + } } /// Parts of [`crate::tenant_shard::TenantShard`] that are stored durably -#[derive(Queryable, Selectable, Insertable, Serialize, Deserialize, Clone, Eq, PartialEq)] +#[derive( + QueryableByName, Queryable, Selectable, Insertable, Serialize, Deserialize, Clone, Eq, PartialEq, +)] #[diesel(table_name = crate::schema::tenant_shards)] pub(crate) struct TenantShardPersistence { #[serde(default)] @@ -902,6 +1078,11 @@ pub(crate) struct TenantShardPersistence { pub(crate) config: String, #[serde(default)] pub(crate) scheduling_policy: String, + + // Hint that we should attempt to schedule this tenant shard the given + // availability zone in order to minimise the chances of cross-AZ communication + // with compute. + pub(crate) preferred_az_id: Option, } impl TenantShardPersistence { @@ -936,6 +1117,7 @@ pub(crate) struct NodePersistence { pub(crate) listen_http_port: i32, pub(crate) listen_pg_addr: String, pub(crate) listen_pg_port: i32, + pub(crate) availability_zone_id: String, } /// Tenant metadata health status that are stored durably. 
@@ -1002,3 +1184,47 @@ pub(crate) struct ControllerPersistence { pub(crate) address: String, pub(crate) started_at: chrono::DateTime, } + +#[derive(Serialize, Deserialize, Queryable, Selectable, Eq, PartialEq, Debug, Clone)] +#[diesel(table_name = crate::schema::safekeepers)] +pub(crate) struct SafekeeperPersistence { + pub(crate) id: i64, + pub(crate) region_id: String, + /// 1 is special, it means just created (not currently posted to storcon). + /// Zero or negative is not really expected. + /// Otherwise the number from `release-$(number_of_commits_on_branch)` tag. + pub(crate) version: i64, + pub(crate) host: String, + pub(crate) port: i32, + pub(crate) active: bool, + pub(crate) http_port: i32, + pub(crate) availability_zone_id: String, +} + +impl SafekeeperPersistence { + fn as_insert_or_update(&self) -> InsertUpdateSafekeeper<'_> { + InsertUpdateSafekeeper { + id: self.id, + region_id: &self.region_id, + version: self.version, + host: &self.host, + port: self.port, + active: self.active, + http_port: self.http_port, + availability_zone_id: &self.availability_zone_id, + } + } +} + +#[derive(Insertable, AsChangeset)] +#[diesel(table_name = crate::schema::safekeepers)] +struct InsertUpdateSafekeeper<'a> { + id: i64, + region_id: &'a str, + version: i64, + host: &'a str, + port: i32, + active: bool, + http_port: i32, + availability_zone_id: &'a str, +} diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index 94db879ade..83b7b2b4f2 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -12,10 +12,12 @@ use std::collections::HashMap; use std::sync::Arc; use std::time::{Duration, Instant}; use tokio_util::sync::CancellationToken; +use utils::backoff::exponential_backoff; use utils::failpoint_support; use utils::generation::Generation; use utils::id::{NodeId, TimelineId}; use utils::lsn::Lsn; +use utils::pausable_failpoint; use utils::sync::gate::GateGuard; use 
crate::compute_hook::{ComputeHook, NotifyError}; @@ -568,6 +570,7 @@ impl Reconciler { // During a live migration it is unhelpful to proceed if we couldn't notify compute: if we detach // the origin without notifying compute, we will render the tenant unavailable. + let mut notify_attempts = 0; while let Err(e) = self.compute_notify().await { match e { NotifyError::Fatal(_) => return Err(ReconcileError::Notify(e)), @@ -578,8 +581,21 @@ impl Reconciler { ); } } + + exponential_backoff( + notify_attempts, + // Generous waits: control plane operations which might be blocking us usually complete on the order + // of hundreds to thousands of milliseconds, so no point busy polling. + 1.0, + 10.0, + &self.cancel, + ) + .await; + notify_attempts += 1; } + pausable_failpoint!("reconciler-live-migrate-post-notify"); + // Downgrade the origin to secondary. If the tenant's policy is PlacementPolicy::Attached(0), then // this location will be deleted in the general case reconciliation that runs after this. 
let origin_secondary_conf = build_location_config( diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs index 843159010d..deb5f27226 100644 --- a/storage_controller/src/scheduler.rs +++ b/storage_controller/src/scheduler.rs @@ -1,6 +1,6 @@ use crate::{node::Node, tenant_shard::TenantShard}; use itertools::Itertools; -use pageserver_api::controller_api::UtilizationScore; +use pageserver_api::models::PageserverUtilization; use serde::Serialize; use std::collections::HashMap; use utils::{http::error::ApiError, id::NodeId}; @@ -20,9 +20,9 @@ impl From for ApiError { } } -#[derive(Serialize, Eq, PartialEq)] +#[derive(Serialize)] pub enum MaySchedule { - Yes(UtilizationScore), + Yes(PageserverUtilization), No, } @@ -282,6 +282,28 @@ impl Scheduler { node.shard_count -= 1; } } + + // Maybe update PageserverUtilization + match update { + RefCountUpdate::AddSecondary | RefCountUpdate::Attach => { + // Referencing the node: if this takes our shard_count above the utilization structure's + // shard count, then artificially bump it: this ensures that the scheduler immediately + // recognizes that this node has more work on it, without waiting for the next heartbeat + // to update the utilization. + if let MaySchedule::Yes(utilization) = &mut node.may_schedule { + utilization.adjust_shard_count_max(node.shard_count as u32); + } + } + RefCountUpdate::PromoteSecondary + | RefCountUpdate::Detach + | RefCountUpdate::RemoveSecondary + | RefCountUpdate::DemoteAttached => { + // De-referencing the node: leave the utilization's shard_count at a stale higher + // value until some future heartbeat after we have physically removed this shard + // from the node: this prevents the scheduler over-optimistically trying to schedule + // more work onto the node before earlier detaches are done.
+ } + } } // Check if the number of shards attached to a given node is lagging below @@ -326,7 +348,18 @@ impl Scheduler { use std::collections::hash_map::Entry::*; match self.nodes.entry(node.get_id()) { Occupied(mut entry) => { - entry.get_mut().may_schedule = node.may_schedule(); + // Updates to MaySchedule are how we receive updated PageserverUtilization: adjust these values + // to account for any shards scheduled on the controller but not yet visible to the pageserver. + let mut may_schedule = node.may_schedule(); + match &mut may_schedule { + MaySchedule::Yes(utilization) => { + utilization.adjust_shard_count_max(entry.get().shard_count as u32); + } + MaySchedule::No => { // Nothing to tweak + } + } + + entry.get_mut().may_schedule = may_schedule; } Vacant(entry) => { entry.insert(SchedulerNode { @@ -363,7 +396,7 @@ impl Scheduler { let may_schedule = self .nodes .get(node_id) - .map(|n| n.may_schedule != MaySchedule::No) + .map(|n| !matches!(n.may_schedule, MaySchedule::No)) .unwrap_or(false); (*node_id, may_schedule) }) @@ -383,7 +416,7 @@ impl Scheduler { /// the same tenant on the same node. This is a soft constraint: the context will never /// cause us to fail to schedule a shard. 
pub(crate) fn schedule_shard( - &self, + &mut self, hard_exclude: &[NodeId], context: &ScheduleContext, ) -> Result { @@ -391,31 +424,41 @@ impl Scheduler { return Err(ScheduleError::NoPageservers); } - let mut scores: Vec<(NodeId, AffinityScore, usize, usize)> = self + let mut scores: Vec<(NodeId, AffinityScore, u64, usize)> = self .nodes - .iter() - .filter_map(|(k, v)| { - if hard_exclude.contains(k) || v.may_schedule == MaySchedule::No { - None - } else { - Some(( - *k, - context.nodes.get(k).copied().unwrap_or(AffinityScore::FREE), - v.shard_count, - v.attached_shard_count, - )) - } + .iter_mut() + .filter_map(|(k, v)| match &mut v.may_schedule { + MaySchedule::No => None, + MaySchedule::Yes(_) if hard_exclude.contains(k) => None, + MaySchedule::Yes(utilization) => Some(( + *k, + context.nodes.get(k).copied().unwrap_or(AffinityScore::FREE), + utilization.cached_score(), + v.attached_shard_count, + )), }) .collect(); + // Exclude nodes whose utilization is critically high, if there are alternatives available. This will + // cause us to violate affinity rules if it is necessary to avoid critically overloading nodes: for example + // we may place shards in the same tenant together on the same pageserver if all other pageservers are + // overloaded. + let non_overloaded_scores = scores + .iter() + .filter(|i| !PageserverUtilization::is_overloaded(i.2)) + .copied() + .collect::>(); + if !non_overloaded_scores.is_empty() { + scores = non_overloaded_scores; + } + // Sort by, in order of precedence: // 1st: Affinity score. We should never pick a higher-score node if a lower-score node is available - // 2nd: Attached shard count. Within nodes with the same affinity, we always pick the node with - // the least number of attached shards. - // 3rd: Total shard count. Within nodes with the same affinity and attached shard count, use nodes - // with the lower total shard count. 
+ // 2nd: Utilization score (this combines shard count and disk utilization) + // 3rd: Attached shard count. When nodes have identical utilization (e.g. when populating some + // empty nodes), this acts as an anti-affinity between attached shards. // 4th: Node ID. This is a convenience to make selection deterministic in tests and empty systems. - scores.sort_by_key(|i| (i.1, i.3, i.2, i.0)); + scores.sort_by_key(|i| (i.1, i.2, i.3, i.0)); if scores.is_empty() { // After applying constraints, no pageservers were left. @@ -429,7 +472,7 @@ impl Scheduler { for (node_id, node) in &self.nodes { tracing::info!( "Node {node_id}: may_schedule={} shards={}", - node.may_schedule != MaySchedule::No, + !matches!(node.may_schedule, MaySchedule::No), node.shard_count ); } @@ -469,7 +512,7 @@ impl Scheduler { pub(crate) mod test_utils { use crate::node::Node; - use pageserver_api::controller_api::{NodeAvailability, UtilizationScore}; + use pageserver_api::{controller_api::NodeAvailability, models::utilization::test_utilization}; use std::collections::HashMap; use utils::id::NodeId; /// Test helper: synthesize the requested number of nodes, all in active state. 
@@ -485,8 +528,9 @@ pub(crate) mod test_utils { 80 + i as u16, format!("pghost-{i}"), 5432 + i as u16, + "test-az".to_string(), ); - node.set_availability(NodeAvailability::Active(UtilizationScore::worst())); + node.set_availability(NodeAvailability::Active(test_utilization::simple(0, 0))); assert!(node.is_available()); node }) @@ -497,6 +541,8 @@ pub(crate) mod test_utils { #[cfg(test)] mod tests { + use pageserver_api::{controller_api::NodeAvailability, models::utilization::test_utilization}; + use super::*; use crate::tenant_shard::IntentState; @@ -557,4 +603,130 @@ mod tests { Ok(()) } + + #[test] + /// Test the PageserverUtilization's contribution to scheduling algorithm + fn scheduler_utilization() { + let mut nodes = test_utils::make_test_nodes(3); + let mut scheduler = Scheduler::new(nodes.values()); + + // Need to keep these alive because they contribute to shard counts via RAII + let mut scheduled_intents = Vec::new(); + + let empty_context = ScheduleContext::default(); + + fn assert_scheduler_chooses( + expect_node: NodeId, + scheduled_intents: &mut Vec, + scheduler: &mut Scheduler, + context: &ScheduleContext, + ) { + let scheduled = scheduler.schedule_shard(&[], context).unwrap(); + let mut intent = IntentState::new(); + intent.set_attached(scheduler, Some(scheduled)); + scheduled_intents.push(intent); + assert_eq!(scheduled, expect_node); + } + + // Independent schedule calls onto empty nodes should round-robin, because each node's + // utilization's shard count is updated inline. The order is deterministic because when all other factors are + // equal, we order by node ID.
+ assert_scheduler_chooses( + NodeId(1), + &mut scheduled_intents, + &mut scheduler, + &empty_context, + ); + assert_scheduler_chooses( + NodeId(2), + &mut scheduled_intents, + &mut scheduler, + &empty_context, + ); + assert_scheduler_chooses( + NodeId(3), + &mut scheduled_intents, + &mut scheduler, + &empty_context, + ); + + // Manually setting utilization higher should cause schedule calls to round-robin the other nodes + // which have equal utilization. + nodes + .get_mut(&NodeId(1)) + .unwrap() + .set_availability(NodeAvailability::Active(test_utilization::simple( + 10, + 1024 * 1024 * 1024, + ))); + scheduler.node_upsert(nodes.get(&NodeId(1)).unwrap()); + + assert_scheduler_chooses( + NodeId(2), + &mut scheduled_intents, + &mut scheduler, + &empty_context, + ); + assert_scheduler_chooses( + NodeId(3), + &mut scheduled_intents, + &mut scheduler, + &empty_context, + ); + assert_scheduler_chooses( + NodeId(2), + &mut scheduled_intents, + &mut scheduler, + &empty_context, + ); + assert_scheduler_chooses( + NodeId(3), + &mut scheduled_intents, + &mut scheduler, + &empty_context, + ); + + // The scheduler should prefer nodes with lower affinity score, + // even if they have higher utilization (as long as they aren't utilized at >100%) + let mut context_prefer_node1 = ScheduleContext::default(); + context_prefer_node1.avoid(&[NodeId(2), NodeId(3)]); + assert_scheduler_chooses( + NodeId(1), + &mut scheduled_intents, + &mut scheduler, + &context_prefer_node1, + ); + assert_scheduler_chooses( + NodeId(1), + &mut scheduled_intents, + &mut scheduler, + &context_prefer_node1, + ); + + // If a node is over-utilized, it will not be used even if affinity scores prefer it + nodes + .get_mut(&NodeId(1)) + .unwrap() + .set_availability(NodeAvailability::Active(test_utilization::simple( + 20000, + 1024 * 1024 * 1024, + ))); + scheduler.node_upsert(nodes.get(&NodeId(1)).unwrap()); + assert_scheduler_chooses( + NodeId(2), + &mut scheduled_intents, + &mut scheduler, + 
&context_prefer_node1, + ); + assert_scheduler_chooses( + NodeId(3), + &mut scheduled_intents, + &mut scheduler, + &context_prefer_node1, + ); + + for mut intent in scheduled_intents { + intent.clear(&mut scheduler); + } + } } diff --git a/storage_controller/src/schema.rs b/storage_controller/src/schema.rs index 77ba47e114..1717a9369d 100644 --- a/storage_controller/src/schema.rs +++ b/storage_controller/src/schema.rs @@ -25,6 +25,7 @@ diesel::table! { listen_http_port -> Int4, listen_pg_addr -> Varchar, listen_pg_port -> Int4, + availability_zone_id -> Varchar, } } @@ -40,7 +41,22 @@ diesel::table! { splitting -> Int2, config -> Text, scheduling_policy -> Varchar, + preferred_az_id -> Nullable, } } diesel::allow_tables_to_appear_in_same_query!(controllers, metadata_health, nodes, tenant_shards,); + +diesel::table! { + safekeepers { + id -> Int8, + region_id -> Text, + version -> Int8, + instance_id -> Text, + host -> Text, + port -> Int4, + active -> Bool, + http_port -> Int4, + availability_zone_id -> Text, + } +} diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 453e96bad3..be3efaf688 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -22,10 +22,10 @@ use crate::{ peer_client::GlobalObservedState, persistence::{ AbortShardSplitStatus, ControllerPersistence, DatabaseResult, MetadataHealthPersistence, - TenantFilter, + ShardGenerationState, TenantFilter, }, reconciler::{ReconcileError, ReconcileUnits, ReconcilerConfig, ReconcilerConfigBuilder}, - scheduler::{MaySchedule, ScheduleContext, ScheduleMode}, + scheduler::{MaySchedule, ScheduleContext, ScheduleError, ScheduleMode}, tenant_shard::{ MigrateAttachment, ReconcileNeeded, ReconcilerStatus, ScheduleOptimization, ScheduleOptimizationAction, @@ -41,12 +41,16 @@ use itertools::Itertools; use pageserver_api::{ controller_api::{ MetadataHealthRecord, MetadataHealthUpdateRequest, NodeAvailability, NodeRegisterRequest, - 
NodeSchedulingPolicy, PlacementPolicy, ShardSchedulingPolicy, TenantCreateRequest, + NodeSchedulingPolicy, NodeShard, NodeShardResponse, PlacementPolicy, ShardSchedulingPolicy, + ShardsPreferredAzsRequest, ShardsPreferredAzsResponse, TenantCreateRequest, TenantCreateResponse, TenantCreateResponseShard, TenantDescribeResponse, TenantDescribeResponseShard, TenantLocateResponse, TenantPolicyRequest, - TenantShardMigrateRequest, TenantShardMigrateResponse, UtilizationScore, + TenantShardMigrateRequest, TenantShardMigrateResponse, + }, + models::{ + SecondaryProgress, TenantConfigRequest, TimelineArchivalConfigRequest, + TopTenantShardsRequest, }, - models::{SecondaryProgress, TenantConfigRequest, TopTenantShardsRequest}, }; use reqwest::StatusCode; use tracing::{instrument, Instrument}; @@ -65,7 +69,7 @@ use pageserver_api::{ ValidateResponse, ValidateResponseTenant, }, }; -use pageserver_client::mgmt_api; +use pageserver_client::{mgmt_api, BlockUnblock}; use tokio::sync::mpsc::error::TrySendError; use tokio_util::sync::CancellationToken; use utils::{ @@ -113,11 +117,16 @@ pub(crate) const STARTUP_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30); pub const MAX_OFFLINE_INTERVAL_DEFAULT: Duration = Duration::from_secs(30); /// How long a node may be unresponsive to heartbeats during start up before we declare it -/// offline. This is much more lenient than [`MAX_OFFLINE_INTERVAL_DEFAULT`] since the pageserver's +/// offline. +/// +/// This is much more lenient than [`MAX_OFFLINE_INTERVAL_DEFAULT`] since the pageserver's /// handling of the re-attach response may take a long time and blocks heartbeats from /// being handled on the pageserver side. pub const MAX_WARMING_UP_INTERVAL_DEFAULT: Duration = Duration::from_secs(300); +/// How often to send heartbeats to registered nodes? 
+pub const HEARTBEAT_INTERVAL_DEFAULT: Duration = Duration::from_secs(5); + #[derive(Clone, strum_macros::Display)] enum TenantOperations { Create, @@ -131,7 +140,9 @@ enum TenantOperations { TimelineCreate, TimelineDelete, AttachHook, + TimelineArchivalConfig, TimelineDetachAncestor, + TimelineGcBlockUnblock, } #[derive(Clone, strum_macros::Display)] @@ -322,6 +333,8 @@ pub struct Config { // upgraded to primary. pub max_secondary_lag_bytes: Option, + pub heartbeat_interval: Duration, + pub address_for_peers: Option, pub start_as_candidate: bool, @@ -344,6 +357,12 @@ impl From for ApiError { } } +enum InitialShardScheduleOutcome { + Scheduled(TenantCreateResponseShard), + NotScheduled, + ShardScheduleError(ScheduleError), +} + pub struct Service { inner: Arc>, config: Config, @@ -433,7 +452,7 @@ struct ShardSplitParams { // When preparing for a shard split, we may either choose to proceed with the split, // or find that the work is already done and return NoOp. enum ShardSplitAction { - Split(ShardSplitParams), + Split(Box), NoOp(TenantShardSplitResponse), } @@ -542,7 +561,7 @@ impl Service { let locked = self.inner.read().unwrap(); locked.nodes.clone() }; - let nodes_online = self.initial_heartbeat_round(all_nodes.keys()).await; + let mut nodes_online = self.initial_heartbeat_round(all_nodes.keys()).await; // List of tenants for which we will attempt to notify compute of their location at startup let mut compute_notifications = Vec::new(); @@ -556,10 +575,8 @@ impl Service { // Mark nodes online if they responded to us: nodes are offline by default after a restart. 
let mut new_nodes = (**nodes).clone(); for (node_id, node) in new_nodes.iter_mut() { - if let Some(utilization) = nodes_online.get(node_id) { - node.set_availability(NodeAvailability::Active(UtilizationScore( - utilization.utilization_score, - ))); + if let Some(utilization) = nodes_online.remove(node_id) { + node.set_availability(NodeAvailability::Active(utilization)); scheduler.node_upsert(node); } } @@ -907,9 +924,7 @@ impl Service { async fn spawn_heartbeat_driver(&self) { self.startup_complete.clone().wait().await; - const HEARTBEAT_INTERVAL: Duration = Duration::from_secs(5); - - let mut interval = tokio::time::interval(HEARTBEAT_INTERVAL); + let mut interval = tokio::time::interval(self.config.heartbeat_interval); while !self.cancel.is_cancelled() { tokio::select! { _ = interval.tick() => { } @@ -925,9 +940,9 @@ impl Service { if let Ok(deltas) = res { for (node_id, state) in deltas.0 { let new_availability = match state { - PageserverState::Available { utilization, .. } => NodeAvailability::Active( - UtilizationScore(utilization.utilization_score), - ), + PageserverState::Available { utilization, .. } => { + NodeAvailability::Active(utilization) + } PageserverState::WarmingUp { started_at } => { NodeAvailability::WarmingUp(started_at) } @@ -936,14 +951,17 @@ impl Service { // while the heartbeat round was on-going. Hence, filter out // offline transitions for WarmingUp nodes that are still within // their grace period. 
- if let Ok(NodeAvailability::WarmingUp(started_at)) = - self.get_node(node_id).await.map(|n| n.get_availability()) + if let Ok(NodeAvailability::WarmingUp(started_at)) = self + .get_node(node_id) + .await + .as_ref() + .map(|n| n.get_availability()) { let now = Instant::now(); - if now - started_at >= self.config.max_warming_up_interval { + if now - *started_at >= self.config.max_warming_up_interval { NodeAvailability::Offline } else { - NodeAvailability::WarmingUp(started_at) + NodeAvailability::WarmingUp(*started_at) } } else { NodeAvailability::Offline @@ -1256,6 +1274,7 @@ impl Service { 123, "".to_string(), 123, + "test_az".to_string(), ); scheduler.node_upsert(&node); @@ -1443,6 +1462,7 @@ impl Service { splitting: SplitState::default(), scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default()) .unwrap(), + preferred_az_id: None, }; match self.persistence.insert_tenant_shards(vec![tsp]).await { @@ -1625,7 +1645,7 @@ impl Service { // This Node is a mutable local copy: we will set it active so that we can use its // API client to reconcile with the node. The Node in [`Self::nodes`] will get updated // later. 
- node.set_availability(NodeAvailability::Active(UtilizationScore::worst())); + node.set_availability(NodeAvailability::Active(PageserverUtilization::full())); let configs = match node .with_client_retries( @@ -1845,37 +1865,74 @@ impl Service { Ok(response) } - pub(crate) fn validate(&self, validate_req: ValidateRequest) -> ValidateResponse { - let locked = self.inner.read().unwrap(); + pub(crate) async fn validate( + &self, + validate_req: ValidateRequest, + ) -> Result { + // Fast in-memory check: we may reject validation on anything that doesn't match our + // in-memory generation for a shard + let in_memory_result = { + let mut in_memory_result = Vec::new(); + let locked = self.inner.read().unwrap(); + for req_tenant in validate_req.tenants { + if let Some(tenant_shard) = locked.tenants.get(&req_tenant.id) { + let valid = tenant_shard.generation == Some(Generation::new(req_tenant.gen)); + tracing::info!( + "handle_validate: {}(gen {}): valid={valid} (latest {:?})", + req_tenant.id, + req_tenant.gen, + tenant_shard.generation + ); + + in_memory_result.push((req_tenant.id, Generation::new(req_tenant.gen), valid)); + } else { + // This is legal: for example during a shard split the pageserver may still + // have deletions in its queue from the old pre-split shard, or after deletion + // of a tenant that was busy with compaction/gc while being deleted. + tracing::info!( + "Refusing deletion validation for missing shard {}", + req_tenant.id + ); + } + } + + in_memory_result + }; + + // Database calls to confirm validity for anything that passed the in-memory check. We must do this + // in case of controller split-brain, where some other controller process might have incremented the generation. 
+ let db_generations = self + .persistence + .shard_generations(in_memory_result.iter().filter_map(|i| { + if i.2 { + Some(&i.0) + } else { + None + } + })) + .await?; + let db_generations = db_generations.into_iter().collect::>(); let mut response = ValidateResponse { tenants: Vec::new(), }; - - for req_tenant in validate_req.tenants { - if let Some(tenant_shard) = locked.tenants.get(&req_tenant.id) { - let valid = tenant_shard.generation == Some(Generation::new(req_tenant.gen)); - tracing::info!( - "handle_validate: {}(gen {}): valid={valid} (latest {:?})", - req_tenant.id, - req_tenant.gen, - tenant_shard.generation - ); - response.tenants.push(ValidateResponseTenant { - id: req_tenant.id, - valid, - }); + for (tenant_shard_id, validate_generation, valid) in in_memory_result.into_iter() { + let valid = if valid { + let db_generation = db_generations.get(&tenant_shard_id); + db_generation == Some(&Some(validate_generation)) } else { - // After tenant deletion, we may approve any validation. This avoids - // spurious warnings on the pageserver if it has pending LSN updates - // at the point a deletion happens. - response.tenants.push(ValidateResponseTenant { - id: req_tenant.id, - valid: true, - }); - } + // If in-memory state says it's invalid, trust that. It's always safe to fail a validation, at worst + // this prevents a pageserver from cleaning up an object in S3. 
+ false + }; + + response.tenants.push(ValidateResponseTenant { + id: tenant_shard_id, + valid, + }) } - response + + Ok(response) } pub(crate) async fn tenant_create( @@ -1977,6 +2034,7 @@ impl Service { splitting: SplitState::default(), scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default()) .unwrap(), + preferred_az_id: None, }) .collect(); @@ -2000,99 +2058,87 @@ impl Service { }; let mut schedule_context = ScheduleContext::default(); + let mut schedule_error = None; + let mut response_shards = Vec::new(); + for tenant_shard_id in create_ids { + tracing::info!("Creating shard {tenant_shard_id}..."); - let (waiters, response_shards) = { + let outcome = self + .do_initial_shard_scheduling( + tenant_shard_id, + initial_generation, + &create_req.shard_parameters, + create_req.config.clone(), + placement_policy.clone(), + &mut schedule_context, + ) + .await; + + match outcome { + InitialShardScheduleOutcome::Scheduled(resp) => response_shards.push(resp), + InitialShardScheduleOutcome::NotScheduled => {} + InitialShardScheduleOutcome::ShardScheduleError(err) => { + schedule_error = Some(err); + } + } + } + + let preferred_azs = { + let locked = self.inner.read().unwrap(); + response_shards + .iter() + .filter_map(|resp| { + let az_id = locked + .nodes + .get(&resp.node_id) + .map(|n| n.get_availability_zone_id().to_string())?; + + Some((resp.shard_id, az_id)) + }) + .collect::>() + }; + + // Note that we persist the preferred AZ for the new shards separately. + // In theory, we could "peek" the scheduler to determine where the shard will + // land, but the subsequent "real" call into the scheduler might select a different + // node. Hence, we do this awkward update to keep things consistent. 
+ let updated = self + .persistence + .set_tenant_shard_preferred_azs(preferred_azs) + .await + .map_err(|err| { + ApiError::InternalServerError(anyhow::anyhow!( + "Failed to persist preferred az ids: {err}" + )) + })?; + + { let mut locked = self.inner.write().unwrap(); - let (nodes, tenants, scheduler) = locked.parts_mut(); - - let mut response_shards = Vec::new(); - let mut schcedule_error = None; - - for tenant_shard_id in create_ids { - tracing::info!("Creating shard {tenant_shard_id}..."); - - use std::collections::btree_map::Entry; - match tenants.entry(tenant_shard_id) { - Entry::Occupied(mut entry) => { - tracing::info!( - "Tenant shard {tenant_shard_id} already exists while creating" - ); - - // TODO: schedule() should take an anti-affinity expression that pushes - // attached and secondary locations (independently) away frorm those - // pageservers also holding a shard for this tenant. - - entry - .get_mut() - .schedule(scheduler, &mut schedule_context) - .map_err(|e| { - ApiError::Conflict(format!( - "Failed to schedule shard {tenant_shard_id}: {e}" - )) - })?; - - if let Some(node_id) = entry.get().intent.get_attached() { - let generation = entry - .get() - .generation - .expect("Generation is set when in attached mode"); - response_shards.push(TenantCreateResponseShard { - shard_id: tenant_shard_id, - node_id: *node_id, - generation: generation.into().unwrap(), - }); - } - - continue; - } - Entry::Vacant(entry) => { - let state = entry.insert(TenantShard::new( - tenant_shard_id, - ShardIdentity::from_params( - tenant_shard_id.shard_number, - &create_req.shard_parameters, - ), - placement_policy.clone(), - )); - - state.generation = initial_generation; - state.config = create_req.config.clone(); - if let Err(e) = state.schedule(scheduler, &mut schedule_context) { - schcedule_error = Some(e); - } - - // Only include shards in result if we are attaching: the purpose - // of the response is to tell the caller where the shards are attached. 
- if let Some(node_id) = state.intent.get_attached() { - let generation = state - .generation - .expect("Generation is set when in attached mode"); - response_shards.push(TenantCreateResponseShard { - shard_id: tenant_shard_id, - node_id: *node_id, - generation: generation.into().unwrap(), - }); - } - } - }; + for (tid, az_id) in updated { + if let Some(shard) = locked.tenants.get_mut(&tid) { + shard.set_preferred_az(az_id); + } } + } - // If we failed to schedule shards, then they are still created in the controller, - // but we return an error to the requester to avoid a silent failure when someone - // tries to e.g. create a tenant whose placement policy requires more nodes than - // are present in the system. We do this here rather than in the above loop, to - // avoid situations where we only create a subset of shards in the tenant. - if let Some(e) = schcedule_error { - return Err(ApiError::Conflict(format!( - "Failed to schedule shard(s): {e}" - ))); - } + // If we failed to schedule shards, then they are still created in the controller, + // but we return an error to the requester to avoid a silent failure when someone + // tries to e.g. create a tenant whose placement policy requires more nodes than + // are present in the system. We do this here rather than in the above loop, to + // avoid situations where we only create a subset of shards in the tenant. + if let Some(e) = schedule_error { + return Err(ApiError::Conflict(format!( + "Failed to schedule shard(s): {e}" + ))); + } - let waiters = tenants + let waiters = { + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, _scheduler) = locked.parts_mut(); + tenants .range_mut(TenantShardId::tenant_range(tenant_id)) .filter_map(|(_shard_id, shard)| self.maybe_reconcile_shard(shard, nodes)) - .collect::>(); - (waiters, response_shards) + .collect::>() }; Ok(( @@ -2103,6 +2149,78 @@ impl Service { )) } + /// Helper for tenant creation that does the scheduling for an individual shard. 
Covers both the + /// case of a new tenant and a pre-existing one. + async fn do_initial_shard_scheduling( + &self, + tenant_shard_id: TenantShardId, + initial_generation: Option, + shard_params: &ShardParameters, + config: TenantConfig, + placement_policy: PlacementPolicy, + schedule_context: &mut ScheduleContext, + ) -> InitialShardScheduleOutcome { + let mut locked = self.inner.write().unwrap(); + let (_nodes, tenants, scheduler) = locked.parts_mut(); + + use std::collections::btree_map::Entry; + match tenants.entry(tenant_shard_id) { + Entry::Occupied(mut entry) => { + tracing::info!("Tenant shard {tenant_shard_id} already exists while creating"); + + // TODO: schedule() should take an anti-affinity expression that pushes + // attached and secondary locations (independently) away frorm those + // pageservers also holding a shard for this tenant. + + if let Err(err) = entry.get_mut().schedule(scheduler, schedule_context) { + return InitialShardScheduleOutcome::ShardScheduleError(err); + } + + if let Some(node_id) = entry.get().intent.get_attached() { + let generation = entry + .get() + .generation + .expect("Generation is set when in attached mode"); + InitialShardScheduleOutcome::Scheduled(TenantCreateResponseShard { + shard_id: tenant_shard_id, + node_id: *node_id, + generation: generation.into().unwrap(), + }) + } else { + InitialShardScheduleOutcome::NotScheduled + } + } + Entry::Vacant(entry) => { + let state = entry.insert(TenantShard::new( + tenant_shard_id, + ShardIdentity::from_params(tenant_shard_id.shard_number, shard_params), + placement_policy, + )); + + state.generation = initial_generation; + state.config = config; + if let Err(e) = state.schedule(scheduler, schedule_context) { + return InitialShardScheduleOutcome::ShardScheduleError(e); + } + + // Only include shards in result if we are attaching: the purpose + // of the response is to tell the caller where the shards are attached. 
+ if let Some(node_id) = state.intent.get_attached() { + let generation = state + .generation + .expect("Generation is set when in attached mode"); + InitialShardScheduleOutcome::Scheduled(TenantCreateResponseShard { + shard_id: tenant_shard_id, + node_id: *node_id, + generation: generation.into().unwrap(), + }) + } else { + InitialShardScheduleOutcome::NotScheduled + } + } + } + } + /// Helper for functions that reconcile a number of shards, and would like to do a timeout-bounded /// wait for reconciliation to complete before responding. async fn await_waiters( @@ -2473,7 +2591,7 @@ impl Service { .await; let node = { - let locked = self.inner.read().unwrap(); + let mut locked = self.inner.write().unwrap(); // Just a sanity check to prevent misuse: the API expects that the tenant is fully // detached everywhere, and nothing writes to S3 storage. Here, we verify that, // but only at the start of the process, so it's really just to prevent operator @@ -2500,7 +2618,7 @@ impl Service { return Err(ApiError::InternalServerError(anyhow::anyhow!("We observed attached={mode:?} tenant in node_id={node_id} shard with tenant_shard_id={shard_id}"))); } } - let scheduler = &locked.scheduler; + let scheduler = &mut locked.scheduler; // Right now we only perform the operation on a single node without parallelization // TODO fan out the operation to multiple nodes for better performance let node_id = scheduler.schedule_shard(&[], &ScheduleContext::default())?; @@ -2853,82 +2971,134 @@ impl Service { .await; failpoint_support::sleep_millis_async!("tenant-create-timeline-shared-lock"); - self.ensure_attached_wait(tenant_id).await?; + self.tenant_remote_mutation(tenant_id, move |mut targets| async move { + if targets.is_empty() { + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant not found").into(), + )); + }; + let shard_zero = targets.remove(0); - let mut targets = { - let locked = self.inner.read().unwrap(); - let mut targets = Vec::new(); + async fn create_one( + 
tenant_shard_id: TenantShardId, + node: Node, + jwt: Option, + create_req: TimelineCreateRequest, + ) -> Result { + tracing::info!( + "Creating timeline on shard {}/{}, attached to node {node}", + tenant_shard_id, + create_req.new_timeline_id, + ); + let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref()); - for (tenant_shard_id, shard) in - locked.tenants.range(TenantShardId::tenant_range(tenant_id)) - { - let node_id = shard.intent.get_attached().ok_or_else(|| { - ApiError::InternalServerError(anyhow::anyhow!("Shard not scheduled")) - })?; - let node = locked - .nodes - .get(&node_id) - .expect("Pageservers may not be deleted while referenced"); - - targets.push((*tenant_shard_id, node.clone())); + client + .timeline_create(tenant_shard_id, &create_req) + .await + .map_err(|e| passthrough_api_error(&node, e)) } - targets - }; - if targets.is_empty() { - return Err(ApiError::NotFound( - anyhow::anyhow!("Tenant not found").into(), - )); - }; - let shard_zero = targets.remove(0); - - async fn create_one( - tenant_shard_id: TenantShardId, - node: Node, - jwt: Option, - create_req: TimelineCreateRequest, - ) -> Result { - tracing::info!( - "Creating timeline on shard {}/{}, attached to node {node}", - tenant_shard_id, - create_req.new_timeline_id, - ); - let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref()); - - client - .timeline_create(tenant_shard_id, &create_req) - .await - .map_err(|e| passthrough_api_error(&node, e)) - } - - // Because the caller might not provide an explicit LSN, we must do the creation first on a single shard, and then - // use whatever LSN that shard picked when creating on subsequent shards. We arbitrarily use shard zero as the shard - // that will get the first creation request, and propagate the LSN to all the >0 shards. 
- let timeline_info = create_one( - shard_zero.0, - shard_zero.1, - self.config.jwt_token.clone(), - create_req.clone(), - ) - .await?; - - // Propagate the LSN that shard zero picked, if caller didn't provide one - if create_req.ancestor_timeline_id.is_some() && create_req.ancestor_start_lsn.is_none() { - create_req.ancestor_start_lsn = timeline_info.ancestor_lsn; - } - - // Create timeline on remaining shards with number >0 - if !targets.is_empty() { - // If we had multiple shards, issue requests for the remainder now. - let jwt = &self.config.jwt_token; - self.tenant_for_shards(targets, |tenant_shard_id: TenantShardId, node: Node| { - let create_req = create_req.clone(); - Box::pin(create_one(tenant_shard_id, node, jwt.clone(), create_req)) - }) + // Because the caller might not provide an explicit LSN, we must do the creation first on a single shard, and then + // use whatever LSN that shard picked when creating on subsequent shards. We arbitrarily use shard zero as the shard + // that will get the first creation request, and propagate the LSN to all the >0 shards. + let timeline_info = create_one( + shard_zero.0, + shard_zero.1, + self.config.jwt_token.clone(), + create_req.clone(), + ) .await?; - } - Ok(timeline_info) + // Propagate the LSN that shard zero picked, if caller didn't provide one + if create_req.ancestor_timeline_id.is_some() && create_req.ancestor_start_lsn.is_none() + { + create_req.ancestor_start_lsn = timeline_info.ancestor_lsn; + } + + // Create timeline on remaining shards with number >0 + if !targets.is_empty() { + // If we had multiple shards, issue requests for the remainder now. + let jwt = &self.config.jwt_token; + self.tenant_for_shards( + targets.iter().map(|t| (t.0, t.1.clone())).collect(), + |tenant_shard_id: TenantShardId, node: Node| { + let create_req = create_req.clone(); + Box::pin(create_one(tenant_shard_id, node, jwt.clone(), create_req)) + }, + ) + .await?; + } + + Ok(timeline_info) + }) + .await? 
+ } + + pub(crate) async fn tenant_timeline_archival_config( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + req: TimelineArchivalConfigRequest, + ) -> Result<(), ApiError> { + tracing::info!( + "Setting archival config of timeline {tenant_id}/{timeline_id} to '{:?}'", + req.state + ); + + let _tenant_lock = trace_shared_lock( + &self.tenant_op_locks, + tenant_id, + TenantOperations::TimelineArchivalConfig, + ) + .await; + + self.tenant_remote_mutation(tenant_id, move |targets| async move { + if targets.is_empty() { + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant not found").into(), + )); + } + async fn config_one( + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + node: Node, + jwt: Option, + req: TimelineArchivalConfigRequest, + ) -> Result<(), ApiError> { + tracing::info!( + "Setting archival config of timeline on shard {tenant_shard_id}/{timeline_id}, attached to node {node}", + ); + + let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref()); + + client + .timeline_archival_config(tenant_shard_id, timeline_id, &req) + .await + .map_err(|e| match e { + mgmt_api::Error::ApiError(StatusCode::PRECONDITION_FAILED, msg) => { + ApiError::PreconditionFailed(msg.into_boxed_str()) + } + _ => passthrough_api_error(&node, e), + }) + } + + // no shard needs to go first/last; the operation should be idempotent + // TODO: it would be great to ensure that all shards return the same error + let results = self + .tenant_for_shards(targets, |tenant_shard_id, node| { + futures::FutureExt::boxed(config_one( + tenant_shard_id, + timeline_id, + node, + self.config.jwt_token.clone(), + req.clone(), + )) + }) + .await?; + assert!(!results.is_empty(), "must have at least one result"); + + Ok(()) + }).await? 
} pub(crate) async fn tenant_timeline_detach_ancestor( @@ -2945,107 +3115,138 @@ impl Service { ) .await; - self.ensure_attached_wait(tenant_id).await?; - - let targets = { - let locked = self.inner.read().unwrap(); - let mut targets = Vec::new(); - - for (tenant_shard_id, shard) in - locked.tenants.range(TenantShardId::tenant_range(tenant_id)) - { - let node_id = shard.intent.get_attached().ok_or_else(|| { - ApiError::InternalServerError(anyhow::anyhow!("Shard not scheduled")) - })?; - let node = locked - .nodes - .get(&node_id) - .expect("Pageservers may not be deleted while referenced"); - - targets.push((*tenant_shard_id, node.clone())); + self.tenant_remote_mutation(tenant_id, move |targets| async move { + if targets.is_empty() { + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant not found").into(), + )); } - targets - }; - if targets.is_empty() { - return Err(ApiError::NotFound( - anyhow::anyhow!("Tenant not found").into(), - )); - } + async fn detach_one( + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + node: Node, + jwt: Option, + ) -> Result<(ShardNumber, models::detach_ancestor::AncestorDetached), ApiError> { + tracing::info!( + "Detaching timeline on shard {tenant_shard_id}/{timeline_id}, attached to node {node}", + ); - async fn detach_one( - tenant_shard_id: TenantShardId, - timeline_id: TimelineId, - node: Node, - jwt: Option, - ) -> Result<(ShardNumber, models::detach_ancestor::AncestorDetached), ApiError> { - tracing::info!( - "Detaching timeline on shard {tenant_shard_id}/{timeline_id}, attached to node {node}", - ); + let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref()); - let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref()); + client + .timeline_detach_ancestor(tenant_shard_id, timeline_id) + .await + .map_err(|e| { + use mgmt_api::Error; - client - .timeline_detach_ancestor(tenant_shard_id, timeline_id) - .await - .map_err(|e| { - use mgmt_api::Error; - - match e { 
- // no ancestor (ever) - Error::ApiError(StatusCode::CONFLICT, msg) => ApiError::Conflict(format!( - "{node}: {}", - msg.strip_prefix("Conflict: ").unwrap_or(&msg) - )), - // too many ancestors - Error::ApiError(StatusCode::BAD_REQUEST, msg) => { - ApiError::BadRequest(anyhow::anyhow!("{node}: {msg}")) + match e { + // no ancestor (ever) + Error::ApiError(StatusCode::CONFLICT, msg) => ApiError::Conflict(format!( + "{node}: {}", + msg.strip_prefix("Conflict: ").unwrap_or(&msg) + )), + // too many ancestors + Error::ApiError(StatusCode::BAD_REQUEST, msg) => { + ApiError::BadRequest(anyhow::anyhow!("{node}: {msg}")) + } + Error::ApiError(StatusCode::INTERNAL_SERVER_ERROR, msg) => { + // avoid turning these into conflicts to remain compatible with + // pageservers, 500 errors are sadly retryable with timeline ancestor + // detach + ApiError::InternalServerError(anyhow::anyhow!("{node}: {msg}")) + } + // rest can be mapped as usual + other => passthrough_api_error(&node, other), } - Error::ApiError(StatusCode::INTERNAL_SERVER_ERROR, msg) => { - // avoid turning these into conflicts to remain compatible with - // pageservers, 500 errors are sadly retryable with timeline ancestor - // detach - ApiError::InternalServerError(anyhow::anyhow!("{node}: {msg}")) - } - // rest can be mapped as usual - other => passthrough_api_error(&node, other), - } + }) + .map(|res| (tenant_shard_id.shard_number, res)) + } + + // no shard needs to go first/last; the operation should be idempotent + let mut results = self + .tenant_for_shards(targets, |tenant_shard_id, node| { + futures::FutureExt::boxed(detach_one( + tenant_shard_id, + timeline_id, + node, + self.config.jwt_token.clone(), + )) }) - .map(|res| (tenant_shard_id.shard_number, res)) - } + .await?; - // no shard needs to go first/last; the operation should be idempotent - let mut results = self - .tenant_for_shards(targets, |tenant_shard_id, node| { - futures::FutureExt::boxed(detach_one( + let any = results.pop().expect("we must 
have at least one response"); + + let mismatching = results + .iter() + .filter(|(_, res)| res != &any.1) + .collect::>(); + if !mismatching.is_empty() { + // this can be hit by races which should not happen because operation lock on cplane + let matching = results.len() - mismatching.len(); + tracing::error!( + matching, + compared_against=?any, + ?mismatching, + "shards returned different results" + ); + + return Err(ApiError::InternalServerError(anyhow::anyhow!("pageservers returned mixed results for ancestor detach; manual intervention is required."))); + } + + Ok(any.1) + }).await? + } + + pub(crate) async fn tenant_timeline_block_unblock_gc( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + dir: BlockUnblock, + ) -> Result<(), ApiError> { + let _tenant_lock = trace_shared_lock( + &self.tenant_op_locks, + tenant_id, + TenantOperations::TimelineGcBlockUnblock, + ) + .await; + + self.tenant_remote_mutation(tenant_id, move |targets| async move { + if targets.is_empty() { + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant not found").into(), + )); + } + + async fn do_one( + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + node: Node, + jwt: Option, + dir: BlockUnblock, + ) -> Result<(), ApiError> { + let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref()); + + client + .timeline_block_unblock_gc(tenant_shard_id, timeline_id, dir) + .await + .map_err(|e| passthrough_api_error(&node, e)) + } + + // no shard needs to go first/last; the operation should be idempotent + self.tenant_for_shards(targets, |tenant_shard_id, node| { + futures::FutureExt::boxed(do_one( tenant_shard_id, timeline_id, node, self.config.jwt_token.clone(), + dir, )) }) - .await?; - - let any = results.pop().expect("we must have at least one response"); - - let mismatching = results - .iter() - .filter(|(_, res)| res != &any.1) - .collect::>(); - if !mismatching.is_empty() { - // this can be hit by races which should not happen because 
operation lock on cplane - let matching = results.len() - mismatching.len(); - tracing::error!( - matching, - compared_against=?any, - ?mismatching, - "shards returned different results" - ); - - return Err(ApiError::InternalServerError(anyhow::anyhow!("pageservers returned mixed results for ancestor detach; manual intervention is required."))); - } - - Ok(any.1) + .await + }) + .await??; + Ok(()) } /// Helper for concurrently calling a pageserver API on a number of shards, such as timeline creation. @@ -3116,6 +3317,114 @@ impl Service { results } + /// Helper for safely working with the shards in a tenant remotely on pageservers, for example + /// when creating and deleting timelines: + /// - Makes sure shards are attached somewhere if they weren't already + /// - Looks up the shards and the nodes where they were most recently attached + /// - Guarantees that after the inner function returns, the shards' generations haven't moved on: this + /// ensures that the remote operation acted on the most recent generation, and is therefore durable. + async fn tenant_remote_mutation( + &self, + tenant_id: TenantId, + op: O, + ) -> Result + where + O: FnOnce(Vec<(TenantShardId, Node)>) -> F, + F: std::future::Future, + { + let target_gens = { + let mut targets = Vec::new(); + + // Load the currently attached pageservers for the latest generation of each shard. This can + // run concurrently with reconciliations, and it is not guaranteed that the node we find here + // will still be the latest when we're done: we will check generations again at the end of + // this function to handle that. + let generations = self.persistence.tenant_generations(tenant_id).await?; + + if generations + .iter() + .any(|i| i.generation.is_none() || i.generation_pageserver.is_none()) + { + // One or more shards has not been attached to a pageserver. 
Check if this is because it's configured + // to be detached (409: caller should give up), or because it's meant to be attached but isn't yet (503: caller should retry) + let locked = self.inner.read().unwrap(); + for (shard_id, shard) in + locked.tenants.range(TenantShardId::tenant_range(tenant_id)) + { + match shard.policy { + PlacementPolicy::Attached(_) => { + // This shard is meant to be attached: the caller is not wrong to try and + // use this function, but we can't service the request right now. + } + PlacementPolicy::Secondary | PlacementPolicy::Detached => { + return Err(ApiError::Conflict(format!( + "Shard {shard_id} tenant has policy {:?}", + shard.policy + ))); + } + } + } + + return Err(ApiError::ResourceUnavailable( + "One or more shards in tenant is not yet attached".into(), + )); + } + + let locked = self.inner.read().unwrap(); + for ShardGenerationState { + tenant_shard_id, + generation, + generation_pageserver, + } in generations + { + let node_id = generation_pageserver.expect("We checked for None above"); + let node = locked + .nodes + .get(&node_id) + .ok_or(ApiError::Conflict(format!( + "Raced with removal of node {node_id}" + )))?; + targets.push((tenant_shard_id, node.clone(), generation)); + } + + targets + }; + + let targets = target_gens.iter().map(|t| (t.0, t.1.clone())).collect(); + let result = op(targets).await; + + // Post-check: are all the generations of all the shards the same as they were initially? This proves that + // our remote operation executed on the latest generation and is therefore persistent. 
+ { + let latest_generations = self.persistence.tenant_generations(tenant_id).await?; + if latest_generations + .into_iter() + .map( + |ShardGenerationState { + tenant_shard_id, + generation, + generation_pageserver: _, + }| (tenant_shard_id, generation), + ) + .collect::>() + != target_gens + .into_iter() + .map(|i| (i.0, i.2)) + .collect::>() + { + // We raced with something that incremented the generation, and therefore cannot be + // confident that our actions are persistent (they might have hit an old generation). + // + // This is safe but requires a retry: ask the client to do that by giving them a 503 response. + return Err(ApiError::ResourceUnavailable( + "Tenant attachment changed, please retry".into(), + )); + } + } + + Ok(result) + } + pub(crate) async fn tenant_timeline_delete( &self, tenant_id: TenantId, @@ -3129,83 +3438,62 @@ impl Service { ) .await; - self.ensure_attached_wait(tenant_id).await?; - - let mut targets = { - let locked = self.inner.read().unwrap(); - let mut targets = Vec::new(); - - for (tenant_shard_id, shard) in - locked.tenants.range(TenantShardId::tenant_range(tenant_id)) - { - let node_id = shard.intent.get_attached().ok_or_else(|| { - ApiError::InternalServerError(anyhow::anyhow!("Shard not scheduled")) - })?; - let node = locked - .nodes - .get(&node_id) - .expect("Pageservers may not be deleted while referenced"); - - targets.push((*tenant_shard_id, node.clone())); + self.tenant_remote_mutation(tenant_id, move |mut targets| async move { + if targets.is_empty() { + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant not found").into(), + )); } - targets - }; + let shard_zero = targets.remove(0); - if targets.is_empty() { - return Err(ApiError::NotFound( - anyhow::anyhow!("Tenant not found").into(), - )); - } - let shard_zero = targets.remove(0); + async fn delete_one( + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + node: Node, + jwt: Option, + ) -> Result { + tracing::info!( + "Deleting timeline on shard 
{tenant_shard_id}/{timeline_id}, attached to node {node}", + ); - async fn delete_one( - tenant_shard_id: TenantShardId, - timeline_id: TimelineId, - node: Node, - jwt: Option, - ) -> Result { - tracing::info!( - "Deleting timeline on shard {tenant_shard_id}/{timeline_id}, attached to node {node}", - ); + let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref()); + client + .timeline_delete(tenant_shard_id, timeline_id) + .await + .map_err(|e| { + ApiError::InternalServerError(anyhow::anyhow!( + "Error deleting timeline {timeline_id} on {tenant_shard_id} on node {node}: {e}", + )) + }) + } - let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref()); - client - .timeline_delete(tenant_shard_id, timeline_id) - .await - .map_err(|e| { - ApiError::InternalServerError(anyhow::anyhow!( - "Error deleting timeline {timeline_id} on {tenant_shard_id} on node {node}: {e}", + let statuses = self + .tenant_for_shards(targets, |tenant_shard_id: TenantShardId, node: Node| { + Box::pin(delete_one( + tenant_shard_id, + timeline_id, + node, + self.config.jwt_token.clone(), )) }) - } + .await?; - let statuses = self - .tenant_for_shards(targets, |tenant_shard_id: TenantShardId, node: Node| { - Box::pin(delete_one( - tenant_shard_id, - timeline_id, - node, - self.config.jwt_token.clone(), - )) - }) + // If any shards >0 haven't finished deletion yet, don't start deletion on shard zero + if statuses.iter().any(|s| s != &StatusCode::NOT_FOUND) { + return Ok(StatusCode::ACCEPTED); + } + + // Delete shard zero last: this is not strictly necessary, but since a caller's GET on a timeline will be routed + // to shard zero, it gives a more obvious behavior that a GET returns 404 once the deletion is done. 
+ let shard_zero_status = delete_one( + shard_zero.0, + timeline_id, + shard_zero.1, + self.config.jwt_token.clone(), + ) .await?; - - // If any shards >0 haven't finished deletion yet, don't start deletion on shard zero - if statuses.iter().any(|s| s != &StatusCode::NOT_FOUND) { - return Ok(StatusCode::ACCEPTED); - } - - // Delete shard zero last: this is not strictly necessary, but since a caller's GET on a timeline will be routed - // to shard zero, it gives a more obvious behavior that a GET returns 404 once the deletion is done. - let shard_zero_status = delete_one( - shard_zero.0, - timeline_id, - shard_zero.1, - self.config.jwt_token.clone(), - ) - .await?; - - Ok(shard_zero_status) + Ok(shard_zero_status) + }).await? } /// When you need to send an HTTP request to the pageserver that holds shard0 of a tenant, this @@ -3346,6 +3634,7 @@ impl Service { is_pending_compute_notification: shard.pending_compute_notification, is_splitting: matches!(shard.splitting, SplitState::Splitting), scheduling_policy: *shard.get_scheduling_policy(), + preferred_az_id: shard.preferred_az().map(ToString::to_string), }) } @@ -3949,7 +4238,7 @@ impl Service { let policy = policy.unwrap(); let config = config.unwrap(); - Ok(ShardSplitAction::Split(ShardSplitParams { + Ok(ShardSplitAction::Split(Box::new(ShardSplitParams { old_shard_count, new_shard_count: ShardCount::new(split_req.new_shard_count), new_stripe_size: split_req.new_stripe_size, @@ -3957,13 +4246,13 @@ impl Service { policy, config, shard_ident, - })) + }))) } async fn do_tenant_shard_split( &self, tenant_id: TenantId, - params: ShardSplitParams, + params: Box, ) -> Result<(TenantShardSplitResponse, Vec), ApiError> { // FIXME: we have dropped self.inner lock, and not yet written anything to the database: another // request could occur here, deleting or mutating the tenant. 
begin_shard_split checks that the @@ -3979,7 +4268,7 @@ impl Service { policy, config, shard_ident, - } = params; + } = *params; // Drop any secondary locations: pageservers do not support splitting these, and in any case the // end-state for a split tenant will usually be to have secondary locations on different nodes. @@ -4049,9 +4338,10 @@ impl Service { config: serde_json::to_string(&config).unwrap(), splitting: SplitState::Splitting, - // Scheduling policies do not carry through to children + // Scheduling policies and preferred AZ do not carry through to children scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default()) .unwrap(), + preferred_az_id: None, }); } @@ -4171,6 +4461,47 @@ impl Service { let (response, child_locations, waiters) = self.tenant_shard_split_commit_inmem(tenant_id, new_shard_count, new_stripe_size); + // Now that we have scheduled the child shards, attempt to set their preferred AZ + // to that of the pageserver they've been attached on. + let preferred_azs = { + let locked = self.inner.read().unwrap(); + child_locations + .iter() + .filter_map(|(tid, node_id, _stripe_size)| { + let az_id = locked + .nodes + .get(node_id) + .map(|n| n.get_availability_zone_id().to_string())?; + + Some((*tid, az_id)) + }) + .collect::>() + }; + + let updated = self + .persistence + .set_tenant_shard_preferred_azs(preferred_azs) + .await + .map_err(|err| { + ApiError::InternalServerError(anyhow::anyhow!( + "Failed to persist preferred az ids: {err}" + )) + }); + + match updated { + Ok(updated) => { + let mut locked = self.inner.write().unwrap(); + for (tid, az_id) in updated { + if let Some(shard) = locked.tenants.get_mut(&tid) { + shard.set_preferred_az(az_id); + } + } + } + Err(err) => { + tracing::warn!("Failed to persist preferred AZs after split: {err}"); + } + } + // Send compute notifications for all the new shards let mut failed_notifications = Vec::new(); for (child_id, child_ps, stripe_size) in child_locations { @@ -4645,6 
+4976,45 @@ impl Service { )) } + pub(crate) async fn get_node_shards( + &self, + node_id: NodeId, + ) -> Result { + let locked = self.inner.read().unwrap(); + let mut shards = Vec::new(); + for (tid, tenant) in locked.tenants.iter() { + let is_intended_secondary = match ( + tenant.intent.get_attached() == &Some(node_id), + tenant.intent.get_secondary().contains(&node_id), + ) { + (true, true) => { + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "{} attached as primary+secondary on the same node", + tid + ))) + } + (true, false) => Some(false), + (false, true) => Some(true), + (false, false) => None, + }; + let is_observed_secondary = if let Some(ObservedStateLocation { conf: Some(conf) }) = + tenant.observed.locations.get(&node_id) + { + Some(conf.secondary_conf.is_some()) + } else { + None + }; + if is_intended_secondary.is_some() || is_observed_secondary.is_some() { + shards.push(NodeShard { + tenant_shard_id: *tid, + is_intended_secondary, + is_observed_secondary, + }); + } + } + Ok(NodeShardResponse { node_id, shards }) + } + pub(crate) async fn get_leader(&self) -> DatabaseResult> { self.persistence.get_leader().await } @@ -4660,29 +5030,48 @@ impl Service { ) .await; - { + enum RegistrationStatus { + Matched, + Mismatched, + New, + } + + let registration_status = { let locked = self.inner.read().unwrap(); if let Some(node) = locked.nodes.get(®ister_req.node_id) { - // Note that we do not do a total equality of the struct, because we don't require - // the availability/scheduling states to agree for a POST to be idempotent. if node.registration_match(®ister_req) { - tracing::info!( - "Node {} re-registered with matching address", - register_req.node_id - ); - return Ok(()); + RegistrationStatus::Matched } else { - // TODO: decide if we want to allow modifying node addresses without removing and re-adding - // the node. Safest/simplest thing is to refuse it, and usually we deploy with - // a fixed address through the lifetime of a node. 
- tracing::warn!( - "Node {} tried to register with different address", - register_req.node_id - ); - return Err(ApiError::Conflict( - "Node is already registered with different address".to_string(), - )); + RegistrationStatus::Mismatched } + } else { + RegistrationStatus::New + } + }; + + match registration_status { + RegistrationStatus::Matched => { + tracing::info!( + "Node {} re-registered with matching address", + register_req.node_id + ); + + return Ok(()); + } + RegistrationStatus::Mismatched => { + // TODO: decide if we want to allow modifying node addresses without removing and re-adding + // the node. Safest/simplest thing is to refuse it, and usually we deploy with + // a fixed address through the lifetime of a node. + tracing::warn!( + "Node {} tried to register with different address", + register_req.node_id + ); + return Err(ApiError::Conflict( + "Node is already registered with different address".to_string(), + )); + } + RegistrationStatus::New => { + // fallthrough } } @@ -4719,6 +5108,7 @@ impl Service { register_req.listen_http_port, register_req.listen_pg_addr, register_req.listen_pg_port, + register_req.availability_zone_id, ); // TODO: idempotency if the node already exists in the database @@ -4761,7 +5151,7 @@ impl Service { // // The transition we calculate here remains valid later in the function because we hold the op lock on the node: // nothing else can mutate its availability while we run. 
- let availability_transition = if let Some(input_availability) = availability { + let availability_transition = if let Some(input_availability) = availability.as_ref() { let (activate_node, availability_transition) = { let locked = self.inner.read().unwrap(); let Some(node) = locked.nodes.get(&node_id) else { @@ -4797,8 +5187,8 @@ impl Service { )); }; - if let Some(availability) = &availability { - node.set_availability(*availability); + if let Some(availability) = availability.as_ref() { + node.set_availability(availability.clone()); } if let Some(scheduling) = scheduling { @@ -5200,72 +5590,6 @@ impl Service { )) } - /// Helper for methods that will try and call pageserver APIs for - /// a tenant, such as timeline CRUD: they cannot proceed unless the tenant - /// is attached somewhere. - fn ensure_attached_schedule( - &self, - mut locked: std::sync::RwLockWriteGuard<'_, ServiceState>, - tenant_id: TenantId, - ) -> Result, anyhow::Error> { - let mut waiters = Vec::new(); - let (nodes, tenants, scheduler) = locked.parts_mut(); - - let mut schedule_context = ScheduleContext::default(); - for (tenant_shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) { - shard.schedule(scheduler, &mut schedule_context)?; - - // The shard's policies may not result in an attached location being scheduled: this - // is an error because our caller needs it attached somewhere. - if shard.intent.get_attached().is_none() { - return Err(anyhow::anyhow!( - "Tenant {tenant_id} not scheduled to be attached" - )); - }; - - if shard.stably_attached().is_some() { - // We do not require the shard to be totally up to date on reconciliation: we just require - // that it has been attached on the intended node. Other dirty state such as unattached secondary - // locations, or compute hook notifications can be ignored. 
- continue; - } - - if let Some(waiter) = self.maybe_reconcile_shard(shard, nodes) { - tracing::info!("Waiting for shard {tenant_shard_id} to reconcile, in order to ensure it is attached"); - waiters.push(waiter); - } - } - Ok(waiters) - } - - async fn ensure_attached_wait(&self, tenant_id: TenantId) -> Result<(), ApiError> { - let ensure_waiters = { - let locked = self.inner.write().unwrap(); - - // Check if the tenant is splitting: in this case, even if it is attached, - // we must act as if it is not: this blocks e.g. timeline creation/deletion - // operations during the split. - for (_shard_id, shard) in locked.tenants.range(TenantShardId::tenant_range(tenant_id)) { - if !matches!(shard.splitting, SplitState::Idle) { - return Err(ApiError::ResourceUnavailable( - "Tenant shards are currently splitting".into(), - )); - } - } - - self.ensure_attached_schedule(locked, tenant_id) - .map_err(ApiError::InternalServerError)? - }; - - let deadline = Instant::now().checked_add(Duration::from_secs(5)).unwrap(); - for waiter in ensure_waiters { - let timeout = deadline.duration_since(Instant::now()); - waiter.wait_timeout(timeout).await?; - } - - Ok(()) - } - /// Like [`Self::maybe_configured_reconcile_shard`], but uses the default reconciler /// configuration fn maybe_reconcile_shard( @@ -6142,9 +6466,13 @@ impl Service { node_id: NodeId, cancel: CancellationToken, ) -> Result<(), OperationError> { - // TODO(vlad): Currently this operates on the assumption that all - // secondaries are warm. This is not always true (e.g. we just migrated the - // tenant). Take that into consideration by checking the secondary status. 
+ const SECONDARY_WARMUP_TIMEOUT: Duration = Duration::from_secs(20); + const SECONDARY_DOWNLOAD_REQUEST_TIMEOUT: Duration = Duration::from_secs(5); + let reconciler_config = ReconcilerConfigBuilder::new() + .secondary_warmup_timeout(SECONDARY_WARMUP_TIMEOUT) + .secondary_download_request_timeout(SECONDARY_DOWNLOAD_REQUEST_TIMEOUT) + .build(); + let mut tids_to_promote = self.fill_node_plan(node_id); let mut waiters = Vec::new(); @@ -6212,9 +6540,11 @@ impl Service { node_id ); - if let Some(waiter) = - self.maybe_reconcile_shard(tenant_shard, nodes) - { + if let Some(waiter) = self.maybe_configured_reconcile_shard( + tenant_shard, + nodes, + reconciler_config, + ) { waiters.push(waiter); } } @@ -6358,4 +6688,49 @@ impl Service { global_observed } + + pub(crate) async fn get_safekeeper( + &self, + id: i64, + ) -> Result { + self.persistence.safekeeper_get(id).await + } + + pub(crate) async fn upsert_safekeeper( + &self, + record: crate::persistence::SafekeeperPersistence, + ) -> Result<(), DatabaseError> { + self.persistence.safekeeper_upsert(record).await + } + + pub(crate) async fn update_shards_preferred_azs( + &self, + req: ShardsPreferredAzsRequest, + ) -> Result { + let preferred_azs = req.preferred_az_ids.into_iter().collect::>(); + let updated = self + .persistence + .set_tenant_shard_preferred_azs(preferred_azs) + .await + .map_err(|err| { + ApiError::InternalServerError(anyhow::anyhow!( + "Failed to persist preferred AZs: {err}" + )) + })?; + + let mut updated_in_mem_and_db = Vec::default(); + + let mut locked = self.inner.write().unwrap(); + for (tid, az_id) in updated { + let shard = locked.tenants.get_mut(&tid); + if let Some(shard) = shard { + shard.set_preferred_az(az_id); + updated_in_mem_and_db.push(tid); + } + } + + Ok(ShardsPreferredAzsResponse { + updated: updated_in_mem_and_db, + }) + } } diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index 1fcc3c8547..cdb0633e2b 100644 --- 
a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -140,6 +140,10 @@ pub(crate) struct TenantShard { // Support/debug tool: if something is going wrong or flapping with scheduling, this may // be set to a non-active state to avoid making changes while the issue is fixed. scheduling_policy: ShardSchedulingPolicy, + + // We should attempt to schedule this shard in the provided AZ to + // decrease chances of cross-AZ compute. + preferred_az_id: Option, } #[derive(Default, Clone, Debug, Serialize)] @@ -463,6 +467,7 @@ impl TenantShard { last_error: Arc::default(), pending_compute_notification: false, scheduling_policy: ShardSchedulingPolicy::default(), + preferred_az_id: None, } } @@ -779,7 +784,7 @@ impl TenantShard { #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))] pub(crate) fn optimize_secondary( &self, - scheduler: &Scheduler, + scheduler: &mut Scheduler, schedule_context: &ScheduleContext, ) -> Option { if self.intent.secondary.is_empty() { @@ -1297,6 +1302,7 @@ impl TenantShard { pending_compute_notification: false, delayed_reconcile: false, scheduling_policy: serde_json::from_str(&tsp.scheduling_policy).unwrap(), + preferred_az_id: tsp.preferred_az_id, }) } @@ -1312,8 +1318,17 @@ impl TenantShard { config: serde_json::to_string(&self.config).unwrap(), splitting: SplitState::default(), scheduling_policy: serde_json::to_string(&self.scheduling_policy).unwrap(), + preferred_az_id: self.preferred_az_id.clone(), } } + + pub(crate) fn preferred_az(&self) -> Option<&str> { + self.preferred_az_id.as_deref() + } + + pub(crate) fn set_preferred_az(&mut self, preferred_az_id: String) { + self.preferred_az_id = Some(preferred_az_id); + } } #[cfg(test)] @@ -1595,7 +1610,7 @@ pub(crate) mod tests { schedule_context.avoid(&shard_b.intent.all_pageservers()); schedule_context.push_attached(shard_b.intent.get_attached().unwrap()); - let optimization_a = 
shard_a.optimize_secondary(&scheduler, &schedule_context); + let optimization_a = shard_a.optimize_secondary(&mut scheduler, &schedule_context); // Since there is a node with no locations available, the node with two locations for the // same tenant should generate an optimization to move one away diff --git a/storage_scrubber/Cargo.toml b/storage_scrubber/Cargo.toml index d19119990b..f9987662b9 100644 --- a/storage_scrubber/Cargo.toml +++ b/storage_scrubber/Cargo.toml @@ -6,21 +6,13 @@ license.workspace = true [dependencies] aws-sdk-s3.workspace = true -aws-smithy-async.workspace = true either.workspace = true -tokio-rustls.workspace = true anyhow.workspace = true git-version.workspace = true hex.workspace = true humantime.workspace = true -thiserror.workspace = true -rand.workspace = true -bytes.workspace = true -bincode.workspace = true -crc32c.workspace = true serde.workspace = true serde_json.workspace = true -serde_with.workspace = true workspace_hack.workspace = true utils.workspace = true async-stream.workspace = true diff --git a/storage_scrubber/README.md b/storage_scrubber/README.md index 9fbd92feef..5be8541419 100644 --- a/storage_scrubber/README.md +++ b/storage_scrubber/README.md @@ -98,7 +98,7 @@ to list timelines and find their backup and start LSNs. If S3 state is altered first manually, pageserver in-memory state will contain wrong data about S3 state, and tenants/timelines may get recreated on S3 (due to any layer upload due to compaction, pageserver restart, etc.). So before proceeding, for tenants/timelines which are already deleted in the console, we must remove these from pageservers. -First, we need to group pageservers by buckets, `https:///admin/pageservers`` can be used for all env nodes, then `cat /storage/pageserver/data/pageserver.toml` on every node will show the bucket names and regions needed. 
+First, we need to group pageservers by buckets, `https:///admin/pageservers` can be used for all env nodes, then `cat /storage/pageserver/data/pageserver.toml` on every node will show the bucket names and regions needed. Per bucket, for every pageserver id related, find deleted tenants: diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs index b35838bcf7..15dfb101b5 100644 --- a/storage_scrubber/src/checks.rs +++ b/storage_scrubber/src/checks.rs @@ -1,6 +1,7 @@ -use std::collections::{HashMap, HashSet}; +use std::collections::{BTreeSet, HashMap, HashSet}; use anyhow::Context; +use itertools::Itertools; use pageserver::tenant::layer_map::LayerMap; use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; use pageserver_api::shard::ShardIndex; @@ -47,6 +48,56 @@ impl TimelineAnalysis { } } +/// Checks whether a layer map is valid (i.e., is a valid result of the current compaction algorithm if nothing goes wrong). +/// The function checks if we can split the LSN range of a delta layer only at the LSNs of the delta layers. For example, +/// +/// ```plain +/// | | | | +/// | 1 | | 2 | | 3 | +/// | | | | | | +/// ``` +/// +/// This is not a valid layer map because the LSN range of layer 1 intersects with the LSN range of layer 2. 1 and 2 should have +/// the same LSN range. +/// +/// The exception is that when layer 2 only contains a single key, it could be split over the LSN range. For example, +/// +/// ```plain +/// | | | 2 | | | +/// | 1 | |-------| | 3 | +/// | | | 4 | | | +/// +/// If layer 2 and 4 contain the same single key, this is also a valid layer map. +fn check_valid_layermap(metadata: &HashMap) -> Option { + let mut lsn_split_point = BTreeSet::new(); // TODO: use a better data structure (range tree / range set?) 
+ let mut all_delta_layers = Vec::new(); + for (name, _) in metadata.iter() { + if let LayerName::Delta(layer) = name { + if layer.key_range.start.next() != layer.key_range.end { + all_delta_layers.push(layer.clone()); + } + } + } + for layer in &all_delta_layers { + let lsn_range = &layer.lsn_range; + lsn_split_point.insert(lsn_range.start); + lsn_split_point.insert(lsn_range.end); + } + for layer in &all_delta_layers { + let lsn_range = layer.lsn_range.clone(); + let intersects = lsn_split_point.range(lsn_range).collect_vec(); + if intersects.len() > 1 { + let err = format!( + "layer violates the layer map LSN split assumption: layer {} intersects with LSN [{}]", + layer, + intersects.into_iter().map(|lsn| lsn.to_string()).join(", ") + ); + return Some(err); + } + } + None +} + pub(crate) async fn branch_cleanup_and_check_errors( remote_client: &GenericRemoteStorage, id: &TenantShardTimelineId, @@ -126,6 +177,12 @@ pub(crate) async fn branch_cleanup_and_check_errors( } } + if let Some(err) = check_valid_layermap(&index_part.layer_metadata) { + result.errors.push(format!( + "index_part.json contains invalid layer map structure: {err}" + )); + } + for (layer, metadata) in index_part.layer_metadata { if metadata.file_size == 0 { result.errors.push(format!( @@ -150,7 +207,7 @@ pub(crate) async fn branch_cleanup_and_check_errors( if response.is_err() { // Object is not present. - let is_l0 = LayerMap::is_l0(layer.key_range()); + let is_l0 = LayerMap::is_l0(layer.key_range(), layer.is_delta()); let msg = format!( "index_part.json contains a layer {}{} (shard {}) that is not present in remote storage (layer_is_l0: {})", diff --git a/storage_scrubber/src/garbage.rs b/storage_scrubber/src/garbage.rs index 3e22960f8d..d53611ed6e 100644 --- a/storage_scrubber/src/garbage.rs +++ b/storage_scrubber/src/garbage.rs @@ -1,6 +1,7 @@ -//! Functionality for finding and purging garbage, as in "garbage collection". Garbage means -//! 
S3 objects which are either not referenced by any metadata, or are referenced by a -//! control plane tenant/timeline in a deleted state. +//! Functionality for finding and purging garbage, as in "garbage collection". +//! +//! Garbage means S3 objects which are either not referenced by any metadata, +//! or are referenced by a control plane tenant/timeline in a deleted state. use std::{ collections::{HashMap, HashSet}, diff --git a/storage_scrubber/src/lib.rs b/storage_scrubber/src/lib.rs index 112f052e07..3f08cddf50 100644 --- a/storage_scrubber/src/lib.rs +++ b/storage_scrubber/src/lib.rs @@ -36,7 +36,7 @@ use serde::{Deserialize, Serialize}; use storage_controller_client::control_api; use tokio::io::AsyncReadExt; use tokio_util::sync::CancellationToken; -use tracing::error; +use tracing::{error, warn}; use tracing_appender::non_blocking::WorkerGuard; use tracing_subscriber::{fmt, prelude::*, EnvFilter}; use utils::fs_ext; @@ -422,7 +422,7 @@ fn stream_objects_with_retries<'a>( let yield_err = if err.is_permanent() { true } else { - let backoff_time = 1 << trial.max(5); + let backoff_time = 1 << trial.min(5); tokio::time::sleep(Duration::from_secs(backoff_time)).await; trial += 1; trial == MAX_RETRIES - 1 @@ -466,14 +466,14 @@ async fn list_objects_with_retries( return Err(e) .with_context(|| format!("Failed to list objects {MAX_RETRIES} times")); } - error!( + warn!( "list_objects_v2 query failed: bucket_name={}, prefix={}, delimiter={}, error={}", s3_target.bucket_name, s3_target.prefix_in_bucket, s3_target.delimiter, DisplayErrorContext(e), ); - let backoff_time = 1 << trial.max(5); + let backoff_time = 1 << trial.min(5); tokio::time::sleep(Duration::from_secs(backoff_time)).await; } } @@ -492,7 +492,7 @@ async fn download_object_with_retries( Ok(response) => response, Err(e) => { error!("Failed to download object for key {key}: {e}"); - let backoff_time = 1 << trial.max(5); + let backoff_time = 1 << trial.min(5); 
tokio::time::sleep(Duration::from_secs(backoff_time)).await; continue; } @@ -508,7 +508,7 @@ async fn download_object_with_retries( } Err(e) => { error!("Failed to stream object body for key {key}: {e}"); - let backoff_time = 1 << trial.max(5); + let backoff_time = 1 << trial.min(5); tokio::time::sleep(Duration::from_secs(backoff_time)).await; } } diff --git a/storage_scrubber/src/main.rs b/storage_scrubber/src/main.rs index 3935e513e3..ee133e2e58 100644 --- a/storage_scrubber/src/main.rs +++ b/storage_scrubber/src/main.rs @@ -1,4 +1,4 @@ -use anyhow::{anyhow, bail}; +use anyhow::{anyhow, bail, Context}; use camino::Utf8PathBuf; use pageserver_api::controller_api::{MetadataHealthUpdateRequest, MetadataHealthUpdateResponse}; use pageserver_api::shard::TenantShardId; @@ -7,6 +7,7 @@ use storage_controller_client::control_api; use storage_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode}; use storage_scrubber::pageserver_physical_gc::GcMode; use storage_scrubber::scan_pageserver_metadata::scan_pageserver_metadata; +use storage_scrubber::scan_safekeeper_metadata::DatabaseOrList; use storage_scrubber::tenant_snapshot::SnapshotDownloader; use storage_scrubber::{find_large_objects, ControllerClientConfig}; use storage_scrubber::{ @@ -76,6 +77,9 @@ enum Command { /// For safekeeper node_kind only, table in the db with debug dump #[arg(long, default_value = None)] dump_db_table: Option, + /// For safekeeper node_kind only, json list of timelines and their lsn info + #[arg(long, default_value = None)] + timeline_lsns: Option, }, TenantSnapshot { #[arg(long = "tenant-id")] @@ -117,8 +121,6 @@ enum Command { async fn main() -> anyhow::Result<()> { let cli = Cli::parse(); - tracing::info!("version: {}, build_tag {}", GIT_VERSION, BUILD_TAG); - let bucket_config = BucketConfig::from_env()?; let command_log_name = match &cli.command { @@ -138,6 +140,8 @@ async fn main() -> anyhow::Result<()> { chrono::Utc::now().format("%Y_%m_%d__%H_%M_%S") )); + 
tracing::info!("version: {}, build_tag {}", GIT_VERSION, BUILD_TAG); + let controller_client = cli.controller_api.map(|controller_api| { ControllerClientConfig { controller_api, @@ -155,20 +159,22 @@ async fn main() -> anyhow::Result<()> { post_to_storcon, dump_db_connstr, dump_db_table, + timeline_lsns, } => { if let NodeKind::Safekeeper = node_kind { - let dump_db_connstr = - dump_db_connstr.ok_or(anyhow::anyhow!("dump_db_connstr not specified"))?; - let dump_db_table = - dump_db_table.ok_or(anyhow::anyhow!("dump_db_table not specified"))?; - - let summary = scan_safekeeper_metadata( - bucket_config.clone(), - tenant_ids.iter().map(|tshid| tshid.tenant_id).collect(), - dump_db_connstr, - dump_db_table, - ) - .await?; + let db_or_list = match (timeline_lsns, dump_db_connstr) { + (Some(timeline_lsns), _) => { + let timeline_lsns = serde_json::from_str(&timeline_lsns).context("parsing timeline_lsns")?; + DatabaseOrList::List(timeline_lsns) + } + (None, Some(dump_db_connstr)) => { + let dump_db_table = dump_db_table.ok_or_else(|| anyhow::anyhow!("dump_db_table not specified"))?; + let tenant_ids = tenant_ids.iter().map(|tshid| tshid.tenant_id).collect(); + DatabaseOrList::Database { tenant_ids, connstr: dump_db_connstr, table: dump_db_table } + } + (None, None) => anyhow::bail!("neither `timeline_lsns` specified, nor `dump_db_connstr` and `dump_db_table`"), + }; + let summary = scan_safekeeper_metadata(bucket_config.clone(), db_or_list).await?; if json { println!("{}", serde_json::to_string(&summary).unwrap()) } else { diff --git a/storage_scrubber/src/metadata_stream.rs b/storage_scrubber/src/metadata_stream.rs index 10d77937f1..f896cff2d5 100644 --- a/storage_scrubber/src/metadata_stream.rs +++ b/storage_scrubber/src/metadata_stream.rs @@ -74,7 +74,9 @@ pub async fn stream_tenant_shards<'a>( } /// Given a `TenantShardId`, output a stream of the timelines within that tenant, discovered -/// using a listing. 
The listing is done before the stream is built, so that this +/// using a listing. +/// +/// The listing is done before the stream is built, so that this /// function can be used to generate concurrency on a stream using buffer_unordered. pub async fn stream_tenant_timelines<'a>( remote_client: &'a GenericRemoteStorage, diff --git a/storage_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs index 88681e38c2..c96d9cad3b 100644 --- a/storage_scrubber/src/pageserver_physical_gc.rs +++ b/storage_scrubber/src/pageserver_physical_gc.rs @@ -440,9 +440,10 @@ async fn gc_ancestor( Ok(()) } -/// Physical garbage collection: removing unused S3 objects. This is distinct from the garbage collection -/// done inside the pageserver, which operates at a higher level (keys, layers). This type of garbage collection -/// is about removing: +/// Physical garbage collection: removing unused S3 objects. +/// +/// This is distinct from the garbage collection done inside the pageserver, which operates at a higher level +/// (keys, layers). This type of garbage collection is about removing: /// - Objects that were uploaded but never referenced in the remote index (e.g. 
because of a shutdown between /// uploading a layer and uploading an index) /// - Index objects from historic generations diff --git a/storage_scrubber/src/scan_safekeeper_metadata.rs b/storage_scrubber/src/scan_safekeeper_metadata.rs index 1a9f3d0ef5..15f3665fac 100644 --- a/storage_scrubber/src/scan_safekeeper_metadata.rs +++ b/storage_scrubber/src/scan_safekeeper_metadata.rs @@ -7,7 +7,7 @@ use postgres_ffi::{XLogFileName, PG_TLI}; use remote_storage::GenericRemoteStorage; use serde::Serialize; use tokio_postgres::types::PgLsn; -use tracing::{error, info, trace}; +use tracing::{debug, error, info}; use utils::{ id::{TenantId, TenantTimelineId, TimelineId}, lsn::Lsn, @@ -54,6 +54,23 @@ impl MetadataSummary { } } +#[derive(serde::Deserialize)] +pub struct TimelineLsnData { + tenant_id: String, + timeline_id: String, + timeline_start_lsn: Lsn, + backup_lsn: Lsn, +} + +pub enum DatabaseOrList { + Database { + tenant_ids: Vec, + connstr: String, + table: String, + }, + List(Vec), +} + /// Scan the safekeeper metadata in an S3 bucket, reporting errors and /// statistics. /// @@ -63,68 +80,39 @@ impl MetadataSummary { /// the project wasn't deleted in the meanwhile. 
pub async fn scan_safekeeper_metadata( bucket_config: BucketConfig, - tenant_ids: Vec, - dump_db_connstr: String, - dump_db_table: String, + db_or_list: DatabaseOrList, ) -> anyhow::Result { info!( - "checking bucket {}, region {}, dump_db_table {}", - bucket_config.bucket, bucket_config.region, dump_db_table + "checking bucket {}, region {}", + bucket_config.bucket, bucket_config.region ); - // Use rustls (Neon requires TLS) - let root_store = TLS_ROOTS.get_or_try_init(load_certs)?.clone(); - let client_config = rustls::ClientConfig::builder() - .with_root_certificates(root_store) - .with_no_client_auth(); - let tls_connector = tokio_postgres_rustls::MakeRustlsConnect::new(client_config); - let (client, connection) = tokio_postgres::connect(&dump_db_connstr, tls_connector).await?; - // The connection object performs the actual communication with the database, - // so spawn it off to run on its own. - tokio::spawn(async move { - if let Err(e) = connection.await { - eprintln!("connection error: {}", e); - } - }); - - let tenant_filter_clause = if !tenant_ids.is_empty() { - format!( - "and tenant_id in ({})", - tenant_ids - .iter() - .map(|t| format!("'{}'", t)) - .collect::>() - .join(", ") - ) - } else { - "".to_owned() - }; - let query = format!( - "select tenant_id, timeline_id, min(timeline_start_lsn), max(backup_lsn) from \"{}\" where not is_cancelled {} group by tenant_id, timeline_id;", - dump_db_table, tenant_filter_clause, - ); - info!("query is {}", query); - let timelines = client.query(&query, &[]).await?; - info!("loaded {} timelines", timelines.len()); let (remote_client, target) = init_remote(bucket_config, NodeKind::Safekeeper).await?; let console_config = ConsoleConfig::from_env()?; let cloud_admin_api_client = CloudAdminApiClient::new(console_config); - let checks = futures::stream::iter(timelines.iter().map(Ok)).map_ok(|row| { - let tenant_id = TenantId::from_str(row.get(0)).expect("failed to parse tenant_id"); - let timeline_id = 
TimelineId::from_str(row.get(1)).expect("failed to parse tenant_id"); - let timeline_start_lsn_pg: PgLsn = row.get(2); - let timeline_start_lsn: Lsn = Lsn(u64::from(timeline_start_lsn_pg)); - let backup_lsn_pg: PgLsn = row.get(3); - let backup_lsn: Lsn = Lsn(u64::from(backup_lsn_pg)); + let timelines = match db_or_list { + DatabaseOrList::Database { + tenant_ids, + connstr, + table, + } => load_timelines_from_db(tenant_ids, connstr, table).await?, + DatabaseOrList::List(list) => list, + }; + info!("loaded {} timelines", timelines.len()); + + let checks = futures::stream::iter(timelines.into_iter().map(Ok)).map_ok(|timeline| { + let tenant_id = TenantId::from_str(&timeline.tenant_id).expect("failed to parse tenant_id"); + let timeline_id = + TimelineId::from_str(&timeline.timeline_id).expect("failed to parse tenant_id"); let ttid = TenantTimelineId::new(tenant_id, timeline_id); check_timeline( &remote_client, &target, &cloud_admin_api_client, ttid, - timeline_start_lsn, - backup_lsn, + timeline.timeline_start_lsn, + timeline.backup_lsn, ) }); // Run multiple check_timeline's concurrently. 
@@ -163,11 +151,9 @@ async fn check_timeline( timeline_start_lsn: Lsn, backup_lsn: Lsn, ) -> anyhow::Result { - trace!( + debug!( "checking ttid {}, should contain WAL [{}-{}]", - ttid, - timeline_start_lsn, - backup_lsn + ttid, timeline_start_lsn, backup_lsn ); // calculate expected segfiles let expected_first_segno = timeline_start_lsn.segment_number(WAL_SEGSIZE); @@ -177,7 +163,7 @@ async fn check_timeline( .map(|segno| XLogFileName(PG_TLI, segno, WAL_SEGSIZE)), ); let expected_files_num = expected_segfiles.len(); - trace!("expecting {} files", expected_segfiles.len(),); + debug!("expecting {} files", expected_segfiles.len(),); // now list s3 and check if it misses something let ttshid = @@ -252,3 +238,65 @@ fn load_certs() -> Result, std::io::Error> { Ok(Arc::new(store)) } static TLS_ROOTS: OnceCell> = OnceCell::new(); + +async fn load_timelines_from_db( + tenant_ids: Vec, + dump_db_connstr: String, + dump_db_table: String, +) -> anyhow::Result> { + info!("loading from table {dump_db_table}"); + + // Use rustls (Neon requires TLS) + let root_store = TLS_ROOTS.get_or_try_init(load_certs)?.clone(); + let client_config = rustls::ClientConfig::builder() + .with_root_certificates(root_store) + .with_no_client_auth(); + let tls_connector = tokio_postgres_rustls::MakeRustlsConnect::new(client_config); + let (client, connection) = tokio_postgres::connect(&dump_db_connstr, tls_connector).await?; + // The connection object performs the actual communication with the database, + // so spawn it off to run on its own. 
+ tokio::spawn(async move { + if let Err(e) = connection.await { + eprintln!("connection error: {}", e); + } + }); + + let tenant_filter_clause = if !tenant_ids.is_empty() { + format!( + "and tenant_id in ({})", + tenant_ids + .iter() + .map(|t| format!("'{}'", t)) + .collect::>() + .join(", ") + ) + } else { + "".to_owned() + }; + let query = format!( + "select tenant_id, timeline_id, min(timeline_start_lsn), max(backup_lsn) \ + from \"{dump_db_table}\" \ + where not is_cancelled {tenant_filter_clause} \ + group by tenant_id, timeline_id;" + ); + info!("query is {}", query); + let timelines = client.query(&query, &[]).await?; + + let timelines = timelines + .into_iter() + .map(|row| { + let tenant_id = row.get(0); + let timeline_id = row.get(1); + let timeline_start_lsn_pg: PgLsn = row.get(2); + let backup_lsn_pg: PgLsn = row.get(3); + + TimelineLsnData { + tenant_id, + timeline_id, + timeline_start_lsn: Lsn(u64::from(timeline_start_lsn_pg)), + backup_lsn: Lsn(u64::from(backup_lsn_pg)), + } + }) + .collect::>(); + Ok(timelines) +} diff --git a/test_runner/README.md b/test_runner/README.md index 73aa29d4bb..d754e60d17 100644 --- a/test_runner/README.md +++ b/test_runner/README.md @@ -18,8 +18,7 @@ Prerequisites: Regression tests are in the 'regress' directory. They can be run in parallel to minimize total runtime. Most regression test sets up their -environment with its own pageservers and safekeepers (but see -`TEST_SHARED_FIXTURES`). +environment with its own pageservers and safekeepers. 'pg_clients' contains tests for connecting with various client libraries. Each client test uses a Dockerfile that pulls an image that @@ -74,7 +73,6 @@ This is used to construct full path to the postgres binaries. Format is 2-digit major version nubmer, i.e. `DEFAULT_PG_VERSION=16` `TEST_OUTPUT`: Set the directory where test state and test output files should go. -`TEST_SHARED_FIXTURES`: Try to re-use a single pageserver for all the tests. 
`RUST_LOG`: logging configuration to pass into Neon CLI Useful parameters and commands: @@ -259,11 +257,8 @@ compute Postgres nodes. The connections between them can be configured to use JW authentication tokens, and some other configuration options can be tweaked too. The easiest way to get access to a Neon Environment is by using the `neon_simple_env` -fixture. The 'simple' env may be shared across multiple tests, so don't shut down the nodes -or make other destructive changes in that environment. Also don't assume that -there are no tenants or branches or data in the cluster. For convenience, there is a -branch called `empty`, though. The convention is to create a test-specific branch of -that and load any test data there, instead of the 'main' branch. +fixture. For convenience, there is a branch called `main` in environments created with +'neon_simple_env', ready to be used in the test. For more complicated cases, you can build a custom Neon Environment, with the `neon_env` fixture: diff --git a/test_runner/fixtures/auth_tokens.py b/test_runner/fixtures/auth_tokens.py new file mode 100644 index 0000000000..8ebaf61e5e --- /dev/null +++ b/test_runner/fixtures/auth_tokens.py @@ -0,0 +1,47 @@ +from __future__ import annotations + +from dataclasses import dataclass +from enum import Enum +from typing import Any + +import jwt + +from fixtures.common_types import TenantId + + +@dataclass +class AuthKeys: + priv: str + + def generate_token(self, *, scope: TokenScope, **token_data: Any) -> str: + token_data = {key: str(val) for key, val in token_data.items()} + token = jwt.encode({"scope": scope, **token_data}, self.priv, algorithm="EdDSA") + # cast(Any, self.priv) + + # jwt.encode can return 'bytes' or 'str', depending on Python version or type + # hinting or something (not sure what). If it returned 'bytes', convert it to 'str' + # explicitly. 
+ if isinstance(token, bytes): + token = token.decode() + + return token + + def generate_pageserver_token(self) -> str: + return self.generate_token(scope=TokenScope.PAGE_SERVER_API) + + def generate_safekeeper_token(self) -> str: + return self.generate_token(scope=TokenScope.SAFEKEEPER_DATA) + + # generate token giving access to only one tenant + def generate_tenant_token(self, tenant_id: TenantId) -> str: + return self.generate_token(scope=TokenScope.TENANT, tenant_id=str(tenant_id)) + + +# TODO: Replace with `StrEnum` when we upgrade to python 3.11 +class TokenScope(str, Enum): + ADMIN = "admin" + PAGE_SERVER_API = "pageserverapi" + GENERATIONS_API = "generations_api" + SAFEKEEPER_DATA = "safekeeperdata" + TENANT = "tenant" + SCRUBBER = "scrubber" diff --git a/test_runner/fixtures/broker.py b/test_runner/fixtures/broker.py deleted file mode 100644 index 8aca90a097..0000000000 --- a/test_runner/fixtures/broker.py +++ /dev/null @@ -1,63 +0,0 @@ -import subprocess -import time -from dataclasses import dataclass -from pathlib import Path -from typing import Any, Optional - -from fixtures.log_helper import log - - -@dataclass -class NeonBroker: - """An object managing storage_broker instance""" - - logfile: Path - port: int - neon_binpath: Path - handle: Optional[subprocess.Popen[Any]] = None # handle of running daemon - - def listen_addr(self): - return f"127.0.0.1:{self.port}" - - def client_url(self): - return f"http://{self.listen_addr()}" - - def check_status(self): - return True # TODO - - def try_start(self): - if self.handle is not None: - log.debug(f"storage_broker is already running on port {self.port}") - return - - listen_addr = self.listen_addr() - log.info(f'starting storage_broker to listen incoming connections at "{listen_addr}"') - with open(self.logfile, "wb") as logfile: - args = [ - str(self.neon_binpath / "storage_broker"), - f"--listen-addr={listen_addr}", - ] - self.handle = subprocess.Popen(args, stdout=logfile, stderr=logfile) - - # wait for 
start - started_at = time.time() - while True: - try: - self.check_status() - except Exception as e: - elapsed = time.time() - started_at - if elapsed > 5: - raise RuntimeError( - f"timed out waiting {elapsed:.0f}s for storage_broker start: {e}" - ) from e - time.sleep(0.5) - else: - break # success - - def stop(self, immediate: bool = False): - if self.handle is not None: - if immediate: - self.handle.kill() - else: - self.handle.terminate() - self.handle.wait() diff --git a/test_runner/fixtures/common_types.py b/test_runner/fixtures/common_types.py index 7cadcbb4c2..d8390138c9 100644 --- a/test_runner/fixtures/common_types.py +++ b/test_runner/fixtures/common_types.py @@ -1,5 +1,6 @@ import random from dataclasses import dataclass +from enum import Enum from functools import total_ordering from typing import Any, Dict, Type, TypeVar, Union @@ -12,7 +13,7 @@ DEFAULT_WAL_SEG_SIZE = 16 * 1024 * 1024 class Lsn: """ Datatype for an LSN. Internally it is a 64-bit integer, but the string - representation is like "1/123abcd". See also pg_lsn datatype in Postgres + representation is like "1/0123abcd". 
See also pg_lsn datatype in Postgres """ def __init__(self, x: Union[int, str]): @@ -139,6 +140,14 @@ class TenantId(Id): return self.id.hex() +class NodeId(Id): + def __repr__(self) -> str: + return f'`NodeId("{self.id.hex()}")' + + def __str__(self) -> str: + return self.id.hex() + + class TimelineId(Id): def __repr__(self) -> str: return f'TimelineId("{self.id.hex()}")' @@ -213,3 +222,9 @@ class TenantShardId: def __hash__(self) -> int: return hash(self._tuple()) + + +# TODO: Replace with `StrEnum` when we upgrade to python 3.11 +class TimelineArchivalState(str, Enum): + ARCHIVED = "Archived" + UNARCHIVED = "Unarchived" diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py index 5fe544b3bd..770b32b11e 100644 --- a/test_runner/fixtures/compare_fixtures.py +++ b/test_runner/fixtures/compare_fixtures.py @@ -1,13 +1,17 @@ +import os +import time from abc import ABC, abstractmethod from contextlib import _GeneratorContextManager, contextmanager # Type-related stuff +from pathlib import Path from typing import Dict, Iterator, List import pytest from _pytest.fixtures import FixtureRequest from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker +from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, PgBin, @@ -102,7 +106,6 @@ class NeonCompare(PgCompare): zenbenchmark: NeonBenchmarker, neon_simple_env: NeonEnv, pg_bin: PgBin, - branch_name: str, ): self.env = neon_simple_env self._zenbenchmark = zenbenchmark @@ -110,16 +113,11 @@ class NeonCompare(PgCompare): self.pageserver_http_client = self.env.pageserver.http_client() # note that neon_simple_env now uses LOCAL_FS remote storage - - # Create tenant - tenant_conf: Dict[str, str] = {} - self.tenant, _ = self.env.neon_cli.create_tenant(conf=tenant_conf) - - # Create timeline - self.timeline = self.env.neon_cli.create_timeline(branch_name, tenant_id=self.tenant) + self.tenant = self.env.initial_tenant + self.timeline = 
self.env.initial_timeline # Start pg - self._pg = self.env.endpoints.create_start(branch_name, "main", self.tenant) + self._pg = self.env.endpoints.create_start("main", "main", self.tenant) @property def pg(self) -> PgProtocol: @@ -232,11 +230,11 @@ class VanillaCompare(PgCompare): pass # TODO find something def report_size(self): - data_size = self.pg.get_subdir_size("base") + data_size = self.pg.get_subdir_size(Path("base")) self.zenbenchmark.record( "data_size", data_size / (1024 * 1024), "MB", report=MetricReport.LOWER_IS_BETTER ) - wal_size = self.pg.get_subdir_size("pg_wal") + wal_size = self.pg.get_subdir_size(Path("pg_wal")) self.zenbenchmark.record( "wal_size", wal_size / (1024 * 1024), "MB", report=MetricReport.LOWER_IS_BETTER ) @@ -297,13 +295,11 @@ class RemoteCompare(PgCompare): @pytest.fixture(scope="function") def neon_compare( - request: FixtureRequest, zenbenchmark: NeonBenchmarker, pg_bin: PgBin, neon_simple_env: NeonEnv, ) -> NeonCompare: - branch_name = request.node.name - return NeonCompare(zenbenchmark, neon_simple_env, pg_bin, branch_name) + return NeonCompare(zenbenchmark, neon_simple_env, pg_bin) @pytest.fixture(scope="function") @@ -341,3 +337,26 @@ def neon_with_baseline(request: FixtureRequest) -> PgCompare: fixture = request.getfixturevalue(request.param) assert isinstance(fixture, PgCompare), f"test error: fixture {fixture} is not PgCompare" return fixture + + +@pytest.fixture(scope="function", autouse=True) +def sync_after_each_test(): + # The fixture calls `sync(2)` after each test if `SYNC_AFTER_EACH_TEST` env var is `true` + # + # In CI, `SYNC_AFTER_EACH_TEST` is set to `true` only for benchmarks (`test_runner/performance`) + # that are run on self-hosted runners because some of these tests are pretty write-heavy + # and create issues to start the processes within 10s + key = "SYNC_AFTER_EACH_TEST" + enabled = os.environ.get(key) == "true" + + yield + + if not enabled: + # regress test, or running locally + return + + start = 
time.time() + # we only run benches on unices, the method might not exist on windows + os.sync() + elapsed = time.time() - start + log.info(f"called sync after test {elapsed=}") diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 2bb698f175..cbbb162cc6 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -24,13 +24,25 @@ from functools import cached_property, partial from itertools import chain, product from pathlib import Path from types import TracebackType -from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Tuple, Type, Union, cast +from typing import ( + Any, + Callable, + Dict, + Iterable, + Iterator, + List, + Optional, + Tuple, + Type, + TypeVar, + Union, + cast, +) from urllib.parse import quote, urlparse import asyncpg import backoff import httpx -import jwt import psycopg2 import psycopg2.sql import pytest @@ -44,12 +56,11 @@ from _pytest.fixtures import FixtureRequest from psycopg2.extensions import connection as PgConnection from psycopg2.extensions import cursor as PgCursor from psycopg2.extensions import make_dsn, parse_dsn -from typing_extensions import Literal from urllib3.util.retry import Retry from fixtures import overlayfs -from fixtures.broker import NeonBroker -from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId +from fixtures.auth_tokens import AuthKeys, TokenScope +from fixtures.common_types import Lsn, NodeId, TenantId, TenantShardId, TimelineId from fixtures.endpoint.http import EndpointHttpClient from fixtures.log_helper import log from fixtures.metrics import Metrics, MetricsGetter, parse_metrics @@ -81,6 +92,7 @@ from fixtures.utils import ( allure_add_grafana_links, allure_attach_from_dir, assert_no_errors, + get_dir_size, get_self_dir, print_gc_result, subprocess_capture, @@ -90,6 +102,8 @@ from fixtures.utils import AuxFileStore as AuxFileStore # reexport from .neon_api import NeonAPI, 
NeonApiEndpoint +T = TypeVar("T") + """ This file contains pytest fixtures. A fixture is a test resource that can be summoned by placing its name in the test's arguments. @@ -144,7 +158,7 @@ def neon_binpath(base_dir: Path, build_type: str) -> Iterator[Path]: yield binpath -@pytest.fixture(scope="function") +@pytest.fixture(scope="session") def pg_distrib_dir(base_dir: Path) -> Iterator[Path]: if env_postgres_bin := os.environ.get("POSTGRES_DISTRIB_DIR"): distrib_dir = Path(env_postgres_bin).resolve() @@ -168,25 +182,6 @@ def top_output_dir(base_dir: Path) -> Iterator[Path]: yield output_dir -@pytest.fixture(scope="function") -def versioned_pg_distrib_dir(pg_distrib_dir: Path, pg_version: PgVersion) -> Iterator[Path]: - versioned_dir = pg_distrib_dir / pg_version.v_prefixed - - psql_bin_path = versioned_dir / "bin/psql" - postgres_bin_path = versioned_dir / "bin/postgres" - - if os.getenv("REMOTE_ENV"): - # When testing against a remote server, we only need the client binary. - if not psql_bin_path.exists(): - raise Exception(f"psql not found at '{psql_bin_path}'") - else: - if not postgres_bin_path.exists(): - raise Exception(f"postgres not found at '{postgres_bin_path}'") - - log.info(f"versioned_pg_distrib_dir is {versioned_dir}") - yield versioned_dir - - @pytest.fixture(scope="session") def neon_api_key() -> str: api_key = os.getenv("NEON_API_KEY") @@ -206,33 +201,6 @@ def neon_api(neon_api_key: str, neon_api_base_url: str) -> NeonAPI: return NeonAPI(neon_api_key, neon_api_base_url) -def shareable_scope(fixture_name: str, config: Config) -> Literal["session", "function"]: - """Return either session of function scope, depending on TEST_SHARED_FIXTURES envvar. - - This function can be used as a scope like this: - @pytest.fixture(scope=shareable_scope) - def myfixture(...) - ... 
- """ - scope: Literal["session", "function"] - - if os.environ.get("TEST_SHARED_FIXTURES") is None: - # Create the environment in the per-test output directory - scope = "function" - elif ( - os.environ.get("BUILD_TYPE") is not None - and os.environ.get("DEFAULT_PG_VERSION") is not None - ): - scope = "session" - else: - pytest.fail( - "Shared environment(TEST_SHARED_FIXTURES) requires BUILD_TYPE and DEFAULT_PG_VERSION to be set", - pytrace=False, - ) - - return scope - - @pytest.fixture(scope="session") def worker_port_num(): return (32768 - BASE_PORT) // int(os.environ.get("PYTEST_XDIST_WORKER_COUNT", "1")) @@ -256,36 +224,11 @@ def worker_base_port(worker_seq_no: int, worker_port_num: int) -> int: return BASE_PORT + worker_seq_no * worker_port_num -def get_dir_size(path: str) -> int: - """Return size in bytes.""" - totalbytes = 0 - for root, _dirs, files in os.walk(path): - for name in files: - totalbytes += os.path.getsize(os.path.join(root, name)) - - return totalbytes - - @pytest.fixture(scope="session") def port_distributor(worker_base_port: int, worker_port_num: int) -> PortDistributor: return PortDistributor(base_port=worker_base_port, port_number=worker_port_num) -@pytest.fixture(scope="function") -def default_broker( - port_distributor: PortDistributor, - test_output_dir: Path, - neon_binpath: Path, -) -> Iterator[NeonBroker]: - # multiple pytest sessions could get launched in parallel, get them different ports/datadirs - client_port = port_distributor.get_port() - broker_logfile = test_output_dir / "repo" / "storage_broker.log" - - broker = NeonBroker(logfile=broker_logfile, port=client_port, neon_binpath=neon_binpath) - yield broker - broker.stop() - - @pytest.fixture(scope="session") def run_id() -> Iterator[uuid.UUID]: yield uuid.uuid4() @@ -414,44 +357,6 @@ class PgProtocol: return self.safe_psql(query, log_query=log_query)[0][0] -@dataclass -class AuthKeys: - priv: str - - def generate_token(self, *, scope: TokenScope, **token_data: Any) -> str: - 
token_data = {key: str(val) for key, val in token_data.items()} - token = jwt.encode({"scope": scope, **token_data}, self.priv, algorithm="EdDSA") - # cast(Any, self.priv) - - # jwt.encode can return 'bytes' or 'str', depending on Python version or type - # hinting or something (not sure what). If it returned 'bytes', convert it to 'str' - # explicitly. - if isinstance(token, bytes): - token = token.decode() - - return token - - def generate_pageserver_token(self) -> str: - return self.generate_token(scope=TokenScope.PAGE_SERVER_API) - - def generate_safekeeper_token(self) -> str: - return self.generate_token(scope=TokenScope.SAFEKEEPER_DATA) - - # generate token giving access to only one tenant - def generate_tenant_token(self, tenant_id: TenantId) -> str: - return self.generate_token(scope=TokenScope.TENANT, tenant_id=str(tenant_id)) - - -# TODO: Replace with `StrEnum` when we upgrade to python 3.11 -class TokenScope(str, Enum): - ADMIN = "admin" - PAGE_SERVER_API = "pageserverapi" - GENERATIONS_API = "generations_api" - SAFEKEEPER_DATA = "safekeeperdata" - TENANT = "tenant" - SCRUBBER = "scrubber" - - class NeonEnvBuilder: """ Builder object to create a Neon runtime environment @@ -466,7 +371,6 @@ class NeonEnvBuilder: self, repo_dir: Path, port_distributor: PortDistributor, - broker: NeonBroker, run_id: uuid.UUID, mock_s3_server: MockS3Server, neon_binpath: Path, @@ -496,6 +400,7 @@ class NeonEnvBuilder: pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]] = None, safekeeper_extra_opts: Optional[list[str]] = None, storage_controller_port_override: Optional[int] = None, + pageserver_io_buffer_alignment: Optional[int] = None, ): self.repo_dir = repo_dir self.rust_log_override = rust_log_override @@ -506,7 +411,6 @@ class NeonEnvBuilder: # Safekeepers remote storage self.safekeepers_remote_storage: Optional[RemoteStorage] = None - self.broker = broker self.run_id = run_id self.mock_s3_server: MockS3Server = mock_s3_server 
self.pageserver_config_override = pageserver_config_override @@ -550,6 +454,8 @@ class NeonEnvBuilder: self.storage_controller_port_override = storage_controller_port_override + self.pageserver_io_buffer_alignment = pageserver_io_buffer_alignment + assert test_name.startswith( "test_" ), "Unexpectedly instantiated from outside a test function" @@ -563,10 +469,6 @@ class NeonEnvBuilder: self.env = NeonEnv(self) return self.env - def start(self): - assert self.env is not None, "environment is not already initialized, call init() first" - self.env.start() - def init_start( self, initial_tenant_conf: Optional[Dict[str, Any]] = None, @@ -582,7 +484,7 @@ class NeonEnvBuilder: Configuring pageserver with remote storage is now the default. There will be a warning if pageserver is created without one. """ env = self.init_configs(default_remote_storage_if_missing=default_remote_storage_if_missing) - self.start() + env.start() # Prepare the default branch to start the postgres on later. # Pageserver itself does not create tenants and timelines, until started first and asked via HTTP API. 
@@ -740,6 +642,9 @@ class NeonEnvBuilder: patch_script = "" for ps in self.env.pageservers: patch_script += f"UPDATE nodes SET listen_http_port={ps.service_port.http}, listen_pg_port={ps.service_port.pg} WHERE node_id = '{ps.id}';" + # This is a temporary to get the backward compat test happy + # since the compat snapshot was generated with an older version of neon local + patch_script += f"UPDATE nodes SET availability_zone_id='{ps.az_id}' WHERE node_id = '{ps.id}' AND availability_zone_id IS NULL;" patch_script_path.write_text(patch_script) # Update the config with info about tenants and timelines @@ -944,8 +849,11 @@ class NeonEnvBuilder: for directory_to_clean in reversed(directories_to_clean): if not os.listdir(directory_to_clean): - log.debug(f"Removing empty directory {directory_to_clean}") - directory_to_clean.rmdir() + log.info(f"Removing empty directory {directory_to_clean}") + try: + directory_to_clean.rmdir() + except Exception as e: + log.error(f"Error removing empty directory {directory_to_clean}: {e}") def cleanup_remote_storage(self): for x in [self.pageserver_remote_storage, self.safekeepers_remote_storage]: @@ -1014,6 +922,8 @@ class NeonEnvBuilder: self.env.storage_controller.assert_no_errors() + self.env.broker.assert_no_errors() + try: self.overlay_cleanup_teardown() except Exception as e: @@ -1067,7 +977,7 @@ class NeonEnv: self.endpoints = EndpointFactory(self) self.safekeepers: List[Safekeeper] = [] self.pageservers: List[NeonPageserver] = [] - self.broker = config.broker + self.broker = NeonBroker(self) self.pageserver_remote_storage = config.pageserver_remote_storage self.safekeepers_remote_storage = config.safekeepers_remote_storage self.pg_version = config.pg_version @@ -1080,9 +990,6 @@ class NeonEnv: self.pg_distrib_dir = config.pg_distrib_dir self.endpoint_counter = 0 self.storage_controller_config = config.storage_controller_config - - # generate initial tenant ID here instead of letting 'neon init' generate it, - # so that we don't 
need to dig it out of the config file afterwards. self.initial_tenant = config.initial_tenant self.initial_timeline = config.initial_timeline @@ -1123,6 +1030,7 @@ class NeonEnv: self.pageserver_virtual_file_io_engine = config.pageserver_virtual_file_io_engine self.pageserver_aux_file_policy = config.pageserver_aux_file_policy + self.pageserver_io_buffer_alignment = config.pageserver_io_buffer_alignment # Create the neon_local's `NeonLocalInitConf` cfg: Dict[str, Any] = { @@ -1160,6 +1068,8 @@ class NeonEnv: "listen_http_addr": f"localhost:{pageserver_port.http}", "pg_auth_type": pg_auth_type, "http_auth_type": http_auth_type, + # Default which can be overriden with `NeonEnvBuilder.pageserver_config_override` + "availability_zone": "us-east-2a", } if self.pageserver_virtual_file_io_engine is not None: ps_cfg["virtual_file_io_engine"] = self.pageserver_virtual_file_io_engine @@ -1184,13 +1094,11 @@ class NeonEnv: for key, value in override.items(): ps_cfg[key] = value + ps_cfg["io_buffer_alignment"] = self.pageserver_io_buffer_alignment + # Create a corresponding NeonPageserver object self.pageservers.append( - NeonPageserver( - self, - ps_id, - port=pageserver_port, - ) + NeonPageserver(self, ps_id, port=pageserver_port, az_id=ps_cfg["availability_zone"]) ) cfg["pageservers"].append(ps_cfg) @@ -1244,7 +1152,7 @@ class NeonEnv: max_workers=2 + len(self.pageservers) + len(self.safekeepers) ) as executor: futs.append( - executor.submit(lambda: self.broker.try_start() or None) + executor.submit(lambda: self.broker.start() or None) ) # The `or None` is for the linter for pageserver in self.pageservers: @@ -1301,7 +1209,7 @@ class NeonEnv: pageserver.stop(immediate=immediate) except RuntimeError: stop_later.append(pageserver) - self.broker.stop(immediate=immediate) + self.broker.stop() # TODO: for nice logging we need python 3.11 ExceptionGroup for ps in stop_later: @@ -1409,13 +1317,12 @@ class NeonEnv: return "ep-" + str(self.endpoint_counter) 
-@pytest.fixture(scope=shareable_scope) -def _shared_simple_env( +@pytest.fixture(scope="function") +def neon_simple_env( request: FixtureRequest, pytestconfig: Config, port_distributor: PortDistributor, mock_s3_server: MockS3Server, - default_broker: NeonBroker, run_id: uuid.UUID, top_output_dir: Path, test_output_dir: Path, @@ -1425,27 +1332,21 @@ def _shared_simple_env( pageserver_virtual_file_io_engine: str, pageserver_aux_file_policy: Optional[AuxFileStore], pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]], + pageserver_io_buffer_alignment: Optional[int], ) -> Iterator[NeonEnv]: """ - # Internal fixture backing the `neon_simple_env` fixture. If TEST_SHARED_FIXTURES - is set, this is shared by all tests using `neon_simple_env`. + Simple Neon environment, with no authentication and no safekeepers. This fixture will use RemoteStorageKind.LOCAL_FS with pageserver. """ - if os.environ.get("TEST_SHARED_FIXTURES") is None: - # Create the environment in the per-test output directory - repo_dir = get_test_repo_dir(request, top_output_dir) - else: - # We're running shared fixtures. Share a single directory. 
- repo_dir = top_output_dir / "shared_repo" - shutil.rmtree(repo_dir, ignore_errors=True) + # Create the environment in the per-test output directory + repo_dir = get_test_repo_dir(request, top_output_dir) with NeonEnvBuilder( top_output_dir=top_output_dir, repo_dir=repo_dir, port_distributor=port_distributor, - broker=default_broker, mock_s3_server=mock_s3_server, neon_binpath=neon_binpath, pg_distrib_dir=pg_distrib_dir, @@ -1457,30 +1358,13 @@ def _shared_simple_env( pageserver_virtual_file_io_engine=pageserver_virtual_file_io_engine, pageserver_aux_file_policy=pageserver_aux_file_policy, pageserver_default_tenant_config_compaction_algorithm=pageserver_default_tenant_config_compaction_algorithm, + pageserver_io_buffer_alignment=pageserver_io_buffer_alignment, ) as builder: env = builder.init_start() - # For convenience in tests, create a branch from the freshly-initialized cluster. - env.neon_cli.create_branch("empty", ancestor_branch_name=DEFAULT_BRANCH_NAME) - yield env -@pytest.fixture(scope="function") -def neon_simple_env(_shared_simple_env: NeonEnv) -> Iterator[NeonEnv]: - """ - Simple Neon environment, with no authentication and no safekeepers. - - If TEST_SHARED_FIXTURES environment variable is set, we reuse the same - environment for all tests that use 'neon_simple_env', keeping the - page server and safekeepers running. Any compute nodes are stopped after - each the test, however. 
- """ - yield _shared_simple_env - - _shared_simple_env.endpoints.stop_all() - - @pytest.fixture(scope="function") def neon_env_builder( pytestconfig: Config, @@ -1490,7 +1374,6 @@ def neon_env_builder( neon_binpath: Path, pg_distrib_dir: Path, pg_version: PgVersion, - default_broker: NeonBroker, run_id: uuid.UUID, request: FixtureRequest, test_overlay_dir: Path, @@ -1499,6 +1382,7 @@ def neon_env_builder( pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]], pageserver_aux_file_policy: Optional[AuxFileStore], record_property: Callable[[str, object], None], + pageserver_io_buffer_alignment: Optional[int], ) -> Iterator[NeonEnvBuilder]: """ Fixture to create a Neon environment for test. @@ -1525,7 +1409,6 @@ def neon_env_builder( neon_binpath=neon_binpath, pg_distrib_dir=pg_distrib_dir, pg_version=pg_version, - broker=default_broker, run_id=run_id, preserve_database_files=cast(bool, pytestconfig.getoption("--preserve-database-files")), pageserver_virtual_file_io_engine=pageserver_virtual_file_io_engine, @@ -1534,6 +1417,7 @@ def neon_env_builder( test_overlay_dir=test_overlay_dir, pageserver_aux_file_policy=pageserver_aux_file_policy, pageserver_default_tenant_config_compaction_algorithm=pageserver_default_tenant_config_compaction_algorithm, + pageserver_io_buffer_alignment=pageserver_io_buffer_alignment, ) as builder: yield builder # Propogate `preserve_database_files` to make it possible to use in other fixtures, @@ -1547,14 +1431,6 @@ class PageserverPort: http: int -CREATE_TIMELINE_ID_EXTRACTOR: re.Pattern = re.compile( # type: ignore[type-arg] - r"^Created timeline '(?P[^']+)'", re.MULTILINE -) -TIMELINE_DATA_EXTRACTOR: re.Pattern = re.compile( # type: ignore[type-arg] - r"\s?(?P[^\s]+)\s\[(?P[^\]]+)\]", re.MULTILINE -) - - class AbstractNeonCli(abc.ABC): """ A typed wrapper around an arbitrary Neon CLI tool. 
@@ -1783,6 +1659,9 @@ class NeonCli(AbstractNeonCli): tenant_id: Optional[TenantId] = None, timeline_id: Optional[TimelineId] = None, ) -> TimelineId: + if timeline_id is None: + timeline_id = TimelineId.generate() + cmd = [ "timeline", "create", @@ -1790,23 +1669,16 @@ class NeonCli(AbstractNeonCli): new_branch_name, "--tenant-id", str(tenant_id or self.env.initial_tenant), + "--timeline-id", + str(timeline_id), "--pg-version", self.env.pg_version, ] - if timeline_id is not None: - cmd.extend(["--timeline-id", str(timeline_id)]) - res = self.raw_cli(cmd) res.check_returncode() - matches = CREATE_TIMELINE_ID_EXTRACTOR.search(res.stdout) - - created_timeline_id = None - if matches is not None: - created_timeline_id = matches.group("timeline_id") - - return TimelineId(str(created_timeline_id)) + return timeline_id def create_branch( self, @@ -1814,12 +1686,17 @@ class NeonCli(AbstractNeonCli): ancestor_branch_name: Optional[str] = None, tenant_id: Optional[TenantId] = None, ancestor_start_lsn: Optional[Lsn] = None, + new_timeline_id: Optional[TimelineId] = None, ) -> TimelineId: + if new_timeline_id is None: + new_timeline_id = TimelineId.generate() cmd = [ "timeline", "branch", "--branch-name", new_branch_name, + "--timeline-id", + str(new_timeline_id), "--tenant-id", str(tenant_id or self.env.initial_tenant), ] @@ -1831,16 +1708,7 @@ class NeonCli(AbstractNeonCli): res = self.raw_cli(cmd) res.check_returncode() - matches = CREATE_TIMELINE_ID_EXTRACTOR.search(res.stdout) - - created_timeline_id = None - if matches is not None: - created_timeline_id = matches.group("timeline_id") - - if created_timeline_id is None: - raise Exception("could not find timeline id after `neon timeline create` invocation") - else: - return TimelineId(str(created_timeline_id)) + return TimelineId(str(new_timeline_id)) def list_timelines(self, tenant_id: Optional[TenantId] = None) -> List[Tuple[str, TimelineId]]: """ @@ -1849,6 +1717,9 @@ class NeonCli(AbstractNeonCli): # main 
[b49f7954224a0ad25cc0013ea107b54b] # ┣━ @0/16B5A50: test_cli_branch_list_main [20f98c79111b9015d84452258b7d5540] + TIMELINE_DATA_EXTRACTOR: re.Pattern = re.compile( # type: ignore[type-arg] + r"\s?(?P[^\s]+)\s\[(?P[^\]]+)\]", re.MULTILINE + ) res = self.raw_cli( ["timeline", "list", "--tenant-id", str(tenant_id or self.env.initial_tenant)] ) @@ -1959,6 +1830,18 @@ class NeonCli(AbstractNeonCli): args.extend(["-m", "immediate"]) return self.raw_cli(args) + def broker_start( + self, timeout_in_seconds: Optional[int] = None + ) -> "subprocess.CompletedProcess[str]": + cmd = ["storage_broker", "start"] + if timeout_in_seconds is not None: + cmd.append(f"--start-timeout={timeout_in_seconds}s") + return self.raw_cli(cmd) + + def broker_stop(self) -> "subprocess.CompletedProcess[str]": + cmd = ["storage_broker", "stop"] + return self.raw_cli(cmd) + def endpoint_create( self, branch_name: str, @@ -2284,7 +2167,7 @@ class NeonStorageController(MetricsGetter, LogUtils): self.allowed_errors, ) - def pageserver_api(self) -> PageserverHttpClient: + def pageserver_api(self, *args, **kwargs) -> PageserverHttpClient: """ The storage controller implements a subset of the pageserver REST API, for mapping per-tenant actions into per-shard actions (e.g. timeline creation). 
Tests should invoke those @@ -2293,7 +2176,7 @@ class NeonStorageController(MetricsGetter, LogUtils): auth_token = None if self.auth_enabled: auth_token = self.env.auth_keys.generate_token(scope=TokenScope.PAGE_SERVER_API) - return PageserverHttpClient(self.port, lambda: True, auth_token) + return PageserverHttpClient(self.port, lambda: True, auth_token, *args, **kwargs) def request(self, method, *args, **kwargs) -> requests.Response: resp = requests.request(method, *args, **kwargs) @@ -2390,6 +2273,7 @@ class NeonStorageController(MetricsGetter, LogUtils): "listen_http_port": node.service_port.http, "listen_pg_addr": "localhost", "listen_pg_port": node.service_port.pg, + "availability_zone_id": node.az_id, } log.info(f"node_register({body})") self.request( @@ -2533,7 +2417,7 @@ class NeonStorageController(MetricsGetter, LogUtils): def tenant_describe(self, tenant_id: TenantId): """ - :return: list of {"shard_id": "", "node_id": int, "listen_pg_addr": str, "listen_pg_port": int, "listen_http_addr: str, "listen_http_port: int} + :return: list of {"shard_id": "", "node_id": int, "listen_pg_addr": str, "listen_pg_port": int, "listen_http_addr: str, "listen_http_port: int, preferred_az_id: str} """ response = self.request( "GET", @@ -2543,6 +2427,30 @@ class NeonStorageController(MetricsGetter, LogUtils): response.raise_for_status() return response.json() + def nodes(self): + """ + :return: list of {"id": ""} + """ + response = self.request( + "GET", + f"{self.api}/control/v1/node", + headers=self.headers(TokenScope.ADMIN), + ) + response.raise_for_status() + return response.json() + + def node_shards(self, node_id: NodeId): + """ + :return: list of {"shard_id": "", "is_secondary": bool} + """ + response = self.request( + "GET", + f"{self.api}/control/v1/node/{node_id}/shards", + headers=self.headers(TokenScope.ADMIN), + ) + response.raise_for_status() + return response.json() + def tenant_shard_split( self, tenant_id: TenantId, shard_count: int, shard_stripe_size: 
Optional[int] = None ) -> list[TenantShardId]: @@ -2836,6 +2744,40 @@ class NeonStorageController(MetricsGetter, LogUtils): raise AssertionError("unreachable") + def on_safekeeper_deploy(self, id: int, body: dict[str, Any]): + self.request( + "POST", + f"{self.api}/control/v1/safekeeper/{id}", + headers=self.headers(TokenScope.ADMIN), + json=body, + ) + + def get_safekeeper(self, id: int) -> Optional[dict[str, Any]]: + try: + response = self.request( + "GET", + f"{self.api}/control/v1/safekeeper/{id}", + headers=self.headers(TokenScope.ADMIN), + ) + json = response.json() + assert isinstance(json, dict) + return json + except StorageControllerApiException as e: + if e.status_code == 404: + return None + raise e + + def set_preferred_azs(self, preferred_azs: dict[TenantShardId, str]) -> list[TenantShardId]: + response = self.request( + "PUT", + f"{self.api}/control/v1/preferred_azs", + headers=self.headers(TokenScope.ADMIN), + json={str(tid): az for tid, az in preferred_azs.items()}, + ) + + response.raise_for_status() + return [TenantShardId.parse(tid) for tid in response.json()["updated"]] + def __enter__(self) -> "NeonStorageController": return self @@ -2913,10 +2855,11 @@ class NeonPageserver(PgProtocol, LogUtils): TEMP_FILE_SUFFIX = "___temp" - def __init__(self, env: NeonEnv, id: int, port: PageserverPort): + def __init__(self, env: NeonEnv, id: int, port: PageserverPort, az_id: str): super().__init__(host="localhost", port=port.pg, user="cloud_admin") self.env = env self.id = id + self.az_id = az_id self.running = False self.service_port = port self.version = env.get_binary_version("pageserver") @@ -2953,16 +2896,17 @@ class NeonPageserver(PgProtocol, LogUtils): def config_toml_path(self) -> Path: return self.workdir / "pageserver.toml" - def edit_config_toml(self, edit_fn: Callable[[Dict[str, Any]], None]): + def edit_config_toml(self, edit_fn: Callable[[Dict[str, Any]], T]) -> T: """ Edit the pageserver's config toml file in place. 
""" path = self.config_toml_path with open(path, "r") as f: config = toml.load(f) - edit_fn(config) + res = edit_fn(config) with open(path, "w") as f: toml.dump(config, f) + return res def patch_config_toml_nonrecursive(self, patch: Dict[str, Any]) -> Dict[str, Any]: """ @@ -3320,12 +3264,12 @@ class PgBin: ) return base_path - def get_pg_controldata_checkpoint_lsn(self, pgdata: str) -> Lsn: + def get_pg_controldata_checkpoint_lsn(self, pgdata: Path) -> Lsn: """ Run pg_controldata on given datadir and extract checkpoint lsn. """ - pg_controldata_path = os.path.join(self.pg_bin_path, "pg_controldata") + pg_controldata_path = self.pg_bin_path / "pg_controldata" cmd = f"{pg_controldata_path} -D {pgdata}" result = subprocess.run(cmd, capture_output=True, text=True, shell=True) checkpoint_lsn = re.findall( @@ -3408,6 +3352,7 @@ class VanillaPostgres(PgProtocol): assert not self.running with open(os.path.join(self.pgdatadir, "postgresql.conf"), "a") as conf_file: conf_file.write("\n".join(options)) + conf_file.write("\n") def edit_hba(self, hba: List[str]): """Prepend hba lines into pg_hba.conf file.""" @@ -3433,9 +3378,9 @@ class VanillaPostgres(PgProtocol): self.running = False self.pg_bin.run_capture(["pg_ctl", "-w", "-D", str(self.pgdatadir), "stop"]) - def get_subdir_size(self, subdir) -> int: + def get_subdir_size(self, subdir: Path) -> int: """Return size of pgdatadir subdirectory in bytes.""" - return get_dir_size(os.path.join(self.pgdatadir, subdir)) + return get_dir_size(self.pgdatadir / subdir) def __enter__(self) -> "VanillaPostgres": return self @@ -3461,6 +3406,7 @@ def vanilla_pg( pg_bin = PgBin(test_output_dir, pg_distrib_dir, pg_version) port = port_distributor.get_port() with VanillaPostgres(pgdatadir, pg_bin, port) as vanilla_pg: + vanilla_pg.configure(["shared_preload_libraries='neon_rmgr'"]) yield vanilla_pg @@ -3961,7 +3907,7 @@ class Endpoint(PgProtocol, LogUtils): self.env = env self.branch_name: Optional[str] = None # dubious self.endpoint_id: 
Optional[str] = None # dubious, see asserts below - self.pgdata_dir: Optional[str] = None # Path to computenode PGDATA + self.pgdata_dir: Optional[Path] = None # Path to computenode PGDATA self.tenant_id = tenant_id self.pg_port = pg_port self.http_port = http_port @@ -4018,7 +3964,7 @@ class Endpoint(PgProtocol, LogUtils): allow_multiple=allow_multiple, ) path = Path("endpoints") / self.endpoint_id / "pgdata" - self.pgdata_dir = os.path.join(self.env.repo_dir, path) + self.pgdata_dir = self.env.repo_dir / path self.logfile = self.endpoint_path() / "compute.log" config_lines = config_lines or [] @@ -4071,21 +4017,21 @@ class Endpoint(PgProtocol, LogUtils): path = Path("endpoints") / self.endpoint_id return self.env.repo_dir / path - def pg_data_dir_path(self) -> str: + def pg_data_dir_path(self) -> Path: """Path to Postgres data directory""" - return os.path.join(self.endpoint_path(), "pgdata") + return self.endpoint_path() / "pgdata" - def pg_xact_dir_path(self) -> str: + def pg_xact_dir_path(self) -> Path: """Path to pg_xact dir""" - return os.path.join(self.pg_data_dir_path(), "pg_xact") + return self.pg_data_dir_path() / "pg_xact" - def pg_twophase_dir_path(self) -> str: + def pg_twophase_dir_path(self) -> Path: """Path to pg_twophase dir""" - return os.path.join(self.pg_data_dir_path(), "pg_twophase") + return self.pg_data_dir_path() / "pg_twophase" - def config_file_path(self) -> str: + def config_file_path(self) -> Path: """Path to the postgresql.conf in the endpoint directory (not the one in pgdata)""" - return os.path.join(self.endpoint_path(), "postgresql.conf") + return self.endpoint_path() / "postgresql.conf" def config(self, lines: List[str]) -> "Endpoint": """ @@ -4140,7 +4086,7 @@ class Endpoint(PgProtocol, LogUtils): json.dump(dict(data_dict, **kwargs), file, indent=4) # Please note: Migrations only run if pg_skip_catalog_updates is false - def wait_for_migrations(self, num_migrations: int = 10): + def wait_for_migrations(self, num_migrations: int = 
11): with self.cursor() as cur: def check_migrations_done(): @@ -4250,7 +4196,7 @@ class Endpoint(PgProtocol, LogUtils): log.info(f'checkpointing at LSN {self.safe_psql("select pg_current_wal_lsn()")[0][0]}') self.safe_psql("checkpoint") assert self.pgdata_dir is not None # please mypy - return get_dir_size(os.path.join(self.pgdata_dir, "pg_wal")) / 1024 / 1024 + return get_dir_size(self.pgdata_dir / "pg_wal") / 1024 / 1024 def clear_shared_buffers(self, cursor: Optional[Any] = None): """ @@ -4543,6 +4489,8 @@ class Safekeeper(LogUtils): def timeline_dir(self, tenant_id, timeline_id) -> Path: return self.data_dir / str(tenant_id) / str(timeline_id) + # List partial uploaded segments of this safekeeper. Works only for + # RemoteStorageKind.LOCAL_FS. def list_uploaded_segments(self, tenant_id: TenantId, timeline_id: TimelineId): tline_path = ( self.env.repo_dir @@ -4552,9 +4500,11 @@ class Safekeeper(LogUtils): / str(timeline_id) ) assert isinstance(self.env.safekeepers_remote_storage, LocalFsStorage) - return self._list_segments_in_dir( + segs = self._list_segments_in_dir( tline_path, lambda name: ".metadata" not in name and ".___temp" not in name ) + mysegs = [s for s in segs if f"sk{self.id}" in s] + return mysegs def list_segments(self, tenant_id, timeline_id) -> List[str]: """ @@ -4615,12 +4565,54 @@ class Safekeeper(LogUtils): wait_until(20, 0.5, paused) +class NeonBroker(LogUtils): + """An object managing storage_broker instance""" + + def __init__(self, env: NeonEnv): + super().__init__(logfile=env.repo_dir / "storage_broker.log") + self.env = env + self.port: int = self.env.port_distributor.get_port() + self.running = False + + def start( + self, + timeout_in_seconds: Optional[int] = None, + ): + assert not self.running + self.env.neon_cli.broker_start(timeout_in_seconds) + self.running = True + return self + + def stop(self): + if self.running: + self.env.neon_cli.broker_stop() + self.running = False + return self + + def listen_addr(self): + return 
f"127.0.0.1:{self.port}" + + def client_url(self): + return f"http://{self.listen_addr()}" + + def assert_no_errors(self): + assert_no_errors(self.logfile, "storage_controller", []) + + +# TODO: Replace with `StrEnum` when we upgrade to python 3.11 +class NodeKind(str, Enum): + PAGESERVER = "pageserver" + SAFEKEEPER = "safekeeper" + + class StorageScrubber: def __init__(self, env: NeonEnv, log_dir: Path): self.env = env self.log_dir = log_dir - def scrubber_cli(self, args: list[str], timeout) -> str: + def scrubber_cli( + self, args: list[str], timeout, extra_env: Optional[Dict[str, str]] = None + ) -> str: assert isinstance(self.env.pageserver_remote_storage, S3Storage) s3_storage = self.env.pageserver_remote_storage @@ -4635,6 +4627,9 @@ class StorageScrubber: if s3_storage.endpoint is not None: env.update({"AWS_ENDPOINT_URL": s3_storage.endpoint}) + if extra_env is not None: + env.update(extra_env) + base_args = [ str(self.env.neon_binpath / "storage_scrubber"), f"--controller-api={self.env.storage_controller.api_root()}", @@ -4662,18 +4657,43 @@ class StorageScrubber: assert stdout is not None return stdout - def scan_metadata(self, post_to_storage_controller: bool = False) -> Tuple[bool, Any]: + def scan_metadata_safekeeper( + self, + timeline_lsns: List[Dict[str, Any]], + cloud_admin_api_url: str, + cloud_admin_api_token: str, + ) -> Tuple[bool, Any]: + extra_env = { + "CLOUD_ADMIN_API_URL": cloud_admin_api_url, + "CLOUD_ADMIN_API_TOKEN": cloud_admin_api_token, + } + return self.scan_metadata( + node_kind=NodeKind.SAFEKEEPER, timeline_lsns=timeline_lsns, extra_env=extra_env + ) + + def scan_metadata( + self, + post_to_storage_controller: bool = False, + node_kind: NodeKind = NodeKind.PAGESERVER, + timeline_lsns: Optional[List[Dict[str, Any]]] = None, + extra_env: Optional[Dict[str, str]] = None, + ) -> Tuple[bool, Any]: """ Returns the health status and the metadata summary. 
""" - args = ["scan-metadata", "--node-kind", "pageserver", "--json"] + args = ["scan-metadata", "--node-kind", node_kind.value, "--json"] if post_to_storage_controller: args.append("--post") - stdout = self.scrubber_cli(args, timeout=30) + if timeline_lsns is not None: + args.append("--timeline-lsns") + args.append(json.dumps(timeline_lsns)) + stdout = self.scrubber_cli(args, timeout=30, extra_env=extra_env) try: summary = json.loads(stdout) - healthy = not summary["with_errors"] and not summary["with_warnings"] + # summary does not contain "with_warnings" if node_kind is the safekeeper + no_warnings = "with_warnings" not in summary or not summary["with_warnings"] + healthy = not summary["with_errors"] and no_warnings return healthy, summary except: log.error("Failed to decode JSON output from `scan-metadata`. Dumping stdout:") @@ -4771,14 +4791,7 @@ SMALL_DB_FILE_NAME_REGEX: re.Pattern = re.compile( # type: ignore[type-arg] # This is autouse, so the test output directory always gets created, even -# if a test doesn't put anything there. It also solves a problem with the -# neon_simple_env fixture: if TEST_SHARED_FIXTURES is not set, it -# creates the repo in the test output directory. But it cannot depend on -# 'test_output_dir' fixture, because when TEST_SHARED_FIXTURES is not set, -# it has 'session' scope and cannot access fixtures with 'function' -# scope. So it uses the get_test_output_dir() function to get the path, and -# this fixture ensures that the directory exists. That works because -# 'autouse' fixtures are run before other fixtures. +# if a test doesn't put anything there. 
# # NB: we request the overlay dir fixture so the fixture does its cleanups @pytest.fixture(scope="function", autouse=True) diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index dff002bd4b..f8d9a51c91 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -52,9 +52,6 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = ( ".*Error processing HTTP request: Forbidden", # intentional failpoints ".*failpoint ", - # FIXME: These need investigation - ".*manual_gc.*is_shutdown_requested\\(\\) called in an unexpected task or thread.*", - ".*tenant_list: timeline is not found in remote index while it is present in the tenants registry.*", # Tenant::delete_timeline() can cause any of the four following errors. # FIXME: we shouldn't be considering it an error: https://github.com/neondatabase/neon/issues/2946 ".*could not flush frozen layer.*queue is in state Stopped", # when schedule layer upload fails because queued got closed before compaction got killed diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index cd4261f1b8..582f9c0264 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -10,7 +10,7 @@ import requests from requests.adapters import HTTPAdapter from urllib3.util.retry import Retry -from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId +from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineArchivalState, TimelineId from fixtures.log_helper import log from fixtures.metrics import Metrics, MetricsGetter, parse_metrics from fixtures.pg_version import PgVersion @@ -621,6 +621,22 @@ class PageserverHttpClient(requests.Session, MetricsGetter): ) self.verbose_error(res) + def timeline_archival_config( + self, + tenant_id: Union[TenantId, TenantShardId], + timeline_id: TimelineId, + state: TimelineArchivalState, + ): + 
config = {"state": state.value} + log.info( + f"requesting timeline archival config {config} for tenant {tenant_id} and timeline {timeline_id}" + ) + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/archival_config", + json=config, + ) + self.verbose_error(res) + def timeline_get_lsn_by_timestamp( self, tenant_id: Union[TenantId, TenantShardId], diff --git a/test_runner/fixtures/pageserver/many_tenants.py b/test_runner/fixtures/pageserver/many_tenants.py index 3e0ffabf74..97e63ed4ba 100644 --- a/test_runner/fixtures/pageserver/many_tenants.py +++ b/test_runner/fixtures/pageserver/many_tenants.py @@ -39,7 +39,7 @@ def single_timeline( log.info("detach template tenant form pageserver") env.pageserver.tenant_detach(template_tenant) - log.info(f"duplicating template tenant {ncopies} times in S3") + log.info(f"duplicating template tenant {ncopies} times in remote storage") tenants = fixtures.pageserver.remote_storage.duplicate_tenant(env, template_tenant, ncopies) # In theory we could just attach all the tenants, force on-demand downloads via mgmt API, and be done. diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index b75a480a63..a74fef6a60 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -430,12 +430,17 @@ def enable_remote_storage_versioning( return response -MANY_SMALL_LAYERS_TENANT_CONFIG = { - "gc_period": "0s", - "compaction_period": "0s", - "checkpoint_distance": 1024**2, - "image_creation_threshold": 100, -} +def many_small_layers_tenant_config() -> Dict[str, Any]: + """ + Create a new dict to avoid issues with deleting from the global value. + In python, the global is mutable. 
+ """ + return { + "gc_period": "0s", + "compaction_period": "0s", + "checkpoint_distance": 1024**2, + "image_creation_threshold": 100, + } def poll_for_remote_storage_iterations(remote_storage_kind: RemoteStorageKind) -> int: diff --git a/test_runner/fixtures/parametrize.py b/test_runner/fixtures/parametrize.py index 92c98763e3..2c8e71526c 100644 --- a/test_runner/fixtures/parametrize.py +++ b/test_runner/fixtures/parametrize.py @@ -24,7 +24,7 @@ def build_type() -> Optional[str]: return None -@pytest.fixture(scope="function", autouse=True) +@pytest.fixture(scope="session", autouse=True) def platform() -> Optional[str]: return None @@ -34,6 +34,11 @@ def pageserver_virtual_file_io_engine() -> Optional[str]: return os.getenv("PAGESERVER_VIRTUAL_FILE_IO_ENGINE") +@pytest.fixture(scope="function", autouse=True) +def pageserver_io_buffer_alignment() -> Optional[int]: + return None + + @pytest.fixture(scope="function", autouse=True) def pageserver_aux_file_policy() -> Optional[AuxFileStore]: return None diff --git a/test_runner/fixtures/pg_version.py b/test_runner/fixtures/pg_version.py index e12c8e5f4a..258935959b 100644 --- a/test_runner/fixtures/pg_version.py +++ b/test_runner/fixtures/pg_version.py @@ -16,6 +16,7 @@ class PgVersion(str, enum.Enum): V14 = "14" V15 = "15" V16 = "16" + V17 = "17" # Instead of making version an optional parameter in methods, we can use this fake entry # to explicitly rely on the default server version (could be different from pg_version fixture value) NOT_SET = "<-POSTRGRES VERSION IS NOT SET->" diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py index dd3a0a3d54..96c84d1616 100644 --- a/test_runner/fixtures/safekeeper/http.py +++ b/test_runner/fixtures/safekeeper/http.py @@ -50,6 +50,19 @@ class SafekeeperMetrics(Metrics): ).value +@dataclass +class TermBumpResponse: + previous_term: int + current_term: int + + @classmethod + def from_json(cls, d: Dict[str, Any]) -> "TermBumpResponse": + 
return TermBumpResponse( + previous_term=d["previous_term"], + current_term=d["current_term"], + ) + + class SafekeeperHttpClient(requests.Session, MetricsGetter): HTTPError = requests.HTTPError @@ -65,6 +78,16 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter): def check_status(self): self.get(f"http://localhost:{self.port}/v1/status").raise_for_status() + def get_metrics_str(self) -> str: + """You probably want to use get_metrics() instead.""" + request_result = self.get(f"http://localhost:{self.port}/metrics") + request_result.raise_for_status() + return request_result.text + + def get_metrics(self) -> SafekeeperMetrics: + res = self.get_metrics_str() + return SafekeeperMetrics(parse_metrics(res)) + def is_testing_enabled_or_skip(self): if not self.is_testing_enabled: pytest.skip("safekeeper was built without 'testing' feature") @@ -89,56 +112,8 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter): assert res_json is None return res_json - def debug_dump(self, params: Optional[Dict[str, str]] = None) -> Dict[str, Any]: - params = params or {} - res = self.get(f"http://localhost:{self.port}/v1/debug_dump", params=params) - res.raise_for_status() - res_json = json.loads(res.text) - assert isinstance(res_json, dict) - return res_json - - def patch_control_file( - self, - tenant_id: TenantId, - timeline_id: TimelineId, - patch: Dict[str, Any], - ) -> Dict[str, Any]: - res = self.patch( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/control_file", - json={ - "updates": patch, - "apply_fields": list(patch.keys()), - }, - ) - res.raise_for_status() - res_json = res.json() - assert isinstance(res_json, dict) - return res_json - - def pull_timeline(self, body: Dict[str, Any]) -> Dict[str, Any]: - res = self.post(f"http://localhost:{self.port}/v1/pull_timeline", json=body) - res.raise_for_status() - res_json = res.json() - assert isinstance(res_json, dict) - return res_json - - def copy_timeline(self, tenant_id: 
TenantId, timeline_id: TimelineId, body: Dict[str, Any]): - res = self.post( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/copy", - json=body, - ) - res.raise_for_status() - - def timeline_digest( - self, tenant_id: TenantId, timeline_id: TimelineId, from_lsn: Lsn, until_lsn: Lsn - ) -> Dict[str, Any]: - res = self.get( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/digest", - params={ - "from_lsn": str(from_lsn), - "until_lsn": str(until_lsn), - }, - ) + def tenant_delete_force(self, tenant_id: TenantId) -> Dict[Any, Any]: + res = self.delete(f"http://localhost:{self.port}/v1/tenant/{tenant_id}") res.raise_for_status() res_json = res.json() assert isinstance(res_json, dict) @@ -189,20 +164,6 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter): def get_commit_lsn(self, tenant_id: TenantId, timeline_id: TimelineId) -> Lsn: return self.timeline_status(tenant_id, timeline_id).commit_lsn - def record_safekeeper_info(self, tenant_id: TenantId, timeline_id: TimelineId, body): - res = self.post( - f"http://localhost:{self.port}/v1/record_safekeeper_info/{tenant_id}/{timeline_id}", - json=body, - ) - res.raise_for_status() - - def checkpoint(self, tenant_id: TenantId, timeline_id: TimelineId): - res = self.post( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/checkpoint", - json={}, - ) - res.raise_for_status() - # only_local doesn't remove segments in the remote storage. 
def timeline_delete( self, tenant_id: TenantId, timeline_id: TimelineId, only_local: bool = False @@ -218,19 +179,111 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter): assert isinstance(res_json, dict) return res_json - def tenant_delete_force(self, tenant_id: TenantId) -> Dict[Any, Any]: - res = self.delete(f"http://localhost:{self.port}/v1/tenant/{tenant_id}") + def debug_dump(self, params: Optional[Dict[str, str]] = None) -> Dict[str, Any]: + params = params or {} + res = self.get(f"http://localhost:{self.port}/v1/debug_dump", params=params) + res.raise_for_status() + res_json = json.loads(res.text) + assert isinstance(res_json, dict) + return res_json + + def debug_dump_timeline( + self, timeline_id: TimelineId, params: Optional[Dict[str, str]] = None + ) -> Any: + params = params or {} + params["timeline_id"] = str(timeline_id) + dump = self.debug_dump(params) + return dump["timelines"][0] + + def get_partial_backup(self, timeline_id: TimelineId) -> Any: + dump = self.debug_dump_timeline(timeline_id, {"dump_control_file": "true"}) + return dump["control_file"]["partial_backup"] + + def get_eviction_state(self, timeline_id: TimelineId) -> Any: + dump = self.debug_dump_timeline(timeline_id, {"dump_control_file": "true"}) + return dump["control_file"]["eviction_state"] + + def pull_timeline(self, body: Dict[str, Any]) -> Dict[str, Any]: + res = self.post(f"http://localhost:{self.port}/v1/pull_timeline", json=body) res.raise_for_status() res_json = res.json() assert isinstance(res_json, dict) return res_json - def get_metrics_str(self) -> str: - """You probably want to use get_metrics() instead.""" - request_result = self.get(f"http://localhost:{self.port}/metrics") - request_result.raise_for_status() - return request_result.text + def copy_timeline(self, tenant_id: TenantId, timeline_id: TimelineId, body: Dict[str, Any]): + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/copy", + json=body, + ) + 
res.raise_for_status() - def get_metrics(self) -> SafekeeperMetrics: - res = self.get_metrics_str() - return SafekeeperMetrics(parse_metrics(res)) + def patch_control_file( + self, + tenant_id: TenantId, + timeline_id: TimelineId, + patch: Dict[str, Any], + ) -> Dict[str, Any]: + res = self.patch( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/control_file", + json={ + "updates": patch, + "apply_fields": list(patch.keys()), + }, + ) + res.raise_for_status() + res_json = res.json() + assert isinstance(res_json, dict) + return res_json + + def checkpoint(self, tenant_id: TenantId, timeline_id: TimelineId): + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/checkpoint", + json={}, + ) + res.raise_for_status() + + def timeline_digest( + self, tenant_id: TenantId, timeline_id: TimelineId, from_lsn: Lsn, until_lsn: Lsn + ) -> Dict[str, Any]: + res = self.get( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/digest", + params={ + "from_lsn": str(from_lsn), + "until_lsn": str(until_lsn), + }, + ) + res.raise_for_status() + res_json = res.json() + assert isinstance(res_json, dict) + return res_json + + def backup_partial_reset(self, tenant_id: TenantId, timeline_id: TimelineId): + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/backup_partial_reset", + json={}, + ) + res.raise_for_status() + return res.json() + + def term_bump( + self, + tenant_id: TenantId, + timeline_id: TimelineId, + term: Optional[int], + ) -> TermBumpResponse: + body = {} + if term is not None: + body["term"] = term + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/term_bump", + json=body, + ) + res.raise_for_status() + return TermBumpResponse.from_json(res.json()) + + def record_safekeeper_info(self, tenant_id: TenantId, timeline_id: TimelineId, body): + res = self.post( + 
f"http://localhost:{self.port}/v1/record_safekeeper_info/{tenant_id}/{timeline_id}", + json=body, + ) + res.raise_for_status() diff --git a/test_runner/performance/test_branch_creation.py b/test_runner/performance/test_branch_creation.py index b3866f1813..f1ab7876f9 100644 --- a/test_runner/performance/test_branch_creation.py +++ b/test_runner/performance/test_branch_creation.py @@ -107,7 +107,7 @@ def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int, shape: env.neon_cli.create_branch("b0") endpoint = env.endpoints.create_start("b0") - neon_compare.pg_bin.run_capture(["pgbench", "-i", "-s10", endpoint.connstr()]) + neon_compare.pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s10", endpoint.connstr()]) branch_creation_durations = [] diff --git a/test_runner/performance/test_branching.py b/test_runner/performance/test_branching.py index 667d1a4c4a..f8d39487f2 100644 --- a/test_runner/performance/test_branching.py +++ b/test_runner/performance/test_branching.py @@ -43,7 +43,7 @@ def test_compare_child_and_root_pgbench_perf(neon_compare: NeonCompare): env.neon_cli.create_branch("root") endpoint_root = env.endpoints.create_start("root") - pg_bin.run_capture(["pgbench", "-i", endpoint_root.connstr(), "-s10"]) + pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", endpoint_root.connstr(), "-s10"]) fork_at_current_lsn(env, endpoint_root, "child", "root") diff --git a/test_runner/performance/test_logical_replication.py b/test_runner/performance/test_logical_replication.py index 077f73ac06..dbf94a2cf5 100644 --- a/test_runner/performance/test_logical_replication.py +++ b/test_runner/performance/test_logical_replication.py @@ -22,17 +22,15 @@ if TYPE_CHECKING: def test_logical_replication(neon_simple_env: NeonEnv, pg_bin: PgBin, vanilla_pg): env = neon_simple_env - env.neon_cli.create_branch("test_logical_replication", "empty") - endpoint = env.endpoints.create_start("test_logical_replication") + endpoint = env.endpoints.create_start("main") - 
log.info("postgres is running on 'test_logical_replication' branch") - pg_bin.run_capture(["pgbench", "-i", "-s10", endpoint.connstr()]) + pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s10", endpoint.connstr()]) endpoint.safe_psql("create publication pub1 for table pgbench_accounts, pgbench_history") # now start subscriber vanilla_pg.start() - pg_bin.run_capture(["pgbench", "-i", "-s10", vanilla_pg.connstr()]) + pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s10", vanilla_pg.connstr()]) vanilla_pg.safe_psql("truncate table pgbench_accounts") vanilla_pg.safe_psql("truncate table pgbench_history") @@ -101,9 +99,9 @@ def test_subscriber_lag( sub_connstr = benchmark_project_sub.connstr if benchmark_project_pub.is_new: - pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env) + pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s100"], env=pub_env) if benchmark_project_sub.is_new: - pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env) + pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s100"], env=sub_env) pub_conn = psycopg2.connect(pub_connstr) sub_conn = psycopg2.connect(sub_connstr) @@ -195,8 +193,8 @@ def test_publisher_restart( pub_connstr = benchmark_project_pub.connstr sub_connstr = benchmark_project_sub.connstr - pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env) - pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env) + pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s100"], env=pub_env) + pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s100"], env=sub_env) pub_conn = psycopg2.connect(pub_connstr) sub_conn = psycopg2.connect(sub_connstr) @@ -290,7 +288,7 @@ def test_snap_files( is_super = cur.fetchall()[0][0] assert is_super, "This benchmark won't work if we don't have superuser" - pg_bin.run_capture(["pgbench", "-i", "-s100"], env=env) + pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s100"], env=env) conn = psycopg2.connect(connstr) conn.autocommit = True diff --git 
a/test_runner/performance/test_physical_replication.py b/test_runner/performance/test_physical_replication.py index 7e16197211..49b1176d34 100644 --- a/test_runner/performance/test_physical_replication.py +++ b/test_runner/performance/test_physical_replication.py @@ -85,7 +85,7 @@ def test_ro_replica_lag( endpoint_id=replica["endpoint"]["id"], )["uri"] - pg_bin.run_capture(["pgbench", "-i", "-s100"], env=master_env) + pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s100"], env=master_env) master_workload = pg_bin.run_nonblocking( ["pgbench", "-c10", pgbench_duration, "-Mprepared"], @@ -212,7 +212,7 @@ def test_replication_start_stop( for i in range(num_replicas): replica_env[i]["PGHOST"] = replicas[i]["endpoint"]["host"] - pg_bin.run_capture(["pgbench", "-i", "-s10"], env=master_env) + pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s10"], env=master_env) # Sync replicas with psycopg2.connect(master_connstr) as conn_master: diff --git a/test_runner/performance/test_storage_controller_scale.py b/test_runner/performance/test_storage_controller_scale.py index 297aedfbed..a186bbaceb 100644 --- a/test_runner/performance/test_storage_controller_scale.py +++ b/test_runner/performance/test_storage_controller_scale.py @@ -84,7 +84,7 @@ def test_storage_controller_many_tenants( compute_reconfigure_listener.register_on_notify(lambda body: time.sleep(0.01)) env = neon_env_builder.init_configs() - neon_env_builder.start() + env.start() # We will intentionally stress reconciler concurrrency, which triggers a warning when lots # of shards are hitting the delayed path. 
diff --git a/test_runner/performance/test_wal_backpressure.py b/test_runner/performance/test_wal_backpressure.py index 513ebc74c3..c824e60c29 100644 --- a/test_runner/performance/test_wal_backpressure.py +++ b/test_runner/performance/test_wal_backpressure.py @@ -2,14 +2,14 @@ import statistics import threading import time import timeit -from typing import Any, Callable, List +from typing import Any, Callable, Generator, List import pytest from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker from fixtures.common_types import Lsn from fixtures.compare_fixtures import NeonCompare, PgCompare, VanillaCompare from fixtures.log_helper import log -from fixtures.neon_fixtures import DEFAULT_BRANCH_NAME, NeonEnvBuilder, PgBin +from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, flush_ep_to_pageserver from performance.test_perf_pgbench import get_durations_matrix, get_scales_matrix @@ -20,7 +20,7 @@ from performance.test_perf_pgbench import get_durations_matrix, get_scales_matri # For example, to build a `NeonCompare` interface, the corresponding fixture's param should have # a format of `neon_{safekeepers_enable_fsync}`. # Note that, here "_" is used to separate builder parameters. 
-def pg_compare(request) -> PgCompare: +def pg_compare(request) -> Generator[PgCompare, None, None]: x = request.param.split("_") if x[0] == "vanilla": @@ -28,7 +28,7 @@ def pg_compare(request) -> PgCompare: fixture = request.getfixturevalue("vanilla_compare") assert isinstance(fixture, VanillaCompare) - return fixture + yield fixture else: assert ( len(x) == 2 @@ -47,10 +47,15 @@ def pg_compare(request) -> PgCompare: neon_env_builder.safekeepers_enable_fsync = x[1] == "on" env = neon_env_builder.init_start() - env.neon_cli.create_branch("empty", ancestor_branch_name=DEFAULT_BRANCH_NAME) - branch_name = request.node.name - return NeonCompare(zenbenchmark, env, pg_bin, branch_name) + cmp = NeonCompare(zenbenchmark, env, pg_bin) + + yield cmp + + flush_ep_to_pageserver(env, cmp._pg, cmp.tenant, cmp.timeline) + env.pageserver.http_client().timeline_checkpoint( + cmp.tenant, cmp.timeline, compact=False, wait_until_uploaded=True + ) def start_heavy_write_workload(env: PgCompare, n_tables: int, scale: int, num_iters: int): diff --git a/test_runner/regress/data/extension_test/5670669815/v17/ext_index.json b/test_runner/regress/data/extension_test/5670669815/v17/ext_index.json new file mode 100644 index 0000000000..7990b2c3a2 --- /dev/null +++ b/test_runner/regress/data/extension_test/5670669815/v17/ext_index.json @@ -0,0 +1,7 @@ +{ + "public_extensions": [], + "library_index": { + "TODO": "We still need PG17 extensions" + }, + "extension_data": {} +} \ No newline at end of file diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index a7eda73d4c..bb337d9cc1 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -162,7 +162,6 @@ def test_fully_custom_config(positive_env: NeonEnv): "min_resident_size_override": 23, "timeline_get_throttle": { "task_kinds": ["PageRequestHandler"], - "fair": True, "initial": 0, "refill_interval": "1s", "refill_amount": 
1000, diff --git a/test_runner/regress/test_auth.py b/test_runner/regress/test_auth.py index 7cb85e3dd1..780c0e1602 100644 --- a/test_runner/regress/test_auth.py +++ b/test_runner/regress/test_auth.py @@ -211,7 +211,7 @@ def test_auth_failures(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): def check_pageserver(expect_success: bool, **conn_kwargs): check_connection( env.pageserver, - f"pagestream {env.initial_tenant} {env.initial_timeline}", + f"pagestream_v2 {env.initial_tenant} {env.initial_timeline}", expect_success, **conn_kwargs, ) diff --git a/test_runner/regress/test_basebackup_error.py b/test_runner/regress/test_basebackup_error.py index 170b494884..13c080ea0e 100644 --- a/test_runner/regress/test_basebackup_error.py +++ b/test_runner/regress/test_basebackup_error.py @@ -8,11 +8,10 @@ from fixtures.neon_fixtures import NeonEnv # def test_basebackup_error(neon_simple_env: NeonEnv): env = neon_simple_env - env.neon_cli.create_branch("test_basebackup_error", "empty") pageserver_http = env.pageserver.http_client() # Introduce failpoint pageserver_http.configure_failpoints(("basebackup-before-control-file", "return")) with pytest.raises(Exception, match="basebackup-before-control-file"): - env.endpoints.create_start("test_basebackup_error") + env.endpoints.create_start("main") diff --git a/test_runner/regress/test_branching.py b/test_runner/regress/test_branching.py index fc74707639..1729e2fc98 100644 --- a/test_runner/regress/test_branching.py +++ b/test_runner/regress/test_branching.py @@ -52,7 +52,7 @@ def test_branching_with_pgbench( def run_pgbench(connstr: str): log.info(f"Start a pgbench workload on pg {connstr}") - pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr]) + pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", f"-s{scale}", connstr]) pg_bin.run_capture(["pgbench", "-T15", connstr]) env.neon_cli.create_branch("b0", tenant_id=tenant) diff --git a/test_runner/regress/test_clog_truncate.py b/test_runner/regress/test_clog_truncate.py 
index 26e6e336b9..6e4880841a 100644 --- a/test_runner/regress/test_clog_truncate.py +++ b/test_runner/regress/test_clog_truncate.py @@ -11,7 +11,6 @@ from fixtures.utils import query_scalar # def test_clog_truncate(neon_simple_env: NeonEnv): env = neon_simple_env - env.neon_cli.create_branch("test_clog_truncate", "empty") # set aggressive autovacuum to make sure that truncation will happen config = [ @@ -24,7 +23,7 @@ def test_clog_truncate(neon_simple_env: NeonEnv): "autovacuum_freeze_max_age=100000", ] - endpoint = env.endpoints.create_start("test_clog_truncate", config_lines=config) + endpoint = env.endpoints.create_start("main", config_lines=config) # Install extension containing function needed for test endpoint.safe_psql("CREATE EXTENSION neon_test_utils") @@ -58,7 +57,7 @@ def test_clog_truncate(neon_simple_env: NeonEnv): # create new branch after clog truncation and start a compute node on it log.info(f"create branch at lsn_after_truncation {lsn_after_truncation}") env.neon_cli.create_branch( - "test_clog_truncate_new", "test_clog_truncate", ancestor_start_lsn=lsn_after_truncation + "test_clog_truncate_new", "main", ancestor_start_lsn=lsn_after_truncation ) endpoint2 = env.endpoints.create_start("test_clog_truncate_new") diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index de27191945..fb5c1d3115 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -11,12 +11,17 @@ import pytest import toml from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, PgBin, flush_ep_to_pageserver +from fixtures.neon_fixtures import ( + NeonEnv, + NeonEnvBuilder, + PgBin, + flush_ep_to_pageserver, +) from fixtures.pageserver.http import PageserverApiException from fixtures.pageserver.utils import ( timeline_delete_wait_completed, ) -from fixtures.pg_version import PgVersion +from 
fixtures.pg_version import PgVersion, skip_on_postgres from fixtures.remote_storage import RemoteStorageKind, S3Storage, s3_storage from fixtures.workload import Workload @@ -144,9 +149,16 @@ def test_create_snapshot( ) +# check_neon_works does recovery from WAL => the compatibility snapshot's WAL is old => will log this warning +ingest_lag_log_line = ".*ingesting record with timestamp lagging more than wait_lsn_timeout.*" + + @check_ondisk_data_compatibility_if_enabled @pytest.mark.xdist_group("compatibility") @pytest.mark.order(after="test_create_snapshot") +@skip_on_postgres( + PgVersion.V17, "There are no snapshots yet" +) # TODO: revert this once we have snapshots def test_backward_compatibility( neon_env_builder: NeonEnvBuilder, test_output_dir: Path, @@ -168,7 +180,8 @@ def test_backward_compatibility( try: neon_env_builder.num_safekeepers = 3 env = neon_env_builder.from_repo_dir(compatibility_snapshot_dir / "repo") - neon_env_builder.start() + env.pageserver.allowed_errors.append(ingest_lag_log_line) + env.start() check_neon_works( env, @@ -176,6 +189,9 @@ def test_backward_compatibility( sql_dump_path=compatibility_snapshot_dir / "dump.sql", repo_dir=env.repo_dir, ) + + env.pageserver.assert_log_contains(ingest_lag_log_line) + except Exception: if breaking_changes_allowed: pytest.xfail( @@ -190,6 +206,9 @@ def test_backward_compatibility( @check_ondisk_data_compatibility_if_enabled @pytest.mark.xdist_group("compatibility") @pytest.mark.order(after="test_create_snapshot") +@skip_on_postgres( + PgVersion.V17, "There are no snapshots yet" +) # TODO: revert this once we have snapshots def test_forward_compatibility( neon_env_builder: NeonEnvBuilder, test_output_dir: Path, @@ -233,6 +252,8 @@ def test_forward_compatibility( env = neon_env_builder.from_repo_dir( compatibility_snapshot_dir / "repo", ) + # there may be an arbitrary number of unrelated tests run between create_snapshot and here + env.pageserver.allowed_errors.append(ingest_lag_log_line) # not using 
env.pageserver.version because it was initialized before prev_pageserver_version_str = env.get_binary_version("pageserver") @@ -250,7 +271,7 @@ def test_forward_compatibility( # does not include logs from previous runs assert not env.pageserver.log_contains("git-env:" + prev_pageserver_version) - neon_env_builder.start() + env.start() # ensure the specified pageserver is running assert env.pageserver.log_contains("git-env:" + prev_pageserver_version) @@ -296,7 +317,7 @@ def check_neon_works(env: NeonEnv, test_output_dir: Path, sql_dump_path: Path, r pg_version = env.pg_version # Stop endpoint while we recreate timeline - ep.stop() + flush_ep_to_pageserver(env, ep, tenant_id, timeline_id) try: pageserver_http.timeline_preserve_initdb_archive(tenant_id, timeline_id) @@ -344,6 +365,11 @@ def check_neon_works(env: NeonEnv, test_output_dir: Path, sql_dump_path: Path, r assert not dump_from_wal_differs, "dump from WAL differs" assert not initial_dump_differs, "initial dump differs" + flush_ep_to_pageserver(env, ep, tenant_id, timeline_id) + pageserver_http.timeline_checkpoint( + tenant_id, timeline_id, compact=False, wait_until_uploaded=True + ) + def dump_differs( first: Path, second: Path, output: Path, allowed_diffs: Optional[List[str]] = None diff --git a/test_runner/regress/test_compute_catalog.py b/test_runner/regress/test_compute_catalog.py index dd36190fcd..8b8c970357 100644 --- a/test_runner/regress/test_compute_catalog.py +++ b/test_runner/regress/test_compute_catalog.py @@ -4,9 +4,8 @@ from fixtures.neon_fixtures import NeonEnv def test_compute_catalog(neon_simple_env: NeonEnv): env = neon_simple_env - env.neon_cli.create_branch("test_config", "empty") - endpoint = env.endpoints.create_start("test_config", config_lines=["log_min_messages=debug1"]) + endpoint = env.endpoints.create_start("main", config_lines=["log_min_messages=debug1"]) client = endpoint.http_client() objects = client.dbs_and_roles() diff --git a/test_runner/regress/test_config.py 
b/test_runner/regress/test_config.py index 4bb7df1e6a..d8ef0b8dbd 100644 --- a/test_runner/regress/test_config.py +++ b/test_runner/regress/test_config.py @@ -1,6 +1,7 @@ +import os from contextlib import closing -from fixtures.neon_fixtures import NeonEnv +from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder # @@ -8,10 +9,9 @@ from fixtures.neon_fixtures import NeonEnv # def test_config(neon_simple_env: NeonEnv): env = neon_simple_env - env.neon_cli.create_branch("test_config", "empty") # change config - endpoint = env.endpoints.create_start("test_config", config_lines=["log_min_messages=debug1"]) + endpoint = env.endpoints.create_start("main", config_lines=["log_min_messages=debug1"]) with closing(endpoint.connect()) as conn: with conn.cursor() as cur: @@ -28,3 +28,45 @@ def test_config(neon_simple_env: NeonEnv): # check that config change was applied assert cur.fetchone() == ("debug1",) + + +# +# Test that reordering of safekeepers does not restart walproposer +# +def test_safekeepers_reconfigure_reorder( + neon_env_builder: NeonEnvBuilder, +): + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() + env.neon_cli.create_branch("test_safekeepers_reconfigure_reorder") + + endpoint = env.endpoints.create_start("test_safekeepers_reconfigure_reorder") + + old_sks = "" + with closing(endpoint.connect()) as conn: + with conn.cursor() as cur: + cur.execute("SHOW neon.safekeepers") + res = cur.fetchone() + assert res is not None, "neon.safekeepers GUC is set" + old_sks = res[0] + + # Reorder safekeepers + safekeepers = endpoint.active_safekeepers + safekeepers = safekeepers[1:] + safekeepers[:1] + + endpoint.reconfigure(safekeepers=safekeepers) + + with closing(endpoint.connect()) as conn: + with conn.cursor() as cur: + cur.execute("SHOW neon.safekeepers") + res = cur.fetchone() + assert res is not None, "neon.safekeepers GUC is set" + new_sks = res[0] + + assert new_sks != old_sks, "GUC changes were applied" + + log_path = 
os.path.join(endpoint.endpoint_path(), "compute.log") + with open(log_path, "r") as log_file: + logs = log_file.read() + # Check that walproposer was not restarted + assert "restarting walproposer" not in logs diff --git a/test_runner/regress/test_createdropdb.py b/test_runner/regress/test_createdropdb.py index f741a9fc87..af643f45d7 100644 --- a/test_runner/regress/test_createdropdb.py +++ b/test_runner/regress/test_createdropdb.py @@ -17,9 +17,7 @@ def test_createdb(neon_simple_env: NeonEnv, strategy: str): if env.pg_version == PgVersion.V14 and strategy == "wal_log": pytest.skip("wal_log strategy not supported on PostgreSQL 14") - env.neon_cli.create_branch("test_createdb", "empty") - - endpoint = env.endpoints.create_start("test_createdb") + endpoint = env.endpoints.create_start("main") with endpoint.cursor() as cur: # Cause a 'relmapper' change in the original branch @@ -33,7 +31,7 @@ def test_createdb(neon_simple_env: NeonEnv, strategy: str): lsn = query_scalar(cur, "SELECT pg_current_wal_insert_lsn()") # Create a branch - env.neon_cli.create_branch("test_createdb2", "test_createdb", ancestor_start_lsn=lsn) + env.neon_cli.create_branch("test_createdb2", "main", ancestor_start_lsn=lsn) endpoint2 = env.endpoints.create_start("test_createdb2") # Test that you can connect to the new database on both branches @@ -62,8 +60,7 @@ def test_createdb(neon_simple_env: NeonEnv, strategy: str): # def test_dropdb(neon_simple_env: NeonEnv, test_output_dir): env = neon_simple_env - env.neon_cli.create_branch("test_dropdb", "empty") - endpoint = env.endpoints.create_start("test_dropdb") + endpoint = env.endpoints.create_start("main") with endpoint.cursor() as cur: cur.execute("CREATE DATABASE foodb") @@ -80,14 +77,10 @@ def test_dropdb(neon_simple_env: NeonEnv, test_output_dir): lsn_after_drop = query_scalar(cur, "SELECT pg_current_wal_insert_lsn()") # Create two branches before and after database drop. 
- env.neon_cli.create_branch( - "test_before_dropdb", "test_dropdb", ancestor_start_lsn=lsn_before_drop - ) + env.neon_cli.create_branch("test_before_dropdb", "main", ancestor_start_lsn=lsn_before_drop) endpoint_before = env.endpoints.create_start("test_before_dropdb") - env.neon_cli.create_branch( - "test_after_dropdb", "test_dropdb", ancestor_start_lsn=lsn_after_drop - ) + env.neon_cli.create_branch("test_after_dropdb", "main", ancestor_start_lsn=lsn_after_drop) endpoint_after = env.endpoints.create_start("test_after_dropdb") # Test that database exists on the branch before drop diff --git a/test_runner/regress/test_createuser.py b/test_runner/regress/test_createuser.py index 17d9824f52..d6f138e126 100644 --- a/test_runner/regress/test_createuser.py +++ b/test_runner/regress/test_createuser.py @@ -7,8 +7,7 @@ from fixtures.utils import query_scalar # def test_createuser(neon_simple_env: NeonEnv): env = neon_simple_env - env.neon_cli.create_branch("test_createuser", "empty") - endpoint = env.endpoints.create_start("test_createuser") + endpoint = env.endpoints.create_start("main") with endpoint.cursor() as cur: # Cause a 'relmapper' change in the original branch @@ -19,7 +18,7 @@ def test_createuser(neon_simple_env: NeonEnv): lsn = query_scalar(cur, "SELECT pg_current_wal_insert_lsn()") # Create a branch - env.neon_cli.create_branch("test_createuser2", "test_createuser", ancestor_start_lsn=lsn) + env.neon_cli.create_branch("test_createuser2", "main", ancestor_start_lsn=lsn) endpoint2 = env.endpoints.create_start("test_createuser2") # Test that you can connect to new branch as a new user diff --git a/test_runner/regress/test_ddl_forwarding.py b/test_runner/regress/test_ddl_forwarding.py index 50da673d87..65f310c27a 100644 --- a/test_runner/regress/test_ddl_forwarding.py +++ b/test_runner/regress/test_ddl_forwarding.py @@ -290,9 +290,8 @@ def assert_db_connlimit(endpoint: Any, db_name: str, connlimit: int, msg: str): # Here we test the latter. 
The first one is tested in test_ddl_forwarding def test_ddl_forwarding_invalid_db(neon_simple_env: NeonEnv): env = neon_simple_env - env.neon_cli.create_branch("test_ddl_forwarding_invalid_db", "empty") endpoint = env.endpoints.create_start( - "test_ddl_forwarding_invalid_db", + "main", # Some non-existent url config_lines=["neon.console_url=http://localhost:9999/unknown/api/v0/roles_and_databases"], ) diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index 85616c3fe2..1fec8b3f18 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -291,7 +291,7 @@ def pgbench_init_tenant( ) with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: - pg_bin.run(["pgbench", "-i", f"-s{scale}", endpoint.connstr()]) + pg_bin.run(["pgbench", "-i", "-I", "dtGvp", f"-s{scale}", endpoint.connstr()]) wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) return (tenant_id, timeline_id) diff --git a/test_runner/regress/test_download_extensions.py b/test_runner/regress/test_download_extensions.py index 27eb05ac09..7370eb1456 100644 --- a/test_runner/regress/test_download_extensions.py +++ b/test_runner/regress/test_download_extensions.py @@ -44,6 +44,8 @@ def test_remote_extensions( ): if pg_version == PgVersion.V16: pytest.skip("TODO: PG16 extension building") + if pg_version == PgVersion.V17: + pytest.skip("TODO: PG17 extension building") # setup mock http server # that expects request for anon.tar.zst diff --git a/test_runner/regress/test_explain_with_lfc_stats.py b/test_runner/regress/test_explain_with_lfc_stats.py index 5231dedcda..0217c9ac7b 100644 --- a/test_runner/regress/test_explain_with_lfc_stats.py +++ b/test_runner/regress/test_explain_with_lfc_stats.py @@ -10,11 +10,9 @@ def test_explain_with_lfc_stats(neon_simple_env: NeonEnv): cache_dir = Path(env.repo_dir) / "file_cache" cache_dir.mkdir(exist_ok=True) - branchname = 
"test_explain_with_lfc_stats" - env.neon_cli.create_branch(branchname, "empty") - log.info(f"Creating endopint with 1MB shared_buffers and 64 MB LFC for branch {branchname}") + log.info("Creating endpoint with 1MB shared_buffers and 64 MB LFC") endpoint = env.endpoints.create_start( - branchname, + "main", config_lines=[ "shared_buffers='1MB'", f"neon.file_cache_path='{cache_dir}/file.cache'", diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py index ae63136abb..d94704012f 100644 --- a/test_runner/regress/test_hot_standby.py +++ b/test_runner/regress/test_hot_standby.py @@ -199,7 +199,7 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool): def run_pgbench(connstr: str, pg_bin: PgBin): log.info(f"Start a pgbench workload on pg {connstr}") # s10 is about 150MB of data. In debug mode init takes about 15s on SSD. - pg_bin.run_capture(["pgbench", "-i", "-s10", connstr]) + pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s10", connstr]) log.info("pgbench init done") pg_bin.run_capture(["pgbench", "-T60", connstr]) diff --git a/test_runner/regress/test_lfc_resize.py b/test_runner/regress/test_lfc_resize.py index 1b2c7f808f..cb0b30d9c6 100644 --- a/test_runner/regress/test_lfc_resize.py +++ b/test_runner/regress/test_lfc_resize.py @@ -16,9 +16,8 @@ from fixtures.neon_fixtures import NeonEnv, PgBin @pytest.mark.timeout(600) def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin): env = neon_simple_env - env.neon_cli.create_branch("test_lfc_resize", "empty") endpoint = env.endpoints.create_start( - "test_lfc_resize", + "main", config_lines=[ "neon.file_cache_path='file.cache'", "neon.max_file_cache_size=512MB", diff --git a/test_runner/regress/test_lfc_working_set_approximation.py b/test_runner/regress/test_lfc_working_set_approximation.py index 4c53e4e2fd..4a3a949d1a 100644 --- a/test_runner/regress/test_lfc_working_set_approximation.py +++ 
b/test_runner/regress/test_lfc_working_set_approximation.py @@ -12,11 +12,9 @@ def test_lfc_working_set_approximation(neon_simple_env: NeonEnv): cache_dir = Path(env.repo_dir) / "file_cache" cache_dir.mkdir(exist_ok=True) - branchname = "test_approximate_working_set_size" - env.neon_cli.create_branch(branchname, "empty") - log.info(f"Creating endopint with 1MB shared_buffers and 64 MB LFC for branch {branchname}") + log.info("Creating endpoint with 1MB shared_buffers and 64 MB LFC") endpoint = env.endpoints.create_start( - branchname, + "main", config_lines=[ "shared_buffers='1MB'", f"neon.file_cache_path='{cache_dir}/file.cache'", diff --git a/test_runner/regress/test_local_file_cache.py b/test_runner/regress/test_local_file_cache.py index 3c404c3b23..9c38200937 100644 --- a/test_runner/regress/test_local_file_cache.py +++ b/test_runner/regress/test_local_file_cache.py @@ -5,7 +5,7 @@ import threading import time from typing import List -from fixtures.neon_fixtures import DEFAULT_BRANCH_NAME, NeonEnvBuilder +from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.utils import query_scalar @@ -15,11 +15,8 @@ def test_local_file_cache_unlink(neon_env_builder: NeonEnvBuilder): cache_dir = os.path.join(env.repo_dir, "file_cache") os.mkdir(cache_dir) - env.neon_cli.create_branch("empty", ancestor_branch_name=DEFAULT_BRANCH_NAME) - env.neon_cli.create_branch("test_local_file_cache_unlink", "empty") - endpoint = env.endpoints.create_start( - "test_local_file_cache_unlink", + "main", config_lines=[ "shared_buffers='1MB'", f"neon.file_cache_path='{cache_dir}/file.cache'", diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py index 0d18aa43b7..15a3719e0b 100644 --- a/test_runner/regress/test_logical_replication.py +++ b/test_runner/regress/test_logical_replication.py @@ -22,7 +22,7 @@ def random_string(n: int): @pytest.mark.parametrize( - "pageserver_aux_file_policy", [AuxFileStore.V1, AuxFileStore.V2, 
AuxFileStore.CrossValidation] + "pageserver_aux_file_policy", [AuxFileStore.V2, AuxFileStore.CrossValidation] ) def test_aux_file_v2_flag(neon_simple_env: NeonEnv, pageserver_aux_file_policy: AuxFileStore): env = neon_simple_env @@ -31,17 +31,13 @@ def test_aux_file_v2_flag(neon_simple_env: NeonEnv, pageserver_aux_file_policy: assert pageserver_aux_file_policy == tenant_config["switch_aux_file_policy"] -@pytest.mark.parametrize( - "pageserver_aux_file_policy", [AuxFileStore.V1, AuxFileStore.CrossValidation] -) +@pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.CrossValidation]) def test_logical_replication(neon_simple_env: NeonEnv, vanilla_pg): env = neon_simple_env tenant_id = env.initial_tenant - timeline_id = env.neon_cli.create_branch("test_logical_replication", "empty") - endpoint = env.endpoints.create_start( - "test_logical_replication", config_lines=["log_statement=all"] - ) + timeline_id = env.initial_timeline + endpoint = env.endpoints.create_start("main", config_lines=["log_statement=all"]) pg_conn = endpoint.connect() cur = pg_conn.cursor() @@ -175,9 +171,7 @@ COMMIT; # Test that neon.logical_replication_max_snap_files works -@pytest.mark.parametrize( - "pageserver_aux_file_policy", [AuxFileStore.V1, AuxFileStore.CrossValidation] -) +@pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.CrossValidation]) def test_obsolete_slot_drop(neon_simple_env: NeonEnv, vanilla_pg): def slot_removed(ep): assert ( @@ -189,10 +183,9 @@ def test_obsolete_slot_drop(neon_simple_env: NeonEnv, vanilla_pg): env = neon_simple_env - env.neon_cli.create_branch("test_logical_replication", "empty") # set low neon.logical_replication_max_snap_files endpoint = env.endpoints.create_start( - "test_logical_replication", + "main", config_lines=["log_statement=all", "neon.logical_replication_max_snap_files=1"], ) @@ -355,9 +348,7 @@ FROM generate_series(1, 16384) AS seq; -- Inserts enough rows to exceed 16MB of # # Most pages start with a contrecord, 
so we don't do anything special # to ensure that. -@pytest.mark.parametrize( - "pageserver_aux_file_policy", [AuxFileStore.V1, AuxFileStore.CrossValidation] -) +@pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.CrossValidation]) def test_restart_endpoint(neon_simple_env: NeonEnv, vanilla_pg): env = neon_simple_env @@ -402,9 +393,7 @@ def test_restart_endpoint(neon_simple_env: NeonEnv, vanilla_pg): # logical replication bug as such, but without logical replication, # records passed ot the WAL redo process are never large enough to hit # the bug. -@pytest.mark.parametrize( - "pageserver_aux_file_policy", [AuxFileStore.V1, AuxFileStore.CrossValidation] -) +@pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.CrossValidation]) def test_large_records(neon_simple_env: NeonEnv, vanilla_pg): env = neon_simple_env @@ -476,13 +465,11 @@ def test_slots_and_branching(neon_simple_env: NeonEnv): ws_cur.execute("select pg_create_logical_replication_slot('my_slot', 'pgoutput')") -@pytest.mark.parametrize( - "pageserver_aux_file_policy", [AuxFileStore.V1, AuxFileStore.CrossValidation] -) +@pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.CrossValidation]) def test_replication_shutdown(neon_simple_env: NeonEnv): # Ensure Postgres can exit without stuck when a replication job is active + neon extension installed env = neon_simple_env - env.neon_cli.create_branch("test_replication_shutdown_publisher", "empty") + env.neon_cli.create_branch("test_replication_shutdown_publisher", "main") pub = env.endpoints.create("test_replication_shutdown_publisher") env.neon_cli.create_branch("test_replication_shutdown_subscriber") diff --git a/test_runner/regress/test_migrations.py b/test_runner/regress/test_migrations.py index bdc5ca907e..7211619a99 100644 --- a/test_runner/regress/test_migrations.py +++ b/test_runner/regress/test_migrations.py @@ -9,13 +9,12 @@ if TYPE_CHECKING: def test_migrations(neon_simple_env: NeonEnv): env = 
neon_simple_env - env.neon_cli.create_branch("test_migrations", "empty") - endpoint = env.endpoints.create("test_migrations") + endpoint = env.endpoints.create("main") endpoint.respec(skip_pg_catalog_updates=False) endpoint.start() - num_migrations = 10 + num_migrations = 11 endpoint.wait_for_migrations(num_migrations=num_migrations) with endpoint.cursor() as cur: diff --git a/test_runner/regress/test_multixact.py b/test_runner/regress/test_multixact.py index 88f7a5db59..8a00f8835f 100644 --- a/test_runner/regress/test_multixact.py +++ b/test_runner/regress/test_multixact.py @@ -14,8 +14,7 @@ from fixtures.utils import query_scalar # def test_multixact(neon_simple_env: NeonEnv, test_output_dir): env = neon_simple_env - env.neon_cli.create_branch("test_multixact", "empty") - endpoint = env.endpoints.create_start("test_multixact") + endpoint = env.endpoints.create_start("main") cur = endpoint.connect().cursor() cur.execute( @@ -73,7 +72,9 @@ def test_multixact(neon_simple_env: NeonEnv, test_output_dir): assert int(next_multixact_id) > int(next_multixact_id_old) # Branch at this point - env.neon_cli.create_branch("test_multixact_new", "test_multixact", ancestor_start_lsn=lsn) + env.neon_cli.create_branch( + "test_multixact_new", ancestor_branch_name="main", ancestor_start_lsn=lsn + ) endpoint_new = env.endpoints.create_start("test_multixact_new") next_multixact_id_new = endpoint_new.safe_psql( diff --git a/test_runner/regress/test_neon_cli.py b/test_runner/regress/test_neon_cli.py index ba170cfb4c..b65430ff49 100644 --- a/test_runner/regress/test_neon_cli.py +++ b/test_runner/regress/test_neon_cli.py @@ -134,6 +134,7 @@ def test_cli_start_stop(neon_env_builder: NeonEnvBuilder): env.neon_cli.pageserver_stop(env.pageserver.id) env.neon_cli.safekeeper_stop() env.neon_cli.storage_controller_stop(False) + env.neon_cli.broker_stop() # Keep NeonEnv state up to date, it usually owns starting/stopping services env.pageserver.running = False @@ -176,6 +177,7 @@ def 
test_cli_start_stop_multi(neon_env_builder: NeonEnvBuilder): # Stop this to get out of the way of the following `start` env.neon_cli.storage_controller_stop(False) + env.neon_cli.broker_stop() # Default start res = env.neon_cli.raw_cli(["start"]) diff --git a/test_runner/regress/test_neon_superuser.py b/test_runner/regress/test_neon_superuser.py index fd31df84da..7825ec772c 100644 --- a/test_runner/regress/test_neon_superuser.py +++ b/test_runner/regress/test_neon_superuser.py @@ -6,7 +6,7 @@ from fixtures.utils import wait_until def test_neon_superuser(neon_simple_env: NeonEnv, pg_version: PgVersion): env = neon_simple_env - env.neon_cli.create_branch("test_neon_superuser_publisher", "empty") + env.neon_cli.create_branch("test_neon_superuser_publisher", "main") pub = env.endpoints.create("test_neon_superuser_publisher") env.neon_cli.create_branch("test_neon_superuser_subscriber") diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 73af7950f1..c923713432 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -134,7 +134,7 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder): ) env = neon_env_builder.init_configs() - env.broker.try_start() + env.broker.start() for sk in env.safekeepers: sk.start() env.storage_controller.start() @@ -142,11 +142,10 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder): # We will start a pageserver with no control_plane_api set, so it won't be able to self-register env.storage_controller.node_register(env.pageserver) - replaced_config = env.pageserver.patch_config_toml_nonrecursive( - { - "control_plane_api": "", - } - ) + def remove_control_plane_api_field(config): + return config.pop("control_plane_api") + + control_plane_api = env.pageserver.edit_config_toml(remove_control_plane_api_field) env.pageserver.start() env.storage_controller.node_configure(env.pageserver.id, 
{"availability": "Active"}) @@ -179,7 +178,11 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder): env.pageserver.stop() # Starting without the override that disabled control_plane_api - env.pageserver.patch_config_toml_nonrecursive(replaced_config) + env.pageserver.patch_config_toml_nonrecursive( + { + "control_plane_api": control_plane_api, + } + ) env.pageserver.start() generate_uploads_and_deletions(env, pageserver=env.pageserver, init=False) diff --git a/test_runner/regress/test_pageserver_getpage_throttle.py b/test_runner/regress/test_pageserver_getpage_throttle.py index 111285b40c..4c9eac5cd7 100644 --- a/test_runner/regress/test_pageserver_getpage_throttle.py +++ b/test_runner/regress/test_pageserver_getpage_throttle.py @@ -1,3 +1,4 @@ +import copy import json import uuid @@ -116,3 +117,58 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P assert ( duration_secs >= 10 * actual_smgr_query_seconds ), "smgr metrics should not include throttle wait time" + + +throttle_config_with_field_fair_set = { + "task_kinds": ["PageRequestHandler"], + "fair": True, + "initial": 27, + "refill_interval": "43s", + "refill_amount": 23, + "max": 42, +} + + +def assert_throttle_config_with_field_fair_set(conf): + """ + Field `fair` is ignored, so, responses don't contain it + """ + without_fair = copy.deepcopy(throttle_config_with_field_fair_set) + without_fair.pop("fair") + + assert conf == without_fair + + +def test_throttle_fair_config_is_settable_but_ignored_in_mgmt_api(neon_env_builder: NeonEnvBuilder): + """ + To be removed after https://github.com/neondatabase/neon/pull/8539 is rolled out. 
+ """ + env = neon_env_builder.init_start() + ps_http = env.pageserver.http_client() + # with_fair config should still be settable + ps_http.set_tenant_config( + env.initial_tenant, + {"timeline_get_throttle": throttle_config_with_field_fair_set}, + ) + conf = ps_http.tenant_config(env.initial_tenant) + assert_throttle_config_with_field_fair_set(conf.effective_config["timeline_get_throttle"]) + assert_throttle_config_with_field_fair_set( + conf.tenant_specific_overrides["timeline_get_throttle"] + ) + + +def test_throttle_fair_config_is_settable_but_ignored_in_config_toml( + neon_env_builder: NeonEnvBuilder, +): + """ + To be removed after https://github.com/neondatabase/neon/pull/8539 is rolled out. + """ + + def set_tenant_config(ps_cfg): + ps_cfg["tenant_config"] = {"timeline_get_throttle": throttle_config_with_field_fair_set} + + neon_env_builder.pageserver_config_override = set_tenant_config + env = neon_env_builder.init_start() + ps_http = env.pageserver.http_client() + conf = ps_http.tenant_config(env.initial_tenant) + assert_throttle_config_with_field_fair_set(conf.effective_config["timeline_get_throttle"]) diff --git a/test_runner/regress/test_pageserver_layer_rolling.py b/test_runner/regress/test_pageserver_layer_rolling.py index 66b6185aaa..f6404d68ac 100644 --- a/test_runner/regress/test_pageserver_layer_rolling.py +++ b/test_runner/regress/test_pageserver_layer_rolling.py @@ -247,9 +247,10 @@ def test_total_size_limit(neon_env_builder: NeonEnvBuilder): compaction_period_s = 10 + checkpoint_distance = 1024**3 tenant_conf = { # Large space + time thresholds: effectively disable these limits - "checkpoint_distance": f"{1024 ** 4}", + "checkpoint_distance": f"{checkpoint_distance}", "checkpoint_timeout": "3600s", "compaction_period": f"{compaction_period_s}s", } @@ -269,7 +270,11 @@ def test_total_size_limit(neon_env_builder: NeonEnvBuilder): for tenant, timeline, last_flush_lsn in last_flush_lsns: http_client = env.pageserver.http_client() initdb_lsn = 
Lsn(http_client.timeline_detail(tenant, timeline)["initdb_lsn"]) - total_bytes_ingested += last_flush_lsn - initdb_lsn + this_timeline_ingested = last_flush_lsn - initdb_lsn + assert ( + this_timeline_ingested < checkpoint_distance * 0.8 + ), "this test is supposed to fill InMemoryLayer" + total_bytes_ingested += this_timeline_ingested log.info(f"Ingested {total_bytes_ingested} bytes since initdb (vs max dirty {max_dirty_data})") assert total_bytes_ingested > max_dirty_data diff --git a/test_runner/regress/test_pageserver_metric_collection.py b/test_runner/regress/test_pageserver_metric_collection.py index 24a37b04ec..37ab51f9fb 100644 --- a/test_runner/regress/test_pageserver_metric_collection.py +++ b/test_runner/regress/test_pageserver_metric_collection.py @@ -74,7 +74,7 @@ def test_metric_collection( env.pageserver.allowed_errors.extend( [ ".*metrics endpoint refused the sent metrics*", - ".*metrics_collection: failed to upload to S3: Failed to upload data of length .* to storage path.*", + ".*metrics_collection: failed to upload to remote storage: Failed to upload data of length .* to storage path.*", ] ) diff --git a/test_runner/regress/test_pageserver_reconnect.py b/test_runner/regress/test_pageserver_reconnect.py index 37ff923632..ada6da98ff 100644 --- a/test_runner/regress/test_pageserver_reconnect.py +++ b/test_runner/regress/test_pageserver_reconnect.py @@ -22,7 +22,7 @@ def test_pageserver_reconnect(neon_simple_env: NeonEnv, pg_bin: PgBin): def run_pgbench(connstr: str): log.info(f"Start a pgbench workload on pg {connstr}") - pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr]) + pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", f"-s{scale}", connstr]) pg_bin.run_capture(["pgbench", f"-T{int(n_reconnects*timeout)}", connstr]) thread = threading.Thread(target=run_pgbench, args=(endpoint.connstr(),), daemon=True) diff --git a/test_runner/regress/test_pageserver_restarts_under_workload.py 
b/test_runner/regress/test_pageserver_restarts_under_workload.py index 65569f3bac..9bb9b373ad 100644 --- a/test_runner/regress/test_pageserver_restarts_under_workload.py +++ b/test_runner/regress/test_pageserver_restarts_under_workload.py @@ -19,7 +19,7 @@ def test_pageserver_restarts_under_worload(neon_simple_env: NeonEnv, pg_bin: PgB def run_pgbench(connstr: str): log.info(f"Start a pgbench workload on pg {connstr}") - pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr]) + pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", f"-s{scale}", connstr]) pg_bin.run_capture(["pgbench", f"-T{n_restarts}", connstr]) thread = threading.Thread(target=run_pgbench, args=(endpoint.connstr(),), daemon=True) diff --git a/test_runner/regress/test_parallel_copy.py b/test_runner/regress/test_parallel_copy.py index b33e387a66..a5037e8694 100644 --- a/test_runner/regress/test_parallel_copy.py +++ b/test_runner/regress/test_parallel_copy.py @@ -41,8 +41,7 @@ async def parallel_load_same_table(endpoint: Endpoint, n_parallel: int): # Load data into one table with COPY TO from 5 parallel connections def test_parallel_copy(neon_simple_env: NeonEnv, n_parallel=5): env = neon_simple_env - env.neon_cli.create_branch("test_parallel_copy", "empty") - endpoint = env.endpoints.create_start("test_parallel_copy") + endpoint = env.endpoints.create_start("main") # Create test table conn = endpoint.connect() diff --git a/test_runner/regress/test_pg_query_cancellation.py b/test_runner/regress/test_pg_query_cancellation.py index bad2e5865e..c6b4eff516 100644 --- a/test_runner/regress/test_pg_query_cancellation.py +++ b/test_runner/regress/test_pg_query_cancellation.py @@ -42,11 +42,9 @@ def test_cancellations(neon_simple_env: NeonEnv): ps_http = ps.http_client() ps_http.is_testing_enabled_or_skip() - env.neon_cli.create_branch("test_config", "empty") - # We don't want to have any racy behaviour with autovacuum IOs ep = env.endpoints.create_start( - "test_config", + "main", config_lines=[ 
"autovacuum = off", "shared_buffers = 128MB", diff --git a/test_runner/regress/test_pg_waldump.py b/test_runner/regress/test_pg_waldump.py index 8e80efd9ba..1990d69b6a 100644 --- a/test_runner/regress/test_pg_waldump.py +++ b/test_runner/regress/test_pg_waldump.py @@ -22,8 +22,8 @@ def check_wal_segment(pg_waldump_path: str, segment_path: str, test_output_dir): def test_pg_waldump(neon_simple_env: NeonEnv, test_output_dir, pg_bin: PgBin): env = neon_simple_env tenant_id = env.initial_tenant - timeline_id = env.neon_cli.create_branch("test_pg_waldump", "empty") - endpoint = env.endpoints.create_start("test_pg_waldump") + timeline_id = env.initial_timeline + endpoint = env.endpoints.create_start("main") cur = endpoint.connect().cursor() cur.execute( diff --git a/test_runner/regress/test_postgres_version.py b/test_runner/regress/test_postgres_version.py index 03e8c7c0df..d8626c15a5 100644 --- a/test_runner/regress/test_postgres_version.py +++ b/test_runner/regress/test_postgres_version.py @@ -20,7 +20,10 @@ def test_postgres_version(base_dir: Path, pg_bin: PgBin, pg_version: PgVersion): output = f.read().strip() # `postgres --version` prints something like "postgres (PostgreSQL) 15.6 (85d809c124a898847a97d66a211f7d5ef4f8e0cb)". - pattern = r"postgres \(PostgreSQL\) (?P<version>\d+\.\d+) \((?P<commit>[0-9a-f]{40})\)" + # beta- and release candidate releases would use '17beta1' and '18rc2' instead of .-separated numbers. 
+ pattern = ( + r"postgres \(PostgreSQL\) (?P<version>\d+(?:beta|rc|\.)\d+) \((?P<commit>[0-9a-f]{40})\)" + ) match = re.search(pattern, output, re.IGNORECASE) assert match is not None, f"Can't parse {output} with {pattern}" @@ -29,7 +32,6 @@ def test_postgres_version(base_dir: Path, pg_bin: PgBin, pg_version: PgVersion): assert ( pg_version.v_prefixed in expected_revisions - ), f"Version `{pg_version.v_prefixed}` doesn't exist in `vendor/revisions.json`, please update it if these changes are intentional" - + ), f"Released PostgreSQL version `{pg_version.v_prefixed}` doesn't exist in `vendor/revisions.json`, please update it if these changes are intentional" msg = f"Unexpected Postgres {pg_version} version: `{output}`, please update `vendor/revisions.json` if these changes are intentional" assert [version, commit] == expected_revisions[pg_version.v_prefixed], msg diff --git a/test_runner/regress/test_read_validation.py b/test_runner/regress/test_read_validation.py index d128c60a99..78798c5abf 100644 --- a/test_runner/regress/test_read_validation.py +++ b/test_runner/regress/test_read_validation.py @@ -15,17 +15,8 @@ extensions = ["pageinspect", "neon_test_utils", "pg_buffercache"] # def test_read_validation(neon_simple_env: NeonEnv): env = neon_simple_env - env.neon_cli.create_branch("test_read_validation", "empty") - - endpoint = env.endpoints.create_start( - "test_read_validation", - # Use protocol version 2, because the code that constructs the V1 messages - # assumes that a primary always wants to read the latest version of a page, - # and therefore doesn't work with the test functions below to read an older - # page version. 
- config_lines=["neon.protocol_version=2"], - ) + endpoint = env.endpoints.create_start("main") with closing(endpoint.connect()) as con: with con.cursor() as c: for e in extensions: @@ -136,18 +127,9 @@ def test_read_validation(neon_simple_env: NeonEnv): def test_read_validation_neg(neon_simple_env: NeonEnv): env = neon_simple_env - env.neon_cli.create_branch("test_read_validation_neg", "empty") - env.pageserver.allowed_errors.append(".*invalid LSN\\(0\\) in request.*") - endpoint = env.endpoints.create_start( - "test_read_validation_neg", - # Use protocol version 2, because the code that constructs the V1 messages - # assumes that a primary always wants to read the latest version of a page, - # and therefore doesn't work with the test functions below to read an older - # page version. - config_lines=["neon.protocol_version=2"], - ) + endpoint = env.endpoints.create_start("main") with closing(endpoint.connect()) as con: with con.cursor() as c: diff --git a/test_runner/regress/test_readonly_node.py b/test_runner/regress/test_readonly_node.py index ba8b91e84d..347fc3a04d 100644 --- a/test_runner/regress/test_readonly_node.py +++ b/test_runner/regress/test_readonly_node.py @@ -1,7 +1,15 @@ +import time + import pytest from fixtures.common_types import Lsn from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv +from fixtures.neon_fixtures import ( + Endpoint, + NeonEnv, + NeonEnvBuilder, + last_flush_lsn_upload, + tenant_get_shards, +) from fixtures.pageserver.utils import wait_for_last_record_lsn from fixtures.utils import query_scalar @@ -14,10 +22,14 @@ from fixtures.utils import query_scalar # def test_readonly_node(neon_simple_env: NeonEnv): env = neon_simple_env - env.neon_cli.create_branch("test_readonly_node", "empty") - endpoint_main = env.endpoints.create_start("test_readonly_node") + endpoint_main = env.endpoints.create_start("main") - env.pageserver.allowed_errors.append(".*basebackup .* failed: invalid basebackup lsn.*") + 
env.pageserver.allowed_errors.extend( + [ + ".*basebackup .* failed: invalid basebackup lsn.*", + ".*page_service.*handle_make_lsn_lease.*.*tried to request a page version that was garbage collected", + ] + ) main_pg_conn = endpoint_main.connect() main_cur = main_pg_conn.cursor() @@ -61,12 +73,12 @@ def test_readonly_node(neon_simple_env: NeonEnv): # Create first read-only node at the point where only 100 rows were inserted endpoint_hundred = env.endpoints.create_start( - branch_name="test_readonly_node", endpoint_id="ep-readonly_node_hundred", lsn=lsn_a + branch_name="main", endpoint_id="ep-readonly_node_hundred", lsn=lsn_a ) # And another at the point where 200100 rows were inserted endpoint_more = env.endpoints.create_start( - branch_name="test_readonly_node", endpoint_id="ep-readonly_node_more", lsn=lsn_b + branch_name="main", endpoint_id="ep-readonly_node_more", lsn=lsn_b ) # On the 'hundred' node, we should see only 100 rows @@ -87,7 +99,7 @@ def test_readonly_node(neon_simple_env: NeonEnv): # Check creating a node at segment boundary endpoint = env.endpoints.create_start( - branch_name="test_readonly_node", + branch_name="main", endpoint_id="ep-branch_segment_boundary", lsn=Lsn("0/3000000"), ) @@ -99,23 +111,116 @@ def test_readonly_node(neon_simple_env: NeonEnv): with pytest.raises(Exception, match="invalid basebackup lsn"): # compute node startup with invalid LSN should fail env.endpoints.create_start( - branch_name="test_readonly_node", + branch_name="main", endpoint_id="ep-readonly_node_preinitdb", lsn=Lsn("0/42"), ) +def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder): + """ + Test static endpoint is protected from GC by acquiring and renewing lsn leases. + """ + + neon_env_builder.num_pageservers = 2 + # GC is manually triggered. 
+ env = neon_env_builder.init_start( + initial_tenant_conf={ + # small checkpointing and compaction targets to ensure we generate many upload operations + "checkpoint_distance": f"{128 * 1024}", + "compaction_threshold": "1", + "compaction_target_size": f"{128 * 1024}", + # no PITR horizon, we specify the horizon when we request on-demand GC + "pitr_interval": "0s", + # disable background compaction and GC. We invoke it manually when we want it to happen. + "gc_period": "0s", + "compaction_period": "0s", + # create image layers eagerly, so that GC can remove some layers + "image_creation_threshold": "1", + "image_layer_creation_check_threshold": "0", + # Short lease length to fit test. + "lsn_lease_length": "3s", + }, + initial_tenant_shard_count=2, + ) + + ROW_COUNT = 500 + + def generate_updates_on_main( + env: NeonEnv, + ep_main: Endpoint, + data: int, + start=1, + end=ROW_COUNT, + ) -> Lsn: + """ + Generates some load on main branch that results in some uploads. + """ + with ep_main.cursor() as cur: + cur.execute( + f"INSERT INTO t0 (v0, v1) SELECT g, '{data}' FROM generate_series({start}, {end}) g ON CONFLICT (v0) DO UPDATE SET v1 = EXCLUDED.v1" + ) + cur.execute("VACUUM t0") + last_flush_lsn = last_flush_lsn_upload( + env, ep_main, env.initial_tenant, env.initial_timeline + ) + return last_flush_lsn + + # Insert some records on main branch + with env.endpoints.create_start("main") as ep_main: + with ep_main.cursor() as cur: + cur.execute("CREATE TABLE t0(v0 int primary key, v1 text)") + lsn = None + for i in range(2): + lsn = generate_updates_on_main(env, ep_main, i) + + with env.endpoints.create_start( + branch_name="main", + endpoint_id="static", + lsn=lsn, + ) as ep_static: + with ep_static.cursor() as cur: + cur.execute("SELECT count(*) FROM t0") + assert cur.fetchone() == (ROW_COUNT,) + + time.sleep(3) + + generate_updates_on_main(env, ep_main, i, end=100) + + # Trigger GC + for shard, ps in tenant_get_shards(env, env.initial_tenant): + client = 
ps.http_client() + gc_result = client.timeline_gc(shard, env.initial_timeline, 0) + log.info(f"{gc_result=}") + + assert ( + gc_result["layers_removed"] == 0 + ), "No layers should be removed, old layers are guarded by leases." + + with ep_static.cursor() as cur: + cur.execute("SELECT count(*) FROM t0") + assert cur.fetchone() == (ROW_COUNT,) + + # Do some update so we can increment latest_gc_cutoff + generate_updates_on_main(env, ep_main, i, end=100) + + # Now trigger GC again, layers should be removed. + time.sleep(4) + for shard, ps in tenant_get_shards(env, env.initial_tenant): + client = ps.http_client() + gc_result = client.timeline_gc(shard, env.initial_timeline, 0) + log.info(f"{gc_result=}") + + assert gc_result["layers_removed"] > 0, "Old layers should be removed after leases expired." + + # Similar test, but with more data, and we force checkpoints def test_timetravel(neon_simple_env: NeonEnv): env = neon_simple_env - pageserver_http_client = env.pageserver.http_client() - env.neon_cli.create_branch("test_timetravel", "empty") - endpoint = env.endpoints.create_start("test_timetravel") - + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline client = env.pageserver.http_client() - - tenant_id = endpoint.safe_psql("show neon.tenant_id")[0][0] - timeline_id = endpoint.safe_psql("show neon.timeline_id")[0][0] + endpoint = env.endpoints.create_start("main") lsns = [] @@ -139,7 +244,7 @@ def test_timetravel(neon_simple_env: NeonEnv): wait_for_last_record_lsn(client, tenant_id, timeline_id, current_lsn) # run checkpoint manually to force a new layer file - pageserver_http_client.timeline_checkpoint(tenant_id, timeline_id) + client.timeline_checkpoint(tenant_id, timeline_id) ##### Restart pageserver env.endpoints.stop_all() @@ -148,7 +253,7 @@ def test_timetravel(neon_simple_env: NeonEnv): for i, lsn in lsns: endpoint_old = env.endpoints.create_start( - branch_name="test_timetravel", endpoint_id=f"ep-old_lsn_{i}", lsn=lsn + branch_name="main", 
endpoint_id=f"ep-old_lsn_{i}", lsn=lsn ) with endpoint_old.cursor() as cur: assert query_scalar(cur, f"select count(*) from testtab where iteration={i}") == 100000 diff --git a/test_runner/regress/test_s3_restore.py b/test_runner/regress/test_s3_restore.py index 9992647e56..c1a80a54bc 100644 --- a/test_runner/regress/test_s3_restore.py +++ b/test_runner/regress/test_s3_restore.py @@ -8,9 +8,9 @@ from fixtures.neon_fixtures import ( PgBin, ) from fixtures.pageserver.utils import ( - MANY_SMALL_LAYERS_TENANT_CONFIG, assert_prefix_empty, enable_remote_storage_versioning, + many_small_layers_tenant_config, wait_for_upload, ) from fixtures.remote_storage import RemoteStorageKind, s3_storage @@ -33,7 +33,7 @@ def test_tenant_s3_restore( # change it back after initdb, recovery doesn't work if the two # index_part.json uploads happen at same second or too close to each other. - initial_tenant_conf = MANY_SMALL_LAYERS_TENANT_CONFIG + initial_tenant_conf = many_small_layers_tenant_config() del initial_tenant_conf["checkpoint_distance"] env = neon_env_builder.init_start(initial_tenant_conf) @@ -50,7 +50,7 @@ def test_tenant_s3_restore( tenant_id = env.initial_tenant # now lets create the small layers - ps_http.set_tenant_config(tenant_id, MANY_SMALL_LAYERS_TENANT_CONFIG) + ps_http.set_tenant_config(tenant_id, many_small_layers_tenant_config()) # Default tenant and the one we created assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1 diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 1011a6fd22..4a84dca399 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -374,7 +374,7 @@ def test_sharding_split_smoke( non_default_tenant_config = {"gc_horizon": 77 * 1024 * 1024} env = neon_env_builder.init_configs(True) - neon_env_builder.start() + env.start() tenant_id = TenantId.generate() timeline_id = TimelineId.generate() env.neon_cli.create_tenant( @@ 
-394,6 +394,7 @@ def test_sharding_split_smoke( # Note which pageservers initially hold a shard after tenant creation pre_split_pageserver_ids = [loc["node_id"] for loc in env.storage_controller.locate(tenant_id)] + log.info("Pre-split pageservers: {pre_split_pageserver_ids}") # For pageservers holding a shard, validate their ingest statistics # reflect a proper splitting of the WAL. @@ -555,9 +556,9 @@ def test_sharding_split_smoke( assert sum(total.values()) == split_shard_count * 2 check_effective_tenant_config() - # More specific check: that we are fully balanced. This is deterministic because - # the order in which we consider shards for optimization is deterministic, and the - # order of preference of nodes is also deterministic (lower node IDs win). + # More specific check: that we are fully balanced. It is deterministic that we will get exactly + # one shard on each pageserver, because for these small shards the utilization metric is + # dominated by shard count. log.info(f"total: {total}") assert total == { 1: 1, @@ -577,8 +578,14 @@ def test_sharding_split_smoke( 15: 1, 16: 1, } + + # The controller is not required to lay out the attached locations in any particular way, but + # all the pageservers that originally held an attached shard should still hold one, otherwise + # it would indicate that we had done some unnecessary migration. log.info(f"attached: {attached}") - assert attached == {1: 1, 2: 1, 3: 1, 5: 1, 6: 1, 7: 1, 9: 1, 11: 1} + for ps_id in pre_split_pageserver_ids: + log.info("Pre-split pageserver {ps_id} should still hold an attached location") + assert ps_id in attached # Ensure post-split pageserver locations survive a restart (i.e. 
the child shards # correctly wrote config to disk, and the storage controller responds correctly @@ -1429,7 +1436,7 @@ def test_sharding_unlogged_relation(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_pageservers = 2 env = neon_env_builder.init_configs() - neon_env_builder.start() + env.start() tenant_id = TenantId.generate() timeline_id = TimelineId.generate() @@ -1468,7 +1475,7 @@ def test_top_tenants(neon_env_builder: NeonEnvBuilder): """ env = neon_env_builder.init_configs() - neon_env_builder.start() + env.start() tenants = [] n_tenants = 8 diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 94d71a7677..dc90a6e9a0 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -7,6 +7,7 @@ from datetime import datetime, timezone from typing import Any, Dict, List, Optional, Set, Tuple, Union import pytest +from fixtures.auth_tokens import TokenScope from fixtures.common_types import TenantId, TenantShardId, TimelineId from fixtures.compute_reconfigure import ComputeReconfigure from fixtures.log_helper import log @@ -18,20 +19,19 @@ from fixtures.neon_fixtures import ( PgBin, StorageControllerApiException, StorageControllerLeadershipStatus, - TokenScope, last_flush_lsn_upload, ) -from fixtures.pageserver.http import PageserverHttpClient +from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient from fixtures.pageserver.utils import ( - MANY_SMALL_LAYERS_TENANT_CONFIG, assert_prefix_empty, assert_prefix_not_empty, enable_remote_storage_versioning, list_prefix, + many_small_layers_tenant_config, remote_storage_delete_key, timeline_delete_wait_completed, ) -from fixtures.pg_version import PgVersion +from fixtures.pg_version import PgVersion, run_only_on_default_postgres from fixtures.port_distributor import PortDistributor from fixtures.remote_storage import RemoteStorageKind, s3_storage from 
fixtures.storage_controller_proxy import StorageControllerProxy @@ -41,6 +41,7 @@ from mypy_boto3_s3.type_defs import ( ObjectTypeDef, ) from pytest_httpserver import HTTPServer +from urllib3 import Retry from werkzeug.wrappers.request import Request from werkzeug.wrappers.response import Response @@ -68,7 +69,7 @@ def test_storage_controller_smoke( env = neon_env_builder.init_configs() # Start services by hand so that we can skip a pageserver (this will start + register later) - env.broker.try_start() + env.broker.start() env.storage_controller.start() env.pageservers[0].start() env.pageservers[1].start() @@ -291,7 +292,7 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up # Start services by hand so that we can skip registration on one of the pageservers env = neon_env_builder.init_configs() - env.broker.try_start() + env.broker.start() env.storage_controller.start() # This is the pageserver where we'll initially create the tenant. Run it in emergency @@ -654,7 +655,7 @@ def test_storage_controller_s3_time_travel_recovery( tenant_id, shard_count=2, shard_stripe_size=8192, - tenant_config=MANY_SMALL_LAYERS_TENANT_CONFIG, + tenant_config=many_small_layers_tenant_config(), ) # Check that the consistency check passes @@ -1551,6 +1552,12 @@ def test_tenant_import(neon_env_builder: NeonEnvBuilder, shard_count, remote_sto literal_shard_count = 1 if shard_count is None else shard_count assert len(describe["shards"]) == literal_shard_count + nodes = env.storage_controller.nodes() + assert len(nodes) == 2 + describe1 = env.storage_controller.node_shards(nodes[0]["id"]) + describe2 = env.storage_controller.node_shards(nodes[1]["id"]) + assert len(describe1["shards"]) + len(describe2["shards"]) == literal_shard_count + # Check the data is still there: this implicitly proves that we recovered generation numbers # properly, for the timeline which was written to after a generation bump. 
for timeline, branch, expect_rows in [ @@ -2041,8 +2048,11 @@ def test_storage_controller_step_down(neon_env_builder: NeonEnvBuilder): # Make a change to the tenant config to trigger a slow reconcile virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True) virtual_ps_http.patch_tenant_config_client_side(tid, {"compaction_threshold": 5}, None) - env.storage_controller.allowed_errors.append( - ".*Accepted configuration update but reconciliation failed.*" + env.storage_controller.allowed_errors.extend( + [ + ".*Accepted configuration update but reconciliation failed.*", + ".*Leader is stepped down instance", + ] ) observed_state = env.storage_controller.step_down() @@ -2065,9 +2075,9 @@ def test_storage_controller_step_down(neon_env_builder: NeonEnvBuilder): assert "compaction_threshold" in ps_tenant_conf.effective_config assert ps_tenant_conf.effective_config["compaction_threshold"] == 5 - # Validate that the storcon is not replying to the usual requests - # once it has stepped down. - with pytest.raises(StorageControllerApiException, match="stepped_down"): + # Validate that the storcon attempts to forward the request, but stops. + # when it realises it is still the current leader. 
+ with pytest.raises(StorageControllerApiException, match="Leader is stepped down instance"): env.storage_controller.tenant_list() # Validate that we can step down multiple times and the observed state @@ -2116,7 +2126,7 @@ def start_env(env: NeonEnv, storage_controller_port: int): max_workers=2 + len(env.pageservers) + len(env.safekeepers) ) as executor: futs.append( - executor.submit(lambda: env.broker.try_start() or None) + executor.submit(lambda: env.broker.start() or None) ) # The `or None` is for the linter for pageserver in env.pageservers: @@ -2214,6 +2224,15 @@ def test_storage_controller_leadership_transfer( env.storage_controller.wait_until_ready() env.storage_controller.consistency_check() + if not step_down_times_out: + # Check that the stepped down instance forwards requests + # to the new leader while it's still running. + storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_1_port}") + env.storage_controller.tenant_list() + env.storage_controller.node_configure(env.pageservers[0].id, {"scheduling": "Pause"}) + status = env.storage_controller.node_status(env.pageservers[0].id) + assert status["scheduling"] == "Pause" + if step_down_times_out: env.storage_controller.allowed_errors.extend( [ @@ -2266,3 +2285,300 @@ def test_storage_controller_ps_restarted_during_drain(neon_env_builder: NeonEnvB # allow for small delay between actually having cancelled and being able reconfigure again wait_until(4, 0.5, reconfigure_node_again) + + +def test_storage_controller_timeline_crud_race(neon_env_builder: NeonEnvBuilder): + """ + The storage controller is meant to handle the case where a timeline CRUD operation races + with a generation-incrementing change to the tenant: this should trigger a retry so that + the operation lands on the highest-generation'd tenant location. 
+ """ + neon_env_builder.num_pageservers = 2 + env = neon_env_builder.init_configs() + env.start() + tenant_id = TenantId.generate() + env.storage_controller.tenant_create(tenant_id) + + # Set up a failpoint so that a timeline creation will be very slow + failpoint = "timeline-creation-after-uninit" + for ps in env.pageservers: + ps.http_client().configure_failpoints((failpoint, "sleep(10000)")) + + # Start a timeline creation in the background + create_timeline_id = TimelineId.generate() + futs = [] + with concurrent.futures.ThreadPoolExecutor( + max_workers=2 + len(env.pageservers) + len(env.safekeepers) + ) as executor: + futs.append( + executor.submit( + env.storage_controller.pageserver_api( + retries=Retry( + status=0, + connect=0, # Disable retries: we want to see the 503 + ) + ).timeline_create, + PgVersion.NOT_SET, + tenant_id, + create_timeline_id, + ) + ) + + def has_hit_failpoint(): + assert any( + ps.log_contains(f"at failpoint {failpoint}") is not None for ps in env.pageservers + ) + + wait_until(10, 1, has_hit_failpoint) + + # Migrate the tenant while the timeline creation is in progress: this migration will complete once it + # can detach from the old pageserver, which will happen once the failpoint completes. 
+ env.storage_controller.tenant_shard_migrate( + TenantShardId(tenant_id, 0, 0), env.pageservers[1].id + ) + + with pytest.raises(PageserverApiException, match="Tenant attachment changed, please retry"): + futs[0].result(timeout=20) + + # Timeline creation should work when there isn't a concurrent migration, even though it's + # slow (our failpoint is still enabled) + env.storage_controller.pageserver_api( + retries=Retry( + status=0, + connect=0, # Disable retries: we want to see the 503 + ) + ).timeline_create(PgVersion.NOT_SET, tenant_id, create_timeline_id) + + +def test_storage_controller_validate_during_migration(neon_env_builder: NeonEnvBuilder): + """ + A correctness edge case: while we are live migrating and a shard's generation is + visible to the Reconciler but not to the central Service, the generation validation + API should still prevent stale generations from doing deletions. + """ + neon_env_builder.num_pageservers = 2 + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + env = neon_env_builder.init_configs() + env.start() + + TENANT_CONF = { + # small checkpointing and compaction targets to ensure we generate many upload operations + "checkpoint_distance": 128 * 1024, + "compaction_threshold": 1, + "compaction_target_size": 128 * 1024, + # disable background compaction and GC. We invoke it manually when we want it to happen. 
+ "gc_period": "0s", + "compaction_period": "0s", + } + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + env.neon_cli.create_tenant(tenant_id, timeline_id) + env.storage_controller.pageserver_api().set_tenant_config(tenant_id, TENANT_CONF) + + # Write enough data that a compaction would do some work (deleting some L0s) + workload = Workload(env, tenant_id, timeline_id) + workload.init() + workload.write_rows(64) + for _i in range(0, 2): + workload.churn_rows(64, upload=False) + + # Upload but don't compact + origin_pageserver = env.get_tenant_pageserver(tenant_id) + dest_ps_id = [p.id for p in env.pageservers if p.id != origin_pageserver.id][0] + origin_pageserver.http_client().timeline_checkpoint( + tenant_id, timeline_id, wait_until_uploaded=True, compact=False + ) + + # Start a compaction that will pause on a failpoint. + compaction_failpoint = "before-upload-index-pausable" + origin_pageserver.http_client().configure_failpoints((compaction_failpoint, "pause")) + + # This failpoint can also cause migration code to time out trying to politely flush + # during migrations + origin_pageserver.allowed_errors.append(".*Timed out waiting for flush to remote storage.*") + + try: + with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor: + compact_fut = executor.submit( + origin_pageserver.http_client().timeline_compact, + tenant_id, + timeline_id, + wait_until_uploaded=True, + ) + + # Let the compaction start and then get stuck uploading an index: when we live migrate, the new generation's + # index will be initialized from the pre-compaction index, referencing layers that the compaction will try to delete + def has_hit_compaction_failpoint(): + assert origin_pageserver.log_contains(f"at failpoint {compaction_failpoint}") + + wait_until(10, 1, has_hit_compaction_failpoint) + + # While the compaction is running, start a live migration which will pause long enough for the compaction to sleep, + # after incrementing generation and 
attaching the new location + migration_failpoint = "reconciler-live-migrate-post-notify" + env.storage_controller.configure_failpoints((migration_failpoint, "pause")) + migrate_fut = executor.submit( + env.storage_controller.tenant_shard_migrate, + TenantShardId(tenant_id, 0, 0), + dest_ps_id, + ) + + def has_hit_migration_failpoint(): + assert env.storage_controller.log_contains(f"at failpoint {migration_failpoint}") + + # Long wait because the migration will have to time out during transition to AttachedStale + # before it reaches this point. The timeout is because the AttachedStale transition includes + # a flush of remote storage, and if the compaction already enqueued an index upload this cannot + # make progress. + wait_until(60, 1, has_hit_migration_failpoint) + + # Origin pageserver has succeeded with compaction before the migration completed. It has done all the writes it wanted to do in its own (stale) generation + origin_pageserver.http_client().configure_failpoints((compaction_failpoint, "off")) + compact_fut.result() + origin_pageserver.http_client().deletion_queue_flush(execute=True) + + # Eventually migration completes + env.storage_controller.configure_failpoints((migration_failpoint, "off")) + migrate_fut.result() + except: + # Always disable 'pause' failpoints, even on failure, to avoid hanging in shutdown + env.storage_controller.configure_failpoints((migration_failpoint, "off")) + origin_pageserver.http_client().configure_failpoints((compaction_failpoint, "off")) + raise + + # Ensure the destination of the migration writes an index, so that if it has corrupt state that is + # visible to the scrubber. + workload.write_rows(1, upload=False) + env.get_pageserver(dest_ps_id).http_client().timeline_checkpoint( + tenant_id, timeline_id, wait_until_uploaded=True, compact=False + ) + + # The destination of the live migration would now have a corrupt index (referencing deleted L0s) if + # the controller had not properly applied validation rules. 
+ healthy, _summary = env.storage_scrubber.scan_metadata() + try: + log.info(f"scrubbed, healthy={healthy}") + assert healthy + except: + # On failures, we want to report them FAIL during the test, not as ERROR during teardown + neon_env_builder.enable_scrub_on_exit = False + raise + + +@run_only_on_default_postgres("this is like a 'unit test' against storcon db") +def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_configs() + env.start() + + fake_id = 5 + + target = env.storage_controller + + assert target.get_safekeeper(fake_id) is None + + body = { + "active": True, + "id": fake_id, + "created_at": "2023-10-25T09:11:25Z", + "updated_at": "2024-08-28T11:32:43Z", + "region_id": "aws-us-east-2", + "host": "safekeeper-333.us-east-2.aws.neon.build", + "port": 6401, + "http_port": 7676, + "version": 5957, + "availability_zone_id": "us-east-2b", + } + + target.on_safekeeper_deploy(fake_id, body) + + inserted = target.get_safekeeper(fake_id) + assert inserted is not None + assert eq_safekeeper_records(body, inserted) + + # error out if pk is changed (unexpected) + with pytest.raises(StorageControllerApiException) as exc: + different_pk = dict(body) + different_pk["id"] = 4 + assert different_pk["id"] != body["id"] + target.on_safekeeper_deploy(fake_id, different_pk) + assert exc.value.status_code == 400 + + inserted_again = target.get_safekeeper(fake_id) + assert inserted_again is not None + assert eq_safekeeper_records(inserted, inserted_again) + + # the most common case, version goes up: + assert isinstance(body["version"], int) + body["version"] += 1 + target.on_safekeeper_deploy(fake_id, body) + inserted_now = target.get_safekeeper(fake_id) + assert inserted_now is not None + + assert eq_safekeeper_records(body, inserted_now) + + +def eq_safekeeper_records(a: dict[str, Any], b: dict[str, Any]) -> bool: + compared = [dict(a), dict(b)] + + masked_keys = ["created_at", "updated_at"] + + for d in compared: + # 
keep deleting these in case we are comparing the body as it will be uploaded by real scripts + for key in masked_keys: + if key in d: + del d[key] + + return compared[0] == compared[1] + + +@run_only_on_default_postgres("this is like a 'unit test' against storcon db") +def test_shard_preferred_azs(neon_env_builder: NeonEnvBuilder): + def assign_az(ps_cfg): + az = f"az-{ps_cfg['id']}" + ps_cfg["availability_zone"] = az + + neon_env_builder.pageserver_config_override = assign_az + neon_env_builder.num_pageservers = 2 + env = neon_env_builder.init_configs() + env.start() + + tids = [TenantId.generate() for _ in range(0, 3)] + for tid in tids: + env.storage_controller.tenant_create(tid) + + shards = env.storage_controller.tenant_describe(tid)["shards"] + assert len(shards) == 1 + attached_to = shards[0]["node_attached"] + expected_az = env.get_pageserver(attached_to).az_id + + assert shards[0]["preferred_az_id"] == expected_az + + updated = env.storage_controller.set_preferred_azs( + {TenantShardId(tid, 0, 0): "foo" for tid in tids} + ) + + assert set(updated) == set([TenantShardId(tid, 0, 0) for tid in tids]) + + for tid in tids: + shards = env.storage_controller.tenant_describe(tid)["shards"] + assert len(shards) == 1 + assert shards[0]["preferred_az_id"] == "foo" + + # Generate a layer to avoid shard split handling on ps from tripping + # up on debug assert. 
+ timeline_id = TimelineId.generate() + env.neon_cli.create_timeline("bar", tids[0], timeline_id) + + workload = Workload(env, tids[0], timeline_id, branch_name="bar") + workload.init() + workload.write_rows(256) + workload.validate() + + env.storage_controller.tenant_shard_split(tids[0], shard_count=2) + shards = env.storage_controller.tenant_describe(tids[0])["shards"] + assert len(shards) == 2 + for shard in shards: + attached_to = shard["node_attached"] + expected_az = env.get_pageserver(attached_to).az_id + assert shard["preferred_az_id"] == expected_az diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py index 2844d1b1d2..848e214c5e 100644 --- a/test_runner/regress/test_storage_scrubber.py +++ b/test_runner/regress/test_storage_scrubber.py @@ -152,6 +152,9 @@ def test_scrubber_physical_gc(neon_env_builder: NeonEnvBuilder, shard_count: Opt # This write includes remote upload, will generate an index in this generation workload.write_rows(1) + # We will use a min_age_secs=1 threshold for deletion, let it pass + time.sleep(2) + # With a high min_age, the scrubber should decline to delete anything gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=3600) assert gc_summary["remote_storage_errors"] == 0 @@ -214,6 +217,13 @@ def test_scrubber_physical_gc_ancestors( workload.init() workload.write_rows(100) + # Issue a deletion queue flush so that the parent shard can't leave behind layers + # that will look like unexpected garbage to the scrubber + for pre_split_shard in env.storage_controller.locate(tenant_id): + env.get_pageserver(pre_split_shard["node_id"]).http_client().deletion_queue_flush( + execute=True + ) + new_shard_count = 4 assert shard_count is None or new_shard_count > shard_count shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=new_shard_count) @@ -318,6 +328,10 @@ def test_scrubber_physical_gc_timeline_deletion(neon_env_builder: NeonEnvBuilder 
workload.write_rows(100, upload=False) workload.stop() + # Issue a deletion queue flush so that the parent shard can't leave behind layers + # that will look like unexpected garbage to the scrubber + env.get_tenant_pageserver(tenant_id).http_client().deletion_queue_flush(execute=True) + new_shard_count = 4 shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=new_shard_count) for shard in shards: diff --git a/test_runner/regress/test_subscriber_restart.py b/test_runner/regress/test_subscriber_restart.py index 4581008022..91caad7220 100644 --- a/test_runner/regress/test_subscriber_restart.py +++ b/test_runner/regress/test_subscriber_restart.py @@ -37,9 +37,7 @@ def test_subscriber_restart(neon_simple_env: NeonEnv): scur.execute("CREATE TABLE t (pk integer primary key, sk integer)") # scur.execute("CREATE INDEX on t(sk)") # slowdown applying WAL at replica pub_conn = f"host=localhost port={pub.pg_port} dbname=postgres user=cloud_admin" - # synchronous_commit=on to test a hypothesis for why this test has been flaky. - # XXX: Add link to the issue - query = f"CREATE SUBSCRIPTION sub CONNECTION '{pub_conn}' PUBLICATION pub with (synchronous_commit=on)" + query = f"CREATE SUBSCRIPTION sub CONNECTION '{pub_conn}' PUBLICATION pub" scur.execute(query) time.sleep(2) # let initial table sync complete diff --git a/test_runner/regress/test_subxacts.py b/test_runner/regress/test_subxacts.py index 10cb00c780..82075bd723 100644 --- a/test_runner/regress/test_subxacts.py +++ b/test_runner/regress/test_subxacts.py @@ -9,8 +9,7 @@ from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content # CLOG. 
def test_subxacts(neon_simple_env: NeonEnv, test_output_dir): env = neon_simple_env - env.neon_cli.create_branch("test_subxacts", "empty") - endpoint = env.endpoints.create_start("test_subxacts") + endpoint = env.endpoints.create_start("main") pg_conn = endpoint.connect() cur = pg_conn.cursor() diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index dadf5ca672..7ee949e8d3 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -1,7 +1,9 @@ +import json from threading import Thread import pytest from fixtures.common_types import Lsn, TenantId, TimelineId +from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, PgBin, @@ -9,14 +11,16 @@ from fixtures.neon_fixtures import ( ) from fixtures.pageserver.http import PageserverApiException from fixtures.pageserver.utils import ( - MANY_SMALL_LAYERS_TENANT_CONFIG, assert_prefix_empty, assert_prefix_not_empty, + many_small_layers_tenant_config, wait_for_upload, ) from fixtures.remote_storage import RemoteStorageKind, s3_storage from fixtures.utils import run_pg_bench_small, wait_until from requests.exceptions import ReadTimeout +from werkzeug.wrappers.request import Request +from werkzeug.wrappers.response import Response def error_tolerant_delete(ps_http, tenant_id): @@ -76,7 +80,7 @@ def test_tenant_delete_smoke( env.neon_cli.create_tenant( tenant_id=tenant_id, - conf=MANY_SMALL_LAYERS_TENANT_CONFIG, + conf=many_small_layers_tenant_config(), ) # Default tenant and the one we created @@ -215,7 +219,7 @@ def test_tenant_delete_races_timeline_creation(neon_env_builder: NeonEnvBuilder) # (and there is no way to reconstruct the used remote storage kind) remote_storage_kind = RemoteStorageKind.MOCK_S3 neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) - env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG) + env = 
neon_env_builder.init_start(initial_tenant_conf=many_small_layers_tenant_config()) ps_http = env.pageserver.http_client() tenant_id = env.initial_tenant @@ -322,7 +326,7 @@ def test_tenant_delete_races_timeline_creation(neon_env_builder: NeonEnvBuilder) env.pageserver.stop() -def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder): +def test_tenant_delete_scrubber(pg_bin: PgBin, make_httpserver, neon_env_builder: NeonEnvBuilder): """ Validate that creating and then deleting the tenant both survives the scrubber, and that one can run the scrubber without problems. @@ -330,7 +334,7 @@ def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder) remote_storage_kind = RemoteStorageKind.MOCK_S3 neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) - env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG) + env = neon_env_builder.init_start(initial_tenant_conf=many_small_layers_tenant_config()) ps_http = env.pageserver.http_client() # create a tenant separate from the main tenant so that we have one remaining @@ -347,6 +351,45 @@ def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder) healthy, _ = env.storage_scrubber.scan_metadata() assert healthy + timeline_lsns = { + "tenant_id": f"{tenant_id}", + "timeline_id": f"{timeline_id}", + "timeline_start_lsn": f"{last_flush_lsn}", + "backup_lsn": f"{last_flush_lsn}", + } + + cloud_admin_url = f"http://{make_httpserver.host}:{make_httpserver.port}/" + cloud_admin_token = "" + + def get_branches(request: Request): + # Compare definition with `BranchData` struct + dummy_data = { + "id": "test-branch-id", + "created_at": "", # TODO + "updated_at": "", # TODO + "name": "testbranchname", + "project_id": "test-project-id", + "timeline_id": f"{timeline_id}", + "default": False, + "deleted": False, + "logical_size": 42000, + "physical_size": 42000, + "written_size": 42000, + } + # This test does all its own compute 
configuration (by passing explicit pageserver ID to Workload functions), + # so we send controller notifications to /dev/null to prevent it fighting the test for control of the compute. + log.info(f"got get_branches request: {request.json}") + return Response(json.dumps(dummy_data), content_type="application/json", status=200) + + make_httpserver.expect_request("/branches", method="GET").respond_with_handler(get_branches) + + healthy, _ = env.storage_scrubber.scan_metadata_safekeeper( + timeline_lsns=[timeline_lsns], + cloud_admin_api_url=cloud_admin_url, + cloud_admin_api_token=cloud_admin_token, + ) + assert healthy + env.start() ps_http = env.pageserver.http_client() ps_http.tenant_delete(tenant_id) @@ -354,3 +397,10 @@ def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder) healthy, _ = env.storage_scrubber.scan_metadata() assert healthy + + healthy, _ = env.storage_scrubber.scan_metadata_safekeeper( + timeline_lsns=[timeline_lsns], + cloud_admin_api_url=cloud_admin_url, + cloud_admin_api_token=cloud_admin_token, + ) + assert healthy diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 0ebf714de0..b63ff7f6bd 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -372,8 +372,10 @@ def test_create_churn_during_restart(neon_env_builder: NeonEnvBuilder): tenant_id: TenantId = env.initial_tenant timeline_id = env.initial_timeline - # Multiple creation requests which race will generate this error + # Multiple creation requests which race will generate this error on the pageserver + # and storage controller respectively env.pageserver.allowed_errors.append(".*Conflict: Tenant is already being modified.*") + env.storage_controller.allowed_errors.append(".*Conflict: Tenant is already being modified.*") # Tenant creation requests which arrive out of order will generate complaints about # generation nubmers out of order. 
diff --git a/test_runner/regress/test_threshold_based_eviction.py b/test_runner/regress/test_threshold_based_eviction.py index 840c7159ad..094dd20529 100644 --- a/test_runner/regress/test_threshold_based_eviction.py +++ b/test_runner/regress/test_threshold_based_eviction.py @@ -106,7 +106,7 @@ def test_threshold_based_eviction( # create a bunch of layers with env.endpoints.create_start("main", tenant_id=tenant_id) as pg: - pg_bin.run(["pgbench", "-i", "-s", "3", pg.connstr()]) + pg_bin.run(["pgbench", "-i", "-I", "dtGvp", "-s", "3", pg.connstr()]) last_flush_lsn_upload(env, pg, tenant_id, timeline_id) # wrap up and shutdown safekeepers so that no more layers will be created after the final checkpoint for sk in env.safekeepers: diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py new file mode 100644 index 0000000000..de43e51c9e --- /dev/null +++ b/test_runner/regress/test_timeline_archive.py @@ -0,0 +1,113 @@ +import pytest +from fixtures.common_types import TenantId, TimelineArchivalState, TimelineId +from fixtures.neon_fixtures import ( + NeonEnvBuilder, +) +from fixtures.pageserver.http import PageserverApiException + + +@pytest.mark.parametrize("shard_count", [0, 4]) +def test_timeline_archive(neon_env_builder: NeonEnvBuilder, shard_count: int): + unsharded = shard_count == 0 + if unsharded: + env = neon_env_builder.init_start() + # If we run the unsharded version, talk to the pageserver directly + ps_http = env.pageserver.http_client() + else: + neon_env_builder.num_pageservers = shard_count + env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) + # If we run the unsharded version, talk to the storage controller + ps_http = env.storage_controller.pageserver_api() + + # first try to archive a non existing timeline for an existing tenant: + invalid_timeline_id = TimelineId.generate() + with pytest.raises(PageserverApiException, match="timeline not found") as exc: + 
ps_http.timeline_archival_config( + env.initial_tenant, + invalid_timeline_id, + state=TimelineArchivalState.ARCHIVED, + ) + + assert exc.value.status_code == 404 + + # for a non existing tenant: + invalid_tenant_id = TenantId.generate() + with pytest.raises( + PageserverApiException, + match="NotFound: [tT]enant", + ) as exc: + ps_http.timeline_archival_config( + invalid_tenant_id, + invalid_timeline_id, + state=TimelineArchivalState.ARCHIVED, + ) + + assert exc.value.status_code == 404 + + # construct a pair of branches to validate that pageserver prohibits + # archival of ancestor timelines when they have non-archived child branches + parent_timeline_id = env.neon_cli.create_branch("test_ancestor_branch_archive_parent") + + leaf_timeline_id = env.neon_cli.create_branch( + "test_ancestor_branch_archive_branch1", "test_ancestor_branch_archive_parent" + ) + + with pytest.raises( + PageserverApiException, + match="Cannot archive timeline which has non-archived child timelines", + ) as exc: + ps_http.timeline_archival_config( + env.initial_tenant, + parent_timeline_id, + state=TimelineArchivalState.ARCHIVED, + ) + + assert exc.value.status_code == 412 + + leaf_detail = ps_http.timeline_detail( + env.initial_tenant, + timeline_id=leaf_timeline_id, + ) + assert leaf_detail["is_archived"] is False + + # Test that archiving the leaf timeline and then the parent works + ps_http.timeline_archival_config( + env.initial_tenant, + leaf_timeline_id, + state=TimelineArchivalState.ARCHIVED, + ) + leaf_detail = ps_http.timeline_detail( + env.initial_tenant, + leaf_timeline_id, + ) + assert leaf_detail["is_archived"] is True + + ps_http.timeline_archival_config( + env.initial_tenant, + parent_timeline_id, + state=TimelineArchivalState.ARCHIVED, + ) + + # Test that the leaf can't be unarchived + with pytest.raises( + PageserverApiException, + match="ancestor is archived", + ) as exc: + ps_http.timeline_archival_config( + env.initial_tenant, + leaf_timeline_id, + 
state=TimelineArchivalState.UNARCHIVED, + ) + + # Unarchive works for the leaf if the parent gets unarchived first + ps_http.timeline_archival_config( + env.initial_tenant, + parent_timeline_id, + state=TimelineArchivalState.UNARCHIVED, + ) + + ps_http.timeline_archival_config( + env.initial_tenant, + leaf_timeline_id, + state=TimelineArchivalState.UNARCHIVED, + ) diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index 6d96dda391..711fcd5016 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -16,9 +16,9 @@ from fixtures.neon_fixtures import ( ) from fixtures.pageserver.http import PageserverApiException from fixtures.pageserver.utils import ( - MANY_SMALL_LAYERS_TENANT_CONFIG, assert_prefix_empty, assert_prefix_not_empty, + many_small_layers_tenant_config, poll_for_remote_storage_iterations, timeline_delete_wait_completed, wait_for_last_record_lsn, @@ -68,10 +68,13 @@ def test_timeline_delete(neon_simple_env: NeonEnv): # construct pair of branches to validate that pageserver prohibits # deletion of ancestor timelines when they have child branches - parent_timeline_id = env.neon_cli.create_branch("test_ancestor_branch_delete_parent", "empty") + parent_timeline_id = env.neon_cli.create_branch( + new_branch_name="test_ancestor_branch_delete_parent", ancestor_branch_name="main" + ) leaf_timeline_id = env.neon_cli.create_branch( - "test_ancestor_branch_delete_branch1", "test_ancestor_branch_delete_parent" + new_branch_name="test_ancestor_branch_delete_branch1", + ancestor_branch_name="test_ancestor_branch_delete_parent", ) timeline_path = env.pageserver.timeline_dir(env.initial_tenant, parent_timeline_id) @@ -782,7 +785,7 @@ def test_timeline_delete_resumed_on_attach( remote_storage_kind = s3_storage() neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) - env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG) 
+ env = neon_env_builder.init_start(initial_tenant_conf=many_small_layers_tenant_config()) tenant_id = env.initial_tenant diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index d152d0f41f..f98b53d966 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -118,6 +118,9 @@ def test_ancestor_detach_branched_from( truncated_layers = 0 elif branchpoint == Branchpoint.AFTER_L0: branch_at = Lsn(last_lsn + 8) + # make sure the branch point is not on a page header + if 0 < (branch_at.lsn_int % 8192) < 40: + branch_at += 40 rows = 8192 # as there is no 8 byte walrecord, nothing should get copied from the straddling layer truncated_layers = 0 diff --git a/test_runner/regress/test_timeline_gc_blocking.py b/test_runner/regress/test_timeline_gc_blocking.py index 24de894687..ddfe9b911f 100644 --- a/test_runner/regress/test_timeline_gc_blocking.py +++ b/test_runner/regress/test_timeline_gc_blocking.py @@ -1,17 +1,32 @@ import time +from concurrent.futures import ThreadPoolExecutor +from dataclasses import dataclass +from typing import List, Optional +import pytest +from fixtures.log_helper import log from fixtures.neon_fixtures import ( + LogCursor, NeonEnvBuilder, + NeonPageserver, ) from fixtures.pageserver.utils import wait_timeline_detail_404 -def test_gc_blocking_by_timeline(neon_env_builder: NeonEnvBuilder): +@pytest.mark.parametrize("sharded", [True, False]) +def test_gc_blocking_by_timeline(neon_env_builder: NeonEnvBuilder, sharded: bool): + neon_env_builder.num_pageservers = 2 if sharded else 1 env = neon_env_builder.init_start( - initial_tenant_conf={"gc_period": "1s", "lsn_lease_length": "0s"} + initial_tenant_conf={"gc_period": "1s", "lsn_lease_length": "0s"}, + initial_tenant_shard_count=2 if sharded else None, ) - ps = env.pageserver - http = ps.http_client() + + if sharded: + http = env.storage_controller.pageserver_api() + 
else: + http = env.pageserver.http_client() + + pss = ManyPageservers(list(map(lambda ps: ScrollableLog(ps, None), env.pageservers))) foo_branch = env.neon_cli.create_branch("foo", "main", env.initial_tenant) @@ -22,9 +37,8 @@ def test_gc_blocking_by_timeline(neon_env_builder: NeonEnvBuilder): tenant_before = http.tenant_status(env.initial_tenant) wait_for_another_gc_round() - _, offset = ps.assert_log_contains(gc_active_line) - - assert ps.log_contains(gc_skipped_line, offset) is None + pss.assert_log_contains(gc_active_line) + pss.assert_log_does_not_contain(gc_skipped_line) http.timeline_block_gc(env.initial_tenant, foo_branch) @@ -34,34 +48,78 @@ def test_gc_blocking_by_timeline(neon_env_builder: NeonEnvBuilder): assert gc_blocking == "BlockingReasons { timelines: 1, reasons: EnumSet(Manual) }" wait_for_another_gc_round() - _, offset = ps.assert_log_contains(gc_skipped_line, offset) + pss.assert_log_contains(gc_skipped_line) - ps.restart() - ps.quiesce_tenants() + pss.restart() + pss.quiesce_tenants() - _, offset = env.pageserver.assert_log_contains(init_gc_skipped, offset) + pss.assert_log_contains(init_gc_skipped) wait_for_another_gc_round() - _, offset = ps.assert_log_contains(gc_skipped_line, offset) + pss.assert_log_contains(gc_skipped_line) # deletion unblocks gc http.timeline_delete(env.initial_tenant, foo_branch) wait_timeline_detail_404(http, env.initial_tenant, foo_branch, 10, 1.0) wait_for_another_gc_round() - _, offset = ps.assert_log_contains(gc_active_line, offset) + pss.assert_log_contains(gc_active_line) http.timeline_block_gc(env.initial_tenant, env.initial_timeline) wait_for_another_gc_round() - _, offset = ps.assert_log_contains(gc_skipped_line, offset) + pss.assert_log_contains(gc_skipped_line) # removing the manual block also unblocks gc http.timeline_unblock_gc(env.initial_tenant, env.initial_timeline) wait_for_another_gc_round() - _, offset = ps.assert_log_contains(gc_active_line, offset) + pss.assert_log_contains(gc_active_line) def 
wait_for_another_gc_round(): time.sleep(2) + + +@dataclass +class ScrollableLog: + pageserver: NeonPageserver + offset: Optional[LogCursor] + + def assert_log_contains(self, what: str): + msg, offset = self.pageserver.assert_log_contains(what, offset=self.offset) + old = self.offset + self.offset = offset + log.info(f"{old} -> {offset}: {msg}") + + def assert_log_does_not_contain(self, what: str): + assert self.pageserver.log_contains(what) is None + + +@dataclass(frozen=True) +class ManyPageservers: + many: List[ScrollableLog] + + def assert_log_contains(self, what: str): + for one in self.many: + one.assert_log_contains(what) + + def assert_log_does_not_contain(self, what: str): + for one in self.many: + one.assert_log_does_not_contain(what) + + def restart(self): + def do_restart(x: ScrollableLog): + x.pageserver.restart() + + with ThreadPoolExecutor(max_workers=len(self.many)) as rt: + rt.map(do_restart, self.many) + rt.shutdown(wait=True) + + def quiesce_tenants(self): + def do_quiesce(x: ScrollableLog): + x.pageserver.quiesce_tenants() + + with ThreadPoolExecutor(max_workers=len(self.many)) as rt: + rt.map(do_quiesce, self.many) + rt.shutdown(wait=True) diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 642b9e449b..f2265dd3d9 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -36,7 +36,7 @@ from fixtures.utils import get_timeline_dir_size, wait_until def test_timeline_size(neon_simple_env: NeonEnv): env = neon_simple_env - new_timeline_id = env.neon_cli.create_branch("test_timeline_size", "empty") + new_timeline_id = env.neon_cli.create_branch("test_timeline_size", "main") client = env.pageserver.http_client() client.timeline_wait_logical_size(env.initial_tenant, new_timeline_id) @@ -68,7 +68,7 @@ def test_timeline_size(neon_simple_env: NeonEnv): def test_timeline_size_createdropdb(neon_simple_env: NeonEnv): env = neon_simple_env - new_timeline_id = 
env.neon_cli.create_branch("test_timeline_size_createdropdb", "empty") + new_timeline_id = env.neon_cli.create_branch("test_timeline_size_createdropdb", "main") client = env.pageserver.http_client() client.timeline_wait_logical_size(env.initial_tenant, new_timeline_id) @@ -733,7 +733,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): # We will run with the limit set to 1, so that once we have one tenant stuck # in a pausable failpoint, the rest are prevented from proceeding through warmup. - neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = '1'" + neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = 1" env = neon_env_builder.init_start() pageserver_http = env.pageserver.http_client() @@ -984,7 +984,7 @@ def test_timeline_logical_size_task_priority(neon_env_builder: NeonEnvBuilder): def test_eager_attach_does_not_queue_up(neon_env_builder: NeonEnvBuilder): - neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = '1'" + neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = 1" env = neon_env_builder.init_start() @@ -1062,7 +1062,7 @@ def test_eager_attach_does_not_queue_up(neon_env_builder: NeonEnvBuilder): @pytest.mark.parametrize("activation_method", ["endpoint", "branch", "delete"]) def test_lazy_attach_activation(neon_env_builder: NeonEnvBuilder, activation_method: str): # env.initial_tenant will take up this permit when attaching with lazy because of a failpoint activated after restart - neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = '1'" + neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = 1" env = neon_env_builder.init_start() diff --git a/test_runner/regress/test_twophase.py b/test_runner/regress/test_twophase.py index dd76689008..ebe65e7c29 100644 --- a/test_runner/regress/test_twophase.py +++ b/test_runner/regress/test_twophase.py @@ -1,15 +1,20 @@ import os +from pathlib import Path +from 
fixtures.common_types import TimelineId from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv, fork_at_current_lsn +from fixtures.neon_fixtures import ( + NeonEnv, + PgBin, + fork_at_current_lsn, + import_timeline_from_vanilla_postgres, +) # # Test branching, when a transaction is in prepared state # -def test_twophase(neon_simple_env: NeonEnv): - env = neon_simple_env - env.neon_cli.create_branch("test_twophase", "empty") +def twophase_test_on_timeline(env: NeonEnv): endpoint = env.endpoints.create_start( "test_twophase", config_lines=["max_prepared_transactions=5"] ) @@ -17,6 +22,11 @@ def test_twophase(neon_simple_env: NeonEnv): conn = endpoint.connect() cur = conn.cursor() + # FIXME: Switch to the next WAL segment, to work around the bug fixed in + # https://github.com/neondatabase/neon/pull/8914. When that is merged, this can be + # removed. + cur.execute("select pg_switch_wal()") + cur.execute("CREATE TABLE foo (t text)") # Prepare a transaction that will insert a row @@ -83,3 +93,50 @@ def test_twophase(neon_simple_env: NeonEnv): # Only one committed insert is visible on the original branch cur.execute("SELECT * FROM foo") assert cur.fetchall() == [("three",)] + + +def test_twophase(neon_simple_env: NeonEnv): + """ + Test branching, when a transaction is in prepared state + """ + env = neon_simple_env + env.neon_cli.create_branch("test_twophase") + + twophase_test_on_timeline(env) + + +def test_twophase_nonzero_epoch( + neon_simple_env: NeonEnv, + test_output_dir: Path, + pg_bin: PgBin, + vanilla_pg, +): + """ + Same as 'test_twophase' test, but with a non-zero XID epoch, i.e. after 4 billion XIDs + have been consumed. (This is to ensure that we correctly use the full 64-bit XIDs in + pg_twophase filenames with PostgreSQL v17.) 
+ """ + env = neon_simple_env + + # Reset the vanilla Postgres instance with a higher XID epoch + pg_resetwal_path = os.path.join(pg_bin.pg_bin_path, "pg_resetwal") + cmd = [pg_resetwal_path, "--epoch=1000000000", "-D", str(vanilla_pg.pgdatadir)] + pg_bin.run_capture(cmd) + + timeline_id = TimelineId.generate() + + # Import the cluster to Neon + vanilla_pg.start() + vanilla_pg.safe_psql("create user cloud_admin with password 'postgres' superuser") + import_timeline_from_vanilla_postgres( + test_output_dir, + env, + pg_bin, + env.initial_tenant, + timeline_id, + "test_twophase", + vanilla_pg.connstr(), + ) + vanilla_pg.stop() # don't need the original server anymore + + twophase_test_on_timeline(env) diff --git a/test_runner/regress/test_unlogged.py b/test_runner/regress/test_unlogged.py index 137d28b9fa..deba29536c 100644 --- a/test_runner/regress/test_unlogged.py +++ b/test_runner/regress/test_unlogged.py @@ -9,8 +9,7 @@ from fixtures.pg_version import PgVersion # def test_unlogged(neon_simple_env: NeonEnv): env = neon_simple_env - env.neon_cli.create_branch("test_unlogged", "empty") - endpoint = env.endpoints.create_start("test_unlogged") + endpoint = env.endpoints.create_start("main") conn = endpoint.connect() cur = conn.cursor() @@ -22,7 +21,7 @@ def test_unlogged(neon_simple_env: NeonEnv): cur.execute("INSERT INTO iut (id) values (42);") # create another compute to fetch inital empty contents from pageserver - fork_at_current_lsn(env, endpoint, "test_unlogged_basebackup", "test_unlogged") + fork_at_current_lsn(env, endpoint, "test_unlogged_basebackup", "main") endpoint2 = env.endpoints.create_start("test_unlogged_basebackup") conn2 = endpoint2.connect() diff --git a/test_runner/regress/test_vm_bits.py b/test_runner/regress/test_vm_bits.py index 7272979c4a..3075211ada 100644 --- a/test_runner/regress/test_vm_bits.py +++ b/test_runner/regress/test_vm_bits.py @@ -13,8 +13,7 @@ from fixtures.utils import query_scalar def test_vm_bit_clear(neon_simple_env: 
NeonEnv): env = neon_simple_env - env.neon_cli.create_branch("test_vm_bit_clear", "empty") - endpoint = env.endpoints.create_start("test_vm_bit_clear") + endpoint = env.endpoints.create_start("main") pg_conn = endpoint.connect() cur = pg_conn.cursor() @@ -58,7 +57,7 @@ def test_vm_bit_clear(neon_simple_env: NeonEnv): cur.execute("UPDATE vmtest_cold_update2 SET id = 5000, filler=repeat('x', 200) WHERE id = 1") # Branch at this point, to test that later - fork_at_current_lsn(env, endpoint, "test_vm_bit_clear_new", "test_vm_bit_clear") + fork_at_current_lsn(env, endpoint, "test_vm_bit_clear_new", "main") # Clear the buffer cache, to force the VM page to be re-fetched from # the page server diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 19df834b81..4bf8cfe88f 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -19,7 +19,6 @@ import psycopg2.errors import psycopg2.extras import pytest import requests -from fixtures.broker import NeonBroker from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.metrics import parse_metrics @@ -72,6 +71,17 @@ def wait_lsn_force_checkpoint( wait_lsn_force_checkpoint_at(lsn, tenant_id, timeline_id, ps, pageserver_conn_options) +def wait_lsn_force_checkpoint_at_sk( + safekeeper: Safekeeper, + tenant_id: TenantId, + timeline_id: TimelineId, + ps: NeonPageserver, + pageserver_conn_options=None, +): + sk_flush_lsn = safekeeper.get_flush_lsn(tenant_id, timeline_id) + wait_lsn_force_checkpoint_at(sk_flush_lsn, tenant_id, timeline_id, ps, pageserver_conn_options) + + def wait_lsn_force_checkpoint_at( lsn: Lsn, tenant_id: TenantId, @@ -79,6 +89,10 @@ def wait_lsn_force_checkpoint_at( ps: NeonPageserver, pageserver_conn_options=None, ): + """ + Wait until pageserver receives given lsn, force checkpoint and wait for + upload, i.e. remote_consistent_lsn advancement. 
+ """ pageserver_conn_options = pageserver_conn_options or {} auth_token = None @@ -1042,6 +1056,24 @@ def test_restart_endpoint(neon_env_builder: NeonEnvBuilder): endpoint.start() +# Try restarting endpoint immediately after xlog switch. +# https://github.com/neondatabase/neon/issues/8911 +def test_restart_endpoint_after_switch_wal(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + + endpoint = env.endpoints.create_start("main") + + endpoint.safe_psql("create table t (i int)") + + endpoint.safe_psql("SELECT pg_switch_wal()") + + # we want immediate shutdown to have endpoint restart on xlog switch record, + # so prevent shutdown checkpoint. + endpoint.stop(mode="immediate") + endpoint = env.endpoints.create_start("main") + endpoint.safe_psql("SELECT 'works'") + + # Context manager which logs passed time on exit. class DurationLogger: def __init__(self, desc): @@ -1406,11 +1438,7 @@ class SafekeeperEnv: ): self.repo_dir = repo_dir self.port_distributor = port_distributor - self.broker = NeonBroker( - logfile=Path(self.repo_dir) / "storage_broker.log", - port=self.port_distributor.get_port(), - neon_binpath=neon_binpath, - ) + self.fake_broker_endpoint = f"http://127.0.0.1:{port_distributor.get_port()}" self.pg_bin = pg_bin self.num_safekeepers = num_safekeepers self.bin_safekeeper = str(neon_binpath / "safekeeper") @@ -1459,7 +1487,7 @@ class SafekeeperEnv: "--id", str(i), "--broker-endpoint", - self.broker.client_url(), + self.fake_broker_endpoint, ] log.info(f'Running command "{" ".join(cmd)}"') @@ -2161,6 +2189,43 @@ def test_patch_control_file(neon_env_builder: NeonEnvBuilder): assert res["timelines"][0]["control_file"]["timeline_start_lsn"] == "0/1" +def test_term_bump(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 1 + env = neon_env_builder.init_start() + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + endpoint = env.endpoints.create_start("main") + # initialize safekeeper + 
endpoint.safe_psql("create table t(key int, value text)") + + http_cli = env.safekeepers[0].http_client() + + # check that bump up to specific term works + curr_term = http_cli.timeline_status(tenant_id, timeline_id).term + bump_to = curr_term + 3 + res = http_cli.term_bump(tenant_id, timeline_id, bump_to) + log.info(f"bump to {bump_to} res: {res}") + assert res.current_term >= bump_to + + # check that bump to none increments current term + res = http_cli.term_bump(tenant_id, timeline_id, None) + log.info(f"bump to None res: {res}") + assert res.current_term > bump_to + assert res.current_term > res.previous_term + + # check that bumping doesn't work downward + res = http_cli.term_bump(tenant_id, timeline_id, 2) + log.info(f"bump to 2 res: {res}") + assert res.current_term > bump_to + assert res.current_term == res.previous_term + + # check that this doesn't kill endpoint because last WAL flush was his and + # thus its basebackup is still good + endpoint.safe_psql("insert into t values (1, 'payload')") + + # Test disables periodic pushes from safekeeper to the broker and checks that # pageserver can still discover safekeepers with discovery requests. def test_broker_discovery(neon_env_builder: NeonEnvBuilder): @@ -2330,6 +2395,77 @@ def test_s3_eviction( assert event_metrics_seen +# Test resetting uploaded partial segment state. +def test_backup_partial_reset(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 1 + neon_env_builder.enable_safekeeper_remote_storage(default_remote_storage()) + # We want to upload/evict quickly, but not too quickly to check that s3 is + # empty before next round of upload happens. + # Note: this test fails with --delete-offloaded-wal, this is expected. 
+ neon_env_builder.safekeeper_extra_opts = [ + "--enable-offload", + "--partial-backup-timeout", + "1s", + "--control-file-save-interval", + "1s", + "--eviction-min-resident=1s", + ] + # XXX: pageserver currently connects to safekeeper as long as connection + # manager doesn't remove its entry (default lagging_wal_timeout is 10s), + # causing uneviction. It should be fixed to not reconnect if last + # remote_consistent_lsn is communicated and there is nothing to fetch. Make + # value lower to speed up the test. + initial_tenant_conf = { + "lagging_wal_timeout": "1s", + } + env = neon_env_builder.init_start(initial_tenant_conf) + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + endpoint = env.endpoints.create("main") + endpoint.start() + endpoint.safe_psql("create table t(key int, value text)") + endpoint.stop() + sk = env.safekeepers[0] + # eviction won't happen until remote_consistent_lsn catches up. + wait_lsn_force_checkpoint_at_sk(sk, tenant_id, timeline_id, env.pageserver) + + http_cli = env.safekeepers[0].http_client() + + # wait until eviction happens + def evicted(): + eviction_state = http_cli.get_eviction_state(timeline_id) + log.info(f"eviction_state: {eviction_state}") + if isinstance(eviction_state, str) and eviction_state == "Present": + raise Exception("eviction didn't happen yet") + + wait_until(30, 1, evicted) + # it must have uploaded something + uploaded_segs = sk.list_uploaded_segments(tenant_id, timeline_id) + log.info(f"uploaded segments before reset: {uploaded_segs}") + assert len(uploaded_segs) > 0 + + reset_res = http_cli.backup_partial_reset(tenant_id, timeline_id) + log.info(f"reset res: {reset_res}") + + # Backup_partial_reset must have reset the state and dropped s3 segment. + # + # Note: if listing takes more than --partial-backup-timeout test becomes + # flaky because file might be reuploaded. With local fs it shouldn't be an + # issue, but can add retry if this appears. 
+ uploaded_segs = sk.list_uploaded_segments(tenant_id, timeline_id) + log.info(f"uploaded segments after reset: {uploaded_segs}") + assert len(uploaded_segs) == 0 + + # calling second time should be ok + http_cli.backup_partial_reset(tenant_id, timeline_id) + + # inserting data should be ok + endpoint.start() + endpoint.safe_psql("insert into t values(1, 'hehe')") + + def test_pull_timeline_partial_segment_integrity(neon_env_builder: NeonEnvBuilder): """ Verify that pulling timeline from a SK with an uploaded partial segment @@ -2357,7 +2493,16 @@ def test_pull_timeline_partial_segment_integrity(neon_env_builder: NeonEnvBuilde "--eviction-min-resident=500ms", ] - env = neon_env_builder.init_start(initial_tenant_conf={"checkpoint_timeout": "100ms"}) + # XXX: pageserver currently connects to safekeeper as long as connection + # manager doesn't remove its entry (default lagging_wal_timeout is 10s), + # causing uneviction. It should be fixed to not reconnect if last + # remote_consistent_lsn is communicated and there is nothing to fetch. Until + # this is fixed make value lower to speed up the test. 
+ initial_tenant_conf = { + "lagging_wal_timeout": "1s", + "checkpoint_timeout": "100ms", + } + env = neon_env_builder.init_start(initial_tenant_conf=initial_tenant_conf) tenant_id = env.initial_tenant timeline_id = env.initial_timeline @@ -2421,7 +2566,7 @@ def test_pull_timeline_partial_segment_integrity(neon_env_builder: NeonEnvBuilde endpoint.start(safekeepers=[2, 3]) def new_partial_segment_uploaded(): - segs = src_sk.list_uploaded_segments(tenant_id, timeline_id) + segs = dst_sk.list_uploaded_segments(tenant_id, timeline_id) for seg in segs: if "partial" in seg and "sk3" in seg: return seg diff --git a/test_runner/regress/test_wal_receiver.py b/test_runner/regress/test_wal_receiver.py index 6582b34218..229d3efd8e 100644 --- a/test_runner/regress/test_wal_receiver.py +++ b/test_runner/regress/test_wal_receiver.py @@ -62,6 +62,12 @@ def test_pageserver_lsn_wait_error_safekeeper_stop(neon_env_builder: NeonEnvBuil elements_to_insert = 1_000_000 expected_timeout_error = f"Timed out while waiting for WAL record at LSN {future_lsn} to arrive" env.pageserver.allowed_errors.append(f".*{expected_timeout_error}.*") + # we configure wait_lsn_timeout to a shorter value than the lagging_wal_timeout / walreceiver_connect_timeout + # => after we run into a timeout and reconnect to a different SK, more time than wait_lsn_timeout has passed + # ==> we log this error + env.pageserver.allowed_errors.append( + ".*ingesting record with timestamp lagging more than wait_lsn_timeout.*" + ) insert_test_elements(env, tenant_id, start=0, count=elements_to_insert) diff --git a/test_runner/test_broken.py b/test_runner/test_broken.py index 7e8aef5a5f..d710b53528 100644 --- a/test_runner/test_broken.py +++ b/test_runner/test_broken.py @@ -23,8 +23,7 @@ run_broken = pytest.mark.skipif( def test_broken(neon_simple_env: NeonEnv, pg_bin): env = neon_simple_env - env.neon_cli.create_branch("test_broken", "empty") - env.endpoints.create_start("test_broken") + env.endpoints.create_start("main") 
log.info("postgres is running") log.info("THIS NEXT COMMAND WILL FAIL:") diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 3fd7a45f8a..a317b9b5b9 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 3fd7a45f8aae85c080df6329e3c85887b7f3a737 +Subproject commit a317b9b5b96978b49e78986697f3dd80d06f99a7 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 46b4b235f3..6f6d77fb59 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 46b4b235f38413ab5974bb22c022f9b829257674 +Subproject commit 6f6d77fb5960602fcd3fd130aca9f99ecb1619c9 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 47a9122a5a..0baa7346df 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 47a9122a5a150a3217fafd3f3d4fe8e020ea718a +Subproject commit 0baa7346dfd42d61912eeca554c9bb0a190f0a1e diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 new file mode 160000 index 0000000000..9156d63ce2 --- /dev/null +++ b/vendor/postgres-v17 @@ -0,0 +1 @@ +Subproject commit 9156d63ce253bed9d1f76355ceec610e444eaffa diff --git a/vendor/revisions.json b/vendor/revisions.json index 6e3e489b5d..c2c34962bb 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,14 +1,18 @@ { + "v17": [ + "17rc1", + "9156d63ce253bed9d1f76355ceec610e444eaffa" + ], "v16": [ - "16.3", - "47a9122a5a150a3217fafd3f3d4fe8e020ea718a" + "16.4", + "0baa7346dfd42d61912eeca554c9bb0a190f0a1e" ], "v15": [ - "15.7", - "46b4b235f38413ab5974bb22c022f9b829257674" + "15.8", + "6f6d77fb5960602fcd3fd130aca9f99ecb1619c9" ], "v14": [ - "14.12", - "3fd7a45f8aae85c080df6329e3c85887b7f3a737" + "14.13", + "a317b9b5b96978b49e78986697f3dd80d06f99a7" ] } diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index 622004b931..c94f95f447 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -326,15 +326,13 @@ files: SELECT checkpoints_timed FROM pg_stat_bgwriter; - metric_name: compute_logical_snapshot_files - type: 
guage + type: gauge help: 'Number of snapshot files in pg_logical/snapshot' key_labels: - - tenant_id - timeline_id values: [num_logical_snapshot_files] query: | SELECT - (SELECT setting FROM pg_settings WHERE name = 'neon.tenant_id') AS tenant_id, (SELECT setting FROM pg_settings WHERE name = 'neon.timeline_id') AS timeline_id, -- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp. These -- temporary snapshot files are renamed to the actual snapshot files after they are @@ -356,6 +354,17 @@ files: from pg_replication_slots where slot_type = 'logical'; + - metric_name: compute_subscriptions_count + type: gauge + help: 'Number of logical replication subscriptions grouped by enabled/disabled' + key_labels: + - enabled + values: [subscriptions_count] + query: | + select subenabled::text as enabled, count(*) as subscriptions_count + from pg_subscription + group by subenabled; + - metric_name: retained_wal type: gauge help: 'Retained WAL in inactive replication slots' diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 20693ad63d..662916d42c 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -8,6 +8,8 @@ version = "0.1.0" description = "workspace-hack package, managed by hakari" # You can choose to publish this crate: see https://docs.rs/cargo-hakari/latest/cargo_hakari/publishing. publish = false +edition.workspace = true +license.workspace = true # The parts of the file between the BEGIN HAKARI SECTION and END HAKARI SECTION comments # are managed by hakari. 
@@ -36,6 +38,7 @@ deranged = { version = "0.3", default-features = false, features = ["powerfmt", digest = { version = "0.10", features = ["mac", "oid", "std"] } either = { version = "1" } fail = { version = "0.5", default-features = false, features = ["failpoints"] } +futures = { version = "0.3" } futures-channel = { version = "0.3", features = ["sink"] } futures-executor = { version = "0.3" } futures-io = { version = "0.3" } @@ -47,7 +50,8 @@ hex = { version = "0.4", features = ["serde"] } hmac = { version = "0.12", default-features = false, features = ["reset"] } hyper = { version = "0.14", features = ["full"] } indexmap = { version = "1", default-features = false, features = ["std"] } -itertools = { version = "0.10" } +itertools-5ef9efb8ec2df382 = { package = "itertools", version = "0.12", default-features = false, features = ["use_std"] } +itertools-93f6ce9d446188ac = { package = "itertools", version = "0.10" } lazy_static = { version = "1", default-features = false, features = ["spin_no_std"] } libc = { version = "0.2", features = ["extra_traits", "use_std"] } log = { version = "0.4", default-features = false, features = ["std"] } @@ -57,7 +61,7 @@ num-bigint = { version = "0.4" } num-integer = { version = "0.1", features = ["i128"] } num-traits = { version = "0.2", features = ["i128", "libm"] } once_cell = { version = "1" } -parquet = { git = "https://github.com/apache/arrow-rs", branch = "master", default-features = false, features = ["zstd"] } +parquet = { version = "53", default-features = false, features = ["zstd"] } prost = { version = "0.11" } rand = { version = "0.8", features = ["small_rng"] } regex = { version = "1" } @@ -80,10 +84,13 @@ time = { version = "0.3", features = ["macros", "serde-well-known"] } tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "test-util"] } tokio-rustls = { version = "0.24" } tokio-util = { version = "0.7", features = ["codec", "compat", "io", "rt"] 
} +toml_edit = { version = "0.22", features = ["serde"] } tonic = { version = "0.9", features = ["tls-roots"] } tower = { version = "0.4", default-features = false, features = ["balance", "buffer", "limit", "log", "timeout", "util"] } tracing = { version = "0.1", features = ["log"] } tracing-core = { version = "0.1" } +tracing-log = { version = "0.1", default-features = false, features = ["log-tracer", "std"] } +tracing-subscriber = { version = "0.3", default-features = false, features = ["env-filter", "fmt", "json", "smallvec", "tracing-log"] } url = { version = "2", features = ["serde"] } uuid = { version = "1", features = ["serde", "v4", "v7"] } zeroize = { version = "1", features = ["derive", "serde"] } @@ -101,7 +108,8 @@ either = { version = "1" } getrandom = { version = "0.2", default-features = false, features = ["std"] } hashbrown = { version = "0.14", features = ["raw"] } indexmap = { version = "1", default-features = false, features = ["std"] } -itertools = { version = "0.10" } +itertools-5ef9efb8ec2df382 = { package = "itertools", version = "0.12", default-features = false, features = ["use_std"] } +itertools-93f6ce9d446188ac = { package = "itertools", version = "0.10" } lazy_static = { version = "1", default-features = false, features = ["spin_no_std"] } libc = { version = "0.2", features = ["extra_traits", "use_std"] } log = { version = "0.4", default-features = false, features = ["std"] } @@ -111,7 +119,7 @@ num-bigint = { version = "0.4" } num-integer = { version = "0.1", features = ["i128"] } num-traits = { version = "0.2", features = ["i128", "libm"] } once_cell = { version = "1" } -parquet = { git = "https://github.com/apache/arrow-rs", branch = "master", default-features = false, features = ["zstd"] } +parquet = { version = "53", default-features = false, features = ["zstd"] } proc-macro2 = { version = "1" } prost = { version = "0.11" } quote = { version = "1" } @@ -122,6 +130,7 @@ serde = { version = "1", features = ["alloc", "derive"] } 
syn-dff4ba8e3ae991db = { package = "syn", version = "1", features = ["extra-traits", "full", "visit"] } syn-f595c2ba2a3f28df = { package = "syn", version = "2", features = ["extra-traits", "fold", "full", "visit", "visit-mut"] } time-macros = { version = "0.2", default-features = false, features = ["formatting", "parsing", "serde"] } +toml_edit = { version = "0.22", features = ["serde"] } zstd = { version = "0.13" } zstd-safe = { version = "7", default-features = false, features = ["arrays", "legacy", "std", "zdict_builder"] } zstd-sys = { version = "2", default-features = false, features = ["legacy", "std", "zdict_builder"] }