diff --git a/.dockerignore b/.dockerignore index 7ead48db7c..9fafc2e4ba 100644 --- a/.dockerignore +++ b/.dockerignore @@ -14,6 +14,7 @@ !compute/ !compute_tools/ !control_plane/ +!docker-compose/ext-src !libs/ !pageserver/ !pgxn/ diff --git a/.github/actions/allure-report-generate/action.yml b/.github/actions/allure-report-generate/action.yml index d07e3e32e8..b85ca7874d 100644 --- a/.github/actions/allure-report-generate/action.yml +++ b/.github/actions/allure-report-generate/action.yml @@ -38,9 +38,11 @@ runs: # - name: Set variables shell: bash -euxo pipefail {0} + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BUCKET: neon-github-public-dev run: | - PR_NUMBER=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true) - if [ "${PR_NUMBER}" != "null" ]; then + if [ -n "${PR_NUMBER}" ]; then BRANCH_OR_PR=pr-${PR_NUMBER} elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ] || \ [ "${GITHUB_REF_NAME}" = "release-proxy" ] || [ "${GITHUB_REF_NAME}" = "release-compute" ]; then @@ -59,8 +61,6 @@ runs: echo "LOCK_FILE=${LOCK_FILE}" >> $GITHUB_ENV echo "WORKDIR=${WORKDIR}" >> $GITHUB_ENV echo "BUCKET=${BUCKET}" >> $GITHUB_ENV - env: - BUCKET: neon-github-public-dev # TODO: We can replace with a special docker image with Java and Allure pre-installed - uses: actions/setup-java@v4 @@ -80,8 +80,8 @@ runs: rm -f ${ALLURE_ZIP} fi env: - ALLURE_VERSION: 2.27.0 - ALLURE_ZIP_SHA256: b071858fb2fa542c65d8f152c5c40d26267b2dfb74df1f1608a589ecca38e777 + ALLURE_VERSION: 2.32.2 + ALLURE_ZIP_SHA256: 3f28885e2118f6317c92f667eaddcc6491400af1fb9773c1f3797a5fa5174953 - uses: aws-actions/configure-aws-credentials@v4 if: ${{ !cancelled() }} diff --git a/.github/actions/allure-report-store/action.yml b/.github/actions/allure-report-store/action.yml index 8548a886cf..687bfd49af 100644 --- a/.github/actions/allure-report-store/action.yml +++ b/.github/actions/allure-report-store/action.yml @@ -18,9 +18,11 @@ runs: steps: - name: Set variables shell: bash -euxo pipefail {0} + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + REPORT_DIR: ${{ inputs.report-dir }} run: | - PR_NUMBER=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true) - if [ "${PR_NUMBER}" != "null" ]; then + if [ -n "${PR_NUMBER}" ]; then BRANCH_OR_PR=pr-${PR_NUMBER} elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ] || \ [ "${GITHUB_REF_NAME}" = "release-proxy" ] || [ "${GITHUB_REF_NAME}" = "release-compute" ]; then @@ -32,8 +34,6 @@ runs: echo "BRANCH_OR_PR=${BRANCH_OR_PR}" >> $GITHUB_ENV echo "REPORT_DIR=${REPORT_DIR}" >> $GITHUB_ENV - env: - REPORT_DIR: ${{ inputs.report-dir }} - uses: aws-actions/configure-aws-credentials@v4 if: ${{ !cancelled() }} diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 0eddfe5da6..122fe48b68 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -236,5 +236,5 @@ runs: uses: ./.github/actions/allure-report-store with: report-dir: /tmp/test_output/allure/results - unique-key: ${{ inputs.build_type }}-${{ inputs.pg_version }} + unique-key: ${{ inputs.build_type }}-${{ inputs.pg_version }}-${{ runner.arch }} aws-oicd-role-arn: ${{ inputs.aws-oicd-role-arn }} diff --git a/scripts/generate_image_maps.py b/.github/scripts/generate_image_maps.py similarity index 98% rename from scripts/generate_image_maps.py rename to .github/scripts/generate_image_maps.py index 915eb33673..39ece5b38f 100644 --- 
a/scripts/generate_image_maps.py +++ b/.github/scripts/generate_image_maps.py @@ -27,6 +27,7 @@ components = { registries = { "dev": [ "docker.io/neondatabase", + "ghcr.io/neondatabase", f"{dev_aws}.dkr.ecr.{aws_region}.amazonaws.com", f"{dev_acr}.azurecr.io/neondatabase", ], diff --git a/.github/scripts/previous-releases.jq b/.github/scripts/previous-releases.jq new file mode 100644 index 0000000000..b0b00bce18 --- /dev/null +++ b/.github/scripts/previous-releases.jq @@ -0,0 +1,25 @@ +# Expects response from https://docs.github.com/en/rest/releases/releases?apiVersion=2022-11-28#list-releases as input, +# with tag names `release` for storage, `release-compute` for compute and `release-proxy` for proxy releases. +# Extract only the `tag_name` field from each release object +[ .[].tag_name ] + +# Transform each tag name into a structured object using regex capture +| reduce map( + capture("^(?<full>release(-(?<component>proxy|compute))?-(?<version>\\d+))$") + | { + component: (.component // "storage"), # Default to "storage" if no component is specified + version: (.version | tonumber), # Convert the version number to an integer + full: .full # Store the full tag name for final output + } + )[] as $entry # Loop over the transformed list + +# Accumulate the latest (highest-numbered) version for each component +({}; + .[$entry.component] |= (if . == null or $entry.version > .version then $entry else . end)) + +# Convert the resulting object into an array of formatted strings +| to_entries +| map("\(.key)=\(.value.full)") + +# Output each string separately +| .[] diff --git a/scripts/push_with_image_map.py b/.github/scripts/push_with_image_map.py similarity index 100% rename from scripts/push_with_image_map.py rename to .github/scripts/push_with_image_map.py diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index 3740e6dc9c..30fde127b0 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -337,7 +337,7 @@ jobs: - name: Pytest regression tests continue-on-error: ${{ matrix.lfc_state == 'with-lfc' && inputs.build-type == 'debug' }} uses: ./.github/actions/run-python-test-set - timeout-minutes: ${{ inputs.sanitizers != 'enabled' && 60 || 180 }} + timeout-minutes: ${{ inputs.sanitizers != 'enabled' && 75 || 180 }} with: build_type: ${{ inputs.build-type }} test_selection: regress diff --git a/.github/workflows/_meta.yml b/.github/workflows/_meta.yml new file mode 100644 index 0000000000..ac09a51ae9 --- /dev/null +++ b/.github/workflows/_meta.yml @@ -0,0 +1,103 @@ +name: Generate run metadata +on: + workflow_call: + inputs: + github-event-name: + type: string + required: true + outputs: + build-tag: + description: "Tag for the current workflow run" + value: ${{ jobs.tags.outputs.build-tag }} + previous-storage-release: + description: "Tag of the last storage release" + value: ${{ jobs.tags.outputs.storage }} + previous-proxy-release: + description: "Tag of the last proxy release" + value: ${{ jobs.tags.outputs.proxy }} + previous-compute-release: + description: "Tag of the last compute release" + value: ${{ jobs.tags.outputs.compute }} + run-kind: + description: "The kind of run we're currently in.
Will be one of `pr-main`, `push-main`, `storage-rc`, `storage-release`, `proxy-rc`, `proxy-release`, `compute-rc`, `compute-release` or `merge_queue`" + value: ${{ jobs.tags.outputs.run-kind }} + +permissions: {} + +jobs: + tags: + runs-on: ubuntu-22.04 + outputs: + build-tag: ${{ steps.build-tag.outputs.tag }} + compute: ${{ steps.previous-releases.outputs.compute }} + proxy: ${{ steps.previous-releases.outputs.proxy }} + storage: ${{ steps.previous-releases.outputs.storage }} + run-kind: ${{ steps.run-kind.outputs.run-kind }} + permissions: + contents: read + steps: + # Need `fetch-depth: 0` to count the number of commits in the branch + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Get run kind + id: run-kind + env: + RUN_KIND: >- + ${{ + false + || (inputs.github-event-name == 'push' && github.ref_name == 'main') && 'push-main' + || (inputs.github-event-name == 'push' && github.ref_name == 'release') && 'storage-release' + || (inputs.github-event-name == 'push' && github.ref_name == 'release-compute') && 'compute-release' + || (inputs.github-event-name == 'push' && github.ref_name == 'release-proxy') && 'proxy-release' + || (inputs.github-event-name == 'pull_request' && github.base_ref == 'main') && 'pr-main' + || (inputs.github-event-name == 'pull_request' && github.base_ref == 'release') && 'storage-rc-pr' + || (inputs.github-event-name == 'pull_request' && github.base_ref == 'release-compute') && 'compute-rc-pr' + || (inputs.github-event-name == 'pull_request' && github.base_ref == 'release-proxy') && 'proxy-rc-pr' + || 'unknown' + }} + run: | + echo "run-kind=$RUN_KIND" | tee -a $GITHUB_OUTPUT + + - name: Get build tag + id: build-tag + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + CURRENT_BRANCH: ${{ github.head_ref || github.ref_name }} + CURRENT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} + RUN_KIND: ${{ steps.run-kind.outputs.run-kind }} + run: | + case $RUN_KIND in + push-main) + echo "tag=$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT + ;; + storage-release) + echo "tag=release-$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT + ;; + proxy-release) + echo "tag=release-proxy-$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT + ;; + compute-release) + echo "tag=release-compute-$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT + ;; + pr-main|storage-rc-pr|compute-rc-pr|proxy-rc-pr) + BUILD_AND_TEST_RUN_ID=$(gh run list -b $CURRENT_BRANCH -c $CURRENT_SHA -w 'Build and Test' -L 1 --json databaseId --jq '.[].databaseId') + echo "tag=$BUILD_AND_TEST_RUN_ID" | tee -a $GITHUB_OUTPUT + ;; + *) + echo "Unexpected RUN_KIND ('${RUN_KIND}'), failing to assign build-tag!" + exit 1 + esac + + - name: Get the previous release-tags + id: previous-releases + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + gh api --paginate \ + -H "Accept: application/vnd.github+json" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + "/repos/${GITHUB_REPOSITORY}/releases" \ + | jq -f .github/scripts/previous-releases.jq -r \ + | tee -a "${GITHUB_OUTPUT}" diff --git a/.github/workflows/_push-to-container-registry.yml b/.github/workflows/_push-to-container-registry.yml index c938f62ad5..2dab665f40 100644 --- a/.github/workflows/_push-to-container-registry.yml +++ b/.github/workflows/_push-to-container-registry.yml @@ -11,8 +11,12 @@ on: description: AWS region to log in to. Required when pushing to ECR. required: false type: string - aws-account-ids: - description: Comma separated AWS account IDs to log in to for pushing to ECR. 
Required when pushing to ECR. + aws-account-id: + description: AWS account ID to log in to for pushing to ECR. Required when pushing to ECR. + required: false + type: string + aws-role-to-assume: + description: AWS role to assume for pushing to ECR. Required when pushing to ECR. required: false type: string azure-client-id: @@ -31,16 +35,6 @@ on: description: ACR registry name. Required when pushing to ACR. required: false type: string - secrets: - docker-hub-username: - description: Docker Hub username. Required when pushing to Docker Hub. - required: false - docker-hub-password: - description: Docker Hub password. Required when pushing to Docker Hub. - required: false - aws-role-to-assume: - description: AWS role to assume. Required when pushing to ECR. - required: false permissions: {} @@ -53,10 +47,11 @@ jobs: runs-on: ubuntu-22.04 permissions: id-token: write # Required for aws/azure login + packages: write # required for pushing to GHCR steps: - uses: actions/checkout@v4 with: - sparse-checkout: scripts/push_with_image_map.py + sparse-checkout: .github/scripts/push_with_image_map.py sparse-checkout-cone-mode: false - name: Print image-map @@ -67,14 +62,14 @@ jobs: uses: aws-actions/configure-aws-credentials@v4 with: aws-region: "${{ inputs.aws-region }}" - role-to-assume: "${{ secrets.aws-role-to-assume }}" + role-to-assume: "arn:aws:iam::${{ inputs.aws-account-id }}:role/${{ inputs.aws-role-to-assume }}" role-duration-seconds: 3600 - name: Login to ECR if: contains(inputs.image-map, 'amazonaws.com/') uses: aws-actions/amazon-ecr-login@v2 with: - registries: "${{ inputs.aws-account-ids }}" + registries: "${{ inputs.aws-account-id }}" - name: Configure Azure credentials if: contains(inputs.image-map, 'azurecr.io/') @@ -89,13 +84,21 @@ jobs: run: | az acr login --name=${{ inputs.acr-registry-name }} + - name: Login to GHCR + if: contains(inputs.image-map, 'ghcr.io/') + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} + - name: Log in to Docker Hub uses: docker/login-action@v3 with: - username: ${{ secrets.docker-hub-username }} - password: ${{ secrets.docker-hub-password }} + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - name: Copy docker images to target registries - run: python scripts/push_with_image_map.py + run: python3 .github/scripts/push_with_image_map.py env: IMAGE_MAP: ${{ inputs.image-map }} diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index b36ac46f35..dd4941bb21 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -398,6 +398,9 @@ jobs: runs-on: ${{ matrix.runner }} container: image: ${{ matrix.image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} options: --init # Increase timeout to 8h, default timeout is 6h diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 8f3392ceea..1fd4f292e8 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -65,38 +65,11 @@ jobs: token: ${{ secrets.GITHUB_TOKEN }} filters: .github/file-filters.yaml - tag: + meta: needs: [ check-permissions ] - runs-on: [ self-hosted, small ] - container: ${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/base:pinned - outputs: - build-tag: ${{steps.build-tag.outputs.tag}} - - steps: - # Need
`fetch-depth: 0` to count the number of commits in the branch - - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Get build tag - run: | - echo run:$GITHUB_RUN_ID - echo ref:$GITHUB_REF_NAME - echo rev:$(git rev-list --count HEAD) - if [[ "$GITHUB_REF_NAME" == "main" ]]; then - echo "tag=$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT - elif [[ "$GITHUB_REF_NAME" == "release" ]]; then - echo "tag=release-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT - elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then - echo "tag=release-proxy-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT - elif [[ "$GITHUB_REF_NAME" == "release-compute" ]]; then - echo "tag=release-compute-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT - else - echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release', 'release-proxy', 'release-compute'" - echo "tag=$GITHUB_RUN_ID" >> $GITHUB_OUTPUT - fi - shell: bash - id: build-tag + uses: ./.github/workflows/_meta.yml + with: + github-event-name: ${{ github.event_name }} build-build-tools-image: needs: [ check-permissions ] @@ -199,7 +172,7 @@ jobs: secrets: inherit build-and-test-locally: - needs: [ tag, build-build-tools-image ] + needs: [ meta, build-build-tools-image ] strategy: fail-fast: false matrix: @@ -213,7 +186,7 @@ jobs: with: arch: ${{ matrix.arch }} build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm - build-tag: ${{ needs.tag.outputs.build-tag }} + build-tag: ${{ needs.meta.outputs.build-tag }} build-type: ${{ matrix.build-type }} # Run tests on all Postgres versions in release builds and only on the latest version in debug builds. # Run without LFC on v17 release and debug builds only. For all the other cases LFC is enabled. @@ -497,13 +470,24 @@ jobs: }) trigger-e2e-tests: - if: ${{ !github.event.pull_request.draft || contains( github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft') || github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' }} - needs: [ check-permissions, push-neon-image-dev, push-compute-image-dev, tag ] + # Depends on jobs that can get skipped + if: >- + ${{ + ( + !github.event.pull_request.draft + || contains( github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft') + || contains(fromJSON('["push-main", "storage-release", "proxy-release", "compute-release"]'), needs.meta.outputs.run-kind) + ) && !failure() && !cancelled() + }} + needs: [ check-permissions, push-neon-image-dev, push-compute-image-dev, meta ] uses: ./.github/workflows/trigger-e2e-tests.yml + with: + github-event-name: ${{ github.event_name }} secrets: inherit neon-image-arch: - needs: [ check-permissions, build-build-tools-image, tag ] + needs: [ check-permissions, build-build-tools-image, meta ] + if: ${{ contains(fromJSON('["push-main", "pr-main", "storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) }} strategy: matrix: arch: [ x64, arm64 ] @@ -539,7 +523,7 @@ jobs: build-args: | ADDITIONAL_RUSTFLAGS=${{ matrix.arch == 'arm64' && '-Ctarget-feature=+lse -Ctarget-cpu=neoverse-n1' || '' }} GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} - BUILD_TAG=${{ needs.tag.outputs.build-tag }} + BUILD_TAG=${{ needs.meta.outputs.build-tag }} TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-bookworm DEBIAN_VERSION=bookworm provenance: false @@ -549,10 +533,11 @@ jobs: cache-from: type=registry,ref=cache.neon.build/neon:cache-bookworm-${{ 
matrix.arch }} cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon:cache-{0}-{1},mode=max', 'bookworm', matrix.arch) || '' }} tags: | - neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm-${{ matrix.arch }} + neondatabase/neon:${{ needs.meta.outputs.build-tag }}-bookworm-${{ matrix.arch }} neon-image: - needs: [ neon-image-arch, tag ] + needs: [ neon-image-arch, meta ] + if: ${{ contains(fromJSON('["push-main", "pr-main", "storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) }} runs-on: ubuntu-22.04 permissions: id-token: write # aws-actions/configure-aws-credentials @@ -567,13 +552,14 @@ jobs: - name: Create multi-arch image run: | - docker buildx imagetools create -t neondatabase/neon:${{ needs.tag.outputs.build-tag }} \ - -t neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm \ - neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm-x64 \ - neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm-arm64 + docker buildx imagetools create -t neondatabase/neon:${{ needs.meta.outputs.build-tag }} \ + -t neondatabase/neon:${{ needs.meta.outputs.build-tag }}-bookworm \ + neondatabase/neon:${{ needs.meta.outputs.build-tag }}-bookworm-x64 \ + neondatabase/neon:${{ needs.meta.outputs.build-tag }}-bookworm-arm64 compute-node-image-arch: - needs: [ check-permissions, build-build-tools-image, tag ] + needs: [ check-permissions, build-build-tools-image, meta ] + if: ${{ contains(fromJSON('["push-main", "pr-main", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} permissions: id-token: write # aws-actions/configure-aws-credentials statuses: write @@ -631,7 +617,7 @@ jobs: build-args: | GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} PG_VERSION=${{ matrix.version.pg }} - BUILD_TAG=${{ needs.tag.outputs.build-tag }} + BUILD_TAG=${{ needs.meta.outputs.build-tag }} TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-${{ matrix.version.debian }} DEBIAN_VERSION=${{ matrix.version.debian }} provenance: false @@ -641,7 +627,7 @@ jobs: cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }} cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-node-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }} tags: | - neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-${{ matrix.arch }} + neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-${{ matrix.arch }} - name: Build neon extensions test image if: matrix.version.pg >= 'v16' @@ -651,7 +637,7 @@ jobs: build-args: | GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} PG_VERSION=${{ matrix.version.pg }} - BUILD_TAG=${{ needs.tag.outputs.build-tag }} + BUILD_TAG=${{ needs.meta.outputs.build-tag }} TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-${{ matrix.version.debian }} DEBIAN_VERSION=${{ matrix.version.debian }} provenance: false @@ -661,10 +647,11 @@ jobs: target: extension-tests cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }} tags: | - neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{needs.tag.outputs.build-tag}}-${{ matrix.version.debian }}-${{ matrix.arch }} + 
neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{needs.meta.outputs.build-tag}}-${{ matrix.version.debian }}-${{ matrix.arch }} compute-node-image: - needs: [ compute-node-image-arch, tag ] + needs: [ compute-node-image-arch, meta ] + if: ${{ contains(fromJSON('["push-main", "pr-main", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} permissions: id-token: write # aws-actions/configure-aws-credentials statuses: write @@ -692,21 +679,22 @@ jobs: - name: Create multi-arch compute-node image run: | - docker buildx imagetools create -t neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \ - -t neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }} \ - neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \ - neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64 + docker buildx imagetools create -t neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} \ + -t neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }} \ + neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \ + neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-arm64 - name: Create multi-arch neon-test-extensions image if: matrix.version.pg >= 'v16' run: | - docker buildx imagetools create -t neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \ - -t neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }} \ - neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \ - neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64 + docker buildx imagetools create -t neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} \ + -t neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }} \ + neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \ + neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-arm64 vm-compute-node-image: - needs: [ check-permissions, tag, compute-node-image ] + needs: [ check-permissions, meta, compute-node-image ] + if: ${{ contains(fromJSON('["push-main", "pr-main", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} runs-on: [ self-hosted, large ] strategy: fail-fast: false @@ -722,14 +710,14 @@ jobs: - pg: v17 debian: bookworm env: - VM_BUILDER_VERSION: v0.37.1 + VM_BUILDER_VERSION: v0.42.2 steps: - uses: actions/checkout@v4 - name: Downloading vm-builder run: | - curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder -o vm-builder + curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder-amd64 -o vm-builder chmod +x vm-builder - uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193 @@ -742,22 
+730,25 @@ jobs: # it won't have the proper authentication (written at v0.6.0) - name: Pulling compute-node image run: | - docker pull neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} + docker pull neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} - name: Build vm image run: | ./vm-builder \ -size=2G \ -spec=compute/vm-image-spec-${{ matrix.version.debian }}.yaml \ - -src=neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \ - -dst=neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} + -src=neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} \ + -dst=neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} \ + -target-arch=linux/amd64 - name: Pushing vm-compute-node image run: | - docker push neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} + docker push neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} test-images: - needs: [ check-permissions, tag, neon-image, compute-node-image ] + needs: [ check-permissions, meta, neon-image, compute-node-image ] + # Depends on jobs that can get skipped + if: "!failure() && !cancelled()" strategy: fail-fast: false matrix: @@ -775,17 +766,6 @@ jobs: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - - name: Get the last compute release tag - id: get-last-compute-release-tag - env: - GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} - run: | - tag=$(gh api -q '[.[].tag_name | select(startswith("release-compute"))][0]'\ - -H "Accept: application/vnd.github+json" \ - -H "X-GitHub-Api-Version: 2022-11-28" \ - "/repos/${{ github.repository }}/releases") - echo tag=${tag} >> ${GITHUB_OUTPUT} - # `neondatabase/neon` contains multiple binaries, all of them use the same input for the version into the same version formatting library. # Pick pageserver as currently the only binary with extra "version" features printed in the string to verify. # Regular pageserver version string looks like @@ -795,8 +775,9 @@ jobs: # Ensure that we don't have bad versions. 
- name: Verify image versions shell: bash # ensure no set -e for better error messages + if: ${{ contains(fromJSON('["push-main", "pr-main", "storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) }} run: | - pageserver_version=$(docker run --rm neondatabase/neon:${{ needs.tag.outputs.build-tag }} "/bin/sh" "-c" "/usr/local/bin/pageserver --version") + pageserver_version=$(docker run --rm neondatabase/neon:${{ needs.meta.outputs.build-tag }} "/bin/sh" "-c" "/usr/local/bin/pageserver --version") echo "Pageserver version string: $pageserver_version" @@ -813,7 +794,24 @@ jobs: - name: Verify docker-compose example and test extensions timeout-minutes: 20 env: - TAG: ${{needs.tag.outputs.build-tag}} + TAG: >- + ${{ + contains(fromJSON('["compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) + && needs.meta.outputs.previous-storage-release + || needs.meta.outputs.build-tag + }} + COMPUTE_TAG: >- + ${{ + contains(fromJSON('["storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) + && needs.meta.outputs.previous-compute-release + || needs.meta.outputs.build-tag + }} + TEST_EXTENSIONS_TAG: >- + ${{ + contains(fromJSON('["storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) + && 'latest' + || needs.meta.outputs.build-tag + }} TEST_VERSION_ONLY: ${{ matrix.pg_version }} run: ./docker-compose/docker_compose_test.sh @@ -825,10 +823,17 @@ jobs: - name: Test extension upgrade timeout-minutes: 20 - if: ${{ needs.tag.outputs.build-tag == github.run_id }} + if: ${{ contains(fromJSON('["pr-main", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} env: - NEWTAG: ${{ needs.tag.outputs.build-tag }} - OLDTAG: ${{ steps.get-last-compute-release-tag.outputs.tag }} + TAG: >- + ${{ + false + || needs.meta.outputs.run-kind == 'pr-main' && needs.meta.outputs.build-tag + || needs.meta.outputs.run-kind == 'compute-rc-pr' && needs.meta.outputs.previous-storage-release + }} + TEST_EXTENSIONS_TAG: latest + NEW_COMPUTE_TAG: ${{ needs.meta.outputs.build-tag }} + OLD_COMPUTE_TAG: ${{ needs.meta.outputs.previous-compute-release }} run: ./docker-compose/test_extensions_upgrade.sh - name: Print logs and clean up @@ -838,7 +843,7 @@ jobs: docker compose --profile test-extensions -f ./docker-compose/docker-compose.yml down generate-image-maps: - needs: [ tag ] + needs: [ meta ] runs-on: ubuntu-22.04 outputs: neon-dev: ${{ steps.generate.outputs.neon-dev }} @@ -848,14 +853,14 @@ jobs: steps: - uses: actions/checkout@v4 with: - sparse-checkout: scripts/generate_image_maps.py + sparse-checkout: .github/scripts/generate_image_maps.py sparse-checkout-cone-mode: false - name: Generate Image Maps id: generate - run: python scripts/generate_image_maps.py + run: python3 .github/scripts/generate_image_maps.py env: - BUILD_TAG: "${{ needs.tag.outputs.build-tag }}" + BUILD_TAG: "${{ needs.meta.outputs.build-tag }}" BRANCH: "${{ github.ref_name }}" DEV_ACR: "${{ vars.AZURE_DEV_REGISTRY_NAME }}" PROD_ACR: "${{ vars.AZURE_PROD_REGISTRY_NAME }}" @@ -864,88 +869,93 @@ jobs: AWS_REGION: "${{ vars.AWS_ECR_REGION }}" push-neon-image-dev: - needs: [ generate-image-maps, neon-image ] + needs: [ meta, generate-image-maps, neon-image ] + if: ${{ contains(fromJSON('["push-main", "pr-main", "storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) }} uses: ./.github/workflows/_push-to-container-registry.yml + permissions: + id-token: write # Required for aws/azure login 
+ packages: write # required for pushing to GHCR with: image-map: '${{ needs.generate-image-maps.outputs.neon-dev }}' aws-region: ${{ vars.AWS_ECR_REGION }} - aws-account-ids: "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}" + aws-account-id: "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}" + aws-role-to-assume: "gha-oidc-neon-admin" azure-client-id: ${{ vars.AZURE_DEV_CLIENT_ID }} azure-subscription-id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }} azure-tenant-id: ${{ vars.AZURE_TENANT_ID }} acr-registry-name: ${{ vars.AZURE_DEV_REGISTRY_NAME }} - secrets: - aws-role-to-assume: "${{ vars.DEV_AWS_OIDC_ROLE_ARN }}" - docker-hub-username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + secrets: inherit push-compute-image-dev: - needs: [ generate-image-maps, vm-compute-node-image ] + needs: [ meta, generate-image-maps, vm-compute-node-image ] + if: ${{ contains(fromJSON('["push-main", "pr-main", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} uses: ./.github/workflows/_push-to-container-registry.yml + permissions: + id-token: write # Required for aws/azure login + packages: write # required for pushing to GHCR with: image-map: '${{ needs.generate-image-maps.outputs.compute-dev }}' aws-region: ${{ vars.AWS_ECR_REGION }} - aws-account-ids: "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}" + aws-account-id: "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}" + aws-role-to-assume: "gha-oidc-neon-admin" azure-client-id: ${{ vars.AZURE_DEV_CLIENT_ID }} azure-subscription-id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }} azure-tenant-id: ${{ vars.AZURE_TENANT_ID }} acr-registry-name: ${{ vars.AZURE_DEV_REGISTRY_NAME }} - secrets: - aws-role-to-assume: "${{ vars.DEV_AWS_OIDC_ROLE_ARN }}" - docker-hub-username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + secrets: inherit push-neon-image-prod: - if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' - needs: [ generate-image-maps, neon-image, test-images ] + needs: [ meta, generate-image-maps, neon-image, test-images ] + if: ${{ contains(fromJSON('["storage-release", "proxy-release"]'), needs.meta.outputs.run-kind) }} uses: ./.github/workflows/_push-to-container-registry.yml + permissions: + id-token: write # Required for aws/azure login + packages: write # required for pushing to GHCR with: image-map: '${{ needs.generate-image-maps.outputs.neon-prod }}' aws-region: ${{ vars.AWS_ECR_REGION }} - aws-account-ids: "${{ vars.NEON_PROD_AWS_ACCOUNT_ID }}" + aws-account-id: "${{ vars.NEON_PROD_AWS_ACCOUNT_ID }}" + aws-role-to-assume: "gha-oidc-neon-admin" azure-client-id: ${{ vars.AZURE_PROD_CLIENT_ID }} azure-subscription-id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }} azure-tenant-id: ${{ vars.AZURE_TENANT_ID }} acr-registry-name: ${{ vars.AZURE_PROD_REGISTRY_NAME }} - secrets: - aws-role-to-assume: "${{ secrets.PROD_GHA_OIDC_ROLE }}" - docker-hub-username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + secrets: inherit push-compute-image-prod: - if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' - needs: [ generate-image-maps, vm-compute-node-image, test-images ] + needs: [ meta, generate-image-maps, vm-compute-node-image, test-images ] + if: ${{ needs.meta.outputs.run-kind == 'compute-release' }} uses: ./.github/workflows/_push-to-container-registry.yml + permissions: + id-token: write # Required for aws/azure login + 
packages: write # required for pushing to GHCR with: image-map: '${{ needs.generate-image-maps.outputs.compute-prod }}' aws-region: ${{ vars.AWS_ECR_REGION }} - aws-account-ids: "${{ vars.NEON_PROD_AWS_ACCOUNT_ID }}" + aws-account-id: "${{ vars.NEON_PROD_AWS_ACCOUNT_ID }}" + aws-role-to-assume: "gha-oidc-neon-admin" azure-client-id: ${{ vars.AZURE_PROD_CLIENT_ID }} azure-subscription-id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }} azure-tenant-id: ${{ vars.AZURE_TENANT_ID }} acr-registry-name: ${{ vars.AZURE_PROD_REGISTRY_NAME }} - secrets: - aws-role-to-assume: "${{ secrets.PROD_GHA_OIDC_ROLE }}" - docker-hub-username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + secrets: inherit # This is a bit of a special case so we're not using a generated image map. add-latest-tag-to-neon-extensions-test-image: if: github.ref_name == 'main' - needs: [ tag, compute-node-image ] + needs: [ meta, compute-node-image ] uses: ./.github/workflows/_push-to-container-registry.yml with: image-map: | { - "docker.io/neondatabase/neon-test-extensions-v16:${{ needs.tag.outputs.build-tag }}": ["docker.io/neondatabase/neon-test-extensions-v16:latest"], - "docker.io/neondatabase/neon-test-extensions-v17:${{ needs.tag.outputs.build-tag }}": ["docker.io/neondatabase/neon-test-extensions-v17:latest"] + "docker.io/neondatabase/neon-test-extensions-v16:${{ needs.meta.outputs.build-tag }}": ["docker.io/neondatabase/neon-test-extensions-v16:latest"], + "docker.io/neondatabase/neon-test-extensions-v17:${{ needs.meta.outputs.build-tag }}": ["docker.io/neondatabase/neon-test-extensions-v17:latest"] } - secrets: - docker-hub-username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + secrets: inherit trigger-custom-extensions-build-and-wait: - needs: [ check-permissions, tag ] + needs: [ check-permissions, meta ] + if: ${{ contains(fromJSON('["push-main", "pr-main", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} runs-on: ubuntu-22.04 permissions: id-token: write # aws-actions/configure-aws-credentials @@ -980,7 +990,7 @@ jobs: \"ci_job_name\": \"build-and-upload-extensions\", \"commit_hash\": \"$COMMIT_SHA\", \"remote_repo\": \"${{ github.repository }}\", - \"compute_image_tag\": \"${{ needs.tag.outputs.build-tag }}\", + \"compute_image_tag\": \"${{ needs.meta.outputs.build-tag }}\", \"remote_branch_name\": \"${{ github.ref_name }}\" } }" @@ -1024,9 +1034,9 @@ jobs: exit 1 deploy: - needs: [ check-permissions, push-neon-image-prod, push-compute-image-prod, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait ] - # `!failure() && !cancelled()` is required because the workflow depends on the job that can be skipped: `push-to-acr-dev` and `push-to-acr-prod` - if: (github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute') && !failure() && !cancelled() + needs: [ check-permissions, push-neon-image-prod, push-compute-image-prod, meta, build-and-test-locally, trigger-custom-extensions-build-and-wait ] + # `!failure() && !cancelled()` is required because the workflow depends on the job that can be skipped: `push-neon-image-prod` and `push-compute-image-prod` + if: ${{ contains(fromJSON('["push-main", "storage-release", "proxy-release", "compute-release"]'), needs.meta.outputs.run-kind) && !failure() && !cancelled() }} permissions: id-token: write # aws-actions/configure-aws-credentials statuses: write @@ -1037,108 
+1047,103 @@ jobs: - uses: actions/checkout@v4 - name: Create git tag and GitHub release - if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' + if: ${{ contains(fromJSON('["storage-release", "proxy-release", "compute-release"]'), needs.meta.outputs.run-kind) }} uses: actions/github-script@v7 + env: + TAG: "${{ needs.meta.outputs.build-tag }}" + BRANCH: "${{ github.ref_name }}" + PREVIOUS_RELEASE: >- + ${{ + false + || needs.meta.outputs.run-kind == 'storage-release' && needs.meta.outputs.previous-storage-release + || needs.meta.outputs.run-kind == 'proxy-release' && needs.meta.outputs.previous-proxy-release + || needs.meta.outputs.run-kind == 'compute-release' && needs.meta.outputs.previous-compute-release + || 'unknown' + }} with: retries: 5 script: | - const tag = "${{ needs.tag.outputs.build-tag }}"; - const branch = "${{ github.ref_name }}"; + const { TAG, BRANCH, PREVIOUS_RELEASE } = process.env try { const existingRef = await github.rest.git.getRef({ owner: context.repo.owner, repo: context.repo.repo, - ref: `tags/${tag}`, + ref: `tags/${TAG}`, }); if (existingRef.data.object.sha !== context.sha) { - throw new Error(`Tag ${tag} already exists but points to a different commit (expected: ${context.sha}, actual: ${existingRef.data.object.sha}).`); + throw new Error(`Tag ${TAG} already exists but points to a different commit (expected: ${context.sha}, actual: ${existingRef.data.object.sha}).`); } - console.log(`Tag ${tag} already exists and points to ${context.sha} as expected.`); + console.log(`Tag ${TAG} already exists and points to ${context.sha} as expected.`); } catch (error) { if (error.status !== 404) { throw error; } - console.log(`Tag ${tag} does not exist. Creating it...`); + console.log(`Tag ${TAG} does not exist. Creating it...`); await github.rest.git.createRef({ owner: context.repo.owner, repo: context.repo.repo, - ref: `refs/tags/${tag}`, + ref: `refs/tags/${TAG}`, sha: context.sha, }); - console.log(`Tag ${tag} created successfully.`); + console.log(`Tag ${TAG} created successfully.`); } try { const existingRelease = await github.rest.repos.getReleaseByTag({ owner: context.repo.owner, repo: context.repo.repo, - tag: tag, + tag: TAG, }); - console.log(`Release for tag ${tag} already exists (ID: ${existingRelease.data.id}).`); + console.log(`Release for tag ${TAG} already exists (ID: ${existingRelease.data.id}).`); } catch (error) { if (error.status !== 404) { throw error; } - console.log(`Release for tag ${tag} does not exist. Creating it...`); + console.log(`Release for tag ${TAG} does not exist. Creating it...`); // Find the PR number using the commit SHA const pullRequests = await github.rest.pulls.list({ owner: context.repo.owner, repo: context.repo.repo, state: 'closed', - base: branch, + base: BRANCH, }); const pr = pullRequests.data.find(pr => pr.merge_commit_sha === context.sha); const prNumber = pr ? pr.number : null; - // Find the previous release on the branch - const releases = await github.rest.repos.listReleases({ - owner: context.repo.owner, - repo: context.repo.repo, - per_page: 100, - }); - - const branchReleases = releases.data - .filter((release) => { - const regex = new RegExp(`^${branch}-\\d+$`); - return regex.test(release.tag_name) && !release.draft && !release.prerelease; - }) - .sort((a, b) => new Date(b.created_at) - new Date(a.created_at)); - - const previousTag = branchReleases.length > 0 ? branchReleases[0].tag_name : null; - const releaseNotes = [ prNumber ? 
`Release PR https://github.com/${context.repo.owner}/${context.repo.repo}/pull/${prNumber}.` : 'Release PR not found.', - previousTag - ? `Diff with the previous release https://github.com/${context.repo.owner}/${context.repo.repo}/compare/${previousTag}...${tag}.` - : `No previous release found on branch ${branch}.`, + `Diff with the previous release https://github.com/${context.repo.owner}/${context.repo.repo}/compare/${PREVIOUS_RELEASE}...${TAG}.` ].join('\n\n'); await github.rest.repos.createRelease({ owner: context.repo.owner, repo: context.repo.repo, - tag_name: tag, + tag_name: TAG, body: releaseNotes, }); - console.log(`Release for tag ${tag} created successfully.`); + console.log(`Release for tag ${TAG} created successfully.`); } - name: Trigger deploy workflow env: GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} + RUN_KIND: ${{ needs.meta.outputs.run-kind }} run: | - if [[ "$GITHUB_REF_NAME" == "main" ]]; then - gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false - elif [[ "$GITHUB_REF_NAME" == "release" ]]; then + case ${RUN_KIND} in + push-main) + gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.meta.outputs.build-tag}} -f deployPreprodRegion=false + ;; + storage-release) gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main \ -f deployPgSniRouter=false \ -f deployProxy=false \ @@ -1146,7 +1151,7 @@ jobs: -f deployStorageBroker=true \ -f deployStorageController=true \ -f branch=main \ - -f dockerTag=${{needs.tag.outputs.build-tag}} \ + -f dockerTag=${{needs.meta.outputs.build-tag}} \ -f deployPreprodRegion=true gh workflow --repo neondatabase/infra run deploy-prod.yml --ref main \ @@ -1154,8 +1159,9 @@ jobs: -f deployStorageBroker=true \ -f deployStorageController=true \ -f branch=main \ - -f dockerTag=${{needs.tag.outputs.build-tag}} - elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then + -f dockerTag=${{needs.meta.outputs.build-tag}} + ;; + proxy-release) gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main \ -f deployPgSniRouter=true \ -f deployProxy=true \ @@ -1163,7 +1169,7 @@ jobs: -f deployStorageBroker=false \ -f deployStorageController=false \ -f branch=main \ - -f dockerTag=${{needs.tag.outputs.build-tag}} \ + -f dockerTag=${{needs.meta.outputs.build-tag}} \ -f deployPreprodRegion=true gh workflow --repo neondatabase/infra run deploy-proxy-prod.yml --ref main \ @@ -1173,13 +1179,16 @@ jobs: -f deployProxyScram=true \ -f deployProxyAuthBroker=true \ -f branch=main \ - -f dockerTag=${{needs.tag.outputs.build-tag}} - elif [[ "$GITHUB_REF_NAME" == "release-compute" ]]; then - gh workflow --repo neondatabase/infra run deploy-compute-dev.yml --ref main -f dockerTag=${{needs.tag.outputs.build-tag}} - else - echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main', 'release', 'release-proxy' or 'release-compute'" + -f dockerTag=${{needs.meta.outputs.build-tag}} + ;; + compute-release) + gh workflow --repo neondatabase/infra run deploy-compute-dev.yml --ref main -f dockerTag=${{needs.meta.outputs.build-tag}} + ;; + *) + echo "RUN_KIND (value '${RUN_KIND}') is not set to either 'push-main', 'storage-release', 'proxy-release' or 'compute-release'" exit 1 - fi + ;; + esac notify-storage-release-deploy-failure: needs: [ deploy ] @@ -1204,7 +1213,7 @@ jobs: id-token: write # aws-actions/configure-aws-credentials statuses: write contents: read - # `!failure() && !cancelled()` is 
required because the workflow transitively depends on the job that can be skipped: `push-to-acr-dev` and `push-to-acr-prod` + # `!failure() && !cancelled()` is required because the workflow transitively depends on the job that can be skipped: `push-neon-image-prod` and `push-compute-image-prod` if: github.ref_name == 'release' && !failure() && !cancelled() runs-on: ubuntu-22.04 @@ -1294,7 +1303,8 @@ jobs: pin-build-tools-image: needs: [ build-build-tools-image, test-images, build-and-test-locally ] - if: github.ref_name == 'main' + # `!failure() && !cancelled()` is required because the job (transitively) depends on jobs that can be skipped + if: github.ref_name == 'main' && !failure() && !cancelled() uses: ./.github/workflows/pin-build-tools-image.yml with: from-tag: ${{ needs.build-build-tools-image.outputs.image-tag }} @@ -1313,6 +1323,7 @@ jobs: # Format `needs` differently to make the list more readable. # Usually we do `needs: [...]` needs: + - meta - build-and-test-locally - check-codestyle-python - check-codestyle-rust @@ -1336,7 +1347,7 @@ jobs: || needs.check-codestyle-python.result == 'skipped' || needs.check-codestyle-rust.result == 'skipped' || needs.files-changed.result == 'skipped' - || needs.push-compute-image-dev.result == 'skipped' - || needs.push-neon-image-dev.result == 'skipped' + || (needs.push-compute-image-dev.result == 'skipped' && contains(fromJSON('["push-main", "pr-main", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind)) + || (needs.push-neon-image-dev.result == 'skipped' && contains(fromJSON('["push-main", "pr-main", "storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind)) || needs.test-images.result == 'skipped' - || needs.trigger-custom-extensions-build-and-wait.result == 'skipped' + || (needs.trigger-custom-extensions-build-and-wait.result == 'skipped' && contains(fromJSON('["push-main", "pr-main", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind)) diff --git a/.github/workflows/cloud-regress.yml b/.github/workflows/cloud-regress.yml index 09d6acd325..606e1c0862 100644 --- a/.github/workflows/cloud-regress.yml +++ b/.github/workflows/cloud-regress.yml @@ -38,6 +38,9 @@ jobs: runs-on: us-east-2 container: image: neondatabase/build-tools:pinned-bookworm + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} options: --init steps: diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index f077e04d1c..90318747b3 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -71,7 +71,7 @@ jobs: uses: ./.github/workflows/build-macos.yml with: pg_versions: ${{ needs.files-changed.outputs.postgres_changes }} - rebuild_rust_code: ${{ needs.files-changed.outputs.rebuild_rust_code }} + rebuild_rust_code: ${{ fromJson(needs.files-changed.outputs.rebuild_rust_code) }} rebuild_everything: ${{ fromJson(needs.files-changed.outputs.rebuild_everything) }} gather-rust-build-stats: diff --git a/.github/workflows/pin-build-tools-image.yml b/.github/workflows/pin-build-tools-image.yml index b305b662ee..d2588ba0bf 100644 --- a/.github/workflows/pin-build-tools-image.yml +++ b/.github/workflows/pin-build-tools-image.yml @@ -65,6 +65,7 @@ jobs: permissions: id-token: write # Required for aws/azure login + packages: write # required for pushing to GHCR uses: ./.github/workflows/_push-to-container-registry.yml with: @@ -72,12 +73,15 @@ jobs: { 
"docker.io/neondatabase/build-tools:${{ inputs.from-tag }}-bullseye": [ "docker.io/neondatabase/build-tools:pinned-bullseye", + "ghcr.io/neondatabase/build-tools:pinned-bullseye", "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/build-tools:pinned-bullseye", "${{ vars.AZURE_DEV_REGISTRY_NAME }}.azurecr.io/neondatabase/build-tools:pinned-bullseye" ], "docker.io/neondatabase/build-tools:${{ inputs.from-tag }}-bookworm": [ "docker.io/neondatabase/build-tools:pinned-bookworm", "docker.io/neondatabase/build-tools:pinned", + "ghcr.io/neondatabase/build-tools:pinned-bookworm", + "ghcr.io/neondatabase/build-tools:pinned", "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/build-tools:pinned-bookworm", "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/build-tools:pinned", "${{ vars.AZURE_DEV_REGISTRY_NAME }}.azurecr.io/neondatabase/build-tools:pinned-bookworm", @@ -85,12 +89,10 @@ jobs: ] } aws-region: ${{ vars.AWS_ECR_REGION }} - aws-account-ids: "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}" + aws-account-id: "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}" + aws-role-to-assume: "gha-oidc-neon-admin" azure-client-id: ${{ vars.AZURE_DEV_CLIENT_ID }} azure-subscription-id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }} azure-tenant-id: ${{ vars.AZURE_TENANT_ID }} acr-registry-name: ${{ vars.AZURE_DEV_REGISTRY_NAME }} - secrets: - aws-role-to-assume: "${{ vars.DEV_AWS_OIDC_ROLE_ARN }}" - docker-hub-username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + secrets: inherit diff --git a/.github/workflows/trigger-e2e-tests.yml b/.github/workflows/trigger-e2e-tests.yml index be6a7a7901..a30da35502 100644 --- a/.github/workflows/trigger-e2e-tests.yml +++ b/.github/workflows/trigger-e2e-tests.yml @@ -5,6 +5,10 @@ on: types: - ready_for_review workflow_call: + inputs: + github-event-name: + type: string + required: true defaults: run: @@ -19,7 +23,7 @@ jobs: if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }} uses: ./.github/workflows/check-permissions.yml with: - github-event-name: ${{ github.event_name }} + github-event-name: ${{ inputs.github-event-name || github.event_name }} cancel-previous-e2e-tests: needs: [ check-permissions ] @@ -35,46 +39,29 @@ jobs: run cancel-previous-in-concurrency-group.yml \ --field concurrency_group="${{ env.E2E_CONCURRENCY_GROUP }}" - tag: - needs: [ check-permissions ] - runs-on: ubuntu-22.04 - outputs: - build-tag: ${{ steps.build-tag.outputs.tag }} - - steps: - # Need `fetch-depth: 0` to count the number of commits in the branch - - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Get build tag - env: - GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} - CURRENT_BRANCH: ${{ github.head_ref || github.ref_name }} - CURRENT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} - run: | - if [[ "$GITHUB_REF_NAME" == "main" ]]; then - echo "tag=$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT - elif [[ "$GITHUB_REF_NAME" == "release" ]]; then - echo "tag=release-$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT - elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then - echo "tag=release-proxy-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT - elif [[ "$GITHUB_REF_NAME" == "release-compute" ]]; then - echo "tag=release-compute-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT - else - echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" - BUILD_AND_TEST_RUN_ID=$(gh 
run list -b $CURRENT_BRANCH -c $CURRENT_SHA -w 'Build and Test' -L 1 --json databaseId --jq '.[].databaseId') - echo "tag=$BUILD_AND_TEST_RUN_ID" | tee -a $GITHUB_OUTPUT - fi - id: build-tag + meta: + uses: ./.github/workflows/_meta.yml + with: + github-event-name: ${{ inputs.github-event-name || github.event_name }} trigger-e2e-tests: - needs: [ tag ] + needs: [ meta ] runs-on: ubuntu-22.04 env: EVENT_ACTION: ${{ github.event.action }} GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} - TAG: ${{ needs.tag.outputs.build-tag }} + TAG: >- + ${{ + contains(fromJSON('["compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) + && needs.meta.outputs.previous-storage-release + || needs.meta.outputs.build-tag + }} + COMPUTE_TAG: >- + ${{ + contains(fromJSON('["storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) + && needs.meta.outputs.previous-compute-release + || needs.meta.outputs.build-tag + }} steps: - name: Wait for `push-{neon,compute}-image-dev` job to finish # It's important to have a timeout here, the script in the step can run infinitely @@ -157,6 +144,6 @@ jobs: --raw-field "commit_hash=$COMMIT_SHA" \ --raw-field "remote_repo=${GITHUB_REPOSITORY}" \ --raw-field "storage_image_tag=${TAG}" \ - --raw-field "compute_image_tag=${TAG}" \ + --raw-field "compute_image_tag=${COMPUTE_TAG}" \ --raw-field "concurrency_group=${E2E_CONCURRENCY_GROUP}" \ --raw-field "e2e-platforms=${E2E_PLATFORMS}" diff --git a/Cargo.lock b/Cargo.lock index 12232eaece..7d11f2b7fc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1342,7 +1342,9 @@ dependencies = [ "tokio-util", "tower 0.5.2", "tower-http", + "tower-otel", "tracing", + "tracing-opentelemetry", "tracing-subscriber", "tracing-utils", "url", @@ -1546,6 +1548,17 @@ dependencies = [ "itertools 0.10.5", ] +[[package]] +name = "cron" +version = "0.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5877d3fbf742507b66bc2a1945106bd30dd8504019d596901ddd012a4dd01740" +dependencies = [ + "chrono", + "once_cell", + "winnow", +] + [[package]] name = "crossbeam-channel" version = "0.5.8" @@ -1874,6 +1887,12 @@ dependencies = [ "syn 2.0.90", ] +[[package]] +name = "difflib" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6184e33543162437515c2e2b48714794e37845ec9851711914eec9d308f6ebe8" + [[package]] name = "digest" version = "0.10.7" @@ -3331,6 +3350,17 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "json-structural-diff" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e878e36a8a44c158505c2c818abdc1350413ad83dcb774a0459f6a7ef2b65cbf" +dependencies = [ + "difflib", + "regex", + "serde_json", +] + [[package]] name = "jsonwebtoken" version = "9.2.0" @@ -4155,7 +4185,6 @@ dependencies = [ "pageserver_client", "pageserver_compaction", "pin-project-lite", - "postgres", "postgres-protocol", "postgres-types", "postgres_backend", @@ -4242,7 +4271,6 @@ dependencies = [ "futures", "http-utils", "pageserver_api", - "postgres", "reqwest", "serde", "thiserror 1.0.69", @@ -4458,18 +4486,18 @@ dependencies = [ [[package]] name = "pin-project" -version = "1.1.0" +version = "1.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c95a7476719eab1e366eaf73d0260af3021184f18177925b07f54b30089ceead" +checksum = "dfe2e71e1471fe07709406bf725f710b02927c9c54b2b5b2ec0e8087d97c327d" dependencies = [ "pin-project-internal", ] [[package]] name = "pin-project-internal" -version = 
"1.1.0" +version = "1.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "39407670928234ebc5e6e580247dd567ad73a3578460c5990f9503df207e8f07" +checksum = "f6e859e6e5bd50440ab63c47e3ebabc90f26251f7c73c3d3e837b74a1cc3fa67" dependencies = [ "proc-macro2", "quote", @@ -4657,7 +4685,6 @@ dependencies = [ "anyhow", "itertools 0.10.5", "once_cell", - "postgres", "tokio-postgres", "url", ] @@ -5799,7 +5826,6 @@ dependencies = [ "once_cell", "pageserver_api", "parking_lot 0.12.1", - "postgres", "postgres-protocol", "postgres_backend", "postgres_ffi", @@ -6433,6 +6459,7 @@ dependencies = [ "chrono", "clap", "control_plane", + "cron", "diesel", "diesel-async", "diesel_migrations", @@ -6443,6 +6470,7 @@ dependencies = [ "humantime", "hyper 0.14.30", "itertools 0.10.5", + "json-structural-diff", "lasso", "measured", "metrics", @@ -7268,6 +7296,20 @@ version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e" +[[package]] +name = "tower-otel" +version = "0.2.0" +source = "git+https://github.com/mattiapenati/tower-otel?rev=56a7321053bcb72443888257b622ba0d43a11fcd#56a7321053bcb72443888257b622ba0d43a11fcd" +dependencies = [ + "http 1.1.0", + "opentelemetry", + "pin-project", + "tower-layer", + "tower-service", + "tracing", + "tracing-opentelemetry", +] + [[package]] name = "tower-service" version = "0.3.3" @@ -7616,6 +7658,7 @@ dependencies = [ "once_cell", "pin-project-lite", "postgres_connection", + "pprof", "pq_proto", "rand 0.8.5", "regex", @@ -8123,9 +8166,9 @@ checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" [[package]] name = "winnow" -version = "0.6.13" +version = "0.6.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59b5e5f6c299a3c7890b876a2a587f3115162487e704907d9b6cd29473052ba1" +checksum = "1e90edd2ac1aa278a5c4599b1d89cf03074b610800f866d4026dc199d7929a28" dependencies = [ "memchr", ] diff --git a/Cargo.toml b/Cargo.toml index 7228623c6b..223ff4249e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -77,6 +77,7 @@ byteorder = "1.4" bytes = "1.9" camino = "1.1.6" cfg-if = "1.0.0" +cron = "0.15" chrono = { version = "0.4", default-features = false, features = ["clock"] } clap = { version = "4.0", features = ["derive", "env"] } clashmap = { version = "1.0", features = ["raw-api"] } @@ -192,6 +193,10 @@ toml_edit = "0.22" tonic = {version = "0.12.3", default-features = false, features = ["channel", "tls", "tls-roots"]} tower = { version = "0.5.2", default-features = false } tower-http = { version = "0.6.2", features = ["request-id", "trace"] } + +# This revision uses opentelemetry 0.27. There's no tag for it. 
+tower-otel = { git = "https://github.com/mattiapenati/tower-otel", rev = "56a7321053bcb72443888257b622ba0d43a11fcd" } + tower-service = "0.3.3" tracing = "0.1" tracing-error = "0.2" @@ -210,6 +215,7 @@ rustls-native-certs = "0.8" x509-parser = "0.16" whoami = "1.5.1" zerocopy = { version = "0.7", features = ["derive"] } +json-structural-diff = { version = "0.2.0" } ## TODO replace this with tracing env_logger = "0.10" diff --git a/build-tools.Dockerfile b/build-tools.Dockerfile index 317eded26e..c103ceaea5 100644 --- a/build-tools.Dockerfile +++ b/build-tools.Dockerfile @@ -292,7 +292,7 @@ WORKDIR /home/nonroot # Rust # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`) -ENV RUSTC_VERSION=1.84.1 +ENV RUSTC_VERSION=1.85.0 ENV RUSTUP_HOME="/home/nonroot/.rustup" ENV PATH="/home/nonroot/.cargo/bin:${PATH}" ARG RUSTFILT_VERSION=0.2.1 diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 0b3001613d..0cdb44853f 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -395,15 +395,22 @@ RUN case "${PG_VERSION:?}" in \ cd plv8-src && \ if [[ "${PG_VERSION:?}" < "v17" ]]; then patch -p1 < /ext-src/plv8-3.1.10.patch; fi -FROM pg-build AS plv8-build +# Step 1: Build the vendored V8 engine. It doesn't depend on PostgreSQL, so use +# 'build-deps' as the base. This enables caching and avoids unnecessary rebuilds. +# (The V8 engine takes a very long time to build) +FROM build-deps AS plv8-build ARG PG_VERSION +WORKDIR /ext-src/plv8-src RUN apt update && \ apt install --no-install-recommends --no-install-suggests -y \ ninja-build python3-dev libncurses5 binutils clang \ && apt clean && rm -rf /var/lib/apt/lists/* - COPY --from=plv8-src /ext-src/ /ext-src/ -WORKDIR /ext-src/plv8-src +RUN make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) v8 + +# Step 2: Build the PostgreSQL-dependent parts +COPY --from=pg-build /usr/local/pgsql /usr/local/pgsql +ENV PATH="/usr/local/pgsql/bin:$PATH" RUN \ # generate and copy upgrade scripts make generate_upgrades && \ @@ -1451,9 +1458,11 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) && \ FROM build-deps AS pg_mooncake-src ARG PG_VERSION WORKDIR /ext-src +COPY compute/patches/duckdb_v113.patch . RUN wget https://github.com/Mooncake-Labs/pg_mooncake/releases/download/v0.1.2/pg_mooncake-0.1.2.tar.gz -O pg_mooncake.tar.gz && \ echo "4550473784fcdd2e1e18062bc01eb9c286abd27cdf5e11a4399be6c0a426ba90 pg_mooncake.tar.gz" | sha256sum --check && \ mkdir pg_mooncake-src && cd pg_mooncake-src && tar xzf ../pg_mooncake.tar.gz --strip-components=1 -C . && \ + cd third_party/duckdb && patch -p1 < /ext-src/duckdb_v113.patch && cd ../.. && \ echo "make -f pg_mooncake-src/Makefile.build installcheck TEST_DIR=./test SQL_DIR=./sql SRC_DIR=./src" > neon-test.sh && \ chmod a+x neon-test.sh @@ -1473,6 +1482,7 @@ RUN make release -j $(getconf _NPROCESSORS_ONLN) && \ FROM build-deps AS pg_duckdb-src WORKDIR /ext-src COPY compute/patches/pg_duckdb_v031.patch . +COPY compute/patches/duckdb_v120.patch . # pg_duckdb build requires source dir to be a git repo to get submodules # allow neon_superuser to execute some functions that in pg_duckdb are available to superuser only: # - extension management function duckdb.install_extension() @@ -1480,7 +1490,9 @@ COPY compute/patches/pg_duckdb_v031.patch . 
RUN git clone --depth 1 --branch v0.3.1 https://github.com/duckdb/pg_duckdb.git pg_duckdb-src && \ cd pg_duckdb-src && \ git submodule update --init --recursive && \ - patch -p1 < /ext-src/pg_duckdb_v031.patch + patch -p1 < /ext-src/pg_duckdb_v031.patch && \ + cd third_party/duckdb && \ + patch -p1 < /ext-src/duckdb_v120.patch FROM pg-build AS pg_duckdb-build ARG PG_VERSION @@ -1806,7 +1818,7 @@ RUN make PG_VERSION="${PG_VERSION:?}" -C compute FROM pg-build AS extension-tests ARG PG_VERSION -RUN mkdir /ext-src +COPY docker-compose/ext-src/ /ext-src/ COPY --from=pg-build /postgres /postgres #COPY --from=postgis-src /ext-src/ /ext-src/ @@ -1844,7 +1856,10 @@ COPY --from=pg_semver-src /ext-src/ /ext-src/ COPY --from=pg_ivm-src /ext-src/ /ext-src/ COPY --from=pg_partman-src /ext-src/ /ext-src/ #COPY --from=pg_mooncake-src /ext-src/ /ext-src/ -#COPY --from=pg_repack-src /ext-src/ /ext-src/ +COPY --from=pg_repack-src /ext-src/ /ext-src/ +COPY --from=pg_repack-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY compute/patches/pg_repack.patch /ext-src +RUN cd /ext-src/pg_repack-src && patch -p1 ++ duckdb_pg_duckdb PUBLIC $ + $) + + target_include_directories( +@@ -161,7 +163,7 @@ else() + endif() + + install( +- TARGETS duckdb duckdb_static ++ TARGETS duckdb_pg_duckdb duckdb_static + EXPORT "${DUCKDB_EXPORT_SET}" + LIBRARY DESTINATION "${INSTALL_LIB_DIR}" + ARCHIVE DESTINATION "${INSTALL_LIB_DIR}" diff --git a/compute/patches/pg_duckdb_v031.patch b/compute/patches/pg_duckdb_v031.patch index a7e188d69e..edc7fbf69d 100644 --- a/compute/patches/pg_duckdb_v031.patch +++ b/compute/patches/pg_duckdb_v031.patch @@ -1,3 +1,25 @@ +diff --git a/Makefile b/Makefile +index 3235cc8..6b892bc 100644 +--- a/Makefile ++++ b/Makefile +@@ -32,7 +32,7 @@ else + DUCKDB_BUILD_TYPE = release + endif + +-DUCKDB_LIB = libduckdb$(DLSUFFIX) ++DUCKDB_LIB = libduckdb_pg_duckdb$(DLSUFFIX) + FULL_DUCKDB_LIB = third_party/duckdb/build/$(DUCKDB_BUILD_TYPE)/src/$(DUCKDB_LIB) + + ERROR_ON_WARNING ?= +@@ -54,7 +54,7 @@ override PG_CXXFLAGS += -std=c++17 ${DUCKDB_BUILD_CXX_FLAGS} ${COMPILER_FLAGS} - + # changes to the vendored code in one place. 
+ override PG_CFLAGS += -Wno-declaration-after-statement + +-SHLIB_LINK += -Wl,-rpath,$(PG_LIB)/ -lpq -Lthird_party/duckdb/build/$(DUCKDB_BUILD_TYPE)/src -L$(PG_LIB) -lduckdb -lstdc++ -llz4 ++SHLIB_LINK += -Wl,-rpath,$(PG_LIB)/ -lpq -Lthird_party/duckdb/build/$(DUCKDB_BUILD_TYPE)/src -L$(PG_LIB) -lduckdb_pg_duckdb -lstdc++ -llz4 + + include Makefile.global + diff --git a/sql/pg_duckdb--0.2.0--0.3.0.sql b/sql/pg_duckdb--0.2.0--0.3.0.sql index d777d76..af60106 100644 --- a/sql/pg_duckdb--0.2.0--0.3.0.sql diff --git a/compute/patches/pg_repack.patch b/compute/patches/pg_repack.patch new file mode 100644 index 0000000000..f6b0aa1e13 --- /dev/null +++ b/compute/patches/pg_repack.patch @@ -0,0 +1,72 @@ +diff --git a/regress/Makefile b/regress/Makefile +index bf6edcb..89b4c7f 100644 +--- a/regress/Makefile ++++ b/regress/Makefile +@@ -17,7 +17,7 @@ INTVERSION := $(shell echo $$(($$(echo $(VERSION).0 | sed 's/\([[:digit:]]\{1,\} + # Test suite + # + +-REGRESS := init-extension repack-setup repack-run error-on-invalid-idx no-error-on-invalid-idx after-schema repack-check nosuper tablespace get_order_by trigger ++REGRESS := init-extension repack-setup repack-run error-on-invalid-idx no-error-on-invalid-idx after-schema repack-check nosuper get_order_by trigger + + USE_PGXS = 1 # use pgxs if not in contrib directory + PGXS := $(shell $(PG_CONFIG) --pgxs) +diff --git a/regress/expected/nosuper.out b/regress/expected/nosuper.out +index 8d0a94e..63b68bf 100644 +--- a/regress/expected/nosuper.out ++++ b/regress/expected/nosuper.out +@@ -4,22 +4,22 @@ + SET client_min_messages = error; + DROP ROLE IF EXISTS nosuper; + SET client_min_messages = warning; +-CREATE ROLE nosuper WITH LOGIN; ++CREATE ROLE nosuper WITH LOGIN PASSWORD 'NoSuPeRpAsSwOrD'; + -- => OK + \! pg_repack --dbname=contrib_regression --table=tbl_cluster --no-superuser-check + INFO: repacking table "public.tbl_cluster" + -- => ERROR +-\! pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper ++\! PGPASSWORD=NoSuPeRpAsSwOrD pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper + ERROR: pg_repack failed with error: You must be a superuser to use pg_repack + -- => ERROR +-\! pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper --no-superuser-check ++\! PGPASSWORD=NoSuPeRpAsSwOrD pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper --no-superuser-check + ERROR: pg_repack failed with error: ERROR: permission denied for schema repack + LINE 1: select repack.version(), repack.version_sql() + ^ + GRANT ALL ON ALL TABLES IN SCHEMA repack TO nosuper; + GRANT USAGE ON SCHEMA repack TO nosuper; + -- => ERROR +-\! pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper --no-superuser-check ++\! PGPASSWORD=NoSuPeRpAsSwOrD pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper --no-superuser-check + INFO: repacking table "public.tbl_cluster" + ERROR: query failed: ERROR: current transaction is aborted, commands ignored until end of transaction block + DETAIL: query was: RESET lock_timeout +diff --git a/regress/sql/nosuper.sql b/regress/sql/nosuper.sql +index 072f0fa..dbe60f8 100644 +--- a/regress/sql/nosuper.sql ++++ b/regress/sql/nosuper.sql +@@ -4,19 +4,19 @@ + SET client_min_messages = error; + DROP ROLE IF EXISTS nosuper; + SET client_min_messages = warning; +-CREATE ROLE nosuper WITH LOGIN; ++CREATE ROLE nosuper WITH LOGIN PASSWORD 'NoSuPeRpAsSwOrD'; + -- => OK + \! 
pg_repack --dbname=contrib_regression --table=tbl_cluster --no-superuser-check + -- => ERROR +-\! pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper ++\! PGPASSWORD=NoSuPeRpAsSwOrD pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper + -- => ERROR +-\! pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper --no-superuser-check ++\! PGPASSWORD=NoSuPeRpAsSwOrD pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper --no-superuser-check + + GRANT ALL ON ALL TABLES IN SCHEMA repack TO nosuper; + GRANT USAGE ON SCHEMA repack TO nosuper; + + -- => ERROR +-\! pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper --no-superuser-check ++\! PGPASSWORD=NoSuPeRpAsSwOrD pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper --no-superuser-check + + REVOKE ALL ON ALL TABLES IN SCHEMA repack FROM nosuper; + REVOKE USAGE ON SCHEMA repack FROM nosuper; diff --git a/compute/vm-image-spec-bookworm.yaml b/compute/vm-image-spec-bookworm.yaml index 568f0b0444..ff4c3387d9 100644 --- a/compute/vm-image-spec-bookworm.yaml +++ b/compute/vm-image-spec-bookworm.yaml @@ -44,6 +44,11 @@ shutdownHook: | files: - filename: compute_ctl-sudoers content: | + # Reverse hostname lookup doesn't currently work, and isn't needed anyway when all + # the rules use ALL as the hostname. Avoid the pointless lookups and the "unable to + # resolve host" log messages that they generate. + Defaults !fqdn + # Allow postgres user (which is what compute_ctl runs as) to run /neonvm/bin/resize-swap # and /neonvm/bin/set-disk-quota as root without requiring entering a password (NOPASSWD), # regardless of hostname (ALL) diff --git a/compute/vm-image-spec-bullseye.yaml b/compute/vm-image-spec-bullseye.yaml index 6617c98599..c001040bc9 100644 --- a/compute/vm-image-spec-bullseye.yaml +++ b/compute/vm-image-spec-bullseye.yaml @@ -44,6 +44,11 @@ shutdownHook: | files: - filename: compute_ctl-sudoers content: | + # Reverse hostname lookup doesn't currently work, and isn't needed anyway when all + # the rules use ALL as the hostname. Avoid the pointless lookups and the "unable to + # resolve host" log messages that they generate. 
+ Defaults !fqdn + # Allow postgres user (which is what compute_ctl runs as) to run /neonvm/bin/resize-swap # and /neonvm/bin/set-disk-quota as root without requiring entering a password (NOPASSWD), # regardless of hostname (ALL) diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index c276996df5..8f3bcbeef8 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "compute_tools" version = "0.1.0" -edition.workspace = true +edition = "2024" license.workspace = true [features] @@ -46,7 +46,9 @@ tokio = { workspace = true, features = ["rt", "rt-multi-thread"] } tokio-postgres.workspace = true tokio-util.workspace = true tokio-stream.workspace = true +tower-otel.workspace = true tracing.workspace = true +tracing-opentelemetry.workspace = true tracing-subscriber.workspace = true tracing-utils.workspace = true thiserror.workspace = true diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index a8803ec793..6dae1a2753 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -40,35 +40,33 @@ use std::path::Path; use std::process::exit; use std::str::FromStr; use std::sync::atomic::Ordering; -use std::sync::{mpsc, Arc, Condvar, Mutex, RwLock}; -use std::time::SystemTime; -use std::{thread, time::Duration}; +use std::sync::{Arc, Condvar, Mutex, RwLock, mpsc}; +use std::thread; +use std::time::Duration; use anyhow::{Context, Result}; use chrono::Utc; use clap::Parser; -use compute_tools::disk_quota::set_disk_quota; -use compute_tools::http::server::Server; -use compute_tools::lsn_lease::launch_lsn_lease_bg_task_for_static; -use signal_hook::consts::{SIGQUIT, SIGTERM}; -use signal_hook::{consts::SIGINT, iterator::Signals}; -use tracing::{error, info, warn}; -use url::Url; - use compute_api::responses::{ComputeCtlConfig, ComputeStatus}; use compute_api::spec::ComputeSpec; - use compute_tools::compute::{ - forward_termination_signal, ComputeNode, ComputeState, ParsedSpec, PG_PID, + ComputeNode, ComputeState, PG_PID, ParsedSpec, forward_termination_signal, }; use compute_tools::configurator::launch_configurator; +use compute_tools::disk_quota::set_disk_quota; use compute_tools::extension_server::get_pg_version_string; +use compute_tools::http::server::Server; use compute_tools::logger::*; +use compute_tools::lsn_lease::launch_lsn_lease_bg_task_for_static; use compute_tools::monitor::launch_monitor; use compute_tools::params::*; use compute_tools::spec::*; use compute_tools::swap::resize_swap; -use rlimit::{setrlimit, Resource}; +use rlimit::{Resource, setrlimit}; +use signal_hook::consts::{SIGINT, SIGQUIT, SIGTERM}; +use signal_hook::iterator::Signals; +use tracing::{error, info, warn}; +use url::Url; use utils::failpoint_support; // this is an arbitrary build tag. Fine as a default / for testing purposes @@ -86,19 +84,6 @@ fn parse_remote_ext_config(arg: &str) -> Result { } } -/// Generate a compute ID if one is not supplied. This exists to keep forward -/// compatibility tests working, but will be removed in a future iteration. -fn generate_compute_id() -> String { - let now = SystemTime::now(); - - format!( - "compute-{}", - now.duration_since(SystemTime::UNIX_EPOCH) - .unwrap() - .as_secs() - ) -} - #[derive(Parser)] #[command(rename_all = "kebab-case")] struct Cli { @@ -112,16 +97,13 @@ struct Cli { /// outside the compute will talk to the compute through this port. Keep /// the previous name for this argument around for a smoother release /// with the control plane. 
- /// - /// TODO: Remove the alias after the control plane release which teaches the - /// control plane about the renamed argument. - #[arg(long, alias = "http-port", default_value_t = 3080)] + #[arg(long, default_value_t = 3080)] pub external_http_port: u16, - /// The port to bind the internal listening HTTP server to. Clients like + /// The port to bind the internal listening HTTP server to. Clients include /// the neon extension (for installing remote extensions) and local_proxy. - #[arg(long)] - pub internal_http_port: Option, + #[arg(long, default_value_t = 3081)] + pub internal_http_port: u16, #[arg(short = 'D', long, value_name = "DATADIR")] pub pgdata: String, @@ -156,7 +138,7 @@ struct Cli { #[arg(short = 'S', long, group = "spec-path")] pub spec_path: Option, - #[arg(short = 'i', long, group = "compute-id", default_value = generate_compute_id())] + #[arg(short = 'i', long, group = "compute-id")] pub compute_id: String, #[arg(short = 'p', long, conflicts_with_all = ["spec", "spec-path"], value_name = "CONTROL_PLANE_API_BASE_URL")] @@ -166,6 +148,8 @@ struct Cli { fn main() -> Result<()> { let cli = Cli::parse(); + let scenario = failpoint_support::init(); + // For historical reasons, the main thread that processes the spec and launches postgres // is synchronous, but we always have this tokio runtime available and we "enter" it so // that you can use tokio::spawn() and tokio::runtime::Handle::current().block_on(...) @@ -177,8 +161,6 @@ fn main() -> Result<()> { let build_tag = runtime.block_on(init())?; - let scenario = failpoint_support::init(); - // enable core dumping for all child processes setrlimit(Resource::CORE, rlimit::INFINITY, rlimit::INFINITY)?; @@ -359,7 +341,7 @@ fn wait_spec( pgbin: cli.pgbin.clone(), pgversion: get_pg_version_string(&cli.pgbin), external_http_port: cli.external_http_port, - internal_http_port: cli.internal_http_port.unwrap_or(cli.external_http_port + 1), + internal_http_port: cli.internal_http_port, live_config_allowed, state: Mutex::new(new_state), state_changed: Condvar::new(), @@ -383,7 +365,7 @@ fn wait_spec( // The internal HTTP server could be launched later, but there isn't much // sense in waiting. - Server::Internal(cli.internal_http_port.unwrap_or(cli.external_http_port + 1)).launch(&compute); + Server::Internal(cli.internal_http_port).launch(&compute); if !spec_set { // No spec provided, hang waiting for it. @@ -424,6 +406,21 @@ fn start_postgres( ) -> Result<(Option, StartPostgresResult)> { // We got all we need, update the state. let mut state = compute.state.lock().unwrap(); + + // Create a tracing span for the startup operation. + // + // We could otherwise just annotate the function with #[instrument], but if + // we're being configured from a /configure HTTP request, we want the + // startup to be considered part of the /configure request. + let _this_entered = { + // Temporarily enter the /configure request's span, so that the new span + // becomes its child. + let _parent_entered = state.startup_span.take().map(|p| p.entered()); + + tracing::info_span!("start_postgres") + } + .entered(); + state.set_status(ComputeStatus::Init, &compute.state_changed); info!( diff --git a/compute_tools/src/bin/fast_import.rs b/compute_tools/src/bin/fast_import.rs index 585f3e4e1d..47558be7a0 100644 --- a/compute_tools/src/bin/fast_import.rs +++ b/compute_tools/src/bin/fast_import.rs @@ -25,13 +25,13 @@ //! docker push localhost:3030/localregistry/compute-node-v14:latest //! 
``` -use anyhow::{bail, Context}; +use anyhow::{Context, bail}; use aws_config::BehaviorVersion; use camino::{Utf8Path, Utf8PathBuf}; use clap::{Parser, Subcommand}; -use compute_tools::extension_server::{get_pg_version, PostgresMajorVersion}; +use compute_tools::extension_server::{PostgresMajorVersion, get_pg_version}; use nix::unistd::Pid; -use tracing::{error, info, info_span, warn, Instrument}; +use tracing::{Instrument, error, info, info_span, warn}; use utils::fs_ext::is_directory_empty; #[path = "fast_import/aws_s3_sync.rs"] @@ -558,7 +558,9 @@ async fn cmd_dumprestore( decode_connstring(kms_client.as_ref().unwrap(), &key_id, dest_ciphertext) .await? } else { - bail!("destination connection string must be provided in spec for dump_restore command"); + bail!( + "destination connection string must be provided in spec for dump_restore command" + ); }; (source, dest) diff --git a/compute_tools/src/bin/fast_import/aws_s3_sync.rs b/compute_tools/src/bin/fast_import/aws_s3_sync.rs index 1be10b36d6..d8d007da71 100644 --- a/compute_tools/src/bin/fast_import/aws_s3_sync.rs +++ b/compute_tools/src/bin/fast_import/aws_s3_sync.rs @@ -1,11 +1,10 @@ use camino::{Utf8Path, Utf8PathBuf}; use tokio::task::JoinSet; +use tracing::{info, warn}; use walkdir::WalkDir; use super::s3_uri::S3Uri; -use tracing::{info, warn}; - const MAX_PARALLEL_UPLOADS: usize = 10; /// Upload all files from 'local' to 'remote' diff --git a/compute_tools/src/bin/fast_import/s3_uri.rs b/compute_tools/src/bin/fast_import/s3_uri.rs index 52bbef420f..cf4dab7c02 100644 --- a/compute_tools/src/bin/fast_import/s3_uri.rs +++ b/compute_tools/src/bin/fast_import/s3_uri.rs @@ -1,6 +1,7 @@ -use anyhow::Result; use std::str::FromStr; +use anyhow::Result; + /// Struct to hold parsed S3 components #[derive(Debug, Clone, PartialEq, Eq)] pub struct S3Uri { diff --git a/compute_tools/src/catalog.rs b/compute_tools/src/catalog.rs index 28b10ce21c..2a7f56e6fc 100644 --- a/compute_tools/src/catalog.rs +++ b/compute_tools/src/catalog.rs @@ -1,18 +1,20 @@ +use std::path::Path; +use std::process::Stdio; +use std::result::Result; +use std::sync::Arc; + +use compute_api::responses::CatalogObjects; use futures::Stream; use postgres::NoTls; -use std::{path::Path, process::Stdio, result::Result, sync::Arc}; -use tokio::{ - io::{AsyncBufReadExt, BufReader}, - process::Command, - spawn, -}; +use tokio::io::{AsyncBufReadExt, BufReader}; +use tokio::process::Command; +use tokio::spawn; use tokio_stream::{self as stream, StreamExt}; use tokio_util::codec::{BytesCodec, FramedRead}; use tracing::warn; use crate::compute::ComputeNode; use crate::pg_helpers::{get_existing_dbs_async, get_existing_roles_async, postgres_conf_for_db}; -use compute_api::responses::CatalogObjects; pub async fn get_dbs_and_roles(compute: &Arc) -> anyhow::Result { let conf = compute.get_tokio_conn_conf(Some("compute_ctl:get_dbs_and_roles")); @@ -55,7 +57,7 @@ pub enum SchemaDumpError { pub async fn get_database_schema( compute: &Arc, dbname: &str, -) -> Result>, SchemaDumpError> { +) -> Result> + use<>, SchemaDumpError> { let pgbin = &compute.pgbin; let basepath = Path::new(pgbin).parent().unwrap(); let pgdump = basepath.join("pg_dump"); diff --git a/compute_tools/src/checker.rs b/compute_tools/src/checker.rs index 62d61a8bc9..e4207876ac 100644 --- a/compute_tools/src/checker.rs +++ b/compute_tools/src/checker.rs @@ -1,4 +1,4 @@ -use anyhow::{anyhow, Ok, Result}; +use anyhow::{Ok, Result, anyhow}; use tokio_postgres::NoTls; use tracing::{error, instrument, warn}; diff --git 
a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index d323ea3dcd..97fa45062b 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -1,42 +1,37 @@ use std::collections::{HashMap, HashSet}; -use std::env; -use std::fs; use std::iter::once; -use std::os::unix::fs::{symlink, PermissionsExt}; +use std::os::unix::fs::{PermissionsExt, symlink}; use std::path::Path; use std::process::{Command, Stdio}; use std::str::FromStr; -use std::sync::atomic::AtomicU32; -use std::sync::atomic::Ordering; +use std::sync::atomic::{AtomicU32, Ordering}; use std::sync::{Arc, Condvar, Mutex, RwLock}; -use std::time::Duration; -use std::time::Instant; +use std::time::{Duration, Instant}; +use std::{env, fs}; use anyhow::{Context, Result}; use chrono::{DateTime, Utc}; -use compute_api::spec::{Database, PgIdent, Role}; +use compute_api::privilege::Privilege; +use compute_api::responses::{ComputeMetrics, ComputeStatus}; +use compute_api::spec::{ + ComputeFeature, ComputeMode, ComputeSpec, Database, ExtVersion, PgIdent, Role, +}; +use futures::StreamExt; use futures::future::join_all; use futures::stream::FuturesUnordered; -use futures::StreamExt; +use nix::sys::signal::{Signal, kill}; use nix::unistd::Pid; use postgres; -use postgres::error::SqlState; use postgres::NoTls; +use postgres::error::SqlState; +use remote_storage::{DownloadError, RemotePath}; +use tokio::spawn; use tracing::{debug, error, info, instrument, warn}; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; - -use compute_api::privilege::Privilege; -use compute_api::responses::{ComputeMetrics, ComputeStatus}; -use compute_api::spec::{ComputeFeature, ComputeMode, ComputeSpec, ExtVersion}; use utils::measured_stream::MeasuredReader; -use nix::sys::signal::{kill, Signal}; -use remote_storage::{DownloadError, RemotePath}; -use tokio::spawn; - use crate::installed_extensions::get_installed_extensions; -use crate::local_proxy; use crate::pg_helpers::*; use crate::spec::*; use crate::spec_apply::ApplySpecPhase::{ @@ -45,13 +40,12 @@ use crate::spec_apply::ApplySpecPhase::{ HandleNeonExtension, HandleOtherExtensions, RenameAndDeleteDatabases, RenameRoles, RunInEachDatabase, }; -use crate::spec_apply::PerDatabasePhase; use crate::spec_apply::PerDatabasePhase::{ ChangeSchemaPerms, DeleteDBRoleReferences, DropLogicalSubscriptions, HandleAnonExtension, }; -use crate::spec_apply::{apply_operations, MutableApplyContext, DB}; +use crate::spec_apply::{DB, MutableApplyContext, PerDatabasePhase, apply_operations}; use crate::sync_sk::{check_if_synced, ping_safekeeper}; -use crate::{config, extension_server}; +use crate::{config, extension_server, local_proxy}; pub static SYNC_SAFEKEEPERS_PID: AtomicU32 = AtomicU32::new(0); pub static PG_PID: AtomicU32 = AtomicU32::new(0); @@ -116,7 +110,23 @@ pub struct ComputeState { /// compute wasn't used since start. pub last_active: Option>, pub error: Option, + + /// Compute spec. This can be received from the CLI or - more likely - + /// passed by the control plane with a /configure HTTP request. pub pspec: Option, + + /// If the spec is passed by a /configure request, 'startup_span' is the + /// /configure request's tracing span. The main thread enters it when it + /// processes the compute startup, so that the compute startup is considered + /// to be part of the /configure request for tracing purposes. 
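For illustration, a minimal sketch (simplified, hypothetical names rather than the real compute_ctl types) of the span hand-off described in the surrounding startup_span comment: the /configure handler stashes its current span, and the main thread briefly enters it while creating the startup span so the latter is recorded as its child.

    use std::sync::{Arc, Mutex};
    use tracing::{info_span, Span};

    struct Shared {
        startup_span: Option<Span>,
    }

    // HTTP handler side: remember the /configure request's span.
    fn on_configure(shared: &Arc<Mutex<Shared>>) {
        shared.lock().unwrap().startup_span = Some(Span::current());
    }

    // Main thread side: parent the startup work under that span, if present.
    fn do_startup(shared: &Arc<Mutex<Shared>>) {
        let _entered = {
            // Enter the stored parent only while the child span is created.
            let _parent = shared
                .lock()
                .unwrap()
                .startup_span
                .take()
                .map(|s| s.entered());
            info_span!("start_postgres")
        }
        .entered();
        // ... startup work now runs inside the "start_postgres" span ...
    }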
+ /// + /// If the request handling thread/task called startup_compute() directly, + /// it would automatically be a child of the request handling span, and we + /// wouldn't need this. But because we use the main thread to perform the + /// startup, and the /configure task just waits for it to finish, we need to + /// set up the span relationship ourselves. + pub startup_span: Option, + pub metrics: ComputeMetrics, } @@ -128,6 +138,7 @@ impl ComputeState { last_active: None, error: None, pspec: None, + startup_span: None, metrics: ComputeMetrics::default(), } } @@ -546,6 +557,7 @@ impl ComputeNode { // Fast path for sync_safekeepers. If they're already synced we get the lsn // in one roundtrip. If not, we should do a full sync_safekeepers. + #[instrument(skip_all)] pub fn check_safekeepers_synced(&self, compute_state: &ComputeState) -> Result> { let start_time = Utc::now(); @@ -776,8 +788,9 @@ impl ComputeNode { Ok(()) } - /// Start Postgres as a child process and manage DBs/roles. - /// After that this will hang waiting on the postmaster process to exit. + /// Start Postgres as a child process and wait for it to start accepting + /// connections. + /// /// Returns a handle to the child process and a handle to the logs thread. #[instrument(skip_all)] pub fn start_postgres( @@ -1317,7 +1330,7 @@ impl ComputeNode { // Merge-apply spec & changes to PostgreSQL state. self.apply_spec_sql(spec.clone(), conf.clone(), max_concurrent_connections)?; - if let Some(ref local_proxy) = &spec.clone().local_proxy_config { + if let Some(local_proxy) = &spec.clone().local_proxy_config { info!("configuring local_proxy"); local_proxy::configure(local_proxy).context("apply_config local_proxy")?; } @@ -1537,7 +1550,9 @@ impl ComputeNode { &postgresql_conf_path, "neon.disable_logical_replication_subscribers=false", )? { - info!("updated postgresql.conf to set neon.disable_logical_replication_subscribers=false"); + info!( + "updated postgresql.conf to set neon.disable_logical_replication_subscribers=false" + ); } self.pg_reload_conf()?; } @@ -1764,7 +1779,9 @@ LIMIT 100", info!("extension already downloaded, skipping re-download"); return Ok(0); } else if start_time_delta < HANG_TIMEOUT && !first_try { - info!("download {ext_archive_name} already started by another process, hanging untill completion or timeout"); + info!( + "download {ext_archive_name} already started by another process, hanging untill completion or timeout" + ); let mut interval = tokio::time::interval(tokio::time::Duration::from_millis(500)); loop { info!("waiting for download"); diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index e1bdfffa54..e8056ec7eb 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -4,11 +4,10 @@ use std::io::prelude::*; use std::path::Path; use anyhow::Result; - -use crate::pg_helpers::escape_conf_value; -use crate::pg_helpers::{GenericOptionExt, PgOptionsSerialize}; use compute_api::spec::{ComputeMode, ComputeSpec, GenericOption}; +use crate::pg_helpers::{GenericOptionExt, PgOptionsSerialize, escape_conf_value}; + /// Check that `line` is inside a text file and put it there if it is not. /// Create file if it doesn't exist. 
pub fn line_in_file(path: &Path, line: &str) -> Result { diff --git a/compute_tools/src/configurator.rs b/compute_tools/src/configurator.rs index d88f26ca20..d97bd37285 100644 --- a/compute_tools/src/configurator.rs +++ b/compute_tools/src/configurator.rs @@ -1,9 +1,8 @@ use std::sync::Arc; use std::thread; -use tracing::{error, info, instrument}; - use compute_api::responses::ComputeStatus; +use tracing::{error, info, instrument}; use crate::compute::ComputeNode; diff --git a/compute_tools/src/disk_quota.rs b/compute_tools/src/disk_quota.rs index e838c5b9fd..1353ab938d 100644 --- a/compute_tools/src/disk_quota.rs +++ b/compute_tools/src/disk_quota.rs @@ -1,9 +1,11 @@ use anyhow::Context; +use tracing::instrument; pub const DISK_QUOTA_BIN: &str = "/neonvm/bin/set-disk-quota"; /// If size_bytes is 0, it disables the quota. Otherwise, it sets filesystem quota to size_bytes. /// `fs_mountpoint` should point to the mountpoint of the filesystem where the quota should be set. +#[instrument] pub fn set_disk_quota(size_bytes: u64, fs_mountpoint: &str) -> anyhow::Result<()> { let size_kb = size_bytes / 1024; // run `/neonvm/bin/set-disk-quota {size_kb} {mountpoint}` diff --git a/compute_tools/src/extension_server.rs b/compute_tools/src/extension_server.rs index 00f46386e7..77e98359ab 100644 --- a/compute_tools/src/extension_server.rs +++ b/compute_tools/src/extension_server.rs @@ -71,15 +71,15 @@ More specifically, here is an example ext_index.json } } */ -use anyhow::Result; -use anyhow::{bail, Context}; +use std::path::Path; +use std::str; + +use anyhow::{Context, Result, bail}; use bytes::Bytes; use compute_api::spec::RemoteExtSpec; use regex::Regex; use remote_storage::*; use reqwest::StatusCode; -use std::path::Path; -use std::str; use tar::Archive; use tracing::info; use tracing::log::warn; @@ -244,7 +244,10 @@ pub fn create_control_files(remote_extensions: &RemoteExtSpec, pgbin: &str) { info!("writing file {:?}{:?}", control_path, control_content); std::fs::write(control_path, control_content).unwrap(); } else { - warn!("control file {:?} exists both locally and remotely. ignoring the remote version.", control_path); + warn!( + "control file {:?} exists both locally and remotely. ignoring the remote version.", + control_path + ); } } } diff --git a/compute_tools/src/http/extract/json.rs b/compute_tools/src/http/extract/json.rs index 104cc25d5f..1d32e4ff37 100644 --- a/compute_tools/src/http/extract/json.rs +++ b/compute_tools/src/http/extract/json.rs @@ -1,6 +1,7 @@ use std::ops::{Deref, DerefMut}; -use axum::extract::{rejection::JsonRejection, FromRequest, Request}; +use axum::extract::rejection::JsonRejection; +use axum::extract::{FromRequest, Request}; use compute_api::responses::GenericAPIError; use http::StatusCode; diff --git a/compute_tools/src/http/extract/path.rs b/compute_tools/src/http/extract/path.rs index 09637a96a4..45970cff3d 100644 --- a/compute_tools/src/http/extract/path.rs +++ b/compute_tools/src/http/extract/path.rs @@ -1,8 +1,10 @@ use std::ops::{Deref, DerefMut}; -use axum::extract::{rejection::PathRejection, FromRequestParts}; +use axum::extract::FromRequestParts; +use axum::extract::rejection::PathRejection; use compute_api::responses::GenericAPIError; -use http::{request::Parts, StatusCode}; +use http::StatusCode; +use http::request::Parts; /// Custom `Path` extractor, so that we can format errors into /// `JsonResponse`. 
diff --git a/compute_tools/src/http/extract/query.rs b/compute_tools/src/http/extract/query.rs index 9dec3642cf..b8079ea770 100644 --- a/compute_tools/src/http/extract/query.rs +++ b/compute_tools/src/http/extract/query.rs @@ -1,8 +1,10 @@ use std::ops::{Deref, DerefMut}; -use axum::extract::{rejection::QueryRejection, FromRequestParts}; +use axum::extract::FromRequestParts; +use axum::extract::rejection::QueryRejection; use compute_api::responses::GenericAPIError; -use http::{request::Parts, StatusCode}; +use http::StatusCode; +use http::request::Parts; /// Custom `Query` extractor, so that we can format errors into /// `JsonResponse`. diff --git a/compute_tools/src/http/mod.rs b/compute_tools/src/http/mod.rs index 93eb6ef5b7..d182278174 100644 --- a/compute_tools/src/http/mod.rs +++ b/compute_tools/src/http/mod.rs @@ -1,6 +1,8 @@ -use axum::{body::Body, response::Response}; +use axum::body::Body; +use axum::response::Response; use compute_api::responses::{ComputeStatus, GenericAPIError}; -use http::{header::CONTENT_TYPE, StatusCode}; +use http::StatusCode; +use http::header::CONTENT_TYPE; use serde::Serialize; use tracing::error; diff --git a/compute_tools/src/http/routes/check_writability.rs b/compute_tools/src/http/routes/check_writability.rs index d7feb055e9..5a12686fa8 100644 --- a/compute_tools/src/http/routes/check_writability.rs +++ b/compute_tools/src/http/routes/check_writability.rs @@ -1,10 +1,13 @@ use std::sync::Arc; -use axum::{extract::State, response::Response}; +use axum::extract::State; +use axum::response::Response; use compute_api::responses::ComputeStatus; use http::StatusCode; -use crate::{checker::check_writability, compute::ComputeNode, http::JsonResponse}; +use crate::checker::check_writability; +use crate::compute::ComputeNode; +use crate::http::JsonResponse; /// Check that the compute is currently running. pub(in crate::http) async fn is_writable(State(compute): State>) -> Response { diff --git a/compute_tools/src/http/routes/configure.rs b/compute_tools/src/http/routes/configure.rs index 2546cbc344..63d428fff4 100644 --- a/compute_tools/src/http/routes/configure.rs +++ b/compute_tools/src/http/routes/configure.rs @@ -1,18 +1,16 @@ use std::sync::Arc; -use axum::{extract::State, response::Response}; -use compute_api::{ - requests::ConfigurationRequest, - responses::{ComputeStatus, ComputeStatusResponse}, -}; +use axum::extract::State; +use axum::response::Response; +use compute_api::requests::ConfigurationRequest; +use compute_api::responses::{ComputeStatus, ComputeStatusResponse}; use http::StatusCode; use tokio::task; use tracing::info; -use crate::{ - compute::{ComputeNode, ParsedSpec}, - http::{extract::Json, JsonResponse}, -}; +use crate::compute::{ComputeNode, ParsedSpec}; +use crate::http::JsonResponse; +use crate::http::extract::Json; // Accept spec in JSON format and request compute configuration. If anything // goes wrong after we set the compute status to `ConfigurationPending` and @@ -47,13 +45,18 @@ pub(in crate::http) async fn configure( return JsonResponse::invalid_status(state.status); } + // Pass the tracing span to the main thread that performs the startup, + // so that the start_compute operation is considered a child of this + // configure request for tracing purposes. + state.startup_span = Some(tracing::Span::current()); + state.pspec = Some(pspec); state.set_status(ComputeStatus::ConfigurationPending, &compute.state_changed); drop(state); } // Spawn a blocking thread to wait for compute to become Running. 
This is - // needed to do not block the main pool of workers and be able to serve + // needed to not block the main pool of workers and to be able to serve // other requests while some particular request is waiting for compute to // finish configuration. let c = compute.clone(); diff --git a/compute_tools/src/http/routes/database_schema.rs b/compute_tools/src/http/routes/database_schema.rs index fd716272dc..1f6ca4b79d 100644 --- a/compute_tools/src/http/routes/database_schema.rs +++ b/compute_tools/src/http/routes/database_schema.rs @@ -1,14 +1,16 @@ use std::sync::Arc; -use axum::{body::Body, extract::State, response::Response}; -use http::{header::CONTENT_TYPE, StatusCode}; +use axum::body::Body; +use axum::extract::State; +use axum::response::Response; +use http::StatusCode; +use http::header::CONTENT_TYPE; use serde::Deserialize; -use crate::{ - catalog::{get_database_schema, SchemaDumpError}, - compute::ComputeNode, - http::{extract::Query, JsonResponse}, -}; +use crate::catalog::{SchemaDumpError, get_database_schema}; +use crate::compute::ComputeNode; +use crate::http::JsonResponse; +use crate::http::extract::Query; #[derive(Debug, Clone, Deserialize)] pub(in crate::http) struct DatabaseSchemaParams { diff --git a/compute_tools/src/http/routes/dbs_and_roles.rs b/compute_tools/src/http/routes/dbs_and_roles.rs index 4843c3fab4..790fe0dfe3 100644 --- a/compute_tools/src/http/routes/dbs_and_roles.rs +++ b/compute_tools/src/http/routes/dbs_and_roles.rs @@ -1,9 +1,12 @@ use std::sync::Arc; -use axum::{extract::State, response::Response}; +use axum::extract::State; +use axum::response::Response; use http::StatusCode; -use crate::{catalog::get_dbs_and_roles, compute::ComputeNode, http::JsonResponse}; +use crate::catalog::get_dbs_and_roles; +use crate::compute::ComputeNode; +use crate::http::JsonResponse; /// Get the databases and roles from the compute. 
pub(in crate::http) async fn get_catalog_objects( diff --git a/compute_tools/src/http/routes/extension_server.rs b/compute_tools/src/http/routes/extension_server.rs index 5cc9b6d277..b0265d1e99 100644 --- a/compute_tools/src/http/routes/extension_server.rs +++ b/compute_tools/src/http/routes/extension_server.rs @@ -1,19 +1,13 @@ use std::sync::Arc; -use axum::{ - extract::State, - response::{IntoResponse, Response}, -}; +use axum::extract::State; +use axum::response::{IntoResponse, Response}; use http::StatusCode; use serde::Deserialize; -use crate::{ - compute::ComputeNode, - http::{ - extract::{Path, Query}, - JsonResponse, - }, -}; +use crate::compute::ComputeNode; +use crate::http::JsonResponse; +use crate::http::extract::{Path, Query}; #[derive(Debug, Clone, Deserialize)] pub(in crate::http) struct ExtensionServerParams { diff --git a/compute_tools/src/http/routes/extensions.rs b/compute_tools/src/http/routes/extensions.rs index 1fc03b9109..910e1fa155 100644 --- a/compute_tools/src/http/routes/extensions.rs +++ b/compute_tools/src/http/routes/extensions.rs @@ -1,16 +1,14 @@ use std::sync::Arc; -use axum::{extract::State, response::Response}; -use compute_api::{ - requests::ExtensionInstallRequest, - responses::{ComputeStatus, ExtensionInstallResponse}, -}; +use axum::extract::State; +use axum::response::Response; +use compute_api::requests::ExtensionInstallRequest; +use compute_api::responses::{ComputeStatus, ExtensionInstallResponse}; use http::StatusCode; -use crate::{ - compute::ComputeNode, - http::{extract::Json, JsonResponse}, -}; +use crate::compute::ComputeNode; +use crate::http::JsonResponse; +use crate::http::extract::Json; /// Install a extension. pub(in crate::http) async fn install_extension( diff --git a/compute_tools/src/http/routes/failpoints.rs b/compute_tools/src/http/routes/failpoints.rs index 836417d784..8f5da99963 100644 --- a/compute_tools/src/http/routes/failpoints.rs +++ b/compute_tools/src/http/routes/failpoints.rs @@ -17,7 +17,8 @@ pub struct FailpointConfig { pub actions: String, } -use crate::http::{extract::Json, JsonResponse}; +use crate::http::JsonResponse; +use crate::http::extract::Json; /// Configure failpoints for testing purposes. pub(in crate::http) async fn configure_failpoints( diff --git a/compute_tools/src/http/routes/grants.rs b/compute_tools/src/http/routes/grants.rs index 3f67f011e5..267dcbb27e 100644 --- a/compute_tools/src/http/routes/grants.rs +++ b/compute_tools/src/http/routes/grants.rs @@ -1,16 +1,14 @@ use std::sync::Arc; -use axum::{extract::State, response::Response}; -use compute_api::{ - requests::SetRoleGrantsRequest, - responses::{ComputeStatus, SetRoleGrantsResponse}, -}; +use axum::extract::State; +use axum::response::Response; +use compute_api::requests::SetRoleGrantsRequest; +use compute_api::responses::{ComputeStatus, SetRoleGrantsResponse}; use http::StatusCode; -use crate::{ - compute::ComputeNode, - http::{extract::Json, JsonResponse}, -}; +use crate::compute::ComputeNode; +use crate::http::JsonResponse; +use crate::http::extract::Json; /// Add grants for a role. 
pub(in crate::http) async fn add_grant( diff --git a/compute_tools/src/http/routes/insights.rs b/compute_tools/src/http/routes/insights.rs index 6b03a461c3..b1ba67161e 100644 --- a/compute_tools/src/http/routes/insights.rs +++ b/compute_tools/src/http/routes/insights.rs @@ -1,10 +1,12 @@ use std::sync::Arc; -use axum::{extract::State, response::Response}; +use axum::extract::State; +use axum::response::Response; use compute_api::responses::ComputeStatus; use http::StatusCode; -use crate::{compute::ComputeNode, http::JsonResponse}; +use crate::compute::ComputeNode; +use crate::http::JsonResponse; /// Collect current Postgres usage insights. pub(in crate::http) async fn get_insights(State(compute): State>) -> Response { diff --git a/compute_tools/src/http/routes/metrics.rs b/compute_tools/src/http/routes/metrics.rs index 13150a7588..da8d8b20a5 100644 --- a/compute_tools/src/http/routes/metrics.rs +++ b/compute_tools/src/http/routes/metrics.rs @@ -1,10 +1,12 @@ -use axum::{body::Body, response::Response}; -use http::header::CONTENT_TYPE; +use axum::body::Body; +use axum::response::Response; use http::StatusCode; +use http::header::CONTENT_TYPE; use metrics::proto::MetricFamily; use metrics::{Encoder, TextEncoder}; -use crate::{http::JsonResponse, metrics::collect}; +use crate::http::JsonResponse; +use crate::metrics::collect; /// Expose Prometheus metrics. pub(in crate::http) async fn get_metrics() -> Response { diff --git a/compute_tools/src/http/routes/metrics_json.rs b/compute_tools/src/http/routes/metrics_json.rs index 0709db5011..bc35ee2645 100644 --- a/compute_tools/src/http/routes/metrics_json.rs +++ b/compute_tools/src/http/routes/metrics_json.rs @@ -1,9 +1,11 @@ use std::sync::Arc; -use axum::{extract::State, response::Response}; +use axum::extract::State; +use axum::response::Response; use http::StatusCode; -use crate::{compute::ComputeNode, http::JsonResponse}; +use crate::compute::ComputeNode; +use crate::http::JsonResponse; /// Get startup metrics. pub(in crate::http) async fn get_metrics(State(compute): State>) -> Response { diff --git a/compute_tools/src/http/routes/status.rs b/compute_tools/src/http/routes/status.rs index d64d53a58f..8ed1299d6b 100644 --- a/compute_tools/src/http/routes/status.rs +++ b/compute_tools/src/http/routes/status.rs @@ -1,9 +1,13 @@ -use std::{ops::Deref, sync::Arc}; +use std::ops::Deref; +use std::sync::Arc; -use axum::{extract::State, http::StatusCode, response::Response}; +use axum::extract::State; +use axum::http::StatusCode; +use axum::response::Response; use compute_api::responses::ComputeStatusResponse; -use crate::{compute::ComputeNode, http::JsonResponse}; +use crate::compute::ComputeNode; +use crate::http::JsonResponse; /// Retrieve the state of the comute. 
pub(in crate::http) async fn get_status(State(compute): State>) -> Response { diff --git a/compute_tools/src/http/routes/terminate.rs b/compute_tools/src/http/routes/terminate.rs index 7acd84f236..2c24d4ad6b 100644 --- a/compute_tools/src/http/routes/terminate.rs +++ b/compute_tools/src/http/routes/terminate.rs @@ -1,18 +1,14 @@ use std::sync::Arc; -use axum::{ - extract::State, - response::{IntoResponse, Response}, -}; +use axum::extract::State; +use axum::response::{IntoResponse, Response}; use compute_api::responses::ComputeStatus; use http::StatusCode; use tokio::task; use tracing::info; -use crate::{ - compute::{forward_termination_signal, ComputeNode}, - http::JsonResponse, -}; +use crate::compute::{ComputeNode, forward_termination_signal}; +use crate::http::JsonResponse; /// Terminate the compute. pub(in crate::http) async fn terminate(State(compute): State>) -> Response { diff --git a/compute_tools/src/http/server.rs b/compute_tools/src/http/server.rs index a523ecd96f..7283401bb5 100644 --- a/compute_tools/src/http/server.rs +++ b/compute_tools/src/http/server.rs @@ -1,23 +1,20 @@ -use std::{ - fmt::Display, - net::{IpAddr, Ipv6Addr, SocketAddr}, - sync::Arc, - time::Duration, -}; +use std::fmt::Display; +use std::net::{IpAddr, Ipv6Addr, SocketAddr}; +use std::sync::Arc; +use std::time::Duration; use anyhow::Result; -use axum::{ - extract::Request, - middleware::{self, Next}, - response::{IntoResponse, Response}, - routing::{get, post}, - Router, -}; +use axum::Router; +use axum::extract::Request; +use axum::middleware::{self, Next}; +use axum::response::{IntoResponse, Response}; +use axum::routing::{get, post}; use http::StatusCode; use tokio::net::TcpListener; use tower::ServiceBuilder; -use tower_http::{request_id::PropagateRequestIdLayer, trace::TraceLayer}; -use tracing::{debug, error, info, Span}; +use tower_http::request_id::PropagateRequestIdLayer; +use tower_http::trace::TraceLayer; +use tracing::{Span, debug, error, info}; use uuid::Uuid; use super::routes::{ @@ -124,6 +121,7 @@ impl From for Router> { ) .layer(PropagateRequestIdLayer::x_request_id()), ) + .layer(tower_otel::trace::HttpLayer::server(tracing::Level::INFO)) } } diff --git a/compute_tools/src/installed_extensions.rs b/compute_tools/src/installed_extensions.rs index 173dbf40b0..6921505466 100644 --- a/compute_tools/src/installed_extensions.rs +++ b/compute_tools/src/installed_extensions.rs @@ -1,7 +1,7 @@ -use compute_api::responses::{InstalledExtension, InstalledExtensions}; use std::collections::HashMap; use anyhow::Result; +use compute_api::responses::{InstalledExtension, InstalledExtensions}; use postgres::{Client, NoTls}; use crate::metrics::INSTALLED_EXTENSIONS; diff --git a/compute_tools/src/lsn_lease.rs b/compute_tools/src/lsn_lease.rs index 3061d387a5..b4ec675ff4 100644 --- a/compute_tools/src/lsn_lease.rs +++ b/compute_tools/src/lsn_lease.rs @@ -1,17 +1,15 @@ -use anyhow::bail; -use anyhow::Result; -use postgres::{NoTls, SimpleQueryMessage}; -use std::time::SystemTime; -use std::{str::FromStr, sync::Arc, thread, time::Duration}; -use utils::id::TenantId; -use utils::id::TimelineId; +use std::str::FromStr; +use std::sync::Arc; +use std::thread; +use std::time::{Duration, SystemTime}; +use anyhow::{Result, bail}; use compute_api::spec::ComputeMode; +use postgres::{NoTls, SimpleQueryMessage}; use tracing::{info, warn}; -use utils::{ - lsn::Lsn, - shard::{ShardCount, ShardNumber, TenantShardId}, -}; +use utils::id::{TenantId, TimelineId}; +use utils::lsn::Lsn; +use utils::shard::{ShardCount, 
ShardNumber, TenantShardId}; use crate::compute::ComputeNode; diff --git a/compute_tools/src/metrics.rs b/compute_tools/src/metrics.rs index 870b294d08..bc96e5074c 100644 --- a/compute_tools/src/metrics.rs +++ b/compute_tools/src/metrics.rs @@ -1,6 +1,6 @@ use metrics::core::Collector; use metrics::proto::MetricFamily; -use metrics::{register_int_counter_vec, register_uint_gauge_vec, IntCounterVec, UIntGaugeVec}; +use metrics::{IntCounterVec, UIntGaugeVec, register_int_counter_vec, register_uint_gauge_vec}; use once_cell::sync::Lazy; pub(crate) static INSTALLED_EXTENSIONS: Lazy = Lazy::new(|| { diff --git a/compute_tools/src/monitor.rs b/compute_tools/src/monitor.rs index 184f380a8d..248505e473 100644 --- a/compute_tools/src/monitor.rs +++ b/compute_tools/src/monitor.rs @@ -1,13 +1,14 @@ use std::sync::Arc; -use std::{thread, time::Duration}; +use std::thread; +use std::time::Duration; use chrono::{DateTime, Utc}; +use compute_api::responses::ComputeStatus; +use compute_api::spec::ComputeFeature; use postgres::{Client, NoTls}; use tracing::{debug, error, info, warn}; use crate::compute::ComputeNode; -use compute_api::responses::ComputeStatus; -use compute_api::spec::ComputeFeature; const MONITOR_CHECK_INTERVAL: Duration = Duration::from_millis(500); diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index 86fcf99085..5a2e305e1d 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -9,7 +9,8 @@ use std::process::Child; use std::str::FromStr; use std::time::{Duration, Instant}; -use anyhow::{bail, Result}; +use anyhow::{Result, bail}; +use compute_api::spec::{Database, GenericOption, GenericOptions, PgIdent, Role}; use futures::StreamExt; use ini::Ini; use notify::{RecursiveMode, Watcher}; @@ -21,8 +22,6 @@ use tokio_postgres; use tokio_postgres::NoTls; use tracing::{debug, error, info, instrument}; -use compute_api::spec::{Database, GenericOption, GenericOptions, PgIdent, Role}; - const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // milliseconds /// Escape a string for including it in a SQL literal. diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 6f28bd9733..1d19f2738d 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -1,20 +1,20 @@ -use anyhow::{anyhow, bail, Result}; -use reqwest::StatusCode; use std::fs::File; use std::path::Path; -use tokio_postgres::Client; -use tracing::{error, info, instrument, warn}; - -use crate::config; -use crate::metrics::{CPlaneRequestRPC, CPLANE_REQUESTS_TOTAL, UNKNOWN_HTTP_STATUS}; -use crate::migration::MigrationRunner; -use crate::params::PG_HBA_ALL_MD5; -use crate::pg_helpers::*; +use anyhow::{Result, anyhow, bail}; use compute_api::responses::{ ComputeCtlConfig, ControlPlaneComputeStatus, ControlPlaneSpecResponse, }; use compute_api::spec::ComputeSpec; +use reqwest::StatusCode; +use tokio_postgres::Client; +use tracing::{error, info, instrument, warn}; + +use crate::config; +use crate::metrics::{CPLANE_REQUESTS_TOTAL, CPlaneRequestRPC, UNKNOWN_HTTP_STATUS}; +use crate::migration::MigrationRunner; +use crate::params::PG_HBA_ALL_MD5; +use crate::pg_helpers::*; // Do control plane request and return response if any. In case of error it // returns a bool flag indicating whether it makes sense to retry the request @@ -141,7 +141,6 @@ pub fn get_spec_from_control_plane( /// Check `pg_hba.conf` and update if needed to allow external connections. 
pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> { // XXX: consider making it a part of spec.json - info!("checking pg_hba.conf"); let pghba_path = pgdata_path.join("pg_hba.conf"); if config::line_in_file(&pghba_path, PG_HBA_ALL_MD5)? { @@ -156,12 +155,11 @@ pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> { /// Create a standby.signal file pub fn add_standby_signal(pgdata_path: &Path) -> Result<()> { // XXX: consider making it a part of spec.json - info!("adding standby.signal"); let signalfile = pgdata_path.join("standby.signal"); if !signalfile.exists() { - info!("created standby.signal"); File::create(signalfile)?; + info!("created standby.signal"); } else { info!("reused pre-existing standby.signal"); } @@ -170,7 +168,6 @@ pub fn add_standby_signal(pgdata_path: &Path) -> Result<()> { #[instrument(skip_all)] pub async fn handle_neon_extension_upgrade(client: &mut Client) -> Result<()> { - info!("handle neon extension upgrade"); let query = "ALTER EXTENSION neon UPDATE"; info!("update neon extension version with query: {}", query); client.simple_query(query).await?; diff --git a/compute_tools/src/spec_apply.rs b/compute_tools/src/spec_apply.rs index 5ee9c5fbd8..b4e084fd91 100644 --- a/compute_tools/src/spec_apply.rs +++ b/compute_tools/src/spec_apply.rs @@ -1,18 +1,18 @@ use std::collections::{HashMap, HashSet}; use std::fmt::{Debug, Formatter}; use std::future::Future; -use std::iter::empty; -use std::iter::once; +use std::iter::{empty, once}; use std::sync::Arc; -use crate::compute::construct_superuser_query; -use crate::pg_helpers::{escape_literal, DatabaseExt, Escaping, GenericOptionsSearch, RoleExt}; -use anyhow::{bail, Result}; +use anyhow::Result; use compute_api::spec::{ComputeFeature, ComputeSpec, Database, PgIdent, Role}; use futures::future::join_all; use tokio::sync::RwLock; use tokio_postgres::Client; -use tracing::{debug, info_span, Instrument}; +use tracing::{Instrument, debug, info_span, warn}; + +use crate::compute::construct_superuser_query; +use crate::pg_helpers::{DatabaseExt, Escaping, GenericOptionsSearch, RoleExt, escape_literal}; #[derive(Clone)] pub enum DB { @@ -47,6 +47,11 @@ pub enum PerDatabasePhase { DeleteDBRoleReferences, ChangeSchemaPerms, HandleAnonExtension, + /// This is a shared phase, used for both i) dropping dangling LR subscriptions + /// before dropping the DB, and ii) dropping all subscriptions after creating + /// a fresh branch. + /// N.B. we will skip all DBs that are not present in Postgres, invalid, or + /// have `datallowconn = false` (`restrict_conn`). DropLogicalSubscriptions, } @@ -168,7 +173,7 @@ where /// /// In the future we may generate a single stream of changes and then /// sort/merge/batch execution, but for now this is a nice way to improve -/// batching behaviour of the commands. +/// batching behavior of the commands. async fn get_operations<'a>( spec: &'a ComputeSpec, ctx: &'a RwLock, @@ -451,6 +456,41 @@ async fn get_operations<'a>( )), }))), ApplySpecPhase::RunInEachDatabase { db, subphase } => { + // Do some checks that user DB exists and we can access it. + // + // During the phases like DropLogicalSubscriptions, DeleteDBRoleReferences, + // which happen before dropping the DB, the current run could be a retry, + // so it's a valid case when DB is absent already. The case of + // `pg_database.datallowconn = false`/`restrict_conn` is a bit tricky, as + // in theory user can have some dangling objects there, so we will fail at + // the actual drop later. 
Yet, to fix that in the current code we would need + // to ALTER DATABASE, and then check back, but that even more invasive, so + // that's not what we really want to do here. + // + // For ChangeSchemaPerms, skipping DBs we cannot access is totally fine. + if let DB::UserDB(db) = db { + let databases = &ctx.read().await.dbs; + + let edb = match databases.get(&db.name) { + Some(edb) => edb, + None => { + warn!( + "skipping RunInEachDatabase phase {:?}, database {} doesn't exist in PostgreSQL", + subphase, db.name + ); + return Ok(Box::new(empty())); + } + }; + + if edb.restrict_conn || edb.invalid { + warn!( + "skipping RunInEachDatabase phase {:?}, database {} is (restrict_conn={}, invalid={})", + subphase, db.name, edb.restrict_conn, edb.invalid + ); + return Ok(Box::new(empty())); + } + } + match subphase { PerDatabasePhase::DropLogicalSubscriptions => { match &db { @@ -530,25 +570,12 @@ async fn get_operations<'a>( Ok(Box::new(operations)) } PerDatabasePhase::ChangeSchemaPerms => { - let ctx = ctx.read().await; - let databases = &ctx.dbs; - let db = match &db { // ignore schema permissions on the system database DB::SystemDB => return Ok(Box::new(empty())), DB::UserDB(db) => db, }; - if databases.get(&db.name).is_none() { - bail!("database {} doesn't exist in PostgreSQL", db.name); - } - - let edb = databases.get(&db.name).unwrap(); - - if edb.restrict_conn || edb.invalid { - return Ok(Box::new(empty())); - } - let operations = vec![ Operation { query: format!( @@ -566,6 +593,7 @@ async fn get_operations<'a>( Ok(Box::new(operations)) } + // TODO: remove this completely https://github.com/neondatabase/cloud/issues/22663 PerDatabasePhase::HandleAnonExtension => { // Only install Anon into user databases let db = match &db { diff --git a/compute_tools/src/sql/drop_subscriptions.sql b/compute_tools/src/sql/drop_subscriptions.sql index dfb925e48e..03e8e158fa 100644 --- a/compute_tools/src/sql/drop_subscriptions.sql +++ b/compute_tools/src/sql/drop_subscriptions.sql @@ -2,6 +2,7 @@ DO $$ DECLARE subname TEXT; BEGIN + LOCK TABLE pg_subscription IN ACCESS EXCLUSIVE MODE; FOR subname IN SELECT pg_subscription.subname FROM pg_subscription WHERE subdbid = (SELECT oid FROM pg_database WHERE datname = {datname_str}) LOOP EXECUTE format('ALTER SUBSCRIPTION %I DISABLE;', subname); EXECUTE format('ALTER SUBSCRIPTION %I SET (slot_name = NONE);', subname); diff --git a/compute_tools/src/swap.rs b/compute_tools/src/swap.rs index 024c5b338e..ed27a7cba4 100644 --- a/compute_tools/src/swap.rs +++ b/compute_tools/src/swap.rs @@ -1,10 +1,11 @@ use std::path::Path; -use anyhow::{anyhow, Context}; -use tracing::warn; +use anyhow::{Context, anyhow}; +use tracing::{instrument, warn}; pub const RESIZE_SWAP_BIN: &str = "/neonvm/bin/resize-swap"; +#[instrument] pub fn resize_swap(size_bytes: u64) -> anyhow::Result<()> { // run `/neonvm/bin/resize-swap --once {size_bytes}` // diff --git a/compute_tools/tests/config_test.rs b/compute_tools/tests/config_test.rs index 9ab16b1930..7b2bff23d5 100644 --- a/compute_tools/tests/config_test.rs +++ b/compute_tools/tests/config_test.rs @@ -1,7 +1,7 @@ #[cfg(test)] mod config_tests { - use std::fs::{remove_file, File}; + use std::fs::{File, remove_file}; use std::io::{Read, Write}; use std::path::Path; diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 02d793400a..7d908ccae9 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -887,20 +887,6 @@ fn print_timeline( Ok(()) } -/// Returns a map of 
timeline IDs to timeline_id@lsn strings. -/// Connects to the pageserver to query this information. -async fn get_timeline_infos( - env: &local_env::LocalEnv, - tenant_shard_id: &TenantShardId, -) -> Result> { - Ok(get_default_pageserver(env) - .timeline_list(tenant_shard_id) - .await? - .into_iter() - .map(|timeline_info| (timeline_info.timeline_id, timeline_info)) - .collect()) -} - /// Helper function to get tenant id from an optional --tenant_id option or from the config file fn get_tenant_id( tenant_id_arg: Option, @@ -1251,12 +1237,6 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res // TODO(sharding): this command shouldn't have to specify a shard ID: we should ask the storage controller // where shard 0 is attached, and query there. let tenant_shard_id = get_tenant_shard_id(args.tenant_shard_id, env)?; - let timeline_infos = get_timeline_infos(env, &tenant_shard_id) - .await - .unwrap_or_else(|e| { - eprintln!("Failed to load timeline info: {}", e); - HashMap::new() - }); let timeline_name_mappings = env.timeline_name_mappings(); @@ -1285,12 +1265,9 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res lsn.to_string() } _ => { - // -> primary endpoint or hot replica - // Use the LSN at the end of the timeline. - timeline_infos - .get(&endpoint.timeline_id) - .map(|bi| bi.last_record_lsn.to_string()) - .unwrap_or_else(|| "?".to_string()) + // As the LSN here refers to the one that the compute is started with, + // we display nothing as it is a primary/hot standby compute. + "---".to_string() } }; diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index c3c8229c38..407578abb8 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -46,6 +46,8 @@ use std::process::Command; use std::str::FromStr; use std::sync::Arc; use std::time::Duration; +use std::time::SystemTime; +use std::time::UNIX_EPOCH; use anyhow::{anyhow, bail, Context, Result}; use compute_api::requests::ConfigurationRequest; @@ -59,6 +61,7 @@ use nix::sys::signal::Signal; use pageserver_api::shard::ShardStripeSize; use reqwest::header::CONTENT_TYPE; use serde::{Deserialize, Serialize}; +use tracing::debug; use url::Host; use utils::id::{NodeId, TenantId, TimelineId}; @@ -81,8 +84,10 @@ pub struct EndpointConf { internal_http_port: u16, pg_version: u32, skip_pg_catalog_updates: bool, + reconfigure_concurrency: usize, drop_subscriptions_before_start: bool, features: Vec, + cluster: Option, } // @@ -179,7 +184,9 @@ impl ComputeControlPlane { // we also skip catalog updates in the cloud. 
skip_pg_catalog_updates, drop_subscriptions_before_start, + reconfigure_concurrency: 1, features: vec![], + cluster: None, }); ep.create_endpoint_dir()?; @@ -196,7 +203,9 @@ impl ComputeControlPlane { pg_version, skip_pg_catalog_updates, drop_subscriptions_before_start, + reconfigure_concurrency: 1, features: vec![], + cluster: None, })?, )?; std::fs::write( @@ -261,8 +270,11 @@ pub struct Endpoint { skip_pg_catalog_updates: bool, drop_subscriptions_before_start: bool, + reconfigure_concurrency: usize, // Feature flags features: Vec, + // Cluster settings + cluster: Option, } #[derive(PartialEq, Eq)] @@ -302,6 +314,8 @@ impl Endpoint { let conf: EndpointConf = serde_json::from_slice(&std::fs::read(entry.path().join("endpoint.json"))?)?; + debug!("serialized endpoint conf: {:?}", conf); + Ok(Endpoint { pg_address: SocketAddr::new(IpAddr::from(Ipv4Addr::LOCALHOST), conf.pg_port), external_http_address: SocketAddr::new( @@ -319,8 +333,10 @@ impl Endpoint { tenant_id: conf.tenant_id, pg_version: conf.pg_version, skip_pg_catalog_updates: conf.skip_pg_catalog_updates, + reconfigure_concurrency: conf.reconfigure_concurrency, drop_subscriptions_before_start: conf.drop_subscriptions_before_start, features: conf.features, + cluster: conf.cluster, }) } @@ -607,7 +623,7 @@ impl Endpoint { }; // Create spec file - let spec = ComputeSpec { + let mut spec = ComputeSpec { skip_pg_catalog_updates: self.skip_pg_catalog_updates, format_version: 1.0, operation_uuid: None, @@ -640,7 +656,7 @@ impl Endpoint { Vec::new() }, settings: None, - postgresql_conf: Some(postgresql_conf), + postgresql_conf: Some(postgresql_conf.clone()), }, delta_operations: None, tenant_id: Some(self.tenant_id), @@ -653,9 +669,35 @@ impl Endpoint { pgbouncer_settings: None, shard_stripe_size: Some(shard_stripe_size), local_proxy_config: None, - reconfigure_concurrency: 1, + reconfigure_concurrency: self.reconfigure_concurrency, drop_subscriptions_before_start: self.drop_subscriptions_before_start, }; + + // this strange code is needed to support respec() in tests + if self.cluster.is_some() { + debug!("Cluster is already set in the endpoint spec, using it"); + spec.cluster = self.cluster.clone().unwrap(); + + debug!("spec.cluster {:?}", spec.cluster); + + // fill missing fields again + if create_test_user { + spec.cluster.roles.push(Role { + name: PgIdent::from_str("test").unwrap(), + encrypted_password: None, + options: None, + }); + spec.cluster.databases.push(Database { + name: PgIdent::from_str("neondb").unwrap(), + owner: PgIdent::from_str("test").unwrap(), + options: None, + restrict_conn: false, + invalid: false, + }); + } + spec.cluster.postgresql_conf = Some(postgresql_conf); + } + let spec_path = self.endpoint_path().join("spec.json"); std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?; @@ -673,18 +715,14 @@ impl Endpoint { println!("Also at '{}'", conn_str); } let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl")); - //cmd.args([ - // "--external-http-port", - // &self.external_http_address.port().to_string(), - //]) - //.args([ - // "--internal-http-port", - // &self.internal_http_address.port().to_string(), - //]) cmd.args([ - "--http-port", + "--external-http-port", &self.external_http_address.port().to_string(), ]) + .args([ + "--internal-http-port", + &self.internal_http_address.port().to_string(), + ]) .args(["--pgdata", self.pgdata().to_str().unwrap()]) .args(["--connstr", &conn_str]) .args([ @@ -701,20 +739,16 @@ impl Endpoint { ]) // TODO: It would be nice if we generated compute IDs 
with the same // algorithm as the real control plane. - // - // TODO: Add this back when - // https://github.com/neondatabase/neon/pull/10747 is merged. - // - //.args([ - // "--compute-id", - // &format!( - // "compute-{}", - // SystemTime::now() - // .duration_since(UNIX_EPOCH) - // .unwrap() - // .as_secs() - // ), - //]) + .args([ + "--compute-id", + &format!( + "compute-{}", + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_secs() + ), + ]) .stdin(std::process::Stdio::null()) .stderr(logfile.try_clone()?) .stdout(logfile); diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 28d130d9e0..2bf89b7bfa 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -335,13 +335,21 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'checkpoint_distance' as an integer")?, - checkpoint_timeout: settings.remove("checkpoint_timeout").map(|x| x.to_string()), + checkpoint_timeout: settings + .remove("checkpoint_timeout") + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'checkpoint_timeout' as duration")?, compaction_target_size: settings .remove("compaction_target_size") .map(|x| x.parse::()) .transpose() .context("Failed to parse 'compaction_target_size' as an integer")?, - compaction_period: settings.remove("compaction_period").map(|x| x.to_string()), + compaction_period: settings + .remove("compaction_period") + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'compaction_period' as duration")?, compaction_threshold: settings .remove("compaction_threshold") .map(|x| x.parse::()) @@ -387,7 +395,10 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'gc_horizon' as an integer")?, - gc_period: settings.remove("gc_period").map(|x| x.to_string()), + gc_period: settings.remove("gc_period") + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'gc_period' as duration")?, image_creation_threshold: settings .remove("image_creation_threshold") .map(|x| x.parse::()) @@ -403,13 +414,20 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'image_creation_preempt_threshold' as integer")?, - pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()), + pitr_interval: settings.remove("pitr_interval") + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'pitr_interval' as duration")?, walreceiver_connect_timeout: settings .remove("walreceiver_connect_timeout") - .map(|x| x.to_string()), + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'walreceiver_connect_timeout' as duration")?, lagging_wal_timeout: settings .remove("lagging_wal_timeout") - .map(|x| x.to_string()), + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'lagging_wal_timeout' as duration")?, max_lsn_wal_lag: settings .remove("max_lsn_wal_lag") .map(|x| x.parse::()) @@ -427,8 +445,14 @@ impl PageServerNode { .context("Failed to parse 'min_resident_size_override' as integer")?, evictions_low_residence_duration_metric_threshold: settings .remove("evictions_low_residence_duration_metric_threshold") - .map(|x| x.to_string()), - heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()), + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'evictions_low_residence_duration_metric_threshold' as duration")?, + heatmap_period: settings + .remove("heatmap_period") + 
.map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'heatmap_period' as duration")?, lazy_slru_download: settings .remove("lazy_slru_download") .map(|x| x.parse::()) @@ -439,10 +463,15 @@ impl PageServerNode { .map(serde_json::from_str) .transpose() .context("parse `timeline_get_throttle` from json")?, - lsn_lease_length: settings.remove("lsn_lease_length").map(|x| x.to_string()), + lsn_lease_length: settings.remove("lsn_lease_length") + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'lsn_lease_length' as duration")?, lsn_lease_length_for_ts: settings .remove("lsn_lease_length_for_ts") - .map(|x| x.to_string()), + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'lsn_lease_length_for_ts' as duration")?, timeline_offloading: settings .remove("timeline_offloading") .map(|x| x.parse::()) diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index 3c574efc63..40b86e4110 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -47,6 +47,9 @@ enum Command { listen_http_addr: String, #[arg(long)] listen_http_port: u16, + #[arg(long)] + listen_https_port: Option, + #[arg(long)] availability_zone_id: String, }, @@ -394,6 +397,7 @@ async fn main() -> anyhow::Result<()> { listen_pg_port, listen_http_addr, listen_http_port, + listen_https_port, availability_zone_id, } => { storcon_client @@ -406,6 +410,7 @@ async fn main() -> anyhow::Result<()> { listen_pg_port, listen_http_addr, listen_http_port, + listen_https_port, availability_zone_id: AvailabilityZone(availability_zone_id), }), ) @@ -954,7 +959,7 @@ async fn main() -> anyhow::Result<()> { threshold: threshold.into(), }, )), - heatmap_period: Some("300s".to_string()), + heatmap_period: Some(Duration::from_secs(300)), ..Default::default() }, }) diff --git a/docker-compose/compute_wrapper/shell/compute.sh b/docker-compose/compute_wrapper/shell/compute.sh index b4f8d3d66a..9dbdcce69f 100755 --- a/docker-compose/compute_wrapper/shell/compute.sh +++ b/docker-compose/compute_wrapper/shell/compute.sh @@ -77,4 +77,5 @@ echo "Start compute node" /usr/local/bin/compute_ctl --pgdata /var/db/postgres/compute \ -C "postgresql://cloud_admin@localhost:55433/postgres" \ -b /usr/local/bin/postgres \ + --compute-id "compute-$RANDOM" \ -S ${SPEC_FILE} diff --git a/docker-compose/docker-compose.yml b/docker-compose/docker-compose.yml index 489d60f38c..95d4ff7b2a 100644 --- a/docker-compose/docker-compose.yml +++ b/docker-compose/docker-compose.yml @@ -186,7 +186,7 @@ services: neon-test-extensions: profiles: ["test-extensions"] - image: ${REPOSITORY:-neondatabase}/neon-test-extensions-v${PG_TEST_VERSION:-16}:${TAG:-latest} + image: ${REPOSITORY:-neondatabase}/neon-test-extensions-v${PG_TEST_VERSION:-16}:${TEST_EXTENSIONS_TAG:-${TAG:-latest}} environment: - PGPASSWORD=cloud_admin entrypoint: diff --git a/docker-compose/docker_compose_test.sh b/docker-compose/docker_compose_test.sh index dd520d4986..0f03d600a3 100755 --- a/docker-compose/docker_compose_test.sh +++ b/docker-compose/docker_compose_test.sh @@ -51,8 +51,6 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do done if [ $pg_version -ge 16 ]; then - docker cp ext-src $TEST_CONTAINER_NAME:/ - docker exec $TEST_CONTAINER_NAME bash -c "apt update && apt install -y libtap-parser-sourcehandler-pgtap-perl" # This is required for the pg_hint_plan test, to prevent flaky log message causing the test to fail # It cannot be moved to Dockerfile now because the 
database directory is created after the start of the container echo Adding dummy config @@ -81,15 +79,8 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do [ $EXT_SUCCESS -eq 0 ] && FAILED=$(tail -1 testout.txt | awk '{for(i=1;i<=NF;i++){print "/ext-src/"$i;}}') [ $CONTRIB_SUCCESS -eq 0 ] && CONTRIB_FAILED=$(tail -1 testout_contrib.txt | awk '{for(i=0;i<=NF;i++){print "/postgres/contrib/"$i;}}') for d in $FAILED $CONTRIB_FAILED; do - dn="$(basename $d)" - rm -rf $dn - mkdir $dn - docker cp $TEST_CONTAINER_NAME:$d/regression.diffs $dn || [ $? -eq 1 ] - docker cp $TEST_CONTAINER_NAME:$d/regression.out $dn || [ $? -eq 1 ] - cat $dn/regression.out $dn/regression.diffs || true - rm -rf $dn + docker exec $TEST_CONTAINER_NAME bash -c 'for file in $(find '"$d"' -name regression.diffs -o -name regression.out); do cat $file; done' || [ $? -eq 1 ] done - rm -rf $FAILED exit 1 fi fi diff --git a/docker-compose/ext-src/pg_repack-src/test-upgrade.sh b/docker-compose/ext-src/pg_repack-src/test-upgrade.sh new file mode 100755 index 0000000000..5021eb4027 --- /dev/null +++ b/docker-compose/ext-src/pg_repack-src/test-upgrade.sh @@ -0,0 +1,5 @@ +#!/bin/sh +set -ex +cd "$(dirname ${0})" +PG_REGRESS=$(dirname "$(pg_config --pgxs)")/../test/regress/pg_regress +${PG_REGRESS} --use-existing --inputdir=./regress --bindir='/usr/local/pgsql/bin' --dbname=contrib_regression repack-setup repack-run error-on-invalid-idx no-error-on-invalid-idx after-schema repack-check nosuper get_order_by trigger diff --git a/docker-compose/ext-src/pg_semver-src/test-upgrade-16.patch b/docker-compose/ext-src/pg_semver-src/test-upgrade-v16.patch similarity index 100% rename from docker-compose/ext-src/pg_semver-src/test-upgrade-16.patch rename to docker-compose/ext-src/pg_semver-src/test-upgrade-v16.patch diff --git a/docker-compose/ext-src/pg_semver-src/test-upgrade-17.patch b/docker-compose/ext-src/pg_semver-src/test-upgrade-v17.patch similarity index 100% rename from docker-compose/ext-src/pg_semver-src/test-upgrade-17.patch rename to docker-compose/ext-src/pg_semver-src/test-upgrade-v17.patch diff --git a/docker-compose/test_extensions_upgrade.sh b/docker-compose/test_extensions_upgrade.sh index 4a9024569b..c399109eb9 100755 --- a/docker-compose/test_extensions_upgrade.sh +++ b/docker-compose/test_extensions_upgrade.sh @@ -6,8 +6,8 @@ generate_id() { local -n resvar=$1 printf -v resvar '%08x%08x%08x%08x' $SRANDOM $SRANDOM $SRANDOM $SRANDOM } -if [ -z ${OLDTAG+x} ] || [ -z ${NEWTAG+x} ] || [ -z "${OLDTAG}" ] || [ -z "${NEWTAG}" ]; then - echo OLDTAG and NEWTAG must be defined +if [ -z ${OLD_COMPUTE_TAG+x} ] || [ -z ${NEW_COMPUTE_TAG+x} ] || [ -z "${OLD_COMPUTE_TAG}" ] || [ -z "${NEW_COMPUTE_TAG}" ]; then + echo OLD_COMPUTE_TAG and NEW_COMPUTE_TAG must be defined exit 1 fi export PG_VERSION=${PG_VERSION:-16} @@ -43,10 +43,11 @@ EXTENSIONS='[ {"extname": "semver", "extdir": "pg_semver-src"}, {"extname": "pg_ivm", "extdir": "pg_ivm-src"}, {"extname": "pgjwt", "extdir": "pgjwt-src"}, -{"extname": "pgtap", "extdir": "pgtap-src"} +{"extname": "pgtap", "extdir": "pgtap-src"}, +{"extname": "pg_repack", "extdir": "pg_repack-src"} ]' EXTNAMES=$(echo ${EXTENSIONS} | jq -r '.[].extname' | paste -sd ' ' -) -TAG=${NEWTAG} docker compose --profile test-extensions up --quiet-pull --build -d +COMPUTE_TAG=${NEW_COMPUTE_TAG} docker compose --profile test-extensions up --quiet-pull --build -d wait_for_ready docker compose exec neon-test-extensions psql -c "DROP DATABASE IF EXISTS contrib_regression" docker compose exec neon-test-extensions psql 
-c "CREATE DATABASE contrib_regression" @@ -54,9 +55,8 @@ create_extensions "${EXTNAMES}" query="select json_object_agg(extname,extversion) from pg_extension where extname in ('${EXTNAMES// /\',\'}')" new_vers=$(docker compose exec neon-test-extensions psql -Aqt -d contrib_regression -c "$query") docker compose --profile test-extensions down -TAG=${OLDTAG} docker compose --profile test-extensions up --quiet-pull --build -d --force-recreate +COMPUTE_TAG=${OLD_COMPUTE_TAG} docker compose --profile test-extensions up --quiet-pull --build -d --force-recreate wait_for_ready -docker compose cp ext-src neon-test-extensions:/ docker compose exec neon-test-extensions psql -c "DROP DATABASE IF EXISTS contrib_regression" docker compose exec neon-test-extensions psql -c "CREATE DATABASE contrib_regression" docker compose exec neon-test-extensions psql -c "CREATE DATABASE pgtap_regression" @@ -86,8 +86,8 @@ else ) result=$(curl "${PARAMS[@]}") echo $result | jq . - TENANT_ID=${tenant_id} TIMELINE_ID=${new_timeline_id} TAG=${OLDTAG} docker compose down compute compute_is_ready - COMPUTE_TAG=${NEWTAG} TAG=${OLDTAG} TENANT_ID=${tenant_id} TIMELINE_ID=${new_timeline_id} docker compose up --quiet-pull -d --build compute compute_is_ready + TENANT_ID=${tenant_id} TIMELINE_ID=${new_timeline_id} COMPUTE_TAG=${OLD_COMPUTE_TAG} docker compose down compute compute_is_ready + COMPUTE_TAG=${NEW_COMPUTE_TAG} TENANT_ID=${tenant_id} TIMELINE_ID=${new_timeline_id} docker compose up --quiet-pull -d --build compute compute_is_ready wait_for_ready TID=$(docker compose exec neon-test-extensions psql -Aqt -c "SHOW neon.timeline_id") if [ ${TID} != ${new_timeline_id} ]; then diff --git a/libs/compute_api/Cargo.toml b/libs/compute_api/Cargo.toml index c11a1b6688..0d1618c1b2 100644 --- a/libs/compute_api/Cargo.toml +++ b/libs/compute_api/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "compute_api" version = "0.1.0" -edition.workspace = true +edition = "2024" license.workspace = true [dependencies] diff --git a/libs/compute_api/src/requests.rs b/libs/compute_api/src/requests.rs index 0c256cae2e..3fbdfcf83f 100644 --- a/libs/compute_api/src/requests.rs +++ b/libs/compute_api/src/requests.rs @@ -1,11 +1,10 @@ //! Structs representing the JSON formats used in the compute_ctl's HTTP API. -use crate::{ - privilege::Privilege, - responses::ComputeCtlConfig, - spec::{ComputeSpec, ExtVersion, PgIdent}, -}; use serde::{Deserialize, Serialize}; +use crate::privilege::Privilege; +use crate::responses::ComputeCtlConfig; +use crate::spec::{ComputeSpec, ExtVersion, PgIdent}; + /// Request of the /configure API /// /// We now pass only `spec` in the configuration request, but later we can diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs index a6248019d9..35c580bd37 100644 --- a/libs/compute_api/src/responses.rs +++ b/libs/compute_api/src/responses.rs @@ -6,10 +6,8 @@ use chrono::{DateTime, Utc}; use jsonwebtoken::jwk::JwkSet; use serde::{Deserialize, Serialize, Serializer}; -use crate::{ - privilege::Privilege, - spec::{ComputeSpec, Database, ExtVersion, PgIdent, Role}, -}; +use crate::privilege::Privilege; +use crate::spec::{ComputeSpec, Database, ExtVersion, PgIdent, Role}; #[derive(Serialize, Debug, Deserialize)] pub struct GenericAPIError { diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 767a34bcbc..d02bfd6814 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -5,13 +5,12 @@ //! and connect it to the storage nodes. 
use std::collections::HashMap; +use regex::Regex; +use remote_storage::RemotePath; use serde::{Deserialize, Serialize}; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; -use regex::Regex; -use remote_storage::RemotePath; - /// String type alias representing Postgres identifier and /// intended to be used for DB / role names. pub type PgIdent = String; @@ -252,7 +251,7 @@ pub enum ComputeMode { Replica, } -#[derive(Clone, Debug, Default, Deserialize, Serialize)] +#[derive(Clone, Debug, Default, Deserialize, Serialize, PartialEq, Eq)] pub struct Cluster { pub cluster_id: Option, pub name: Option, @@ -283,7 +282,7 @@ pub struct DeltaOp { /// Rust representation of Postgres role info with only those fields /// that matter for us. -#[derive(Clone, Debug, Deserialize, Serialize)] +#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)] pub struct Role { pub name: PgIdent, pub encrypted_password: Option, @@ -292,7 +291,7 @@ pub struct Role { /// Rust representation of Postgres database info with only those fields /// that matter for us. -#[derive(Clone, Debug, Deserialize, Serialize)] +#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)] pub struct Database { pub name: PgIdent, pub owner: PgIdent, @@ -308,7 +307,7 @@ pub struct Database { /// Common type representing both SQL statement params with or without value, /// like `LOGIN` or `OWNER username` in the `CREATE/ALTER ROLE`, and config /// options like `wal_level = logical`. -#[derive(Clone, Debug, Deserialize, Serialize)] +#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)] pub struct GenericOption { pub name: String, pub value: Option, @@ -339,9 +338,10 @@ pub struct JwksSettings { #[cfg(test)] mod tests { - use super::*; use std::fs::File; + use super::*; + #[test] fn allow_installing_remote_extensions() { let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({ diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml index 79da05da6c..87dfdfb5ec 100644 --- a/libs/pageserver_api/Cargo.toml +++ b/libs/pageserver_api/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "pageserver_api" version = "0.1.0" -edition.workspace = true +edition = "2024" license.workspace = true [features] diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 0f33bcf45b..039cc1319e 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -9,19 +9,18 @@ pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898; pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}"); +use std::collections::HashMap; +use std::num::{NonZeroU64, NonZeroUsize}; +use std::str::FromStr; +use std::time::Duration; + use postgres_backend::AuthType; use remote_storage::RemoteStorageConfig; use serde_with::serde_as; -use std::{ - collections::HashMap, - num::{NonZeroU64, NonZeroUsize}, - str::FromStr, - time::Duration, -}; -use utils::{logging::LogFormat, postgres_client::PostgresClientProtocol}; +use utils::logging::LogFormat; +use utils::postgres_client::PostgresClientProtocol; -use crate::models::ImageCompressionAlgorithm; -use crate::models::LsnLease; +use crate::models::{ImageCompressionAlgorithm, LsnLease}; // Certain metadata (e.g. externally-addressable name, AZ) is delivered // as a separate structure. 
This information is not neeed by the pageserver @@ -122,6 +121,8 @@ pub struct ConfigToml { pub page_service_pipelining: PageServicePipeliningConfig, pub get_vectored_concurrent_io: GetVectoredConcurrentIo, pub enable_read_path_debugging: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub validate_wal_contiguity: Option, } #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] @@ -365,10 +366,10 @@ pub struct TenantConfigToml { } pub mod defaults { - use crate::models::ImageCompressionAlgorithm; - pub use storage_broker::DEFAULT_ENDPOINT as BROKER_DEFAULT_ENDPOINT; + use crate::models::ImageCompressionAlgorithm; + pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "300 s"; pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s"; @@ -521,6 +522,7 @@ impl Default for ConfigToml { } else { None }, + validate_wal_contiguity: None, } } } @@ -581,7 +583,7 @@ pub mod tenant_conf_defaults { // image layers should be created. pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2; pub const DEFAULT_GC_COMPACTION_ENABLED: bool = false; - pub const DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB: u64 = 10240000; + pub const DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB: u64 = 5 * 1024 * 1024; // 5GB pub const DEFAULT_GC_COMPACTION_RATIO_PERCENT: u64 = 100; } diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index 42f6e47e63..2cfe1a85f9 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -9,11 +9,8 @@ use std::time::{Duration, Instant}; use serde::{Deserialize, Serialize}; use utils::id::{NodeId, TenantId}; -use crate::models::PageserverUtilization; -use crate::{ - models::{ShardParameters, TenantConfig}, - shard::{ShardStripeSize, TenantShardId}, -}; +use crate::models::{PageserverUtilization, ShardParameters, TenantConfig}; +use crate::shard::{ShardStripeSize, TenantShardId}; #[derive(Serialize, Deserialize, Debug)] #[serde(deny_unknown_fields)] @@ -57,6 +54,7 @@ pub struct NodeRegisterRequest { pub listen_http_addr: String, pub listen_http_port: u16, + pub listen_https_port: Option, pub availability_zone_id: AvailabilityZone, } @@ -105,6 +103,7 @@ pub struct TenantLocateResponseShard { pub listen_http_addr: String, pub listen_http_port: u16, + pub listen_https_port: Option, } #[derive(Serialize, Deserialize)] @@ -148,6 +147,7 @@ pub struct NodeDescribeResponse { pub listen_http_addr: String, pub listen_http_port: u16, + pub listen_https_port: Option, pub listen_pg_addr: String, pub listen_pg_port: u16, @@ -351,7 +351,7 @@ impl FromStr for SkSchedulingPolicy { _ => { return Err(anyhow::anyhow!( "Unknown scheduling policy '{s}', try active,pause,decomissioned" - )) + )); } }) } @@ -454,9 +454,10 @@ pub struct SafekeeperSchedulingPolicyRequest { #[cfg(test)] mod test { - use super::*; use serde_json; + use super::*; + /// Check stability of PlacementPolicy's serialization #[test] fn placement_policy_encoding() -> anyhow::Result<()> { diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index b88a2e46a1..8836e7ec87 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -1,11 +1,12 @@ -use anyhow::{bail, Result}; -use byteorder::{ByteOrder, BE}; +use std::fmt; +use std::ops::Range; + +use anyhow::{Result, bail}; +use byteorder::{BE, ByteOrder}; use bytes::Bytes; use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; -use postgres_ffi::Oid; -use postgres_ffi::RepOriginId; +use postgres_ffi::{Oid, RepOriginId}; 
use serde::{Deserialize, Serialize}; -use std::{fmt, ops::Range}; use utils::const_assert; use crate::reltag::{BlockNumber, RelTag, SlruKind}; @@ -954,25 +955,22 @@ impl std::str::FromStr for Key { mod tests { use std::str::FromStr; - use crate::key::is_metadata_key_slice; - use crate::key::Key; - - use rand::Rng; - use rand::SeedableRng; + use rand::{Rng, SeedableRng}; use super::AUX_KEY_PREFIX; + use crate::key::{Key, is_metadata_key_slice}; #[test] fn display_fromstr_bijection() { let mut rng = rand::rngs::StdRng::seed_from_u64(42); let key = Key { - field1: rng.gen(), - field2: rng.gen(), - field3: rng.gen(), - field4: rng.gen(), - field5: rng.gen(), - field6: rng.gen(), + field1: rng.r#gen(), + field2: rng.r#gen(), + field3: rng.r#gen(), + field4: rng.r#gen(), + field5: rng.r#gen(), + field6: rng.r#gen(), }; assert_eq!(key, Key::from_str(&format!("{key}")).unwrap()); diff --git a/libs/pageserver_api/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs index c55b9e9484..e505f23e49 100644 --- a/libs/pageserver_api/src/keyspace.rs +++ b/libs/pageserver_api/src/keyspace.rs @@ -1,11 +1,10 @@ -use postgres_ffi::BLCKSZ; use std::ops::Range; -use crate::{ - key::Key, - shard::{ShardCount, ShardIdentity}, -}; use itertools::Itertools; +use postgres_ffi::BLCKSZ; + +use crate::key::Key; +use crate::shard::{ShardCount, ShardIdentity}; /// /// Represents a set of Keys, in a compact form. @@ -609,15 +608,13 @@ pub fn singleton_range(key: Key) -> Range { #[cfg(test)] mod tests { + use std::fmt::Write; + use rand::{RngCore, SeedableRng}; - use crate::{ - models::ShardParameters, - shard::{ShardCount, ShardNumber}, - }; - use super::*; - use std::fmt::Write; + use crate::models::ShardParameters; + use crate::shard::{ShardCount, ShardNumber}; // Helper function to create a key range. // diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index dd7bea2916..ea565e7769 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -2,38 +2,30 @@ pub mod detach_ancestor; pub mod partitioning; pub mod utilization; -#[cfg(feature = "testing")] -use camino::Utf8PathBuf; -pub use utilization::PageserverUtilization; - use core::ops::Range; -use std::{ - collections::HashMap, - fmt::Display, - io::{BufRead, Read}, - num::{NonZeroU32, NonZeroU64, NonZeroUsize}, - str::FromStr, - time::{Duration, SystemTime}, -}; +use std::collections::HashMap; +use std::fmt::Display; +use std::io::{BufRead, Read}; +use std::num::{NonZeroU32, NonZeroU64, NonZeroUsize}; +use std::str::FromStr; +use std::time::{Duration, SystemTime}; use byteorder::{BigEndian, ReadBytesExt}; +use bytes::{Buf, BufMut, Bytes, BytesMut}; +#[cfg(feature = "testing")] +use camino::Utf8PathBuf; use postgres_ffi::BLCKSZ; use serde::{Deserialize, Deserializer, Serialize, Serializer}; use serde_with::serde_as; -use utils::{ - completion, - id::{NodeId, TenantId, TimelineId}, - lsn::Lsn, - postgres_client::PostgresClientProtocol, - serde_system_time, -}; +pub use utilization::PageserverUtilization; +use utils::id::{NodeId, TenantId, TimelineId}; +use utils::lsn::Lsn; +use utils::postgres_client::PostgresClientProtocol; +use utils::{completion, serde_system_time}; -use crate::{ - key::{CompactKey, Key}, - reltag::RelTag, - shard::{ShardCount, ShardStripeSize, TenantShardId}, -}; -use bytes::{Buf, BufMut, Bytes, BytesMut}; +use crate::key::{CompactKey, Key}; +use crate::reltag::RelTag; +use crate::shard::{ShardCount, ShardStripeSize, TenantShardId}; /// The state of a tenant in this pageserver. 
/// @@ -332,7 +324,8 @@ pub struct ImportPgdataIdempotencyKey(pub String); impl ImportPgdataIdempotencyKey { pub fn random() -> Self { - use rand::{distributions::Alphanumeric, Rng}; + use rand::Rng; + use rand::distributions::Alphanumeric; Self( rand::thread_rng() .sample_iter(&Alphanumeric) @@ -526,9 +519,13 @@ pub struct TenantConfigPatch { #[derive(Serialize, Deserialize, Debug, Default, Clone, Eq, PartialEq)] pub struct TenantConfig { pub checkpoint_distance: Option, - pub checkpoint_timeout: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub checkpoint_timeout: Option, pub compaction_target_size: Option, - pub compaction_period: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub compaction_period: Option, pub compaction_threshold: Option, pub compaction_upper_limit: Option, // defer parsing compaction_algorithm, like eviction_policy @@ -539,22 +536,38 @@ pub struct TenantConfig { pub l0_flush_stall_threshold: Option, pub l0_flush_wait_upload: Option, pub gc_horizon: Option, - pub gc_period: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub gc_period: Option, pub image_creation_threshold: Option, - pub pitr_interval: Option, - pub walreceiver_connect_timeout: Option, - pub lagging_wal_timeout: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub pitr_interval: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub walreceiver_connect_timeout: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub lagging_wal_timeout: Option, pub max_lsn_wal_lag: Option, pub eviction_policy: Option, pub min_resident_size_override: Option, - pub evictions_low_residence_duration_metric_threshold: Option, - pub heatmap_period: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub evictions_low_residence_duration_metric_threshold: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub heatmap_period: Option, pub lazy_slru_download: Option, pub timeline_get_throttle: Option, pub image_layer_creation_check_threshold: Option, pub image_creation_preempt_threshold: Option, - pub lsn_lease_length: Option, - pub lsn_lease_length_for_ts: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub lsn_lease_length: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub lsn_lease_length_for_ts: Option, pub timeline_offloading: Option, pub wal_receiver_protocol_override: Option, pub rel_size_v2_enabled: Option, @@ -564,7 +577,10 @@ pub struct TenantConfig { } impl TenantConfig { - pub fn apply_patch(self, patch: TenantConfigPatch) -> TenantConfig { + pub fn apply_patch( + self, + patch: TenantConfigPatch, + ) -> Result { let Self { mut checkpoint_distance, mut checkpoint_timeout, @@ -604,11 +620,17 @@ impl TenantConfig { } = self; patch.checkpoint_distance.apply(&mut checkpoint_distance); - patch.checkpoint_timeout.apply(&mut checkpoint_timeout); + patch + .checkpoint_timeout + .map(|v| humantime::parse_duration(&v))? + .apply(&mut checkpoint_timeout); patch .compaction_target_size .apply(&mut compaction_target_size); - patch.compaction_period.apply(&mut compaction_period); + patch + .compaction_period + .map(|v| humantime::parse_duration(&v))? 
+ .apply(&mut compaction_period); patch.compaction_threshold.apply(&mut compaction_threshold); patch .compaction_upper_limit @@ -626,15 +648,25 @@ impl TenantConfig { .apply(&mut l0_flush_stall_threshold); patch.l0_flush_wait_upload.apply(&mut l0_flush_wait_upload); patch.gc_horizon.apply(&mut gc_horizon); - patch.gc_period.apply(&mut gc_period); + patch + .gc_period + .map(|v| humantime::parse_duration(&v))? + .apply(&mut gc_period); patch .image_creation_threshold .apply(&mut image_creation_threshold); - patch.pitr_interval.apply(&mut pitr_interval); + patch + .pitr_interval + .map(|v| humantime::parse_duration(&v))? + .apply(&mut pitr_interval); patch .walreceiver_connect_timeout + .map(|v| humantime::parse_duration(&v))? .apply(&mut walreceiver_connect_timeout); - patch.lagging_wal_timeout.apply(&mut lagging_wal_timeout); + patch + .lagging_wal_timeout + .map(|v| humantime::parse_duration(&v))? + .apply(&mut lagging_wal_timeout); patch.max_lsn_wal_lag.apply(&mut max_lsn_wal_lag); patch.eviction_policy.apply(&mut eviction_policy); patch @@ -642,8 +674,12 @@ impl TenantConfig { .apply(&mut min_resident_size_override); patch .evictions_low_residence_duration_metric_threshold + .map(|v| humantime::parse_duration(&v))? .apply(&mut evictions_low_residence_duration_metric_threshold); - patch.heatmap_period.apply(&mut heatmap_period); + patch + .heatmap_period + .map(|v| humantime::parse_duration(&v))? + .apply(&mut heatmap_period); patch.lazy_slru_download.apply(&mut lazy_slru_download); patch .timeline_get_throttle @@ -654,9 +690,13 @@ impl TenantConfig { patch .image_creation_preempt_threshold .apply(&mut image_creation_preempt_threshold); - patch.lsn_lease_length.apply(&mut lsn_lease_length); + patch + .lsn_lease_length + .map(|v| humantime::parse_duration(&v))? + .apply(&mut lsn_lease_length); patch .lsn_lease_length_for_ts + .map(|v| humantime::parse_duration(&v))? .apply(&mut lsn_lease_length_for_ts); patch.timeline_offloading.apply(&mut timeline_offloading); patch @@ -673,7 +713,7 @@ impl TenantConfig { .gc_compaction_ratio_percent .apply(&mut gc_compaction_ratio_percent); - Self { + Ok(Self { checkpoint_distance, checkpoint_timeout, compaction_target_size, @@ -709,7 +749,7 @@ impl TenantConfig { gc_compaction_enabled, gc_compaction_initial_threshold_kb, gc_compaction_ratio_percent, - } + }) } } @@ -2241,9 +2281,10 @@ impl Default for PageTraceEvent { #[cfg(test)] mod tests { - use serde_json::json; use std::str::FromStr; + use serde_json::json; + use super::*; #[test] @@ -2503,7 +2544,7 @@ mod tests { ..base.clone() }; - let patched = base.apply_patch(decoded.config); + let patched = base.apply_patch(decoded.config).unwrap(); assert_eq!(patched, expected); } diff --git a/libs/pageserver_api/src/models/utilization.rs b/libs/pageserver_api/src/models/utilization.rs index 641aa51989..69c240ff3c 100644 --- a/libs/pageserver_api/src/models/utilization.rs +++ b/libs/pageserver_api/src/models/utilization.rs @@ -1,5 +1,7 @@ use std::time::SystemTime; -use utils::{serde_percent::Percent, serde_system_time}; + +use utils::serde_percent::Percent; +use utils::serde_system_time; /// Pageserver current utilization and scoring for how good candidate the pageserver would be for /// the next tenant. 
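The tenant-config hunks above move the duration-valued settings (checkpoint_timeout, compaction_period, gc_period, pitr_interval, heatmap_period, lsn_lease_length, and friends) from plain strings to Option<Duration> serialized via humantime_serde, and the control_plane/storcon_cli sides now run user-supplied strings through humantime::parse_duration instead of forwarding them verbatim; TenantConfig::apply_patch becomes fallible for the same reason. A minimal sketch of the pattern, assuming only the serde, serde_json, humantime and humantime-serde crates (the struct and field are illustrative, not the real TenantConfig):

```rust
use std::time::Duration;

use serde::{Deserialize, Serialize};

// `gc_period` stands in for any of the patched duration fields.
#[derive(Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
struct DurationsSketch {
    #[serde(default)]
    #[serde(with = "humantime_serde")]
    gc_period: Option<Duration>,
}

fn main() {
    // CLI-style parsing, as in control_plane/src/pageserver.rs: the setting string is
    // parsed up front with humantime instead of being passed through as a raw string.
    let parsed = Some("1h")
        .map(humantime::parse_duration)
        .transpose()
        .expect("not a valid duration");
    assert_eq!(parsed, Some(Duration::from_secs(3600)));

    // On the wire the field stays human-readable ("300s", "10 m", ...).
    let cfg: DurationsSketch = serde_json::from_str(r#"{"gc_period": "300s"}"#).unwrap();
    assert_eq!(cfg.gc_period, Some(Duration::from_secs(300)));
}
```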
@@ -131,12 +133,12 @@ impl PageserverUtilization { /// Test helper pub mod test_utilization { - use super::PageserverUtilization; use std::time::SystemTime; - use utils::{ - serde_percent::Percent, - serde_system_time::{self}, - }; + + use utils::serde_percent::Percent; + use utils::serde_system_time::{self}; + + use super::PageserverUtilization; // Parameters of the imaginary node used for test utilization instances const TEST_DISK_SIZE: u64 = 1024 * 1024 * 1024 * 1024; diff --git a/libs/pageserver_api/src/record.rs b/libs/pageserver_api/src/record.rs index bb62b35d36..fda504a26e 100644 --- a/libs/pageserver_api/src/record.rs +++ b/libs/pageserver_api/src/record.rs @@ -1,7 +1,7 @@ //! This module defines the WAL record format used within the pageserver. use bytes::Bytes; -use postgres_ffi::walrecord::{describe_postgres_wal_record, MultiXactMember}; +use postgres_ffi::walrecord::{MultiXactMember, describe_postgres_wal_record}; use postgres_ffi::{MultiXactId, MultiXactOffset, TimestampTz, TransactionId}; use serde::{Deserialize, Serialize}; use utils::bin_ser::DeserializeError; diff --git a/libs/pageserver_api/src/reltag.rs b/libs/pageserver_api/src/reltag.rs index 09d1fae221..473a44dbf9 100644 --- a/libs/pageserver_api/src/reltag.rs +++ b/libs/pageserver_api/src/reltag.rs @@ -1,10 +1,10 @@ -use serde::{Deserialize, Serialize}; use std::cmp::Ordering; use std::fmt; -use postgres_ffi::pg_constants::GLOBALTABLESPACE_OID; -use postgres_ffi::relfile_utils::{forkname_to_number, forknumber_to_name, MAIN_FORKNUM}; use postgres_ffi::Oid; +use postgres_ffi::pg_constants::GLOBALTABLESPACE_OID; +use postgres_ffi::relfile_utils::{MAIN_FORKNUM, forkname_to_number, forknumber_to_name}; +use serde::{Deserialize, Serialize}; /// /// Relation data file segment id throughout the Postgres cluster. diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index e03df02afb..eca04b1f3d 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -33,12 +33,13 @@ use std::hash::{Hash, Hasher}; -use crate::{key::Key, models::ShardParameters}; +#[doc(inline)] +pub use ::utils::shard::*; use postgres_ffi::relfile_utils::INIT_FORKNUM; use serde::{Deserialize, Serialize}; -#[doc(inline)] -pub use ::utils::shard::*; +use crate::key::Key; +use crate::models::ShardParameters; /// The ShardIdentity contains enough information to map a [`Key`] to a [`ShardNumber`], /// and to check whether that [`ShardNumber`] is the same as the current shard. @@ -337,7 +338,8 @@ pub fn describe( mod tests { use std::str::FromStr; - use utils::{id::TenantId, Hex}; + use utils::Hex; + use utils::id::TenantId; use super::*; diff --git a/libs/pageserver_api/src/upcall_api.rs b/libs/pageserver_api/src/upcall_api.rs index 2e88836bd0..647d01c3c2 100644 --- a/libs/pageserver_api/src/upcall_api.rs +++ b/libs/pageserver_api/src/upcall_api.rs @@ -6,9 +6,9 @@ use serde::{Deserialize, Serialize}; use utils::id::NodeId; -use crate::{ - controller_api::NodeRegisterRequest, models::LocationConfigMode, shard::TenantShardId, -}; +use crate::controller_api::NodeRegisterRequest; +use crate::models::LocationConfigMode; +use crate::shard::TenantShardId; /// Upcall message sent by the pageserver to the configured `control_plane_api` on /// startup. 
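Several otherwise puzzling hunks follow from the edition = "2024" bumps in the various Cargo.toml files: gen is a reserved keyword in Rust 2024, which is why rng.gen() becomes rng.r#gen() in the pageserver_api key.rs tests above, and why the gen fields in the re-attach/validate upcall types just below become r#gen. A small illustrative sketch (the struct is invented; the real types are ReAttachResponseTenant and ValidateRequestTenant), showing that serde derives drop the r# prefix, so the JSON key and therefore the wire format stay unchanged:

```rust
use serde::{Deserialize, Serialize};

#[derive(Serialize, Deserialize)]
struct GenerationSketch {
    // `gen` is a keyword in edition 2024, so the field needs the raw-identifier form.
    r#gen: u32,
}

fn main() {
    let v = GenerationSketch { r#gen: 7 };
    // serde uses the unprefixed name, so existing JSON keeps working.
    assert_eq!(serde_json::to_string(&v).unwrap(), r#"{"gen":7}"#);
}
```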
@@ -30,7 +30,7 @@ fn default_mode() -> LocationConfigMode { pub struct ReAttachResponseTenant { pub id: TenantShardId, /// Mandatory if LocationConfigMode is None or set to an Attached* mode - pub gen: Option, + pub r#gen: Option, /// Default value only for backward compat: this field should be set #[serde(default = "default_mode")] @@ -44,7 +44,7 @@ pub struct ReAttachResponse { #[derive(Serialize, Deserialize)] pub struct ValidateRequestTenant { pub id: TenantShardId, - pub gen: u32, + pub r#gen: u32, } #[derive(Serialize, Deserialize)] diff --git a/libs/pageserver_api/src/value.rs b/libs/pageserver_api/src/value.rs index 1f8ed30a9a..883d903ff3 100644 --- a/libs/pageserver_api/src/value.rs +++ b/libs/pageserver_api/src/value.rs @@ -7,10 +7,11 @@ //! Note that the [`Value`] type is used for the permananent storage format, so any //! changes to it must be backwards compatible. -use crate::record::NeonWalRecord; use bytes::Bytes; use serde::{Deserialize, Serialize}; +use crate::record::NeonWalRecord; + #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] pub enum Value { /// An Image value contains a full copy of the value @@ -83,11 +84,11 @@ impl ValueBytes { #[cfg(test)] mod test { - use super::*; - use bytes::Bytes; use utils::bin_ser::BeSer; + use super::*; + macro_rules! roundtrip { ($orig:expr, $expected:expr) => {{ let orig: Value = $orig; diff --git a/libs/postgres_connection/Cargo.toml b/libs/postgres_connection/Cargo.toml index 19027d13ff..462fb4a533 100644 --- a/libs/postgres_connection/Cargo.toml +++ b/libs/postgres_connection/Cargo.toml @@ -7,7 +7,6 @@ license.workspace = true [dependencies] anyhow.workspace = true itertools.workspace = true -postgres.workspace = true tokio-postgres.workspace = true url.workspace = true diff --git a/libs/postgres_connection/src/lib.rs b/libs/postgres_connection/src/lib.rs index ddf9f7b610..e3d31c6cfc 100644 --- a/libs/postgres_connection/src/lib.rs +++ b/libs/postgres_connection/src/lib.rs @@ -171,10 +171,10 @@ impl PgConnectionConfig { tokio_postgres::Client, tokio_postgres::Connection, ), - postgres::Error, + tokio_postgres::Error, > { self.to_tokio_postgres_config() - .connect(postgres::NoTls) + .connect(tokio_postgres::NoTls) .await } } diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs index 0239b56d9c..301bc2f16e 100644 --- a/libs/postgres_ffi/src/lib.rs +++ b/libs/postgres_ffi/src/lib.rs @@ -278,7 +278,7 @@ pub fn generate_pg_control( checkpoint_bytes: &[u8], lsn: Lsn, pg_version: u32, -) -> anyhow::Result<(Bytes, u64)> { +) -> anyhow::Result<(Bytes, u64, bool)> { dispatch_pgversion!( pg_version, pgv::xlog_utils::generate_pg_control(pg_control_bytes, checkpoint_bytes, lsn), diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index 852b20eace..14fb1f2a1f 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -124,23 +124,59 @@ pub fn normalize_lsn(lsn: Lsn, seg_sz: usize) -> Lsn { } } +/// Generate a pg_control file, for a basebackup for starting up Postgres at the given LSN +/// +/// 'pg_control_bytes' and 'checkpoint_bytes' are the contents of those keys persisted in +/// the pageserver. They use the same format as the PostgreSQL control file and the +/// checkpoint record, but see walingest.rs for how exactly they are kept up to date. +/// 'lsn' is the LSN at which we're starting up. 
+/// +/// Returns: +/// - pg_control file contents +/// - system_identifier, extracted from the persisted information +/// - true, if we're starting up from a "clean shutdown", i.e. if there was a shutdown +/// checkpoint at the given LSN pub fn generate_pg_control( pg_control_bytes: &[u8], checkpoint_bytes: &[u8], lsn: Lsn, -) -> anyhow::Result<(Bytes, u64)> { +) -> anyhow::Result<(Bytes, u64, bool)> { let mut pg_control = ControlFileData::decode(pg_control_bytes)?; let mut checkpoint = CheckPoint::decode(checkpoint_bytes)?; // Generate new pg_control needed for bootstrap + // + // NB: In the checkpoint struct that we persist in the pageserver, we have a different + // convention for the 'redo' field than in PostgreSQL: On a shutdown checkpoint, + // 'redo' points the *end* of the checkpoint WAL record. On PostgreSQL, it points to + // the beginning. Furthermore, on an online checkpoint, 'redo' is set to 0. + // + // We didn't always have this convention however, and old persisted records will have + // old REDO values that point to some old LSN. + // + // The upshot is that if 'redo' is equal to the "current" LSN, there was a shutdown + // checkpoint record at that point in WAL, with no new WAL records after it. That case + // can be treated as starting from a clean shutdown. All other cases are treated as + // non-clean shutdown. In Neon, we don't do WAL replay at startup in either case, so + // that distinction doesn't matter very much. As of this writing, it only affects + // whether the persisted pg_stats information can be used or not. + // + // In the Checkpoint struct in the returned pg_control file, the redo pointer is + // always set to the LSN we're starting at, to hint that no WAL replay is required. + // (There's some neon-specific code in Postgres startup to make that work, though. + // Just setting the redo pointer is not sufficient.) + let was_shutdown = Lsn(checkpoint.redo) == lsn; checkpoint.redo = normalize_lsn(lsn, WAL_SEGMENT_SIZE).0; - //save new values in pg_control + // We use DBState_DB_SHUTDOWNED even if it was not a clean shutdown. The + // neon-specific code at postgres startup ignores the state stored in the control + // file, similar to archive recovery in standalone PostgreSQL. Similarly, the + // checkPoint pointer is ignored, so just set it to 0. pg_control.checkPoint = 0; pg_control.checkPointCopy = checkpoint; pg_control.state = DBState_DB_SHUTDOWNED; - Ok((pg_control.encode(), pg_control.system_identifier)) + Ok((pg_control.encode(), pg_control.system_identifier, was_shutdown)) } pub fn get_current_timestamp() -> TimestampTz { diff --git a/libs/proxy/postgres-protocol2/Cargo.toml b/libs/proxy/postgres-protocol2/Cargo.toml index f66a292d5e..7ebb05eec1 100644 --- a/libs/proxy/postgres-protocol2/Cargo.toml +++ b/libs/proxy/postgres-protocol2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "postgres-protocol2" version = "0.1.0" -edition = "2021" +edition = "2024" license = "MIT/Apache-2.0" [dependencies] diff --git a/libs/proxy/postgres-protocol2/src/authentication/sasl.rs b/libs/proxy/postgres-protocol2/src/authentication/sasl.rs index f2200a40ce..27e05e24ec 100644 --- a/libs/proxy/postgres-protocol2/src/authentication/sasl.rs +++ b/libs/proxy/postgres-protocol2/src/authentication/sasl.rs @@ -1,14 +1,12 @@ //! SASL-based authentication support. 
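The expanded generate_pg_control() documentation above is the substantive part of the xlog_utils.rs change: because the pageserver stores a shutdown checkpoint's redo pointer as the end LSN of that record, redo == basebackup LSN is exactly the "clean shutdown, no later WAL" case, and the function now reports it as a third tuple element. A hedged sketch of a caller consuming the new return value (the signature follows the patch; the surrounding basebackup plumbing is assumed, not shown):

```rust
use anyhow::Result;
use bytes::Bytes;
use utils::lsn::Lsn;

// Sketch of a caller; the inputs are the pg_control and checkpoint values persisted in
// the pageserver keyspace, as described in the doc comment above.
fn pg_control_for_basebackup(
    pg_control_bytes: &[u8],
    checkpoint_bytes: &[u8],
    lsn: Lsn,
    pg_version: u32,
) -> Result<Bytes> {
    let (pg_control, _system_identifier, was_shutdown) =
        postgres_ffi::generate_pg_control(pg_control_bytes, checkpoint_bytes, lsn, pg_version)?;
    if was_shutdown {
        // Clean shutdown checkpoint at `lsn`: per the comment, persisted pg_stats may be
        // reused. Neon skips WAL replay at startup in either case.
    }
    Ok(pg_control)
}
```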
+use std::fmt::Write; +use std::{io, iter, mem, str}; + use hmac::{Hmac, Mac}; use rand::{self, Rng}; use sha2::digest::FixedOutput; use sha2::{Digest, Sha256}; -use std::fmt::Write; -use std::io; -use std::iter; -use std::mem; -use std::str; use tokio::task::yield_now; const NONCE_LENGTH: usize = 24; @@ -493,11 +491,9 @@ mod test { let nonce = "9IZ2O01zb9IgiIZ1WJ/zgpJB"; let client_first = "n,,n=,r=9IZ2O01zb9IgiIZ1WJ/zgpJB"; - let server_first = - "r=9IZ2O01zb9IgiIZ1WJ/zgpJBjx/oIRLs02gGSHcw1KEty3eY,s=fs3IXBy7U7+IvVjZ,i\ + let server_first = "r=9IZ2O01zb9IgiIZ1WJ/zgpJBjx/oIRLs02gGSHcw1KEty3eY,s=fs3IXBy7U7+IvVjZ,i\ =4096"; - let client_final = - "c=biws,r=9IZ2O01zb9IgiIZ1WJ/zgpJBjx/oIRLs02gGSHcw1KEty3eY,p=AmNKosjJzS3\ + let client_final = "c=biws,r=9IZ2O01zb9IgiIZ1WJ/zgpJBjx/oIRLs02gGSHcw1KEty3eY,p=AmNKosjJzS3\ 1NTlQYNs5BTeQjdHdk7lOflDo5re2an8="; let server_final = "v=U+ppxD5XUKtradnv8e2MkeupiA8FU87Sg8CXzXHDAzw="; diff --git a/libs/proxy/postgres-protocol2/src/lib.rs b/libs/proxy/postgres-protocol2/src/lib.rs index 6032440f9a..afbd1e92bd 100644 --- a/libs/proxy/postgres-protocol2/src/lib.rs +++ b/libs/proxy/postgres-protocol2/src/lib.rs @@ -11,9 +11,10 @@ //! set to `UTF8`. It will most likely not behave properly if that is not the case. #![warn(missing_docs, clippy::all)] +use std::io; + use byteorder::{BigEndian, ByteOrder}; use bytes::{BufMut, BytesMut}; -use std::io; pub mod authentication; pub mod escape; diff --git a/libs/proxy/postgres-protocol2/src/message/backend.rs b/libs/proxy/postgres-protocol2/src/message/backend.rs index 097964f9c1..d7eaef9509 100644 --- a/libs/proxy/postgres-protocol2/src/message/backend.rs +++ b/libs/proxy/postgres-protocol2/src/message/backend.rs @@ -1,13 +1,13 @@ #![allow(missing_docs)] +use std::io::{self, Read}; +use std::ops::Range; +use std::{cmp, str}; + use byteorder::{BigEndian, ByteOrder, ReadBytesExt}; use bytes::{Bytes, BytesMut}; use fallible_iterator::FallibleIterator; use memchr::memchr; -use std::cmp; -use std::io::{self, Read}; -use std::ops::Range; -use std::str; use crate::Oid; diff --git a/libs/proxy/postgres-protocol2/src/message/frontend.rs b/libs/proxy/postgres-protocol2/src/message/frontend.rs index 640f35ada3..b447290ea8 100644 --- a/libs/proxy/postgres-protocol2/src/message/frontend.rs +++ b/libs/proxy/postgres-protocol2/src/message/frontend.rs @@ -1,13 +1,13 @@ //! Frontend message serialization. #![allow(missing_docs)] +use std::error::Error; +use std::{io, marker}; + use byteorder::{BigEndian, ByteOrder}; use bytes::{Buf, BufMut, BytesMut}; -use std::error::Error; -use std::io; -use std::marker; -use crate::{write_nullable, FromUsize, IsNull, Oid}; +use crate::{FromUsize, IsNull, Oid, write_nullable}; #[inline] fn write_body(buf: &mut BytesMut, f: F) -> Result<(), E> diff --git a/libs/proxy/postgres-protocol2/src/password/mod.rs b/libs/proxy/postgres-protocol2/src/password/mod.rs index 38eb31dfcf..4cd9bfb060 100644 --- a/libs/proxy/postgres-protocol2/src/password/mod.rs +++ b/libs/proxy/postgres-protocol2/src/password/mod.rs @@ -6,12 +6,13 @@ //! side. This is good because it ensures the cleartext password won't //! end up in logs pg_stat displays, etc. 
-use crate::authentication::sasl; use hmac::{Hmac, Mac}; use rand::RngCore; use sha2::digest::FixedOutput; use sha2::{Digest, Sha256}; +use crate::authentication::sasl; + #[cfg(test)] mod test; diff --git a/libs/proxy/postgres-protocol2/src/types/mod.rs b/libs/proxy/postgres-protocol2/src/types/mod.rs index 78131c05bf..6a9b334bcb 100644 --- a/libs/proxy/postgres-protocol2/src/types/mod.rs +++ b/libs/proxy/postgres-protocol2/src/types/mod.rs @@ -1,11 +1,12 @@ //! Conversions to and from Postgres's binary format for various types. -use byteorder::{BigEndian, ReadBytesExt}; -use bytes::{BufMut, BytesMut}; -use fallible_iterator::FallibleIterator; use std::boxed::Box as StdBox; use std::error::Error; use std::str; +use byteorder::{BigEndian, ReadBytesExt}; +use bytes::{BufMut, BytesMut}; +use fallible_iterator::FallibleIterator; + use crate::Oid; #[cfg(test)] diff --git a/libs/proxy/postgres-types2/Cargo.toml b/libs/proxy/postgres-types2/Cargo.toml index 57efd94cd3..25ad23ba35 100644 --- a/libs/proxy/postgres-types2/Cargo.toml +++ b/libs/proxy/postgres-types2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "postgres-types2" version = "0.1.0" -edition = "2021" +edition = "2024" license = "MIT/Apache-2.0" [dependencies] diff --git a/libs/proxy/postgres-types2/src/lib.rs b/libs/proxy/postgres-types2/src/lib.rs index d4f3afdfd4..0ccd8c295f 100644 --- a/libs/proxy/postgres-types2/src/lib.rs +++ b/libs/proxy/postgres-types2/src/lib.rs @@ -4,19 +4,18 @@ //! unless you want to define your own `ToSql` or `FromSql` definitions. #![warn(clippy::all, missing_docs)] -use fallible_iterator::FallibleIterator; -use postgres_protocol2::types; use std::any::type_name; use std::error::Error; use std::fmt; use std::sync::Arc; -use crate::type_gen::{Inner, Other}; - +use bytes::BytesMut; +use fallible_iterator::FallibleIterator; #[doc(inline)] pub use postgres_protocol2::Oid; +use postgres_protocol2::types; -use bytes::BytesMut; +use crate::type_gen::{Inner, Other}; /// Generates a simple implementation of `ToSql::accepts` which accepts the /// types passed to it. 
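Most of the remaining churn in the libs/proxy crates is mechanical rather than functional: alongside the edition = "2024" bumps, use statements are regrouped into std, then external crates, then crate-local items (merged into one tree per group), and return Err(...) arms pick up trailing semicolons. This is presumably the formatter doing the work (for example rustfmt's group_imports = "StdExternalCrate" together with the 2024 style edition), so those hunks should not need review for behavior changes. The target shape, using paths that appear elsewhere in the patch:

```rust
// Shape of the regrouped imports only; this fragment is illustrative, not from the patch.
// group 1: std
use std::io;
use std::iter::{empty, once};

// group 2: external crates
use bytes::BytesMut;
use serde::{Deserialize, Serialize};

// group 3: the current crate
use crate::codec::FrontendMessage;
```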
diff --git a/libs/proxy/postgres-types2/src/private.rs b/libs/proxy/postgres-types2/src/private.rs index 774f9a301c..188b982812 100644 --- a/libs/proxy/postgres-types2/src/private.rs +++ b/libs/proxy/postgres-types2/src/private.rs @@ -1,7 +1,9 @@ -use crate::{FromSql, Type}; -pub use bytes::BytesMut; use std::error::Error; +pub use bytes::BytesMut; + +use crate::{FromSql, Type}; + pub fn read_be_i32(buf: &mut &[u8]) -> Result> { if buf.len() < 4 { return Err("invalid buffer size".into()); diff --git a/libs/proxy/tokio-postgres2/Cargo.toml b/libs/proxy/tokio-postgres2/Cargo.toml index 161c6b8309..540876742f 100644 --- a/libs/proxy/tokio-postgres2/Cargo.toml +++ b/libs/proxy/tokio-postgres2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "tokio-postgres2" version = "0.1.0" -edition = "2021" +edition = "2024" license = "MIT/Apache-2.0" [dependencies] diff --git a/libs/proxy/tokio-postgres2/src/cancel_query.rs b/libs/proxy/tokio-postgres2/src/cancel_query.rs index cddbf16336..b65fb571e6 100644 --- a/libs/proxy/tokio-postgres2/src/cancel_query.rs +++ b/libs/proxy/tokio-postgres2/src/cancel_query.rs @@ -1,10 +1,11 @@ +use std::io; + use tokio::net::TcpStream; use crate::client::SocketConfig; use crate::config::{Host, SslMode}; use crate::tls::MakeTlsConnect; -use crate::{cancel_query_raw, connect_socket, Error}; -use std::io; +use crate::{Error, cancel_query_raw, connect_socket}; pub(crate) async fn cancel_query( config: Option, @@ -22,7 +23,7 @@ where return Err(Error::connect(io::Error::new( io::ErrorKind::InvalidInput, "unknown host", - ))) + ))); } }; diff --git a/libs/proxy/tokio-postgres2/src/cancel_query_raw.rs b/libs/proxy/tokio-postgres2/src/cancel_query_raw.rs index 8c08296435..c720214e9b 100644 --- a/libs/proxy/tokio-postgres2/src/cancel_query_raw.rs +++ b/libs/proxy/tokio-postgres2/src/cancel_query_raw.rs @@ -1,10 +1,11 @@ -use crate::config::SslMode; -use crate::tls::TlsConnect; -use crate::{connect_tls, Error}; use bytes::BytesMut; use postgres_protocol2::message::frontend; use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt}; +use crate::config::SslMode; +use crate::tls::TlsConnect; +use crate::{Error, connect_tls}; + pub async fn cancel_query_raw( stream: S, mode: SslMode, diff --git a/libs/proxy/tokio-postgres2/src/cancel_token.rs b/libs/proxy/tokio-postgres2/src/cancel_token.rs index 718f903a92..f6526395ee 100644 --- a/libs/proxy/tokio-postgres2/src/cancel_token.rs +++ b/libs/proxy/tokio-postgres2/src/cancel_token.rs @@ -1,12 +1,12 @@ -use crate::config::SslMode; -use crate::tls::TlsConnect; - -use crate::{cancel_query, client::SocketConfig, tls::MakeTlsConnect}; -use crate::{cancel_query_raw, Error}; use serde::{Deserialize, Serialize}; use tokio::io::{AsyncRead, AsyncWrite}; use tokio::net::TcpStream; +use crate::client::SocketConfig; +use crate::config::SslMode; +use crate::tls::{MakeTlsConnect, TlsConnect}; +use crate::{Error, cancel_query, cancel_query_raw}; + /// The capability to request cancellation of in-progress queries on a /// connection. 
#[derive(Clone, Serialize, Deserialize)] diff --git a/libs/proxy/tokio-postgres2/src/client.rs b/libs/proxy/tokio-postgres2/src/client.rs index 46151ab924..39b1db75da 100644 --- a/libs/proxy/tokio-postgres2/src/client.rs +++ b/libs/proxy/tokio-postgres2/src/client.rs @@ -1,31 +1,28 @@ -use crate::codec::{BackendMessages, FrontendMessage}; - -use crate::config::Host; -use crate::config::SslMode; -use crate::connection::{Request, RequestMessages}; - -use crate::query::RowStream; -use crate::simple_query::SimpleQueryStream; - -use crate::types::{Oid, ToSql, Type}; - -use crate::{ - query, simple_query, slice_iter, CancelToken, Error, ReadyForQueryStatus, Row, - SimpleQueryMessage, Statement, Transaction, TransactionBuilder, -}; -use bytes::BytesMut; -use fallible_iterator::FallibleIterator; -use futures_util::{future, ready, TryStreamExt}; -use parking_lot::Mutex; -use postgres_protocol2::message::{backend::Message, frontend}; -use serde::{Deserialize, Serialize}; use std::collections::HashMap; use std::fmt; use std::sync::Arc; use std::task::{Context, Poll}; +use std::time::Duration; + +use bytes::BytesMut; +use fallible_iterator::FallibleIterator; +use futures_util::{TryStreamExt, future, ready}; +use parking_lot::Mutex; +use postgres_protocol2::message::backend::Message; +use postgres_protocol2::message::frontend; +use serde::{Deserialize, Serialize}; use tokio::sync::mpsc; -use std::time::Duration; +use crate::codec::{BackendMessages, FrontendMessage}; +use crate::config::{Host, SslMode}; +use crate::connection::{Request, RequestMessages}; +use crate::query::RowStream; +use crate::simple_query::SimpleQueryStream; +use crate::types::{Oid, ToSql, Type}; +use crate::{ + CancelToken, Error, ReadyForQueryStatus, Row, SimpleQueryMessage, Statement, Transaction, + TransactionBuilder, query, simple_query, slice_iter, +}; pub struct Responses { receiver: mpsc::Receiver, diff --git a/libs/proxy/tokio-postgres2/src/codec.rs b/libs/proxy/tokio-postgres2/src/codec.rs index 0ec46198ce..f1fd9b47b3 100644 --- a/libs/proxy/tokio-postgres2/src/codec.rs +++ b/libs/proxy/tokio-postgres2/src/codec.rs @@ -1,8 +1,9 @@ +use std::io; + use bytes::{Buf, Bytes, BytesMut}; use fallible_iterator::FallibleIterator; use postgres_protocol2::message::backend; use postgres_protocol2::message::frontend::CopyData; -use std::io; use tokio_util::codec::{Decoder, Encoder}; pub enum FrontendMessage { diff --git a/libs/proxy/tokio-postgres2/src/config.rs b/libs/proxy/tokio-postgres2/src/config.rs index 47cc45ac80..4c25491b67 100644 --- a/libs/proxy/tokio-postgres2/src/config.rs +++ b/libs/proxy/tokio-postgres2/src/config.rs @@ -1,21 +1,19 @@ //! Connection configuration. -use crate::connect::connect; -use crate::connect_raw::connect_raw; -use crate::connect_raw::RawConnection; -use crate::tls::MakeTlsConnect; -use crate::tls::TlsConnect; -use crate::{Client, Connection, Error}; -use postgres_protocol2::message::frontend::StartupMessageParams; -use serde::{Deserialize, Serialize}; -use std::fmt; -use std::str; use std::time::Duration; -use tokio::io::{AsyncRead, AsyncWrite}; +use std::{fmt, str}; pub use postgres_protocol2::authentication::sasl::ScramKeys; +use postgres_protocol2::message::frontend::StartupMessageParams; +use serde::{Deserialize, Serialize}; +use tokio::io::{AsyncRead, AsyncWrite}; use tokio::net::TcpStream; +use crate::connect::connect; +use crate::connect_raw::{RawConnection, connect_raw}; +use crate::tls::{MakeTlsConnect, TlsConnect}; +use crate::{Client, Connection, Error}; + /// TLS configuration. 
#[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize)] #[non_exhaustive] diff --git a/libs/proxy/tokio-postgres2/src/connect.rs b/libs/proxy/tokio-postgres2/src/connect.rs index e0cb69748d..d2bd0dfbcd 100644 --- a/libs/proxy/tokio-postgres2/src/connect.rs +++ b/libs/proxy/tokio-postgres2/src/connect.rs @@ -1,3 +1,7 @@ +use postgres_protocol2::message::backend::Message; +use tokio::net::TcpStream; +use tokio::sync::mpsc; + use crate::client::SocketConfig; use crate::codec::BackendMessage; use crate::config::Host; @@ -5,9 +9,6 @@ use crate::connect_raw::connect_raw; use crate::connect_socket::connect_socket; use crate::tls::{MakeTlsConnect, TlsConnect}; use crate::{Client, Config, Connection, Error, RawConnection}; -use postgres_protocol2::message::backend::Message; -use tokio::net::TcpStream; -use tokio::sync::mpsc; pub async fn connect( mut tls: T, diff --git a/libs/proxy/tokio-postgres2/src/connect_raw.rs b/libs/proxy/tokio-postgres2/src/connect_raw.rs index 66db85e07d..20dc538cf2 100644 --- a/libs/proxy/tokio-postgres2/src/connect_raw.rs +++ b/libs/proxy/tokio-postgres2/src/connect_raw.rs @@ -1,22 +1,24 @@ +use std::collections::HashMap; +use std::io; +use std::pin::Pin; +use std::task::{Context, Poll}; + +use bytes::BytesMut; +use fallible_iterator::FallibleIterator; +use futures_util::{Sink, SinkExt, Stream, TryStreamExt, ready}; +use postgres_protocol2::authentication::sasl; +use postgres_protocol2::authentication::sasl::ScramSha256; +use postgres_protocol2::message::backend::{AuthenticationSaslBody, Message, NoticeResponseBody}; +use postgres_protocol2::message::frontend; +use tokio::io::{AsyncRead, AsyncWrite}; +use tokio_util::codec::Framed; + +use crate::Error; use crate::codec::{BackendMessage, BackendMessages, FrontendMessage, PostgresCodec}; use crate::config::{self, AuthKeys, Config}; use crate::connect_tls::connect_tls; use crate::maybe_tls_stream::MaybeTlsStream; use crate::tls::{TlsConnect, TlsStream}; -use crate::Error; -use bytes::BytesMut; -use fallible_iterator::FallibleIterator; -use futures_util::{ready, Sink, SinkExt, Stream, TryStreamExt}; -use postgres_protocol2::authentication::sasl; -use postgres_protocol2::authentication::sasl::ScramSha256; -use postgres_protocol2::message::backend::{AuthenticationSaslBody, Message, NoticeResponseBody}; -use postgres_protocol2::message::frontend; -use std::collections::HashMap; -use std::io; -use std::pin::Pin; -use std::task::{Context, Poll}; -use tokio::io::{AsyncRead, AsyncWrite}; -use tokio_util::codec::Framed; pub struct StartupStream { inner: Framed, PostgresCodec>, @@ -158,7 +160,7 @@ where | Some(Message::AuthenticationSspi) => { return Err(Error::authentication( "unsupported authentication method".into(), - )) + )); } Some(Message::ErrorResponse(body)) => return Err(Error::db(body)), Some(_) => return Err(Error::unexpected_message()), diff --git a/libs/proxy/tokio-postgres2/src/connect_socket.rs b/libs/proxy/tokio-postgres2/src/connect_socket.rs index 336a13317f..15411f7ef3 100644 --- a/libs/proxy/tokio-postgres2/src/connect_socket.rs +++ b/libs/proxy/tokio-postgres2/src/connect_socket.rs @@ -1,11 +1,13 @@ -use crate::config::Host; -use crate::Error; use std::future::Future; use std::io; use std::time::Duration; + use tokio::net::{self, TcpStream}; use tokio::time; +use crate::Error; +use crate::config::Host; + pub(crate) async fn connect_socket( host: &Host, port: u16, diff --git a/libs/proxy/tokio-postgres2/src/connect_tls.rs b/libs/proxy/tokio-postgres2/src/connect_tls.rs index 64b0b68abc..4dc929a9e2 
100644 --- a/libs/proxy/tokio-postgres2/src/connect_tls.rs +++ b/libs/proxy/tokio-postgres2/src/connect_tls.rs @@ -1,12 +1,13 @@ -use crate::config::SslMode; -use crate::maybe_tls_stream::MaybeTlsStream; -use crate::tls::private::ForcePrivateApi; -use crate::tls::TlsConnect; -use crate::Error; use bytes::BytesMut; use postgres_protocol2::message::frontend; use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt}; +use crate::Error; +use crate::config::SslMode; +use crate::maybe_tls_stream::MaybeTlsStream; +use crate::tls::TlsConnect; +use crate::tls::private::ForcePrivateApi; + pub async fn connect_tls( mut stream: S, mode: SslMode, @@ -19,7 +20,7 @@ where match mode { SslMode::Disable => return Ok(MaybeTlsStream::Raw(stream)), SslMode::Prefer if !tls.can_connect(ForcePrivateApi) => { - return Ok(MaybeTlsStream::Raw(stream)) + return Ok(MaybeTlsStream::Raw(stream)); } SslMode::Prefer | SslMode::Require => {} } diff --git a/libs/proxy/tokio-postgres2/src/connection.rs b/libs/proxy/tokio-postgres2/src/connection.rs index f478717e0d..60e39b3b44 100644 --- a/libs/proxy/tokio-postgres2/src/connection.rs +++ b/libs/proxy/tokio-postgres2/src/connection.rs @@ -1,22 +1,24 @@ -use crate::codec::{BackendMessage, BackendMessages, FrontendMessage, PostgresCodec}; -use crate::error::DbError; -use crate::maybe_tls_stream::MaybeTlsStream; -use crate::{AsyncMessage, Error, Notification}; -use bytes::BytesMut; -use fallible_iterator::FallibleIterator; -use futures_util::{ready, Sink, Stream}; -use log::{info, trace}; -use postgres_protocol2::message::backend::Message; -use postgres_protocol2::message::frontend; use std::collections::{HashMap, VecDeque}; use std::future::Future; use std::pin::Pin; use std::task::{Context, Poll}; + +use bytes::BytesMut; +use fallible_iterator::FallibleIterator; +use futures_util::{Sink, Stream, ready}; +use log::{info, trace}; +use postgres_protocol2::message::backend::Message; +use postgres_protocol2::message::frontend; use tokio::io::{AsyncRead, AsyncWrite}; use tokio::sync::mpsc; use tokio_util::codec::Framed; use tokio_util::sync::PollSender; +use crate::codec::{BackendMessage, BackendMessages, FrontendMessage, PostgresCodec}; +use crate::error::DbError; +use crate::maybe_tls_stream::MaybeTlsStream; +use crate::{AsyncMessage, Error, Notification}; + pub enum RequestMessages { Single(FrontendMessage), } @@ -139,7 +141,7 @@ where Some(response) => response, None => match messages.next().map_err(Error::parse)? { Some(Message::ErrorResponse(error)) => { - return Poll::Ready(Err(Error::db(error))) + return Poll::Ready(Err(Error::db(error))); } _ => return Poll::Ready(Err(Error::unexpected_message())), }, diff --git a/libs/proxy/tokio-postgres2/src/error/mod.rs b/libs/proxy/tokio-postgres2/src/error/mod.rs index 922c348525..b12e76e5bf 100644 --- a/libs/proxy/tokio-postgres2/src/error/mod.rs +++ b/libs/proxy/tokio-postgres2/src/error/mod.rs @@ -1,10 +1,10 @@ //! Errors. 
+use std::error::{self, Error as _Error}; +use std::{fmt, io}; + use fallible_iterator::FallibleIterator; use postgres_protocol2::message::backend::{ErrorFields, ErrorResponseBody}; -use std::error::{self, Error as _Error}; -use std::fmt; -use std::io; pub use self::sqlstate::*; diff --git a/libs/proxy/tokio-postgres2/src/generic_client.rs b/libs/proxy/tokio-postgres2/src/generic_client.rs index 042b5a675e..31c3d8fa3e 100644 --- a/libs/proxy/tokio-postgres2/src/generic_client.rs +++ b/libs/proxy/tokio-postgres2/src/generic_client.rs @@ -1,9 +1,10 @@ #![allow(async_fn_in_trait)] +use postgres_protocol2::Oid; + use crate::query::RowStream; use crate::types::Type; use crate::{Client, Error, Transaction}; -use postgres_protocol2::Oid; mod private { pub trait Sealed {} diff --git a/libs/proxy/tokio-postgres2/src/lib.rs b/libs/proxy/tokio-postgres2/src/lib.rs index 7426279167..c8ebba5487 100644 --- a/libs/proxy/tokio-postgres2/src/lib.rs +++ b/libs/proxy/tokio-postgres2/src/lib.rs @@ -1,6 +1,8 @@ //! An asynchronous, pipelined, PostgreSQL client. #![warn(clippy::all)] +use postgres_protocol2::message::backend::ReadyForQueryBody; + pub use crate::cancel_token::CancelToken; pub use crate::client::{Client, SocketConfig}; pub use crate::config::Config; @@ -17,7 +19,6 @@ pub use crate::tls::NoTls; pub use crate::transaction::Transaction; pub use crate::transaction_builder::{IsolationLevel, TransactionBuilder}; use crate::types::ToSql; -use postgres_protocol2::message::backend::ReadyForQueryBody; /// After executing a query, the connection will be in one of these states #[derive(Clone, Copy, Debug, PartialEq)] diff --git a/libs/proxy/tokio-postgres2/src/maybe_tls_stream.rs b/libs/proxy/tokio-postgres2/src/maybe_tls_stream.rs index 9a7e248997..4aa838613e 100644 --- a/libs/proxy/tokio-postgres2/src/maybe_tls_stream.rs +++ b/libs/proxy/tokio-postgres2/src/maybe_tls_stream.rs @@ -1,12 +1,14 @@ //! MaybeTlsStream. //! //! Represents a stream that may or may not be encrypted with TLS. -use crate::tls::{ChannelBinding, TlsStream}; use std::io; use std::pin::Pin; use std::task::{Context, Poll}; + use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; +use crate::tls::{ChannelBinding, TlsStream}; + /// A stream that may or may not be encrypted with TLS. pub enum MaybeTlsStream { /// An unencrypted stream. 
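Most of the churn in the `libs/proxy` and `libs/remote_storage` hunks comes from the `edition = "2024"` bumps in the crate manifests: `use` statements are regrouped into standard-library, external-crate, and own-crate blocks (one `use` per module instead of nested trees), and identifiers that collide with the newly reserved `gen` keyword are escaped as `r#gen` later in the diff. A minimal sketch of the resulting style follows; the imports and the rand 0.8 call are illustrative, not taken from this PR.

```rust
// Group 1: standard library, one `use` statement per module.
use std::collections::HashMap;
use std::time::Duration;

// Group 2: external crates.
use rand::Rng;

// Group 3: the current crate always comes last, e.g.
// use crate::config::SslMode;

// Rust 2024 reserves `gen` as a keyword, so calls to rand 0.8's `Rng::gen`
// must use the raw-identifier form.
fn sample(rng: &mut impl Rng) -> u32 {
    rng.r#gen::<u32>()
}

fn main() {
    let mut rng = rand::thread_rng();
    let counts: HashMap<u32, Duration> = HashMap::from([(sample(&mut rng), Duration::ZERO)]);
    println!("{counts:?}");
}
```

The grouping matches rustfmt's `group_imports = "StdExternalCrate"` style; the rustfmt configuration that presumably enforces it is not part of this diff.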
diff --git a/libs/proxy/tokio-postgres2/src/prepare.rs b/libs/proxy/tokio-postgres2/src/prepare.rs index 58bbb26cbc..b36d2e5f74 100644 --- a/libs/proxy/tokio-postgres2/src/prepare.rs +++ b/libs/proxy/tokio-postgres2/src/prepare.rs @@ -1,18 +1,19 @@ +use std::future::Future; +use std::pin::Pin; +use std::sync::Arc; + +use bytes::Bytes; +use fallible_iterator::FallibleIterator; +use futures_util::{TryStreamExt, pin_mut}; +use log::debug; +use postgres_protocol2::message::backend::Message; +use postgres_protocol2::message::frontend; + use crate::client::InnerClient; use crate::codec::FrontendMessage; use crate::connection::RequestMessages; use crate::types::{Field, Kind, Oid, Type}; -use crate::{query, slice_iter}; -use crate::{Column, Error, Statement}; -use bytes::Bytes; -use fallible_iterator::FallibleIterator; -use futures_util::{pin_mut, TryStreamExt}; -use log::debug; -use postgres_protocol2::message::backend::Message; -use postgres_protocol2::message::frontend; -use std::future::Future; -use std::pin::Pin; -use std::sync::Arc; +use crate::{Column, Error, Statement, query, slice_iter}; pub(crate) const TYPEINFO_QUERY: &str = "\ SELECT t.typname, t.typtype, t.typelem, r.rngsubtype, t.typbasetype, n.nspname, t.typrelid diff --git a/libs/proxy/tokio-postgres2/src/query.rs b/libs/proxy/tokio-postgres2/src/query.rs index e21631c85d..29f05fba79 100644 --- a/libs/proxy/tokio-postgres2/src/query.rs +++ b/libs/proxy/tokio-postgres2/src/query.rs @@ -1,22 +1,24 @@ -use crate::client::{InnerClient, Responses}; -use crate::codec::FrontendMessage; -use crate::connection::RequestMessages; -use crate::types::IsNull; -use crate::{Column, Error, ReadyForQueryStatus, Row, Statement}; -use bytes::{BufMut, Bytes, BytesMut}; -use fallible_iterator::FallibleIterator; -use futures_util::{ready, Stream}; -use log::{debug, log_enabled, Level}; -use pin_project_lite::pin_project; -use postgres_protocol2::message::backend::Message; -use postgres_protocol2::message::frontend; -use postgres_types2::{Format, ToSql, Type}; use std::fmt; use std::marker::PhantomPinned; use std::pin::Pin; use std::sync::Arc; use std::task::{Context, Poll}; +use bytes::{BufMut, Bytes, BytesMut}; +use fallible_iterator::FallibleIterator; +use futures_util::{Stream, ready}; +use log::{Level, debug, log_enabled}; +use pin_project_lite::pin_project; +use postgres_protocol2::message::backend::Message; +use postgres_protocol2::message::frontend; +use postgres_types2::{Format, ToSql, Type}; + +use crate::client::{InnerClient, Responses}; +use crate::codec::FrontendMessage; +use crate::connection::RequestMessages; +use crate::types::IsNull; +use crate::{Column, Error, ReadyForQueryStatus, Row, Statement}; + struct BorrowToSqlParamsDebug<'a>(&'a [&'a (dyn ToSql + Sync)]); impl fmt::Debug for BorrowToSqlParamsDebug<'_> { @@ -257,7 +259,7 @@ impl Stream for RowStream { this.statement.clone(), body, *this.output_format, - )?))) + )?))); } Message::EmptyQueryResponse | Message::PortalSuspended => {} Message::CommandComplete(body) => { diff --git a/libs/proxy/tokio-postgres2/src/row.rs b/libs/proxy/tokio-postgres2/src/row.rs index 10e130707d..5fc955eef4 100644 --- a/libs/proxy/tokio-postgres2/src/row.rs +++ b/libs/proxy/tokio-postgres2/src/row.rs @@ -1,17 +1,18 @@ //! Rows. 
+use std::ops::Range; +use std::sync::Arc; +use std::{fmt, str}; + +use fallible_iterator::FallibleIterator; +use postgres_protocol2::message::backend::DataRowBody; +use postgres_types2::{Format, WrongFormat}; + use crate::row::sealed::{AsName, Sealed}; use crate::simple_query::SimpleColumn; use crate::statement::Column; use crate::types::{FromSql, Type, WrongType}; use crate::{Error, Statement}; -use fallible_iterator::FallibleIterator; -use postgres_protocol2::message::backend::DataRowBody; -use postgres_types2::{Format, WrongFormat}; -use std::fmt; -use std::ops::Range; -use std::str; -use std::sync::Arc; mod sealed { pub trait Sealed {} diff --git a/libs/proxy/tokio-postgres2/src/simple_query.rs b/libs/proxy/tokio-postgres2/src/simple_query.rs index fb2550377b..f13d63983f 100644 --- a/libs/proxy/tokio-postgres2/src/simple_query.rs +++ b/libs/proxy/tokio-postgres2/src/simple_query.rs @@ -1,19 +1,21 @@ -use crate::client::{InnerClient, Responses}; -use crate::codec::FrontendMessage; -use crate::connection::RequestMessages; -use crate::{Error, ReadyForQueryStatus, SimpleQueryMessage, SimpleQueryRow}; -use bytes::Bytes; -use fallible_iterator::FallibleIterator; -use futures_util::{ready, Stream}; -use log::debug; -use pin_project_lite::pin_project; -use postgres_protocol2::message::backend::Message; -use postgres_protocol2::message::frontend; use std::marker::PhantomPinned; use std::pin::Pin; use std::sync::Arc; use std::task::{Context, Poll}; +use bytes::Bytes; +use fallible_iterator::FallibleIterator; +use futures_util::{Stream, ready}; +use log::debug; +use pin_project_lite::pin_project; +use postgres_protocol2::message::backend::Message; +use postgres_protocol2::message::frontend; + +use crate::client::{InnerClient, Responses}; +use crate::codec::FrontendMessage; +use crate::connection::RequestMessages; +use crate::{Error, ReadyForQueryStatus, SimpleQueryMessage, SimpleQueryRow}; + /// Information about a column of a single query row. 
#[derive(Debug)] pub struct SimpleColumn { diff --git a/libs/proxy/tokio-postgres2/src/statement.rs b/libs/proxy/tokio-postgres2/src/statement.rs index 591872fbc5..e4828db712 100644 --- a/libs/proxy/tokio-postgres2/src/statement.rs +++ b/libs/proxy/tokio-postgres2/src/statement.rs @@ -1,15 +1,14 @@ +use std::fmt; +use std::sync::{Arc, Weak}; + +use postgres_protocol2::Oid; +use postgres_protocol2::message::backend::Field; +use postgres_protocol2::message::frontend; + use crate::client::InnerClient; use crate::codec::FrontendMessage; use crate::connection::RequestMessages; use crate::types::Type; -use postgres_protocol2::{ - message::{backend::Field, frontend}, - Oid, -}; -use std::{ - fmt, - sync::{Arc, Weak}, -}; struct StatementInner { client: Weak, diff --git a/libs/proxy/tokio-postgres2/src/tls.rs b/libs/proxy/tokio-postgres2/src/tls.rs index dc8140719f..41b51368ff 100644 --- a/libs/proxy/tokio-postgres2/src/tls.rs +++ b/libs/proxy/tokio-postgres2/src/tls.rs @@ -5,6 +5,7 @@ use std::future::Future; use std::pin::Pin; use std::task::{Context, Poll}; use std::{fmt, io}; + use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; pub(crate) mod private { diff --git a/libs/proxy/tokio-postgres2/src/transaction.rs b/libs/proxy/tokio-postgres2/src/transaction.rs index 03a57e4947..eecbfc5873 100644 --- a/libs/proxy/tokio-postgres2/src/transaction.rs +++ b/libs/proxy/tokio-postgres2/src/transaction.rs @@ -1,8 +1,9 @@ +use postgres_protocol2::message::frontend; + use crate::codec::FrontendMessage; use crate::connection::RequestMessages; use crate::query::RowStream; use crate::{CancelToken, Client, Error, ReadyForQueryStatus}; -use postgres_protocol2::message::frontend; /// A representation of a PostgreSQL database transaction. /// diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index 33fa6e89f5..7bdf340f74 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "remote_storage" version = "0.1.0" -edition.workspace = true +edition = "2024" license.workspace = true [dependencies] diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index 9027a8bf55..dee61a410d 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -2,33 +2,26 @@ use std::borrow::Cow; use std::collections::HashMap; -use std::env; use std::fmt::Display; -use std::io; use std::num::NonZeroU32; use std::pin::Pin; use std::str::FromStr; use std::sync::Arc; -use std::time::Duration; -use std::time::SystemTime; +use std::time::{Duration, SystemTime}; +use std::{env, io}; -use super::REMOTE_STORAGE_PREFIX_SEPARATOR; -use anyhow::Context; -use anyhow::Result; +use anyhow::{Context, Result}; use azure_core::request_options::{IfMatchCondition, MaxResults, Metadata, Range}; -use azure_core::HttpClient; -use azure_core::TransportOptions; -use azure_core::{Continuable, RetryOptions}; +use azure_core::{Continuable, HttpClient, RetryOptions, TransportOptions}; use azure_storage::StorageCredentials; use azure_storage_blobs::blob::CopyStatus; -use azure_storage_blobs::prelude::ClientBuilder; -use azure_storage_blobs::{blob::operations::GetBlobBuilder, prelude::ContainerClient}; +use azure_storage_blobs::blob::operations::GetBlobBuilder; +use azure_storage_blobs::prelude::{ClientBuilder, ContainerClient}; use bytes::Bytes; +use futures::FutureExt; use futures::future::Either; use futures::stream::Stream; -use futures::FutureExt; -use futures_util::StreamExt; -use futures_util::TryStreamExt; 
+use futures_util::{StreamExt, TryStreamExt}; use http_types::{StatusCode, Url}; use scopeguard::ScopeGuard; use tokio_util::sync::CancellationToken; @@ -36,12 +29,13 @@ use tracing::debug; use utils::backoff; use utils::backoff::exponential_backoff_duration_seconds; -use crate::metrics::{start_measuring_requests, AttemptOutcome, RequestKind}; -use crate::DownloadKind; +use super::REMOTE_STORAGE_PREFIX_SEPARATOR; +use crate::config::AzureConfig; +use crate::error::Cancelled; +use crate::metrics::{AttemptOutcome, RequestKind, start_measuring_requests}; use crate::{ - config::AzureConfig, error::Cancelled, ConcurrencyLimiter, Download, DownloadError, - DownloadOpts, Listing, ListingMode, ListingObject, RemotePath, RemoteStorage, StorageMetadata, - TimeTravelError, TimeoutOrCancel, + ConcurrencyLimiter, Download, DownloadError, DownloadKind, DownloadOpts, Listing, ListingMode, + ListingObject, RemotePath, RemoteStorage, StorageMetadata, TimeTravelError, TimeoutOrCancel, }; pub struct AzureBlobStorage { diff --git a/libs/remote_storage/src/config.rs b/libs/remote_storage/src/config.rs index ff34158c9c..52978be5b4 100644 --- a/libs/remote_storage/src/config.rs +++ b/libs/remote_storage/src/config.rs @@ -1,8 +1,10 @@ -use std::{fmt::Debug, num::NonZeroUsize, str::FromStr, time::Duration}; +use std::fmt::Debug; +use std::num::NonZeroUsize; +use std::str::FromStr; +use std::time::Duration; use aws_sdk_s3::types::StorageClass; use camino::Utf8PathBuf; - use serde::{Deserialize, Serialize}; use crate::{ diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 69b522d63e..6eb5570d9b 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -18,40 +18,35 @@ mod s3_bucket; mod simulate_failures; mod support; -use std::{ - collections::HashMap, - fmt::Debug, - num::NonZeroU32, - ops::Bound, - pin::{pin, Pin}, - sync::Arc, - time::SystemTime, -}; +use std::collections::HashMap; +use std::fmt::Debug; +use std::num::NonZeroU32; +use std::ops::Bound; +use std::pin::{Pin, pin}; +use std::sync::Arc; +use std::time::SystemTime; use anyhow::Context; -use camino::{Utf8Path, Utf8PathBuf}; - +/// Azure SDK's ETag type is a simple String wrapper: we use this internally instead of repeating it here. +pub use azure_core::Etag; use bytes::Bytes; -use futures::{stream::Stream, StreamExt}; +use camino::{Utf8Path, Utf8PathBuf}; +pub use error::{DownloadError, TimeTravelError, TimeoutOrCancel}; +use futures::StreamExt; +use futures::stream::Stream; use itertools::Itertools as _; +use s3_bucket::RequestKind; use serde::{Deserialize, Serialize}; use tokio::sync::Semaphore; use tokio_util::sync::CancellationToken; use tracing::info; -pub use self::{ - azure_blob::AzureBlobStorage, local_fs::LocalFs, s3_bucket::S3Bucket, - simulate_failures::UnreliableWrapper, -}; -use s3_bucket::RequestKind; - +pub use self::azure_blob::AzureBlobStorage; +pub use self::local_fs::LocalFs; +pub use self::s3_bucket::S3Bucket; +pub use self::simulate_failures::UnreliableWrapper; pub use crate::config::{AzureConfig, RemoteStorageConfig, RemoteStorageKind, S3Config}; -/// Azure SDK's ETag type is a simple String wrapper: we use this internally instead of repeating it here. 
-pub use azure_core::Etag; - -pub use error::{DownloadError, TimeTravelError, TimeoutOrCancel}; - /// Default concurrency limit for S3 operations /// /// Currently, sync happens with AWS S3, that has two limits on requests per second: @@ -640,8 +635,13 @@ impl GenericRemoteStorage { let profile = std::env::var("AWS_PROFILE").unwrap_or_else(|_| "".into()); let access_key_id = std::env::var("AWS_ACCESS_KEY_ID").unwrap_or_else(|_| "".into()); - info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}', profile: {profile}, access_key_id: {access_key_id}", - s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint); + info!( + "Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}', profile: {profile}, access_key_id: {access_key_id}", + s3_config.bucket_name, + s3_config.bucket_region, + s3_config.prefix_in_bucket, + s3_config.endpoint + ); Self::AwsS3(Arc::new(S3Bucket::new(s3_config, timeout).await?)) } RemoteStorageKind::AzureContainer(azure_config) => { @@ -649,8 +649,12 @@ impl GenericRemoteStorage { .storage_account .as_deref() .unwrap_or(""); - info!("Using azure container '{}' in account '{storage_account}' in region '{}' as a remote storage, prefix in container: '{:?}'", - azure_config.container_name, azure_config.container_region, azure_config.prefix_in_container); + info!( + "Using azure container '{}' in account '{storage_account}' in region '{}' as a remote storage, prefix in container: '{:?}'", + azure_config.container_name, + azure_config.container_region, + azure_config.prefix_in_container + ); Self::AzureBlob(Arc::new(AzureBlobStorage::new( azure_config, timeout, diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index a8b00173ba..f03d6ac8ee 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -4,31 +4,26 @@ //! This storage used in tests, but can also be used in cases when a certain persistent //! volume is mounted to the local FS. 
-use std::{ - collections::HashSet, - io::ErrorKind, - num::NonZeroU32, - time::{Duration, SystemTime, UNIX_EPOCH}, -}; +use std::collections::HashSet; +use std::io::ErrorKind; +use std::num::NonZeroU32; +use std::time::{Duration, SystemTime, UNIX_EPOCH}; -use anyhow::{bail, ensure, Context}; +use anyhow::{Context, bail, ensure}; use bytes::Bytes; use camino::{Utf8Path, Utf8PathBuf}; use futures::stream::Stream; -use tokio::{ - fs, - io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt}, -}; -use tokio_util::{io::ReaderStream, sync::CancellationToken}; +use tokio::fs; +use tokio::io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt}; +use tokio_util::io::ReaderStream; +use tokio_util::sync::CancellationToken; use utils::crashsafe::path_with_suffix_extension; -use crate::{ - Download, DownloadError, DownloadOpts, Listing, ListingMode, ListingObject, RemotePath, - TimeTravelError, TimeoutOrCancel, REMOTE_STORAGE_PREFIX_SEPARATOR, -}; - use super::{RemoteStorage, StorageMetadata}; -use crate::Etag; +use crate::{ + Download, DownloadError, DownloadOpts, Etag, Listing, ListingMode, ListingObject, + REMOTE_STORAGE_PREFIX_SEPARATOR, RemotePath, TimeTravelError, TimeoutOrCancel, +}; const LOCAL_FS_TEMP_FILE_SUFFIX: &str = "___temp"; @@ -91,7 +86,8 @@ impl LocalFs { #[cfg(test)] async fn list_all(&self) -> anyhow::Result> { - use std::{future::Future, pin::Pin}; + use std::future::Future; + use std::pin::Pin; fn get_all_files<'a, P>( directory_path: P, ) -> Pin>> + Send + Sync + 'a>> @@ -284,7 +280,9 @@ impl LocalFs { })?; if bytes_read < from_size_bytes { - bail!("Provided stream was shorter than expected: {bytes_read} vs {from_size_bytes} bytes"); + bail!( + "Provided stream was shorter than expected: {bytes_read} vs {from_size_bytes} bytes" + ); } // Check if there is any extra data after the given size. 
let mut from = buffer_to_read.into_inner(); @@ -642,10 +640,13 @@ fn mock_etag(meta: &std::fs::Metadata) -> Etag { #[cfg(test)] mod fs_tests { - use super::*; + use std::collections::HashMap; + use std::io::Write; + use std::ops::Bound; use camino_tempfile::tempdir; - use std::{collections::HashMap, io::Write, ops::Bound}; + + use super::*; async fn read_and_check_metadata( storage: &LocalFs, @@ -736,9 +737,14 @@ mod fs_tests { ); let non_existing_path = RemotePath::new(Utf8Path::new("somewhere/else"))?; - match storage.download(&non_existing_path, &DownloadOpts::default(), &cancel).await { + match storage + .download(&non_existing_path, &DownloadOpts::default(), &cancel) + .await + { Err(DownloadError::NotFound) => {} // Should get NotFound for non existing keys - other => panic!("Should get a NotFound error when downloading non-existing storage files, but got: {other:?}"), + other => panic!( + "Should get a NotFound error when downloading non-existing storage files, but got: {other:?}" + ), } Ok(()) } diff --git a/libs/remote_storage/src/metrics.rs b/libs/remote_storage/src/metrics.rs index 48c121fbc8..81e68e9a29 100644 --- a/libs/remote_storage/src/metrics.rs +++ b/libs/remote_storage/src/metrics.rs @@ -1,5 +1,5 @@ use metrics::{ - register_histogram_vec, register_int_counter, register_int_counter_vec, Histogram, IntCounter, + Histogram, IntCounter, register_histogram_vec, register_int_counter, register_int_counter_vec, }; use once_cell::sync::Lazy; @@ -16,8 +16,8 @@ pub(crate) enum RequestKind { Head = 6, } -use scopeguard::ScopeGuard; use RequestKind::*; +use scopeguard::ScopeGuard; impl RequestKind { const fn as_str(&self) -> &'static str { diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index d3f19f0b11..ba7ce9e1e7 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -4,56 +4,50 @@ //! allowing multiple api users to independently work with the same S3 bucket, if //! their bucket prefixes are both specified and different. 
-use std::{ - borrow::Cow, - collections::HashMap, - num::NonZeroU32, - pin::Pin, - sync::Arc, - task::{Context, Poll}, - time::{Duration, SystemTime}, -}; +use std::borrow::Cow; +use std::collections::HashMap; +use std::num::NonZeroU32; +use std::pin::Pin; +use std::sync::Arc; +use std::task::{Context, Poll}; +use std::time::{Duration, SystemTime}; -use anyhow::{anyhow, Context as _}; -use aws_config::{ - default_provider::credentials::DefaultCredentialsChain, - retry::{RetryConfigBuilder, RetryMode}, - BehaviorVersion, -}; -use aws_sdk_s3::{ - config::{AsyncSleep, IdentityCache, Region, SharedAsyncSleep}, - error::SdkError, - operation::{get_object::GetObjectError, head_object::HeadObjectError}, - types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion, StorageClass}, - Client, -}; +use anyhow::{Context as _, anyhow}; +use aws_config::BehaviorVersion; +use aws_config::default_provider::credentials::DefaultCredentialsChain; +use aws_config::retry::{RetryConfigBuilder, RetryMode}; +use aws_sdk_s3::Client; +use aws_sdk_s3::config::{AsyncSleep, IdentityCache, Region, SharedAsyncSleep}; +use aws_sdk_s3::error::SdkError; +use aws_sdk_s3::operation::get_object::GetObjectError; +use aws_sdk_s3::operation::head_object::HeadObjectError; +use aws_sdk_s3::types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion, StorageClass}; use aws_smithy_async::rt::sleep::TokioSleep; -use http_body_util::StreamBody; -use http_types::StatusCode; - -use aws_smithy_types::{body::SdkBody, DateTime}; -use aws_smithy_types::{byte_stream::ByteStream, date_time::ConversionError}; +use aws_smithy_types::DateTime; +use aws_smithy_types::body::SdkBody; +use aws_smithy_types::byte_stream::ByteStream; +use aws_smithy_types::date_time::ConversionError; use bytes::Bytes; use futures::stream::Stream; use futures_util::StreamExt; +use http_body_util::StreamBody; +use http_types::StatusCode; use hyper::body::Frame; use scopeguard::ScopeGuard; use tokio_util::sync::CancellationToken; use utils::backoff; use super::StorageMetadata; -use crate::{ - config::S3Config, - error::Cancelled, - metrics::{start_counting_cancelled_wait, start_measuring_requests}, - support::PermitCarrying, - ConcurrencyLimiter, Download, DownloadError, DownloadOpts, Listing, ListingMode, ListingObject, - RemotePath, RemoteStorage, TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE_S3, - REMOTE_STORAGE_PREFIX_SEPARATOR, -}; - -use crate::metrics::AttemptOutcome; +use crate::config::S3Config; +use crate::error::Cancelled; pub(super) use crate::metrics::RequestKind; +use crate::metrics::{AttemptOutcome, start_counting_cancelled_wait, start_measuring_requests}; +use crate::support::PermitCarrying; +use crate::{ + ConcurrencyLimiter, Download, DownloadError, DownloadOpts, Listing, ListingMode, ListingObject, + MAX_KEYS_PER_DELETE_S3, REMOTE_STORAGE_PREFIX_SEPARATOR, RemotePath, RemoteStorage, + TimeTravelError, TimeoutOrCancel, +}; /// AWS S3 storage. pub struct S3Bucket { @@ -958,8 +952,10 @@ impl RemoteStorage for S3Bucket { version_id, key, .. 
} = &vd; if version_id == "null" { - return Err(TimeTravelError::Other(anyhow!("Received ListVersions response for key={key} with version_id='null', \ - indicating either disabled versioning, or legacy objects with null version id values"))); + return Err(TimeTravelError::Other(anyhow!( + "Received ListVersions response for key={key} with version_id='null', \ + indicating either disabled versioning, or legacy objects with null version id values" + ))); } tracing::trace!( "Parsing version key={key} version_id={version_id} kind={:?}", @@ -1126,9 +1122,10 @@ impl VerOrDelete { #[cfg(test)] mod tests { - use camino::Utf8Path; use std::num::NonZeroUsize; + use camino::Utf8Path; + use crate::{RemotePath, S3Bucket, S3Config}; #[tokio::test] diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs index 63c24beb51..f56be873c4 100644 --- a/libs/remote_storage/src/simulate_failures.rs +++ b/libs/remote_storage/src/simulate_failures.rs @@ -1,14 +1,15 @@ //! This module provides a wrapper around a real RemoteStorage implementation that //! causes the first N attempts at each upload or download operatio to fail. For //! testing purposes. -use bytes::Bytes; -use futures::stream::Stream; -use futures::StreamExt; use std::collections::HashMap; +use std::collections::hash_map::Entry; use std::num::NonZeroU32; -use std::sync::Mutex; +use std::sync::{Arc, Mutex}; use std::time::SystemTime; -use std::{collections::hash_map::Entry, sync::Arc}; + +use bytes::Bytes; +use futures::StreamExt; +use futures::stream::Stream; use tokio_util::sync::CancellationToken; use crate::{ diff --git a/libs/remote_storage/src/support.rs b/libs/remote_storage/src/support.rs index 1ed9ed9305..07da38cf77 100644 --- a/libs/remote_storage/src/support.rs +++ b/libs/remote_storage/src/support.rs @@ -1,9 +1,7 @@ -use std::{ - future::Future, - pin::Pin, - task::{Context, Poll}, - time::Duration, -}; +use std::future::Future; +use std::pin::Pin; +use std::task::{Context, Poll}; +use std::time::Duration; use bytes::Bytes; use futures_util::Stream; @@ -114,9 +112,10 @@ pub(crate) fn cancel_or_timeout( #[cfg(test)] mod tests { + use futures::stream::StreamExt; + use super::*; use crate::DownloadError; - use futures::stream::StreamExt; #[tokio::test(start_paused = true)] async fn cancelled_download_stream() { diff --git a/libs/remote_storage/tests/common/tests.rs b/libs/remote_storage/tests/common/tests.rs index d5da1d48e9..6a78ddc01e 100644 --- a/libs/remote_storage/tests/common/tests.rs +++ b/libs/remote_storage/tests/common/tests.rs @@ -1,19 +1,20 @@ +use std::collections::HashSet; +use std::num::NonZeroU32; +use std::ops::Bound; +use std::sync::Arc; + use anyhow::Context; use camino::Utf8Path; use futures::StreamExt; use remote_storage::{DownloadError, DownloadOpts, ListingMode, ListingObject, RemotePath}; -use std::ops::Bound; -use std::sync::Arc; -use std::{collections::HashSet, num::NonZeroU32}; use test_context::test_context; use tokio_util::sync::CancellationToken; use tracing::debug; -use crate::common::{download_to_vec, upload_stream, wrap_stream}; - use super::{ MaybeEnabledStorage, MaybeEnabledStorageWithSimpleTestBlobs, MaybeEnabledStorageWithTestBlobs, }; +use crate::common::{download_to_vec, upload_stream, wrap_stream}; /// Tests that S3 client can list all prefixes, even if the response come paginated and requires multiple S3 queries. /// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified. 
@@ -62,7 +63,8 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a .into_iter() .collect::>(); assert_eq!( - root_remote_prefixes, HashSet::from([base_prefix.clone()]), + root_remote_prefixes, + HashSet::from([base_prefix.clone()]), "remote storage root prefixes list mismatches with the uploads. Returned prefixes: {root_remote_prefixes:?}" ); @@ -84,7 +86,8 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a .difference(&nested_remote_prefixes) .collect::>(); assert_eq!( - remote_only_prefixes.len() + missing_uploaded_prefixes.len(), 0, + remote_only_prefixes.len() + missing_uploaded_prefixes.len(), + 0, "remote storage nested prefixes list mismatches with the uploads. Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}", ); @@ -119,7 +122,8 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a .difference(&nested_remote_prefixes_combined) .collect::>(); assert_eq!( - remote_only_prefixes.len() + missing_uploaded_prefixes.len(), 0, + remote_only_prefixes.len() + missing_uploaded_prefixes.len(), + 0, "remote storage nested prefixes list mismatches with the uploads. Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}", ); diff --git a/libs/remote_storage/tests/test_real_azure.rs b/libs/remote_storage/tests/test_real_azure.rs index 15004dbf83..31c9ca3200 100644 --- a/libs/remote_storage/tests/test_real_azure.rs +++ b/libs/remote_storage/tests/test_real_azure.rs @@ -1,9 +1,9 @@ +use std::collections::HashSet; use std::env; use std::num::NonZeroUsize; use std::ops::ControlFlow; use std::sync::Arc; -use std::time::UNIX_EPOCH; -use std::{collections::HashSet, time::Duration}; +use std::time::{Duration, UNIX_EPOCH}; use anyhow::Context; use remote_storage::{ @@ -208,7 +208,7 @@ async fn create_azure_client( .as_millis(); // because nanos can be the same for two threads so can millis, add randomness - let random = rand::thread_rng().gen::(); + let random = rand::thread_rng().r#gen::(); let remote_storage_config = RemoteStorageConfig { storage: RemoteStorageKind::AzureContainer(AzureConfig { diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs index e60ec18c93..6996bb27ae 100644 --- a/libs/remote_storage/tests/test_real_s3.rs +++ b/libs/remote_storage/tests/test_real_s3.rs @@ -1,13 +1,12 @@ +use std::collections::HashSet; use std::env; use std::fmt::{Debug, Display}; use std::future::Future; use std::num::NonZeroUsize; use std::ops::ControlFlow; use std::sync::Arc; -use std::time::{Duration, UNIX_EPOCH}; -use std::{collections::HashSet, time::SystemTime}; +use std::time::{Duration, SystemTime, UNIX_EPOCH}; -use crate::common::{download_to_vec, upload_stream}; use anyhow::Context; use camino::Utf8Path; use futures_util::StreamExt; @@ -15,12 +14,13 @@ use remote_storage::{ DownloadError, DownloadOpts, GenericRemoteStorage, ListingMode, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config, }; -use test_context::test_context; -use test_context::AsyncTestContext; +use test_context::{AsyncTestContext, test_context}; use tokio::io::AsyncBufReadExt; use tokio_util::sync::CancellationToken; use tracing::info; +use crate::common::{download_to_vec, upload_stream}; + mod common; #[path = "common/tests.rs"] @@ -128,8 +128,10 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow: let t0_hwt = t0 + half_wt; let t1_hwt = t1 - half_wt; if 
!(t0_hwt..=t1_hwt).contains(&last_modified) { - panic!("last_modified={last_modified:?} is not between t0_hwt={t0_hwt:?} and t1_hwt={t1_hwt:?}. \ - This likely means a large lock discrepancy between S3 and the local clock."); + panic!( + "last_modified={last_modified:?} is not between t0_hwt={t0_hwt:?} and t1_hwt={t1_hwt:?}. \ + This likely means a large lock discrepancy between S3 and the local clock." + ); } } @@ -383,7 +385,7 @@ async fn create_s3_client( .as_millis(); // because nanos can be the same for two threads so can millis, add randomness - let random = rand::thread_rng().gen::(); + let random = rand::thread_rng().r#gen::(); let remote_storage_config = RemoteStorageConfig { storage: RemoteStorageKind::AwsS3(S3Config { diff --git a/libs/safekeeper_api/Cargo.toml b/libs/safekeeper_api/Cargo.toml index 6b72ace019..d9d080e8fe 100644 --- a/libs/safekeeper_api/Cargo.toml +++ b/libs/safekeeper_api/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "safekeeper_api" version = "0.1.0" -edition.workspace = true +edition = "2024" license.workspace = true [dependencies] diff --git a/libs/safekeeper_api/src/membership.rs b/libs/safekeeper_api/src/membership.rs index 8b14a4f290..bb8934744a 100644 --- a/libs/safekeeper_api/src/membership.rs +++ b/libs/safekeeper_api/src/membership.rs @@ -2,7 +2,8 @@ //! rfcs/035-safekeeper-dynamic-membership-change.md //! for details. -use std::{collections::HashSet, fmt::Display}; +use std::collections::HashSet; +use std::fmt::Display; use anyhow; use anyhow::bail; @@ -68,14 +69,12 @@ impl Display for SafekeeperId { #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] #[serde(transparent)] pub struct MemberSet { - pub members: Vec, + pub m: Vec, } impl MemberSet { pub fn empty() -> Self { - MemberSet { - members: Vec::new(), - } + MemberSet { m: Vec::new() } } pub fn new(members: Vec) -> anyhow::Result { @@ -83,21 +82,21 @@ impl MemberSet { if hs.len() != members.len() { bail!("duplicate safekeeper id in the set {:?}", members); } - Ok(MemberSet { members }) + Ok(MemberSet { m: members }) } - pub fn contains(&self, sk: &SafekeeperId) -> bool { - self.members.iter().any(|m| m.id == sk.id) + pub fn contains(&self, sk: NodeId) -> bool { + self.m.iter().any(|m| m.id == sk) } pub fn add(&mut self, sk: SafekeeperId) -> anyhow::Result<()> { - if self.contains(&sk) { + if self.contains(sk.id) { bail!(format!( "sk {} is already member of the set {}", sk.id, self )); } - self.members.push(sk); + self.m.push(sk); Ok(()) } } @@ -105,11 +104,7 @@ impl MemberSet { impl Display for MemberSet { /// Display as a comma separated list of members. fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let sks_str = self - .members - .iter() - .map(|m| m.to_string()) - .collect::>(); + let sks_str = self.m.iter().map(|sk| sk.to_string()).collect::>(); write!(f, "({})", sks_str.join(", ")) } } @@ -135,6 +130,11 @@ impl Configuration { new_members: None, } } + + /// Is `sk_id` member of the configuration? 
+ pub fn contains(&self, sk_id: NodeId) -> bool { + self.members.contains(sk_id) || self.new_members.as_ref().is_some_and(|m| m.contains(sk_id)) + } } impl Display for Configuration { @@ -154,9 +154,10 @@ impl Display for Configuration { #[cfg(test)] mod tests { - use super::{MemberSet, SafekeeperId}; use utils::id::NodeId; + use super::{MemberSet, SafekeeperId}; + #[test] fn test_member_set() { let mut members = MemberSet::empty(); diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs index 41ccdaa428..2f2aeaa429 100644 --- a/libs/safekeeper_api/src/models.rs +++ b/libs/safekeeper_api/src/models.rs @@ -1,18 +1,17 @@ //! Types used in safekeeper http API. Many of them are also reused internally. +use std::net::SocketAddr; + use pageserver_api::shard::ShardIdentity; use postgres_ffi::TimestampTz; use serde::{Deserialize, Serialize}; -use std::net::SocketAddr; use tokio::time::Instant; +use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId}; +use utils::lsn::Lsn; +use utils::pageserver_feedback::PageserverFeedback; -use utils::{ - id::{NodeId, TenantId, TenantTimelineId, TimelineId}, - lsn::Lsn, - pageserver_feedback::PageserverFeedback, -}; - -use crate::{membership::Configuration, ServerInfo, Term}; +use crate::membership::Configuration; +use crate::{ServerInfo, Term}; #[derive(Debug, Serialize)] pub struct SafekeeperStatus { diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 62e0f4cfba..5020d82adf 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -27,7 +27,7 @@ humantime.workspace = true fail.workspace = true futures = { workspace = true } jsonwebtoken.workspace = true -nix = {workspace = true, features = [ "ioctl" ] } +nix = { workspace = true, features = ["ioctl"] } once_cell.workspace = true pin-project-lite.workspace = true regex.workspace = true @@ -61,6 +61,7 @@ bytes.workspace = true criterion.workspace = true hex-literal.workspace = true camino-tempfile.workspace = true +pprof.workspace = true serde_assert.workspace = true tokio = { workspace = true, features = ["test-util"] } diff --git a/libs/utils/benches/README.md b/libs/utils/benches/README.md new file mode 100644 index 0000000000..5afbe3cf2b --- /dev/null +++ b/libs/utils/benches/README.md @@ -0,0 +1,26 @@ +## Utils Benchmarks + +To run benchmarks: + +```sh +# All benchmarks. +cargo bench --package utils + +# Specific file. +cargo bench --package utils --bench benchmarks + +# Specific benchmark. +cargo bench --package utils --bench benchmarks log_slow/enabled=true + +# List available benchmarks. +cargo bench --package utils --benches -- --list + +# Generate flamegraph profiles using pprof-rs, profiling for 10 seconds. +# Output in target/criterion/*/profile/flamegraph.svg. +cargo bench --package utils --bench benchmarks log_slow/enabled=true --profile-time 10 +``` + +Additional charts and statistics are available in `target/criterion/report/index.html`. + +Benchmarks are automatically compared against the previous run. To compare against other runs, see +`--baseline` and `--save-baseline`. 
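The `safekeeper_api` membership changes above rename `MemberSet::members` to `m` (the serialized form stays a bare array thanks to `#[serde(transparent)]`), make `contains` take a `NodeId` by value instead of a `&SafekeeperId`, and add `Configuration::contains`, which treats a node as a member if it appears in either the current member set or the prospective one during a joint configuration. A small usage sketch follows; the helper and its call site are hypothetical, not part of this PR.

```rust
use safekeeper_api::membership::Configuration;
use utils::id::NodeId;

/// Hypothetical helper: accept a connection only from safekeepers that belong to
/// the configuration. During a membership change this also admits nodes that are
/// only in `new_members`, because `Configuration::contains` checks both sets.
fn is_known_safekeeper(conf: &Configuration, sk_id: NodeId) -> bool {
    conf.contains(sk_id)
}
```

Taking a bare `NodeId` means call sites no longer need a full `SafekeeperId` (host and port) just to answer a membership question.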
\ No newline at end of file diff --git a/libs/utils/benches/benchmarks.rs b/libs/utils/benches/benchmarks.rs index 44eb36387c..348e27ac47 100644 --- a/libs/utils/benches/benchmarks.rs +++ b/libs/utils/benches/benchmarks.rs @@ -1,5 +1,18 @@ -use criterion::{criterion_group, criterion_main, Criterion}; +use std::time::Duration; + +use criterion::{criterion_group, criterion_main, Bencher, Criterion}; +use pprof::criterion::{Output, PProfProfiler}; use utils::id; +use utils::logging::log_slow; + +// Register benchmarks with Criterion. +criterion_group!( + name = benches; + config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None))); + targets = bench_id_stringify, + bench_log_slow, +); +criterion_main!(benches); pub fn bench_id_stringify(c: &mut Criterion) { // Can only use public methods. @@ -16,5 +29,31 @@ pub fn bench_id_stringify(c: &mut Criterion) { }); } -criterion_group!(benches, bench_id_stringify); -criterion_main!(benches); +pub fn bench_log_slow(c: &mut Criterion) { + for enabled in [false, true] { + c.bench_function(&format!("log_slow/enabled={enabled}"), |b| { + run_bench(b, enabled).unwrap() + }); + } + + // The actual benchmark. + fn run_bench(b: &mut Bencher, enabled: bool) -> anyhow::Result<()> { + const THRESHOLD: Duration = Duration::from_secs(1); + + // Use a multi-threaded runtime to avoid thread parking overhead when yielding. + let runtime = tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build()?; + + // Test both with and without log_slow, since we're essentially measuring Tokio scheduling + // performance too. Use a simple noop future that yields once, to avoid any scheduler fast + // paths for a ready future. + if enabled { + b.iter(|| runtime.block_on(log_slow("ready", THRESHOLD, tokio::task::yield_now()))); + } else { + b.iter(|| runtime.block_on(tokio::task::yield_now())); + } + + Ok(()) + } +} diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs index 4a6069294d..2c36942f43 100644 --- a/libs/utils/src/logging.rs +++ b/libs/utils/src/logging.rs @@ -1,9 +1,13 @@ +use std::future::Future; use std::str::FromStr; +use std::time::Duration; use anyhow::Context; use metrics::{IntCounter, IntCounterVec}; use once_cell::sync::Lazy; use strum_macros::{EnumString, VariantNames}; +use tokio::time::Instant; +use tracing::info; /// Logs a critical error, similarly to `tracing::error!`. This will: /// @@ -318,6 +322,43 @@ impl std::fmt::Debug for SecretString { } } +/// Logs a periodic message if a future is slow to complete. +/// +/// This is performance-sensitive as it's used on the GetPage read path. +/// +/// TODO: consider upgrading this to a warning, but currently it fires too often. +#[inline] +pub async fn log_slow(name: &str, threshold: Duration, f: impl Future) -> O { + // TODO: we unfortunately have to pin the future on the heap, since GetPage futures are huge and + // won't fit on the stack. + let mut f = Box::pin(f); + + let started = Instant::now(); + let mut attempt = 1; + + loop { + // NB: use timeout_at() instead of timeout() to avoid an extra clock reading in the common + // case where the timeout doesn't fire. + let deadline = started + attempt * threshold; + if let Ok(output) = tokio::time::timeout_at(deadline, &mut f).await { + // NB: we check if we exceeded the threshold even if the timeout never fired, because + // scheduling or execution delays may cause the future to succeed even if it exceeds the + // timeout. 
This costs an extra unconditional clock reading, but seems worth it to avoid + // false negatives. + let elapsed = started.elapsed(); + if elapsed >= threshold { + info!("slow {name} completed after {:.3}s", elapsed.as_secs_f64()); + } + return output; + } + + let elapsed = started.elapsed().as_secs_f64(); + info!("slow {name} still running after {elapsed:.3}s",); + + attempt += 1; + } +} + #[cfg(test)] mod tests { use metrics::{core::Opts, IntCounterVec}; diff --git a/libs/vm_monitor/Cargo.toml b/libs/vm_monitor/Cargo.toml index ba73902d38..a70465921c 100644 --- a/libs/vm_monitor/Cargo.toml +++ b/libs/vm_monitor/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "vm_monitor" version = "0.1.0" -edition.workspace = true +edition = "2024" license.workspace = true [[bin]] diff --git a/libs/vm_monitor/src/cgroup.rs b/libs/vm_monitor/src/cgroup.rs index 1d70cedcf9..dda9b23818 100644 --- a/libs/vm_monitor/src/cgroup.rs +++ b/libs/vm_monitor/src/cgroup.rs @@ -1,12 +1,10 @@ use std::fmt::{self, Debug, Formatter}; use std::time::{Duration, Instant}; -use anyhow::{anyhow, Context}; -use cgroups_rs::{ - hierarchies::{self, is_cgroup2_unified_mode}, - memory::MemController, - Subsystem, -}; +use anyhow::{Context, anyhow}; +use cgroups_rs::Subsystem; +use cgroups_rs::hierarchies::{self, is_cgroup2_unified_mode}; +use cgroups_rs::memory::MemController; use tokio::sync::watch; use tracing::{info, warn}; diff --git a/libs/vm_monitor/src/dispatcher.rs b/libs/vm_monitor/src/dispatcher.rs index c81848cb70..7b7201ab77 100644 --- a/libs/vm_monitor/src/dispatcher.rs +++ b/libs/vm_monitor/src/dispatcher.rs @@ -6,17 +6,15 @@ //! the cgroup (requesting upscale), and the signals that go to the cgroup //! (notifying it of upscale). -use anyhow::{bail, Context}; +use anyhow::{Context, bail}; use axum::extract::ws::{Message, Utf8Bytes, WebSocket}; -use futures::{ - stream::{SplitSink, SplitStream}, - SinkExt, StreamExt, -}; +use futures::stream::{SplitSink, SplitStream}; +use futures::{SinkExt, StreamExt}; use tracing::{debug, info}; use crate::protocol::{ - OutboundMsg, OutboundMsgKind, ProtocolRange, ProtocolResponse, ProtocolVersion, - PROTOCOL_MAX_VERSION, PROTOCOL_MIN_VERSION, + OutboundMsg, OutboundMsgKind, PROTOCOL_MAX_VERSION, PROTOCOL_MIN_VERSION, ProtocolRange, + ProtocolResponse, ProtocolVersion, }; /// The central handler for all communications in the monitor. diff --git a/libs/vm_monitor/src/filecache.rs b/libs/vm_monitor/src/filecache.rs index 4f5bf1c1e3..bc42347e5a 100644 --- a/libs/vm_monitor/src/filecache.rs +++ b/libs/vm_monitor/src/filecache.rs @@ -2,12 +2,14 @@ use std::num::NonZeroU64; -use crate::MiB; -use anyhow::{anyhow, Context}; -use tokio_postgres::{types::ToSql, Client, NoTls, Row}; +use anyhow::{Context, anyhow}; +use tokio_postgres::types::ToSql; +use tokio_postgres::{Client, NoTls, Row}; use tokio_util::sync::CancellationToken; use tracing::{error, info}; +use crate::MiB; + /// Manages Postgres' file cache by keeping a connection open. 
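The `log_slow` helper added to `utils::logging` above wraps an arbitrary future: it re-arms a `timeout_at` at multiples of the threshold, logs while the future is still pending, and logs once more on completion if the total time exceeded the threshold. A usage sketch follows; the wrapped operation, its name, and the one-second threshold are illustrative, not taken from this PR.

```rust
use std::time::Duration;

use utils::logging::log_slow;

// Stand-in for a potentially slow operation (e.g. a GetPage request handler).
async fn handle_request(id: u64) -> u64 {
    id
}

async fn serve(id: u64) -> u64 {
    // Logs "slow get_page still running after N.NNNs" roughly every second while
    // the future is pending, and "slow get_page completed after N.NNNs" at the end
    // if the whole call took at least one second.
    log_slow("get_page", Duration::from_secs(1), handle_request(id)).await
}
```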
#[derive(Debug)] pub struct FileCacheState { diff --git a/libs/vm_monitor/src/lib.rs b/libs/vm_monitor/src/lib.rs index 0cd97d4ca1..7c77aca35d 100644 --- a/libs/vm_monitor/src/lib.rs +++ b/libs/vm_monitor/src/lib.rs @@ -2,24 +2,26 @@ #![deny(clippy::undocumented_unsafe_blocks)] #![cfg(target_os = "linux")] +use std::fmt::Debug; +use std::net::SocketAddr; +use std::time::Duration; + use anyhow::Context; -use axum::{ - extract::{ws::WebSocket, State, WebSocketUpgrade}, - response::Response, -}; -use axum::{routing::get, Router}; +use axum::Router; +use axum::extract::ws::WebSocket; +use axum::extract::{State, WebSocketUpgrade}; +use axum::response::Response; +use axum::routing::get; use clap::Parser; use futures::Future; -use std::net::SocketAddr; -use std::{fmt::Debug, time::Duration}; +use runner::Runner; use sysinfo::{RefreshKind, System, SystemExt}; use tokio::net::TcpListener; -use tokio::{sync::broadcast, task::JoinHandle}; +use tokio::sync::broadcast; +use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; use tracing::{error, info}; -use runner::Runner; - // Code that interfaces with agent pub mod dispatcher; pub mod protocol; diff --git a/libs/vm_monitor/src/protocol.rs b/libs/vm_monitor/src/protocol.rs index 5f07435503..4fce3cdefc 100644 --- a/libs/vm_monitor/src/protocol.rs +++ b/libs/vm_monitor/src/protocol.rs @@ -35,7 +35,8 @@ use core::fmt; use std::cmp; -use serde::{de::Error, Deserialize, Serialize}; +use serde::de::Error; +use serde::{Deserialize, Serialize}; /// A Message we send to the agent. #[derive(Serialize, Deserialize, Debug, Clone)] diff --git a/libs/vm_monitor/src/runner.rs b/libs/vm_monitor/src/runner.rs index 8839f5803f..6f75ff0abd 100644 --- a/libs/vm_monitor/src/runner.rs +++ b/libs/vm_monitor/src/runner.rs @@ -7,7 +7,7 @@ use std::fmt::Debug; use std::time::{Duration, Instant}; -use anyhow::{bail, Context}; +use anyhow::{Context, bail}; use axum::extract::ws::{Message, WebSocket}; use futures::StreamExt; use tokio::sync::{broadcast, watch}; @@ -18,7 +18,7 @@ use crate::cgroup::{self, CgroupWatcher}; use crate::dispatcher::Dispatcher; use crate::filecache::{FileCacheConfig, FileCacheState}; use crate::protocol::{InboundMsg, InboundMsgKind, OutboundMsg, OutboundMsgKind, Resources}; -use crate::{bytes_to_mebibytes, get_total_system_memory, spawn_with_cancel, Args, MiB}; +use crate::{Args, MiB, bytes_to_mebibytes, get_total_system_memory, spawn_with_cancel}; /// Central struct that interacts with agent, dispatcher, and cgroup to handle /// signals from the agent. @@ -233,7 +233,9 @@ impl Runner { // // TODO: make the duration here configurable. 
if last_time.elapsed() > Duration::from_secs(5) { - bail!("haven't gotten cgroup memory stats recently enough to determine downscaling information"); + bail!( + "haven't gotten cgroup memory stats recently enough to determine downscaling information" + ); } else if last_history.samples_count <= 1 { let status = "haven't received enough cgroup memory stats yet"; info!(status, "discontinuing downscale"); diff --git a/libs/wal_decoder/proto/interpreted_wal.proto b/libs/wal_decoder/proto/interpreted_wal.proto index d68484d30f..7b40201a75 100644 --- a/libs/wal_decoder/proto/interpreted_wal.proto +++ b/libs/wal_decoder/proto/interpreted_wal.proto @@ -5,6 +5,7 @@ package interpreted_wal; message InterpretedWalRecords { repeated InterpretedWalRecord records = 1; optional uint64 next_record_lsn = 2; + optional uint64 raw_wal_start_lsn = 3; } message InterpretedWalRecord { diff --git a/libs/wal_decoder/src/models.rs b/libs/wal_decoder/src/models.rs index 51bf7e44ab..7e1934c6c3 100644 --- a/libs/wal_decoder/src/models.rs +++ b/libs/wal_decoder/src/models.rs @@ -60,7 +60,11 @@ pub struct InterpretedWalRecords { pub records: Vec, // Start LSN of the next record after the batch. // Note that said record may not belong to the current shard. - pub next_record_lsn: Option, + pub next_record_lsn: Lsn, + // Inclusive start LSN of the PG WAL from which the interpreted + // WAL records were extracted. Note that this is not necessarily the + // start LSN of the first interpreted record in the batch. + pub raw_wal_start_lsn: Option, } /// An interpreted Postgres WAL record, ready to be handled by the pageserver diff --git a/libs/wal_decoder/src/wire_format.rs b/libs/wal_decoder/src/wire_format.rs index 944ee5c919..52ed5c70b5 100644 --- a/libs/wal_decoder/src/wire_format.rs +++ b/libs/wal_decoder/src/wire_format.rs @@ -167,7 +167,8 @@ impl TryFrom for proto::InterpretedWalRecords { .collect::, _>>()?; Ok(proto::InterpretedWalRecords { records, - next_record_lsn: value.next_record_lsn.map(|l| l.0), + next_record_lsn: Some(value.next_record_lsn.0), + raw_wal_start_lsn: value.raw_wal_start_lsn.map(|l| l.0), }) } } @@ -254,7 +255,11 @@ impl TryFrom for InterpretedWalRecords { Ok(InterpretedWalRecords { records, - next_record_lsn: value.next_record_lsn.map(Lsn::from), + next_record_lsn: value + .next_record_lsn + .map(Lsn::from) + .expect("Always provided"), + raw_wal_start_lsn: value.raw_wal_start_lsn.map(Lsn::from), }) } } diff --git a/libs/walproposer/src/walproposer.rs b/libs/walproposer/src/walproposer.rs index ba75171db2..60b606c64a 100644 --- a/libs/walproposer/src/walproposer.rs +++ b/libs/walproposer/src/walproposer.rs @@ -215,6 +215,7 @@ impl Wrapper { syncSafekeepers: config.sync_safekeepers, systemId: 0, pgTimeline: 1, + proto_version: 3, callback_data, }; let c_config = Box::into_raw(Box::new(c_config)); @@ -276,6 +277,7 @@ mod tests { use core::panic; use std::{ cell::Cell, + ffi::CString, sync::{atomic::AtomicUsize, mpsc::sync_channel}, }; @@ -496,57 +498,64 @@ mod tests { // Messages definitions are at walproposer.h // xxx: it would be better to extract them from safekeeper crate and // use serialization/deserialization here. 
- let greeting_tag = (b'g' as u64).to_ne_bytes(); - let proto_version = 2_u32.to_ne_bytes(); - let pg_version: [u8; 4] = PG_VERSION_NUM.to_ne_bytes(); - let proposer_id = [0; 16]; - let system_id = 0_u64.to_ne_bytes(); - let tenant_id = ttid.tenant_id.as_arr(); - let timeline_id = ttid.timeline_id.as_arr(); - let pg_tli = 1_u32.to_ne_bytes(); - let wal_seg_size = 16777216_u32.to_ne_bytes(); + let greeting_tag = (b'g').to_be_bytes(); + let tenant_id = CString::new(ttid.tenant_id.to_string()) + .unwrap() + .into_bytes_with_nul(); + let timeline_id = CString::new(ttid.timeline_id.to_string()) + .unwrap() + .into_bytes_with_nul(); + let mconf_gen = 0_u32.to_be_bytes(); + let mconf_members_len = 0_u32.to_be_bytes(); + let mconf_members_new_len = 0_u32.to_be_bytes(); + let pg_version: [u8; 4] = PG_VERSION_NUM.to_be_bytes(); + let system_id = 0_u64.to_be_bytes(); + let wal_seg_size = 16777216_u32.to_be_bytes(); + let proposer_greeting = [ greeting_tag.as_slice(), - proto_version.as_slice(), - pg_version.as_slice(), - proposer_id.as_slice(), - system_id.as_slice(), tenant_id.as_slice(), timeline_id.as_slice(), - pg_tli.as_slice(), + mconf_gen.as_slice(), + mconf_members_len.as_slice(), + mconf_members_new_len.as_slice(), + pg_version.as_slice(), + system_id.as_slice(), wal_seg_size.as_slice(), ] .concat(); - let voting_tag = (b'v' as u64).to_ne_bytes(); - let vote_request_term = 3_u64.to_ne_bytes(); - let proposer_id = [0; 16]; + let voting_tag = (b'v').to_be_bytes(); + let vote_request_term = 3_u64.to_be_bytes(); let vote_request = [ voting_tag.as_slice(), + mconf_gen.as_slice(), vote_request_term.as_slice(), - proposer_id.as_slice(), ] .concat(); - let acceptor_greeting_term = 2_u64.to_ne_bytes(); - let acceptor_greeting_node_id = 1_u64.to_ne_bytes(); + let acceptor_greeting_term = 2_u64.to_be_bytes(); + let acceptor_greeting_node_id = 1_u64.to_be_bytes(); let acceptor_greeting = [ greeting_tag.as_slice(), - acceptor_greeting_term.as_slice(), acceptor_greeting_node_id.as_slice(), + mconf_gen.as_slice(), + mconf_members_len.as_slice(), + mconf_members_new_len.as_slice(), + acceptor_greeting_term.as_slice(), ] .concat(); - let vote_response_term = 3_u64.to_ne_bytes(); - let vote_given = 1_u64.to_ne_bytes(); - let flush_lsn = 0x539_u64.to_ne_bytes(); - let truncate_lsn = 0x539_u64.to_ne_bytes(); - let th_len = 1_u32.to_ne_bytes(); - let th_term = 2_u64.to_ne_bytes(); - let th_lsn = 0x539_u64.to_ne_bytes(); - let timeline_start_lsn = 0x539_u64.to_ne_bytes(); + let vote_response_term = 3_u64.to_be_bytes(); + let vote_given = 1_u8.to_be_bytes(); + let flush_lsn = 0x539_u64.to_be_bytes(); + let truncate_lsn = 0x539_u64.to_be_bytes(); + let th_len = 1_u32.to_be_bytes(); + let th_term = 2_u64.to_be_bytes(); + let th_lsn = 0x539_u64.to_be_bytes(); let vote_response = [ voting_tag.as_slice(), + mconf_gen.as_slice(), vote_response_term.as_slice(), vote_given.as_slice(), flush_lsn.as_slice(), @@ -554,7 +563,6 @@ mod tests { th_len.as_slice(), th_term.as_slice(), th_lsn.as_slice(), - timeline_start_lsn.as_slice(), ] .concat(); diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 41ac3b69b8..7330856be4 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "pageserver" version = "0.1.0" -edition.workspace = true +edition = "2024" license.workspace = true [features] @@ -40,7 +40,6 @@ num_cpus.workspace = true num-traits.workspace = true once_cell.workspace = true pin-project-lite.workspace = true -postgres.workspace = true postgres_backend.workspace = true 
postgres-protocol.workspace = true postgres-types.workspace = true diff --git a/pageserver/benches/bench_ingest.rs b/pageserver/benches/bench_ingest.rs index b67a9cc479..b1103948d6 100644 --- a/pageserver/benches/bench_ingest.rs +++ b/pageserver/benches/bench_ingest.rs @@ -1,22 +1,20 @@ -use std::{env, num::NonZeroUsize}; +use std::env; +use std::num::NonZeroUsize; use bytes::Bytes; use camino::Utf8PathBuf; -use criterion::{criterion_group, criterion_main, Criterion}; -use pageserver::{ - config::PageServerConf, - context::{DownloadBehavior, RequestContext}, - l0_flush::{L0FlushConfig, L0FlushGlobalState}, - page_cache, - task_mgr::TaskKind, - tenant::storage_layer::InMemoryLayer, - virtual_file, -}; -use pageserver_api::{key::Key, shard::TenantShardId, value::Value}; -use utils::{ - bin_ser::BeSer, - id::{TenantId, TimelineId}, -}; +use criterion::{Criterion, criterion_group, criterion_main}; +use pageserver::config::PageServerConf; +use pageserver::context::{DownloadBehavior, RequestContext}; +use pageserver::l0_flush::{L0FlushConfig, L0FlushGlobalState}; +use pageserver::task_mgr::TaskKind; +use pageserver::tenant::storage_layer::InMemoryLayer; +use pageserver::{page_cache, virtual_file}; +use pageserver_api::key::Key; +use pageserver_api::shard::TenantShardId; +use pageserver_api::value::Value; +use utils::bin_ser::BeSer; +use utils::id::{TenantId, TimelineId}; use wal_decoder::serialized_batch::SerializedValueBatch; // A very cheap hash for generating non-sequential keys. diff --git a/pageserver/benches/bench_layer_map.rs b/pageserver/benches/bench_layer_map.rs index 5c5b52db44..e11af49449 100644 --- a/pageserver/benches/bench_layer_map.rs +++ b/pageserver/benches/bench_layer_map.rs @@ -1,23 +1,21 @@ -use criterion::measurement::WallTime; -use pageserver::keyspace::{KeyPartitioning, KeySpace}; -use pageserver::tenant::layer_map::LayerMap; -use pageserver::tenant::storage_layer::LayerName; -use pageserver::tenant::storage_layer::PersistentLayerDesc; -use pageserver_api::key::Key; -use pageserver_api::shard::TenantShardId; -use rand::prelude::{SeedableRng, SliceRandom, StdRng}; use std::cmp::{max, min}; use std::fs::File; use std::io::{BufRead, BufReader}; use std::path::PathBuf; use std::str::FromStr; use std::time::Instant; + +use criterion::measurement::WallTime; +use criterion::{BenchmarkGroup, Criterion, black_box, criterion_group, criterion_main}; +use pageserver::keyspace::{KeyPartitioning, KeySpace}; +use pageserver::tenant::layer_map::LayerMap; +use pageserver::tenant::storage_layer::{LayerName, PersistentLayerDesc}; +use pageserver_api::key::Key; +use pageserver_api::shard::TenantShardId; +use rand::prelude::{SeedableRng, SliceRandom, StdRng}; use utils::id::{TenantId, TimelineId}; - use utils::lsn::Lsn; -use criterion::{black_box, criterion_group, criterion_main, BenchmarkGroup, Criterion}; - fn fixture_path(relative: &str) -> PathBuf { PathBuf::from(env!("CARGO_MANIFEST_DIR")).join(relative) } diff --git a/pageserver/benches/bench_walredo.rs b/pageserver/benches/bench_walredo.rs index d3551b56e1..77b3f90b3e 100644 --- a/pageserver/benches/bench_walredo.rs +++ b/pageserver/benches/bench_walredo.rs @@ -56,20 +56,23 @@ //! medium/128 time: [10.412 ms 10.574 ms 10.718 ms] //! 
``` +use std::future::Future; +use std::sync::Arc; +use std::time::{Duration, Instant}; + use anyhow::Context; use bytes::{Buf, Bytes}; use criterion::{BenchmarkId, Criterion}; use once_cell::sync::Lazy; -use pageserver::{config::PageServerConf, walredo::PostgresRedoManager}; +use pageserver::config::PageServerConf; +use pageserver::walredo::PostgresRedoManager; +use pageserver_api::key::Key; use pageserver_api::record::NeonWalRecord; -use pageserver_api::{key::Key, shard::TenantShardId}; -use std::{ - future::Future, - sync::Arc, - time::{Duration, Instant}, -}; -use tokio::{sync::Barrier, task::JoinSet}; -use utils::{id::TenantId, lsn::Lsn}; +use pageserver_api::shard::TenantShardId; +use tokio::sync::Barrier; +use tokio::task::JoinSet; +use utils::id::TenantId; +use utils::lsn::Lsn; fn bench(c: &mut Criterion) { macro_rules! bench_group { diff --git a/pageserver/benches/upload_queue.rs b/pageserver/benches/upload_queue.rs index ed5daa8ae1..8de06a6c25 100644 --- a/pageserver/benches/upload_queue.rs +++ b/pageserver/benches/upload_queue.rs @@ -1,15 +1,15 @@ //! Upload queue benchmarks. use std::str::FromStr as _; -use std::sync::atomic::AtomicU32; use std::sync::Arc; +use std::sync::atomic::AtomicU32; -use criterion::{criterion_group, criterion_main, Bencher, Criterion}; +use criterion::{Bencher, Criterion, criterion_group, criterion_main}; +use pageserver::tenant::IndexPart; use pageserver::tenant::metadata::TimelineMetadata; use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; use pageserver::tenant::storage_layer::LayerName; use pageserver::tenant::upload_queue::{Delete, UploadOp, UploadQueue, UploadTask}; -use pageserver::tenant::IndexPart; use pprof::criterion::{Output, PProfProfiler}; use utils::generation::Generation; use utils::shard::{ShardCount, ShardIndex, ShardNumber}; diff --git a/pageserver/client/Cargo.toml b/pageserver/client/Cargo.toml index db77a395e0..970a437a42 100644 --- a/pageserver/client/Cargo.toml +++ b/pageserver/client/Cargo.toml @@ -21,5 +21,4 @@ tokio.workspace = true futures.workspace = true tokio-util.workspace = true anyhow.workspace = true -postgres.workspace = true bytes.workspace = true diff --git a/pageserver/client/src/page_service.rs b/pageserver/client/src/page_service.rs index 27280912b4..47da83b0eb 100644 --- a/pageserver/client/src/page_service.rs +++ b/pageserver/client/src/page_service.rs @@ -34,7 +34,8 @@ pub struct BasebackupRequest { impl Client { pub async fn new(connstring: String) -> anyhow::Result { - let (client, connection) = tokio_postgres::connect(&connstring, postgres::NoTls).await?; + let (client, connection) = + tokio_postgres::connect(&connstring, tokio_postgres::NoTls).await?; let conn_task_cancel = CancellationToken::new(); let conn_task = tokio::spawn({ diff --git a/pageserver/compaction/src/helpers.rs b/pageserver/compaction/src/helpers.rs index 6b739d85a7..7e4e3042b3 100644 --- a/pageserver/compaction/src/helpers.rs +++ b/pageserver/compaction/src/helpers.rs @@ -221,12 +221,12 @@ where // performed implicitly when `top` is dropped). 
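With the `postgres` crate removed from the pageserver dependencies, the plain connector in the `page_service` client now comes from `tokio_postgres` itself. The general shape of that API, shown here as a self-contained sketch (connection string handling and logging are illustrative only):

```rust
use tokio_postgres::NoTls;

async fn connect(connstring: &str) -> anyhow::Result<tokio_postgres::Client> {
    // `connect` returns the client plus a `Connection` future that must be
    // driven for the client to make progress, so it is spawned onto the runtime.
    let (client, connection) = tokio_postgres::connect(connstring, NoTls).await?;
    tokio::spawn(async move {
        if let Err(e) = connection.await {
            eprintln!("postgres connection error: {e}");
        }
    });
    Ok(client)
}
```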
if let Some(mut top) = this.heap.peek_mut() { match top.deref_mut() { - LazyLoadLayer::Unloaded(ref mut l) => { + LazyLoadLayer::Unloaded(l) => { let fut = l.load_keys(this.ctx); this.load_future.set(Some(Box::pin(fut))); continue; } - LazyLoadLayer::Loaded(ref mut entries) => { + LazyLoadLayer::Loaded(entries) => { let result = entries.pop_front().unwrap(); if entries.is_empty() { std::collections::binary_heap::PeekMut::pop(top); diff --git a/pageserver/ctl/src/key.rs b/pageserver/ctl/src/key.rs index af4b5a21ab..c7f0719c41 100644 --- a/pageserver/ctl/src/key.rs +++ b/pageserver/ctl/src/key.rs @@ -345,6 +345,7 @@ impl AuxFileV2 { AuxFileV2::Recognized("pg_logical/replorigin_checkpoint", hash) } (2, 1) => AuxFileV2::Recognized("pg_replslot/", hash), + (3, 1) => AuxFileV2::Recognized("pg_stat/pgstat.stat", hash), (1, 0xff) => AuxFileV2::OtherWithPrefix("pg_logical/", hash), (0xff, 0xff) => AuxFileV2::Other(hash), _ => return None, diff --git a/pageserver/pagebench/src/util/request_stats.rs b/pageserver/pagebench/src/util/request_stats.rs index 4aa6950782..ebe7bc031d 100644 --- a/pageserver/pagebench/src/util/request_stats.rs +++ b/pageserver/pagebench/src/util/request_stats.rs @@ -40,9 +40,7 @@ impl Stats { } } pub(crate) fn add(&mut self, other: &Self) { - let Self { - ref mut latency_histo, - } = self; + let Self { latency_histo } = self; latency_histo.add(&other.latency_histo).unwrap(); } } diff --git a/pageserver/src/assert_u64_eq_usize.rs b/pageserver/src/assert_u64_eq_usize.rs index 66ca7fd057..c4b8d9acba 100644 --- a/pageserver/src/assert_u64_eq_usize.rs +++ b/pageserver/src/assert_u64_eq_usize.rs @@ -2,7 +2,9 @@ pub(crate) const _ASSERT_U64_EQ_USIZE: () = { if std::mem::size_of::() != std::mem::size_of::() { - panic!("the traits defined in this module assume that usize and u64 can be converted to each other without loss of information"); + panic!( + "the traits defined in this module assume that usize and u64 can be converted to each other without loss of information" + ); } }; diff --git a/pageserver/src/aux_file.rs b/pageserver/src/aux_file.rs index 5e527b7d61..b76c0e045f 100644 --- a/pageserver/src/aux_file.rs +++ b/pageserver/src/aux_file.rs @@ -2,7 +2,7 @@ use std::sync::Arc; use ::metrics::IntGauge; use bytes::{Buf, BufMut, Bytes}; -use pageserver_api::key::{Key, AUX_KEY_PREFIX, METADATA_KEY_SIZE}; +use pageserver_api::key::{AUX_KEY_PREFIX, Key, METADATA_KEY_SIZE}; use tracing::warn; // BEGIN Copyright (c) 2017 Servo Contributors @@ -39,6 +39,7 @@ fn aux_hash_to_metadata_key(dir_level1: u8, dir_level2: u8, data: &[u8]) -> Key const AUX_DIR_PG_LOGICAL: u8 = 0x01; const AUX_DIR_PG_REPLSLOT: u8 = 0x02; +const AUX_DIR_PG_STAT: u8 = 0x03; const AUX_DIR_PG_UNKNOWN: u8 = 0xFF; /// Encode the aux file into a fixed-size key. @@ -53,6 +54,7 @@ const AUX_DIR_PG_UNKNOWN: u8 = 0xFF; /// * pg_logical/replorigin_checkpoint -> 0x0103 /// * pg_logical/others -> 0x01FF /// * pg_replslot/ -> 0x0201 +/// * pg_stat/pgstat.stat -> 0x0301 /// * others -> 0xFFFF /// /// If you add new AUX files to this function, please also add a test case to `test_encoding_portable`. 
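The doc-comment table above (including the new `pg_stat/pgstat.stat -> 0x0301` entry) encodes each aux file as a two-byte directory prefix followed by a hash of the remaining path. A rough illustration of just the prefix classification; the real `encode_aux_file_key` in the next hunk builds a full 16-byte metadata key:

```rust
// Directory prefix bytes, mirroring the constants above.
const AUX_DIR_PG_LOGICAL: u8 = 0x01;
const AUX_DIR_PG_REPLSLOT: u8 = 0x02;
const AUX_DIR_PG_STAT: u8 = 0x03;
const AUX_DIR_PG_UNKNOWN: u8 = 0xFF;

/// Map an aux-file path to its (dir_level1, dir_level2) prefix and the part
/// of the path that gets hashed into the remaining key bytes.
fn classify(path: &str) -> (u8, u8, &str) {
    if let Some(fname) = path.strip_prefix("pg_replslot/") {
        (AUX_DIR_PG_REPLSLOT, 0x01, fname)
    } else if let Some(fname) = path.strip_prefix("pg_stat/") {
        (AUX_DIR_PG_STAT, 0x01, fname)
    } else if path.strip_prefix("pg_logical/").is_some() {
        // The real code further distinguishes several well-known pg_logical
        // entries (0x0101..0x01FF); collapsed here for brevity.
        (AUX_DIR_PG_LOGICAL, 0xFF, path)
    } else {
        (AUX_DIR_PG_UNKNOWN, 0xFF, path)
    }
}
```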
@@ -75,6 +77,8 @@ pub fn encode_aux_file_key(path: &str) -> Key { aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0xFF, fname.as_bytes()) } else if let Some(fname) = path.strip_prefix("pg_replslot/") { aux_hash_to_metadata_key(AUX_DIR_PG_REPLSLOT, 0x01, fname.as_bytes()) + } else if let Some(fname) = path.strip_prefix("pg_stat/") { + aux_hash_to_metadata_key(AUX_DIR_PG_STAT, 0x01, fname.as_bytes()) } else { if cfg!(debug_assertions) { warn!( diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index e03b1bbe96..ce54bd9c1c 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -10,33 +10,31 @@ //! This module is responsible for creation of such tarball //! from data stored in object storage. //! -use anyhow::{anyhow, Context}; -use bytes::{BufMut, Bytes, BytesMut}; -use fail::fail_point; -use pageserver_api::key::{rel_block_to_key, Key}; -use postgres_ffi::pg_constants; use std::fmt::Write as FmtWrite; use std::time::{Instant, SystemTime}; + +use anyhow::{Context, anyhow}; +use bytes::{BufMut, Bytes, BytesMut}; +use fail::fail_point; +use pageserver_api::key::{Key, rel_block_to_key}; +use pageserver_api::reltag::{RelTag, SlruKind}; +use postgres_ffi::pg_constants::{ + DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID, PG_HBA, PGDATA_SPECIAL_FILES, +}; +use postgres_ffi::relfile_utils::{INIT_FORKNUM, MAIN_FORKNUM}; +use postgres_ffi::{ + BLCKSZ, PG_TLI, RELSEG_SIZE, WAL_SEGMENT_SIZE, XLogFileName, dispatch_pgversion, pg_constants, +}; use tokio::io; use tokio::io::AsyncWrite; -use tracing::*; - use tokio_tar::{Builder, EntryType, Header}; +use tracing::*; +use utils::lsn::Lsn; use crate::context::RequestContext; use crate::pgdatadir_mapping::Version; -use crate::tenant::storage_layer::IoConcurrency; use crate::tenant::Timeline; -use pageserver_api::reltag::{RelTag, SlruKind}; - -use postgres_ffi::dispatch_pgversion; -use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID}; -use postgres_ffi::pg_constants::{PGDATA_SPECIAL_FILES, PG_HBA}; -use postgres_ffi::relfile_utils::{INIT_FORKNUM, MAIN_FORKNUM}; -use postgres_ffi::XLogFileName; -use postgres_ffi::PG_TLI; -use postgres_ffi::{BLCKSZ, RELSEG_SIZE, WAL_SEGMENT_SIZE}; -use utils::lsn::Lsn; +use crate::tenant::storage_layer::IoConcurrency; #[derive(Debug, thiserror::Error)] pub enum BasebackupError { @@ -264,6 +262,31 @@ where async fn send_tarball(mut self) -> Result<(), BasebackupError> { // TODO include checksum + // Construct the pg_control file from the persisted checkpoint and pg_control + // information. But we only add this to the tarball at the end, so that if the + // writing is interrupted half-way through, the resulting incomplete tarball will + // be missing the pg_control file, which prevents PostgreSQL from starting up on + // it. With proper error handling, you should never try to start up from an + // incomplete basebackup in the first place, of course, but this is a nice little + // extra safety measure. 
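The comment above captures the ordering that matters in this change: pg_control is generated up front, so any failure surfaces before streaming starts and `was_shutdown` can gate what else goes into the tarball, but it is appended last so a truncated tarball can never pass for a bootable data directory. A schematic, self-contained sketch of that shape (the real code drives a tokio-tar `Builder` and gets pg_control from `postgres_ffi::generate_pg_control`):

```rust
struct Entry {
    path: String,
    content: Vec<u8>,
}

struct Tarball {
    entries: Vec<Entry>,
}

impl Tarball {
    fn add(&mut self, path: &str, content: Vec<u8>) {
        self.entries.push(Entry { path: path.to_string(), content });
    }
}

fn build_basebackup(aux_files: Vec<Entry>, pg_control: Vec<u8>, was_shutdown: bool) -> Tarball {
    let mut tar = Tarball { entries: Vec::new() };
    for f in aux_files {
        // Statistics are only meaningful when starting from a shutdown checkpoint.
        if f.path == "pg_stat/pgstat.stat" && !was_shutdown {
            continue;
        }
        let path = f.path.clone();
        tar.add(&path, f.content);
    }
    // pg_control goes in last: a backup interrupted mid-stream is missing it,
    // and PostgreSQL refuses to start from such a data directory.
    tar.add("global/pg_control", pg_control);
    tar
}
```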
+ let checkpoint_bytes = self + .timeline + .get_checkpoint(self.lsn, self.ctx) + .await + .context("failed to get checkpoint bytes")?; + let pg_control_bytes = self + .timeline + .get_control_file(self.lsn, self.ctx) + .await + .context("failed to get control bytes")?; + let (pg_control_bytes, system_identifier, was_shutdown) = + postgres_ffi::generate_pg_control( + &pg_control_bytes, + &checkpoint_bytes, + self.lsn, + self.timeline.pg_version, + )?; + let lazy_slru_download = self.timeline.get_lazy_slru_download() && !self.full_backup; let pgversion = self.timeline.pg_version; @@ -401,6 +424,10 @@ where // In future we will not generate AUX record for "pg_logical/replorigin_checkpoint" at all, // but now we should handle (skip) it for backward compatibility. continue; + } else if path == "pg_stat/pgstat.stat" && !was_shutdown { + // Drop statistic in case of abnormal termination, i.e. if we're not starting from the exact LSN + // of a shutdown checkpoint. + continue; } let header = new_tar_header(&path, content.len() as u64)?; self.ar @@ -462,8 +489,9 @@ where ))) }); - // Generate pg_control and bootstrap WAL segment. - self.add_pgcontrol_file().await?; + // Last, add the pg_control file and bootstrap WAL segment. + self.add_pgcontrol_file(pg_control_bytes, system_identifier) + .await?; self.ar .finish() .await @@ -671,7 +699,11 @@ where // Add generated pg_control file and bootstrap WAL segment. // Also send zenith.signal file with extra bootstrap data. // - async fn add_pgcontrol_file(&mut self) -> Result<(), BasebackupError> { + async fn add_pgcontrol_file( + &mut self, + pg_control_bytes: Bytes, + system_identifier: u64, + ) -> Result<(), BasebackupError> { // add zenith.signal file let mut zenith_signal = String::new(); if self.prev_record_lsn == Lsn(0) { @@ -694,24 +726,6 @@ where .await .map_err(|e| BasebackupError::Client(e, "add_pgcontrol_file,zenith.signal"))?; - let checkpoint_bytes = self - .timeline - .get_checkpoint(self.lsn, self.ctx) - .await - .context("failed to get checkpoint bytes")?; - let pg_control_bytes = self - .timeline - .get_control_file(self.lsn, self.ctx) - .await - .context("failed get control bytes")?; - - let (pg_control_bytes, system_identifier) = postgres_ffi::generate_pg_control( - &pg_control_bytes, - &checkpoint_bytes, - self.lsn, - self.timeline.pg_version, - )?; - //send pg_control let header = new_tar_header("global/pg_control", pg_control_bytes.len() as u64)?; self.ar diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index fa098e9364..703629aed5 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -3,49 +3,42 @@ //! Main entry point for the Page Server executable. 
use std::env; -use std::env::{var, VarError}; +use std::env::{VarError, var}; use std::io::Read; use std::str::FromStr; use std::sync::Arc; use std::time::Duration; -use anyhow::{anyhow, Context}; +use anyhow::{Context, anyhow}; use camino::Utf8Path; use clap::{Arg, ArgAction, Command}; - -use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp}; -use pageserver::config::PageserverIdentity; +use metrics::launch_timestamp::{LaunchTimestamp, set_launch_timestamp_metric}; +use metrics::set_build_info_metric; +use nix::sys::socket::{setsockopt, sockopt}; +use pageserver::config::{PageServerConf, PageserverIdentity}; use pageserver::controller_upcall_client::ControllerUpcallClient; +use pageserver::deletion_queue::DeletionQueue; use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task}; use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING}; -use pageserver::task_mgr::{COMPUTE_REQUEST_RUNTIME, WALRECEIVER_RUNTIME}; -use pageserver::tenant::{secondary, TenantSharedResources}; -use pageserver::{CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener}; +use pageserver::task_mgr::{ + BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME, WALRECEIVER_RUNTIME, +}; +use pageserver::tenant::{TenantSharedResources, mgr, secondary}; +use pageserver::{ + CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener, http, page_cache, page_service, + task_mgr, virtual_file, +}; +use postgres_backend::AuthType; use remote_storage::GenericRemoteStorage; use tokio::signal::unix::SignalKind; use tokio::time::Instant; use tokio_util::sync::CancellationToken; use tracing::*; - -use metrics::set_build_info_metric; -use pageserver::{ - config::PageServerConf, - deletion_queue::DeletionQueue, - http, page_cache, page_service, task_mgr, - task_mgr::{BACKGROUND_RUNTIME, MGMT_REQUEST_RUNTIME}, - tenant::mgr, - virtual_file, -}; -use postgres_backend::AuthType; +use utils::auth::{JwtAuth, SwappableJwtAuth}; use utils::crashsafe::syncfs; -use utils::failpoint_support; use utils::logging::TracingErrorLayerEnablement; -use utils::{ - auth::{JwtAuth, SwappableJwtAuth}, - logging, project_build_tag, project_git_version, - sentry_init::init_sentry, - tcp_listener, -}; +use utils::sentry_init::init_sentry; +use utils::{failpoint_support, logging, project_build_tag, project_git_version, tcp_listener}; project_git_version!(GIT_VERSION); project_build_tag!(BUILD_TAG); @@ -57,7 +50,7 @@ static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; /// This adds roughly 3% overhead for allocations on average, which is acceptable considering /// performance-sensitive code will avoid allocations as far as possible anyway. 
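The attribute change in the hunk below (`#[export_name = ...]` becoming `#[unsafe(export_name = ...)]`) follows from the `edition = "2024"` bump elsewhere in this diff: in Rust 2024, link-affecting attributes such as `export_name`, `no_mangle` and `link_section` must be wrapped in `unsafe(...)`. A tiny standalone illustration (the symbol name is arbitrary):

```rust
// Pre-2024 editions accept the bare form:
//     #[export_name = "my_symbol"]
// Edition 2024 requires the unsafe wrapper, because exporting an arbitrary
// symbol name can collide with and override other definitions at link time.
#[unsafe(export_name = "my_symbol")]
pub static MY_SYMBOL: u32 = 42;

fn main() {
    println!("{}", MY_SYMBOL);
}
```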
#[allow(non_upper_case_globals)] -#[export_name = "malloc_conf"] +#[unsafe(export_name = "malloc_conf")] pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0"; const PID_FILE_NAME: &str = "pageserver.pid"; @@ -85,6 +78,9 @@ fn main() -> anyhow::Result<()> { return Ok(()); } + // Initialize up failpoints support + let scenario = failpoint_support::init(); + let workdir = arg_matches .get_one::("workdir") .map(Utf8Path::new) @@ -134,6 +130,7 @@ fn main() -> anyhow::Result<()> { info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine"); info!(?conf.virtual_file_io_mode, "starting with virtual_file IO mode"); info!(?conf.wal_receiver_protocol, "starting with WAL receiver protocol"); + info!(?conf.validate_wal_contiguity, "starting with WAL contiguity validation"); info!(?conf.page_service_pipelining, "starting with page service pipelining config"); info!(?conf.get_vectored_concurrent_io, "starting with get_vectored IO concurrency config"); @@ -177,9 +174,6 @@ fn main() -> anyhow::Result<()> { } } - // Initialize up failpoints support - let scenario = failpoint_support::init(); - // Basic initialization of things that don't change after startup tracing::info!("Initializing virtual_file..."); virtual_file::init( @@ -216,7 +210,9 @@ fn initialize_config( Ok(mut f) => { let md = f.metadata().context("stat config file")?; if !md.is_file() { - anyhow::bail!("Pageserver found identity file but it is a dir entry: {identity_file_path}. Aborting start up ..."); + anyhow::bail!( + "Pageserver found identity file but it is a dir entry: {identity_file_path}. Aborting start up ..." + ); } let mut s = String::new(); @@ -224,7 +220,9 @@ fn initialize_config( toml_edit::de::from_str::(&s)? } Err(e) => { - anyhow::bail!("Pageserver could not read identity file: {identity_file_path}: {e}. Aborting start up ..."); + anyhow::bail!( + "Pageserver could not read identity file: {identity_file_path}: {e}. Aborting start up ..." + ); } }; @@ -350,6 +348,13 @@ fn start_pageserver( info!("Starting pageserver pg protocol handler on {pg_addr}"); let pageserver_listener = tcp_listener::bind(pg_addr)?; + // Enable SO_KEEPALIVE on the socket, to detect dead connections faster. + // These are configured via net.ipv4.tcp_keepalive_* sysctls. + // + // TODO: also set this on the walreceiver socket, but tokio-postgres doesn't + // support enabling keepalives while using the default OS sysctls. + setsockopt(&pageserver_listener, sockopt::KeepAlive, &true)?; + // Launch broker client // The storage_broker::connect call needs to happen inside a tokio runtime thread. let broker_client = WALRECEIVER_RUNTIME @@ -400,11 +405,9 @@ fn start_pageserver( Err(VarError::NotPresent) => { info!("No JWT token for authentication with Safekeeper detected"); } - Err(e) => { - return Err(e).with_context(|| { - "Failed to either load to detect non-present NEON_AUTH_TOKEN environment variable" - }) - } + Err(e) => return Err(e).with_context( + || "Failed to either load to detect non-present NEON_AUTH_TOKEN environment variable", + ), }; // Top-level cancellation token for the process @@ -710,7 +713,9 @@ async fn create_remote_storage_client( // wrapper that simulates failures. 
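The `setsockopt` call added above turns on OS-level TCP keepalive for the page-service listener; the probe timing itself still comes from the `net.ipv4.tcp_keepalive_*` sysctls, as the comment notes. Roughly, with the `nix` crate (recent versions accept anything implementing `AsFd` as the socket argument):

```rust
use std::net::TcpListener;

use nix::sys::socket::{setsockopt, sockopt};

fn bind_with_keepalive(addr: &str) -> anyhow::Result<TcpListener> {
    let listener = TcpListener::bind(addr)?;
    // Ask the kernel to send keepalive probes on idle connections so dead
    // peers are detected without waiting for an application-level timeout.
    setsockopt(&listener, sockopt::KeepAlive, &true)?;
    Ok(listener)
}
```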
if conf.test_remote_failures > 0 { if !cfg!(feature = "testing") { - anyhow::bail!("test_remote_failures option is not available because pageserver was compiled without the 'testing' feature"); + anyhow::bail!( + "test_remote_failures option is not available because pageserver was compiled without the 'testing' feature" + ); } info!( "Simulating remote failures for first {} attempts of each op", diff --git a/pageserver/src/bin/test_helper_slow_client_reads.rs b/pageserver/src/bin/test_helper_slow_client_reads.rs index c1ce332b6c..0215dd06fb 100644 --- a/pageserver/src/bin/test_helper_slow_client_reads.rs +++ b/pageserver/src/bin/test_helper_slow_client_reads.rs @@ -1,14 +1,10 @@ -use std::{ - io::{stdin, stdout, Read, Write}, - time::Duration, -}; +use std::io::{Read, Write, stdin, stdout}; +use std::time::Duration; use clap::Parser; use pageserver_api::models::{PagestreamRequest, PagestreamTestRequest}; -use utils::{ - id::{TenantId, TimelineId}, - lsn::Lsn, -}; +use utils::id::{TenantId, TimelineId}; +use utils::lsn::Lsn; #[derive(clap::Parser)] struct Args { diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index c5368f6806..64d00882b9 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -4,36 +4,29 @@ //! file, or on the command line. //! See also `settings.md` for better description on every parameter. -use anyhow::{bail, ensure, Context}; -use pageserver_api::models::ImageCompressionAlgorithm; -use pageserver_api::{ - config::{DiskUsageEvictionTaskConfig, MaxVectoredReadBytes}, - shard::TenantShardId, -}; -use remote_storage::{RemotePath, RemoteStorageConfig}; use std::env; -use storage_broker::Uri; -use utils::logging::SecretString; -use utils::postgres_client::PostgresClientProtocol; - -use once_cell::sync::OnceCell; -use reqwest::Url; use std::num::NonZeroUsize; use std::sync::Arc; use std::time::Duration; +use anyhow::{Context, bail, ensure}; use camino::{Utf8Path, Utf8PathBuf}; +use once_cell::sync::OnceCell; +use pageserver_api::config::{DiskUsageEvictionTaskConfig, MaxVectoredReadBytes}; +use pageserver_api::models::ImageCompressionAlgorithm; +use pageserver_api::shard::TenantShardId; use postgres_backend::AuthType; -use utils::{ - id::{NodeId, TimelineId}, - logging::LogFormat, -}; +use remote_storage::{RemotePath, RemoteStorageConfig}; +use reqwest::Url; +use storage_broker::Uri; +use utils::id::{NodeId, TimelineId}; +use utils::logging::{LogFormat, SecretString}; +use utils::postgres_client::PostgresClientProtocol; use crate::tenant::storage_layer::inmemory_layer::IndexEntry; use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; -use crate::virtual_file; use crate::virtual_file::io_engine; -use crate::{TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME}; +use crate::{TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME, virtual_file}; /// Global state of pageserver. /// @@ -197,6 +190,10 @@ pub struct PageServerConf { /// Enable read path debugging. If enabled, read key errors will print a backtrace of the layer /// files read. pub enable_read_path_debugging: bool, + + /// Interpreted protocol feature: if enabled, validate that the logical WAL received from + /// safekeepers does not have gaps. 
+ pub validate_wal_contiguity: bool, } /// Token for authentication to safekeepers @@ -360,6 +357,7 @@ impl PageServerConf { page_service_pipelining, get_vectored_concurrent_io, enable_read_path_debugging, + validate_wal_contiguity, } = config_toml; let mut conf = PageServerConf { @@ -435,7 +433,9 @@ impl PageServerConf { io_engine::FeatureTestResult::PlatformPreferred(v) => v, // make no noise io_engine::FeatureTestResult::Worse { engine, remark } => { // TODO: bubble this up to the caller so we can tracing::warn! it. - eprintln!("auto-detected IO engine is not platform-preferred: engine={engine:?} remark={remark:?}"); + eprintln!( + "auto-detected IO engine is not platform-preferred: engine={engine:?} remark={remark:?}" + ); engine } }, @@ -446,6 +446,7 @@ impl PageServerConf { virtual_file_io_mode: virtual_file_io_mode.unwrap_or(virtual_file::IoMode::preferred()), no_sync: no_sync.unwrap_or(false), enable_read_path_debugging: enable_read_path_debugging.unwrap_or(false), + validate_wal_contiguity: validate_wal_contiguity.unwrap_or(false), }; // ------------------------------------------------------------ diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index 7e8c00c293..0231190e69 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -1,13 +1,9 @@ //! Periodically collect consumption metrics for all active tenants //! and push them to a HTTP endpoint. -use crate::config::PageServerConf; -use crate::consumption_metrics::metrics::MetricsKey; -use crate::consumption_metrics::upload::KeyGen as _; -use crate::context::{DownloadBehavior, RequestContext}; -use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}; -use crate::tenant::size::CalculateSyntheticSizeError; -use crate::tenant::tasks::BackgroundLoopKind; -use crate::tenant::{mgr::TenantManager, LogicalSizeCalculationCause, Tenant}; +use std::collections::HashMap; +use std::sync::Arc; +use std::time::{Duration, SystemTime}; + use camino::Utf8PathBuf; use consumption_metrics::EventType; use itertools::Itertools as _; @@ -15,14 +11,21 @@ use pageserver_api::models::TenantState; use remote_storage::{GenericRemoteStorage, RemoteStorageConfig}; use reqwest::Url; use serde::{Deserialize, Serialize}; -use std::collections::HashMap; -use std::sync::Arc; -use std::time::{Duration, SystemTime}; use tokio::time::Instant; use tokio_util::sync::CancellationToken; use tracing::*; use utils::id::NodeId; +use crate::config::PageServerConf; +use crate::consumption_metrics::metrics::MetricsKey; +use crate::consumption_metrics::upload::KeyGen as _; +use crate::context::{DownloadBehavior, RequestContext}; +use crate::task_mgr::{self, BACKGROUND_RUNTIME, TaskKind}; +use crate::tenant::mgr::TenantManager; +use crate::tenant::size::CalculateSyntheticSizeError; +use crate::tenant::tasks::BackgroundLoopKind; +use crate::tenant::{LogicalSizeCalculationCause, Tenant}; + mod disk_cache; mod metrics; mod upload; diff --git a/pageserver/src/consumption_metrics/disk_cache.rs b/pageserver/src/consumption_metrics/disk_cache.rs index 54a505a134..f1dad8793d 100644 --- a/pageserver/src/consumption_metrics/disk_cache.rs +++ b/pageserver/src/consumption_metrics/disk_cache.rs @@ -1,10 +1,10 @@ -use anyhow::Context; -use camino::{Utf8Path, Utf8PathBuf}; use std::sync::Arc; -use crate::consumption_metrics::NewMetricsRefRoot; +use anyhow::Context; +use camino::{Utf8Path, Utf8PathBuf}; use super::{NewMetricsRoot, NewRawMetric, RawMetric}; +use crate::consumption_metrics::NewMetricsRefRoot; 
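Like the other optional knobs around it, the new `validate_wal_contiguity` setting is an `Option` at the TOML layer and is resolved with `unwrap_or(false)` when `PageServerConf` is built, so the check stays off unless explicitly enabled. A stripped-down, hypothetical mirror of that pattern:

```rust
/// What gets parsed from pageserver.toml; an absent field is `None`.
struct ConfigToml {
    validate_wal_contiguity: Option<bool>,
}

/// The fully resolved runtime configuration.
struct PageServerConf {
    validate_wal_contiguity: bool,
}

impl PageServerConf {
    fn parse_and_validate(toml: ConfigToml) -> Self {
        Self {
            // Off by default; only enabled when the config file says so.
            validate_wal_contiguity: toml.validate_wal_contiguity.unwrap_or(false),
        }
    }
}
```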
pub(super) fn read_metrics_from_serde_value( json_value: serde_json::Value, diff --git a/pageserver/src/consumption_metrics/metrics.rs b/pageserver/src/consumption_metrics/metrics.rs index 07fac09f6f..71910011ea 100644 --- a/pageserver/src/consumption_metrics/metrics.rs +++ b/pageserver/src/consumption_metrics/metrics.rs @@ -1,15 +1,16 @@ -use crate::tenant::mgr::TenantManager; -use crate::{context::RequestContext, tenant::timeline::logical_size::CurrentLogicalSize}; +use std::sync::Arc; +use std::time::SystemTime; + use chrono::{DateTime, Utc}; use consumption_metrics::EventType; use futures::stream::StreamExt; -use std::{sync::Arc, time::SystemTime}; -use utils::{ - id::{TenantId, TimelineId}, - lsn::Lsn, -}; +use utils::id::{TenantId, TimelineId}; +use utils::lsn::Lsn; use super::{Cache, NewRawMetric}; +use crate::context::RequestContext; +use crate::tenant::mgr::TenantManager; +use crate::tenant::timeline::logical_size::CurrentLogicalSize; /// Name of the metric, used by `MetricsKey` factory methods and `deserialize_cached_events` /// instead of static str. diff --git a/pageserver/src/consumption_metrics/metrics/tests.rs b/pageserver/src/consumption_metrics/metrics/tests.rs index 3ed7b44123..52b4fb8680 100644 --- a/pageserver/src/consumption_metrics/metrics/tests.rs +++ b/pageserver/src/consumption_metrics/metrics/tests.rs @@ -1,7 +1,7 @@ -use crate::consumption_metrics::RawMetric; +use std::collections::HashMap; use super::*; -use std::collections::HashMap; +use crate::consumption_metrics::RawMetric; #[test] fn startup_collected_timeline_metrics_before_advancing() { diff --git a/pageserver/src/consumption_metrics/upload.rs b/pageserver/src/consumption_metrics/upload.rs index 448bf47525..59e0145a5b 100644 --- a/pageserver/src/consumption_metrics/upload.rs +++ b/pageserver/src/consumption_metrics/upload.rs @@ -2,15 +2,16 @@ use std::error::Error as _; use std::time::SystemTime; use chrono::{DateTime, Utc}; -use consumption_metrics::{Event, EventChunk, IdempotencyKey, CHUNK_SIZE}; +use consumption_metrics::{CHUNK_SIZE, Event, EventChunk, IdempotencyKey}; use remote_storage::{GenericRemoteStorage, RemotePath}; use tokio::io::AsyncWriteExt; use tokio_util::sync::CancellationToken; use tracing::Instrument; - -use super::{metrics::Name, Cache, MetricsKey, NewRawMetric, RawMetric}; use utils::id::{TenantId, TimelineId}; +use super::metrics::Name; +use super::{Cache, MetricsKey, NewRawMetric, RawMetric}; + /// How the metrics from pageserver are identified. 
#[derive(serde::Serialize, serde::Deserialize, Debug, Clone, Copy, PartialEq)] struct Ids { @@ -438,14 +439,13 @@ async fn upload( #[cfg(test)] mod tests { - use crate::consumption_metrics::{ - disk_cache::read_metrics_from_serde_value, NewMetricsRefRoot, - }; - - use super::*; use chrono::{DateTime, Utc}; use once_cell::sync::Lazy; + use super::*; + use crate::consumption_metrics::NewMetricsRefRoot; + use crate::consumption_metrics::disk_cache::read_metrics_from_serde_value; + #[test] fn chunked_serialization() { let examples = metric_samples(); diff --git a/pageserver/src/controller_upcall_client.rs b/pageserver/src/controller_upcall_client.rs index d41bfd9021..8462594607 100644 --- a/pageserver/src/controller_upcall_client.rs +++ b/pageserver/src/controller_upcall_client.rs @@ -1,21 +1,23 @@ use std::collections::HashMap; use futures::Future; -use pageserver_api::{ - controller_api::{AvailabilityZone, NodeRegisterRequest}, - shard::TenantShardId, - upcall_api::{ - ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest, - ValidateRequestTenant, ValidateResponse, - }, +use pageserver_api::config::NodeMetadata; +use pageserver_api::controller_api::{AvailabilityZone, NodeRegisterRequest}; +use pageserver_api::shard::TenantShardId; +use pageserver_api::upcall_api::{ + ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest, + ValidateRequestTenant, ValidateResponse, }; -use serde::{de::DeserializeOwned, Serialize}; +use serde::Serialize; +use serde::de::DeserializeOwned; use tokio_util::sync::CancellationToken; use url::Url; -use utils::{backoff, failpoint_support, generation::Generation, id::NodeId}; +use utils::generation::Generation; +use utils::id::NodeId; +use utils::{backoff, failpoint_support}; -use crate::{config::PageServerConf, virtual_file::on_fatal_io_error}; -use pageserver_api::config::NodeMetadata; +use crate::config::PageServerConf; +use crate::virtual_file::on_fatal_io_error; /// The Pageserver's client for using the storage controller upcall API: this is a small API /// for dealing with generations (see docs/rfcs/025-generation-numbers.md). @@ -157,14 +159,18 @@ impl ControlPlaneGenerationsApi for ControllerUpcallClient { match az_id_from_metadata { Some(az_id) => Some(AvailabilityZone(az_id)), None => { - tracing::warn!("metadata.json does not contain an 'availability_zone_id' field"); + tracing::warn!( + "metadata.json does not contain an 'availability_zone_id' field" + ); conf.availability_zone.clone().map(AvailabilityZone) } } }; if az_id.is_none() { - panic!("Availablity zone id could not be inferred from metadata.json or pageserver config"); + panic!( + "Availablity zone id could not be inferred from metadata.json or pageserver config" + ); } Some(NodeRegisterRequest { @@ -173,6 +179,7 @@ impl ControlPlaneGenerationsApi for ControllerUpcallClient { listen_pg_port: m.postgres_port, listen_http_addr: m.http_host, listen_http_port: m.http_port, + listen_https_port: None, // TODO: Support https. 
availability_zone_id: az_id.expect("Checked above"), }) } @@ -235,7 +242,7 @@ impl ControlPlaneGenerationsApi for ControllerUpcallClient { .iter() .map(|(id, generation)| ValidateRequestTenant { id: *id, - gen: (*generation).into().expect( + r#gen: (*generation).into().expect( "Generation should always be valid for a Tenant doing deletions", ), }) diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index a2395b0dca..8118f66252 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -6,38 +6,31 @@ use std::collections::HashMap; use std::sync::Arc; use std::time::Duration; -use crate::controller_upcall_client::ControlPlaneGenerationsApi; -use crate::metrics; -use crate::tenant::remote_timeline_client::remote_timeline_path; -use crate::tenant::remote_timeline_client::LayerFileMetadata; -use crate::virtual_file::MaybeFatalIo; -use crate::virtual_file::VirtualFile; use anyhow::Context; use camino::Utf8PathBuf; +use deleter::DeleterMessage; +use list_writer::ListWriterQueueMessage; use pageserver_api::shard::TenantShardId; use remote_storage::{GenericRemoteStorage, RemotePath}; -use serde::Deserialize; -use serde::Serialize; +use serde::{Deserialize, Serialize}; use thiserror::Error; use tokio_util::sync::CancellationToken; -use tracing::Instrument; -use tracing::{debug, error}; +use tracing::{Instrument, debug, error}; use utils::crashsafe::path_with_suffix_extension; use utils::generation::Generation; use utils::id::TimelineId; -use utils::lsn::AtomicLsn; -use utils::lsn::Lsn; - -use self::deleter::Deleter; -use self::list_writer::DeletionOp; -use self::list_writer::ListWriter; -use self::list_writer::RecoverOp; -use self::validator::Validator; -use deleter::DeleterMessage; -use list_writer::ListWriterQueueMessage; +use utils::lsn::{AtomicLsn, Lsn}; use validator::ValidatorQueueMessage; -use crate::{config::PageServerConf, tenant::storage_layer::LayerName}; +use self::deleter::Deleter; +use self::list_writer::{DeletionOp, ListWriter, RecoverOp}; +use self::validator::Validator; +use crate::config::PageServerConf; +use crate::controller_upcall_client::ControlPlaneGenerationsApi; +use crate::metrics; +use crate::tenant::remote_timeline_client::{LayerFileMetadata, remote_timeline_path}; +use crate::tenant::storage_layer::LayerName; +use crate::virtual_file::{MaybeFatalIo, VirtualFile}; // TODO: configurable for how long to wait before executing deletions @@ -664,21 +657,22 @@ impl DeletionQueue { #[cfg(test)] mod test { + use std::io::ErrorKind; + use std::time::Duration; + use camino::Utf8Path; use hex_literal::hex; - use pageserver_api::{key::Key, shard::ShardIndex, upcall_api::ReAttachResponseTenant}; - use std::{io::ErrorKind, time::Duration}; - use tracing::info; - + use pageserver_api::key::Key; + use pageserver_api::shard::ShardIndex; + use pageserver_api::upcall_api::ReAttachResponseTenant; use remote_storage::{RemoteStorageConfig, RemoteStorageKind}; use tokio::task::JoinHandle; - - use crate::{ - controller_upcall_client::RetryForeverError, - tenant::{harness::TenantHarness, storage_layer::DeltaLayerName}, - }; + use tracing::info; use super::*; + use crate::controller_upcall_client::RetryForeverError; + use crate::tenant::harness::TenantHarness; + use crate::tenant::storage_layer::DeltaLayerName; pub const TIMELINE_ID: TimelineId = TimelineId::from_array(hex!("11223344556677881122334455667788")); @@ -724,26 +718,26 @@ mod test { .expect("Failed to join workers for previous deletion queue"); } - fn set_latest_generation(&self, 
gen: Generation) { + fn set_latest_generation(&self, gen_: Generation) { let tenant_shard_id = self.harness.tenant_shard_id; self.mock_control_plane .latest_generation .lock() .unwrap() - .insert(tenant_shard_id, gen); + .insert(tenant_shard_id, gen_); } /// Returns remote layer file name, suitable for use in assert_remote_files fn write_remote_layer( &self, file_name: LayerName, - gen: Generation, + gen_: Generation, ) -> anyhow::Result { let tenant_shard_id = self.harness.tenant_shard_id; let relative_remote_path = remote_timeline_path(&tenant_shard_id, &TIMELINE_ID); let remote_timeline_path = self.remote_fs_dir.join(relative_remote_path.get_path()); std::fs::create_dir_all(&remote_timeline_path)?; - let remote_layer_file_name = format!("{}{}", file_name, gen.get_suffix()); + let remote_layer_file_name = format!("{}{}", file_name, gen_.get_suffix()); let content: Vec = format!("placeholder contents of {file_name}").into(); @@ -1098,11 +1092,12 @@ mod test { /// or coalescing, and doesn't actually execute any deletions unless you call pump() to kick it. #[cfg(test)] pub(crate) mod mock { + use std::sync::atomic::{AtomicUsize, Ordering}; + use tracing::info; use super::*; use crate::tenant::remote_timeline_client::remote_layer_path; - use std::sync::atomic::{AtomicUsize, Ordering}; pub struct ConsumerState { rx: tokio::sync::mpsc::UnboundedReceiver, diff --git a/pageserver/src/deletion_queue/deleter.rs b/pageserver/src/deletion_queue/deleter.rs index ef1dfbac19..691ba75cc7 100644 --- a/pageserver/src/deletion_queue/deleter.rs +++ b/pageserver/src/deletion_queue/deleter.rs @@ -6,21 +6,16 @@ //! number of full-sized DeleteObjects requests, rather than a larger number of //! smaller requests. -use remote_storage::GenericRemoteStorage; -use remote_storage::RemotePath; -use remote_storage::TimeoutOrCancel; use std::time::Duration; + +use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel}; use tokio_util::sync::CancellationToken; -use tracing::info; -use tracing::warn; -use utils::backoff; -use utils::pausable_failpoint; +use tracing::{info, warn}; +use utils::{backoff, pausable_failpoint}; +use super::{DeletionQueueError, FlushOp}; use crate::metrics; -use super::DeletionQueueError; -use super::FlushOp; - const AUTOFLUSH_INTERVAL: Duration = Duration::from_secs(10); pub(super) enum DeleterMessage { diff --git a/pageserver/src/deletion_queue/list_writer.rs b/pageserver/src/deletion_queue/list_writer.rs index ae3b2c9180..a385e35a02 100644 --- a/pageserver/src/deletion_queue/list_writer.rs +++ b/pageserver/src/deletion_queue/list_writer.rs @@ -10,11 +10,6 @@ //! //! DeletionLists are passed onwards to the Validator. 
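The `gen` to `gen_` renames in the deletion-queue tests above (and the `r#gen` field in the controller upcall client earlier) are another edition-2024 consequence: `gen` is a reserved keyword there, so it can only appear as a raw identifier. For illustration:

```rust
// Under edition 2024 this no longer parses, because `gen` is reserved:
//     fn set_latest_generation(gen: u32) {}
// Either rename the binding...
fn set_latest_generation(gen_: u32) -> u32 {
    gen_
}

// ...or keep the spelling with a raw identifier, which matters for struct
// fields whose serialized name must stay `gen`:
struct ValidateRequestTenant {
    r#gen: u32,
}

fn main() {
    let t = ValidateRequestTenant { r#gen: set_latest_generation(7) };
    println!("{}", t.r#gen);
}
```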
-use super::DeletionHeader; -use super::DeletionList; -use super::FlushOp; -use super::ValidatorQueueMessage; - use std::collections::HashMap; use std::fs::create_dir_all; use std::time::Duration; @@ -23,20 +18,17 @@ use pageserver_api::shard::TenantShardId; use regex::Regex; use remote_storage::RemotePath; use tokio_util::sync::CancellationToken; -use tracing::debug; -use tracing::info; -use tracing::warn; +use tracing::{debug, info, warn}; use utils::generation::Generation; use utils::id::TimelineId; +use super::{DeletionHeader, DeletionList, FlushOp, ValidatorQueueMessage}; use crate::config::PageServerConf; use crate::deletion_queue::TEMP_SUFFIX; use crate::metrics; -use crate::tenant::remote_timeline_client::remote_layer_path; -use crate::tenant::remote_timeline_client::LayerFileMetadata; +use crate::tenant::remote_timeline_client::{LayerFileMetadata, remote_layer_path}; use crate::tenant::storage_layer::LayerName; -use crate::virtual_file::on_fatal_io_error; -use crate::virtual_file::MaybeFatalIo; +use crate::virtual_file::{MaybeFatalIo, on_fatal_io_error}; // The number of keys in a DeletionList before we will proactively persist it // (without reaching a flush deadline). This aims to deliver objects of the order diff --git a/pageserver/src/deletion_queue/validator.rs b/pageserver/src/deletion_queue/validator.rs index 1d55581ebd..b0ce2b80b4 100644 --- a/pageserver/src/deletion_queue/validator.rs +++ b/pageserver/src/deletion_queue/validator.rs @@ -20,22 +20,14 @@ use std::time::Duration; use camino::Utf8PathBuf; use tokio_util::sync::CancellationToken; -use tracing::debug; -use tracing::info; -use tracing::warn; - -use crate::config::PageServerConf; -use crate::controller_upcall_client::ControlPlaneGenerationsApi; -use crate::controller_upcall_client::RetryForeverError; -use crate::metrics; -use crate::virtual_file::MaybeFatalIo; +use tracing::{debug, info, warn}; use super::deleter::DeleterMessage; -use super::DeletionHeader; -use super::DeletionList; -use super::DeletionQueueError; -use super::FlushOp; -use super::VisibleLsnUpdates; +use super::{DeletionHeader, DeletionList, DeletionQueueError, FlushOp, VisibleLsnUpdates}; +use crate::config::PageServerConf; +use crate::controller_upcall_client::{ControlPlaneGenerationsApi, RetryForeverError}; +use crate::metrics; +use crate::virtual_file::MaybeFatalIo; // After this length of time, do any validation work that is pending, // even if we haven't accumulated many keys to delete. @@ -190,7 +182,10 @@ where } } else { // If we failed validation, then do not apply any of the projected updates - info!("Dropped remote consistent LSN updates for tenant {tenant_id} in stale generation {:?}", tenant_lsn_state.generation); + info!( + "Dropped remote consistent LSN updates for tenant {tenant_id} in stale generation {:?}", + tenant_lsn_state.generation + ); metrics::DELETION_QUEUE.dropped_lsn_updates.inc(); } } diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index 738a783813..13252037e5 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -41,30 +41,31 @@ // - The `#[allow(dead_code)]` above various structs are to suppress warnings about only the Debug impl // reading these fields. We use the Debug impl for semi-structured logging, though. 
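The "stale generation" message reformatted above reflects the validator's gating rule: projected updates are only applied if the tenant is still attached under its latest generation; otherwise they are dropped and a metric is bumped instead. Purely as an illustration of that shape, not the actual types:

```rust
#[derive(Clone, Copy, PartialEq, Debug)]
struct Generation(u32);

struct ProjectedUpdates {
    generation: Generation,
    // ... projected deletions / remote-consistent-LSN advances ...
}

/// Returns true if the updates may be applied, false if they must be dropped.
fn validate(latest: Generation, pending: &ProjectedUpdates) -> bool {
    if pending.generation == latest {
        true
    } else {
        // A newer generation exists elsewhere; acting on this projection
        // could remove data that the current attachment still needs.
        false
    }
}
```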
-use std::{sync::Arc, time::SystemTime}; +use std::sync::Arc; +use std::time::SystemTime; use anyhow::Context; -use pageserver_api::{config::DiskUsageEvictionTaskConfig, shard::TenantShardId}; +use pageserver_api::config::DiskUsageEvictionTaskConfig; +use pageserver_api::shard::TenantShardId; use remote_storage::GenericRemoteStorage; use serde::Serialize; use tokio::time::Instant; use tokio_util::sync::CancellationToken; -use tracing::{debug, error, info, instrument, warn, Instrument}; -use utils::{completion, id::TimelineId}; +use tracing::{Instrument, debug, error, info, instrument, warn}; +use utils::completion; +use utils::id::TimelineId; -use crate::{ - config::PageServerConf, - metrics::disk_usage_based_eviction::METRICS, - task_mgr::{self, BACKGROUND_RUNTIME}, - tenant::{ - mgr::TenantManager, - remote_timeline_client::LayerFileMetadata, - secondary::SecondaryTenant, - storage_layer::{AsLayerDesc, EvictionError, Layer, LayerName, LayerVisibilityHint}, - tasks::sleep_random, - }, - CancellableTask, DiskUsageEvictionTask, +use crate::config::PageServerConf; +use crate::metrics::disk_usage_based_eviction::METRICS; +use crate::task_mgr::{self, BACKGROUND_RUNTIME}; +use crate::tenant::mgr::TenantManager; +use crate::tenant::remote_timeline_client::LayerFileMetadata; +use crate::tenant::secondary::SecondaryTenant; +use crate::tenant::storage_layer::{ + AsLayerDesc, EvictionError, Layer, LayerName, LayerVisibilityHint, }; +use crate::tenant::tasks::sleep_random; +use crate::{CancellableTask, DiskUsageEvictionTask}; /// Selects the sort order for eviction candidates *after* per tenant `min_resident_size` /// partitioning. @@ -1007,10 +1008,14 @@ async fn collect_eviction_candidates( } } - debug_assert!(EvictionPartition::Above < EvictionPartition::Below, - "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first"); - debug_assert!(EvictionPartition::EvictNow < EvictionPartition::Above, - "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first"); + debug_assert!( + EvictionPartition::Above < EvictionPartition::Below, + "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first" + ); + debug_assert!( + EvictionPartition::EvictNow < EvictionPartition::Above, + "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first" + ); eviction_order.sort(&mut candidates); @@ -1157,9 +1162,8 @@ mod filesystem_level_usage { use anyhow::Context; use camino::Utf8Path; - use crate::statvfs::Statvfs; - use super::DiskUsageEvictionTaskConfig; + use crate::statvfs::Statvfs; #[derive(Debug, Clone, Copy)] pub struct Usage<'a> { @@ -1224,10 +1228,12 @@ mod filesystem_level_usage { #[test] fn max_usage_pct_pressure() { - use super::Usage as _; use std::time::Duration; + use utils::serde_percent::Percent; + use super::Usage as _; + let mut usage = Usage { config: &DiskUsageEvictionTaskConfig { max_usage_pct: Percent::new(85).unwrap(), diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 56a84a98a8..dd5a24a41f 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -2,125 +2,83 @@ //! Management HTTP API //! 
use std::cmp::Reverse; -use std::collections::BinaryHeap; -use std::collections::HashMap; +use std::collections::{BinaryHeap, HashMap}; use std::str::FromStr; use std::sync::Arc; use std::time::Duration; -use anyhow::{anyhow, Context, Result}; +use anyhow::{Context, Result, anyhow}; use enumset::EnumSet; use futures::future::join_all; -use futures::StreamExt; -use futures::TryFutureExt; +use futures::{StreamExt, TryFutureExt}; use http_utils::endpoint::{ - profile_cpu_handler, profile_heap_handler, prometheus_metrics_handler, request_span, + self, attach_openapi_ui, auth_middleware, check_permission_with, profile_cpu_handler, + profile_heap_handler, prometheus_metrics_handler, request_span, }; +use http_utils::error::{ApiError, HttpErrorBody}; use http_utils::failpoints::failpoints_handler; -use http_utils::request::must_parse_query_param; -use http_utils::request::{get_request_param, must_get_query_param, parse_query_param}; +use http_utils::json::{json_request, json_request_maybe, json_response}; +use http_utils::request::{ + get_request_param, must_get_query_param, must_parse_query_param, parse_query_param, + parse_request_param, +}; +use http_utils::{RequestExt, RouterBuilder}; use humantime::format_rfc3339; -use hyper::header; -use hyper::StatusCode; -use hyper::{Body, Request, Response, Uri}; +use hyper::{Body, Request, Response, StatusCode, Uri, header}; use metrics::launch_timestamp::LaunchTimestamp; use pageserver_api::models::virtual_file::IoMode; -use pageserver_api::models::DownloadRemoteLayersTaskSpawnRequest; -use pageserver_api::models::IngestAuxFilesRequest; -use pageserver_api::models::ListAuxFilesRequest; -use pageserver_api::models::LocationConfig; -use pageserver_api::models::LocationConfigListResponse; -use pageserver_api::models::LocationConfigMode; -use pageserver_api::models::LsnLease; -use pageserver_api::models::LsnLeaseRequest; -use pageserver_api::models::OffloadedTimelineInfo; -use pageserver_api::models::PageTraceEvent; -use pageserver_api::models::ShardParameters; -use pageserver_api::models::TenantConfigPatchRequest; -use pageserver_api::models::TenantDetails; -use pageserver_api::models::TenantLocationConfigRequest; -use pageserver_api::models::TenantLocationConfigResponse; -use pageserver_api::models::TenantScanRemoteStorageResponse; -use pageserver_api::models::TenantScanRemoteStorageShard; -use pageserver_api::models::TenantShardLocation; -use pageserver_api::models::TenantShardSplitRequest; -use pageserver_api::models::TenantShardSplitResponse; -use pageserver_api::models::TenantSorting; -use pageserver_api::models::TenantState; -use pageserver_api::models::TenantWaitLsnRequest; -use pageserver_api::models::TimelineArchivalConfigRequest; -use pageserver_api::models::TimelineCreateRequestMode; -use pageserver_api::models::TimelineCreateRequestModeImportPgdata; -use pageserver_api::models::TimelinesInfoAndOffloaded; -use pageserver_api::models::TopTenantShardItem; -use pageserver_api::models::TopTenantShardsRequest; -use pageserver_api::models::TopTenantShardsResponse; -use pageserver_api::shard::ShardCount; -use pageserver_api::shard::TenantShardId; -use remote_storage::DownloadError; -use remote_storage::GenericRemoteStorage; -use remote_storage::TimeTravelError; +use pageserver_api::models::{ + DownloadRemoteLayersTaskSpawnRequest, IngestAuxFilesRequest, ListAuxFilesRequest, + LocationConfig, LocationConfigListResponse, LocationConfigMode, LsnLease, LsnLeaseRequest, + OffloadedTimelineInfo, PageTraceEvent, ShardParameters, StatusResponse, + 
TenantConfigPatchRequest, TenantConfigRequest, TenantDetails, TenantInfo, + TenantLocationConfigRequest, TenantLocationConfigResponse, TenantScanRemoteStorageResponse, + TenantScanRemoteStorageShard, TenantShardLocation, TenantShardSplitRequest, + TenantShardSplitResponse, TenantSorting, TenantState, TenantWaitLsnRequest, + TimelineArchivalConfigRequest, TimelineCreateRequest, TimelineCreateRequestMode, + TimelineCreateRequestModeImportPgdata, TimelineGcRequest, TimelineInfo, + TimelinesInfoAndOffloaded, TopTenantShardItem, TopTenantShardsRequest, TopTenantShardsResponse, +}; +use pageserver_api::shard::{ShardCount, TenantShardId}; +use remote_storage::{DownloadError, GenericRemoteStorage, TimeTravelError}; use scopeguard::defer; -use tenant_size_model::{svg::SvgBranchKind, SizeResult, StorageModel}; +use tenant_size_model::svg::SvgBranchKind; +use tenant_size_model::{SizeResult, StorageModel}; use tokio::time::Instant; use tokio_util::io::StreamReader; use tokio_util::sync::CancellationToken; use tracing::*; +use utils::auth::SwappableJwtAuth; +use utils::generation::Generation; +use utils::id::{TenantId, TimelineId}; +use utils::lsn::Lsn; use crate::config::PageServerConf; -use crate::context::RequestContextBuilder; -use crate::context::{DownloadBehavior, RequestContext}; +use crate::context::{DownloadBehavior, RequestContext, RequestContextBuilder}; use crate::deletion_queue::DeletionQueueClient; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::task_mgr::TaskKind; use crate::tenant::config::{LocationConf, TenantConfOpt}; -use crate::tenant::mgr::GetActiveTenantError; use crate::tenant::mgr::{ - GetTenantError, TenantManager, TenantMapError, TenantMapInsertError, TenantSlotError, - TenantSlotUpsertError, TenantStateError, + GetActiveTenantError, GetTenantError, TenantManager, TenantMapError, TenantMapInsertError, + TenantSlot, TenantSlotError, TenantSlotUpsertError, TenantStateError, UpsertLocationError, +}; +use crate::tenant::remote_timeline_client::{ + download_index_part, list_remote_tenant_shards, list_remote_timelines, }; -use crate::tenant::mgr::{TenantSlot, UpsertLocationError}; -use crate::tenant::remote_timeline_client; -use crate::tenant::remote_timeline_client::download_index_part; -use crate::tenant::remote_timeline_client::list_remote_tenant_shards; -use crate::tenant::remote_timeline_client::list_remote_timelines; use crate::tenant::secondary::SecondaryController; use crate::tenant::size::ModelInputs; -use crate::tenant::storage_layer::IoConcurrency; -use crate::tenant::storage_layer::LayerAccessStatsReset; -use crate::tenant::storage_layer::LayerName; -use crate::tenant::timeline::import_pgdata; -use crate::tenant::timeline::offload::offload_timeline; -use crate::tenant::timeline::offload::OffloadError; -use crate::tenant::timeline::CompactFlags; -use crate::tenant::timeline::CompactOptions; -use crate::tenant::timeline::CompactRequest; -use crate::tenant::timeline::CompactionError; -use crate::tenant::timeline::Timeline; -use crate::tenant::timeline::WaitLsnTimeout; -use crate::tenant::timeline::WaitLsnWaiter; -use crate::tenant::GetTimelineError; -use crate::tenant::OffloadedTimeline; -use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError}; -use crate::DEFAULT_PG_VERSION; -use crate::{disk_usage_eviction_task, tenant}; -use http_utils::{ - endpoint::{self, attach_openapi_ui, auth_middleware, check_permission_with}, - error::{ApiError, HttpErrorBody}, - json::{json_request, json_request_maybe, json_response}, - request::parse_request_param, - 
RequestExt, RouterBuilder, +use crate::tenant::storage_layer::{IoConcurrency, LayerAccessStatsReset, LayerName}; +use crate::tenant::timeline::offload::{OffloadError, offload_timeline}; +use crate::tenant::timeline::{ + CompactFlags, CompactOptions, CompactRequest, CompactionError, Timeline, WaitLsnTimeout, + WaitLsnWaiter, import_pgdata, }; -use pageserver_api::models::{ - StatusResponse, TenantConfigRequest, TenantInfo, TimelineCreateRequest, TimelineGcRequest, - TimelineInfo, -}; -use utils::{ - auth::SwappableJwtAuth, - generation::Generation, - id::{TenantId, TimelineId}, - lsn::Lsn, +use crate::tenant::{ + GetTimelineError, LogicalSizeCalculationCause, OffloadedTimeline, PageReconstructError, + remote_timeline_client, }; +use crate::{DEFAULT_PG_VERSION, disk_usage_eviction_task, tenant}; // For APIs that require an Active tenant, how long should we block waiting for that state? // This is not functionally necessary (clients will retry), but avoids generating a lot of @@ -1128,12 +1086,12 @@ async fn tenant_list_handler( ApiError::ResourceUnavailable("Tenant map is initializing or shutting down".into()) })? .iter() - .map(|(id, state, gen)| TenantInfo { + .map(|(id, state, gen_)| TenantInfo { id: *id, state: state.clone(), current_physical_size: None, attachment_status: state.attachment_status(), - generation: (*gen) + generation: (*gen_) .into() .expect("Tenants are always attached with a generation"), gc_blocking: None, @@ -1670,9 +1628,8 @@ async fn block_or_unblock_gc( request: Request, block: bool, ) -> Result, ApiError> { - use crate::tenant::{ - remote_timeline_client::WaitCompletionError, upload_queue::NotInitialized, - }; + use crate::tenant::remote_timeline_client::WaitCompletionError; + use crate::tenant::upload_queue::NotInitialized; let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; @@ -2058,7 +2015,9 @@ async fn tenant_time_travel_remote_storage_handler( ))); } - tracing::info!("Issuing time travel request internally. timestamp={timestamp_raw}, done_if_after={done_if_after_raw}"); + tracing::info!( + "Issuing time travel request internally. 
timestamp={timestamp_raw}, done_if_after={done_if_after_raw}" + ); remote_timeline_client::upload::time_travel_recover_tenant( &state.remote_storage, @@ -2396,7 +2355,8 @@ async fn timeline_checkpoint_handler( CompactionError::ShuttingDown => ApiError::ShuttingDown, CompactionError::Offload(e) => ApiError::InternalServerError(anyhow::anyhow!(e)), CompactionError::CollectKeySpaceError(e) => ApiError::InternalServerError(anyhow::anyhow!(e)), - CompactionError::Other(e) => ApiError::InternalServerError(e) + CompactionError::Other(e) => ApiError::InternalServerError(e), + CompactionError::AlreadyRunning(_) => ApiError::InternalServerError(anyhow::anyhow!(e)), } )?; } @@ -2458,9 +2418,10 @@ async fn timeline_detach_ancestor_handler( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { - use crate::tenant::timeline::detach_ancestor; use pageserver_api::models::detach_ancestor::AncestorDetached; + use crate::tenant::timeline::detach_ancestor; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; @@ -2805,14 +2766,19 @@ async fn tenant_scan_remote_handler( .await { Ok((index_part, index_generation, _index_mtime)) => { - tracing::info!("Found timeline {tenant_shard_id}/{timeline_id} metadata (gen {index_generation:?}, {} layers, {} consistent LSN)", - index_part.layer_metadata.len(), index_part.metadata.disk_consistent_lsn()); + tracing::info!( + "Found timeline {tenant_shard_id}/{timeline_id} metadata (gen {index_generation:?}, {} layers, {} consistent LSN)", + index_part.layer_metadata.len(), + index_part.metadata.disk_consistent_lsn() + ); generation = std::cmp::max(generation, index_generation); } Err(DownloadError::NotFound) => { // This is normal for tenants that were created with multiple shards: they have an unsharded path // containing the timeline's initdb tarball but no index. Otherwise it is a bit strange. - tracing::info!("Timeline path {tenant_shard_id}/{timeline_id} exists in remote storage but has no index, skipping"); + tracing::info!( + "Timeline path {tenant_shard_id}/{timeline_id} exists in remote storage but has no index, skipping" + ); continue; } Err(e) => { @@ -3431,7 +3397,9 @@ async fn read_tar_eof(mut reader: (impl tokio::io::AsyncRead + Unpin)) -> anyhow anyhow::bail!("unexpected non-zero bytes after the tar archive"); } if trailing_bytes % 512 != 0 { - anyhow::bail!("unexpected number of zeros ({trailing_bytes}), not divisible by tar block size (512 bytes), after the tar archive"); + anyhow::bail!( + "unexpected number of zeros ({trailing_bytes}), not divisible by tar block size (512 bytes), after the tar archive" + ); } Ok(()) } diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index a73fa5cec8..6dd005de50 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -4,14 +4,22 @@ //! 
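On `read_tar_eof` above: a tar stream ends with at least two 512-byte blocks of zeros, so once the archive proper has been consumed the function expects nothing but zero bytes, in a quantity divisible by the 512-byte block size, which is exactly what the two reformatted `bail!` messages guard. A small synchronous sketch of the same check (the real function is async and reads incrementally):

```rust
use std::io::Read;

fn check_tar_eof(mut reader: impl Read) -> anyhow::Result<()> {
    let mut trailing = Vec::new();
    reader.read_to_end(&mut trailing)?;

    if trailing.iter().any(|&b| b != 0) {
        anyhow::bail!("unexpected non-zero bytes after the tar archive");
    }
    if trailing.len() % 512 != 0 {
        anyhow::bail!(
            "unexpected number of zeros ({}), not divisible by tar block size (512 bytes), after the tar archive",
            trailing.len()
        );
    }
    Ok(())
}
```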
use std::path::{Path, PathBuf}; -use anyhow::{bail, ensure, Context, Result}; +use anyhow::{Context, Result, bail, ensure}; use bytes::Bytes; use camino::Utf8Path; use futures::StreamExt; use pageserver_api::key::rel_block_to_key; +use pageserver_api::reltag::{RelTag, SlruKind}; +use postgres_ffi::relfile_utils::*; +use postgres_ffi::waldecoder::WalStreamDecoder; +use postgres_ffi::{ + BLCKSZ, ControlFileData, DBState_DB_SHUTDOWNED, Oid, WAL_SEGMENT_SIZE, XLogFileName, + pg_constants, +}; use tokio::io::{AsyncRead, AsyncReadExt}; use tokio_tar::Archive; use tracing::*; +use utils::lsn::Lsn; use wal_decoder::models::InterpretedWalRecord; use walkdir::WalkDir; @@ -20,16 +28,6 @@ use crate::metrics::WAL_INGEST; use crate::pgdatadir_mapping::*; use crate::tenant::Timeline; use crate::walingest::WalIngest; -use pageserver_api::reltag::{RelTag, SlruKind}; -use postgres_ffi::pg_constants; -use postgres_ffi::relfile_utils::*; -use postgres_ffi::waldecoder::WalStreamDecoder; -use postgres_ffi::ControlFileData; -use postgres_ffi::DBState_DB_SHUTDOWNED; -use postgres_ffi::Oid; -use postgres_ffi::XLogFileName; -use postgres_ffi::{BLCKSZ, WAL_SEGMENT_SIZE}; -use utils::lsn::Lsn; // Returns checkpoint LSN from controlfile pub fn get_lsn_from_controlfile(path: &Utf8Path) -> Result { diff --git a/pageserver/src/l0_flush.rs b/pageserver/src/l0_flush.rs index 491c9fb96c..6cfecef0cf 100644 --- a/pageserver/src/l0_flush.rs +++ b/pageserver/src/l0_flush.rs @@ -1,4 +1,5 @@ -use std::{num::NonZeroUsize, sync::Arc}; +use std::num::NonZeroUsize; +use std::sync::Arc; #[derive(Debug, PartialEq, Eq, Clone)] pub enum L0FlushConfig { diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index f43cd08cf7..02767055fb 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -15,7 +15,8 @@ pub mod l0_flush; extern crate hyper0 as hyper; -use futures::{stream::FuturesUnordered, StreamExt}; +use futures::StreamExt; +use futures::stream::FuturesUnordered; pub use pageserver_api::keyspace; use tokio_util::sync::CancellationToken; mod assert_u64_eq_usize; @@ -35,10 +36,8 @@ pub mod walredo; use camino::Utf8Path; use deletion_queue::DeletionQueue; -use tenant::{ - mgr::{BackgroundPurges, TenantManager}, - secondary, -}; +use tenant::mgr::{BackgroundPurges, TenantManager}; +use tenant::secondary; use tracing::{info, info_span}; /// Current storage format version @@ -350,9 +349,10 @@ async fn timed_after_cancellation( #[cfg(test)] mod timed_tests { - use super::timed; use std::time::Duration; + use super::timed; + #[tokio::test] async fn timed_completes_when_inner_future_completes() { // A future that completes on time should have its result returned diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index e1c26b0684..eb8a9b8e24 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -10,11 +10,11 @@ use std::time::{Duration, Instant}; use enum_map::{Enum as _, EnumMap}; use futures::Future; use metrics::{ + Counter, CounterVec, Gauge, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair, + IntCounterPairVec, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec, register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec, register_int_counter, register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge, register_uint_gauge_vec, - Counter, CounterVec, Gauge, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair, - IntCounterPairVec, IntCounterVec, IntGauge, IntGaugeVec, 
UIntGauge, UIntGaugeVec, }; use once_cell::sync::Lazy; use pageserver_api::config::{ @@ -24,9 +24,8 @@ use pageserver_api::config::{ use pageserver_api::models::InMemoryLayerInfo; use pageserver_api::shard::TenantShardId; use pin_project_lite::pin_project; -use postgres_backend::{is_expected_io_error, QueryError}; +use postgres_backend::{QueryError, is_expected_io_error}; use pq_proto::framed::ConnectionError; - use strum::{EnumCount, IntoEnumIterator as _, VariantNames}; use strum_macros::{IntoStaticStr, VariantNames}; use utils::id::TimelineId; @@ -35,12 +34,12 @@ use crate::config::PageServerConf; use crate::context::{PageContentKind, RequestContext}; use crate::pgdatadir_mapping::DatadirModificationStats; use crate::task_mgr::TaskKind; +use crate::tenant::Timeline; use crate::tenant::layer_map::LayerMap; use crate::tenant::mgr::TenantSlot; use crate::tenant::storage_layer::{InMemoryLayer, PersistentLayerDesc}; use crate::tenant::tasks::BackgroundLoopKind; use crate::tenant::throttle::ThrottleResult; -use crate::tenant::Timeline; /// Prometheus histogram buckets (in seconds) for operations in the critical /// path. In other words, operations that directly affect that latency of user @@ -363,7 +362,7 @@ pub(crate) static PAGE_CACHE_SIZE: Lazy = pub(crate) mod page_cache_eviction_metrics { use std::num::NonZeroUsize; - use metrics::{register_int_counter_vec, IntCounter, IntCounterVec}; + use metrics::{IntCounter, IntCounterVec, register_int_counter_vec}; use once_cell::sync::Lazy; #[derive(Clone, Copy)] @@ -722,7 +721,7 @@ pub(crate) static RELSIZE_CACHE_MISSES_OLD: Lazy = Lazy::new(|| { }); pub(crate) mod initial_logical_size { - use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec}; + use metrics::{IntCounter, IntCounterVec, register_int_counter, register_int_counter_vec}; use once_cell::sync::Lazy; pub(crate) struct StartCalculation(IntCounterVec); @@ -1105,12 +1104,17 @@ impl EvictionsWithLowResidenceDuration { // - future "drop panick => abort" // // so just nag: (the error has the labels) - tracing::warn!("failed to remove EvictionsWithLowResidenceDuration, it was already removed? {e:#?}"); + tracing::warn!( + "failed to remove EvictionsWithLowResidenceDuration, it was already removed? {e:#?}" + ); } Ok(()) => { // to help identify cases where we double-remove the same values, let's log all // deletions? - tracing::info!("removed EvictionsWithLowResidenceDuration with {tenant_id}, {timeline_id}, {}, {threshold}", self.data_source); + tracing::info!( + "removed EvictionsWithLowResidenceDuration with {tenant_id}, {timeline_id}, {}, {threshold}", + self.data_source + ); } } } @@ -3574,12 +3578,10 @@ impl>, O, E> Future for MeasuredRemoteOp { } pub mod tokio_epoll_uring { - use std::{ - collections::HashMap, - sync::{Arc, Mutex}, - }; + use std::collections::HashMap; + use std::sync::{Arc, Mutex}; - use metrics::{register_histogram, register_int_counter, Histogram, LocalHistogram, UIntGauge}; + use metrics::{Histogram, LocalHistogram, UIntGauge, register_histogram, register_int_counter}; use once_cell::sync::Lazy; /// Shared storage for tokio-epoll-uring thread local metrics. 
@@ -3588,7 +3590,9 @@ pub mod tokio_epoll_uring { let slots_submission_queue_depth = register_histogram!( "pageserver_tokio_epoll_uring_slots_submission_queue_depth", "The slots waiters queue depth of each tokio_epoll_uring system", - vec![1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0], + vec![ + 1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0 + ], ) .expect("failed to define a metric"); ThreadLocalMetricsStorage { @@ -3765,7 +3769,7 @@ pub mod tokio_epoll_uring { } pub(crate) mod tenant_throttling { - use metrics::{register_int_counter_vec, IntCounter}; + use metrics::{IntCounter, register_int_counter_vec}; use once_cell::sync::Lazy; use utils::shard::TenantShardId; diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs index 45bf02362a..984dd125a9 100644 --- a/pageserver/src/page_cache.rs +++ b/pageserver/src/page_cache.rs @@ -67,23 +67,18 @@ //! mapping is automatically removed and the slot is marked free. //! -use std::{ - collections::{hash_map::Entry, HashMap}, - sync::{ - atomic::{AtomicU64, AtomicU8, AtomicUsize, Ordering}, - Arc, Weak, - }, - time::Duration, -}; +use std::collections::HashMap; +use std::collections::hash_map::Entry; +use std::sync::atomic::{AtomicU8, AtomicU64, AtomicUsize, Ordering}; +use std::sync::{Arc, Weak}; +use std::time::Duration; use anyhow::Context; use once_cell::sync::OnceCell; -use crate::{ - context::RequestContext, - metrics::{page_cache_eviction_metrics, PageCacheSizeMetrics}, - virtual_file::{IoBufferMut, IoPageSlice}, -}; +use crate::context::RequestContext; +use crate::metrics::{PageCacheSizeMetrics, page_cache_eviction_metrics}; +use crate::virtual_file::{IoBufferMut, IoPageSlice}; static PAGE_CACHE: OnceCell = OnceCell::new(); const TEST_PAGE_CACHE_SIZE: usize = 50; @@ -168,11 +163,7 @@ impl Slot { let count_res = self.usage_count .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |val| { - if val == 0 { - None - } else { - Some(val - 1) - } + if val == 0 { None } else { Some(val - 1) } }); match count_res { diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 0c8da6f2a8..8972515163 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1,7 +1,15 @@ //! The Page Service listens for client connections and serves their GetPage@LSN //! requests. 
-use anyhow::{bail, Context}; +use std::borrow::Cow; +use std::num::NonZeroUsize; +use std::os::fd::AsRawFd; +use std::str::FromStr; +use std::sync::Arc; +use std::time::{Duration, Instant, SystemTime}; +use std::{io, str}; + +use anyhow::{Context, bail}; use async_compression::tokio::write::GzipEncoder; use bytes::Buf; use futures::FutureExt; @@ -11,69 +19,57 @@ use pageserver_api::config::{ PageServicePipeliningConfig, PageServicePipeliningConfigPipelined, PageServiceProtocolPipelinedExecutionStrategy, }; -use pageserver_api::models::{self, TenantState}; +use pageserver_api::key::rel_block_to_key; use pageserver_api::models::{ - PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse, + self, PageTraceEvent, PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse, PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse, PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetSlruSegmentRequest, PagestreamGetSlruSegmentResponse, PagestreamNblocksRequest, PagestreamNblocksResponse, - PagestreamProtocolVersion, PagestreamRequest, + PagestreamProtocolVersion, PagestreamRequest, TenantState, }; +use pageserver_api::reltag::SlruKind; use pageserver_api::shard::TenantShardId; use postgres_backend::{ - is_expected_io_error, AuthType, PostgresBackend, PostgresBackendReader, QueryError, + AuthType, PostgresBackend, PostgresBackendReader, QueryError, is_expected_io_error, }; +use postgres_ffi::BLCKSZ; +use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID; use pq_proto::framed::ConnectionError; -use pq_proto::FeStartupPacket; -use pq_proto::{BeMessage, FeMessage, RowDescriptor}; -use std::borrow::Cow; -use std::io; -use std::num::NonZeroUsize; -use std::str; -use std::str::FromStr; -use std::sync::Arc; -use std::time::SystemTime; -use std::time::{Duration, Instant}; -use tokio::io::{AsyncRead, AsyncWrite}; -use tokio::io::{AsyncWriteExt, BufWriter}; +use pq_proto::{BeMessage, FeMessage, FeStartupPacket, RowDescriptor}; +use strum_macros::IntoStaticStr; +use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt, BufWriter}; use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; use tracing::*; +use utils::auth::{Claims, Scope, SwappableJwtAuth}; +use utils::failpoint_support; +use utils::id::{TenantId, TimelineId}; +use utils::logging::log_slow; +use utils::lsn::Lsn; +use utils::simple_rcu::RcuReadGuard; use utils::sync::gate::{Gate, GateGuard}; use utils::sync::spsc_fold; -use utils::{ - auth::{Claims, Scope, SwappableJwtAuth}, - id::{TenantId, TimelineId}, - lsn::Lsn, - simple_rcu::RcuReadGuard, -}; use crate::auth::check_permission; use crate::basebackup::BasebackupError; use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; -use crate::metrics::{self, SmgrOpTimer}; -use crate::metrics::{ComputeCommandKind, COMPUTE_COMMANDS_COUNTERS, LIVE_CONNECTIONS}; +use crate::metrics::{ + self, COMPUTE_COMMANDS_COUNTERS, ComputeCommandKind, LIVE_CONNECTIONS, SmgrOpTimer, +}; use crate::pgdatadir_mapping::Version; -use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; -use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id; -use crate::task_mgr::TaskKind; -use crate::task_mgr::{self, COMPUTE_REQUEST_RUNTIME}; -use crate::tenant::mgr::ShardSelector; -use crate::tenant::mgr::TenantManager; -use crate::tenant::mgr::{GetActiveTenantError, GetTenantError, ShardResolveResult}; +use crate::span::{ + debug_assert_current_span_has_tenant_and_timeline_id, + 
debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id, +}; +use crate::task_mgr::{self, COMPUTE_REQUEST_RUNTIME, TaskKind}; +use crate::tenant::mgr::{ + GetActiveTenantError, GetTenantError, ShardResolveResult, ShardSelector, TenantManager, +}; use crate::tenant::storage_layer::IoConcurrency; use crate::tenant::timeline::{self, WaitLsnError}; -use crate::tenant::GetTimelineError; -use crate::tenant::PageReconstructError; -use crate::tenant::Timeline; +use crate::tenant::{GetTimelineError, PageReconstructError, Timeline}; use crate::{basebackup, timed_after_cancellation}; -use pageserver_api::key::rel_block_to_key; -use pageserver_api::models::PageTraceEvent; -use pageserver_api::reltag::SlruKind; -use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID; -use postgres_ffi::BLCKSZ; -use std::os::fd::AsRawFd; /// How long we may wait for a [`crate::tenant::mgr::TenantSlot::InProgress`]` and/or a [`crate::tenant::Tenant`] which /// is not yet in state [`TenantState::Active`]. @@ -81,6 +77,9 @@ use std::os::fd::AsRawFd; /// NB: this is a different value than [`crate::http::routes::ACTIVE_TENANT_TIMEOUT`]. const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000); +/// Threshold at which to log slow GetPage requests. +const LOG_SLOW_GETPAGE_THRESHOLD: Duration = Duration::from_secs(30); + /////////////////////////////////////////////////////////////////////////////// pub struct Listener { @@ -594,6 +593,7 @@ struct BatchedTestRequest { /// NB: we only hold [`timeline::handle::WeakHandle`] inside this enum, /// so that we don't keep the [`Timeline::gate`] open while the batch /// is being built up inside the [`spsc_fold`] (pagestream pipelining). +#[derive(IntoStaticStr)] enum BatchedFeMessage { Exists { span: Span, @@ -638,6 +638,10 @@ enum BatchedFeMessage { } impl BatchedFeMessage { + fn as_static_str(&self) -> &'static str { + self.into() + } + fn observe_execution_start(&mut self, at: Instant) { match self { BatchedFeMessage::Exists { timer, .. 
} @@ -975,7 +979,7 @@ impl PageServerHandler { Ok(BatchedFeMessage::GetPage { span: _, shard: accum_shard, - pages: ref mut accum_pages, + pages: accum_pages, effective_request_lsn: accum_lsn, }), BatchedFeMessage::GetPage { @@ -1076,133 +1080,13 @@ impl PageServerHandler { batch }; - // invoke handler function - let (mut handler_results, span): ( - Vec>, - _, - ) = match batch { - BatchedFeMessage::Exists { - span, - timer, - shard, - req, - } => { - fail::fail_point!("ps::handle-pagerequest-message::exists"); - ( - vec![self - .handle_get_rel_exists_request(&*shard.upgrade()?, &req, ctx) - .instrument(span.clone()) - .await - .map(|msg| (msg, timer)) - .map_err(|err| BatchedPageStreamError { err, req: req.hdr })], - span, - ) - } - BatchedFeMessage::Nblocks { - span, - timer, - shard, - req, - } => { - fail::fail_point!("ps::handle-pagerequest-message::nblocks"); - ( - vec![self - .handle_get_nblocks_request(&*shard.upgrade()?, &req, ctx) - .instrument(span.clone()) - .await - .map(|msg| (msg, timer)) - .map_err(|err| BatchedPageStreamError { err, req: req.hdr })], - span, - ) - } - BatchedFeMessage::GetPage { - span, - shard, - effective_request_lsn, - pages, - } => { - fail::fail_point!("ps::handle-pagerequest-message::getpage"); - ( - { - let npages = pages.len(); - trace!(npages, "handling getpage request"); - let res = self - .handle_get_page_at_lsn_request_batched( - &*shard.upgrade()?, - effective_request_lsn, - pages, - io_concurrency, - ctx, - ) - .instrument(span.clone()) - .await; - assert_eq!(res.len(), npages); - res - }, - span, - ) - } - BatchedFeMessage::DbSize { - span, - timer, - shard, - req, - } => { - fail::fail_point!("ps::handle-pagerequest-message::dbsize"); - ( - vec![self - .handle_db_size_request(&*shard.upgrade()?, &req, ctx) - .instrument(span.clone()) - .await - .map(|msg| (msg, timer)) - .map_err(|err| BatchedPageStreamError { err, req: req.hdr })], - span, - ) - } - BatchedFeMessage::GetSlruSegment { - span, - timer, - shard, - req, - } => { - fail::fail_point!("ps::handle-pagerequest-message::slrusegment"); - ( - vec![self - .handle_get_slru_segment_request(&*shard.upgrade()?, &req, ctx) - .instrument(span.clone()) - .await - .map(|msg| (msg, timer)) - .map_err(|err| BatchedPageStreamError { err, req: req.hdr })], - span, - ) - } - #[cfg(feature = "testing")] - BatchedFeMessage::Test { - span, - shard, - requests, - } => { - fail::fail_point!("ps::handle-pagerequest-message::test"); - ( - { - let npages = requests.len(); - trace!(npages, "handling getpage request"); - let res = self - .handle_test_request_batch(&*shard.upgrade()?, requests, ctx) - .instrument(span.clone()) - .await; - assert_eq!(res.len(), npages); - res - }, - span, - ) - } - BatchedFeMessage::RespondError { span, error } => { - // We've already decided to respond with an error, so we don't need to - // call the handler. - (vec![Err(error)], span) - } - }; + // Dispatch the batch to the appropriate request handler. + let (mut handler_results, span) = log_slow( + batch.as_static_str(), + LOG_SLOW_GETPAGE_THRESHOLD, + self.pagestream_dispatch_batched_message(batch, io_concurrency, ctx), + ) + .await?; // We purposefully don't count flush time into the smgr operation timer. 
// @@ -1288,6 +1172,8 @@ impl PageServerHandler { &response_msg.serialize(protocol_version), ))?; + failpoint_support::sleep_millis_async!("before-pagestream-msg-flush", cancel); + // what we want to do let socket_fd = pgb_writer.socket_fd; let flush_fut = pgb_writer.flush(); @@ -1320,6 +1206,149 @@ impl PageServerHandler { Ok(()) } + /// Helper which dispatches a batched message to the appropriate handler. + /// Returns a vec of results, along with the extracted trace span. + async fn pagestream_dispatch_batched_message( + &mut self, + batch: BatchedFeMessage, + io_concurrency: IoConcurrency, + ctx: &RequestContext, + ) -> Result< + ( + Vec>, + Span, + ), + QueryError, + > { + Ok(match batch { + BatchedFeMessage::Exists { + span, + timer, + shard, + req, + } => { + fail::fail_point!("ps::handle-pagerequest-message::exists"); + ( + vec![ + self.handle_get_rel_exists_request(&*shard.upgrade()?, &req, ctx) + .instrument(span.clone()) + .await + .map(|msg| (msg, timer)) + .map_err(|err| BatchedPageStreamError { err, req: req.hdr }), + ], + span, + ) + } + BatchedFeMessage::Nblocks { + span, + timer, + shard, + req, + } => { + fail::fail_point!("ps::handle-pagerequest-message::nblocks"); + ( + vec![ + self.handle_get_nblocks_request(&*shard.upgrade()?, &req, ctx) + .instrument(span.clone()) + .await + .map(|msg| (msg, timer)) + .map_err(|err| BatchedPageStreamError { err, req: req.hdr }), + ], + span, + ) + } + BatchedFeMessage::GetPage { + span, + shard, + effective_request_lsn, + pages, + } => { + fail::fail_point!("ps::handle-pagerequest-message::getpage"); + ( + { + let npages = pages.len(); + trace!(npages, "handling getpage request"); + let res = self + .handle_get_page_at_lsn_request_batched( + &*shard.upgrade()?, + effective_request_lsn, + pages, + io_concurrency, + ctx, + ) + .instrument(span.clone()) + .await; + assert_eq!(res.len(), npages); + res + }, + span, + ) + } + BatchedFeMessage::DbSize { + span, + timer, + shard, + req, + } => { + fail::fail_point!("ps::handle-pagerequest-message::dbsize"); + ( + vec![ + self.handle_db_size_request(&*shard.upgrade()?, &req, ctx) + .instrument(span.clone()) + .await + .map(|msg| (msg, timer)) + .map_err(|err| BatchedPageStreamError { err, req: req.hdr }), + ], + span, + ) + } + BatchedFeMessage::GetSlruSegment { + span, + timer, + shard, + req, + } => { + fail::fail_point!("ps::handle-pagerequest-message::slrusegment"); + ( + vec![ + self.handle_get_slru_segment_request(&*shard.upgrade()?, &req, ctx) + .instrument(span.clone()) + .await + .map(|msg| (msg, timer)) + .map_err(|err| BatchedPageStreamError { err, req: req.hdr }), + ], + span, + ) + } + #[cfg(feature = "testing")] + BatchedFeMessage::Test { + span, + shard, + requests, + } => { + fail::fail_point!("ps::handle-pagerequest-message::test"); + ( + { + let npages = requests.len(); + trace!(npages, "handling getpage request"); + let res = self + .handle_test_request_batch(&*shard.upgrade()?, requests, ctx) + .instrument(span.clone()) + .await; + assert_eq!(res.len(), npages); + res + }, + span, + ) + } + BatchedFeMessage::RespondError { span, error } => { + // We've already decided to respond with an error, so we don't need to + // call the handler. + (vec![Err(error)], span) + } + }) + } + /// Pagestream sub-protocol handler. /// /// It is a simple request-response protocol inside a COPYBOTH session. 
@@ -1463,7 +1492,7 @@ impl PageServerHandler { } }; - let err = self + let result = self .pagesteam_handle_batched_message( pgb_writer, msg, @@ -1473,7 +1502,7 @@ impl PageServerHandler { ctx, ) .await; - match err { + match result { Ok(()) => {} Err(e) => break e, } @@ -2080,9 +2109,10 @@ impl PageServerHandler { set_tracing_field_shard_id(&timeline); if timeline.is_archived() == Some(true) { - // TODO after a grace period, turn this log line into a hard error - tracing::warn!("timeline {tenant_id}/{timeline_id} is archived, but got basebackup request for it."); - //return Err(QueryError::NotFound("timeline is archived".into())) + tracing::info!( + "timeline {tenant_id}/{timeline_id} is archived, but got basebackup request for it." + ); + return Err(QueryError::NotFound("timeline is archived".into())); } let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn(); diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index ae2762bd1e..787b1b895c 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -6,6 +6,36 @@ //! walingest.rs handles a few things like implicit relation creation and extension. //! Clarify that) //! +use std::collections::{BTreeMap, HashMap, HashSet, hash_map}; +use std::ops::{ControlFlow, Range}; + +use anyhow::{Context, ensure}; +use bytes::{Buf, Bytes, BytesMut}; +use enum_map::Enum; +use itertools::Itertools; +use pageserver_api::key::{ + AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, CompactKey, DBDIR_KEY, Key, RelDirExists, + TWOPHASEDIR_KEY, dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, + rel_size_to_key, rel_tag_sparse_key, rel_tag_sparse_key_range, relmap_file_key, + repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key, + slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range, +}; +use pageserver_api::keyspace::SparseKeySpace; +use pageserver_api::record::NeonWalRecord; +use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; +use pageserver_api::shard::ShardIdentity; +use pageserver_api::value::Value; +use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; +use postgres_ffi::{BLCKSZ, Oid, RepOriginId, TimestampTz, TransactionId}; +use serde::{Deserialize, Serialize}; +use strum::IntoEnumIterator; +use tokio_util::sync::CancellationToken; +use tracing::{debug, info, trace, warn}; +use utils::bin_ser::{BeSer, DeserializeError}; +use utils::lsn::Lsn; +use utils::pausable_failpoint; +use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta}; + use super::tenant::{PageReconstructError, Timeline}; use crate::aux_file; use crate::context::RequestContext; @@ -19,37 +49,6 @@ use crate::span::{ }; use crate::tenant::storage_layer::IoConcurrency; use crate::tenant::timeline::GetVectoredError; -use anyhow::{ensure, Context}; -use bytes::{Buf, Bytes, BytesMut}; -use enum_map::Enum; -use itertools::Itertools; -use pageserver_api::key::{ - dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key, - rel_tag_sparse_key_range, relmap_file_key, repl_origin_key, repl_origin_key_range, - slru_block_to_key, slru_dir_to_key, slru_segment_key_range, slru_segment_size_to_key, - twophase_file_key, twophase_key_range, CompactKey, RelDirExists, AUX_FILES_KEY, CHECKPOINT_KEY, - CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY, -}; -use pageserver_api::key::{rel_tag_sparse_key, Key}; -use pageserver_api::keyspace::SparseKeySpace; -use pageserver_api::record::NeonWalRecord; -use 
pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; -use pageserver_api::shard::ShardIdentity; -use pageserver_api::value::Value; -use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; -use postgres_ffi::BLCKSZ; -use postgres_ffi::{Oid, RepOriginId, TimestampTz, TransactionId}; -use serde::{Deserialize, Serialize}; -use std::collections::{hash_map, BTreeMap, HashMap, HashSet}; -use std::ops::ControlFlow; -use std::ops::Range; -use strum::IntoEnumIterator; -use tokio_util::sync::CancellationToken; -use tracing::{debug, trace, warn}; -use utils::bin_ser::DeserializeError; -use utils::pausable_failpoint; -use utils::{bin_ser::BeSer, lsn::Lsn}; -use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta}; /// Max delta records appended to the AUX_FILES_KEY (for aux v1). The write path will write a full image once this threshold is reached. pub const MAX_AUX_FILE_DELTAS: usize = 1024; @@ -327,16 +326,16 @@ impl Timeline { let clone = match &res { Ok(buf) => Ok(buf.clone()), Err(err) => Err(match err { - PageReconstructError::Cancelled => { - PageReconstructError::Cancelled - } + PageReconstructError::Cancelled => PageReconstructError::Cancelled, - x @ PageReconstructError::Other(_) | - x @ PageReconstructError::AncestorLsnTimeout(_) | - x @ PageReconstructError::WalRedo(_) | - x @ PageReconstructError::MissingKey(_) => { - PageReconstructError::Other(anyhow::anyhow!("there was more than one request for this key in the batch, error logged once: {x:?}")) - }, + x @ PageReconstructError::Other(_) + | x @ PageReconstructError::AncestorLsnTimeout(_) + | x @ PageReconstructError::WalRedo(_) + | x @ PageReconstructError::MissingKey(_) => { + PageReconstructError::Other(anyhow::anyhow!( + "there was more than one request for this key in the batch, error logged once: {x:?}" + )) + } }), }; @@ -355,23 +354,23 @@ impl Timeline { // this whole `match` is a lot like `From for PageReconstructError` // but without taking ownership of the GetVectoredError let err = match &err { - GetVectoredError::Cancelled => { - Err(PageReconstructError::Cancelled) - } + GetVectoredError::Cancelled => Err(PageReconstructError::Cancelled), // TODO: restructure get_vectored API to make this error per-key GetVectoredError::MissingKey(err) => { - Err(PageReconstructError::Other(anyhow::anyhow!("whole vectored get request failed because one or more of the requested keys were missing: {err:?}"))) + Err(PageReconstructError::Other(anyhow::anyhow!( + "whole vectored get request failed because one or more of the requested keys were missing: {err:?}" + ))) } // TODO: restructure get_vectored API to make this error per-key GetVectoredError::GetReadyAncestorError(err) => { - Err(PageReconstructError::Other(anyhow::anyhow!("whole vectored get request failed because one or more key required ancestor that wasn't ready: {err:?}"))) + Err(PageReconstructError::Other(anyhow::anyhow!( + "whole vectored get request failed because one or more key required ancestor that wasn't ready: {err:?}" + ))) } // TODO: restructure get_vectored API to make this error per-key - GetVectoredError::Other(err) => { - Err(PageReconstructError::Other( - anyhow::anyhow!("whole vectored get request failed: {err:?}"), - )) - } + GetVectoredError::Other(err) => Err(PageReconstructError::Other( + anyhow::anyhow!("whole vectored get request failed: {err:?}"), + )), // TODO: we can prevent this error class by moving this check into the type system GetVectoredError::InvalidLsn(e) => { Err(anyhow::anyhow!("invalid LSN: {e:?}").into()) @@ 
-379,10 +378,7 @@ impl Timeline { // NB: this should never happen in practice because we limit MAX_GET_VECTORED_KEYS // TODO: we can prevent this error class by moving this check into the type system GetVectoredError::Oversized(err) => { - Err(anyhow::anyhow!( - "batching oversized: {err:?}" - ) - .into()) + Err(anyhow::anyhow!("batching oversized: {err:?}").into()) } }; @@ -715,7 +711,10 @@ impl Timeline { { Ok(res) => res, Err(PageReconstructError::MissingKey(e)) => { - warn!("Missing key while find_lsn_for_timestamp. Either we might have already garbage-collected that data or the key is really missing. Last error: {:#}", e); + warn!( + "Missing key while find_lsn_for_timestamp. Either we might have already garbage-collected that data or the key is really missing. Last error: {:#}", + e + ); // Return that we didn't find any requests smaller than the LSN, and logging the error. return Ok(LsnForTimestamp::Past(min_lsn)); } @@ -2264,6 +2263,13 @@ impl DatadirModification<'_> { self.tline.aux_file_size_estimator.on_add(content.len()); new_files.push((path, content)); } + // Compute may request delete of old version of pgstat AUX file if new one exceeds size limit. + // Compute doesn't know if previous version of this file exists or not, so + // attempt to delete non-existing file can cause this message. + // To avoid false alarms, log it as info rather than warning. + (None, true) if path.starts_with("pg_stat/") => { + info!("removing non-existing pg_stat file: {}", path) + } (None, true) => warn!("removing non-existing aux file: {}", path), } let new_val = aux_file::encode_file_value(&new_files)?; @@ -2457,10 +2463,12 @@ impl DatadirModification<'_> { // modifications before ingesting DB create operations, which are the only kind that reads // data pages during ingest. 
if cfg!(debug_assertions) { - assert!(!self - .pending_data_batch - .as_ref() - .is_some_and(|b| b.updates_key(&key))); + assert!( + !self + .pending_data_batch + .as_ref() + .is_some_and(|b| b.updates_key(&key)) + ); } } @@ -2659,15 +2667,14 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]); #[cfg(test)] mod tests { use hex_literal::hex; - use pageserver_api::{models::ShardParameters, shard::ShardStripeSize}; - use utils::{ - id::TimelineId, - shard::{ShardCount, ShardNumber}, - }; + use pageserver_api::models::ShardParameters; + use pageserver_api::shard::ShardStripeSize; + use utils::id::TimelineId; + use utils::shard::{ShardCount, ShardNumber}; use super::*; - - use crate::{tenant::harness::TenantHarness, DEFAULT_PG_VERSION}; + use crate::DEFAULT_PG_VERSION; + use crate::tenant::harness::TenantHarness; /// Test a round trip of aux file updates, from DatadirModification to reading back from the Timeline #[tokio::test] diff --git a/pageserver/src/statvfs.rs b/pageserver/src/statvfs.rs index 4e8be58d58..85c2ed8499 100644 --- a/pageserver/src/statvfs.rs +++ b/pageserver/src/statvfs.rs @@ -73,11 +73,10 @@ impl Statvfs { pub mod mock { use camino::Utf8Path; + pub use pageserver_api::config::statvfs::mock::Behavior; use regex::Regex; use tracing::log::info; - pub use pageserver_api::config::statvfs::mock::Behavior; - pub fn get(tenants_dir: &Utf8Path, behavior: &Behavior) -> nix::Result { info!("running mocked statvfs"); @@ -85,7 +84,7 @@ pub mod mock { Behavior::Success { blocksize, total_blocks, - ref name_filter, + name_filter, } => { let used_bytes = walk_dir_disk_usage(tenants_dir, name_filter.as_deref()).unwrap(); @@ -134,7 +133,7 @@ pub mod mock { } Err(e) => { return Err(anyhow::Error::new(e) - .context(format!("get metadata of {:?}", entry.path()))) + .context(format!("get metadata of {:?}", entry.path()))); } }; total += m.len(); diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index cc93a06ccd..0b71b2cf5b 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -40,15 +40,12 @@ use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::{Arc, Mutex}; use futures::FutureExt; +use once_cell::sync::Lazy; use pageserver_api::shard::TenantShardId; use tokio::task::JoinHandle; use tokio::task_local; use tokio_util::sync::CancellationToken; - use tracing::{debug, error, info, warn}; - -use once_cell::sync::Lazy; - use utils::env; use utils::id::TimelineId; diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index efb35625f2..9243f131ad 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -12,149 +12,99 @@ //! parent timeline, and the last LSN that has been written to disk. //! 
-use anyhow::{bail, Context}; +use std::collections::hash_map::Entry; +use std::collections::{BTreeMap, HashMap, HashSet}; +use std::fmt::{Debug, Display}; +use std::fs::File; +use std::future::Future; +use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; +use std::sync::{Arc, Mutex, Weak}; +use std::time::{Duration, Instant, SystemTime}; +use std::{fmt, fs}; + +use anyhow::{Context, bail}; use arc_swap::ArcSwap; -use camino::Utf8Path; -use camino::Utf8PathBuf; +use camino::{Utf8Path, Utf8PathBuf}; use chrono::NaiveDateTime; use enumset::EnumSet; -use futures::stream::FuturesUnordered; use futures::StreamExt; +use futures::stream::FuturesUnordered; use itertools::Itertools as _; +use once_cell::sync::Lazy; use pageserver_api::models; -use pageserver_api::models::CompactInfoResponse; -use pageserver_api::models::LsnLease; -use pageserver_api::models::TimelineArchivalState; -use pageserver_api::models::TimelineState; -use pageserver_api::models::TopTenantShardItem; -use pageserver_api::models::WalRedoManagerStatus; -use pageserver_api::shard::ShardIdentity; -use pageserver_api::shard::ShardStripeSize; -use pageserver_api::shard::TenantShardId; -use remote_storage::DownloadError; -use remote_storage::GenericRemoteStorage; -use remote_storage::TimeoutOrCancel; -use remote_timeline_client::manifest::{ - OffloadedTimelineManifest, TenantManifest, LATEST_TENANT_MANIFEST_VERSION, +pub use pageserver_api::models::TenantState; +use pageserver_api::models::{ + CompactInfoResponse, LsnLease, TimelineArchivalState, TimelineState, TopTenantShardItem, + WalRedoManagerStatus, }; -use remote_timeline_client::UploadQueueNotReadyError; -use remote_timeline_client::FAILED_REMOTE_OP_RETRIES; -use remote_timeline_client::FAILED_UPLOAD_WARN_THRESHOLD; -use secondary::heatmap::HeatMapTenant; -use secondary::heatmap::HeatMapTimeline; -use std::collections::BTreeMap; -use std::fmt; -use std::future::Future; -use std::sync::atomic::AtomicBool; -use std::sync::Weak; -use std::time::SystemTime; +use pageserver_api::shard::{ShardIdentity, ShardStripeSize, TenantShardId}; +use remote_storage::{DownloadError, GenericRemoteStorage, TimeoutOrCancel}; +use remote_timeline_client::index::GcCompactionState; +use remote_timeline_client::manifest::{ + LATEST_TENANT_MANIFEST_VERSION, OffloadedTimelineManifest, TenantManifest, +}; +use remote_timeline_client::{ + FAILED_REMOTE_OP_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD, UploadQueueNotReadyError, +}; +use secondary::heatmap::{HeatMapTenant, HeatMapTimeline}; use storage_broker::BrokerClientChannel; -use timeline::compaction::CompactionOutcome; -use timeline::compaction::GcCompactionQueue; -use timeline::import_pgdata; -use timeline::offload::offload_timeline; -use timeline::offload::OffloadError; -use timeline::CompactFlags; -use timeline::CompactOptions; -use timeline::CompactionError; -use timeline::PreviousHeatmap; -use timeline::ShutdownMode; +use timeline::compaction::{CompactionOutcome, GcCompactionQueue}; +use timeline::offload::{OffloadError, offload_timeline}; +use timeline::{ + CompactFlags, CompactOptions, CompactionError, PreviousHeatmap, ShutdownMode, import_pgdata, +}; use tokio::io::BufReader; -use tokio::sync::watch; -use tokio::sync::Notify; +use tokio::sync::{Notify, Semaphore, watch}; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use tracing::*; use upload_queue::NotInitialized; -use utils::backoff; use utils::circuit_breaker::CircuitBreaker; -use utils::completion; use utils::crashsafe::path_with_suffix_extension; -use 
utils::failpoint_support; -use utils::fs_ext; -use utils::pausable_failpoint; -use utils::sync::gate::Gate; -use utils::sync::gate::GateGuard; -use utils::timeout::timeout_cancellable; -use utils::timeout::TimeoutCancellableError; +use utils::sync::gate::{Gate, GateGuard}; +use utils::timeout::{TimeoutCancellableError, timeout_cancellable}; use utils::try_rcu::ArcSwapExt; -use utils::zstd::create_zst_tarball; -use utils::zstd::extract_zst_tarball; +use utils::zstd::{create_zst_tarball, extract_zst_tarball}; +use utils::{backoff, completion, failpoint_support, fs_ext, pausable_failpoint}; -use self::config::AttachedLocationConfig; -use self::config::AttachmentMode; -use self::config::LocationConf; -use self::config::TenantConf; +use self::config::{AttachedLocationConfig, AttachmentMode, LocationConf, TenantConf}; use self::metadata::TimelineMetadata; -use self::mgr::GetActiveTenantError; -use self::mgr::GetTenantError; +use self::mgr::{GetActiveTenantError, GetTenantError}; use self::remote_timeline_client::upload::{upload_index_part, upload_tenant_manifest}; use self::remote_timeline_client::{RemoteTimelineClient, WaitCompletionError}; -use self::timeline::uninit::TimelineCreateGuard; -use self::timeline::uninit::TimelineExclusionError; -use self::timeline::uninit::UninitializedTimeline; -use self::timeline::EvictionTaskTenantState; -use self::timeline::GcCutoffs; -use self::timeline::TimelineDeleteProgress; -use self::timeline::TimelineResources; -use self::timeline::WaitLsnError; +use self::timeline::uninit::{TimelineCreateGuard, TimelineExclusionError, UninitializedTimeline}; +use self::timeline::{ + EvictionTaskTenantState, GcCutoffs, TimelineDeleteProgress, TimelineResources, WaitLsnError, +}; use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; -use crate::deletion_queue::DeletionQueueClient; -use crate::deletion_queue::DeletionQueueError; -use crate::import_datadir; +use crate::deletion_queue::{DeletionQueueClient, DeletionQueueError}; use crate::l0_flush::L0FlushGlobalState; -use crate::metrics::CONCURRENT_INITDBS; -use crate::metrics::INITDB_RUN_TIME; -use crate::metrics::INITDB_SEMAPHORE_ACQUISITION_TIME; -use crate::metrics::TENANT; use crate::metrics::{ - remove_tenant_metrics, BROKEN_TENANTS_SET, CIRCUIT_BREAKERS_BROKEN, CIRCUIT_BREAKERS_UNBROKEN, - TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC, + BROKEN_TENANTS_SET, CIRCUIT_BREAKERS_BROKEN, CIRCUIT_BREAKERS_UNBROKEN, CONCURRENT_INITDBS, + INITDB_RUN_TIME, INITDB_SEMAPHORE_ACQUISITION_TIME, TENANT, TENANT_STATE_METRIC, + TENANT_SYNTHETIC_SIZE_METRIC, remove_tenant_metrics, }; -use crate::task_mgr; use crate::task_mgr::TaskKind; -use crate::tenant::config::LocationMode; -use crate::tenant::config::TenantConfOpt; +use crate::tenant::config::{LocationMode, TenantConfOpt}; use crate::tenant::gc_result::GcResult; pub use crate::tenant::remote_timeline_client::index::IndexPart; -use crate::tenant::remote_timeline_client::remote_initdb_archive_path; -use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart; -use crate::tenant::remote_timeline_client::INITDB_PATH; -use crate::tenant::storage_layer::DeltaLayer; -use crate::tenant::storage_layer::ImageLayer; -use crate::walingest::WalLagCooldown; -use crate::walredo; -use crate::InitializationOrder; -use std::collections::hash_map::Entry; -use std::collections::HashMap; -use std::collections::HashSet; -use std::fmt::Debug; -use std::fmt::Display; -use std::fs; -use std::fs::File; -use std::sync::atomic::{AtomicU64, Ordering}; -use 
std::sync::Arc; -use std::sync::Mutex; -use std::time::{Duration, Instant}; - -use crate::span; +use crate::tenant::remote_timeline_client::{ + INITDB_PATH, MaybeDeletedIndexPart, remote_initdb_archive_path, +}; +use crate::tenant::storage_layer::{DeltaLayer, ImageLayer}; use crate::tenant::timeline::delete::DeleteTimelineFlow; use crate::tenant::timeline::uninit::cleanup_timeline_directory; use crate::virtual_file::VirtualFile; +use crate::walingest::WalLagCooldown; use crate::walredo::PostgresRedoManager; -use crate::TEMP_FILE_SUFFIX; -use once_cell::sync::Lazy; -pub use pageserver_api::models::TenantState; -use tokio::sync::Semaphore; +use crate::{InitializationOrder, TEMP_FILE_SUFFIX, import_datadir, span, task_mgr, walredo}; static INIT_DB_SEMAPHORE: Lazy = Lazy::new(|| Semaphore::new(8)); -use utils::{ - crashsafe, - generation::Generation, - id::TimelineId, - lsn::{Lsn, RecordLsn}, -}; +use utils::crashsafe; +use utils::generation::Generation; +use utils::id::TimelineId; +use utils::lsn::{Lsn, RecordLsn}; pub mod blob_io; pub mod block_io; @@ -183,9 +133,9 @@ mod gc_block; mod gc_result; pub(crate) mod throttle; -pub(crate) use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; pub(crate) use timeline::{LogicalSizeCalculationCause, PageReconstructError, Timeline}; +pub(crate) use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; // re-export for use in walreceiver pub use crate::tenant::timeline::WalReceiverInfo; @@ -250,7 +200,9 @@ impl AttachedTenantConf { Ok(Self::new(location_conf.tenant_conf, *attach_conf)) } LocationMode::Secondary(_) => { - anyhow::bail!("Attempted to construct AttachedTenantConf from a LocationConf in secondary mode") + anyhow::bail!( + "Attempted to construct AttachedTenantConf from a LocationConf in secondary mode" + ) } } } @@ -464,7 +416,9 @@ impl WalredoManagerId { static NEXT: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(1); let id = NEXT.fetch_add(1, std::sync::atomic::Ordering::Relaxed); if id == 0 { - panic!("WalredoManagerId::new() returned 0, indicating wraparound, risking it's no longer unique"); + panic!( + "WalredoManagerId::new() returned 0, indicating wraparound, risking it's no longer unique" + ); } Self(id) } @@ -1168,6 +1122,7 @@ impl Tenant { resources, CreateTimelineCause::Load, idempotency.clone(), + index_part.gc_compaction.clone(), )?; let disk_consistent_lsn = timeline.get_disk_consistent_lsn(); anyhow::ensure!( @@ -1189,12 +1144,47 @@ impl Tenant { format!("Failed to load layermap for timeline {tenant_id}/{timeline_id}") })?; + // When unarchiving, we've mostly likely lost the heatmap generated prior + // to the archival operation. To allow warming this timeline up, generate + // a previous heatmap which contains all visible layers in the layer map. + // This previous heatmap will be used whenever a fresh heatmap is generated + // for the timeline. + if matches!(cause, LoadTimelineCause::Unoffload) { + let mut tline_ending_at = Some((&timeline, timeline.get_last_record_lsn())); + while let Some((tline, end_lsn)) = tline_ending_at { + let unarchival_heatmap = tline.generate_unarchival_heatmap(end_lsn).await; + if !tline.is_previous_heatmap_active() { + tline + .previous_heatmap + .store(Some(Arc::new(unarchival_heatmap))); + } else { + tracing::info!("Previous heatmap still active. Dropping unarchival heatmap.") + } + + match tline.ancestor_timeline() { + Some(ancestor) => { + if ancestor.update_layer_visibility().await.is_err() { + // Ancestor timeline is shutting down. 
+ break; + } + + tline_ending_at = Some((ancestor, tline.get_ancestor_lsn())); + } + None => { + tline_ending_at = None; + } + } + } + } + match import_pgdata { Some(import_pgdata) if !import_pgdata.is_done() => { match cause { LoadTimelineCause::Attach | LoadTimelineCause::Unoffload => (), LoadTimelineCause::ImportPgdata { .. } => { - unreachable!("ImportPgdata should not be reloading timeline import is done and persisted as such in s3") + unreachable!( + "ImportPgdata should not be reloading timeline import is done and persisted as such in s3" + ) } } let mut guard = self.timelines_creating.lock().unwrap(); @@ -1227,8 +1217,8 @@ impl Tenant { // We should never try and load the same timeline twice during startup Entry::Occupied(_) => { unreachable!( - "Timeline {tenant_id}/{timeline_id} already exists in the tenant map" - ); + "Timeline {tenant_id}/{timeline_id} already exists in the tenant map" + ); } Entry::Vacant(v) => { v.insert(Arc::clone(&timeline)); @@ -1622,7 +1612,9 @@ impl Tenant { failpoint_support::sleep_millis_async!("before-attaching-tenant"); let Some(preload) = preload else { - anyhow::bail!("local-only deployment is no longer supported, https://github.com/neondatabase/neon/issues/5624"); + anyhow::bail!( + "local-only deployment is no longer supported, https://github.com/neondatabase/neon/issues/5624" + ); }; let mut offloaded_timeline_ids = HashSet::new(); @@ -2006,7 +1998,7 @@ impl Tenant { remote_storage: GenericRemoteStorage, previous_heatmap: Option, cancel: CancellationToken, - ) -> impl Future { + ) -> impl Future + use<> { let client = self.build_timeline_client(timeline_id, remote_storage); async move { debug_assert_current_span_has_tenant_and_timeline_id(); @@ -2701,7 +2693,9 @@ impl Tenant { timeline } CreateTimelineResult::ImportSpawned(timeline) => { - info!("import task spawned, timeline will become visible and activated once the import is done"); + info!( + "import task spawned, timeline will become visible and activated once the import is done" + ); timeline } }; @@ -2747,7 +2741,7 @@ impl Tenant { { StartCreatingTimelineResult::CreateGuard(guard) => guard, StartCreatingTimelineResult::Idempotent(timeline) => { - return Ok(CreateTimelineResult::Idempotent(timeline)) + return Ok(CreateTimelineResult::Idempotent(timeline)); } }; @@ -2881,7 +2875,9 @@ impl Tenant { let index_part = match index_part { MaybeDeletedIndexPart::Deleted(_) => { // likely concurrent delete call, cplane should prevent this - anyhow::bail!("index part says deleted but we are not done creating yet, this should not happen but") + anyhow::bail!( + "index part says deleted but we are not done creating yet, this should not happen but" + ) } MaybeDeletedIndexPart::IndexPart(p) => p, }; @@ -3092,20 +3088,19 @@ impl Tenant { // If we're done compacting, check the scheduled GC compaction queue for more work. 
if outcome == CompactionOutcome::Done { - let queue = self - .scheduled_compaction_tasks - .lock() - .unwrap() - .get(&timeline.timeline_id) - .cloned(); - if let Some(queue) = queue { - outcome = queue - .iteration(cancel, ctx, &self.gc_block, &timeline) - .instrument( - info_span!("gc_compact_timeline", timeline_id = %timeline.timeline_id), - ) - .await?; - } + let queue = { + let mut guard = self.scheduled_compaction_tasks.lock().unwrap(); + guard + .entry(timeline.timeline_id) + .or_insert_with(|| Arc::new(GcCompactionQueue::new())) + .clone() + }; + outcome = queue + .iteration(cancel, ctx, &self.gc_block, &timeline) + .instrument( + info_span!("gc_compact_timeline", timeline_id = %timeline.timeline_id), + ) + .await?; } // If we're done compacting, offload the timeline if requested. @@ -3146,11 +3141,13 @@ impl Tenant { /// Trips the compaction circuit breaker if appropriate. pub(crate) fn maybe_trip_compaction_breaker(&self, err: &CompactionError) { match err { + err if err.is_cancel() => {} CompactionError::ShuttingDown => (), // Offload failures don't trip the circuit breaker, since they're cheap to retry and // shouldn't block compaction. CompactionError::Offload(_) => {} CompactionError::CollectKeySpaceError(err) => { + // CollectKeySpaceError::Cancelled and PageRead::Cancelled are handled in `err.is_cancel` branch. self.compaction_circuit_breaker .lock() .unwrap() @@ -3162,6 +3159,7 @@ impl Tenant { .unwrap() .fail(&CIRCUIT_BREAKERS_BROKEN, err); } + CompactionError::AlreadyRunning(_) => {} } } @@ -3872,7 +3870,9 @@ where if !later.is_empty() { for (missing_id, orphan_ids) in later { for (orphan_id, _) in orphan_ids { - error!("could not load timeline {orphan_id} because its ancestor timeline {missing_id} could not be loaded"); + error!( + "could not load timeline {orphan_id} because its ancestor timeline {missing_id} could not be loaded" + ); } } bail!("could not load tenant because some timelines are missing ancestors"); @@ -4117,6 +4117,7 @@ impl Tenant { resources: TimelineResources, cause: CreateTimelineCause, create_idempotency: CreateTimelineIdempotency, + gc_compaction_state: Option, ) -> anyhow::Result> { let state = match cause { CreateTimelineCause::Load => { @@ -4148,6 +4149,7 @@ impl Tenant { state, self.attach_wal_lag_cooldown.clone(), create_idempotency, + gc_compaction_state, self.cancel.child_token(), ); @@ -4790,7 +4792,10 @@ impl Tenant { let gc_info = src_timeline.gc_info.read().unwrap(); let planned_cutoff = gc_info.min_cutoff(); if gc_info.lsn_covered_by_lease(start_lsn) { - tracing::info!("skipping comparison of {start_lsn} with gc cutoff {} and planned gc cutoff {planned_cutoff} due to lsn lease", *applied_gc_cutoff_lsn); + tracing::info!( + "skipping comparison of {start_lsn} with gc cutoff {} and planned gc cutoff {planned_cutoff} due to lsn lease", + *applied_gc_cutoff_lsn + ); } else { src_timeline .check_lsn_is_in_scope(start_lsn, &applied_gc_cutoff_lsn) @@ -4936,7 +4941,9 @@ impl Tenant { } // Idempotent <=> CreateTimelineIdempotency is identical (x, y) if x == y => { - info!("timeline already exists and idempotency matches, succeeding request"); + info!( + "timeline already exists and idempotency matches, succeeding request" + ); // fallthrough } (_, _) => { @@ -5018,7 +5025,7 @@ impl Tenant { { StartCreatingTimelineResult::CreateGuard(guard) => guard, StartCreatingTimelineResult::Idempotent(timeline) => { - return Ok(CreateTimelineResult::Idempotent(timeline)) + return Ok(CreateTimelineResult::Idempotent(timeline)); } }; @@ -5213,6 +5220,7 @@ 
impl Tenant { resources, CreateTimelineCause::Load, create_guard.idempotency.clone(), + None, ) .context("Failed to create timeline data structure")?; @@ -5222,7 +5230,9 @@ impl Tenant { .create_timeline_files(&create_guard.timeline_path) .await { - error!("Failed to create initial files for timeline {tenant_shard_id}/{new_timeline_id}, cleaning up: {e:?}"); + error!( + "Failed to create initial files for timeline {tenant_shard_id}/{new_timeline_id}, cleaning up: {e:?}" + ); cleanup_timeline_directory(create_guard); return Err(e); } @@ -5587,20 +5597,19 @@ pub async fn dump_layerfile_from_path( #[cfg(test)] pub(crate) mod harness { use bytes::{Bytes, BytesMut}; + use hex_literal::hex; use once_cell::sync::OnceCell; + use pageserver_api::key::Key; use pageserver_api::models::ShardParameters; + use pageserver_api::record::NeonWalRecord; use pageserver_api::shard::ShardIndex; + use utils::id::TenantId; use utils::logging; + use super::*; use crate::deletion_queue::mock::MockDeletionQueue; use crate::l0_flush::L0FlushConfig; use crate::walredo::apply_neon; - use pageserver_api::key::Key; - use pageserver_api::record::NeonWalRecord; - - use super::*; - use hex_literal::hex; - use utils::id::TenantId; pub const TIMELINE_ID: TimelineId = TimelineId::from_array(hex!("11223344556677881122334455667788")); @@ -5881,34 +5890,34 @@ pub(crate) mod harness { mod tests { use std::collections::{BTreeMap, BTreeSet}; - use super::*; - use crate::keyspace::KeySpaceAccum; - use crate::tenant::harness::*; - use crate::tenant::timeline::CompactFlags; - use crate::DEFAULT_PG_VERSION; use bytes::{Bytes, BytesMut}; use hex_literal::hex; use itertools::Itertools; - use pageserver_api::key::{Key, AUX_KEY_PREFIX, NON_INHERITED_RANGE, RELATION_SIZE_PREFIX}; + #[cfg(feature = "testing")] + use models::CompactLsnRange; + use pageserver_api::key::{AUX_KEY_PREFIX, Key, NON_INHERITED_RANGE, RELATION_SIZE_PREFIX}; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings}; + #[cfg(feature = "testing")] + use pageserver_api::record::NeonWalRecord; use pageserver_api::value::Value; use pageserver_compaction::helpers::overlaps_with; - use rand::{thread_rng, Rng}; + use rand::{Rng, thread_rng}; use storage_layer::{IoConcurrency, PersistentLayerKey}; use tests::storage_layer::ValuesReconstructState; use tests::timeline::{GetVectoredError, ShutdownMode}; + #[cfg(feature = "testing")] + use timeline::GcInfo; + #[cfg(feature = "testing")] + use timeline::compaction::{KeyHistoryRetention, KeyLogAtLsn}; use timeline::{CompactOptions, DeltaLayerTestDesc}; use utils::id::TenantId; - #[cfg(feature = "testing")] - use models::CompactLsnRange; - #[cfg(feature = "testing")] - use pageserver_api::record::NeonWalRecord; - #[cfg(feature = "testing")] - use timeline::compaction::{KeyHistoryRetention, KeyLogAtLsn}; - #[cfg(feature = "testing")] - use timeline::GcInfo; + use super::*; + use crate::DEFAULT_PG_VERSION; + use crate::keyspace::KeySpaceAccum; + use crate::tenant::harness::*; + use crate::tenant::timeline::CompactFlags; static TEST_KEY: Lazy = Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001"))); @@ -6158,11 +6167,12 @@ mod tests { panic!("wrong error type") }; assert!(err.to_string().contains("invalid branch start lsn")); - assert!(err - .source() - .unwrap() - .to_string() - .contains("we might've already garbage collected needed data")) + assert!( + err.source() + .unwrap() + .to_string() + .contains("we might've already garbage collected needed 
data") + ) } } @@ -6191,11 +6201,12 @@ mod tests { panic!("wrong error type"); }; assert!(&err.to_string().contains("invalid branch start lsn")); - assert!(&err - .source() - .unwrap() - .to_string() - .contains("is earlier than latest GC cutoff")); + assert!( + &err.source() + .unwrap() + .to_string() + .contains("is earlier than latest GC cutoff") + ); } } @@ -7504,10 +7515,12 @@ mod tests { } } - assert!(!harness - .conf - .timeline_path(&tenant.tenant_shard_id, &TIMELINE_ID) - .exists()); + assert!( + !harness + .conf + .timeline_path(&tenant.tenant_shard_id, &TIMELINE_ID) + .exists() + ); Ok(()) } @@ -7708,7 +7721,10 @@ mod tests { let after_num_l0_delta_files = tline.layers.read().await.layer_map()?.level0_deltas().len(); - assert!(after_num_l0_delta_files < before_num_l0_delta_files, "after_num_l0_delta_files={after_num_l0_delta_files}, before_num_l0_delta_files={before_num_l0_delta_files}"); + assert!( + after_num_l0_delta_files < before_num_l0_delta_files, + "after_num_l0_delta_files={after_num_l0_delta_files}, before_num_l0_delta_files={before_num_l0_delta_files}" + ); assert_eq!( tline.get(test_key, lsn, &ctx).await?, @@ -7875,7 +7891,10 @@ mod tests { let (_, after_delta_file_accessed) = scan_with_statistics(&tline, &keyspace, lsn, &ctx, io_concurrency.clone()) .await?; - assert!(after_delta_file_accessed < before_delta_file_accessed, "after_delta_file_accessed={after_delta_file_accessed}, before_delta_file_accessed={before_delta_file_accessed}"); + assert!( + after_delta_file_accessed < before_delta_file_accessed, + "after_delta_file_accessed={after_delta_file_accessed}, before_delta_file_accessed={before_delta_file_accessed}" + ); // Given that we already produced an image layer, there should be no delta layer needed for the scan, but still setting a low threshold there for unforeseen circumstances. assert!( after_delta_file_accessed <= 2, @@ -7929,10 +7948,12 @@ mod tests { get_vectored_impl_wrapper(&tline, base_key, lsn, &ctx).await?, Some(test_img("data key 1")) ); - assert!(get_vectored_impl_wrapper(&tline, base_key_child, lsn, &ctx) - .await - .unwrap_err() - .is_missing_key_error()); + assert!( + get_vectored_impl_wrapper(&tline, base_key_child, lsn, &ctx) + .await + .unwrap_err() + .is_missing_key_error() + ); assert!( get_vectored_impl_wrapper(&tline, base_key_nonexist, lsn, &ctx) .await diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index 7b55df52a5..b16a88eaa4 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -14,6 +14,9 @@ //! len < 128: 0XXXXXXX //! len >= 128: 1CCCXXXX XXXXXXXX XXXXXXXX XXXXXXXX //! 
+use std::cmp::min; +use std::io::{Error, ErrorKind}; + use async_compression::Level; use bytes::{BufMut, BytesMut}; use pageserver_api::models::ImageCompressionAlgorithm; @@ -24,10 +27,8 @@ use tracing::warn; use crate::context::RequestContext; use crate::page_cache::PAGE_SZ; use crate::tenant::block_io::BlockCursor; -use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt}; use crate::virtual_file::VirtualFile; -use std::cmp::min; -use std::io::{Error, ErrorKind}; +use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt}; #[derive(Copy, Clone, Debug)] pub struct CompressionInfo { @@ -414,12 +415,15 @@ impl BlobWriter { #[cfg(test)] pub(crate) mod tests { - use super::*; - use crate::{context::DownloadBehavior, task_mgr::TaskKind, tenant::block_io::BlockReaderRef}; use camino::Utf8PathBuf; use camino_tempfile::Utf8TempDir; use rand::{Rng, SeedableRng}; + use super::*; + use crate::context::DownloadBehavior; + use crate::task_mgr::TaskKind; + use crate::tenant::block_io::BlockReaderRef; + async fn round_trip_test(blobs: &[Vec]) -> Result<(), Error> { round_trip_test_compressed::(blobs, false).await } @@ -486,7 +490,7 @@ pub(crate) mod tests { pub(crate) fn random_array(len: usize) -> Vec { let mut rng = rand::thread_rng(); - (0..len).map(|_| rng.gen()).collect::<_>() + (0..len).map(|_| rng.r#gen()).collect::<_>() } #[tokio::test] @@ -544,9 +548,9 @@ pub(crate) mod tests { let mut rng = rand::rngs::StdRng::seed_from_u64(42); let blobs = (0..1024) .map(|_| { - let mut sz: u16 = rng.gen(); + let mut sz: u16 = rng.r#gen(); // Make 50% of the arrays small - if rng.gen() { + if rng.r#gen() { sz &= 63; } random_array(sz.into()) diff --git a/pageserver/src/tenant/block_io.rs b/pageserver/src/tenant/block_io.rs index 990211f80a..66c586daff 100644 --- a/pageserver/src/tenant/block_io.rs +++ b/pageserver/src/tenant/block_io.rs @@ -2,14 +2,16 @@ //! Low-level Block-oriented I/O functions //! +use std::ops::Deref; + +use bytes::Bytes; + use super::storage_layer::delta_layer::{Adapter, DeltaLayerInner}; use crate::context::RequestContext; -use crate::page_cache::{self, FileId, PageReadGuard, PageWriteGuard, ReadBufResult, PAGE_SZ}; +use crate::page_cache::{self, FileId, PAGE_SZ, PageReadGuard, PageWriteGuard, ReadBufResult}; #[cfg(test)] use crate::virtual_file::IoBufferMut; use crate::virtual_file::VirtualFile; -use bytes::Bytes; -use std::ops::Deref; /// This is implemented by anything that can read 8 kB (PAGE_SZ) /// blocks, using the page cache diff --git a/pageserver/src/tenant/checks.rs b/pageserver/src/tenant/checks.rs index f98356242e..d5b979ab2a 100644 --- a/pageserver/src/tenant/checks.rs +++ b/pageserver/src/tenant/checks.rs @@ -63,9 +63,9 @@ pub fn check_valid_layermap(metadata: &[LayerName]) -> Option { && overlaps_with(&layer.key_range, &other_layer.key_range) { let err = format!( - "layer violates the layer map LSN split assumption: layer {} intersects with layer {}", - layer, other_layer - ); + "layer violates the layer map LSN split assumption: layer {} intersects with layer {}", + layer, other_layer + ); return Some(err); } } diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index c6bcfdf2fb..334fb04604 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -8,16 +8,17 @@ //! We cannot use global or default config instead, because wrong settings //! may lead to a data loss. //! 
+use std::num::NonZeroU64; +use std::time::Duration; + pub(crate) use pageserver_api::config::TenantConfigToml as TenantConf; -use pageserver_api::models::CompactionAlgorithmSettings; -use pageserver_api::models::EvictionPolicy; -use pageserver_api::models::{self, TenantConfigPatch}; +use pageserver_api::models::{ + self, CompactionAlgorithmSettings, EvictionPolicy, TenantConfigPatch, +}; use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize}; use serde::de::IntoDeserializer; use serde::{Deserialize, Serialize}; use serde_json::Value; -use std::num::NonZeroU64; -use std::time::Duration; use utils::generation::Generation; use utils::postgres_client::PostgresClientProtocol; @@ -693,16 +694,15 @@ impl TryFrom<&'_ models::TenantConfig> for TenantConfOpt { /// This is a conversion from our internal tenant config object to the one used /// in external APIs. impl From for models::TenantConfig { + // TODO(vlad): These are now the same, but they have different serialization logic. + // Can we merge them? fn from(value: TenantConfOpt) -> Self { - fn humantime(d: Duration) -> String { - format!("{}s", d.as_secs()) - } Self { checkpoint_distance: value.checkpoint_distance, - checkpoint_timeout: value.checkpoint_timeout.map(humantime), + checkpoint_timeout: value.checkpoint_timeout, compaction_algorithm: value.compaction_algorithm, compaction_target_size: value.compaction_target_size, - compaction_period: value.compaction_period.map(humantime), + compaction_period: value.compaction_period, compaction_threshold: value.compaction_threshold, compaction_upper_limit: value.compaction_upper_limit, compaction_l0_first: value.compaction_l0_first, @@ -711,24 +711,23 @@ impl From for models::TenantConfig { l0_flush_stall_threshold: value.l0_flush_stall_threshold, l0_flush_wait_upload: value.l0_flush_wait_upload, gc_horizon: value.gc_horizon, - gc_period: value.gc_period.map(humantime), + gc_period: value.gc_period, image_creation_threshold: value.image_creation_threshold, - pitr_interval: value.pitr_interval.map(humantime), - walreceiver_connect_timeout: value.walreceiver_connect_timeout.map(humantime), - lagging_wal_timeout: value.lagging_wal_timeout.map(humantime), + pitr_interval: value.pitr_interval, + walreceiver_connect_timeout: value.walreceiver_connect_timeout, + lagging_wal_timeout: value.lagging_wal_timeout, max_lsn_wal_lag: value.max_lsn_wal_lag, eviction_policy: value.eviction_policy, min_resident_size_override: value.min_resident_size_override, evictions_low_residence_duration_metric_threshold: value - .evictions_low_residence_duration_metric_threshold - .map(humantime), - heatmap_period: value.heatmap_period.map(humantime), + .evictions_low_residence_duration_metric_threshold, + heatmap_period: value.heatmap_period, lazy_slru_download: value.lazy_slru_download, timeline_get_throttle: value.timeline_get_throttle, image_layer_creation_check_threshold: value.image_layer_creation_check_threshold, image_creation_preempt_threshold: value.image_creation_preempt_threshold, - lsn_lease_length: value.lsn_lease_length.map(humantime), - lsn_lease_length_for_ts: value.lsn_lease_length_for_ts.map(humantime), + lsn_lease_length: value.lsn_lease_length, + lsn_lease_length_for_ts: value.lsn_lease_length_for_ts, timeline_offloading: value.timeline_offloading, wal_receiver_protocol_override: value.wal_receiver_protocol_override, rel_size_v2_enabled: value.rel_size_v2_enabled, @@ -741,9 +740,10 @@ impl From for models::TenantConfig { #[cfg(test)] mod tests { - use super::*; use 
models::TenantConfig; + use super::*; + #[test] fn de_serializing_pageserver_config_omits_empty_values() { let small_conf = TenantConfOpt { @@ -760,29 +760,10 @@ mod tests { assert_eq!(small_conf, serde_json::from_str(&json_form).unwrap()); } - #[test] - fn test_try_from_models_tenant_config_err() { - let tenant_config = models::TenantConfig { - lagging_wal_timeout: Some("5a".to_string()), - ..TenantConfig::default() - }; - - let tenant_conf_opt = TenantConfOpt::try_from(&tenant_config); - - assert!( - tenant_conf_opt.is_err(), - "Suceeded to convert TenantConfig to TenantConfOpt" - ); - - let expected_error_str = - "lagging_wal_timeout: invalid value: string \"5a\", expected a duration"; - assert_eq!(tenant_conf_opt.unwrap_err().to_string(), expected_error_str); - } - #[test] fn test_try_from_models_tenant_config_success() { let tenant_config = models::TenantConfig { - lagging_wal_timeout: Some("5s".to_string()), + lagging_wal_timeout: Some(Duration::from_secs(5)), ..TenantConfig::default() }; diff --git a/pageserver/src/tenant/disk_btree.rs b/pageserver/src/tenant/disk_btree.rs index bb9df020b5..73c105b34e 100644 --- a/pageserver/src/tenant/disk_btree.rs +++ b/pageserver/src/tenant/disk_btree.rs @@ -18,27 +18,23 @@ //! - An Iterator interface would be more convenient for the callers than the //! 'visit' function //! +use std::cmp::Ordering; +use std::iter::Rev; +use std::ops::{Range, RangeInclusive}; +use std::{io, result}; + use async_stream::try_stream; -use byteorder::{ReadBytesExt, BE}; +use byteorder::{BE, ReadBytesExt}; use bytes::{BufMut, Bytes, BytesMut}; use either::Either; use futures::{Stream, StreamExt}; use hex; -use std::{ - cmp::Ordering, - io, - iter::Rev, - ops::{Range, RangeInclusive}, - result, -}; use thiserror::Error; use tracing::error; -use crate::{ - context::{DownloadBehavior, RequestContext}, - task_mgr::TaskKind, - tenant::block_io::{BlockReader, BlockWriter}, -}; +use crate::context::{DownloadBehavior, RequestContext}; +use crate::task_mgr::TaskKind; +use crate::tenant::block_io::{BlockReader, BlockWriter}; // The maximum size of a value stored in the B-tree. 5 bytes is enough currently. pub const VALUE_SZ: usize = 5; @@ -833,12 +829,14 @@ impl BuildNode { #[cfg(test)] pub(crate) mod tests { - use super::*; - use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReaderRef}; - use rand::Rng; use std::collections::BTreeMap; use std::sync::atomic::{AtomicUsize, Ordering}; + use rand::Rng; + + use super::*; + use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReaderRef}; + #[derive(Clone, Default)] pub(crate) struct TestDisk { blocks: Vec, @@ -1115,7 +1113,7 @@ pub(crate) mod tests { // Test get() operations on random keys, most of which will not exist for _ in 0..100000 { - let key_int = rand::thread_rng().gen::(); + let key_int = rand::thread_rng().r#gen::(); let search_key = u128::to_be_bytes(key_int); assert!(reader.get(&search_key, &ctx).await? == all_data.get(&key_int).cloned()); } diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index ba79672bc7..cb25fa6185 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -1,6 +1,17 @@ //! Implementation of append-only file data structure //! used to keep in-memory layers spilled on disk. 
+use std::io; +use std::sync::Arc; +use std::sync::atomic::AtomicU64; + +use camino::Utf8PathBuf; +use num_traits::Num; +use pageserver_api::shard::TenantShardId; +use tokio_epoll_uring::{BoundedBuf, Slice}; +use tracing::error; +use utils::id::TimelineId; + use crate::assert_u64_eq_usize::{U64IsUsize, UsizeIsU64}; use crate::config::PageServerConf; use crate::context::RequestContext; @@ -9,17 +20,7 @@ use crate::tenant::storage_layer::inmemory_layer::vectored_dio_read::File; use crate::virtual_file::owned_buffers_io::io_buf_aligned::IoBufAlignedMut; use crate::virtual_file::owned_buffers_io::slice::SliceMutExt; use crate::virtual_file::owned_buffers_io::write::Buffer; -use crate::virtual_file::{self, owned_buffers_io, IoBufferMut, VirtualFile}; -use camino::Utf8PathBuf; -use num_traits::Num; -use pageserver_api::shard::TenantShardId; -use tokio_epoll_uring::{BoundedBuf, Slice}; -use tracing::error; - -use std::io; -use std::sync::atomic::AtomicU64; -use std::sync::Arc; -use utils::id::TimelineId; +use crate::virtual_file::{self, IoBufferMut, VirtualFile, owned_buffers_io}; pub struct EphemeralFile { _tenant_shard_id: TenantShardId, @@ -319,13 +320,14 @@ pub fn is_ephemeral_file(filename: &str) -> bool { #[cfg(test)] mod tests { + use std::fs; + use std::str::FromStr; + use rand::Rng; use super::*; use crate::context::DownloadBehavior; use crate::task_mgr::TaskKind; - use std::fs; - use std::str::FromStr; fn harness( test_name: &str, diff --git a/pageserver/src/tenant/gc_block.rs b/pageserver/src/tenant/gc_block.rs index af73acb2be..7aa920c953 100644 --- a/pageserver/src/tenant/gc_block.rs +++ b/pageserver/src/tenant/gc_block.rs @@ -1,4 +1,5 @@ -use std::{collections::HashMap, sync::Arc}; +use std::collections::HashMap; +use std::sync::Arc; use utils::id::TimelineId; diff --git a/pageserver/src/tenant/gc_result.rs b/pageserver/src/tenant/gc_result.rs index c805aafeab..7a7d6d19cb 100644 --- a/pageserver/src/tenant/gc_result.rs +++ b/pageserver/src/tenant/gc_result.rs @@ -1,8 +1,9 @@ -use anyhow::Result; -use serde::Serialize; use std::ops::AddAssign; use std::time::Duration; +use anyhow::Result; +use serde::Serialize; + /// /// Result of performing GC /// diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index a69cce932e..59f5a6bd90 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -46,24 +46,24 @@ mod historic_layer_coverage; mod layer_coverage; -use crate::context::RequestContext; -use crate::keyspace::KeyPartitioning; -use crate::tenant::storage_layer::InMemoryLayer; -use anyhow::Result; -use pageserver_api::key::Key; -use pageserver_api::keyspace::{KeySpace, KeySpaceAccum}; -use range_set_blaze::{CheckSortedDisjoint, RangeSetBlaze}; use std::collections::{HashMap, VecDeque}; use std::iter::Peekable; use std::ops::Range; use std::sync::Arc; + +use anyhow::Result; +use historic_layer_coverage::BufferedHistoricLayerCoverage; +pub use historic_layer_coverage::LayerKey; +use pageserver_api::key::Key; +use pageserver_api::keyspace::{KeySpace, KeySpaceAccum}; +use range_set_blaze::{CheckSortedDisjoint, RangeSetBlaze}; use tokio::sync::watch; use utils::lsn::Lsn; -use historic_layer_coverage::BufferedHistoricLayerCoverage; -pub use historic_layer_coverage::LayerKey; - use super::storage_layer::{LayerVisibilityHint, PersistentLayerDesc}; +use crate::context::RequestContext; +use crate::keyspace::KeyPartitioning; +use crate::tenant::storage_layer::InMemoryLayer; /// /// LayerMap tracks what layers exist on a timeline. 
@@ -1066,18 +1066,17 @@ impl LayerMap { #[cfg(test)] mod tests { - use crate::tenant::{storage_layer::LayerName, IndexPart}; - use pageserver_api::{ - key::DBDIR_KEY, - keyspace::{KeySpace, KeySpaceRandomAccum}, - }; - use std::{collections::HashMap, path::PathBuf}; - use utils::{ - id::{TenantId, TimelineId}, - shard::TenantShardId, - }; + use std::collections::HashMap; + use std::path::PathBuf; + + use pageserver_api::key::DBDIR_KEY; + use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum}; + use utils::id::{TenantId, TimelineId}; + use utils::shard::TenantShardId; use super::*; + use crate::tenant::IndexPart; + use crate::tenant::storage_layer::LayerName; #[derive(Clone)] struct LayerDesc { @@ -1417,9 +1416,11 @@ mod tests { assert!(!shadow.ranges.is_empty()); // At least some layers should be marked covered - assert!(layer_visibilities - .iter() - .any(|i| matches!(i.1, LayerVisibilityHint::Covered))); + assert!( + layer_visibilities + .iter() + .any(|i| matches!(i.1, LayerVisibilityHint::Covered)) + ); let layer_visibilities = layer_visibilities.into_iter().collect::>(); diff --git a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs index 136f68bc36..f8bec48886 100644 --- a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs +++ b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs @@ -3,9 +3,8 @@ use std::ops::Range; use tracing::info; -use crate::tenant::storage_layer::PersistentLayerDesc; - use super::layer_coverage::LayerCoverageTuple; +use crate::tenant::storage_layer::PersistentLayerDesc; /// Layers in this module are identified and indexed by this data. /// diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index 15c6955260..77f9a3579d 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -19,8 +19,9 @@ use anyhow::ensure; use serde::{Deserialize, Serialize}; -use utils::bin_ser::SerializeError; -use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn}; +use utils::bin_ser::{BeSer, SerializeError}; +use utils::id::TimelineId; +use utils::lsn::Lsn; /// Use special format number to enable backward compatibility. const METADATA_FORMAT_VERSION: u16 = 4; @@ -345,9 +346,10 @@ impl TimelineMetadata { } pub(crate) mod modern_serde { - use super::{TimelineMetadata, TimelineMetadataBodyV2, TimelineMetadataHeader}; use serde::{Deserialize, Serialize}; + use super::{TimelineMetadata, TimelineMetadataBodyV2, TimelineMetadataHeader}; + pub(crate) fn deserialize<'de, D>(deserializer: D) -> Result where D: serde::de::Deserializer<'de>, diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 22ee560dbf..003f84e640 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -1,34 +1,42 @@ //! This module acts as a switchboard to access different repositories managed by this //! page server. 
-use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf}; -use futures::StreamExt; -use itertools::Itertools; -use pageserver_api::key::Key; -use pageserver_api::models::LocationConfigMode; -use pageserver_api::shard::{ - ShardCount, ShardIdentity, ShardIndex, ShardNumber, ShardStripeSize, TenantShardId, -}; -use pageserver_api::upcall_api::ReAttachResponseTenant; -use rand::{distributions::Alphanumeric, Rng}; -use remote_storage::TimeoutOrCancel; use std::borrow::Cow; use std::cmp::Ordering; use std::collections::{BTreeMap, HashMap, HashSet}; use std::ops::Deref; use std::sync::Arc; use std::time::Duration; -use sysinfo::SystemExt; -use tokio::fs; use anyhow::Context; +use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf}; +use futures::StreamExt; +use itertools::Itertools; use once_cell::sync::Lazy; +use pageserver_api::key::Key; +use pageserver_api::models::LocationConfigMode; +use pageserver_api::shard::{ + ShardCount, ShardIdentity, ShardIndex, ShardNumber, ShardStripeSize, TenantShardId, +}; +use pageserver_api::upcall_api::ReAttachResponseTenant; +use rand::Rng; +use rand::distributions::Alphanumeric; +use remote_storage::TimeoutOrCancel; +use sysinfo::SystemExt; +use tokio::fs; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use tracing::*; - +use utils::crashsafe::path_with_suffix_extension; +use utils::fs_ext::PathExt; +use utils::generation::Generation; +use utils::id::{TenantId, TimelineId}; use utils::{backoff, completion, crashsafe}; +use super::remote_timeline_client::remote_tenant_path; +use super::secondary::SecondaryTenant; +use super::timeline::detach_ancestor::{self, PreparedTimelineDetach}; +use super::{GlobalShutDown, TenantSharedResources}; use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; use crate::controller_upcall_client::{ @@ -37,7 +45,7 @@ use crate::controller_upcall_client::{ use crate::deletion_queue::DeletionQueueClient; use crate::http::routes::ACTIVE_TENANT_TIMEOUT; use crate::metrics::{TENANT, TENANT_MANAGER as METRICS}; -use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME}; +use crate::task_mgr::{BACKGROUND_RUNTIME, TaskKind}; use crate::tenant::config::{ AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, SecondaryLocationConfig, }; @@ -48,16 +56,6 @@ use crate::tenant::{AttachedTenantConf, GcError, LoadConfigError, SpawnMode, Ten use crate::virtual_file::MaybeFatalIo; use crate::{InitializationOrder, TEMP_FILE_SUFFIX}; -use utils::crashsafe::path_with_suffix_extension; -use utils::fs_ext::PathExt; -use utils::generation::Generation; -use utils::id::{TenantId, TimelineId}; - -use super::remote_timeline_client::remote_tenant_path; -use super::secondary::SecondaryTenant; -use super::timeline::detach_ancestor::{self, PreparedTimelineDetach}; -use super::{GlobalShutDown, TenantSharedResources}; - /// For a tenant that appears in TenantsMap, it may either be /// - `Attached`: has a full Tenant object, is elegible to service /// reads and ingest WAL. @@ -140,7 +138,7 @@ impl TenantStartupMode { /// If this returns None, the re-attach struct is in an invalid state and /// should be ignored in the response. 
fn from_reattach_tenant(rart: ReAttachResponseTenant) -> Option { - match (rart.mode, rart.gen) { + match (rart.mode, rart.r#gen) { (LocationConfigMode::Detached, _) => None, (LocationConfigMode::Secondary, _) => Some(Self::Secondary), (LocationConfigMode::AttachedMulti, Some(g)) => { @@ -376,7 +374,7 @@ async fn init_load_generations( TenantStartupMode::Attached((_mode, generation)) => Some(generation), TenantStartupMode::Secondary => None, } - .map(|gen| (*id, *gen)) + .map(|gen_| (*id, *gen_)) }) .collect(); resources.deletion_queue_client.recover(attached_tenants)?; @@ -502,7 +500,9 @@ pub async fn init_tenant_mgr( .total_memory(); let max_ephemeral_layer_bytes = conf.ephemeral_bytes_per_memory_kb as u64 * (system_memory / 1024); - tracing::info!("Initialized ephemeral layer size limit to {max_ephemeral_layer_bytes}, for {system_memory} bytes of memory"); + tracing::info!( + "Initialized ephemeral layer size limit to {max_ephemeral_layer_bytes}, for {system_memory} bytes of memory" + ); inmemory_layer::GLOBAL_RESOURCES.max_dirty_bytes.store( max_ephemeral_layer_bytes, std::sync::atomic::Ordering::Relaxed, @@ -700,10 +700,11 @@ fn tenant_spawn( // to avoid impacting prod runtime performance. assert!(!crate::is_temporary(tenant_path)); debug_assert!(tenant_path.is_dir()); - debug_assert!(conf - .tenant_location_config_path(&tenant_shard_id) - .try_exists() - .unwrap()); + debug_assert!( + conf.tenant_location_config_path(&tenant_shard_id) + .try_exists() + .unwrap() + ); Tenant::spawn( conf, @@ -791,7 +792,9 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock) { (total_in_progress, total_attached) } TenantsMap::ShuttingDown(_) => { - error!("already shutting down, this function isn't supposed to be called more than once"); + error!( + "already shutting down, this function isn't supposed to be called more than once" + ); return; } } @@ -1016,9 +1019,9 @@ impl TenantManager { Ok(Ok(_)) => return Ok(Some(tenant)), Err(_) => { tracing::warn!( - timeout_ms = flush_timeout.as_millis(), - "Timed out waiting for flush to remote storage, proceeding anyway." - ) + timeout_ms = flush_timeout.as_millis(), + "Timed out waiting for flush to remote storage, proceeding anyway." + ) } } } @@ -1194,7 +1197,9 @@ impl TenantManager { } TenantSlot::Attached(tenant) => { let (_guard, progress) = utils::completion::channel(); - info!("Shutting down just-spawned tenant, because tenant manager is shut down"); + info!( + "Shutting down just-spawned tenant, because tenant manager is shut down" + ); match tenant.shutdown(progress, ShutdownMode::Hard).await { Ok(()) => { info!("Finished shutting down just-spawned tenant"); @@ -1784,7 +1789,7 @@ impl TenantManager { _ => { return Err(anyhow::anyhow!(e).context(format!( "Hard linking {relative_layer} into {child_prefix}" - ))) + ))); } } } @@ -2025,8 +2030,8 @@ impl TenantManager { .wait_to_become_active(std::time::Duration::from_secs(9999)) .await .map_err(|e| { - use pageserver_api::models::TenantState; use GetActiveTenantError::{Cancelled, WillNotBecomeActive}; + use pageserver_api::models::TenantState; match e { Cancelled | WillNotBecomeActive(TenantState::Stopping { .. 
}) => { Error::ShuttingDown @@ -2089,7 +2094,7 @@ impl TenantManager { match selector { ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => { - return ShardResolveResult::Found(tenant.clone()) + return ShardResolveResult::Found(tenant.clone()); } ShardSelector::Page(key) => { // First slot we see for this tenant, calculate the expected shard number @@ -2486,7 +2491,7 @@ impl SlotGuard { TenantsMap::Initializing => { return Err(TenantSlotUpsertError::MapState( TenantMapError::StillInitializing, - )) + )); } TenantsMap::ShuttingDown(_) => { return Err(TenantSlotUpsertError::ShuttingDown(( @@ -2815,21 +2820,22 @@ where } } -use { - crate::tenant::gc_result::GcResult, http_utils::error::ApiError, - pageserver_api::models::TimelineGcRequest, -}; +use http_utils::error::ApiError; +use pageserver_api::models::TimelineGcRequest; + +use crate::tenant::gc_result::GcResult; #[cfg(test)] mod tests { use std::collections::BTreeMap; use std::sync::Arc; + use tracing::Instrument; + use super::super::harness::TenantHarness; + use super::TenantsMap; use crate::tenant::mgr::TenantSlot; - use super::{super::harness::TenantHarness, TenantsMap}; - #[tokio::test(start_paused = true)] async fn shutdown_awaits_in_progress_tenant() { // Test that if an InProgress tenant is in the map during shutdown, the shutdown will gracefully diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 713efbb9a4..4ba5844fea 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -179,77 +179,64 @@ pub mod index; pub mod manifest; pub(crate) mod upload; -use anyhow::Context; -use camino::Utf8Path; -use chrono::{NaiveDateTime, Utc}; - -pub(crate) use download::download_initdb_tar_zst; -use pageserver_api::models::TimelineArchivalState; -use pageserver_api::shard::{ShardIndex, TenantShardId}; -use regex::Regex; -use scopeguard::ScopeGuard; -use tokio_util::sync::CancellationToken; -use utils::backoff::{ - self, exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, -}; -use utils::pausable_failpoint; -use utils::shard::ShardNumber; - use std::collections::{HashMap, HashSet, VecDeque}; +use std::ops::DerefMut; use std::sync::atomic::{AtomicU32, Ordering}; use std::sync::{Arc, Mutex, OnceLock}; use std::time::Duration; +use anyhow::Context; +use camino::Utf8Path; +use chrono::{NaiveDateTime, Utc}; +pub(crate) use download::{ + download_index_part, download_initdb_tar_zst, download_tenant_manifest, is_temp_download_file, + list_remote_tenant_shards, list_remote_timelines, +}; +use index::GcCompactionState; +pub(crate) use index::LayerFileMetadata; +use pageserver_api::models::TimelineArchivalState; +use pageserver_api::shard::{ShardIndex, TenantShardId}; +use regex::Regex; use remote_storage::{ DownloadError, GenericRemoteStorage, ListingMode, RemotePath, TimeoutOrCancel, }; -use std::ops::DerefMut; -use tracing::{debug, error, info, instrument, warn}; -use tracing::{info_span, Instrument}; -use utils::lsn::Lsn; - -use crate::context::RequestContext; -use crate::deletion_queue::{DeletionQueueClient, DeletionQueueError}; -use crate::metrics::{ - MeasureRemoteOp, RemoteOpFileKind, RemoteOpKind, RemoteTimelineClientMetrics, - RemoteTimelineClientMetricsCallTrackSize, REMOTE_ONDEMAND_DOWNLOADED_BYTES, - REMOTE_ONDEMAND_DOWNLOADED_LAYERS, +use scopeguard::ScopeGuard; +use tokio_util::sync::CancellationToken; +use tracing::{Instrument, debug, error, info, info_span, instrument, warn}; 
+pub(crate) use upload::upload_initdb_dir; +use utils::backoff::{ + self, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, exponential_backoff, }; -use crate::task_mgr::shutdown_token; -use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id; -use crate::tenant::remote_timeline_client::download::download_retry; -use crate::tenant::storage_layer::AsLayerDesc; -use crate::tenant::upload_queue::{Delete, OpType, UploadQueueStoppedDeletable}; -use crate::tenant::TIMELINES_SEGMENT_NAME; -use crate::{ - config::PageServerConf, - task_mgr, - task_mgr::TaskKind, - task_mgr::BACKGROUND_RUNTIME, - tenant::metadata::TimelineMetadata, - tenant::upload_queue::{ - UploadOp, UploadQueue, UploadQueueInitialized, UploadQueueStopped, UploadTask, - }, - TENANT_HEATMAP_BASENAME, -}; - use utils::id::{TenantId, TimelineId}; +use utils::lsn::Lsn; +use utils::pausable_failpoint; +use utils::shard::ShardNumber; use self::index::IndexPart; - use super::config::AttachedLocationConfig; use super::metadata::MetadataUpdate; use super::storage_layer::{Layer, LayerName, ResidentLayer}; use super::timeline::import_pgdata; use super::upload_queue::{NotInitialized, SetDeletedFlagProgress}; use super::{DeleteTimelineError, Generation}; - -pub(crate) use download::{ - download_index_part, download_tenant_manifest, is_temp_download_file, - list_remote_tenant_shards, list_remote_timelines, +use crate::config::PageServerConf; +use crate::context::RequestContext; +use crate::deletion_queue::{DeletionQueueClient, DeletionQueueError}; +use crate::metrics::{ + MeasureRemoteOp, REMOTE_ONDEMAND_DOWNLOADED_BYTES, REMOTE_ONDEMAND_DOWNLOADED_LAYERS, + RemoteOpFileKind, RemoteOpKind, RemoteTimelineClientMetrics, + RemoteTimelineClientMetricsCallTrackSize, }; -pub(crate) use index::LayerFileMetadata; -pub(crate) use upload::upload_initdb_dir; +use crate::task_mgr::{BACKGROUND_RUNTIME, TaskKind, shutdown_token}; +use crate::tenant::metadata::TimelineMetadata; +use crate::tenant::remote_timeline_client::download::download_retry; +use crate::tenant::storage_layer::AsLayerDesc; +use crate::tenant::upload_queue::{ + Delete, OpType, UploadOp, UploadQueue, UploadQueueInitialized, UploadQueueStopped, + UploadQueueStoppedDeletable, UploadTask, +}; +use crate::tenant::{TIMELINES_SEGMENT_NAME, debug_assert_current_span_has_tenant_and_timeline_id}; +use crate::{TENANT_HEATMAP_BASENAME, task_mgr}; // Occasional network issues and such can cause remote operations to fail, and // that's expected. If a download fails, we log it at info-level, and retry. @@ -913,6 +900,18 @@ impl RemoteTimelineClient { Ok(()) } + /// Launch an index-file upload operation in the background, setting the `gc_compaction` field. + pub(crate) fn schedule_index_upload_for_gc_compaction_state_update( + self: &Arc, + gc_compaction_state: GcCompactionState, + ) -> anyhow::Result<()> { + let mut guard = self.upload_queue.lock().unwrap(); + let upload_queue = guard.initialized_mut()?; + upload_queue.dirty.gc_compaction = Some(gc_compaction_state); + self.schedule_index_upload(upload_queue); + Ok(()) + } + /// /// Launch an index-file upload operation in the background, if necessary. 
/// @@ -1078,7 +1077,11 @@ impl RemoteTimelineClient { if !wanted(x) && wanted(y) { // this could be avoided by having external in-memory synchronization, like // timeline detach ancestor - warn!(?reason, op="insert", "unexpected: two racing processes to enable and disable a gc blocking reason"); + warn!( + ?reason, + op = "insert", + "unexpected: two racing processes to enable and disable a gc blocking reason" + ); } // at this point, the metadata must always show that there is a parent @@ -1132,7 +1135,11 @@ impl RemoteTimelineClient { (x, y) if wanted(x) && !wanted(y) => Some(self.schedule_barrier0(upload_queue)), (x, y) => { if !wanted(x) && wanted(y) { - warn!(?reason, op="remove", "unexpected: two racing processes to enable and disable a gc blocking reason (remove)"); + warn!( + ?reason, + op = "remove", + "unexpected: two racing processes to enable and disable a gc blocking reason (remove)" + ); } upload_queue.dirty.gc_blocking = @@ -1274,12 +1281,14 @@ impl RemoteTimelineClient { #[cfg(feature = "testing")] for (name, metadata) in &with_metadata { - let gen = metadata.generation; - if let Some(unexpected) = upload_queue.dangling_files.insert(name.to_owned(), gen) { - if unexpected == gen { + let gen_ = metadata.generation; + if let Some(unexpected) = upload_queue.dangling_files.insert(name.to_owned(), gen_) { + if unexpected == gen_ { tracing::error!("{name} was unlinked twice with same generation"); } else { - tracing::error!("{name} was unlinked twice with different generations {gen:?} and {unexpected:?}"); + tracing::error!( + "{name} was unlinked twice with different generations {gen_:?} and {unexpected:?}" + ); } } } @@ -1341,11 +1350,11 @@ impl RemoteTimelineClient { #[cfg(feature = "testing")] for (name, meta) in &with_metadata { - let gen = meta.generation; + let gen_ = meta.generation; match upload_queue.dangling_files.remove(name) { - Some(same) if same == gen => { /* expected */ } + Some(same) if same == gen_ => { /* expected */ } Some(other) => { - tracing::error!("{name} was unlinked with {other:?} but deleted with {gen:?}"); + tracing::error!("{name} was unlinked with {other:?} but deleted with {gen_:?}"); } None => { tracing::error!("{name} was unlinked but was not dangling"); @@ -1442,7 +1451,9 @@ impl RemoteTimelineClient { // proper stop is yet to be called. On cancel the original or some later task must call // `stop` or `shutdown`. 
let sg = scopeguard::guard((), |_| { - tracing::error!("RemoteTimelineClient::shutdown was cancelled; this should not happen, do not make this into an allowed_error") + tracing::error!( + "RemoteTimelineClient::shutdown was cancelled; this should not happen, do not make this into an allowed_error" + ) }); let fut = { @@ -1458,7 +1469,7 @@ impl RemoteTimelineClient { scopeguard::ScopeGuard::into_inner(sg); return; } - UploadQueue::Initialized(ref mut init) => init, + UploadQueue::Initialized(init) => init, }; // if the queue is already stuck due to a shutdown operation which was cancelled, then @@ -1818,7 +1829,9 @@ impl RemoteTimelineClient { .map(|n| n.starts_with(IndexPart::FILE_NAME)) .unwrap_or(false) }) - .filter_map(|o| parse_remote_index_path(o.key.clone()).map(|gen| (o.key.clone(), gen))) + .filter_map(|o| { + parse_remote_index_path(o.key.clone()).map(|gen_| (o.key.clone(), gen_)) + }) .max_by_key(|i| i.1) .map(|i| i.0.clone()) .unwrap_or( @@ -2010,7 +2023,7 @@ impl RemoteTimelineClient { } let upload_result: anyhow::Result<()> = match &task.op { - UploadOp::UploadLayer(ref layer, ref layer_metadata, mode) => { + UploadOp::UploadLayer(layer, layer_metadata, mode) => { // TODO: check if this mechanism can be removed now that can_bypass() performs // conflict checks during scheduling. if let Some(OpType::FlushDeletion) = mode { @@ -2100,7 +2113,7 @@ impl RemoteTimelineClient { ) .await } - UploadOp::UploadMetadata { ref uploaded } => { + UploadOp::UploadMetadata { uploaded } => { let res = upload::upload_index_part( &self.storage_impl, &self.tenant_shard_id, @@ -2216,11 +2229,11 @@ impl RemoteTimelineClient { let lsn_update = { let mut upload_queue_guard = self.upload_queue.lock().unwrap(); let upload_queue = match upload_queue_guard.deref_mut() { - UploadQueue::Uninitialized => panic!("callers are responsible for ensuring this is only called on an initialized queue"), - UploadQueue::Stopped(_stopped) => { - None - }, - UploadQueue::Initialized(qi) => { Some(qi) } + UploadQueue::Uninitialized => panic!( + "callers are responsible for ensuring this is only called on an initialized queue" + ), + UploadQueue::Stopped(_stopped) => None, + UploadQueue::Initialized(qi) => Some(qi), }; let upload_queue = match upload_queue { @@ -2242,7 +2255,11 @@ impl RemoteTimelineClient { let is_later = last_updater.is_some_and(|task_id| task_id < task.task_id); let monotone = is_later || last_updater.is_none(); - assert!(monotone, "no two index uploads should be completing at the same time, prev={last_updater:?}, task.task_id={}", task.task_id); + assert!( + monotone, + "no two index uploads should be completing at the same time, prev={last_updater:?}, task.task_id={}", + task.task_id + ); // not taking ownership is wasteful upload_queue.clean.0.clone_from(uploaded); @@ -2641,20 +2658,16 @@ pub fn parse_remote_tenant_manifest_path(path: RemotePath) -> Option #[cfg(test)] mod tests { - use super::*; - use crate::{ - context::RequestContext, - tenant::{ - config::AttachmentMode, - harness::{TenantHarness, TIMELINE_ID}, - storage_layer::layer::local_layer_path, - Tenant, Timeline, - }, - DEFAULT_PG_VERSION, - }; - use std::collections::HashSet; + use super::*; + use crate::DEFAULT_PG_VERSION; + use crate::context::RequestContext; + use crate::tenant::config::AttachmentMode; + use crate::tenant::harness::{TIMELINE_ID, TenantHarness}; + use crate::tenant::storage_layer::layer::local_layer_path; + use crate::tenant::{Tenant, Timeline}; + pub(super) fn dummy_contents(name: &str) -> Vec { format!("contents for 
{name}").into() } diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index b4d45dca75..92be2145ce 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -8,41 +8,39 @@ use std::future::Future; use std::str::FromStr; use std::time::SystemTime; -use anyhow::{anyhow, Context}; +use anyhow::{Context, anyhow}; use camino::{Utf8Path, Utf8PathBuf}; use pageserver_api::shard::TenantShardId; +use remote_storage::{ + DownloadError, DownloadKind, DownloadOpts, GenericRemoteStorage, ListingMode, RemotePath, +}; use tokio::fs::{self, File, OpenOptions}; use tokio::io::{AsyncSeekExt, AsyncWriteExt}; use tokio_util::io::StreamReader; use tokio_util::sync::CancellationToken; use tracing::warn; -use utils::backoff; +use utils::crashsafe::path_with_suffix_extension; +use utils::id::{TenantId, TimelineId}; +use utils::{backoff, pausable_failpoint}; +use super::index::{IndexPart, LayerFileMetadata}; +use super::manifest::TenantManifest; +use super::{ + FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, INITDB_PATH, parse_remote_index_path, + parse_remote_tenant_manifest_path, remote_index_path, remote_initdb_archive_path, + remote_initdb_preserved_archive_path, remote_tenant_manifest_path, + remote_tenant_manifest_prefix, remote_tenant_path, +}; +use crate::TEMP_FILE_SUFFIX; use crate::config::PageServerConf; use crate::context::RequestContext; use crate::span::{ debug_assert_current_span_has_tenant_and_timeline_id, debug_assert_current_span_has_tenant_id, }; +use crate::tenant::Generation; use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path}; use crate::tenant::storage_layer::LayerName; -use crate::tenant::Generation; -use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile}; -use crate::TEMP_FILE_SUFFIX; -use remote_storage::{ - DownloadError, DownloadKind, DownloadOpts, GenericRemoteStorage, ListingMode, RemotePath, -}; -use utils::crashsafe::path_with_suffix_extension; -use utils::id::{TenantId, TimelineId}; -use utils::pausable_failpoint; - -use super::index::{IndexPart, LayerFileMetadata}; -use super::manifest::TenantManifest; -use super::{ - parse_remote_index_path, parse_remote_tenant_manifest_path, remote_index_path, - remote_initdb_archive_path, remote_initdb_preserved_archive_path, remote_tenant_manifest_path, - remote_tenant_manifest_prefix, remote_tenant_path, FAILED_DOWNLOAD_WARN_THRESHOLD, - FAILED_REMOTE_OP_RETRIES, INITDB_PATH, -}; +use crate::virtual_file::{MaybeFatalIo, VirtualFile, on_fatal_io_error}; /// /// If 'metadata' is given, we will validate that the downloaded file's size matches that @@ -207,9 +205,9 @@ async fn download_object( } #[cfg(target_os = "linux")] crate::virtual_file::io_engine::IoEngine::TokioEpollUring => { - use crate::virtual_file::owned_buffers_io; - use crate::virtual_file::IoBufferMut; use std::sync::Arc; + + use crate::virtual_file::{IoBufferMut, owned_buffers_io}; async { let destination_file = Arc::new( VirtualFile::create(dst_path, ctx) diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index b8b18005fd..ceaed58bbd 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -7,16 +7,16 @@ use std::collections::HashMap; use chrono::NaiveDateTime; use pageserver_api::models::AuxFilePolicy; +use 
pageserver_api::shard::ShardIndex; use serde::{Deserialize, Serialize}; +use utils::id::TimelineId; +use utils::lsn::Lsn; use super::is_same_remote_layer_path; +use crate::tenant::Generation; use crate::tenant::metadata::TimelineMetadata; use crate::tenant::storage_layer::LayerName; use crate::tenant::timeline::import_pgdata; -use crate::tenant::Generation; -use pageserver_api::shard::ShardIndex; -use utils::id::TimelineId; -use utils::lsn::Lsn; /// In-memory representation of an `index_part.json` file /// @@ -85,9 +85,36 @@ pub struct IndexPart { #[serde(skip_serializing_if = "Option::is_none", default)] pub(crate) rel_size_migration: Option, - /// The LSN of gc-compaction horizon. Once gc-compaction is finished for all layer files below an LSN, this LSN will be updated. + /// Not used anymore -- kept here for backwards compatibility. Merged into the `gc_compaction` field. #[serde(skip_serializing_if = "Option::is_none", default)] - pub(crate) l2_lsn: Option, + l2_lsn: Option, + + /// State for the garbage-collecting compaction pass. + /// + /// Garbage-collecting compaction (gc-compaction) prunes `Value`s that are outside + /// the PITR window and not needed by child timelines. + /// + /// A commonly used synonym for this compaction pass is + /// "bottommost-compaction" because the affected LSN range + /// is the "bottom" of the (key,lsn) map. + /// + /// Gc-compaction is quite an expensive operation; that's why we use + /// a trigger condition. + /// This field holds the state pertaining to that trigger condition + /// and (in future) to the progress of the gc-compaction, so that it's + /// resumable across restarts & migrations. + /// + /// Note that the underlying algorithm is _also_ called `gc-compaction` + /// in most places & design docs; but in fact it is more flexible than + /// just the specific use case here; it needs a new name. + #[serde(skip_serializing_if = "Option::is_none", default)] + pub(crate) gc_compaction: Option, +} + +#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] +pub struct GcCompactionState { + /// The upper bound of the last completed garbage-collecting compaction, aka. L2 LSN. + pub(crate) last_completed_lsn: Lsn, } #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] @@ -123,10 +150,11 @@ impl IndexPart { /// - 10: +import_pgdata /// - 11: +rel_size_migration /// - 12: +l2_lsn - const LATEST_VERSION: usize = 12; + /// - 13: +gc_compaction + const LATEST_VERSION: usize = 13; // Versions we may see when reading from a bucket. 
- pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]; + pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]; pub const FILE_NAME: &'static str = "index_part.json"; @@ -144,6 +172,7 @@ impl IndexPart { import_pgdata: None, rel_size_migration: None, l2_lsn: None, + gc_compaction: None, } } @@ -406,10 +435,12 @@ impl GcBlocking { #[cfg(test)] mod tests { - use super::*; use std::str::FromStr; + use utils::id::TimelineId; + use super::*; + #[test] fn v1_indexpart_is_parsed() { let example = r#"{ @@ -450,6 +481,7 @@ mod tests { import_pgdata: None, rel_size_migration: None, l2_lsn: None, + gc_compaction: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -497,6 +529,7 @@ mod tests { import_pgdata: None, rel_size_migration: None, l2_lsn: None, + gc_compaction: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -545,6 +578,7 @@ mod tests { import_pgdata: None, rel_size_migration: None, l2_lsn: None, + gc_compaction: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -596,6 +630,7 @@ mod tests { import_pgdata: None, rel_size_migration: None, l2_lsn: None, + gc_compaction: None, }; let empty_layers_parsed = IndexPart::from_json_bytes(empty_layers_json.as_bytes()).unwrap(); @@ -642,6 +677,7 @@ mod tests { import_pgdata: None, rel_size_migration: None, l2_lsn: None, + gc_compaction: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -691,6 +727,7 @@ mod tests { import_pgdata: None, rel_size_migration: None, l2_lsn: None, + gc_compaction: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -745,6 +782,7 @@ mod tests { import_pgdata: None, rel_size_migration: None, l2_lsn: None, + gc_compaction: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -804,6 +842,7 @@ mod tests { import_pgdata: None, rel_size_migration: None, l2_lsn: None, + gc_compaction: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -864,6 +903,7 @@ mod tests { import_pgdata: None, rel_size_migration: None, l2_lsn: None, + gc_compaction: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -929,6 +969,7 @@ mod tests { import_pgdata: None, rel_size_migration: None, l2_lsn: None, + gc_compaction: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -1007,6 +1048,7 @@ mod tests { }))), rel_size_migration: None, l2_lsn: None, + gc_compaction: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -1086,6 +1128,7 @@ mod tests { }))), rel_size_migration: Some(RelSizeMigration::Legacy), l2_lsn: None, + gc_compaction: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -1093,7 +1136,7 @@ mod tests { } #[test] - fn v12_l2_lsn_is_parsed() { + fn v12_v13_l2_gc_ompaction_is_parsed() { let example = r#"{ "version": 12, "layer_metadata":{ @@ -1124,7 +1167,10 @@ mod tests { } }, "rel_size_migration": "legacy", - "l2_lsn": "0/16960E8" + "l2_lsn": "0/16960E8", + "gc_compaction": { + "last_completed_lsn": "0/16960E8" + } }"#; let expected = IndexPart { @@ -1166,6 +1212,9 @@ mod tests { }))), rel_size_migration: Some(RelSizeMigration::Legacy), l2_lsn: Some("0/16960E8".parse::().unwrap()), + gc_compaction: Some(GcCompactionState { + last_completed_lsn: "0/16960E8".parse::().unwrap(), + }), }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); diff --git 
a/pageserver/src/tenant/remote_timeline_client/manifest.rs b/pageserver/src/tenant/remote_timeline_client/manifest.rs index 2029847a12..543ccc219d 100644 --- a/pageserver/src/tenant/remote_timeline_client/manifest.rs +++ b/pageserver/src/tenant/remote_timeline_client/manifest.rs @@ -1,6 +1,7 @@ use chrono::NaiveDateTime; use serde::{Deserialize, Serialize}; -use utils::{id::TimelineId, lsn::Lsn}; +use utils::id::TimelineId; +use utils::lsn::Lsn; /// Tenant-shard scoped manifest #[derive(Clone, Serialize, Deserialize, PartialEq, Eq)] diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs index af4dbbbfb6..7d9f47665a 100644 --- a/pageserver/src/tenant/remote_timeline_client/upload.rs +++ b/pageserver/src/tenant/remote_timeline_client/upload.rs @@ -1,28 +1,28 @@ //! Helper functions to upload files to remote storage with a RemoteStorage -use anyhow::{bail, Context}; +use std::io::{ErrorKind, SeekFrom}; +use std::time::SystemTime; + +use anyhow::{Context, bail}; use bytes::Bytes; use camino::Utf8Path; use fail::fail_point; use pageserver_api::shard::TenantShardId; -use std::io::{ErrorKind, SeekFrom}; -use std::time::SystemTime; +use remote_storage::{GenericRemoteStorage, RemotePath, TimeTravelError}; use tokio::fs::{self, File}; use tokio::io::AsyncSeekExt; use tokio_util::sync::CancellationToken; +use tracing::info; +use utils::id::{TenantId, TimelineId}; use utils::{backoff, pausable_failpoint}; +use super::Generation; use super::index::IndexPart; use super::manifest::TenantManifest; -use super::Generation; use crate::tenant::remote_timeline_client::{ remote_index_path, remote_initdb_archive_path, remote_initdb_preserved_archive_path, remote_tenant_manifest_path, }; -use remote_storage::{GenericRemoteStorage, RemotePath, TimeTravelError}; -use utils::id::{TenantId, TimelineId}; - -use tracing::info; /// Serializes and uploads the given index part data to the remote storage. 
pub(crate) async fn upload_index_part( @@ -134,7 +134,9 @@ pub(super) async fn upload_timeline_layer<'a>( .len(); if metadata_size != fs_size { - bail!("File {local_path:?} has its current FS size {fs_size} diferent from initially determined {metadata_size}"); + bail!( + "File {local_path:?} has its current FS size {fs_size} diferent from initially determined {metadata_size}" + ); } let fs_size = usize::try_from(fs_size) diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs index 4bc208331b..8f8622c796 100644 --- a/pageserver/src/tenant/secondary.rs +++ b/pageserver/src/tenant/secondary.rs @@ -3,40 +3,31 @@ pub mod heatmap; mod heatmap_uploader; mod scheduler; -use std::{sync::Arc, time::SystemTime}; +use std::sync::Arc; +use std::time::SystemTime; -use crate::{ - context::RequestContext, - disk_usage_eviction_task::DiskUsageEvictionInfo, - metrics::SECONDARY_HEATMAP_TOTAL_SIZE, - task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}, -}; - -use self::{ - downloader::{downloader_task, SecondaryDetail}, - heatmap_uploader::heatmap_uploader_task, -}; - -use super::{ - config::{SecondaryLocationConfig, TenantConfOpt}, - mgr::TenantManager, - span::debug_assert_current_span_has_tenant_id, - storage_layer::LayerName, - GetTenantError, -}; - -use crate::metrics::SECONDARY_RESIDENT_PHYSICAL_SIZE; use metrics::UIntGauge; -use pageserver_api::{ - models, - shard::{ShardIdentity, TenantShardId}, -}; +use pageserver_api::models; +use pageserver_api::shard::{ShardIdentity, TenantShardId}; use remote_storage::GenericRemoteStorage; - use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; use tracing::instrument; -use utils::{completion::Barrier, id::TimelineId, sync::gate::Gate}; +use utils::completion::Barrier; +use utils::id::TimelineId; +use utils::sync::gate::Gate; + +use self::downloader::{SecondaryDetail, downloader_task}; +use self::heatmap_uploader::heatmap_uploader_task; +use super::GetTenantError; +use super::config::{SecondaryLocationConfig, TenantConfOpt}; +use super::mgr::TenantManager; +use super::span::debug_assert_current_span_has_tenant_id; +use super::storage_layer::LayerName; +use crate::context::RequestContext; +use crate::disk_usage_eviction_task::DiskUsageEvictionInfo; +use crate::metrics::{SECONDARY_HEATMAP_TOTAL_SIZE, SECONDARY_RESIDENT_PHYSICAL_SIZE}; +use crate::task_mgr::{self, BACKGROUND_RUNTIME, TaskKind}; enum DownloadCommand { Download(TenantShardId), diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 2e8c3946bd..a13b9323ac 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -1,47 +1,8 @@ -use std::{ - collections::{HashMap, HashSet}, - pin::Pin, - str::FromStr, - sync::Arc, - time::{Duration, Instant, SystemTime}, -}; - -use crate::{ - config::PageServerConf, - context::RequestContext, - disk_usage_eviction_task::{ - finite_f32, DiskUsageEvictionInfo, EvictionCandidate, EvictionLayer, EvictionSecondaryLayer, - }, - metrics::SECONDARY_MODE, - tenant::{ - config::SecondaryLocationConfig, - debug_assert_current_span_has_tenant_and_timeline_id, - ephemeral_file::is_ephemeral_file, - remote_timeline_client::{ - index::LayerFileMetadata, is_temp_download_file, FAILED_DOWNLOAD_WARN_THRESHOLD, - FAILED_REMOTE_OP_RETRIES, - }, - span::debug_assert_current_span_has_tenant_id, - storage_layer::{layer::local_layer_path, LayerName, LayerVisibilityHint}, - tasks::{warn_when_period_overrun, BackgroundLoopKind}, - }, - 
virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile}, - TEMP_FILE_SUFFIX, -}; - -use super::{ - heatmap::HeatMapLayer, - scheduler::{ - self, period_jitter, period_warmup, Completion, JobGenerator, SchedulingResult, - TenantBackgroundJobs, - }, - GetTenantError, SecondaryTenant, SecondaryTenantError, -}; - -use crate::tenant::{ - mgr::TenantManager, - remote_timeline_client::{download::download_layer_file, remote_heatmap_path}, -}; +use std::collections::{HashMap, HashSet}; +use std::pin::Pin; +use std::str::FromStr; +use std::sync::Arc; +use std::time::{Duration, Instant, SystemTime}; use camino::Utf8PathBuf; use chrono::format::{DelayedFormat, StrftimeItems}; @@ -50,18 +11,43 @@ use metrics::UIntGauge; use pageserver_api::models::SecondaryProgress; use pageserver_api::shard::TenantShardId; use remote_storage::{DownloadError, DownloadKind, DownloadOpts, Etag, GenericRemoteStorage}; - use tokio_util::sync::CancellationToken; -use tracing::{info_span, instrument, warn, Instrument}; -use utils::{ - backoff, completion::Barrier, crashsafe::path_with_suffix_extension, failpoint_support, fs_ext, - id::TimelineId, pausable_failpoint, serde_system_time, -}; +use tracing::{Instrument, info_span, instrument, warn}; +use utils::completion::Barrier; +use utils::crashsafe::path_with_suffix_extension; +use utils::id::TimelineId; +use utils::{backoff, failpoint_support, fs_ext, pausable_failpoint, serde_system_time}; -use super::{ - heatmap::{HeatMapTenant, HeatMapTimeline}, - CommandRequest, DownloadCommand, +use super::heatmap::{HeatMapLayer, HeatMapTenant, HeatMapTimeline}; +use super::scheduler::{ + self, Completion, JobGenerator, SchedulingResult, TenantBackgroundJobs, period_jitter, + period_warmup, }; +use super::{ + CommandRequest, DownloadCommand, GetTenantError, SecondaryTenant, SecondaryTenantError, +}; +use crate::TEMP_FILE_SUFFIX; +use crate::config::PageServerConf; +use crate::context::RequestContext; +use crate::disk_usage_eviction_task::{ + DiskUsageEvictionInfo, EvictionCandidate, EvictionLayer, EvictionSecondaryLayer, finite_f32, +}; +use crate::metrics::SECONDARY_MODE; +use crate::tenant::config::SecondaryLocationConfig; +use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id; +use crate::tenant::ephemeral_file::is_ephemeral_file; +use crate::tenant::mgr::TenantManager; +use crate::tenant::remote_timeline_client::download::download_layer_file; +use crate::tenant::remote_timeline_client::index::LayerFileMetadata; +use crate::tenant::remote_timeline_client::{ + FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, is_temp_download_file, + remote_heatmap_path, +}; +use crate::tenant::span::debug_assert_current_span_has_tenant_id; +use crate::tenant::storage_layer::layer::local_layer_path; +use crate::tenant::storage_layer::{LayerName, LayerVisibilityHint}; +use crate::tenant::tasks::{BackgroundLoopKind, warn_when_period_overrun}; +use crate::virtual_file::{MaybeFatalIo, VirtualFile, on_fatal_io_error}; /// For each tenant, default period for how long must have passed since the last download_tenant call before /// calling it again. 
This default is replaced with the value of [`HeatMapTenant::upload_period_ms`] after first diff --git a/pageserver/src/tenant/secondary/heatmap.rs b/pageserver/src/tenant/secondary/heatmap.rs index 0fa10ca294..4a938e9095 100644 --- a/pageserver/src/tenant/secondary/heatmap.rs +++ b/pageserver/src/tenant/secondary/heatmap.rs @@ -1,11 +1,13 @@ -use std::{collections::HashMap, time::SystemTime}; - -use crate::tenant::{remote_timeline_client::index::LayerFileMetadata, storage_layer::LayerName}; +use std::collections::HashMap; +use std::time::SystemTime; use serde::{Deserialize, Serialize}; -use serde_with::{serde_as, DisplayFromStr, TimestampSeconds}; +use serde_with::{DisplayFromStr, TimestampSeconds, serde_as}; +use utils::generation::Generation; +use utils::id::TimelineId; -use utils::{generation::Generation, id::TimelineId}; +use crate::tenant::remote_timeline_client::index::LayerFileMetadata; +use crate::tenant::storage_layer::LayerName; #[derive(Serialize, Deserialize)] pub(crate) struct HeatMapTenant { diff --git a/pageserver/src/tenant/secondary/heatmap_uploader.rs b/pageserver/src/tenant/secondary/heatmap_uploader.rs index d72c337369..3375714a66 100644 --- a/pageserver/src/tenant/secondary/heatmap_uploader.rs +++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs @@ -1,42 +1,33 @@ -use std::{ - collections::HashMap, - pin::Pin, - sync::{Arc, Weak}, - time::{Duration, Instant}, -}; - -use crate::{ - metrics::SECONDARY_MODE, - tenant::{ - config::AttachmentMode, - mgr::{GetTenantError, TenantManager}, - remote_timeline_client::remote_heatmap_path, - span::debug_assert_current_span_has_tenant_id, - tasks::{warn_when_period_overrun, BackgroundLoopKind}, - Tenant, - }, - virtual_file::VirtualFile, - TEMP_FILE_SUFFIX, -}; +use std::collections::HashMap; +use std::pin::Pin; +use std::sync::{Arc, Weak}; +use std::time::{Duration, Instant}; use futures::Future; use pageserver_api::shard::TenantShardId; use remote_storage::{GenericRemoteStorage, TimeoutOrCancel}; - -use super::{ - heatmap::HeatMapTenant, - scheduler::{ - self, period_jitter, period_warmup, JobGenerator, RunningJob, SchedulingResult, - TenantBackgroundJobs, - }, - CommandRequest, SecondaryTenantError, UploadCommand, -}; use tokio_util::sync::CancellationToken; -use tracing::{info_span, instrument, Instrument}; -use utils::{ - backoff, completion::Barrier, crashsafe::path_with_suffix_extension, - yielding_loop::yielding_loop, +use tracing::{Instrument, info_span, instrument}; +use utils::backoff; +use utils::completion::Barrier; +use utils::crashsafe::path_with_suffix_extension; +use utils::yielding_loop::yielding_loop; + +use super::heatmap::HeatMapTenant; +use super::scheduler::{ + self, JobGenerator, RunningJob, SchedulingResult, TenantBackgroundJobs, period_jitter, + period_warmup, }; +use super::{CommandRequest, SecondaryTenantError, UploadCommand}; +use crate::TEMP_FILE_SUFFIX; +use crate::metrics::SECONDARY_MODE; +use crate::tenant::Tenant; +use crate::tenant::config::AttachmentMode; +use crate::tenant::mgr::{GetTenantError, TenantManager}; +use crate::tenant::remote_timeline_client::remote_heatmap_path; +use crate::tenant::span::debug_assert_current_span_has_tenant_id; +use crate::tenant::tasks::{BackgroundLoopKind, warn_when_period_overrun}; +use crate::virtual_file::VirtualFile; pub(super) async fn heatmap_uploader_task( tenant_manager: Arc, diff --git a/pageserver/src/tenant/secondary/scheduler.rs b/pageserver/src/tenant/secondary/scheduler.rs index e963c722b9..f948f9114f 100644 --- 
a/pageserver/src/tenant/secondary/scheduler.rs +++ b/pageserver/src/tenant/secondary/scheduler.rs @@ -1,16 +1,15 @@ -use futures::Future; -use rand::Rng; -use std::{ - collections::HashMap, - marker::PhantomData, - pin::Pin, - time::{Duration, Instant}, -}; +use std::collections::HashMap; +use std::marker::PhantomData; +use std::pin::Pin; +use std::time::{Duration, Instant}; +use futures::Future; use pageserver_api::shard::TenantShardId; +use rand::Rng; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; -use utils::{completion::Barrier, yielding_loop::yielding_loop}; +use utils::completion::Barrier; +use utils::yielding_loop::yielding_loop; use super::{CommandRequest, CommandResponse, SecondaryTenantError}; diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index 1e84a9d9dc..ed6b351c75 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -4,21 +4,18 @@ use std::collections::{HashMap, HashSet}; use std::sync::Arc; use tenant_size_model::svg::SvgBranchKind; -use tokio::sync::oneshot::error::RecvError; +use tenant_size_model::{Segment, StorageModel}; use tokio::sync::Semaphore; +use tokio::sync::oneshot::error::RecvError; use tokio_util::sync::CancellationToken; - -use crate::context::RequestContext; -use crate::pgdatadir_mapping::CalculateLogicalSizeError; - -use super::{GcError, LogicalSizeCalculationCause, Tenant}; -use crate::tenant::{MaybeOffloaded, Timeline}; +use tracing::*; use utils::id::TimelineId; use utils::lsn::Lsn; -use tracing::*; - -use tenant_size_model::{Segment, StorageModel}; +use super::{GcError, LogicalSizeCalculationCause, Tenant}; +use crate::context::RequestContext; +use crate::pgdatadir_mapping::CalculateLogicalSizeError; +use crate::tenant::{MaybeOffloaded, Timeline}; /// Inputs to the actual tenant sizing model /// @@ -498,7 +495,9 @@ async fn fill_logical_sizes( } Err(join_error) => { // cannot really do anything, as this panic is likely a bug - error!("task that calls spawn_ondemand_logical_size_calculation panicked: {join_error:#}"); + error!( + "task that calls spawn_ondemand_logical_size_calculation panicked: {join_error:#}" + ); have_any_error = Some(CalculateSyntheticSizeError::Fatal( anyhow::anyhow!(join_error) diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index f9f843ef6b..7f313f46a2 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -10,42 +10,39 @@ mod layer_desc; mod layer_name; pub mod merge_iterator; -use crate::config::PageServerConf; -use crate::context::{AccessStatsBehavior, RequestContext}; -use bytes::Bytes; -use futures::stream::FuturesUnordered; -use futures::StreamExt; -use pageserver_api::key::Key; -use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum}; -use pageserver_api::record::NeonWalRecord; -use pageserver_api::value::Value; use std::cmp::Ordering; use std::collections::hash_map::Entry; use std::collections::{BinaryHeap, HashMap}; use std::future::Future; use std::ops::Range; use std::pin::Pin; -use std::sync::atomic::AtomicUsize; use std::sync::Arc; +use std::sync::atomic::AtomicUsize; use std::time::{Duration, SystemTime, UNIX_EPOCH}; -use tracing::{trace, Instrument}; -use utils::sync::gate::GateGuard; - -use utils::lsn::Lsn; pub use batch_split_writer::{BatchLayerWriter, SplitDeltaLayerWriter, SplitImageLayerWriter}; +use bytes::Bytes; pub use delta_layer::{DeltaLayer, DeltaLayerWriter, ValueRef}; +use futures::StreamExt; +use futures::stream::FuturesUnordered; 
pub use image_layer::{ImageLayer, ImageLayerWriter}; pub use inmemory_layer::InMemoryLayer; +pub(crate) use layer::{EvictionError, Layer, ResidentLayer}; pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey}; pub use layer_name::{DeltaLayerName, ImageLayerName, LayerName}; - -pub(crate) use layer::{EvictionError, Layer, ResidentLayer}; +use pageserver_api::key::Key; +use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum}; +use pageserver_api::record::NeonWalRecord; +use pageserver_api::value::Value; +use tracing::{Instrument, trace}; +use utils::lsn::Lsn; +use utils::sync::gate::GateGuard; use self::inmemory_layer::InMemoryLayerFileId; - -use super::timeline::{GetVectoredError, ReadPath}; use super::PageReconstructError; +use super::timeline::{GetVectoredError, ReadPath}; +use crate::config::PageServerConf; +use crate::context::{AccessStatsBehavior, RequestContext}; pub fn range_overlaps(a: &Range, b: &Range) -> bool where @@ -510,6 +507,7 @@ impl IoConcurrency { #[cfg(test)] pub(crate) fn spawn_for_test() -> impl std::ops::DerefMut { use std::ops::{Deref, DerefMut}; + use tracing::info; use utils::sync::gate::Gate; diff --git a/pageserver/src/tenant/storage_layer/batch_split_writer.rs b/pageserver/src/tenant/storage_layer/batch_split_writer.rs index 7da51c27df..fd50e4805d 100644 --- a/pageserver/src/tenant/storage_layer/batch_split_writer.rs +++ b/pageserver/src/tenant/storage_layer/batch_split_writer.rs @@ -1,17 +1,22 @@ -use std::{future::Future, ops::Range, sync::Arc}; +use std::future::Future; +use std::ops::Range; +use std::sync::Arc; use bytes::Bytes; -use pageserver_api::key::{Key, KEY_SIZE}; -use utils::{id::TimelineId, lsn::Lsn, shard::TenantShardId}; - -use crate::tenant::storage_layer::Layer; -use crate::{config::PageServerConf, context::RequestContext, tenant::Timeline}; +use pageserver_api::key::{KEY_SIZE, Key}; use pageserver_api::value::Value; +use utils::id::TimelineId; +use utils::lsn::Lsn; +use utils::shard::TenantShardId; use super::layer::S3_UPLOAD_LIMIT; use super::{ DeltaLayerWriter, ImageLayerWriter, PersistentLayerDesc, PersistentLayerKey, ResidentLayer, }; +use crate::config::PageServerConf; +use crate::context::RequestContext; +use crate::tenant::Timeline; +use crate::tenant::storage_layer::Layer; pub(crate) enum BatchWriterResult { Produced(ResidentLayer), @@ -423,15 +428,10 @@ mod tests { use itertools::Itertools; use rand::{RngCore, SeedableRng}; - use crate::{ - tenant::{ - harness::{TenantHarness, TIMELINE_ID}, - storage_layer::AsLayerDesc, - }, - DEFAULT_PG_VERSION, - }; - use super::*; + use crate::DEFAULT_PG_VERSION; + use crate::tenant::harness::{TIMELINE_ID, TenantHarness}; + use crate::tenant::storage_layer::AsLayerDesc; fn get_key(id: u32) -> Key { let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap(); diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 885c50425f..d9afdc2405 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -27,6 +27,38 @@ //! "values" part. The actual page images and WAL records are stored in the //! "values" part. //! 
+use std::collections::{HashMap, VecDeque}; +use std::fs::File; +use std::io::SeekFrom; +use std::ops::Range; +use std::os::unix::fs::FileExt; +use std::str::FromStr; +use std::sync::Arc; + +use anyhow::{Context, Result, bail, ensure}; +use camino::{Utf8Path, Utf8PathBuf}; +use futures::StreamExt; +use itertools::Itertools; +use pageserver_api::config::MaxVectoredReadBytes; +use pageserver_api::key::{DBDIR_KEY, KEY_SIZE, Key}; +use pageserver_api::keyspace::KeySpace; +use pageserver_api::models::ImageCompressionAlgorithm; +use pageserver_api::shard::TenantShardId; +use pageserver_api::value::Value; +use rand::Rng; +use rand::distributions::Alphanumeric; +use serde::{Deserialize, Serialize}; +use tokio::sync::OnceCell; +use tokio_epoll_uring::IoBuf; +use tracing::*; +use utils::bin_ser::BeSer; +use utils::id::{TenantId, TimelineId}; +use utils::lsn::Lsn; + +use super::{ + AsLayerDesc, LayerName, OnDiskValue, OnDiskValueIo, PersistentLayerDesc, ResidentLayer, + ValuesReconstructState, +}; use crate::config::PageServerConf; use crate::context::{PageContentKind, RequestContext, RequestContextBuilder}; use crate::page_cache::{self, FileId, PAGE_SZ}; @@ -42,44 +74,8 @@ use crate::tenant::vectored_blob_io::{ VectoredReadPlanner, }; use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt}; -use crate::virtual_file::IoBufferMut; -use crate::virtual_file::{self, MaybeFatalIo, VirtualFile}; -use crate::TEMP_FILE_SUFFIX; -use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION}; -use anyhow::{bail, ensure, Context, Result}; -use camino::{Utf8Path, Utf8PathBuf}; -use futures::StreamExt; -use itertools::Itertools; -use pageserver_api::config::MaxVectoredReadBytes; -use pageserver_api::key::DBDIR_KEY; -use pageserver_api::key::{Key, KEY_SIZE}; -use pageserver_api::keyspace::KeySpace; -use pageserver_api::models::ImageCompressionAlgorithm; -use pageserver_api::shard::TenantShardId; -use pageserver_api::value::Value; -use rand::{distributions::Alphanumeric, Rng}; -use serde::{Deserialize, Serialize}; -use std::collections::{HashMap, VecDeque}; -use std::fs::File; -use std::io::SeekFrom; -use std::ops::Range; -use std::os::unix::fs::FileExt; -use std::str::FromStr; -use std::sync::Arc; -use tokio::sync::OnceCell; -use tokio_epoll_uring::IoBuf; -use tracing::*; - -use utils::{ - bin_ser::BeSer, - id::{TenantId, TimelineId}, - lsn::Lsn, -}; - -use super::{ - AsLayerDesc, LayerName, OnDiskValue, OnDiskValueIo, PersistentLayerDesc, ResidentLayer, - ValuesReconstructState, -}; +use crate::virtual_file::{self, IoBufferMut, MaybeFatalIo, VirtualFile}; +use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX}; /// /// Header stored in the beginning of the file @@ -967,7 +963,10 @@ impl DeltaLayerInner { .as_slice() .iter() .filter_map(|(_, blob_meta)| { - if blob_meta.key.is_rel_dir_key() || blob_meta.key == DBDIR_KEY { + if blob_meta.key.is_rel_dir_key() + || blob_meta.key == DBDIR_KEY + || blob_meta.key.is_aux_file_key() + { // The size of values for these keys is unbounded and can // grow very large in pathological cases. 
None @@ -1128,10 +1127,11 @@ impl DeltaLayerInner { until: Lsn, ctx: &RequestContext, ) -> anyhow::Result { + use futures::stream::TryStreamExt; + use crate::tenant::vectored_blob_io::{ BlobMeta, ChunkedVectoredReadBuilder, VectoredReadExtended, }; - use futures::stream::TryStreamExt; #[derive(Debug)] enum Item { @@ -1597,23 +1597,21 @@ impl DeltaLayerIterator<'_> { pub(crate) mod test { use std::collections::BTreeMap; + use bytes::Bytes; use itertools::MinMaxResult; - use rand::prelude::{SeedableRng, SliceRandom, StdRng}; + use pageserver_api::value::Value; use rand::RngCore; + use rand::prelude::{SeedableRng, SliceRandom, StdRng}; use super::*; - use crate::tenant::harness::TIMELINE_ID; + use crate::DEFAULT_PG_VERSION; + use crate::context::DownloadBehavior; + use crate::task_mgr::TaskKind; + use crate::tenant::disk_btree::tests::TestDisk; + use crate::tenant::harness::{TIMELINE_ID, TenantHarness}; use crate::tenant::storage_layer::{Layer, ResidentLayer}; use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner; use crate::tenant::{Tenant, Timeline}; - use crate::{ - context::DownloadBehavior, - task_mgr::TaskKind, - tenant::{disk_btree::tests::TestDisk, harness::TenantHarness}, - DEFAULT_PG_VERSION, - }; - use bytes::Bytes; - use pageserver_api::value::Value; /// Construct an index for a fictional delta layer and and then /// traverse in order to plan vectored reads for a query. Finally, diff --git a/pageserver/src/tenant/storage_layer/filter_iterator.rs b/pageserver/src/tenant/storage_layer/filter_iterator.rs index 8660be1fcc..8d172a1c19 100644 --- a/pageserver/src/tenant/storage_layer/filter_iterator.rs +++ b/pageserver/src/tenant/storage_layer/filter_iterator.rs @@ -1,18 +1,14 @@ -use std::{ops::Range, sync::Arc}; +use std::ops::Range; +use std::sync::Arc; use anyhow::bail; -use pageserver_api::{ - key::Key, - keyspace::{KeySpace, SparseKeySpace}, -}; +use pageserver_api::key::Key; +use pageserver_api::keyspace::{KeySpace, SparseKeySpace}; +use pageserver_api::value::Value; use utils::lsn::Lsn; -use pageserver_api::value::Value; - -use super::{ - merge_iterator::{MergeIterator, MergeIteratorItem}, - PersistentLayerKey, -}; +use super::PersistentLayerKey; +use super::merge_iterator::{MergeIterator, MergeIteratorItem}; /// A filter iterator over merge iterators (and can be easily extended to other types of iterators). /// @@ -98,19 +94,14 @@ impl<'a> FilterIterator<'a> { #[cfg(test)] mod tests { - use super::*; - use itertools::Itertools; use pageserver_api::key::Key; use utils::lsn::Lsn; - use crate::{ - tenant::{ - harness::{TenantHarness, TIMELINE_ID}, - storage_layer::delta_layer::test::produce_delta_layer, - }, - DEFAULT_PG_VERSION, - }; + use super::*; + use crate::DEFAULT_PG_VERSION; + use crate::tenant::harness::{TIMELINE_ID, TenantHarness}; + use crate::tenant::storage_layer::delta_layer::test::produce_delta_layer; async fn assert_filter_iter_equal( filter_iter: &mut FilterIterator<'_>, diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index c49281dc45..0db9e8c845 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -25,6 +25,39 @@ //! layer, and offsets to the other parts. The "index" is a B-tree, //! mapping from Key to an offset in the "values" part. The //! actual page images are stored in the "values" part. 
+use std::collections::{HashMap, VecDeque}; +use std::fs::File; +use std::io::SeekFrom; +use std::ops::Range; +use std::os::unix::prelude::FileExt; +use std::str::FromStr; +use std::sync::Arc; + +use anyhow::{Context, Result, bail, ensure}; +use bytes::Bytes; +use camino::{Utf8Path, Utf8PathBuf}; +use hex; +use itertools::Itertools; +use pageserver_api::config::MaxVectoredReadBytes; +use pageserver_api::key::{DBDIR_KEY, KEY_SIZE, Key}; +use pageserver_api::keyspace::KeySpace; +use pageserver_api::shard::{ShardIdentity, TenantShardId}; +use pageserver_api::value::Value; +use rand::Rng; +use rand::distributions::Alphanumeric; +use serde::{Deserialize, Serialize}; +use tokio::sync::OnceCell; +use tokio_stream::StreamExt; +use tracing::*; +use utils::bin_ser::BeSer; +use utils::id::{TenantId, TimelineId}; +use utils::lsn::Lsn; + +use super::layer_name::ImageLayerName; +use super::{ + AsLayerDesc, LayerName, OnDiskValue, OnDiskValueIo, PersistentLayerDesc, ResidentLayer, + ValuesReconstructState, +}; use crate::config::PageServerConf; use crate::context::{PageContentKind, RequestContext, RequestContextBuilder}; use crate::page_cache::{self, FileId, PAGE_SZ}; @@ -39,44 +72,8 @@ use crate::tenant::vectored_blob_io::{ VectoredReadPlanner, }; use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt; -use crate::virtual_file::IoBufferMut; -use crate::virtual_file::{self, MaybeFatalIo, VirtualFile}; +use crate::virtual_file::{self, IoBufferMut, MaybeFatalIo, VirtualFile}; use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX}; -use anyhow::{bail, ensure, Context, Result}; -use bytes::Bytes; -use camino::{Utf8Path, Utf8PathBuf}; -use hex; -use itertools::Itertools; -use pageserver_api::config::MaxVectoredReadBytes; -use pageserver_api::key::DBDIR_KEY; -use pageserver_api::key::{Key, KEY_SIZE}; -use pageserver_api::keyspace::KeySpace; -use pageserver_api::shard::{ShardIdentity, TenantShardId}; -use pageserver_api::value::Value; -use rand::{distributions::Alphanumeric, Rng}; -use serde::{Deserialize, Serialize}; -use std::collections::{HashMap, VecDeque}; -use std::fs::File; -use std::io::SeekFrom; -use std::ops::Range; -use std::os::unix::prelude::FileExt; -use std::str::FromStr; -use std::sync::Arc; -use tokio::sync::OnceCell; -use tokio_stream::StreamExt; -use tracing::*; - -use utils::{ - bin_ser::BeSer, - id::{TenantId, TimelineId}, - lsn::Lsn, -}; - -use super::layer_name::ImageLayerName; -use super::{ - AsLayerDesc, LayerName, OnDiskValue, OnDiskValueIo, PersistentLayerDesc, ResidentLayer, - ValuesReconstructState, -}; /// /// Header stored in the beginning of the file @@ -603,7 +600,10 @@ impl ImageLayerInner { .as_slice() .iter() .filter_map(|(_, blob_meta)| { - if blob_meta.key.is_rel_dir_key() || blob_meta.key == DBDIR_KEY { + if blob_meta.key.is_rel_dir_key() + || blob_meta.key == DBDIR_KEY + || blob_meta.key.is_aux_file_key() + { // The size of values for these keys is unbounded and can // grow very large in pathological cases. 
None @@ -1133,34 +1133,26 @@ impl ImageLayerIterator<'_> { #[cfg(test)] mod test { - use std::{sync::Arc, time::Duration}; + use std::sync::Arc; + use std::time::Duration; use bytes::Bytes; use itertools::Itertools; - use pageserver_api::{ - key::Key, - shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize}, - value::Value, - }; - use utils::{ - generation::Generation, - id::{TenantId, TimelineId}, - lsn::Lsn, - }; - - use crate::{ - context::RequestContext, - tenant::{ - config::TenantConf, - harness::{TenantHarness, TIMELINE_ID}, - storage_layer::{Layer, ResidentLayer}, - vectored_blob_io::StreamingVectoredReadPlanner, - Tenant, Timeline, - }, - DEFAULT_PG_VERSION, - }; + use pageserver_api::key::Key; + use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize}; + use pageserver_api::value::Value; + use utils::generation::Generation; + use utils::id::{TenantId, TimelineId}; + use utils::lsn::Lsn; use super::{ImageLayerIterator, ImageLayerWriter}; + use crate::DEFAULT_PG_VERSION; + use crate::context::RequestContext; + use crate::tenant::config::TenantConf; + use crate::tenant::harness::{TIMELINE_ID, TenantHarness}; + use crate::tenant::storage_layer::{Layer, ResidentLayer}; + use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner; + use crate::tenant::{Tenant, Timeline}; #[tokio::test] async fn image_layer_rewrite() { @@ -1170,10 +1162,10 @@ mod test { ..TenantConf::default() }; let tenant_id = TenantId::generate(); - let mut gen = Generation::new(0xdead0001); + let mut gen_ = Generation::new(0xdead0001); let mut get_next_gen = || { - let ret = gen; - gen = gen.next(); + let ret = gen_; + gen_ = gen_.next(); ret }; // The LSN at which we will create an image layer to filter diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 61a0fdea8c..ffdfe1dc27 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -4,38 +4,39 @@ //! held in an ephemeral file, not in memory. The metadata for each page version, i.e. //! its position in the file, is kept in memory, though. //! 
-use crate::assert_u64_eq_usize::{u64_to_usize, U64IsUsize, UsizeIsU64}; +use std::cmp::Ordering; +use std::collections::{BTreeMap, HashMap}; +use std::fmt::Write; +use std::ops::Range; +use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering as AtomicOrdering}; +use std::sync::{Arc, OnceLock}; +use std::time::Instant; + +use anyhow::Result; +use camino::Utf8PathBuf; +use pageserver_api::key::{CompactKey, Key}; +use pageserver_api::keyspace::KeySpace; +use pageserver_api::models::InMemoryLayerInfo; +use pageserver_api::shard::TenantShardId; +use tokio::sync::RwLock; +use tracing::*; +use utils::id::TimelineId; +use utils::lsn::Lsn; +use utils::vec_map::VecMap; +use wal_decoder::serialized_batch::{SerializedValueBatch, SerializedValueMeta, ValueMeta}; + +use super::{DeltaLayerWriter, PersistentLayerDesc, ValuesReconstructState}; +use crate::assert_u64_eq_usize::{U64IsUsize, UsizeIsU64, u64_to_usize}; use crate::config::PageServerConf; use crate::context::{PageContentKind, RequestContext, RequestContextBuilder}; +// avoid binding to Write (conflicts with std::io::Write) +// while being able to use std::fmt::Write's methods +use crate::metrics::TIMELINE_EPHEMERAL_BYTES; use crate::tenant::ephemeral_file::EphemeralFile; use crate::tenant::storage_layer::{OnDiskValue, OnDiskValueIo}; use crate::tenant::timeline::GetVectoredError; use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt; use crate::{l0_flush, page_cache}; -use anyhow::Result; -use camino::Utf8PathBuf; -use pageserver_api::key::CompactKey; -use pageserver_api::key::Key; -use pageserver_api::keyspace::KeySpace; -use pageserver_api::models::InMemoryLayerInfo; -use pageserver_api::shard::TenantShardId; -use std::collections::{BTreeMap, HashMap}; -use std::sync::{Arc, OnceLock}; -use std::time::Instant; -use tracing::*; -use utils::{id::TimelineId, lsn::Lsn, vec_map::VecMap}; -use wal_decoder::serialized_batch::{SerializedValueBatch, SerializedValueMeta, ValueMeta}; -// avoid binding to Write (conflicts with std::io::Write) -// while being able to use std::fmt::Write's methods -use crate::metrics::TIMELINE_EPHEMERAL_BYTES; -use std::cmp::Ordering; -use std::fmt::Write; -use std::ops::Range; -use std::sync::atomic::Ordering as AtomicOrdering; -use std::sync::atomic::{AtomicU64, AtomicUsize}; -use tokio::sync::RwLock; - -use super::{DeltaLayerWriter, PersistentLayerDesc, ValuesReconstructState}; pub(crate) mod vectored_dio_read; @@ -555,7 +556,9 @@ impl InMemoryLayer { gate: &utils::sync::gate::Gate, ctx: &RequestContext, ) -> Result { - trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}"); + trace!( + "initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}" + ); let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id, gate, ctx).await?; let key = InMemoryLayerFileId(file.page_cache_file_id()); @@ -816,8 +819,7 @@ mod tests { #[test] fn test_index_entry() { const MAX_SUPPORTED_POS: usize = IndexEntry::MAX_SUPPORTED_POS; - use IndexEntryNewArgs as Args; - use IndexEntryUnpacked as Unpacked; + use {IndexEntryNewArgs as Args, IndexEntryUnpacked as Unpacked}; let roundtrip = |args, expect: Unpacked| { let res = IndexEntry::new(args).expect("this tests expects no errors"); diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs b/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs index 1d86015fab..90455fd0ca 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs 
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs @@ -1,16 +1,13 @@ -use std::{ - collections::BTreeMap, - sync::{Arc, RwLock}, -}; +use std::collections::BTreeMap; +use std::sync::{Arc, RwLock}; use itertools::Itertools; use tokio_epoll_uring::{BoundedBuf, IoBufMut, Slice}; -use crate::{ - assert_u64_eq_usize::{U64IsUsize, UsizeIsU64}, - context::RequestContext, - virtual_file::{owned_buffers_io::io_buf_aligned::IoBufAlignedMut, IoBufferMut}, -}; +use crate::assert_u64_eq_usize::{U64IsUsize, UsizeIsU64}; +use crate::context::RequestContext; +use crate::virtual_file::IoBufferMut; +use crate::virtual_file::owned_buffers_io::io_buf_aligned::IoBufAlignedMut; /// The file interface we require. At runtime, this is a [`crate::tenant::ephemeral_file::EphemeralFile`]. pub trait File: Send { @@ -132,7 +129,9 @@ where let req_len = match cur { LogicalReadState::NotStarted(buf) => { if buf.len() != 0 { - panic!("The `LogicalRead`s that are passed in must be freshly created using `LogicalRead::new`"); + panic!( + "The `LogicalRead`s that are passed in must be freshly created using `LogicalRead::new`" + ); } // buf.cap() == 0 is ok @@ -141,7 +140,9 @@ where *state = LogicalReadState::Ongoing(buf); req_len } - x => panic!("must only call with fresh LogicalReads, got another state, leaving Undefined state behind state={x:?}"), + x => panic!( + "must only call with fresh LogicalReads, got another state, leaving Undefined state behind state={x:?}" + ), }; // plan which chunks we need to read from @@ -422,15 +423,15 @@ impl Buffer for Vec { #[cfg(test)] #[allow(clippy::assertions_on_constants)] mod tests { + use std::cell::RefCell; + use std::collections::VecDeque; + use rand::Rng; - use crate::{ - context::DownloadBehavior, task_mgr::TaskKind, - virtual_file::owned_buffers_io::slice::SliceMutExt, - }; - use super::*; - use std::{cell::RefCell, collections::VecDeque}; + use crate::context::DownloadBehavior; + use crate::task_mgr::TaskKind; + use crate::virtual_file::owned_buffers_io::slice::SliceMutExt; struct InMemoryFile { content: Vec, diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 0bf606cf0a..ae06aca63b 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -1,32 +1,32 @@ +use std::ops::Range; +use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; +use std::sync::{Arc, Weak}; +use std::time::{Duration, SystemTime}; + use anyhow::Context; use camino::{Utf8Path, Utf8PathBuf}; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::HistoricLayerInfo; use pageserver_api::shard::{ShardIdentity, ShardIndex, TenantShardId}; -use std::ops::Range; -use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; -use std::sync::{Arc, Weak}; -use std::time::{Duration, SystemTime}; use tracing::Instrument; +use utils::generation::Generation; use utils::id::TimelineId; use utils::lsn::Lsn; use utils::sync::{gate, heavier_once_cell}; -use crate::config::PageServerConf; -use crate::context::{DownloadBehavior, RequestContext, RequestContextBuilder}; -use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; -use crate::task_mgr::TaskKind; -use crate::tenant::timeline::{CompactionError, GetVectoredError}; -use crate::tenant::{remote_timeline_client::LayerFileMetadata, Timeline}; - use super::delta_layer::{self}; use super::image_layer::{self}; use super::{ AsLayerDesc, ImageLayerWriter, LayerAccessStats, LayerAccessStatsReset, LayerName, 
LayerVisibilityHint, PersistentLayerDesc, ValuesReconstructState, }; - -use utils::generation::Generation; +use crate::config::PageServerConf; +use crate::context::{DownloadBehavior, RequestContext, RequestContextBuilder}; +use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; +use crate::task_mgr::TaskKind; +use crate::tenant::Timeline; +use crate::tenant::remote_timeline_client::LayerFileMetadata; +use crate::tenant::timeline::{CompactionError, GetVectoredError}; #[cfg(test)] mod tests; @@ -1873,8 +1873,8 @@ impl ResidentLayer { self.owner.record_access(ctx); let res = match inner { - Delta(ref d) => delta_layer::DeltaLayerInner::load_keys(d, ctx).await, - Image(ref i) => image_layer::ImageLayerInner::load_keys(i, ctx).await, + Delta(d) => delta_layer::DeltaLayerInner::load_keys(d, ctx).await, + Image(i) => image_layer::ImageLayerInner::load_keys(i, ctx).await, }; res.with_context(|| format!("Layer index is corrupted for {self}")) } @@ -1920,7 +1920,7 @@ impl ResidentLayer { let owner = &self.owner.0; match self.downloaded.get(owner, ctx).await? { - Delta(ref d) => d + Delta(d) => d .copy_prefix(writer, until, ctx) .await .with_context(|| format!("copy_delta_prefix until {until} of {self}")), @@ -1943,7 +1943,7 @@ impl ResidentLayer { ) -> anyhow::Result<&delta_layer::DeltaLayerInner> { use LayerKind::*; match self.downloaded.get(&self.owner.0, ctx).await? { - Delta(ref d) => Ok(d), + Delta(d) => Ok(d), Image(_) => Err(anyhow::anyhow!("image layer")), } } @@ -1955,7 +1955,7 @@ impl ResidentLayer { ) -> anyhow::Result<&image_layer::ImageLayerInner> { use LayerKind::*; match self.downloaded.get(&self.owner.0, ctx).await? { - Image(ref d) => Ok(d), + Image(d) => Ok(d), Delta(_) => Err(anyhow::anyhow!("delta layer")), } } diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs index d93c378ffc..724150d27f 100644 --- a/pageserver/src/tenant/storage_layer/layer/tests.rs +++ b/pageserver/src/tenant/storage_layer/layer/tests.rs @@ -1,22 +1,16 @@ use std::time::UNIX_EPOCH; -use pageserver_api::key::{Key, CONTROLFILE_KEY}; +use pageserver_api::key::{CONTROLFILE_KEY, Key}; use tokio::task::JoinSet; -use utils::{ - completion::{self, Completion}, - id::TimelineId, -}; +use utils::completion::{self, Completion}; +use utils::id::TimelineId; use super::failpoints::{Failpoint, FailpointKind}; use super::*; -use crate::{ - context::DownloadBehavior, - tenant::{ - harness::test_img, - storage_layer::{IoConcurrency, LayerVisibilityHint}, - }, -}; -use crate::{task_mgr::TaskKind, tenant::harness::TenantHarness}; +use crate::context::DownloadBehavior; +use crate::task_mgr::TaskKind; +use crate::tenant::harness::{TenantHarness, test_img}; +use crate::tenant::storage_layer::{IoConcurrency, LayerVisibilityHint}; /// Used in tests to advance a future to wanted await point, and not futher. 
const ADVANCE: std::time::Duration = std::time::Duration::from_secs(3600); @@ -771,10 +765,12 @@ async fn evict_and_wait_does_not_wait_for_download() { let (arrival, _download_arrived) = utils::completion::channel(); layer.enable_failpoint(Failpoint::WaitBeforeDownloading(Some(arrival), barrier)); - let mut download = std::pin::pin!(layer - .0 - .get_or_maybe_download(true, None) - .instrument(download_span)); + let mut download = std::pin::pin!( + layer + .0 + .get_or_maybe_download(true, None) + .instrument(download_span) + ); assert!( !layer.is_likely_resident(), diff --git a/pageserver/src/tenant/storage_layer/layer_desc.rs b/pageserver/src/tenant/storage_layer/layer_desc.rs index 2097e90764..ed16dcaa0d 100644 --- a/pageserver/src/tenant/storage_layer/layer_desc.rs +++ b/pageserver/src/tenant/storage_layer/layer_desc.rs @@ -1,16 +1,15 @@ use core::fmt::Display; -use pageserver_api::shard::TenantShardId; use std::ops::Range; -use utils::{id::TimelineId, lsn::Lsn}; use pageserver_api::key::Key; - -use super::{DeltaLayerName, ImageLayerName, LayerName}; - +use pageserver_api::shard::TenantShardId; use serde::{Deserialize, Serialize}; - #[cfg(test)] use utils::id::TenantId; +use utils::id::TimelineId; +use utils::lsn::Lsn; + +use super::{DeltaLayerName, ImageLayerName, LayerName}; /// A unique identifier of a persistent layer. /// diff --git a/pageserver/src/tenant/storage_layer/layer_name.rs b/pageserver/src/tenant/storage_layer/layer_name.rs index addf3b85d9..0f7995f87b 100644 --- a/pageserver/src/tenant/storage_layer/layer_name.rs +++ b/pageserver/src/tenant/storage_layer/layer_name.rs @@ -1,12 +1,12 @@ //! //! Helper functions for dealing with filenames of the image and delta layer files. //! -use pageserver_api::key::Key; use std::cmp::Ordering; use std::fmt; use std::ops::Range; use std::str::FromStr; +use pageserver_api::key::Key; use utils::lsn::Lsn; use super::PersistentLayerDesc; @@ -305,7 +305,7 @@ impl FromStr for LayerName { (None, None) => { return Err(format!( "neither delta nor image layer file name: {value:?}" - )) + )); } (Some(delta), None) => Self::Delta(delta), (None, Some(image)) => Self::Image(image), diff --git a/pageserver/src/tenant/storage_layer/merge_iterator.rs b/pageserver/src/tenant/storage_layer/merge_iterator.rs index 19cfcb0867..76cdddd06a 100644 --- a/pageserver/src/tenant/storage_layer/merge_iterator.rs +++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs @@ -1,21 +1,16 @@ -use std::{ - cmp::Ordering, - collections::{binary_heap, BinaryHeap}, - sync::Arc, -}; +use std::cmp::Ordering; +use std::collections::{BinaryHeap, binary_heap}; +use std::sync::Arc; use anyhow::bail; use pageserver_api::key::Key; +use pageserver_api::value::Value; use utils::lsn::Lsn; +use super::delta_layer::{DeltaLayerInner, DeltaLayerIterator}; +use super::image_layer::{ImageLayerInner, ImageLayerIterator}; +use super::{PersistentLayerDesc, PersistentLayerKey}; use crate::context::RequestContext; -use pageserver_api::value::Value; - -use super::{ - delta_layer::{DeltaLayerInner, DeltaLayerIterator}, - image_layer::{ImageLayerInner, ImageLayerIterator}, - PersistentLayerDesc, PersistentLayerKey, -}; #[derive(Clone, Copy)] pub(crate) enum LayerRef<'a> { @@ -349,24 +344,18 @@ impl<'a> MergeIterator<'a> { #[cfg(test)] mod tests { - use super::*; - use itertools::Itertools; use pageserver_api::key::Key; - use utils::lsn::Lsn; - - use crate::{ - tenant::{ - harness::{TenantHarness, TIMELINE_ID}, - storage_layer::delta_layer::test::{produce_delta_layer, sort_delta}, - }, - 
DEFAULT_PG_VERSION, - }; - - #[cfg(feature = "testing")] - use crate::tenant::storage_layer::delta_layer::test::sort_delta_value; #[cfg(feature = "testing")] use pageserver_api::record::NeonWalRecord; + use utils::lsn::Lsn; + + use super::*; + use crate::DEFAULT_PG_VERSION; + use crate::tenant::harness::{TIMELINE_ID, TenantHarness}; + #[cfg(feature = "testing")] + use crate::tenant::storage_layer::delta_layer::test::sort_delta_value; + use crate::tenant::storage_layer::delta_layer::test::{produce_delta_layer, sort_delta}; async fn assert_merge_iter_equal( merge_iter: &mut MergeIterator<'_>, diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 5e63f59fd8..c90f81889b 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -8,24 +8,24 @@ use std::sync::Arc; use std::time::{Duration, Instant}; use once_cell::sync::Lazy; +use pageserver_api::config::tenant_conf_defaults::DEFAULT_COMPACTION_PERIOD; use rand::Rng; use scopeguard::defer; use tokio::sync::{Semaphore, SemaphorePermit}; use tokio_util::sync::CancellationToken; use tracing::*; - -use crate::context::{DownloadBehavior, RequestContext}; -use crate::metrics::{self, BackgroundLoopSemaphoreMetricsRecorder, TENANT_TASK_EVENTS}; -use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME, TOKIO_WORKER_THREADS}; -use crate::tenant::throttle::Stats; -use crate::tenant::timeline::compaction::CompactionOutcome; -use crate::tenant::timeline::CompactionError; -use crate::tenant::{Tenant, TenantState}; -use pageserver_api::config::tenant_conf_defaults::DEFAULT_COMPACTION_PERIOD; use utils::backoff::exponential_backoff_duration; use utils::completion::Barrier; use utils::pausable_failpoint; +use crate::context::{DownloadBehavior, RequestContext}; +use crate::metrics::{self, BackgroundLoopSemaphoreMetricsRecorder, TENANT_TASK_EVENTS}; +use crate::task_mgr::{self, BACKGROUND_RUNTIME, TOKIO_WORKER_THREADS, TaskKind}; +use crate::tenant::throttle::Stats; +use crate::tenant::timeline::CompactionError; +use crate::tenant::timeline::compaction::CompactionOutcome; +use crate::tenant::{Tenant, TenantState}; + /// Semaphore limiting concurrent background tasks (across all tenants). /// /// We use 3/4 Tokio threads, to avoid blocking all threads in case we do any CPU-heavy work. 
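// [Editor's sketch, not part of the patch] The comment above describes the sizing
// rule for the background-task semaphore: use 3/4 of the Tokio worker threads so
// CPU-heavy loops cannot occupy every thread. A minimal, self-contained
// illustration of that rule follows; the helper names are assumptions here, while
// the real constant lives in `pageserver/src/tenant/tasks.rs` and is sized from
// `TOKIO_WORKER_THREADS`.
use std::sync::Arc;
use tokio::sync::Semaphore;

fn background_task_semaphore(tokio_worker_threads: usize) -> Arc<Semaphore> {
    // Keep at least one permit even on very small runtimes.
    let permits = std::cmp::max(1, tokio_worker_threads * 3 / 4);
    Arc::new(Semaphore::new(permits))
}

async fn with_background_permit<T, F>(sem: &Semaphore, work: F) -> T
where
    F: std::future::Future<Output = T>,
{
    // Hold the permit for the whole loop iteration, so at most `permits`
    // CPU-heavy background tasks run concurrently across all tenants.
    let _permit = sem.acquire().await.expect("semaphore is never closed");
    work.await
}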
@@ -287,15 +287,16 @@ fn log_compaction_error( sleep_duration: Duration, task_cancelled: bool, ) { - use crate::pgdatadir_mapping::CollectKeySpaceError; - use crate::tenant::upload_queue::NotInitialized; - use crate::tenant::PageReconstructError; use CompactionError::*; + use crate::tenant::PageReconstructError; + use crate::tenant::upload_queue::NotInitialized; + let level = match err { + e if e.is_cancel() => return, ShuttingDown => return, Offload(_) => Level::ERROR, - CollectKeySpaceError(CollectKeySpaceError::Cancelled) => Level::INFO, + AlreadyRunning(_) => Level::ERROR, CollectKeySpaceError(_) => Level::ERROR, _ if task_cancelled => Level::INFO, Other(err) => { diff --git a/pageserver/src/tenant/throttle.rs b/pageserver/src/tenant/throttle.rs index 300d779125..6c37c3771b 100644 --- a/pageserver/src/tenant/throttle.rs +++ b/pageserver/src/tenant/throttle.rs @@ -1,10 +1,6 @@ -use std::{ - sync::{ - atomic::{AtomicU64, Ordering}, - Arc, - }, - time::Instant, -}; +use std::sync::Arc; +use std::sync::atomic::{AtomicU64, Ordering}; +use std::time::Instant; use arc_swap::ArcSwap; use utils::leaky_bucket::{LeakyBucketConfig, RateLimiter}; diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 48c208d5d7..dfa50d498c 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -14,55 +14,6 @@ pub mod span; pub mod uninit; mod walreceiver; -use anyhow::{anyhow, bail, ensure, Context, Result}; -use arc_swap::{ArcSwap, ArcSwapOption}; -use bytes::Bytes; -use camino::Utf8Path; -use chrono::{DateTime, Utc}; -use compaction::CompactionOutcome; -use enumset::EnumSet; -use fail::fail_point; -use futures::FutureExt; -use futures::{stream::FuturesUnordered, StreamExt}; -use handle::ShardTimelineId; -use layer_manager::Shutdown; -use offload::OffloadError; -use once_cell::sync::Lazy; -use pageserver_api::models::PageTraceEvent; -use pageserver_api::{ - key::{ - KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE, - SPARSE_RANGE, - }, - keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning}, - models::{ - CompactKeyRange, CompactLsnRange, CompactionAlgorithm, CompactionAlgorithmSettings, - DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, - InMemoryLayerInfo, LayerMapInfo, LsnLease, TimelineState, - }, - reltag::BlockNumber, - shard::{ShardIdentity, ShardNumber, TenantShardId}, -}; -use rand::Rng; -use remote_storage::DownloadError; -use serde_with::serde_as; -use storage_broker::BrokerClientChannel; -use tokio::runtime::Handle; -use tokio::sync::mpsc::Sender; -use tokio::sync::{oneshot, watch, Notify}; -use tokio_util::sync::CancellationToken; -use tracing::*; -use utils::critical; -use utils::rate_limit::RateLimit; -use utils::{ - fs_ext, - guard_arc_swap::GuardArcSwap, - pausable_failpoint, - postgres_client::PostgresClientProtocol, - sync::gate::{Gate, GateGuard}, -}; -use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta}; - use std::array; use std::cmp::{max, min}; use std::collections::btree_map::Entry; @@ -72,74 +23,58 @@ use std::sync::atomic::{AtomicBool, AtomicU64, Ordering as AtomicOrdering}; use std::sync::{Arc, Mutex, OnceLock, RwLock, Weak}; use std::time::{Duration, Instant, SystemTime}; -use crate::l0_flush::{self, L0FlushGlobalState}; -use crate::tenant::storage_layer::ImageLayerName; -use crate::{ - aux_file::AuxFileSizeEstimator, - page_service::TenantManagerTypes, - tenant::{ - config::AttachmentMode, - layer_map::{LayerMap, 
SearchResult}, - metadata::TimelineMetadata, - storage_layer::{ - inmemory_layer::IndexEntry, BatchLayerWriter, IoConcurrency, PersistentLayerDesc, - ValueReconstructSituation, - }, - }, - walingest::WalLagCooldown, - walredo, -}; -use crate::{ - context::{DownloadBehavior, RequestContext}, - disk_usage_eviction_task::DiskUsageEvictionInfo, - pgdatadir_mapping::CollectKeySpaceError, -}; -use crate::{ - disk_usage_eviction_task::finite_f32, - tenant::storage_layer::{ - AsLayerDesc, DeltaLayerWriter, EvictionError, ImageLayerWriter, InMemoryLayer, Layer, - LayerAccessStatsReset, LayerName, ResidentLayer, ValueReconstructState, - ValuesReconstructState, - }, -}; -use crate::{ - disk_usage_eviction_task::EvictionCandidate, tenant::storage_layer::delta_layer::DeltaEntry, -}; -use crate::{ - metrics::ScanLatencyOngoingRecording, tenant::timeline::logical_size::CurrentLogicalSize, -}; -use crate::{ - pgdatadir_mapping::DirectoryKind, - virtual_file::{MaybeFatalIo, VirtualFile}, -}; -use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind}; -use crate::{pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS, tenant::storage_layer::PersistentLayerKey}; +use anyhow::{Context, Result, anyhow, bail, ensure}; +use arc_swap::{ArcSwap, ArcSwapOption}; +use bytes::Bytes; +use camino::Utf8Path; +use chrono::{DateTime, Utc}; +use compaction::{CompactionOutcome, GcCompactionCombinedSettings}; +use enumset::EnumSet; +use fail::fail_point; +use futures::stream::FuturesUnordered; +use futures::{FutureExt, StreamExt}; +use handle::ShardTimelineId; +use layer_manager::Shutdown; +use offload::OffloadError; +use once_cell::sync::Lazy; use pageserver_api::config::tenant_conf_defaults::DEFAULT_PITR_INTERVAL; - -use crate::config::PageServerConf; -use crate::keyspace::{KeyPartitioning, KeySpace}; -use crate::metrics::{TimelineMetrics, DELTAS_PER_READ_GLOBAL, LAYERS_PER_READ_GLOBAL}; -use crate::pgdatadir_mapping::{CalculateLogicalSizeError, MetricsUpdate}; -use crate::tenant::config::TenantConfOpt; -use pageserver_api::reltag::RelTag; -use pageserver_api::shard::ShardIndex; - -use postgres_connection::PgConnectionConfig; -use postgres_ffi::{to_pg_timestamp, v14::xlog_utils, WAL_SEGMENT_SIZE}; -use utils::{ - completion, - generation::Generation, - id::TimelineId, - lsn::{AtomicLsn, Lsn, RecordLsn}, - seqwait::SeqWait, - simple_rcu::{Rcu, RcuReadGuard}, +use pageserver_api::key::{ + KEY_SIZE, Key, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE, + SPARSE_RANGE, }; - -use crate::task_mgr; -use crate::task_mgr::TaskKind; -use crate::tenant::gc_result::GcResult; -use crate::ZERO_PAGE; -use pageserver_api::key::Key; +use pageserver_api::keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning}; +use pageserver_api::models::{ + CompactKeyRange, CompactLsnRange, CompactionAlgorithm, CompactionAlgorithmSettings, + DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, + InMemoryLayerInfo, LayerMapInfo, LsnLease, PageTraceEvent, TimelineState, +}; +use pageserver_api::reltag::{BlockNumber, RelTag}; +use pageserver_api::shard::{ShardIdentity, ShardIndex, ShardNumber, TenantShardId}; +#[cfg(test)] +use pageserver_api::value::Value; +use postgres_connection::PgConnectionConfig; +use postgres_ffi::v14::xlog_utils; +use postgres_ffi::{WAL_SEGMENT_SIZE, to_pg_timestamp}; +use rand::Rng; +use remote_storage::DownloadError; +use serde_with::serde_as; +use storage_broker::BrokerClientChannel; +use tokio::runtime::Handle; +use tokio::sync::mpsc::Sender; +use 
tokio::sync::{Notify, oneshot, watch}; +use tokio_util::sync::CancellationToken; +use tracing::*; +use utils::generation::Generation; +use utils::guard_arc_swap::GuardArcSwap; +use utils::id::TimelineId; +use utils::lsn::{AtomicLsn, Lsn, RecordLsn}; +use utils::postgres_client::PostgresClientProtocol; +use utils::rate_limit::RateLimit; +use utils::seqwait::SeqWait; +use utils::simple_rcu::{Rcu, RcuReadGuard}; +use utils::sync::gate::{Gate, GateGuard}; +use utils::{completion, critical, fs_ext, pausable_failpoint}; +use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta}; use self::delete::DeleteTimelineFlow; pub(super) use self::eviction_task::EvictionTaskTenantState; @@ -147,23 +82,48 @@ use self::eviction_task::EvictionTaskTimelineState; use self::layer_manager::LayerManager; use self::logical_size::LogicalSize; use self::walreceiver::{WalReceiver, WalReceiverConf}; - +use super::config::TenantConf; +use super::remote_timeline_client::index::{GcCompactionState, IndexPart}; +use super::remote_timeline_client::{RemoteTimelineClient, WaitCompletionError}; +use super::secondary::heatmap::HeatMapLayer; +use super::storage_layer::{LayerFringe, LayerVisibilityHint, ReadableLayer}; +use super::upload_queue::NotInitialized; use super::{ - config::TenantConf, storage_layer::LayerVisibilityHint, upload_queue::NotInitialized, - MaybeOffloaded, + AttachedTenantConf, GcError, HeatMapTimeline, MaybeOffloaded, + debug_assert_current_span_has_tenant_and_timeline_id, }; -use super::{ - debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf, HeatMapTimeline, +use crate::aux_file::AuxFileSizeEstimator; +use crate::config::PageServerConf; +use crate::context::{DownloadBehavior, RequestContext}; +use crate::disk_usage_eviction_task::{DiskUsageEvictionInfo, EvictionCandidate, finite_f32}; +use crate::keyspace::{KeyPartitioning, KeySpace}; +use crate::l0_flush::{self, L0FlushGlobalState}; +use crate::metrics::{ + DELTAS_PER_READ_GLOBAL, LAYERS_PER_READ_GLOBAL, ScanLatencyOngoingRecording, TimelineMetrics, }; -use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe}; -use super::{ - remote_timeline_client::RemoteTimelineClient, remote_timeline_client::WaitCompletionError, - storage_layer::ReadableLayer, +use crate::page_service::TenantManagerTypes; +use crate::pgdatadir_mapping::{ + CalculateLogicalSizeError, CollectKeySpaceError, DirectoryKind, LsnForTimestamp, + MAX_AUX_FILE_V2_DELTAS, MetricsUpdate, }; -use super::{secondary::heatmap::HeatMapLayer, GcError}; - -#[cfg(test)] -use pageserver_api::value::Value; +use crate::task_mgr::TaskKind; +use crate::tenant::config::{AttachmentMode, TenantConfOpt}; +use crate::tenant::gc_result::GcResult; +use crate::tenant::layer_map::{LayerMap, SearchResult}; +use crate::tenant::metadata::TimelineMetadata; +use crate::tenant::storage_layer::delta_layer::DeltaEntry; +use crate::tenant::storage_layer::inmemory_layer::IndexEntry; +use crate::tenant::storage_layer::{ + AsLayerDesc, BatchLayerWriter, DeltaLayerWriter, EvictionError, ImageLayerName, + ImageLayerWriter, InMemoryLayer, IoConcurrency, Layer, LayerAccessStatsReset, LayerName, + PersistentLayerDesc, PersistentLayerKey, ResidentLayer, ValueReconstructSituation, + ValueReconstructState, ValuesReconstructState, +}; +use crate::tenant::tasks::BackgroundLoopKind; +use crate::tenant::timeline::logical_size::CurrentLogicalSize; +use crate::virtual_file::{MaybeFatalIo, VirtualFile}; +use crate::walingest::WalLagCooldown; +use crate::{ZERO_PAGE, task_mgr, walredo}; 
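// [Editor's note, not part of the patch] The import churn in this and the
// surrounding files follows one convention: plain `use` statements grouped as
// std -> external crates -> crate-local, one module path per `use`, sorted within
// each group. This looks like rustfmt's `group_imports = "StdExternalCrate"` plus
// `imports_granularity = "Module"` (an assumption; the rustfmt configuration
// change is not part of this excerpt). The target shape, illustratively:
use std::collections::HashMap;            // 1. standard library
use std::sync::Arc;

use tokio_util::sync::CancellationToken;  // 2. external crates
use tracing::info;

use crate::config::PageServerConf;        // 3. this crate
use crate::context::RequestContext;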
#[derive(Debug, PartialEq, Eq, Clone, Copy)] pub(crate) enum FlushLoopState { @@ -323,6 +283,9 @@ pub struct Timeline { ancestor_timeline: Option>, ancestor_lsn: Lsn, + // The LSN of gc-compaction that was last applied to this timeline. + gc_compaction_state: ArcSwap>, + pub(super) metrics: TimelineMetrics, // `Timeline` doesn't write these metrics itself, but it manages the lifetime. Code @@ -468,7 +431,7 @@ pub struct Timeline { /// If Some, collects GetPage metadata for an ongoing PageTrace. pub(crate) page_trace: ArcSwapOption>, - previous_heatmap: ArcSwapOption, + pub(super) previous_heatmap: ArcSwapOption, /// May host a background Tokio task which downloads all the layers from the current /// heatmap on demand. @@ -1470,13 +1433,22 @@ impl Timeline { | TaskKind::WalReceiverConnectionHandler | TaskKind::WalReceiverConnectionPoller => { let is_myself = match who_is_waiting { - WaitLsnWaiter::Timeline(waiter) => Weak::ptr_eq(&waiter.myself, &self.myself), - WaitLsnWaiter::Tenant | WaitLsnWaiter::PageService | WaitLsnWaiter::HttpEndpoint => unreachable!("tenant or page_service context are not expected to have task kind {:?}", ctx.task_kind()), + WaitLsnWaiter::Timeline(waiter) => { + Weak::ptr_eq(&waiter.myself, &self.myself) + } + WaitLsnWaiter::Tenant + | WaitLsnWaiter::PageService + | WaitLsnWaiter::HttpEndpoint => unreachable!( + "tenant or page_service context are not expected to have task kind {:?}", + ctx.task_kind() + ), }; if is_myself { if let Err(current) = self.last_record_lsn.would_wait_for(lsn) { // walingest is the only one that can advance last_record_lsn; it should make sure to never reach here - panic!("this timeline's walingest task is calling wait_lsn({lsn}) but we only have last_record_lsn={current}; would deadlock"); + panic!( + "this timeline's walingest task is calling wait_lsn({lsn}) but we only have last_record_lsn={current}; would deadlock" + ); } } else { // if another timeline's is waiting for us, there's no deadlock risk because @@ -1505,12 +1477,12 @@ impl Timeline { drop(_timer); let walreceiver_status = self.walreceiver_status(); Err(WaitLsnError::Timeout(format!( - "Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}, WalReceiver status: {}", - lsn, - self.get_last_record_lsn(), - self.get_disk_consistent_lsn(), - walreceiver_status, - ))) + "Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}, WalReceiver status: {}", + lsn, + self.get_last_record_lsn(), + self.get_disk_consistent_lsn(), + walreceiver_status, + ))) } } } @@ -1614,10 +1586,18 @@ impl Timeline { if init || validate { let latest_gc_cutoff_lsn = self.get_applied_gc_cutoff_lsn(); if lsn < *latest_gc_cutoff_lsn { - bail!("tried to request an lsn lease for an lsn below the latest gc cutoff. requested at {} gc cutoff {}", lsn, *latest_gc_cutoff_lsn); + bail!( + "tried to request an lsn lease for an lsn below the latest gc cutoff. requested at {} gc cutoff {}", + lsn, + *latest_gc_cutoff_lsn + ); } if lsn < planned_cutoff { - bail!("tried to request an lsn lease for an lsn below the planned gc cutoff. requested at {} planned gc cutoff {}", lsn, planned_cutoff); + bail!( + "tried to request an lsn lease for an lsn below the planned gc cutoff. 
requested at {} planned gc cutoff {}", + lsn, + planned_cutoff + ); } } @@ -1741,7 +1721,9 @@ impl Timeline { // This is not harmful, but it only happens in relatively rare cases where // time-based checkpoints are not happening fast enough to keep the amount of // ephemeral data within configured limits. It's a sign of stress on the system. - tracing::info!("Early-rolling open layer at size {current_size} (limit {size_override}) due to dirty data pressure"); + tracing::info!( + "Early-rolling open layer at size {current_size} (limit {size_override}) due to dirty data pressure" + ); } } @@ -1867,7 +1849,9 @@ impl Timeline { // Last record Lsn could be zero in case the timeline was just created if !last_record_lsn.is_valid() { - warn!("Skipping compaction for potentially just initialized timeline, it has invalid last record lsn: {last_record_lsn}"); + warn!( + "Skipping compaction for potentially just initialized timeline, it has invalid last record lsn: {last_record_lsn}" + ); return Ok(CompactionOutcome::Skipped); } @@ -1880,15 +1864,25 @@ impl Timeline { }; // Signal compaction failure to avoid L0 flush stalls when it's broken. - match result { + match &result { Ok(_) => self.compaction_failed.store(false, AtomicOrdering::Relaxed), - Err(CompactionError::Other(_)) | Err(CompactionError::CollectKeySpaceError(_)) => { + Err(e) if e.is_cancel() => {} + Err(CompactionError::ShuttingDown) => { + // Covered by the `Err(e) if e.is_cancel()` branch. + } + Err(CompactionError::AlreadyRunning(_)) => { + // Covered by the `Err(e) if e.is_cancel()` branch. + } + Err(CompactionError::Other(_)) => { + self.compaction_failed.store(true, AtomicOrdering::Relaxed) + } + Err(CompactionError::CollectKeySpaceError(_)) => { + // Cancelled errors are covered by the `Err(e) if e.is_cancel()` branch. self.compaction_failed.store(true, AtomicOrdering::Relaxed) } // Don't change the current value on offload failure or shutdown. We don't want to // abruptly stall nor resume L0 flushes in these cases. Err(CompactionError::Offload(_)) => {} - Err(CompactionError::ShuttingDown) => {} }; result @@ -2028,7 +2022,9 @@ impl Timeline { // `self.remote_client.shutdown().await` above should have already flushed everything from the queue, but // we also do a final check here to ensure that the queue is empty. 
if !self.remote_client.no_pending_work() { - warn!("still have pending work in remote upload queue, but continuing shutting down anyways"); + warn!( + "still have pending work in remote upload queue, but continuing shutting down anyways" + ); } } } @@ -2037,7 +2033,9 @@ impl Timeline { // drain the upload queue self.remote_client.shutdown().await; if !self.remote_client.no_pending_work() { - warn!("still have pending work in remote upload queue, but continuing shutting down anyways"); + warn!( + "still have pending work in remote upload queue, but continuing shutting down anyways" + ); } } @@ -2531,6 +2529,31 @@ impl Timeline { ) } + fn get_gc_compaction_settings(&self) -> GcCompactionCombinedSettings { + let tenant_conf = &self.tenant_conf.load(); + let gc_compaction_enabled = tenant_conf + .tenant_conf + .gc_compaction_enabled + .unwrap_or(self.conf.default_tenant_conf.gc_compaction_enabled); + let gc_compaction_initial_threshold_kb = tenant_conf + .tenant_conf + .gc_compaction_initial_threshold_kb + .unwrap_or( + self.conf + .default_tenant_conf + .gc_compaction_initial_threshold_kb, + ); + let gc_compaction_ratio_percent = tenant_conf + .tenant_conf + .gc_compaction_ratio_percent + .unwrap_or(self.conf.default_tenant_conf.gc_compaction_ratio_percent); + GcCompactionCombinedSettings { + gc_compaction_enabled, + gc_compaction_initial_threshold_kb, + gc_compaction_ratio_percent, + } + } + fn get_image_creation_preempt_threshold(&self) -> usize { let tenant_conf = self.tenant_conf.load(); tenant_conf @@ -2609,6 +2632,7 @@ impl Timeline { state: TimelineState, attach_wal_lag_cooldown: Arc>, create_idempotency: crate::tenant::CreateTimelineIdempotency, + gc_compaction_state: Option, cancel: CancellationToken, ) -> Arc { let disk_consistent_lsn = metadata.disk_consistent_lsn(); @@ -2667,6 +2691,8 @@ impl Timeline { }), disk_consistent_lsn: AtomicLsn::new(disk_consistent_lsn.0), + gc_compaction_state: ArcSwap::new(Arc::new(gc_compaction_state)), + last_freeze_at: AtomicLsn::new(disk_consistent_lsn.0), last_freeze_ts: RwLock::new(Instant::now()), @@ -2831,6 +2857,20 @@ impl Timeline { ); } + pub(crate) fn update_gc_compaction_state( + &self, + gc_compaction_state: GcCompactionState, + ) -> anyhow::Result<()> { + self.gc_compaction_state + .store(Arc::new(Some(gc_compaction_state.clone()))); + self.remote_client + .schedule_index_upload_for_gc_compaction_state_update(gc_compaction_state) + } + + pub(crate) fn get_gc_compaction_state(&self) -> Option { + self.gc_compaction_state.load_full().as_ref().clone() + } + /// Creates and starts the wal receiver. /// /// This function is expected to be called at most once per Timeline's lifecycle @@ -2874,6 +2914,7 @@ impl Timeline { auth_token: crate::config::SAFEKEEPER_AUTH_TOKEN.get().cloned(), availability_zone: self.conf.availability_zone.clone(), ingest_batch_size: self.conf.ingest_batch_size, + validate_wal_contiguity: self.conf.validate_wal_contiguity, }, broker_client, ctx, @@ -2898,8 +2939,9 @@ impl Timeline { disk_consistent_lsn: Lsn, index_part: IndexPart, ) -> anyhow::Result<()> { - use init::{Decision::*, Discovered, DismissedLayer}; use LayerName::*; + use init::Decision::*; + use init::{Discovered, DismissedLayer}; let mut guard = self.layers.write().await; @@ -3114,11 +3156,15 @@ impl Timeline { } TimelineState::Loading => { // Import does not return an activated timeline. 
- info!("discarding priority boost for logical size calculation because timeline is not yet active"); + info!( + "discarding priority boost for logical size calculation because timeline is not yet active" + ); } TimelineState::Active => { // activation should be setting the once cell - warn!("unexpected: cancel_wait_for_background_loop_concurrency_limit_semaphore not set, priority-boosting of logical size calculation will not work"); + warn!( + "unexpected: cancel_wait_for_background_loop_concurrency_limit_semaphore not set, priority-boosting of logical size calculation will not work" + ); debug_assert!(false); } } @@ -3523,6 +3569,14 @@ impl Timeline { Ok(layer) } + pub(super) fn is_previous_heatmap_active(&self) -> bool { + self.previous_heatmap + .load() + .as_ref() + .map(|prev| matches!(**prev, PreviousHeatmap::Active { .. })) + .unwrap_or(false) + } + /// The timeline heatmap is a hint to secondary locations from the primary location, /// indicating which layers are currently on-disk on the primary. /// @@ -3595,6 +3649,7 @@ impl Timeline { Some(non_resident) => { let mut non_resident = non_resident.peekable(); if non_resident.peek().is_none() { + tracing::info!(timeline_id=%self.timeline_id, "Previous heatmap now obsolete"); self.previous_heatmap .store(Some(PreviousHeatmap::Obsolete.into())); } @@ -3626,6 +3681,36 @@ impl Timeline { Some(HeatMapTimeline::new(self.timeline_id, layers)) } + pub(super) async fn generate_unarchival_heatmap(&self, end_lsn: Lsn) -> PreviousHeatmap { + let guard = self.layers.read().await; + + let now = SystemTime::now(); + let mut heatmap_layers = Vec::default(); + for vl in guard.visible_layers() { + if vl.layer_desc().get_lsn_range().start >= end_lsn { + continue; + } + + let hl = HeatMapLayer { + name: vl.layer_desc().layer_name(), + metadata: vl.metadata(), + access_time: now, + }; + heatmap_layers.push(hl); + } + + tracing::info!( + "Generating unarchival heatmap with {} layers", + heatmap_layers.len() + ); + + let heatmap = HeatMapTimeline::new(self.timeline_id, heatmap_layers); + PreviousHeatmap::Active { + heatmap, + read_at: Instant::now(), + } + } + /// Returns true if the given lsn is or was an ancestor branchpoint. pub(crate) fn is_ancestor_lsn(&self, lsn: Lsn) -> bool { // upon timeline detach, we set the ancestor_lsn to Lsn::INVALID and the store the original @@ -4219,10 +4304,14 @@ impl Timeline { // This path is only taken for tenants with multiple shards: single sharded tenants should // never encounter a gap in the wal. let old_disk_consistent_lsn = self.disk_consistent_lsn.load(); - tracing::debug!("Advancing disk_consistent_lsn across layer gap {old_disk_consistent_lsn}->{frozen_to_lsn}"); + tracing::debug!( + "Advancing disk_consistent_lsn across layer gap {old_disk_consistent_lsn}->{frozen_to_lsn}" + ); if self.set_disk_consistent_lsn(frozen_to_lsn) { if let Err(e) = self.schedule_uploads(frozen_to_lsn, vec![]) { - tracing::warn!("Failed to schedule metadata upload after updating disk_consistent_lsn: {e}"); + tracing::warn!( + "Failed to schedule metadata upload after updating disk_consistent_lsn: {e}" + ); } } } @@ -4447,7 +4536,10 @@ impl Timeline { /// This function must only be used from the layer flush task. 
fn set_disk_consistent_lsn(&self, new_value: Lsn) -> bool { let old_value = self.disk_consistent_lsn.fetch_max(new_value); - assert!(new_value >= old_value, "disk_consistent_lsn must be growing monotonously at runtime; current {old_value}, offered {new_value}"); + assert!( + new_value >= old_value, + "disk_consistent_lsn must be growing monotonously at runtime; current {old_value}, offered {new_value}" + ); self.metrics .disk_consistent_lsn_gauge @@ -4605,10 +4697,7 @@ impl Timeline { )); } - let (dense_ks, sparse_ks) = self - .collect_keyspace(lsn, ctx) - .await - .map_err(CompactionError::CollectKeySpaceError)?; + let (dense_ks, sparse_ks) = self.collect_keyspace(lsn, ctx).await?; let dense_partitioning = dense_ks.partition(&self.shard_identity, partition_size); let sparse_partitioning = SparseKeyPartitioning { parts: vec![sparse_ks], @@ -4742,7 +4831,9 @@ impl Timeline { // any metadata keys, keys, as that would lead to actual data // loss. if img_key.is_rel_fsm_block_key() || img_key.is_rel_vm_block_key() { - warn!("could not reconstruct FSM or VM key {img_key}, filling with zeros: {err:?}"); + warn!( + "could not reconstruct FSM or VM key {img_key}, filling with zeros: {err:?}" + ); ZERO_PAGE.clone() } else { return Err(CreateImageLayersError::from(err)); @@ -4821,7 +4912,8 @@ impl Timeline { let trigger_generation = delta_files_accessed as usize >= MAX_AUX_FILE_V2_DELTAS; info!( - "metadata key compaction: trigger_generation={trigger_generation}, delta_files_accessed={delta_files_accessed}, total_kb_retrieved={total_kb_retrieved}, total_keys_retrieved={total_keys_retrieved}, read_time={}s", elapsed.as_secs_f64() + "metadata key compaction: trigger_generation={trigger_generation}, delta_files_accessed={delta_files_accessed}, total_kb_retrieved={total_kb_retrieved}, total_keys_retrieved={total_keys_retrieved}, read_time={}s", + elapsed.as_secs_f64() ); if !trigger_generation && mode == ImageLayerCreationMode::Try { @@ -5143,7 +5235,8 @@ impl Timeline { if should_yield { tracing::info!( "preempt image layer generation at {lsn} when processing partition {}..{}: too many L0 layers", - partition.start().unwrap(), partition.end().unwrap() + partition.start().unwrap(), + partition.end().unwrap() ); last_partition_processed = Some(partition.clone()); all_generated = false; @@ -5330,9 +5423,40 @@ pub(crate) enum CompactionError { Offload(OffloadError), /// Compaction cannot be done right now; page reconstruction and so on. #[error("Failed to collect keyspace: {0}")] - CollectKeySpaceError(CollectKeySpaceError), + CollectKeySpaceError(#[from] CollectKeySpaceError), #[error(transparent)] Other(anyhow::Error), + #[error("Compaction already running: {0}")] + AlreadyRunning(&'static str), +} + +impl CompactionError { + /// Errors that can be ignored, i.e., cancel and shutdown. + pub fn is_cancel(&self) -> bool { + matches!( + self, + Self::ShuttingDown + | Self::AlreadyRunning(_) + | Self::CollectKeySpaceError(CollectKeySpaceError::Cancelled) + | Self::CollectKeySpaceError(CollectKeySpaceError::PageRead( + PageReconstructError::Cancelled + )) + | Self::Offload(OffloadError::Cancelled) + ) + } + + /// Critical errors that indicate data corruption. 
+ pub fn is_critical(&self) -> bool { + matches!( + self, + Self::CollectKeySpaceError( + CollectKeySpaceError::Decode(_) + | CollectKeySpaceError::PageRead( + PageReconstructError::MissingKey(_) | PageReconstructError::WalRedo(_), + ) + ) + ) + } } impl From for CompactionError { @@ -5344,24 +5468,6 @@ impl From for CompactionError { } } -impl CompactionError { - pub fn is_cancelled(&self) -> bool { - matches!(self, CompactionError::ShuttingDown) - } -} - -impl From for CompactionError { - fn from(err: CollectKeySpaceError) -> Self { - match err { - CollectKeySpaceError::Cancelled - | CollectKeySpaceError::PageRead(PageReconstructError::Cancelled) => { - CompactionError::ShuttingDown - } - e => CompactionError::Other(e.into()), - } - } -} - impl From for CompactionError { fn from(value: super::upload_queue::NotInitialized) -> Self { match value { @@ -5505,7 +5611,9 @@ impl Timeline { // because we have not implemented L0 => L0 compaction. duplicated_layers.insert(l.layer_desc().key()); } else if LayerMap::is_l0(&l.layer_desc().key_range, l.layer_desc().is_delta) { - return Err(CompactionError::Other(anyhow::anyhow!("compaction generates a L0 layer file as output, which will cause infinite compaction."))); + return Err(CompactionError::Other(anyhow::anyhow!( + "compaction generates a L0 layer file as output, which will cause infinite compaction." + ))); } else { insert_layers.push(l.clone()); } @@ -5629,8 +5737,10 @@ impl Timeline { .await { Ok((index_part, index_generation, _index_mtime)) => { - tracing::info!("GC loaded shard zero metadata (gen {index_generation:?}): latest_gc_cutoff_lsn: {}", - index_part.metadata.latest_gc_cutoff_lsn()); + tracing::info!( + "GC loaded shard zero metadata (gen {index_generation:?}): latest_gc_cutoff_lsn: {}", + index_part.metadata.latest_gc_cutoff_lsn() + ); Ok(Some(index_part.metadata.latest_gc_cutoff_lsn())) } Err(DownloadError::NotFound) => { @@ -6039,9 +6149,7 @@ impl Timeline { if let Some((img_lsn, img)) = &data.img { trace!( "found page image for key {} at {}, no WAL redo required, req LSN {}", - key, - img_lsn, - request_lsn, + key, img_lsn, request_lsn, ); Ok(img.clone()) } else { @@ -6070,7 +6178,12 @@ impl Timeline { request_lsn ); } else { - trace!("found {} WAL records that will init the page for {} at {}, performing WAL redo", data.records.len(), key, request_lsn); + trace!( + "found {} WAL records that will init the page for {} at {}, performing WAL redo", + data.records.len(), + key, + request_lsn + ); }; let res = self .walredo_mgr @@ -6614,7 +6727,9 @@ impl TimelineWriter<'_> { if let Some(wait_threshold) = wait_threshold { if l0_count >= wait_threshold { - info!("layer roll waiting for flush due to compaction backpressure at {l0_count} L0 layers"); + debug!( + "layer roll waiting for flush due to compaction backpressure at {l0_count} L0 layers" + ); self.tl.wait_flush_completion(flush_id).await?; } } @@ -6801,17 +6916,15 @@ mod tests { use pageserver_api::key::Key; use pageserver_api::value::Value; use tracing::Instrument; - use utils::{id::TimelineId, lsn::Lsn}; - - use crate::tenant::{ - harness::{test_img, TenantHarness}, - layer_map::LayerMap, - storage_layer::{Layer, LayerName, LayerVisibilityHint}, - timeline::{DeltaLayerTestDesc, EvictionError}, - PreviousHeatmap, Timeline, - }; + use utils::id::TimelineId; + use utils::lsn::Lsn; use super::HeatMapTimeline; + use crate::tenant::harness::{TenantHarness, test_img}; + use crate::tenant::layer_map::LayerMap; + use crate::tenant::storage_layer::{Layer, LayerName, 
LayerVisibilityHint}; + use crate::tenant::timeline::{DeltaLayerTestDesc, EvictionError}; + use crate::tenant::{PreviousHeatmap, Timeline}; fn assert_heatmaps_have_same_layers(lhs: &HeatMapTimeline, rhs: &HeatMapTimeline) { assert_eq!(lhs.layers.len(), rhs.layers.len()); diff --git a/pageserver/src/tenant/timeline/analysis.rs b/pageserver/src/tenant/timeline/analysis.rs index 6009b0b79a..96864ec44b 100644 --- a/pageserver/src/tenant/timeline/analysis.rs +++ b/pageserver/src/tenant/timeline/analysis.rs @@ -1,4 +1,5 @@ -use std::{collections::BTreeSet, ops::Range}; +use std::collections::BTreeSet; +use std::ops::Range; use utils::lsn::Lsn; diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 4e4f906d78..091bd583d7 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -10,34 +10,42 @@ use std::sync::Arc; use super::layer_manager::LayerManager; use super::{ - CompactFlags, CompactOptions, CreateImageLayersError, DurationRecorder, GetVectoredError, - ImageLayerCreationMode, LastImageLayerCreationStatus, PageReconstructError, RecordedDuration, + CompactFlags, CompactOptions, CompactionError, CreateImageLayersError, DurationRecorder, + GetVectoredError, ImageLayerCreationMode, LastImageLayerCreationStatus, RecordedDuration, Timeline, }; -use anyhow::{anyhow, bail, Context}; +use anyhow::{Context, anyhow, bail}; use bytes::Bytes; use enumset::EnumSet; use fail::fail_point; use itertools::Itertools; -use pageserver_api::key::KEY_SIZE; -use pageserver_api::keyspace::ShardedRange; +use once_cell::sync::Lazy; +use pageserver_api::config::tenant_conf_defaults::DEFAULT_CHECKPOINT_DISTANCE; +use pageserver_api::key::{KEY_SIZE, Key}; +use pageserver_api::keyspace::{KeySpace, ShardedRange}; use pageserver_api::models::CompactInfoResponse; +use pageserver_api::record::NeonWalRecord; use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId}; +use pageserver_api::value::Value; +use pageserver_compaction::helpers::{fully_contains, overlaps_with}; +use pageserver_compaction::interface::*; use serde::Serialize; +use tokio::sync::{OwnedSemaphorePermit, Semaphore}; use tokio_util::sync::CancellationToken; -use tracing::{debug, info, info_span, trace, warn, Instrument}; +use tracing::{Instrument, debug, error, info, info_span, trace, warn}; use utils::critical; use utils::id::TimelineId; +use utils::lsn::Lsn; use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder}; use crate::page_cache; -use crate::pgdatadir_mapping::CollectKeySpaceError; use crate::statvfs::Statvfs; use crate::tenant::checks::check_valid_layermap; use crate::tenant::gc_block::GcBlock; use crate::tenant::layer_map::LayerMap; use crate::tenant::remote_timeline_client::WaitCompletionError; +use crate::tenant::remote_timeline_client::index::GcCompactionState; use crate::tenant::storage_layer::batch_split_writer::{ BatchWriterResult, SplitDeltaLayerWriter, SplitImageLayerWriter, }; @@ -46,24 +54,12 @@ use crate::tenant::storage_layer::merge_iterator::MergeIterator; use crate::tenant::storage_layer::{ AsLayerDesc, PersistentLayerDesc, PersistentLayerKey, ValueReconstructState, }; -use crate::tenant::timeline::{drop_rlock, DeltaLayerWriter, ImageLayerWriter}; -use crate::tenant::timeline::{ImageLayerCreationOutcome, IoConcurrency}; -use crate::tenant::timeline::{Layer, ResidentLayer}; -use crate::tenant::{gc_block, DeltaLayer, MaybeOffloaded}; +use crate::tenant::timeline::{ + DeltaLayerWriter, 
ImageLayerCreationOutcome, ImageLayerWriter, IoConcurrency, Layer, + ResidentLayer, drop_rlock, +}; +use crate::tenant::{DeltaLayer, MaybeOffloaded, gc_block}; use crate::virtual_file::{MaybeFatalIo, VirtualFile}; -use pageserver_api::config::tenant_conf_defaults::DEFAULT_CHECKPOINT_DISTANCE; - -use pageserver_api::key::Key; -use pageserver_api::keyspace::KeySpace; -use pageserver_api::record::NeonWalRecord; -use pageserver_api::value::Value; - -use utils::lsn::Lsn; - -use pageserver_compaction::helpers::{fully_contains, overlaps_with}; -use pageserver_compaction::interface::*; - -use super::CompactionError; /// Maximum number of deltas before generating an image layer in bottom-most compaction. const COMPACTION_DELTA_THRESHOLD: usize = 5; @@ -77,13 +73,22 @@ impl std::fmt::Display for GcCompactionJobId { } } +pub struct GcCompactionCombinedSettings { + pub gc_compaction_enabled: bool, + pub gc_compaction_initial_threshold_kb: u64, + pub gc_compaction_ratio_percent: u64, +} + #[derive(Debug, Clone)] pub enum GcCompactionQueueItem { - Manual(CompactOptions), + MetaJob { + /// Compaction options + options: CompactOptions, + /// Whether the compaction is triggered automatically (determines whether we need to update L2 LSN) + auto: bool, + }, SubCompactionJob(CompactOptions), - #[allow(dead_code)] - UpdateL2Lsn(Lsn), - Notify(GcCompactionJobId), + Notify(GcCompactionJobId, Option), } impl GcCompactionQueueItem { @@ -93,7 +98,7 @@ impl GcCompactionQueueItem { running: bool, ) -> Option { match self { - GcCompactionQueueItem::Manual(options) => Some(CompactInfoResponse { + GcCompactionQueueItem::MetaJob { options, .. } => Some(CompactInfoResponse { compact_key_range: options.compact_key_range, compact_lsn_range: options.compact_lsn_range, sub_compaction: options.sub_compaction, @@ -107,17 +112,22 @@ impl GcCompactionQueueItem { running, job_id: id.0, }), - GcCompactionQueueItem::UpdateL2Lsn(_) => None, - GcCompactionQueueItem::Notify(_) => None, + GcCompactionQueueItem::Notify(_, _) => None, } } } +#[derive(Default)] +struct GcCompactionGuardItems { + notify: Option>, + gc_guard: Option, + permit: Option, +} + struct GcCompactionQueueInner { running: Option<(GcCompactionJobId, GcCompactionQueueItem)>, queued: VecDeque<(GcCompactionJobId, GcCompactionQueueItem)>, - notify: HashMap>, - gc_guards: HashMap, + guards: HashMap, last_id: GcCompactionJobId, } @@ -137,14 +147,18 @@ pub struct GcCompactionQueue { consumer_lock: tokio::sync::Mutex<()>, } +static CONCURRENT_GC_COMPACTION_TASKS: Lazy> = Lazy::new(|| { + // Only allow two timelines on one pageserver to run gc compaction at a time. + Arc::new(Semaphore::new(2)) +}); + impl GcCompactionQueue { pub fn new() -> Self { GcCompactionQueue { inner: std::sync::Mutex::new(GcCompactionQueueInner { running: None, queued: VecDeque::new(), - notify: HashMap::new(), - gc_guards: HashMap::new(), + guards: HashMap::new(), last_id: GcCompactionJobId(0), }), consumer_lock: tokio::sync::Mutex::new(()), @@ -154,8 +168,9 @@ impl GcCompactionQueue { pub fn cancel_scheduled(&self) { let mut guard = self.inner.lock().unwrap(); guard.queued.clear(); - guard.notify.clear(); - guard.gc_guards.clear(); + // TODO: if there is a running job, we should keep the gc guard. However, currently, the cancel + // API is only used for testing purposes, so we can drop everything here. + guard.guards.clear(); } /// Schedule a manual compaction job. 
@@ -166,29 +181,159 @@ impl GcCompactionQueue { ) -> GcCompactionJobId { let mut guard = self.inner.lock().unwrap(); let id = guard.next_id(); - guard - .queued - .push_back((id, GcCompactionQueueItem::Manual(options))); - if let Some(notify) = notify { - guard.notify.insert(id, notify); - } + guard.queued.push_back(( + id, + GcCompactionQueueItem::MetaJob { + options, + auto: false, + }, + )); + guard.guards.entry(id).or_default().notify = notify; info!("scheduled compaction job id={}", id); id } + /// Schedule an auto compaction job. + fn schedule_auto_compaction( + &self, + options: CompactOptions, + permit: OwnedSemaphorePermit, + ) -> GcCompactionJobId { + let mut guard = self.inner.lock().unwrap(); + let id = guard.next_id(); + guard.queued.push_back(( + id, + GcCompactionQueueItem::MetaJob { + options, + auto: true, + }, + )); + guard.guards.entry(id).or_default().permit = Some(permit); + id + } + /// Trigger an auto compaction. - #[allow(dead_code)] - pub fn trigger_auto_compaction(&self, _: &Arc) {} + pub async fn trigger_auto_compaction(&self, timeline: &Arc) { + let GcCompactionCombinedSettings { + gc_compaction_enabled, + gc_compaction_initial_threshold_kb, + gc_compaction_ratio_percent, + } = timeline.get_gc_compaction_settings(); + if !gc_compaction_enabled { + return; + } + if self.remaining_jobs_num() > 0 { + // Only schedule auto compaction when the queue is empty + return; + } + if timeline.ancestor_timeline().is_some() { + // Do not trigger auto compaction for child timelines. We haven't tested + // it enough in staging yet. + return; + } + + let Ok(permit) = CONCURRENT_GC_COMPACTION_TASKS.clone().try_acquire_owned() else { + // Only allow one compaction run at a time. TODO: As we do `try_acquire_owned`, we cannot ensure + // the fairness of the lock across timelines. We should listen for both `acquire` and `l0_compaction_trigger` + // to ensure the fairness while avoid starving other tasks. 
+ return; + }; + + let gc_compaction_state = timeline.get_gc_compaction_state(); + let l2_lsn = gc_compaction_state + .map(|x| x.last_completed_lsn) + .unwrap_or(Lsn::INVALID); + + let layers = { + let guard = timeline.layers.read().await; + let layer_map = guard.layer_map().unwrap(); + layer_map.iter_historic_layers().collect_vec() + }; + let mut l2_size: u64 = 0; + let mut l1_size = 0; + let gc_cutoff = *timeline.get_applied_gc_cutoff_lsn(); + for layer in layers { + if layer.lsn_range.start <= l2_lsn { + l2_size += layer.file_size(); + } else if layer.lsn_range.start <= gc_cutoff { + l1_size += layer.file_size(); + } + } + + fn trigger_compaction( + l1_size: u64, + l2_size: u64, + gc_compaction_initial_threshold_kb: u64, + gc_compaction_ratio_percent: u64, + ) -> bool { + const AUTO_TRIGGER_LIMIT: u64 = 150 * 1024 * 1024 * 1024; // 150GB + if l1_size >= AUTO_TRIGGER_LIMIT || l2_size >= AUTO_TRIGGER_LIMIT { + // Do not auto-trigger when physical size >= 150GB + return false; + } + // initial trigger + if l2_size == 0 && l1_size >= gc_compaction_initial_threshold_kb * 1024 { + info!( + "trigger auto-compaction because l1_size={} >= gc_compaction_initial_threshold_kb={}", + l1_size, gc_compaction_initial_threshold_kb + ); + return true; + } + // size ratio trigger + if l2_size == 0 { + return false; + } + if l1_size as f64 / l2_size as f64 >= (gc_compaction_ratio_percent as f64 / 100.0) { + info!( + "trigger auto-compaction because l1_size={} / l2_size={} > gc_compaction_ratio_percent={}", + l1_size, l2_size, gc_compaction_ratio_percent + ); + return true; + } + false + } + + if trigger_compaction( + l1_size, + l2_size, + gc_compaction_initial_threshold_kb, + gc_compaction_ratio_percent, + ) { + self.schedule_auto_compaction( + CompactOptions { + flags: { + let mut flags = EnumSet::new(); + flags |= CompactFlags::EnhancedGcBottomMostCompaction; + flags + }, + sub_compaction: true, + compact_key_range: None, + compact_lsn_range: None, + sub_compaction_max_job_size_mb: None, + }, + permit, + ); + info!( + "scheduled auto gc-compaction: l1_size={}, l2_size={}, l2_lsn={}, gc_cutoff={}", + l1_size, l2_size, l2_lsn, gc_cutoff + ); + } else { + info!( + "did not trigger auto gc-compaction: l1_size={}, l2_size={}, l2_lsn={}, gc_cutoff={}", + l1_size, l2_size, l2_lsn, gc_cutoff + ); + } + } /// Notify the caller the job has finished and unblock GC. fn notify_and_unblock(&self, id: GcCompactionJobId) { info!("compaction job id={} finished", id); let mut guard = self.inner.lock().unwrap(); - if let Some(blocking) = guard.gc_guards.remove(&id) { - drop(blocking) - } - if let Some(tx) = guard.notify.remove(&id) { - let _ = tx.send(()); + if let Some(items) = guard.guards.remove(&id) { + drop(items.gc_guard); + if let Some(tx) = items.notify { + let _ = tx.send(()); + } } } @@ -198,9 +343,12 @@ impl GcCompactionQueue { options: CompactOptions, timeline: &Arc, gc_block: &GcBlock, + auto: bool, ) -> Result<(), CompactionError> { - info!("running scheduled enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs"); - let jobs: Vec = timeline + info!( + "running scheduled enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs" + ); + let jobs = timeline .gc_compaction_split_jobs( GcCompactJob::from_compact_options(options.clone()), options.sub_compaction_max_job_size_mb, @@ -223,6 +371,9 @@ impl GcCompactionQueue { let jobs_len = jobs.len(); let mut pending_tasks = Vec::new(); + // gc-compaction might pick more layers or fewer layers to compact. 
The L2 LSN does not need to be accurate. + // And therefore, we simply assume the maximum LSN of all jobs is the expected L2 LSN. + let expected_l2_lsn = jobs.iter().map(|job| job.compact_lsn_range.end).max(); for job in jobs { // Unfortunately we need to convert the `GcCompactJob` back to `CompactionOptions` // until we do further refactors to allow directly call `compact_with_gc`. @@ -240,10 +391,16 @@ impl GcCompactionQueue { }; pending_tasks.push(GcCompactionQueueItem::SubCompactionJob(options)); } - pending_tasks.push(GcCompactionQueueItem::Notify(id)); + + if !auto { + pending_tasks.push(GcCompactionQueueItem::Notify(id, None)); + } else { + pending_tasks.push(GcCompactionQueueItem::Notify(id, expected_l2_lsn)); + } + { let mut guard = self.inner.lock().unwrap(); - guard.gc_guards.insert(id, gc_guard); + guard.guards.entry(id).or_default().gc_guard = Some(gc_guard); let mut tasks = Vec::new(); for task in pending_tasks { let id = guard.next_id(); @@ -254,7 +411,10 @@ impl GcCompactionQueue { guard.queued.push_front(item); } } - info!("scheduled enhanced gc bottom-most compaction with sub-compaction, split into {} jobs", jobs_len); + info!( + "scheduled enhanced gc bottom-most compaction with sub-compaction, split into {} jobs", + jobs_len + ); } Ok(()) } @@ -267,29 +427,48 @@ impl GcCompactionQueue { gc_block: &GcBlock, timeline: &Arc, ) -> Result { - let _one_op_at_a_time_guard = self.consumer_lock.lock().await; - let has_pending_tasks; - let (id, item) = { - let mut guard = self.inner.lock().unwrap(); - let Some((id, item)) = guard.queued.pop_front() else { - return Ok(CompactionOutcome::Done); - }; - guard.running = Some((id, item.clone())); - has_pending_tasks = !guard.queued.is_empty(); - (id, item) + let Ok(_one_op_at_a_time_guard) = self.consumer_lock.try_lock() else { + return Err(CompactionError::AlreadyRunning( + "cannot run gc-compaction because another gc-compaction is running. This should not happen because we only call this function from the gc-compaction queue.", + )); + }; + let has_pending_tasks; + let Some((id, item)) = ({ + let mut guard = self.inner.lock().unwrap(); + if let Some((id, item)) = guard.queued.pop_front() { + guard.running = Some((id, item.clone())); + has_pending_tasks = !guard.queued.is_empty(); + Some((id, item)) + } else { + has_pending_tasks = false; + None + } + }) else { + self.trigger_auto_compaction(timeline).await; + // Always yield after triggering auto-compaction. Gc-compaction is a low-priority task and we + // have not implemented preemption mechanism yet. We always want to yield it to more important + // tasks if there is one. + return Ok(CompactionOutcome::Done); }; - match item { - GcCompactionQueueItem::Manual(options) => { + GcCompactionQueueItem::MetaJob { options, auto } => { if !options .flags .contains(CompactFlags::EnhancedGcBottomMostCompaction) { - warn!("ignoring scheduled compaction task: scheduled task must be gc compaction: {:?}", options); + warn!( + "ignoring scheduled compaction task: scheduled task must be gc compaction: {:?}", + options + ); } else if options.sub_compaction { - self.handle_sub_compaction(id, options, timeline, gc_block) + info!( + "running scheduled enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs" + ); + self.handle_sub_compaction(id, options, timeline, gc_block, auto) .await?; } else { + // Auto compaction always enables sub-compaction so we don't need to handle update_l2_lsn + // in this branch. 
let gc_guard = match gc_block.start().await { Ok(guard) => guard, Err(e) => { @@ -301,20 +480,37 @@ impl GcCompactionQueue { }; { let mut guard = self.inner.lock().unwrap(); - guard.gc_guards.insert(id, gc_guard); + guard.guards.entry(id).or_default().gc_guard = Some(gc_guard); } let _ = timeline.compact_with_options(cancel, options, ctx).await?; self.notify_and_unblock(id); } } GcCompactionQueueItem::SubCompactionJob(options) => { + // TODO: error handling, clear the queue if any task fails? let _ = timeline.compact_with_options(cancel, options, ctx).await?; } - GcCompactionQueueItem::Notify(id) => { + GcCompactionQueueItem::Notify(id, l2_lsn) => { self.notify_and_unblock(id); - } - GcCompactionQueueItem::UpdateL2Lsn(_) => { - unreachable!() + if let Some(l2_lsn) = l2_lsn { + let current_l2_lsn = timeline + .get_gc_compaction_state() + .map(|x| x.last_completed_lsn) + .unwrap_or(Lsn::INVALID); + if l2_lsn >= current_l2_lsn { + info!("l2_lsn updated to {}", l2_lsn); + timeline + .update_gc_compaction_state(GcCompactionState { + last_completed_lsn: l2_lsn, + }) + .map_err(CompactionError::Other)?; + } else { + warn!( + "l2_lsn updated to {} but it is less than the current l2_lsn {}", + l2_lsn, current_l2_lsn + ); + } + } } } { @@ -339,7 +535,6 @@ impl GcCompactionQueue { (guard.running.clone(), guard.queued.clone()) } - #[allow(dead_code)] pub fn remaining_jobs_num(&self) -> usize { let guard = self.inner.lock().unwrap(); guard.queued.len() + if guard.running.is_some() { 1 } else { 0 } @@ -771,34 +966,29 @@ impl Timeline { self.upload_new_image_layers(image_layers)?; if let LastImageLayerCreationStatus::Incomplete { .. } = outcome { // Yield and do not do any other kind of compaction. - info!("skipping shard ancestor compaction due to pending image layer generation tasks (preempted by L0 compaction)."); + info!( + "skipping shard ancestor compaction due to pending image layer generation tasks (preempted by L0 compaction)." + ); return Ok(CompactionOutcome::YieldForL0); } } - Err(err) => { - // no partitioning? This is normal, if the timeline was just created - // as an empty timeline. Also in unit tests, when we use the timeline - // as a simple key-value store, ignoring the datadir layout. Log the - // error but continue. - // - // Suppress error when it's due to cancellation - if !self.cancel.is_cancelled() && !err.is_cancelled() { - if let CompactionError::CollectKeySpaceError( - CollectKeySpaceError::Decode(_) - | CollectKeySpaceError::PageRead(PageReconstructError::MissingKey(_)), - ) = err - { - critical!("could not compact, repartitioning keyspace failed: {err:?}"); - } else { - tracing::error!( - "could not compact, repartitioning keyspace failed: {err:?}" - ); - } - } + + // Suppress errors when cancelled. + Err(_) if self.cancel.is_cancelled() => {} + Err(err) if err.is_cancel() => {} + + // Alert on critical errors that indicate data corruption. + Err(err) if err.is_critical() => { + critical!("could not compact, repartitioning keyspace failed: {err:?}"); } + + // Log other errors. No partitioning? This is normal, if the timeline was just created + // as an empty timeline. Also in unit tests, when we use the timeline as a simple + // key-value store, ignoring the datadir layout. Log the error but continue. + Err(err) => error!("could not compact, repartitioning keyspace failed: {err:?}"), }; - let partition_count = self.partitioning.read().0 .0.parts.len(); + let partition_count = self.partitioning.read().0.0.parts.len(); // 4. 
Shard ancestor compaction @@ -1007,7 +1197,7 @@ impl Timeline { Ok(()) => (), Err(WaitCompletionError::NotInitialized(ni)) => return Err(CompactionError::from(ni)), Err(WaitCompletionError::UploadQueueShutDownOrStopped) => { - return Err(CompactionError::ShuttingDown) + return Err(CompactionError::ShuttingDown); } } @@ -1022,7 +1212,7 @@ impl Timeline { /// /// The result may be used as an input to eviction and secondary downloads to de-prioritize layers /// that we know won't be needed for reads. - pub(super) async fn update_layer_visibility( + pub(crate) async fn update_layer_visibility( &self, ) -> Result<(), super::layer_manager::Shutdown> { let head_lsn = self.get_last_record_lsn(); @@ -1302,7 +1492,7 @@ impl Timeline { let last_record_lsn = self.get_last_record_lsn(); let min_hole_range = (target_file_size / page_cache::PAGE_SZ as u64) as i128; let min_hole_coverage_size = 3; // TODO: something more flexible? - // min-heap (reserve space for one more element added before eviction) + // min-heap (reserve space for one more element added before eviction) let mut heap: BinaryHeap = BinaryHeap::with_capacity(max_holes + 1); let mut prev: Option = None; @@ -2165,8 +2355,14 @@ impl Timeline { let allocated_space = (available_space as f64 * 0.8) as u64; /* reserve 20% space for other tasks */ if all_layer_size /* space needed for newly-generated file */ + remote_layer_size /* space for downloading layers */ > allocated_space { - return Err(anyhow!("not enough space for compaction: available_space={}, allocated_space={}, all_layer_size={}, remote_layer_size={}, required_space={}", - available_space, allocated_space, all_layer_size, remote_layer_size, all_layer_size + remote_layer_size)); + return Err(anyhow!( + "not enough space for compaction: available_space={}, allocated_space={}, all_layer_size={}, remote_layer_size={}, required_space={}", + available_space, + allocated_space, + all_layer_size, + remote_layer_size, + all_layer_size + remote_layer_size + )); } Ok(()) } @@ -2205,7 +2401,9 @@ impl Timeline { }; if compact_below_lsn == Lsn::INVALID { - tracing::warn!("no layers to compact with gc: gc_cutoff not generated yet, skipping gc bottom-most compaction"); + tracing::warn!( + "no layers to compact with gc: gc_cutoff not generated yet, skipping gc bottom-most compaction" + ); return Ok(vec![]); } @@ -2214,7 +2412,7 @@ impl Timeline { let sub_compaction_max_job_size_mb = sub_compaction_max_job_size_mb.unwrap_or(GC_COMPACT_MAX_SIZE_MB); - let mut compact_jobs = Vec::new(); + let mut compact_jobs = Vec::::new(); // For now, we simply use the key partitioning information; we should do a more fine-grained partitioning // by estimating the amount of files read for a compaction job. We should also partition on LSN. 
let ((dense_ks, sparse_ks), _) = self.partitioning.read().as_ref().clone(); @@ -2301,16 +2499,25 @@ impl Timeline { } else { end }; - info!( - "splitting compaction job: {}..{}, estimated_size={}", - start, end, total_size - ); - compact_jobs.push(GcCompactJob { - dry_run: job.dry_run, - compact_key_range: start..end, - compact_lsn_range: job.compact_lsn_range.start..compact_below_lsn, - }); - current_start = Some(end); + if total_size == 0 && !compact_jobs.is_empty() { + info!( + "splitting compaction job: {}..{}, estimated_size={}, extending the previous job", + start, end, total_size + ); + compact_jobs.last_mut().unwrap().compact_key_range.end = end; + current_start = Some(end); + } else { + info!( + "splitting compaction job: {}..{}, estimated_size={}", + start, end, total_size + ); + compact_jobs.push(GcCompactJob { + dry_run: job.dry_run, + compact_key_range: start..end, + compact_lsn_range: job.compact_lsn_range.start..compact_below_lsn, + }); + current_start = Some(end); + } } } Ok(compact_jobs) @@ -2341,7 +2548,9 @@ impl Timeline { let sub_compaction = options.sub_compaction; let job = GcCompactJob::from_compact_options(options.clone()); if sub_compaction { - info!("running enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs"); + info!( + "running enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs" + ); let jobs = self .gc_compaction_split_jobs(job, options.sub_compaction_max_job_size_mb) .await?; @@ -2393,7 +2602,13 @@ impl Timeline { let debug_mode = cfg!(debug_assertions) || cfg!(feature = "testing"); - info!("running enhanced gc bottom-most compaction, dry_run={dry_run}, compact_key_range={}..{}, compact_lsn_range={}..{}", compact_key_range.start, compact_key_range.end, compact_lsn_range.start, compact_lsn_range.end); + info!( + "running enhanced gc bottom-most compaction, dry_run={dry_run}, compact_key_range={}..{}, compact_lsn_range={}..{}", + compact_key_range.start, + compact_key_range.end, + compact_lsn_range.start, + compact_lsn_range.end + ); scopeguard::defer! { info!("done enhanced gc bottom-most compaction"); @@ -2422,7 +2637,9 @@ impl Timeline { let mut gc_cutoff = if compact_lsn_range.end == Lsn::MAX { if real_gc_cutoff == Lsn::INVALID { // If the gc_cutoff is not generated yet, we should not compact anything. - tracing::warn!("no layers to compact with gc: gc_cutoff not generated yet, skipping gc bottom-most compaction"); + tracing::warn!( + "no layers to compact with gc: gc_cutoff not generated yet, skipping gc bottom-most compaction" + ); return Ok(()); } real_gc_cutoff @@ -2430,7 +2647,10 @@ impl Timeline { compact_lsn_range.end }; if gc_cutoff > real_gc_cutoff { - warn!("provided compact_lsn_range.end={} is larger than the real_gc_cutoff={}, using the real gc cutoff", gc_cutoff, real_gc_cutoff); + warn!( + "provided compact_lsn_range.end={} is larger than the real_gc_cutoff={}, using the real gc cutoff", + gc_cutoff, real_gc_cutoff + ); gc_cutoff = real_gc_cutoff; } gc_cutoff @@ -2454,7 +2674,10 @@ impl Timeline { .map(|desc| desc.get_lsn_range().end) .max() else { - info!("no layers to compact with gc: no historic layers below gc_cutoff, gc_cutoff={}", gc_cutoff); + info!( + "no layers to compact with gc: no historic layers below gc_cutoff, gc_cutoff={}", + gc_cutoff + ); return Ok(()); }; // Next, if the user specifies compact_lsn_range.start, we need to filter some layers out. 
All the layers (strictly) below @@ -2472,7 +2695,10 @@ impl Timeline { .map(|desc| desc.get_lsn_range().start) .min() else { - info!("no layers to compact with gc: no historic layers above compact_above_lsn, compact_above_lsn={}", compact_lsn_range.end); + info!( + "no layers to compact with gc: no historic layers above compact_above_lsn, compact_above_lsn={}", + compact_lsn_range.end + ); return Ok(()); }; // Then, pick all the layers that are below the max_layer_lsn. This is to ensure we can pick all single-key @@ -2495,7 +2721,10 @@ impl Timeline { } } if selected_layers.is_empty() { - info!("no layers to compact with gc: no layers within the key range, gc_cutoff={}, key_range={}..{}", gc_cutoff, compact_key_range.start, compact_key_range.end); + info!( + "no layers to compact with gc: no layers within the key range, gc_cutoff={}, key_range={}..{}", + gc_cutoff, compact_key_range.start, compact_key_range.end + ); return Ok(()); } retain_lsns_below_horizon.sort(); @@ -2577,7 +2806,10 @@ impl Timeline { .map(|layer| layer.layer_desc().layer_name()) .collect_vec(); if let Some(err) = check_valid_layermap(&layer_names) { - bail!("gc-compaction layer map check failed because {}, cannot proceed with compaction due to potential data loss", err); + bail!( + "gc-compaction layer map check failed because {}, cannot proceed with compaction due to potential data loss", + err + ); } // The maximum LSN we are processing in this compaction loop let end_lsn = job_desc @@ -2984,7 +3216,10 @@ impl Timeline { // the writer, so potentially, we will need a function like `ImageLayerBatchWriter::get_all_pending_layer_keys` to get all the keys that are // in the writer before finalizing the persistent layers. Now we would leave some dangling layers on the disk if the check fails. 
if let Some(err) = check_valid_layermap(&final_layers) { - bail!("gc-compaction layer map check failed after compaction because {}, compaction result not applied to the layer map due to potential data loss", err); + bail!( + "gc-compaction layer map check failed after compaction because {}, compaction result not applied to the layer map due to potential data loss", + err + ); } // Between the sanity check and this compaction update, there could be new layers being flushed, but it should be fine because we only @@ -3049,7 +3284,8 @@ impl Timeline { if let Some(to) = compact_to_set.get(&layer.layer_desc().key()) { tracing::info!( "skipping delete {} because found same layer key at different generation {}", - layer, to + layer, + to ); } else { compact_from.push(layer.clone()); diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 841b2fa1c7..7cdc69e55f 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -1,26 +1,26 @@ -use std::{ - ops::{Deref, DerefMut}, - sync::Arc, -}; +use std::ops::{Deref, DerefMut}; +use std::sync::Arc; use anyhow::Context; -use pageserver_api::{models::TimelineState, shard::TenantShardId}; +use pageserver_api::models::TimelineState; +use pageserver_api::shard::TenantShardId; use remote_storage::DownloadError; use tokio::sync::OwnedMutexGuard; -use tracing::{error, info, info_span, instrument, Instrument}; -use utils::{crashsafe, fs_ext, id::TimelineId, pausable_failpoint}; +use tracing::{Instrument, error, info, info_span, instrument}; +use utils::id::TimelineId; +use utils::{crashsafe, fs_ext, pausable_failpoint}; -use crate::{ - config::PageServerConf, - task_mgr::{self, TaskKind}, - tenant::{ - metadata::TimelineMetadata, - remote_timeline_client::{PersistIndexPartWithDeletedFlagError, RemoteTimelineClient}, - CreateTimelineCause, DeleteTimelineError, MaybeDeletedIndexPart, Tenant, - TenantManifestError, Timeline, TimelineOrOffloaded, - }, - virtual_file::MaybeFatalIo, +use crate::config::PageServerConf; +use crate::task_mgr::{self, TaskKind}; +use crate::tenant::metadata::TimelineMetadata; +use crate::tenant::remote_timeline_client::{ + PersistIndexPartWithDeletedFlagError, RemoteTimelineClient, }; +use crate::tenant::{ + CreateTimelineCause, DeleteTimelineError, MaybeDeletedIndexPart, Tenant, TenantManifestError, + Timeline, TimelineOrOffloaded, +}; +use crate::virtual_file::MaybeFatalIo; /// Mark timeline as deleted in S3 so we won't pick it up next time /// during attach or pageserver restart. @@ -137,6 +137,11 @@ async fn remove_maybe_offloaded_timeline_from_tenant( timelines.remove(&timeline.timeline_id).expect( "timeline that we were deleting was concurrently removed from 'timelines' map", ); + tenant + .scheduled_compaction_tasks + .lock() + .unwrap() + .remove(&timeline.timeline_id); } TimelineOrOffloaded::Offloaded(timeline) => { let offloaded_timeline = timelines_offloaded @@ -300,6 +305,7 @@ impl DeleteTimelineFlow { // Thus we need to skip the validation here. 
CreateTimelineCause::Delete, crate::tenant::CreateTimelineIdempotency::FailWithConflict, // doesn't matter what we put here + None, // doesn't matter what we put here ) .context("create_timeline_struct")?; diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index e0084d3eef..71bd196344 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -1,25 +1,28 @@ -use std::{collections::HashSet, sync::Arc}; +use std::collections::HashSet; +use std::sync::Arc; -use super::{layer_manager::LayerManager, FlushLayerError, Timeline}; -use crate::{ - context::{DownloadBehavior, RequestContext}, - task_mgr::TaskKind, - tenant::{ - remote_timeline_client::index::GcBlockingReason::DetachAncestor, - storage_layer::{ - layer::local_layer_path, AsLayerDesc as _, DeltaLayerWriter, Layer, ResidentLayer, - }, - Tenant, - }, - virtual_file::{MaybeFatalIo, VirtualFile}, -}; use anyhow::Context; use http_utils::error::ApiError; -use pageserver_api::{models::detach_ancestor::AncestorDetached, shard::ShardIdentity}; +use pageserver_api::models::detach_ancestor::AncestorDetached; +use pageserver_api::shard::ShardIdentity; use tokio::sync::Semaphore; use tokio_util::sync::CancellationToken; use tracing::Instrument; -use utils::{completion, generation::Generation, id::TimelineId, lsn::Lsn}; +use utils::completion; +use utils::generation::Generation; +use utils::id::TimelineId; +use utils::lsn::Lsn; +use utils::sync::gate::GateError; + +use super::layer_manager::LayerManager; +use super::{FlushLayerError, Timeline}; +use crate::context::{DownloadBehavior, RequestContext}; +use crate::task_mgr::TaskKind; +use crate::tenant::Tenant; +use crate::tenant::remote_timeline_client::index::GcBlockingReason::DetachAncestor; +use crate::tenant::storage_layer::layer::local_layer_path; +use crate::tenant::storage_layer::{AsLayerDesc as _, DeltaLayerWriter, Layer, ResidentLayer}; +use crate::virtual_file::{MaybeFatalIo, VirtualFile}; #[derive(Debug, thiserror::Error)] pub(crate) enum Error { @@ -64,9 +67,10 @@ impl Error { where F: Fn(anyhow::Error) -> Error, { + use remote_storage::TimeoutOrCancel; + use crate::tenant::remote_timeline_client::WaitCompletionError; use crate::tenant::upload_queue::NotInitialized; - use remote_storage::TimeoutOrCancel; if e.is::() || TimeoutOrCancel::caused_by_cancel(&e) @@ -360,14 +364,25 @@ pub(super) async fn prepare( let mut tasks = tokio::task::JoinSet::new(); let limiter = Arc::new(Semaphore::new(options.copy_concurrency.get())); + let cancel_eval = CancellationToken::new(); for adopted in rest_of_historic { let limiter = limiter.clone(); let timeline = detached.clone(); + let cancel_eval = cancel_eval.clone(); tasks.spawn( async move { - let _permit = limiter.acquire().await; + let _permit = tokio::select! { + permit = limiter.acquire() => { + permit + } + // Wait for the cancellation here instead of letting the entire task be cancelled. + // Cancellations are racy in that they might leave layers on disk. + _ = cancel_eval.cancelled() => { + Err(Error::ShuttingDown)? 
+ } + }; let (owned, did_hardlink) = remote_copy( &adopted, &timeline, @@ -383,7 +398,22 @@ pub(super) async fn prepare( ); } + fn delete_layers(timeline: &Timeline, layers: Vec) -> Result<(), Error> { + // We are deleting layers, so we must hold the gate + let _gate = timeline.gate.enter().map_err(|e| match e { + GateError::GateClosed => Error::ShuttingDown, + })?; + { + layers.into_iter().for_each(|l: Layer| { + l.delete_on_drop(); + std::mem::drop(l); + }); + } + Ok(()) + } + let mut should_fsync = false; + let mut first_err = None; while let Some(res) = tasks.join_next().await { match res { Ok(Ok((owned, did_hardlink))) => { @@ -392,13 +422,24 @@ pub(super) async fn prepare( } new_layers.push(owned); } + + // Don't stop the evaluation on errors, so that we get the full set of hardlinked layers to delete. Ok(Err(failed)) => { - return Err(failed); + cancel_eval.cancel(); + first_err.get_or_insert(failed); + } + Err(je) => { + cancel_eval.cancel(); + first_err.get_or_insert(Error::Prepare(je.into())); } - Err(je) => return Err(Error::Prepare(je.into())), } } + if let Some(failed) = first_err { + delete_layers(detached, new_layers)?; + return Err(failed); + } + // fsync directory again if we hardlinked something if should_fsync { fsync_timeline_dir(detached, ctx).await; @@ -646,6 +687,11 @@ async fn remote_copy( let conf = adoptee.conf; let file_name = adopted.layer_desc().layer_name(); + // We don't want to shut the timeline down during this operation because we do `delete_on_drop` below + let _gate = adoptee.gate.enter().map_err(|e| match e { + GateError::GateClosed => Error::ShuttingDown, + })?; + // depending if Layer::keep_resident, do a hardlink let did_hardlink; let owned = if let Some(adopted_resident) = adopted.keep_resident().await { @@ -657,8 +703,32 @@ async fn remote_copy( &file_name, &metadata.generation, ); - std::fs::hard_link(adopted_path, &adoptee_path) - .map_err(|e| Error::launder(e.into(), Error::Prepare))?; + + match std::fs::hard_link(adopted_path, &adoptee_path) { + Ok(()) => {} + Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => { + // In theory we should not get into this situation as we are doing cleanups of the layer file after errors. + // However, we don't do cleanups for errors past `prepare`, so there is the slight chance to get to this branch. 
+ + // Double check that the file is orphan (probably from an earlier attempt), then delete it + let key = file_name.clone().into(); + if adoptee.layers.read().await.contains_key(&key) { + // We are supposed to filter out such cases before coming to this function + return Err(Error::Prepare(anyhow::anyhow!( + "layer file {file_name} already present and inside layer map" + ))); + } + tracing::info!("Deleting orphan layer file to make way for hard linking"); + // Delete orphan layer file and try again, to ensure this layer has a well understood source + std::fs::remove_file(adopted_path) + .map_err(|e| Error::launder(e.into(), Error::Prepare))?; + std::fs::hard_link(adopted_path, &adoptee_path) + .map_err(|e| Error::launder(e.into(), Error::Prepare))?; + } + Err(e) => { + return Err(Error::launder(e.into(), Error::Prepare)); + } + }; did_hardlink = true; Layer::for_resident(conf, adoptee, adoptee_path, file_name, metadata).drop_eviction_guard() } else { @@ -666,12 +736,21 @@ async fn remote_copy( Layer::for_evicted(conf, adoptee, file_name, metadata) }; - let layer = adoptee + let layer = match adoptee .remote_client .copy_timeline_layer(adopted, &owned, cancel) .await - .map(move |()| owned) - .map_err(|e| Error::launder(e, Error::Prepare))?; + { + Ok(()) => owned, + Err(e) => { + { + // Clean up the layer so that on a retry we don't get errors that the file already exists + owned.delete_on_drop(); + std::mem::drop(owned); + } + return Err(Error::launder(e, Error::Prepare)); + } + }; Ok((layer, did_hardlink)) } @@ -780,7 +859,7 @@ pub(super) async fn detach_and_reparent( // TODO: make sure there are no `?` before tenant_reset from after a questionmark from // here. panic!( - "bug: detach_and_reparent called on a timeline which has not been detached or which has no live ancestor" + "bug: detach_and_reparent called on a timeline which has not been detached or which has no live ancestor" ); } }; diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 77c33349e0..187d9f248e 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -13,34 +13,27 @@ //! Items with parentheses are not (yet) touched by this task. //! //! 
See write-up on restart on-demand download spike: -use std::{ - collections::HashMap, - ops::ControlFlow, - sync::Arc, - time::{Duration, SystemTime}, -}; +use std::collections::HashMap; +use std::ops::ControlFlow; +use std::sync::Arc; +use std::time::{Duration, SystemTime}; use pageserver_api::models::{EvictionPolicy, EvictionPolicyLayerAccessThreshold}; use tokio::time::Instant; use tokio_util::sync::CancellationToken; -use tracing::{debug, info, info_span, instrument, warn, Instrument}; - -use crate::{ - context::{DownloadBehavior, RequestContext}, - pgdatadir_mapping::CollectKeySpaceError, - task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}, - tenant::{ - size::CalculateSyntheticSizeError, - storage_layer::LayerVisibilityHint, - tasks::{sleep_random, BackgroundLoopKind, BackgroundLoopSemaphorePermit}, - timeline::EvictionError, - LogicalSizeCalculationCause, Tenant, - }, -}; - -use utils::{completion, sync::gate::GateGuard}; +use tracing::{Instrument, debug, info, info_span, instrument, warn}; +use utils::completion; +use utils::sync::gate::GateGuard; use super::Timeline; +use crate::context::{DownloadBehavior, RequestContext}; +use crate::pgdatadir_mapping::CollectKeySpaceError; +use crate::task_mgr::{self, BACKGROUND_RUNTIME, TaskKind}; +use crate::tenant::size::CalculateSyntheticSizeError; +use crate::tenant::storage_layer::LayerVisibilityHint; +use crate::tenant::tasks::{BackgroundLoopKind, BackgroundLoopSemaphorePermit, sleep_random}; +use crate::tenant::timeline::EvictionError; +use crate::tenant::{LogicalSizeCalculationCause, Tenant}; #[derive(Default)] pub struct EvictionTaskTimelineState { diff --git a/pageserver/src/tenant/timeline/handle.rs b/pageserver/src/tenant/timeline/handle.rs index 5b39daaaf8..67fb89c433 100644 --- a/pageserver/src/tenant/timeline/handle.rs +++ b/pageserver/src/tenant/timeline/handle.rs @@ -202,18 +202,13 @@ //! to the parent shard during a shard split. Eventually, the shard split task will //! shut down the parent => case (1). -use std::collections::hash_map; -use std::collections::HashMap; -use std::sync::Arc; -use std::sync::Mutex; -use std::sync::Weak; +use std::collections::{HashMap, hash_map}; +use std::sync::{Arc, Mutex, Weak}; use pageserver_api::shard::ShardIdentity; -use tracing::instrument; -use tracing::trace; +use tracing::{instrument, trace}; use utils::id::TimelineId; -use utils::shard::ShardIndex; -use utils::shard::ShardNumber; +use utils::shard::{ShardIndex, ShardNumber}; use crate::tenant::mgr::ShardSelector; @@ -631,12 +626,10 @@ impl HandleInner { mod tests { use std::sync::Weak; - use pageserver_api::{ - key::{rel_block_to_key, Key, DBDIR_KEY}, - models::ShardParameters, - reltag::RelTag, - shard::ShardStripeSize, - }; + use pageserver_api::key::{DBDIR_KEY, Key, rel_block_to_key}; + use pageserver_api::models::ShardParameters; + use pageserver_api::reltag::RelTag; + use pageserver_api::shard::ShardStripeSize; use utils::shard::ShardCount; use super::*; diff --git a/pageserver/src/tenant/timeline/heatmap_layers_downloader.rs b/pageserver/src/tenant/timeline/heatmap_layers_downloader.rs index 0ba9753e85..27243ba378 100644 --- a/pageserver/src/tenant/timeline/heatmap_layers_downloader.rs +++ b/pageserver/src/tenant/timeline/heatmap_layers_downloader.rs @@ -3,9 +3,10 @@ //! Provides utilities to spawn and abort a background task where the downloads happen. //! See /v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_heatmap_layers. 
+use std::sync::{Arc, Mutex}; + use futures::StreamExt; use http_utils::error::ApiError; -use std::sync::{Arc, Mutex}; use tokio_util::sync::CancellationToken; use utils::sync::gate::Gate; diff --git a/pageserver/src/tenant/timeline/import_pgdata.rs b/pageserver/src/tenant/timeline/import_pgdata.rs index 6940179ae9..8b94a114d6 100644 --- a/pageserver/src/tenant/timeline/import_pgdata.rs +++ b/pageserver/src/tenant/timeline/import_pgdata.rs @@ -1,14 +1,14 @@ use std::sync::Arc; -use anyhow::{bail, Context}; +use anyhow::{Context, bail}; use remote_storage::RemotePath; use tokio_util::sync::CancellationToken; -use tracing::{info, info_span, Instrument}; +use tracing::{Instrument, info, info_span}; use utils::lsn::Lsn; -use crate::{context::RequestContext, tenant::metadata::TimelineMetadata}; - use super::Timeline; +use crate::context::RequestContext; +use crate::tenant::metadata::TimelineMetadata; mod flow; mod importbucket_client; diff --git a/pageserver/src/tenant/timeline/import_pgdata/flow.rs b/pageserver/src/tenant/timeline/import_pgdata/flow.rs index 4388072606..3ef82b3658 100644 --- a/pageserver/src/tenant/timeline/import_pgdata/flow.rs +++ b/pageserver/src/tenant/timeline/import_pgdata/flow.rs @@ -28,52 +28,38 @@ //! An incomplete set of TODOs from the Hackathon: //! - version-specific CheckPointData (=> pgv abstraction, already exists for regular walingest) +use std::collections::HashSet; +use std::ops::Range; use std::sync::Arc; use anyhow::{bail, ensure}; use bytes::Bytes; - use itertools::Itertools; -use pageserver_api::{ - key::{rel_block_to_key, rel_dir_to_key, rel_size_to_key, relmap_file_key, DBDIR_KEY}, - reltag::RelTag, - shard::ShardIdentity, -}; -use postgres_ffi::{pg_constants, relfile_utils::parse_relfilename, BLCKSZ}; -use tokio::task::JoinSet; -use tracing::{debug, info_span, instrument, Instrument}; - -use crate::{ - assert_u64_eq_usize::UsizeIsU64, - pgdatadir_mapping::{SlruSegmentDirectory, TwoPhaseDirectory}, -}; -use crate::{ - context::{DownloadBehavior, RequestContext}, - pgdatadir_mapping::{DbDirectory, RelDirectory}, - task_mgr::TaskKind, - tenant::storage_layer::{ImageLayerWriter, Layer}, -}; - -use pageserver_api::key::Key; use pageserver_api::key::{ - slru_block_to_key, slru_dir_to_key, slru_segment_size_to_key, CHECKPOINT_KEY, CONTROLFILE_KEY, - TWOPHASEDIR_KEY, + CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, Key, TWOPHASEDIR_KEY, rel_block_to_key, + rel_dir_to_key, rel_size_to_key, relmap_file_key, slru_block_to_key, slru_dir_to_key, + slru_segment_size_to_key, }; -use pageserver_api::keyspace::singleton_range; -use pageserver_api::keyspace::{contiguous_range_len, is_contiguous_range}; -use pageserver_api::reltag::SlruKind; +use pageserver_api::keyspace::{contiguous_range_len, is_contiguous_range, singleton_range}; +use pageserver_api::reltag::{RelTag, SlruKind}; +use pageserver_api::shard::ShardIdentity; +use postgres_ffi::relfile_utils::parse_relfilename; +use postgres_ffi::{BLCKSZ, pg_constants}; +use remote_storage::RemotePath; +use tokio::task::JoinSet; +use tracing::{Instrument, debug, info_span, instrument}; use utils::bin_ser::BeSer; use utils::lsn::Lsn; -use std::collections::HashSet; -use std::ops::Range; - -use super::{ - importbucket_client::{ControlFile, RemoteStorageWrapper}, - Timeline, +use super::Timeline; +use super::importbucket_client::{ControlFile, RemoteStorageWrapper}; +use crate::assert_u64_eq_usize::UsizeIsU64; +use crate::context::{DownloadBehavior, RequestContext}; +use crate::pgdatadir_mapping::{ + DbDirectory, RelDirectory, 
SlruSegmentDirectory, TwoPhaseDirectory, }; - -use remote_storage::RemotePath; +use crate::task_mgr::TaskKind; +use crate::tenant::storage_layer::{ImageLayerWriter, Layer}; pub async fn run( timeline: Arc, diff --git a/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs b/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs index 68937e535d..a17a10d56b 100644 --- a/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs +++ b/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs @@ -1,4 +1,5 @@ -use std::{ops::Bound, sync::Arc}; +use std::ops::Bound; +use std::sync::Arc; use anyhow::Context; use bytes::Bytes; @@ -12,9 +13,9 @@ use tokio_util::sync::CancellationToken; use tracing::{debug, info, instrument}; use utils::lsn::Lsn; -use crate::{assert_u64_eq_usize::U64IsUsize, config::PageServerConf}; - use super::{importbucket_format, index_part_format}; +use crate::assert_u64_eq_usize::U64IsUsize; +use crate::config::PageServerConf; pub async fn new( conf: &'static PageServerConf, diff --git a/pageserver/src/tenant/timeline/import_pgdata/index_part_format.rs b/pageserver/src/tenant/timeline/import_pgdata/index_part_format.rs index 310d97a6a9..ea7a41b25f 100644 --- a/pageserver/src/tenant/timeline/import_pgdata/index_part_format.rs +++ b/pageserver/src/tenant/timeline/import_pgdata/index_part_format.rs @@ -1,7 +1,6 @@ -use serde::{Deserialize, Serialize}; - #[cfg(feature = "testing")] use camino::Utf8PathBuf; +use serde::{Deserialize, Serialize}; #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] pub enum Root { diff --git a/pageserver/src/tenant/timeline/import_pgdata/upcall_api.rs b/pageserver/src/tenant/timeline/import_pgdata/upcall_api.rs index c5210f9a30..7c7a4de2fc 100644 --- a/pageserver/src/tenant/timeline/import_pgdata/upcall_api.rs +++ b/pageserver/src/tenant/timeline/import_pgdata/upcall_api.rs @@ -1,13 +1,12 @@ //! FIXME: most of this is copy-paste from mgmt_api.rs ; dedupe into a `reqwest_utils::Client` crate. use pageserver_client::mgmt_api::{Error, ResponseErrorMessageExt}; +use reqwest::Method; use serde::{Deserialize, Serialize}; use tokio_util::sync::CancellationToken; use tracing::error; -use crate::config::PageServerConf; -use reqwest::Method; - use super::importbucket_format::Spec; +use crate::config::PageServerConf; pub struct Client { base_url: String, diff --git a/pageserver/src/tenant/timeline/init.rs b/pageserver/src/tenant/timeline/init.rs index 6634d07a0d..e952df0845 100644 --- a/pageserver/src/tenant/timeline/init.rs +++ b/pageserver/src/tenant/timeline/init.rs @@ -1,22 +1,16 @@ -use crate::{ - is_temporary, - tenant::{ - ephemeral_file::is_ephemeral_file, - remote_timeline_client::{ - self, - index::{IndexPart, LayerFileMetadata}, - }, - storage_layer::LayerName, - }, -}; +use std::collections::{HashMap, hash_map}; +use std::str::FromStr; + use anyhow::Context; use camino::{Utf8Path, Utf8PathBuf}; -use std::{ - collections::{hash_map, HashMap}, - str::FromStr, -}; use utils::lsn::Lsn; +use crate::is_temporary; +use crate::tenant::ephemeral_file::is_ephemeral_file; +use crate::tenant::remote_timeline_client::index::{IndexPart, LayerFileMetadata}; +use crate::tenant::remote_timeline_client::{self}; +use crate::tenant::storage_layer::LayerName; + /// Identified files in the timeline directory. 
pub(super) enum Discovered { /// The only one we care about diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index cb7783d779..e552ea83de 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -1,27 +1,22 @@ -use anyhow::{bail, ensure, Context}; +use std::collections::HashMap; +use std::sync::Arc; + +use anyhow::{Context, bail, ensure}; use itertools::Itertools; use pageserver_api::shard::TenantShardId; -use std::{collections::HashMap, sync::Arc}; use tracing::trace; -use utils::{ - id::TimelineId, - lsn::{AtomicLsn, Lsn}, -}; - -use crate::{ - config::PageServerConf, - context::RequestContext, - metrics::TimelineMetrics, - tenant::{ - layer_map::{BatchedUpdates, LayerMap}, - storage_layer::{ - AsLayerDesc, InMemoryLayer, Layer, PersistentLayerDesc, PersistentLayerKey, - ResidentLayer, - }, - }, -}; +use utils::id::TimelineId; +use utils::lsn::{AtomicLsn, Lsn}; use super::TimelineWriterState; +use crate::config::PageServerConf; +use crate::context::RequestContext; +use crate::metrics::TimelineMetrics; +use crate::tenant::layer_map::{BatchedUpdates, LayerMap}; +use crate::tenant::storage_layer::{ + AsLayerDesc, InMemoryLayer, Layer, LayerVisibilityHint, PersistentLayerDesc, + PersistentLayerKey, ResidentLayer, +}; /// Provides semantic APIs to manipulate the layer map. pub(crate) enum LayerManager { @@ -118,6 +113,12 @@ impl LayerManager { self.layers().values().filter(|l| l.is_likely_resident()) } + pub(crate) fn visible_layers(&self) -> impl Iterator + '_ { + self.layers() + .values() + .filter(|l| l.visibility() == LayerVisibilityHint::Visible) + } + pub(crate) fn contains(&self, layer: &Layer) -> bool { self.contains_key(&layer.layer_desc().key()) } @@ -208,9 +209,7 @@ impl OpenLayerManager { trace!( "creating in-memory layer at {}/{} for record at {}", - timeline_id, - start_lsn, - lsn + timeline_id, start_lsn, lsn ); let new_layer = diff --git a/pageserver/src/tenant/timeline/logical_size.rs b/pageserver/src/tenant/timeline/logical_size.rs index f4a4eea54a..397037ca9f 100644 --- a/pageserver/src/tenant/timeline/logical_size.rs +++ b/pageserver/src/tenant/timeline/logical_size.rs @@ -1,11 +1,10 @@ -use anyhow::Context; +use std::sync::atomic::{AtomicBool, AtomicI64, Ordering as AtomicOrdering}; +use anyhow::Context; use once_cell::sync::OnceCell; use tokio_util::sync::CancellationToken; use utils::lsn::Lsn; -use std::sync::atomic::{AtomicBool, AtomicI64, Ordering as AtomicOrdering}; - /// Internal structure to hold all data needed for logical size calculation. 
/// /// Calculation consists of two stages: diff --git a/pageserver/src/tenant/timeline/offload.rs b/pageserver/src/tenant/timeline/offload.rs index 93e5a1100d..43ffaa6aab 100644 --- a/pageserver/src/tenant/timeline/offload.rs +++ b/pageserver/src/tenant/timeline/offload.rs @@ -2,11 +2,11 @@ use std::sync::Arc; use pageserver_api::models::{TenantState, TimelineState}; -use super::delete::{delete_local_timeline_directory, DeletionGuard}; use super::Timeline; +use super::delete::{DeletionGuard, delete_local_timeline_directory}; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::remote_timeline_client::ShutdownIfArchivedError; -use crate::tenant::timeline::delete::{make_timeline_delete_guard, TimelineDeleteGuardKind}; +use crate::tenant::timeline::delete::{TimelineDeleteGuardKind, make_timeline_delete_guard}; use crate::tenant::{ DeleteTimelineError, OffloadedTimeline, Tenant, TenantManifestError, TimelineOrOffloaded, }; @@ -143,5 +143,12 @@ fn remove_timeline_from_tenant( .remove(&timeline.timeline_id) .expect("timeline that we were deleting was concurrently removed from 'timelines' map"); + // Clear the compaction queue for this timeline + tenant + .scheduled_compaction_tasks + .lock() + .unwrap() + .remove(&timeline.timeline_id); + Arc::strong_count(&timeline) } diff --git a/pageserver/src/tenant/timeline/uninit.rs b/pageserver/src/tenant/timeline/uninit.rs index 3074463384..f66c0ffa0f 100644 --- a/pageserver/src/tenant/timeline/uninit.rs +++ b/pageserver/src/tenant/timeline/uninit.rs @@ -1,18 +1,21 @@ -use std::{collections::hash_map::Entry, fs, future::Future, sync::Arc}; +use std::collections::hash_map::Entry; +use std::fs; +use std::future::Future; +use std::sync::Arc; use anyhow::Context; use camino::Utf8PathBuf; use tracing::{error, info, info_span}; -use utils::{fs_ext, id::TimelineId, lsn::Lsn, sync::gate::GateGuard}; - -use crate::{ - context::RequestContext, - import_datadir, - span::debug_assert_current_span_has_tenant_and_timeline_id, - tenant::{CreateTimelineError, CreateTimelineIdempotency, Tenant, TimelineOrOffloaded}, -}; +use utils::fs_ext; +use utils::id::TimelineId; +use utils::lsn::Lsn; +use utils::sync::gate::GateGuard; use super::Timeline; +use crate::context::RequestContext; +use crate::import_datadir; +use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; +use crate::tenant::{CreateTimelineError, CreateTimelineIdempotency, Tenant, TimelineOrOffloaded}; /// A timeline with some of its files on disk, being initialized. /// This struct ensures the atomicity of the timeline init: it's either properly created and inserted into pageserver's memory, or @@ -128,7 +131,7 @@ impl<'t> UninitializedTimeline<'t> { // We do not call Self::abort here. Because we don't cleanly shut down our Timeline, [`Self::drop`] should // skip trying to delete the timeline directory too. 
anyhow::bail!( - "Found freshly initialized timeline {tenant_shard_id}/{timeline_id} in the tenant map" + "Found freshly initialized timeline {tenant_shard_id}/{timeline_id} in the tenant map" ) } Entry::Vacant(v) => { diff --git a/pageserver/src/tenant/timeline/walreceiver.rs b/pageserver/src/tenant/timeline/walreceiver.rs index f831f5e48a..4f80073cc3 100644 --- a/pageserver/src/tenant/timeline/walreceiver.rs +++ b/pageserver/src/tenant/timeline/walreceiver.rs @@ -23,17 +23,11 @@ mod connection_manager; mod walreceiver_connection; -use crate::context::{DownloadBehavior, RequestContext}; -use crate::task_mgr::{TaskKind, WALRECEIVER_RUNTIME}; -use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id; -use crate::tenant::timeline::walreceiver::connection_manager::{ - connection_manager_loop_step, ConnectionManagerState, -}; - use std::future::Future; use std::num::NonZeroU64; use std::sync::Arc; use std::time::Duration; + use storage_broker::BrokerClientChannel; use tokio::sync::watch; use tokio_util::sync::CancellationToken; @@ -41,8 +35,13 @@ use tracing::*; use utils::postgres_client::PostgresClientProtocol; use self::connection_manager::ConnectionManagerStatus; - use super::Timeline; +use crate::context::{DownloadBehavior, RequestContext}; +use crate::task_mgr::{TaskKind, WALRECEIVER_RUNTIME}; +use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id; +use crate::tenant::timeline::walreceiver::connection_manager::{ + ConnectionManagerState, connection_manager_loop_step, +}; #[derive(Clone)] pub struct WalReceiverConf { @@ -56,6 +55,7 @@ pub struct WalReceiverConf { pub auth_token: Option>, pub availability_zone: Option, pub ingest_batch_size: u64, + pub validate_wal_contiguity: bool, } pub struct WalReceiver { diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index 65f9d39078..df2663f6bb 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -9,45 +9,42 @@ //! then a (re)connection happens, if necessary. //! Only WAL streaming task expects to be finished, other loops (storage broker, connection management) never exit unless cancelled explicitly via the dedicated channel. 
-use std::{collections::HashMap, num::NonZeroU64, ops::ControlFlow, sync::Arc, time::Duration}; +use std::collections::HashMap; +use std::num::NonZeroU64; +use std::ops::ControlFlow; +use std::sync::Arc; +use std::time::Duration; -use super::{TaskStateUpdate, WalReceiverConf}; +use anyhow::Context; +use chrono::{NaiveDateTime, Utc}; +use pageserver_api::models::TimelineState; +use postgres_connection::PgConnectionConfig; +use storage_broker::proto::{ + FilterTenantTimelineId, MessageType, SafekeeperDiscoveryRequest, SafekeeperDiscoveryResponse, + SubscribeByFilterRequest, TenantTimelineId as ProtoTenantTimelineId, TypeSubscription, + TypedMessage, +}; +use storage_broker::{BrokerClientChannel, Code, Streaming}; +use tokio_util::sync::CancellationToken; +use tracing::*; +use utils::backoff::{ + DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, exponential_backoff, +}; +use utils::id::{NodeId, TenantTimelineId}; +use utils::lsn::Lsn; +use utils::postgres_client::{ + ConnectionConfigArgs, PostgresClientProtocol, wal_stream_connection_config, +}; + +use super::walreceiver_connection::{WalConnectionStatus, WalReceiverError}; +use super::{TaskEvent, TaskHandle, TaskStateUpdate, WalReceiverConf}; use crate::context::{DownloadBehavior, RequestContext}; use crate::metrics::{ WALRECEIVER_ACTIVE_MANAGERS, WALRECEIVER_BROKER_UPDATES, WALRECEIVER_CANDIDATES_ADDED, WALRECEIVER_CANDIDATES_REMOVED, WALRECEIVER_SWITCHES, }; use crate::task_mgr::TaskKind; -use crate::tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline}; -use anyhow::Context; -use chrono::{NaiveDateTime, Utc}; -use pageserver_api::models::TimelineState; - -use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; -use storage_broker::proto::{ - FilterTenantTimelineId, MessageType, SafekeeperDiscoveryRequest, SafekeeperDiscoveryResponse, - SubscribeByFilterRequest, TypeSubscription, TypedMessage, -}; -use storage_broker::{BrokerClientChannel, Code, Streaming}; -use tokio_util::sync::CancellationToken; -use tracing::*; - -use postgres_connection::PgConnectionConfig; -use utils::backoff::{ - exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, -}; -use utils::postgres_client::{ - wal_stream_connection_config, ConnectionConfigArgs, PostgresClientProtocol, -}; -use utils::{ - id::{NodeId, TenantTimelineId}, - lsn::Lsn, -}; - -use super::{ - walreceiver_connection::WalConnectionStatus, walreceiver_connection::WalReceiverError, - TaskEvent, TaskHandle, -}; +use crate::tenant::{Timeline, debug_assert_current_span_has_tenant_and_timeline_id}; pub(crate) struct Cancelled; @@ -349,7 +346,9 @@ async fn subscribe_for_timeline_updates( Err(e) => { // Safekeeper nodes can stop pushing timeline updates to the broker, when no new writes happen and // entire WAL is streamed. Keep this noticeable with logging, but do not warn/error. 
- info!("Attempt #{attempt}, failed to subscribe for timeline {id} updates in broker: {e:#}"); + info!( + "Attempt #{attempt}, failed to subscribe for timeline {id} updates in broker: {e:#}" + ); continue; } } @@ -512,11 +511,11 @@ impl ConnectionManagerState { fn spawn( &self, task: impl FnOnce( - tokio::sync::watch::Sender>, - CancellationToken, - ) -> Fut - + Send - + 'static, + tokio::sync::watch::Sender>, + CancellationToken, + ) -> Fut + + Send + + 'static, ) -> TaskHandle where Fut: std::future::Future> + Send, @@ -537,6 +536,7 @@ impl ConnectionManagerState { let connect_timeout = self.conf.wal_connect_timeout; let ingest_batch_size = self.conf.ingest_batch_size; let protocol = self.conf.protocol; + let validate_wal_contiguity = self.conf.validate_wal_contiguity; let timeline = Arc::clone(&self.timeline); let ctx = ctx.detached_child( TaskKind::WalReceiverConnectionHandler, @@ -558,6 +558,7 @@ impl ConnectionManagerState { ctx, node_id, ingest_batch_size, + validate_wal_contiguity, ) .await; @@ -878,8 +879,7 @@ impl ConnectionManagerState { discovered_new_wal = if candidate_commit_lsn > current_commit_lsn { trace!( "New candidate has commit_lsn {}, higher than current_commit_lsn {}", - candidate_commit_lsn, - current_commit_lsn + candidate_commit_lsn, current_commit_lsn ); Some(NewCommittedWAL { lsn: candidate_commit_lsn, @@ -1046,7 +1046,9 @@ impl ConnectionManagerState { if !node_ids_to_remove.is_empty() { for node_id in node_ids_to_remove { - info!("Safekeeper node {node_id} did not send events for over {lagging_wal_timeout:?}, not retrying the connections"); + info!( + "Safekeeper node {node_id} did not send events for over {lagging_wal_timeout:?}, not retrying the connections" + ); self.wal_connection_retries.remove(&node_id); WALRECEIVER_CANDIDATES_REMOVED.inc(); } @@ -1117,11 +1119,12 @@ impl ReconnectReason { #[cfg(test)] mod tests { - use super::*; - use crate::tenant::harness::{TenantHarness, TIMELINE_ID}; use pageserver_api::config::defaults::DEFAULT_WAL_RECEIVER_PROTOCOL; use url::Host; + use super::*; + use crate::tenant::harness::{TIMELINE_ID, TenantHarness}; + fn dummy_broker_sk_timeline( commit_lsn: u64, safekeeper_connstr: &str, @@ -1563,6 +1566,7 @@ mod tests { auth_token: None, availability_zone: None, ingest_batch_size: 1, + validate_wal_contiguity: false, }, wal_connection: None, wal_stream_candidates: HashMap::new(), diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 23db4f88d2..f41a9cfe82 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -1,46 +1,48 @@ //! Actual Postgres connection handler to stream WAL to the server. 
-use std::{ - error::Error, - pin::pin, - str::FromStr, - sync::Arc, - time::{Duration, SystemTime}, -}; +use std::error::Error; +use std::pin::pin; +use std::str::FromStr; +use std::sync::Arc; +use std::time::{Duration, SystemTime}; -use anyhow::{anyhow, Context}; +use anyhow::{Context, anyhow}; use bytes::BytesMut; use chrono::{NaiveDateTime, Utc}; use fail::fail_point; use futures::StreamExt; -use postgres::{error::SqlState, SimpleQueryMessage, SimpleQueryRow}; -use postgres_ffi::WAL_SEGMENT_SIZE; -use postgres_ffi::{v14::xlog_utils::normalize_lsn, waldecoder::WalDecodeError}; -use postgres_protocol::message::backend::ReplicationMessage; -use postgres_types::PgLsn; -use tokio::{select, sync::watch, time}; -use tokio_postgres::{replication::ReplicationStream, Client}; -use tokio_util::sync::CancellationToken; -use tracing::{debug, error, info, trace, warn, Instrument}; -use wal_decoder::{ - models::{FlushUncommittedRecords, InterpretedWalRecord, InterpretedWalRecords}, - wire_format::FromWireFormat, -}; - -use super::TaskStateUpdate; -use crate::{ - context::RequestContext, - metrics::{LIVE_CONNECTIONS, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST}, - pgdatadir_mapping::DatadirModification, - task_mgr::{TaskKind, WALRECEIVER_RUNTIME}, - tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo}, - walingest::WalIngest, -}; use postgres_backend::is_expected_io_error; use postgres_connection::PgConnectionConfig; -use postgres_ffi::waldecoder::WalStreamDecoder; -use utils::{critical, id::NodeId, lsn::Lsn, postgres_client::PostgresClientProtocol}; -use utils::{pageserver_feedback::PageserverFeedback, sync::gate::GateError}; +use postgres_ffi::WAL_SEGMENT_SIZE; +use postgres_ffi::v14::xlog_utils::normalize_lsn; +use postgres_ffi::waldecoder::{WalDecodeError, WalStreamDecoder}; +use postgres_protocol::message::backend::ReplicationMessage; +use postgres_types::PgLsn; +use tokio::sync::watch; +use tokio::{select, time}; +use tokio_postgres::error::SqlState; +use tokio_postgres::replication::ReplicationStream; +use tokio_postgres::{Client, SimpleQueryMessage, SimpleQueryRow}; +use tokio_util::sync::CancellationToken; +use tracing::{Instrument, debug, error, info, trace, warn}; +use utils::critical; +use utils::id::NodeId; +use utils::lsn::Lsn; +use utils::pageserver_feedback::PageserverFeedback; +use utils::postgres_client::PostgresClientProtocol; +use utils::sync::gate::GateError; +use wal_decoder::models::{FlushUncommittedRecords, InterpretedWalRecord, InterpretedWalRecords}; +use wal_decoder::wire_format::FromWireFormat; + +use super::TaskStateUpdate; +use crate::context::RequestContext; +use crate::metrics::{LIVE_CONNECTIONS, WAL_INGEST, WALRECEIVER_STARTED_CONNECTIONS}; +use crate::pgdatadir_mapping::DatadirModification; +use crate::task_mgr::{TaskKind, WALRECEIVER_RUNTIME}; +use crate::tenant::{ + Timeline, WalReceiverInfo, debug_assert_current_span_has_tenant_and_timeline_id, +}; +use crate::walingest::WalIngest; /// Status of the connection. #[derive(Debug, Clone, Copy)] @@ -64,7 +66,7 @@ pub(super) struct WalConnectionStatus { pub(super) enum WalReceiverError { /// An error of a type that does not indicate an issue, e.g. a connection closing - ExpectedSafekeeperError(postgres::Error), + ExpectedSafekeeperError(tokio_postgres::Error), /// An "error" message that carries a SUCCESSFUL_COMPLETION status code. 
Carries /// the message part of the original postgres error SuccessfulCompletion(String), @@ -120,6 +122,7 @@ pub(super) async fn handle_walreceiver_connection( ctx: RequestContext, safekeeper_node: NodeId, ingest_batch_size: u64, + validate_wal_contiguity: bool, ) -> Result<(), WalReceiverError> { debug_assert_current_span_has_tenant_and_timeline_id(); @@ -142,13 +145,15 @@ pub(super) async fn handle_walreceiver_connection( let mut config = wal_source_connconf.to_tokio_postgres_config(); config.application_name(format!("pageserver-{}", timeline.conf.id.0).as_str()); config.replication_mode(tokio_postgres::config::ReplicationMode::Physical); - match time::timeout(connect_timeout, config.connect(postgres::NoTls)).await { + match time::timeout(connect_timeout, config.connect(tokio_postgres::NoTls)).await { Ok(client_and_conn) => client_and_conn?, Err(_elapsed) => { // Timing out to connect to a safekeeper node could happen long time, due to // many reasons that pageserver cannot control. // Do not produce an error, but make it visible, that timeouts happen by logging the `event. - info!("Timed out while waiting {connect_timeout:?} for walreceiver connection to open"); + info!( + "Timed out while waiting {connect_timeout:?} for walreceiver connection to open" + ); return Ok(()); } } @@ -165,7 +170,9 @@ pub(super) async fn handle_walreceiver_connection( node: safekeeper_node, }; if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status)) { - warn!("Wal connection event listener dropped right after connection init, aborting the connection: {e}"); + warn!( + "Wal connection event listener dropped right after connection init, aborting the connection: {e}" + ); return Ok(()); } @@ -226,7 +233,9 @@ pub(super) async fn handle_walreceiver_connection( connection_status.latest_wal_update = Utc::now().naive_utc(); connection_status.commit_lsn = Some(end_of_wal); if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status)) { - warn!("Wal connection event listener dropped after IDENTIFY_SYSTEM, aborting the connection: {e}"); + warn!( + "Wal connection event listener dropped after IDENTIFY_SYSTEM, aborting the connection: {e}" + ); return Ok(()); } @@ -253,7 +262,9 @@ pub(super) async fn handle_walreceiver_connection( // to the safekeepers. startpoint = normalize_lsn(startpoint, WAL_SEGMENT_SIZE); - info!("last_record_lsn {last_rec_lsn} starting replication from {startpoint}, safekeeper is at {end_of_wal}..."); + info!( + "last_record_lsn {last_rec_lsn} starting replication from {startpoint}, safekeeper is at {end_of_wal}..." + ); let query = format!("START_REPLICATION PHYSICAL {startpoint}"); @@ -274,6 +285,7 @@ pub(super) async fn handle_walreceiver_connection( } => Some((format, compression)), }; + let mut expected_wal_start = startpoint; while let Some(replication_message) = { select! { _ = cancellation.cancelled() => { @@ -340,13 +352,49 @@ pub(super) async fn handle_walreceiver_connection( ) })?; + // Guard against WAL gaps. If the start LSN of the PG WAL section + // from which the interpreted records were extracted, doesn't match + // the end of the previous batch (or the starting point for the first batch), + // then kill this WAL receiver connection and start a new one. 
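The comment above condenses to a three-way decision on the batch's starting LSN: starting ahead of the expected position means WAL bytes were skipped (fatal), starting behind is tolerated for sharded ingest as long as the batch still carries at least one record newer than the last one ingested, and starting exactly at the expected position is the normal case. A standalone sketch of that rule, using illustrative names and a plain `u64` stand-in for `Lsn` rather than the identifiers in the hunk that follows:

    use std::cmp::Ordering;

    enum ContiguityCheck {
        Ok,
        /// Bytes in [expected, got) never reached us: tear down the connection.
        Gap { expected: u64, got: u64 },
        /// The batch only repeats records we have already ingested: also fatal.
        StaleBatch,
    }

    fn check_contiguity(
        expected_start: u64,
        batch_start: u64,
        first_record_end: Option<u64>,
        last_ingested: u64,
    ) -> ContiguityCheck {
        match batch_start.cmp(&expected_start) {
            Ordering::Greater => ContiguityCheck::Gap {
                expected: expected_start,
                got: batch_start,
            },
            // Other shards may legitimately read WAL behind us; reject the batch
            // only if it contains nothing we have not seen before.
            Ordering::Less => match first_record_end {
                Some(lsn) if lsn < last_ingested => ContiguityCheck::StaleBatch,
                _ => ContiguityCheck::Ok,
            },
            Ordering::Equal => ContiguityCheck::Ok,
        }
    }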
+ if validate_wal_contiguity { + if let Some(raw_wal_start_lsn) = batch.raw_wal_start_lsn { + match raw_wal_start_lsn.cmp(&expected_wal_start) { + std::cmp::Ordering::Greater => { + let msg = format!( + "Gap in streamed WAL: [{}, {})", + expected_wal_start, raw_wal_start_lsn + ); + critical!("{msg}"); + return Err(WalReceiverError::Other(anyhow!(msg))); + } + std::cmp::Ordering::Less => { + // Other shards are reading WAL behind us. + // This is valid, but check that we received records + // that we haven't seen before. + if let Some(first_rec) = batch.records.first() { + if first_rec.next_record_lsn < last_rec_lsn { + let msg = format!( + "Received record with next_record_lsn multiple times ({} < {})", + first_rec.next_record_lsn, expected_wal_start + ); + critical!("{msg}"); + return Err(WalReceiverError::Other(anyhow!(msg))); + } + } + } + std::cmp::Ordering::Equal => {} + } + } + } + let InterpretedWalRecords { records, next_record_lsn, + raw_wal_start_lsn: _, } = batch; tracing::debug!( - "Received WAL up to {} with next_record_lsn={:?}", + "Received WAL up to {} with next_record_lsn={}", streaming_lsn, next_record_lsn ); @@ -423,12 +471,11 @@ pub(super) async fn handle_walreceiver_connection( // need to advance last record LSN on all shards. If we've not ingested the latest // record, then set the LSN of the modification past it. This way all shards // advance their last record LSN at the same time. - let needs_last_record_lsn_advance = match next_record_lsn { - Some(lsn) if lsn > modification.get_lsn() => { - modification.set_lsn(lsn).unwrap(); - true - } - _ => false, + let needs_last_record_lsn_advance = if next_record_lsn > modification.get_lsn() { + modification.set_lsn(next_record_lsn).unwrap(); + true + } else { + false }; if uncommitted_records > 0 || needs_last_record_lsn_advance { @@ -446,9 +493,8 @@ pub(super) async fn handle_walreceiver_connection( timeline.get_last_record_lsn() ); - if let Some(lsn) = next_record_lsn { - last_rec_lsn = lsn; - } + last_rec_lsn = next_record_lsn; + expected_wal_start = streaming_lsn; Some(streaming_lsn) } @@ -590,7 +636,9 @@ pub(super) async fn handle_walreceiver_connection( let timestamp = keepalive.timestamp(); let reply_requested = keepalive.reply() != 0; - trace!("received PrimaryKeepAlive(wal_end: {wal_end}, timestamp: {timestamp:?} reply: {reply_requested})"); + trace!( + "received PrimaryKeepAlive(wal_end: {wal_end}, timestamp: {timestamp:?} reply: {reply_requested})" + ); if reply_requested { Some(last_rec_lsn) diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs index d302205ffe..d5dc9666ce 100644 --- a/pageserver/src/tenant/upload_queue.rs +++ b/pageserver/src/tenant/upload_queue.rs @@ -1,21 +1,18 @@ use std::collections::{HashMap, HashSet, VecDeque}; use std::fmt::Debug; -use std::sync::atomic::AtomicU32; use std::sync::Arc; - -use super::remote_timeline_client::is_same_remote_layer_path; -use super::storage_layer::AsLayerDesc as _; -use super::storage_layer::LayerName; -use super::storage_layer::ResidentLayer; -use crate::tenant::metadata::TimelineMetadata; -use crate::tenant::remote_timeline_client::index::IndexPart; -use crate::tenant::remote_timeline_client::index::LayerFileMetadata; -use utils::generation::Generation; -use utils::lsn::{AtomicLsn, Lsn}; +use std::sync::atomic::AtomicU32; use chrono::NaiveDateTime; use once_cell::sync::Lazy; use tracing::info; +use utils::generation::Generation; +use utils::lsn::{AtomicLsn, Lsn}; + +use 
super::remote_timeline_client::is_same_remote_layer_path; +use super::storage_layer::{AsLayerDesc as _, LayerName, ResidentLayer}; +use crate::tenant::metadata::TimelineMetadata; +use crate::tenant::remote_timeline_client::index::{IndexPart, LayerFileMetadata}; /// Kill switch for upload queue reordering in case it causes problems. /// TODO: remove this once we have confidence in it. @@ -225,7 +222,7 @@ impl UploadQueueInitialized { // most one of them can be an index upload (enforced by can_bypass). .scan(&self.clean.0, |next_active_index, op| { let active_index = *next_active_index; - if let UploadOp::UploadMetadata { ref uploaded } = op { + if let UploadOp::UploadMetadata { uploaded } = op { *next_active_index = uploaded; // stash index for next operation after this } Some((op, active_index)) @@ -562,16 +559,18 @@ impl UploadOp { #[cfg(test)] mod tests { - use super::*; - use crate::tenant::harness::{TenantHarness, TIMELINE_ID}; - use crate::tenant::storage_layer::layer::local_layer_path; - use crate::tenant::storage_layer::Layer; - use crate::tenant::Timeline; - use crate::DEFAULT_PG_VERSION; - use itertools::Itertools as _; use std::str::FromStr as _; + + use itertools::Itertools as _; use utils::shard::{ShardCount, ShardIndex, ShardNumber}; + use super::*; + use crate::DEFAULT_PG_VERSION; + use crate::tenant::Timeline; + use crate::tenant::harness::{TIMELINE_ID, TenantHarness}; + use crate::tenant::storage_layer::Layer; + use crate::tenant::storage_layer::layer::local_layer_path; + /// Test helper which asserts that two operations are the same, in lieu of UploadOp PartialEq. #[track_caller] fn assert_same_op(a: &UploadOp, b: &UploadOp) { @@ -690,10 +689,22 @@ mod tests { let tli = make_timeline(); let index = Box::new(queue.clean.0.clone()); // empty, doesn't matter - let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer3 = make_layer(&tli, "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer0 = make_layer( + &tli, + "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer1 = make_layer( + &tli, + "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer2 = make_layer( + &tli, + "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer3 = make_layer( + &tli, + "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); let (barrier, _) = tokio::sync::watch::channel(()); // Enqueue non-conflicting upload, delete, and index before and after a barrier. @@ -757,10 +768,22 @@ mod tests { let tli = make_timeline(); // Enqueue a bunch of deletes, some with conflicting names. 
- let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer3 = make_layer(&tli, "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer0 = make_layer( + &tli, + "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer1 = make_layer( + &tli, + "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer2 = make_layer( + &tli, + "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer3 = make_layer( + &tli, + "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); let ops = [ UploadOp::Delete(Delete { @@ -802,9 +825,21 @@ mod tests { let tli = make_timeline(); // Enqueue three versions of the same layer, with different file sizes. - let layer0a = make_layer_with_size(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", 1); - let layer0b = make_layer_with_size(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", 2); - let layer0c = make_layer_with_size(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", 3); + let layer0a = make_layer_with_size( + &tli, + "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + 1, + ); + let layer0b = make_layer_with_size( + &tli, + "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + 2, + ); + let layer0c = make_layer_with_size( + &tli, + "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + 3, + ); let ops = [ UploadOp::UploadLayer(layer0a.clone(), layer0a.metadata(), None), @@ -836,8 +871,14 @@ mod tests { // Enqueue two layer uploads, with a delete of both layers in between them. These should be // scheduled one at a time, since deletes can't bypass uploads and vice versa. - let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer0 = make_layer( + &tli, + "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer1 = make_layer( + &tli, + "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); let ops = [ UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None), @@ -878,10 +919,22 @@ mod tests { // // Also enqueue non-conflicting uploads and deletes at the end. These can bypass the queue // and run immediately. 
- let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer3 = make_layer(&tli, "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer0 = make_layer( + &tli, + "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer1 = make_layer( + &tli, + "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer2 = make_layer( + &tli, + "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer3 = make_layer( + &tli, + "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); let ops = [ UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None), @@ -916,9 +969,18 @@ mod tests { let tli = make_timeline(); // Enqueue three different layer uploads. - let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer0 = make_layer( + &tli, + "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer1 = make_layer( + &tli, + "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer2 = make_layer( + &tli, + "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); let ops = [ UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None), @@ -981,11 +1043,20 @@ mod tests { // Enqueue three uploads of the current empty index. 
let index = Box::new(queue.clean.0.clone()); - let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer0 = make_layer( + &tli, + "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); let index0 = index_with(&index, &layer0); - let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer1 = make_layer( + &tli, + "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); let index1 = index_with(&index0, &layer1); - let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer2 = make_layer( + &tli, + "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); let index2 = index_with(&index1, &layer2); let ops = [ @@ -1045,7 +1116,10 @@ mod tests { let tli = make_timeline(); // Create a layer to upload. - let layer = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer = make_layer( + &tli, + "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); let index_upload = index_with(&queue.clean.0, &layer); // Remove the layer reference in a new index, then delete the layer. @@ -1090,7 +1164,10 @@ mod tests { let tli = make_timeline(); // Create a layer to upload. - let layer = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer = make_layer( + &tli, + "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); // Upload the layer. Then dereference the layer, and upload/reference it again. let index_upload = index_with(&queue.clean.0, &layer); @@ -1138,10 +1215,22 @@ mod tests { let tli = make_timeline(); let index = Box::new(queue.clean.0.clone()); // empty, doesn't matter - let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer3 = make_layer(&tli, "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer0 = make_layer( + &tli, + "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer1 = make_layer( + &tli, + "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer2 = make_layer( + &tli, + "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer3 = make_layer( + &tli, + "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); // Enqueue non-conflicting upload, delete, and index before and after a shutdown. 
let ops = [ @@ -1197,10 +1286,22 @@ mod tests { let tli = make_timeline(); // Enqueue a bunch of uploads. - let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer3 = make_layer(&tli, "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer0 = make_layer( + &tli, + "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer1 = make_layer( + &tli, + "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer2 = make_layer( + &tli, + "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer3 = make_layer( + &tli, + "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); let ops = [ UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None), diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs index 47fb4a276b..dcf17a376c 100644 --- a/pageserver/src/tenant/vectored_blob_io.rs +++ b/pageserver/src/tenant/vectored_blob_io.rs @@ -27,8 +27,7 @@ use utils::vec_map::VecMap; use crate::context::RequestContext; use crate::tenant::blob_io::{BYTE_UNCOMPRESSED, BYTE_ZSTD, LEN_COMPRESSION_BIT_MASK}; -use crate::virtual_file::IoBufferMut; -use crate::virtual_file::{self, VirtualFile}; +use crate::virtual_file::{self, IoBufferMut, VirtualFile}; /// Metadata bundled with the start and end offset of a blob. #[derive(Copy, Clone, Debug)] @@ -139,7 +138,10 @@ impl VectoredBlob { bits => { let error = std::io::Error::new( std::io::ErrorKind::InvalidData, - format!("Failed to decompress blob for {}@{}, {}..{}: invalid compression byte {bits:x}", self.meta.key, self.meta.lsn, self.start, self.end), + format!( + "Failed to decompress blob for {}@{}, {}..{}: invalid compression byte {bits:x}", + self.meta.key, self.meta.lsn, self.start, self.end + ), ); Err(error) } @@ -677,13 +679,12 @@ impl StreamingVectoredReadPlanner { mod tests { use anyhow::Error; + use super::super::blob_io::tests::{random_array, write_maybe_compressed}; + use super::*; use crate::context::DownloadBehavior; use crate::page_cache::PAGE_SZ; use crate::task_mgr::TaskKind; - use super::super::blob_io::tests::{random_array, write_maybe_compressed}; - use super::*; - fn validate_read(read: &VectoredRead, offset_range: &[(Key, Lsn, u64, BlobFlag)]) { const ALIGN: u64 = virtual_file::get_io_buffer_alignment() as u64; assert_eq!(read.start % ALIGN, 0); diff --git a/pageserver/src/utilization.rs b/pageserver/src/utilization.rs index 093a944777..29d1a31aaf 100644 --- a/pageserver/src/utilization.rs +++ b/pageserver/src/utilization.rs @@ -3,13 +3,15 @@ //! The metric is exposed via `GET /v1/utilization`. Refer and maintain it's openapi spec as the //! truth. 
-use anyhow::Context; use std::path::Path; + +use anyhow::Context; +use pageserver_api::models::PageserverUtilization; use utils::serde_percent::Percent; -use pageserver_api::models::PageserverUtilization; - -use crate::{config::PageServerConf, metrics::NODE_UTILIZATION_SCORE, tenant::mgr::TenantManager}; +use crate::config::PageServerConf; +use crate::metrics::NODE_UTILIZATION_SCORE; +use crate::tenant::mgr::TenantManager; pub(crate) fn regenerate( conf: &PageServerConf, diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index c966ad813f..b47aecf8a6 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -11,11 +11,13 @@ //! This is similar to PostgreSQL's virtual file descriptor facility in //! src/backend/storage/file/fd.c //! -use crate::context::RequestContext; -use crate::metrics::{StorageIoOperation, STORAGE_IO_SIZE, STORAGE_IO_TIME_METRIC}; +use std::fs::File; +use std::io::{Error, ErrorKind, Seek, SeekFrom}; +use std::os::fd::{AsRawFd, FromRawFd, IntoRawFd, OwnedFd, RawFd}; +#[cfg(target_os = "linux")] +use std::os::unix::fs::OpenOptionsExt; +use std::sync::atomic::{AtomicBool, AtomicU8, AtomicUsize, Ordering}; -use crate::page_cache::{PageWriteGuard, PAGE_SZ}; -use crate::tenant::TENANTS_SEGMENT_NAME; use camino::{Utf8Path, Utf8PathBuf}; use once_cell::sync::OnceCell; use owned_buffers_io::aligned_buffer::buffer::AlignedBuffer; @@ -23,31 +25,30 @@ use owned_buffers_io::aligned_buffer::{AlignedBufferMut, AlignedSlice, ConstAlig use owned_buffers_io::io_buf_aligned::{IoBufAligned, IoBufAlignedMut}; use owned_buffers_io::io_buf_ext::FullSlice; use pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT; +pub use pageserver_api::models::virtual_file as api; use pageserver_api::shard::TenantShardId; -use std::fs::File; -use std::io::{Error, ErrorKind, Seek, SeekFrom}; -#[cfg(target_os = "linux")] -use std::os::unix::fs::OpenOptionsExt; -use tokio_epoll_uring::{BoundedBuf, IoBuf, IoBufMut, Slice}; - -use std::os::fd::{AsRawFd, FromRawFd, IntoRawFd, OwnedFd, RawFd}; -use std::sync::atomic::{AtomicBool, AtomicU8, AtomicUsize, Ordering}; use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard}; use tokio::time::Instant; +use tokio_epoll_uring::{BoundedBuf, IoBuf, IoBufMut, Slice}; -pub use pageserver_api::models::virtual_file as api; +use crate::context::RequestContext; +use crate::metrics::{STORAGE_IO_SIZE, STORAGE_IO_TIME_METRIC, StorageIoOperation}; +use crate::page_cache::{PAGE_SZ, PageWriteGuard}; +use crate::tenant::TENANTS_SEGMENT_NAME; pub(crate) mod io_engine; -pub use io_engine::feature_test as io_engine_feature_test; -pub use io_engine::io_engine_for_bench; -pub use io_engine::FeatureTestResult as IoEngineFeatureTestResult; +pub use io_engine::{ + FeatureTestResult as IoEngineFeatureTestResult, feature_test as io_engine_feature_test, + io_engine_for_bench, +}; mod metadata; mod open_options; -use self::owned_buffers_io::write::OwnedAsyncWriter; pub(crate) use api::IoMode; pub(crate) use io_engine::IoEngineKind; pub(crate) use metadata::Metadata; pub(crate) use open_options::*; +use self::owned_buffers_io::write::OwnedAsyncWriter; + pub(crate) mod owned_buffers_io { //! Abstractions for IO with owned buffers. //! 
@@ -1078,7 +1079,8 @@ where #[cfg(test)] mod test_read_exact_at_impl { - use std::{collections::VecDeque, sync::Arc}; + use std::collections::VecDeque; + use std::sync::Arc; use tokio_epoll_uring::{BoundedBuf, BoundedBufMut}; @@ -1424,19 +1426,19 @@ static SYNC_MODE: AtomicU8 = AtomicU8::new(SyncMode::Sync as u8); #[cfg(test)] mod tests { - use crate::context::DownloadBehavior; - use crate::task_mgr::TaskKind; - - use super::*; - use owned_buffers_io::io_buf_ext::IoBufExt; - use owned_buffers_io::slice::SliceMutExt; - use rand::seq::SliceRandom; - use rand::thread_rng; - use rand::Rng; use std::io::Write; use std::os::unix::fs::FileExt; use std::sync::Arc; + use owned_buffers_io::io_buf_ext::IoBufExt; + use owned_buffers_io::slice::SliceMutExt; + use rand::seq::SliceRandom; + use rand::{Rng, thread_rng}; + + use super::*; + use crate::context::DownloadBehavior; + use crate::task_mgr::TaskKind; + enum MaybeVirtualFile { VirtualFile(VirtualFile), File(File), diff --git a/pageserver/src/virtual_file/io_engine.rs b/pageserver/src/virtual_file/io_engine.rs index ccde90ee1a..758dd6e377 100644 --- a/pageserver/src/virtual_file/io_engine.rs +++ b/pageserver/src/virtual_file/io_engine.rs @@ -80,7 +80,9 @@ pub(crate) fn get() -> IoEngine { Ok(v) => match v.parse::() { Ok(engine_kind) => engine_kind, Err(e) => { - panic!("invalid VirtualFile io engine for env var {env_var_name}: {e:#}: {v:?}") + panic!( + "invalid VirtualFile io engine for env var {env_var_name}: {e:#}: {v:?}" + ) } }, Err(std::env::VarError::NotPresent) => { @@ -107,15 +109,12 @@ pub(crate) fn get() -> IoEngine { } } -use std::{ - os::unix::prelude::FileExt, - sync::atomic::{AtomicU8, Ordering}, -}; +use std::os::unix::prelude::FileExt; +use std::sync::atomic::{AtomicU8, Ordering}; -use super::{ - owned_buffers_io::{io_buf_ext::FullSlice, slice::SliceMutExt}, - FileGuard, Metadata, -}; +use super::owned_buffers_io::io_buf_ext::FullSlice; +use super::owned_buffers_io::slice::SliceMutExt; +use super::{FileGuard, Metadata}; #[cfg(target_os = "linux")] fn epoll_uring_error_to_std(e: tokio_epoll_uring::Error) -> std::io::Error { diff --git a/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs b/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs index c67215492f..ad17405b64 100644 --- a/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs +++ b/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs @@ -5,18 +5,16 @@ //! on older kernels, such as some (but not all) older kernels in the Linux 5.10 series. //! See for more details. 
-use std::sync::atomic::{AtomicU32, AtomicU64, Ordering}; use std::sync::Arc; - -use tokio_util::sync::CancellationToken; -use tracing::{error, info, info_span, warn, Instrument}; -use utils::backoff::{DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS}; +use std::sync::atomic::{AtomicU32, AtomicU64, Ordering}; use tokio_epoll_uring::{System, SystemHandle}; - -use crate::virtual_file::on_fatal_io_error; +use tokio_util::sync::CancellationToken; +use tracing::{Instrument, error, info, info_span, warn}; +use utils::backoff::{DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS}; use crate::metrics::tokio_epoll_uring::{self as metrics, THREAD_LOCAL_METRICS_STORAGE}; +use crate::virtual_file::on_fatal_io_error; #[derive(Clone)] struct ThreadLocalState(Arc); @@ -194,7 +192,7 @@ impl std::ops::Deref for Handle { fn deref(&self) -> &Self::Target { self.0 - .0 + .0 .cell .get() .expect("must be already initialized when using this") diff --git a/pageserver/src/virtual_file/open_options.rs b/pageserver/src/virtual_file/open_options.rs index 7f951270d1..e188b8649b 100644 --- a/pageserver/src/virtual_file/open_options.rs +++ b/pageserver/src/virtual_file/open_options.rs @@ -1,7 +1,9 @@ //! Enum-dispatch to the `OpenOptions` type of the respective [`super::IoEngineKind`]; +use std::os::fd::OwnedFd; +use std::path::Path; + use super::io_engine::IoEngine; -use std::{os::fd::OwnedFd, path::Path}; #[derive(Debug, Clone)] pub enum OpenOptions { diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs index a5c26cd746..090d2ece85 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs @@ -1,9 +1,9 @@ -use std::{ - ops::{Deref, Range, RangeBounds}, - sync::Arc, -}; +use std::ops::{Deref, Range, RangeBounds}; +use std::sync::Arc; -use super::{alignment::Alignment, raw::RawAlignedBuffer, AlignedBufferMut, ConstAlign}; +use super::alignment::Alignment; +use super::raw::RawAlignedBuffer; +use super::{AlignedBufferMut, ConstAlign}; /// An shared, immutable aligned buffer type. #[derive(Clone, Debug)] diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs index d2f5e206bb..df5c911e50 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs @@ -1,13 +1,9 @@ -use std::{ - mem::MaybeUninit, - ops::{Deref, DerefMut}, -}; +use std::mem::MaybeUninit; +use std::ops::{Deref, DerefMut}; -use super::{ - alignment::{Alignment, ConstAlign}, - buffer::AlignedBuffer, - raw::RawAlignedBuffer, -}; +use super::alignment::{Alignment, ConstAlign}; +use super::buffer::AlignedBuffer; +use super::raw::RawAlignedBuffer; /// A mutable aligned buffer type. #[derive(Debug)] @@ -75,7 +71,8 @@ impl AlignedBufferMut { /// Force the length of the buffer to `new_len`. #[inline] unsafe fn set_len(&mut self, new_len: usize) { - self.raw.set_len(new_len) + // SAFETY: the caller is unsafe + unsafe { self.raw.set_len(new_len) } } #[inline] @@ -222,8 +219,10 @@ unsafe impl bytes::BufMut for AlignedBufferMut { panic_advance(cnt, remaining); } - // Addition will not overflow since the sum is at most the capacity. - self.set_len(len + cnt); + // SAFETY: Addition will not overflow since the sum is at most the capacity. 
+ unsafe { + self.set_len(len + cnt); + } } #[inline] @@ -275,7 +274,10 @@ unsafe impl tokio_epoll_uring::IoBufMut for AlignedBufferMut { unsafe fn set_init(&mut self, init_len: usize) { if self.len() < init_len { - self.set_len(init_len); + // SAFETY: caller function is unsafe + unsafe { + self.set_len(init_len); + } } } } diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/raw.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/raw.rs index 6c26dec0db..97a6c4049a 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/raw.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/raw.rs @@ -1,9 +1,7 @@ use core::slice; -use std::{ - alloc::{self, Layout}, - cmp, - mem::ManuallyDrop, -}; +use std::alloc::{self, Layout}; +use std::cmp; +use std::mem::ManuallyDrop; use super::alignment::{Alignment, ConstAlign}; diff --git a/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs b/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs index 525f447b6d..4c671c2652 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs @@ -1,11 +1,12 @@ //! See [`FullSlice`]. -use crate::virtual_file::{IoBuffer, IoBufferMut}; -use bytes::{Bytes, BytesMut}; use std::ops::{Deref, Range}; + +use bytes::{Bytes, BytesMut}; use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice}; use super::write::CheapCloneForRead; +use crate::virtual_file::{IoBuffer, IoBufferMut}; /// The true owned equivalent for Rust [`slice`]. Use this for the write path. /// diff --git a/pageserver/src/virtual_file/owned_buffers_io/slice.rs b/pageserver/src/virtual_file/owned_buffers_io/slice.rs index 6100593663..9f4a05dd57 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/slice.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/slice.rs @@ -1,7 +1,4 @@ -use tokio_epoll_uring::BoundedBuf; -use tokio_epoll_uring::BoundedBufMut; -use tokio_epoll_uring::IoBufMut; -use tokio_epoll_uring::Slice; +use tokio_epoll_uring::{BoundedBuf, BoundedBufMut, IoBufMut, Slice}; pub(crate) trait SliceMutExt { /// Get a `&mut[0..self.bytes_total()`] slice, for when you need to do borrow-based IO. @@ -35,10 +32,11 @@ where mod tests { use std::io::Read; - use super::*; use bytes::Buf; use tokio_epoll_uring::Slice; + use super::*; + #[test] fn test_slice_full_zeroed() { let make_fake_file = || bytes::BytesMut::from(&b"12345"[..]).reader(); diff --git a/pageserver/src/virtual_file/owned_buffers_io/write.rs b/pageserver/src/virtual_file/owned_buffers_io/write.rs index 7299d83703..861ca3aa2a 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs @@ -1,20 +1,14 @@ mod flush; use std::sync::Arc; +pub(crate) use flush::FlushControl; use flush::FlushHandle; use tokio_epoll_uring::IoBuf; -use crate::{ - context::RequestContext, - virtual_file::{IoBuffer, IoBufferMut}, -}; - -use super::{ - io_buf_aligned::IoBufAligned, - io_buf_ext::{FullSlice, IoBufExt}, -}; - -pub(crate) use flush::FlushControl; +use super::io_buf_aligned::IoBufAligned; +use super::io_buf_ext::{FullSlice, IoBufExt}; +use crate::context::RequestContext; +use crate::virtual_file::{IoBuffer, IoBufferMut}; pub(crate) trait CheapCloneForRead { /// Returns a cheap clone of the buffer. 
diff --git a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs index 9ce8b311bb..46309d4011 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs @@ -2,12 +2,10 @@ use std::sync::Arc; use utils::sync::duplex; -use crate::{ - context::RequestContext, - virtual_file::owned_buffers_io::{io_buf_aligned::IoBufAligned, io_buf_ext::FullSlice}, -}; - use super::{Buffer, CheapCloneForRead, OwnedAsyncWriter}; +use crate::context::RequestContext; +use crate::virtual_file::owned_buffers_io::io_buf_aligned::IoBufAligned; +use crate::virtual_file::owned_buffers_io::io_buf_ext::FullSlice; /// A handle to the flush task. pub struct FlushHandle { diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 04edb3e3f4..18df065f76 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -22,39 +22,35 @@ //! bespoken Rust code. use std::collections::HashMap; -use std::sync::Arc; -use std::sync::OnceLock; -use std::time::Duration; -use std::time::Instant; -use std::time::SystemTime; +use std::sync::{Arc, OnceLock}; +use std::time::{Duration, Instant, SystemTime}; -use anyhow::{bail, Result}; +use anyhow::{Result, bail}; use bytes::{Buf, Bytes}; -use tracing::*; - -use crate::context::RequestContext; -use crate::metrics::WAL_INGEST; -use crate::pgdatadir_mapping::{DatadirModification, Version}; -use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; -use crate::tenant::PageReconstructError; -use crate::tenant::Timeline; -use crate::ZERO_PAGE; use pageserver_api::key::rel_block_to_key; use pageserver_api::record::NeonWalRecord; use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; use pageserver_api::shard::ShardIdentity; -use postgres_ffi::fsm_logical_to_physical; -use postgres_ffi::pg_constants; use postgres_ffi::relfile_utils::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::walrecord::*; -use postgres_ffi::TransactionId; -use postgres_ffi::{dispatch_pgversion, enum_pgversion, enum_pgversion_dispatch, TimestampTz}; +use postgres_ffi::{ + TimestampTz, TransactionId, dispatch_pgversion, enum_pgversion, enum_pgversion_dispatch, + fsm_logical_to_physical, pg_constants, +}; +use tracing::*; use utils::bin_ser::SerializeError; use utils::lsn::Lsn; use utils::rate_limit::RateLimit; use utils::{critical, failpoint_support}; use wal_decoder::models::*; +use crate::ZERO_PAGE; +use crate::context::RequestContext; +use crate::metrics::WAL_INGEST; +use crate::pgdatadir_mapping::{DatadirModification, Version}; +use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; +use crate::tenant::{PageReconstructError, Timeline}; + enum_pgversion! {CheckPoint, pgv::CheckPoint} impl CheckPoint { @@ -302,7 +298,9 @@ impl WalIngest { if xid > next_xid { // Wraparound occurred, must be from a prev epoch. if epoch == 0 { - bail!("apparent XID wraparound with prepared transaction XID {xid}, nextXid is {next_full_xid}"); + bail!( + "apparent XID wraparound with prepared transaction XID {xid}, nextXid is {next_full_xid}" + ); } epoch -= 1; } @@ -796,9 +794,7 @@ impl WalIngest { // Remove twophase file. 
see RemoveTwoPhaseFile() in postgres code trace!( "Drop twophaseFile for xid {} parsed_xact.xid {} here at {}", - xl_xid, - parsed.xid, - lsn, + xl_xid, parsed.xid, lsn, ); let xid: u64 = if modification.tline.pg_version >= 17 { @@ -1130,16 +1126,14 @@ impl WalIngest { let xlog_checkpoint = pgv::CheckPoint::decode(&checkpoint_bytes)?; trace!( "xlog_checkpoint.oldestXid={}, checkpoint.oldestXid={}", - xlog_checkpoint.oldestXid, - cp.oldestXid + xlog_checkpoint.oldestXid, cp.oldestXid ); if (cp.oldestXid.wrapping_sub(xlog_checkpoint.oldestXid) as i32) < 0 { cp.oldestXid = xlog_checkpoint.oldestXid; } trace!( "xlog_checkpoint.oldestActiveXid={}, checkpoint.oldestActiveXid={}", - xlog_checkpoint.oldestActiveXid, - cp.oldestActiveXid + xlog_checkpoint.oldestActiveXid, cp.oldestActiveXid ); // A shutdown checkpoint has `oldestActiveXid == InvalidTransactionid`, @@ -1180,6 +1174,50 @@ impl WalIngest { } else { cp.oldestActiveXid = xlog_checkpoint.oldestActiveXid; } + // NB: We abuse the Checkpoint.redo field: + // + // - In PostgreSQL, the Checkpoint struct doesn't store the information + // of whether this is an online checkpoint or a shutdown checkpoint. It's + // stored in the XLOG info field of the WAL record, shutdown checkpoints + // use record type XLOG_CHECKPOINT_SHUTDOWN and online checkpoints use + // XLOG_CHECKPOINT_ONLINE. We don't store the original WAL record headers + // in the pageserver, however. + // + // - In PostgreSQL, the Checkpoint.redo field stores the *start* of the + // checkpoint record, if it's a shutdown checkpoint. But when we are + // starting from a shutdown checkpoint, the basebackup LSN is the *end* + // of the shutdown checkpoint WAL record. That makes it difficult to + // correctly detect whether we're starting from a shutdown record or + // not. + // + // To address both of those issues, we store 0 in the redo field if it's + // an online checkpoint record, and the record's *end* LSN if it's a + // shutdown checkpoint. We don't need the original redo pointer in neon, + // because we don't perform WAL replay at startup anyway, so we can get + // away with abusing the redo field like this. + // + // XXX: Ideally, we would persist the extra information in a more + // explicit format, rather than repurpose the fields of the Postgres + // struct like this. However, we already have persisted data like this, + // so we need to maintain backwards compatibility. + // + // NB: We didn't originally have this convention, so there are still old + // persisted records that didn't do this. Before, we didn't update the + // persisted redo field at all. That means that old records have a bogus + // redo pointer that points to some old value, from the checkpoint record + // that was originally imported from the data directory. If it was a + // project created in Neon, that means it points to the first checkpoint + // after initdb. That's OK for our purposes: all such old checkpoints are + // treated as old online checkpoints when the basebackup is created. + cp.redo = if info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN { + // Store the *end* LSN of the checkpoint record. Or to be precise, + // the start LSN of the *next* record, i.e. if the record ends + // exactly at page boundary, the redo LSN points to just after the + // page header on the next page. + lsn.into() + } else { + Lsn::INVALID.into() + }; // Write a new checkpoint key-value pair on every checkpoint record, even // if nothing really changed. 
Not strictly required, but it seems nice to @@ -1324,8 +1362,9 @@ impl WalIngest { // with zero pages. Logging is rate limited per pg version to // avoid skewing. if gap_blocks_filled > 0 { - use once_cell::sync::Lazy; use std::sync::Mutex; + + use once_cell::sync::Lazy; use utils::rate_limit::RateLimit; struct RateLimitPerPgVersion { @@ -1431,10 +1470,7 @@ impl WalIngest { if new_nblocks > old_nblocks { trace!( "extending SLRU {:?} seg {} from {} to {} blocks", - kind, - segno, - old_nblocks, - new_nblocks + kind, segno, old_nblocks, new_nblocks ); modification.put_slru_extend(kind, segno, new_nblocks)?; @@ -1473,13 +1509,13 @@ async fn get_relsize( #[allow(clippy::bool_assert_comparison)] #[cfg(test)] mod tests { - use super::*; - use crate::tenant::harness::*; - use crate::tenant::remote_timeline_client::{remote_initdb_archive_path, INITDB_PATH}; - use crate::tenant::storage_layer::IoConcurrency; use postgres_ffi::RELSEG_SIZE; + use super::*; use crate::DEFAULT_PG_VERSION; + use crate::tenant::harness::*; + use crate::tenant::remote_timeline_client::{INITDB_PATH, remote_initdb_archive_path}; + use crate::tenant::storage_layer::IoConcurrency; /// Arbitrary relation tag, for testing. const TESTREL_A: RelTag = RelTag { @@ -1562,10 +1598,12 @@ mod tests { .await?, false ); - assert!(tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx) - .await - .is_err()); + assert!( + tline + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx) + .await + .is_err() + ); assert_eq!( tline .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx) @@ -1953,10 +1991,12 @@ mod tests { .await?, false ); - assert!(tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx) - .await - .is_err()); + assert!( + tline + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx) + .await + .is_err() + ); assert_eq!( tline @@ -2186,9 +2226,10 @@ mod tests { /// without waiting for unrelated steps. #[tokio::test] async fn test_ingest_real_wal() { - use crate::tenant::harness::*; - use postgres_ffi::waldecoder::WalStreamDecoder; use postgres_ffi::WAL_SEGMENT_SIZE; + use postgres_ffi::waldecoder::WalStreamDecoder; + + use crate::tenant::harness::*; // Define test data path and constants. // diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 027a6eb7d7..22d8d83811 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -24,26 +24,27 @@ mod process; /// Code to apply [`NeonWalRecord`]s. pub(crate) mod apply_neon; -use crate::config::PageServerConf; -use crate::metrics::{ - WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM, - WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_TIME, -}; +use std::future::Future; +use std::sync::Arc; +use std::time::{Duration, Instant}; + use anyhow::Context; use bytes::{Bytes, BytesMut}; use pageserver_api::key::Key; use pageserver_api::models::{WalRedoManagerProcessStatus, WalRedoManagerStatus}; use pageserver_api::record::NeonWalRecord; use pageserver_api::shard::TenantShardId; -use std::future::Future; -use std::sync::Arc; -use std::time::Duration; -use std::time::Instant; use tracing::*; use utils::lsn::Lsn; use utils::sync::gate::GateError; use utils::sync::heavier_once_cell; +use crate::config::PageServerConf; +use crate::metrics::{ + WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM, + WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_TIME, +}; + /// The real implementation that uses a Postgres process to /// perform WAL replay. 
/// @@ -547,15 +548,18 @@ impl PostgresRedoManager { #[cfg(test)] mod tests { - use super::PostgresRedoManager; - use crate::config::PageServerConf; + use std::str::FromStr; + use bytes::Bytes; use pageserver_api::key::Key; use pageserver_api::record::NeonWalRecord; use pageserver_api::shard::TenantShardId; - use std::str::FromStr; use tracing::Instrument; - use utils::{id::TenantId, lsn::Lsn}; + use utils::id::TenantId; + use utils::lsn::Lsn; + + use super::PostgresRedoManager; + use crate::config::PageServerConf; #[tokio::test] async fn test_ping() { diff --git a/pageserver/src/walredo/apply_neon.rs b/pageserver/src/walredo/apply_neon.rs index d62e325310..61ae1eb970 100644 --- a/pageserver/src/walredo/apply_neon.rs +++ b/pageserver/src/walredo/apply_neon.rs @@ -4,13 +4,12 @@ use bytes::BytesMut; use pageserver_api::key::Key; use pageserver_api::record::NeonWalRecord; use pageserver_api::reltag::SlruKind; -use postgres_ffi::pg_constants; use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM; use postgres_ffi::v14::nonrelfile_utils::{ mx_offset_to_flags_bitshift, mx_offset_to_flags_offset, mx_offset_to_member_offset, transaction_id_set_status, }; -use postgres_ffi::BLCKSZ; +use postgres_ffi::{BLCKSZ, pg_constants}; use tracing::*; use utils::lsn::Lsn; diff --git a/pageserver/src/walredo/process.rs b/pageserver/src/walredo/process.rs index bf30b92ea5..5a9fc63e63 100644 --- a/pageserver/src/walredo/process.rs +++ b/pageserver/src/walredo/process.rs @@ -2,28 +2,28 @@ mod no_leak_child; /// The IPC protocol that pageserver and walredo process speak over their shared pipe. mod protocol; -use self::no_leak_child::NoLeakChild; -use crate::{ - config::PageServerConf, - metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER}, - page_cache::PAGE_SZ, - span::debug_assert_current_span_has_tenant_id, -}; +use std::collections::VecDeque; +use std::process::{Command, Stdio}; +#[cfg(feature = "testing")] +use std::sync::atomic::AtomicUsize; +use std::time::Duration; + use anyhow::Context; use bytes::Bytes; use pageserver_api::record::NeonWalRecord; -use pageserver_api::{reltag::RelTag, shard::TenantShardId}; +use pageserver_api::reltag::RelTag; +use pageserver_api::shard::TenantShardId; use postgres_ffi::BLCKSZ; -#[cfg(feature = "testing")] -use std::sync::atomic::AtomicUsize; -use std::{ - collections::VecDeque, - process::{Command, Stdio}, - time::Duration, -}; use tokio::io::{AsyncReadExt, AsyncWriteExt}; -use tracing::{debug, error, instrument, Instrument}; -use utils::{lsn::Lsn, poison::Poison}; +use tracing::{Instrument, debug, error, instrument}; +use utils::lsn::Lsn; +use utils::poison::Poison; + +use self::no_leak_child::NoLeakChild; +use crate::config::PageServerConf; +use crate::metrics::{WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER, WalRedoKillCause}; +use crate::page_cache::PAGE_SZ; +use crate::span::debug_assert_current_span_has_tenant_id; pub struct WalRedoProcess { #[allow(dead_code)] diff --git a/pageserver/src/walredo/process/no_leak_child.rs b/pageserver/src/walredo/process/no_leak_child.rs index 1a0d7039df..9939fc4b36 100644 --- a/pageserver/src/walredo/process/no_leak_child.rs +++ b/pageserver/src/walredo/process/no_leak_child.rs @@ -1,19 +1,11 @@ -use tracing::instrument; -use tracing::{error, info}; - -use crate::metrics::WalRedoKillCause; -use crate::metrics::WAL_REDO_PROCESS_COUNTERS; - use std::io; -use std::process::Command; - -use std::ops::DerefMut; - -use std::ops::Deref; - -use std::process::Child; +use std::ops::{Deref, DerefMut}; +use 
std::process::{Child, Command}; use pageserver_api::shard::TenantShardId; +tracing::{error, info, instrument}; + +use crate::metrics::{WAL_REDO_PROCESS_COUNTERS, WalRedoKillCause}; /// Wrapper type around `std::process::Child` which guarantees that the child /// will be killed and waited-for by this process before being dropped. diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index a61dc9f4c6..f6a577abfc 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -22,6 +22,7 @@ #include "neon_pgversioncompat.h" #include "access/parallel.h" +#include "access/xlog.h" #include "funcapi.h" #include "miscadmin.h" #include "pagestore_client.h" @@ -40,12 +41,16 @@ #include "utils/dynahash.h" #include "utils/guc.h" +#if PG_VERSION_NUM >= 150000 +#include "access/xlogrecovery.h" +#endif + #include "hll.h" #include "bitmap.h" #include "neon.h" #include "neon_perf_counters.h" -#define CriticalAssert(cond) do if (!(cond)) elog(PANIC, "Assertion %s failed at %s:%d: ", #cond, __FILE__, __LINE__); while (0) +#define CriticalAssert(cond) do if (!(cond)) elog(PANIC, "LFC: assertion %s failed at %s:%d: ", #cond, __FILE__, __LINE__); while (0) /* * Local file cache is used to temporarily store relation pages in the local file system. @@ -93,7 +98,23 @@ #define MB ((uint64)1024*1024) #define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ / BLOCKS_PER_CHUNK)) -#define CHUNK_BITMAP_SIZE ((BLOCKS_PER_CHUNK + 31) / 32) + +/* + * Blocks are read from or written to the LFC file outside the LFC critical section. + * To synchronize access to such a block, the writer sets the block's state to PENDING. + * If some other backend (reader or writer) sees the PENDING state, it changes it to REQUESTED and starts + * waiting on a condition variable until the state changes. + * When the writer completes its operation, it checks whether the state is REQUESTED and, if so, broadcasts the condition variable, + * waking up all backends waiting for access to this block.
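+ *
+ * An illustrative walk-through of this hand-off (not tied to any particular
+ * function): backend A wants to fill a block that is UNAVAILABLE, marks it
+ * PENDING and performs the file I/O outside the critical section. Backend B
+ * looks up the same block meanwhile, finds it PENDING, flips it to REQUESTED
+ * and sleeps on one of the shared condition variables (with a timeout). When
+ * A finishes its I/O it sees REQUESTED, marks the block AVAILABLE and
+ * broadcasts the condition variable, so B wakes up, re-checks the state and
+ * proceeds.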
+ */ +typedef enum FileCacheBlockState +{ + UNAVAILABLE, /* block is not present in cache */ + AVAILABLE, /* block can be used */ + PENDING, /* block is loaded */ + REQUESTED /* some other backend is waiting for block to be loaded */ +} FileCacheBlockState; + typedef struct FileCacheEntry { @@ -101,10 +122,16 @@ typedef struct FileCacheEntry uint32 hash; uint32 offset; uint32 access_count; - uint32 bitmap[CHUNK_BITMAP_SIZE]; + uint32 state[(BLOCKS_PER_CHUNK + 31) / 32 * 2]; /* two bits per block */ dlist_node list_node; /* LRU/holes list node */ } FileCacheEntry; +#define GET_STATE(entry, i) (((entry)->state[(i) / 16] >> ((i) % 16 * 2)) & 3) +#define SET_STATE(entry, i, new_state) (entry)->state[(i) / 16] = ((entry)->state[(i) / 16] & ~(3 << ((i) % 16 * 2))) | ((new_state) << ((i) % 16 * 2)) + +#define N_COND_VARS 64 +#define CV_WAIT_TIMEOUT 10 + typedef struct FileCacheControl { uint64 generation; /* generation is needed to handle correct hash @@ -118,18 +145,24 @@ typedef struct FileCacheControl uint64 writes; /* number of writes issued */ uint64 time_read; /* time spent reading (us) */ uint64 time_write; /* time spent writing (us) */ + uint64 resizes; /* number of LFC resizes */ + uint64 evicted_pages; /* number of evicted pages */ dlist_head lru; /* double linked list for LRU replacement * algorithm */ dlist_head holes; /* double linked list of punched holes */ HyperLogLogState wss_estimation; /* estimation of working set size */ + ConditionVariable cv[N_COND_VARS]; /* turnstile of condition variables */ } FileCacheControl; +bool lfc_store_prefetch_result; + static HTAB *lfc_hash; -static int lfc_desc = 0; +static int lfc_desc = -1; static LWLockId lfc_lock; static int lfc_max_size; static int lfc_size_limit; static char *lfc_path; +static uint64 lfc_generation; static FileCacheControl *lfc_ctl; static shmem_startup_hook_type prev_shmem_startup_hook; #if PG_VERSION_NUM>=150000 @@ -138,6 +171,20 @@ static shmem_request_hook_type prev_shmem_request_hook; #define LFC_ENABLED() (lfc_ctl->limit != 0) +/* + * Close LFC file if opened. + * All backends should close their LFC files once LFC is disabled. + */ +static void +lfc_close_file(void) +{ + if (lfc_desc >= 0) + { + close(lfc_desc); + lfc_desc = -1; + } +} + /* * Local file cache is optional and Neon can work without it. * In case of any any errors with this cache, we should disable it but to not throw error. 
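The FileCacheEntry above replaces the old one-bit-per-block bitmap with two state bits per block, sixteen block states per uint32 word. A standalone sketch of the same packing (illustration only; BLOCKS_PER_CHUNK is assumed to be 128 here, and the state names mirror FileCacheBlockState):

/* Standalone model of the two-bit-per-block state array used by the new
 * FileCacheEntry layout. Compile with: cc -o state_demo state_demo.c */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define BLOCKS_PER_CHUNK 128    /* assumed value, for the sketch only */

enum { UNAVAILABLE, AVAILABLE, PENDING, REQUESTED };

static uint32_t state[(BLOCKS_PER_CHUNK + 31) / 32 * 2];   /* two bits per block */

#define GET_STATE(i)            ((state[(i) / 16] >> ((i) % 16 * 2)) & 3)
#define SET_STATE(i, new_state) (state[(i) / 16] = (state[(i) / 16] & ~(3u << ((i) % 16 * 2))) \
                                                   | ((uint32_t)(new_state) << ((i) % 16 * 2)))

int main(void)
{
    /* All blocks start out UNAVAILABLE (the array is zero-initialized). */
    for (int i = 0; i < BLOCKS_PER_CHUNK; i++)
        assert(GET_STATE(i) == UNAVAILABLE);

    /* Mark one block as being written, then publish it. */
    SET_STATE(42, PENDING);
    assert(GET_STATE(42) == PENDING);
    SET_STATE(42, AVAILABLE);

    /* Neighbouring blocks are not disturbed by the update. */
    assert(GET_STATE(41) == UNAVAILABLE && GET_STATE(43) == UNAVAILABLE);

    printf("block 42 state = %u\n", (unsigned) GET_STATE(42));
    return 0;
}

Two bits per block are enough because a block is only ever in one of the four FileCacheBlockState values.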
@@ -145,20 +192,16 @@ static shmem_request_hook_type prev_shmem_request_hook; * All cache content should be invalidated to avoid reading of stale or corrupted data */ static void -lfc_disable(char const *op) +lfc_switch_off(void) { int fd; - elog(WARNING, "Failed to %s local file cache at %s: %m, disabling local file cache", op, lfc_path); - - /* Invalidate hash */ - LWLockAcquire(lfc_lock, LW_EXCLUSIVE); - if (LFC_ENABLED()) { HASH_SEQ_STATUS status; FileCacheEntry *entry; + /* Invalidate hash */ hash_seq_init(&status, lfc_hash); while ((entry = hash_seq_search(&status)) != NULL) { @@ -171,41 +214,33 @@ lfc_disable(char const *op) dlist_init(&lfc_ctl->lru); dlist_init(&lfc_ctl->holes); - if (lfc_desc > 0) - { - int rc; + /* + * We need to use unlink to to avoid races in LFC write, because it is not + * protected by lock + */ + unlink(lfc_path); - /* - * If the reason of error is ENOSPC, then truncation of file may - * help to reclaim some space - */ - pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_TRUNCATE); - rc = ftruncate(lfc_desc, 0); - pgstat_report_wait_end(); + fd = BasicOpenFile(lfc_path, O_RDWR | O_CREAT | O_TRUNC); + if (fd < 0) + elog(WARNING, "LFC: failed to recreate local file cache %s: %m", lfc_path); + else + close(fd); - if (rc < 0) - elog(WARNING, "Failed to truncate local file cache %s: %m", lfc_path); - } + /* Wakeup waiting backends */ + for (int i = 0; i < N_COND_VARS; i++) + ConditionVariableBroadcast(&lfc_ctl->cv[i]); } + lfc_close_file(); +} - /* - * We need to use unlink to to avoid races in LFC write, because it is not - * protectedby - */ - unlink(lfc_path); - - fd = BasicOpenFile(lfc_path, O_RDWR | O_CREAT | O_TRUNC); - if (fd < 0) - elog(WARNING, "Failed to recreate local file cache %s: %m", lfc_path); - else - close(fd); +static void +lfc_disable(char const *op) +{ + elog(WARNING, "LFC: failed to %s local file cache at %s: %m, disabling local file cache", op, lfc_path); + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + lfc_switch_off(); LWLockRelease(lfc_lock); - - if (lfc_desc > 0) - close(lfc_desc); - - lfc_desc = -1; } /* @@ -217,13 +252,20 @@ lfc_maybe_disabled(void) return !lfc_ctl || !LFC_ENABLED(); } +/* + * Open LFC file if not opened yet or generation is changed. + * Should be called under LFC lock. 
+ */ static bool lfc_ensure_opened(void) { - bool enabled = !lfc_maybe_disabled(); - + if (lfc_generation != lfc_ctl->generation) + { + lfc_close_file(); + lfc_generation = lfc_ctl->generation; + } /* Open cache file if not done yet */ - if (lfc_desc <= 0 && enabled) + if (lfc_desc < 0) { lfc_desc = BasicOpenFile(lfc_path, O_RDWR); @@ -233,7 +275,7 @@ lfc_ensure_opened(void) return false; } } - return enabled; + return true; } static void @@ -267,14 +309,7 @@ lfc_shmem_startup(void) n_chunks + 1, n_chunks + 1, &info, HASH_ELEM | HASH_BLOBS); - lfc_ctl->generation = 0; - lfc_ctl->size = 0; - lfc_ctl->used = 0; - lfc_ctl->hits = 0; - lfc_ctl->misses = 0; - lfc_ctl->writes = 0; - lfc_ctl->time_read = 0; - lfc_ctl->time_write = 0; + memset(lfc_ctl, 0, sizeof(FileCacheControl)); dlist_init(&lfc_ctl->lru); dlist_init(&lfc_ctl->holes); @@ -285,7 +320,7 @@ lfc_shmem_startup(void) fd = BasicOpenFile(lfc_path, O_RDWR | O_CREAT | O_TRUNC); if (fd < 0) { - elog(WARNING, "Failed to create local file cache %s: %m", lfc_path); + elog(WARNING, "LFC: failed to create local file cache %s: %m", lfc_path); lfc_ctl->limit = 0; } else @@ -293,6 +328,11 @@ lfc_shmem_startup(void) close(fd); lfc_ctl->limit = SIZE_MB_TO_CHUNKS(lfc_size_limit); } + + /* Initialize turnstile of condition variables */ + for (int i = 0; i < N_COND_VARS; i++) + ConditionVariableInit(&lfc_ctl->cv[i]); + } LWLockRelease(AddinShmemInitLock); } @@ -327,7 +367,7 @@ lfc_check_limit_hook(int *newval, void **extra, GucSource source) { if (*newval > lfc_max_size) { - elog(ERROR, "neon.file_cache_size_limit can not be larger than neon.max_file_cache_size"); + elog(ERROR, "LFC: neon.file_cache_size_limit can not be larger than neon.max_file_cache_size"); return false; } return true; @@ -338,14 +378,31 @@ lfc_change_limit_hook(int newval, void *extra) { uint32 new_size = SIZE_MB_TO_CHUNKS(newval); - if (!is_normal_backend()) - return; - - if (!lfc_ensure_opened()) + if (!lfc_ctl || !is_normal_backend()) return; LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + /* Open LFC file only if LFC was enabled or we are going to reenable it */ + if (newval == 0 && !LFC_ENABLED()) + { + LWLockRelease(lfc_lock); + /* File should be reopened if LFC is reenabled */ + lfc_close_file(); + return; + } + + if (!lfc_ensure_opened()) + { + LWLockRelease(lfc_lock); + return; + } + + if (lfc_ctl->limit != new_size) + { + lfc_ctl->resizes += 1; + } + while (new_size < lfc_ctl->used && !dlist_is_empty(&lfc_ctl->lru)) { /* @@ -367,7 +424,9 @@ lfc_change_limit_hook(int newval, void *extra) /* We remove the old entry, and re-enter a hole to the hash table */ for (int i = 0; i < BLOCKS_PER_CHUNK; i++) { - lfc_ctl->used_pages -= (victim->bitmap[i >> 5] >> (i & 31)) & 1; + bool is_page_cached = GET_STATE(victim, i) == AVAILABLE; + lfc_ctl->used_pages -= is_page_cached; + lfc_ctl->evicted_pages += is_page_cached; } hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL); @@ -383,10 +442,11 @@ lfc_change_limit_hook(int newval, void *extra) lfc_ctl->used -= 1; } - lfc_ctl->limit = new_size; - if (new_size == 0) { - lfc_ctl->generation += 1; - } + if (new_size == 0) + lfc_switch_off(); + else + lfc_ctl->limit = new_size; + neon_log(DEBUG1, "set local file cache limit to %d", new_size); LWLockRelease(lfc_lock); @@ -403,6 +463,17 @@ lfc_init(void) neon_log(ERROR, "Neon module should be loaded via shared_preload_libraries"); + DefineCustomBoolVariable("neon.store_prefetch_result_in_lfc", + "Immediately store received prefetch result in LFC", + NULL, + 
&lfc_store_prefetch_result, + false, + PGC_SUSET, + 0, + NULL, + NULL, + NULL); + DefineCustomIntVariable("neon.max_file_cache_size", "Maximal size of Neon local file cache", NULL, @@ -480,7 +551,7 @@ lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno) if (LFC_ENABLED()) { entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL); - found = entry != NULL && (entry->bitmap[chunk_offs >> 5] & ((uint32)1 << (chunk_offs & 31))) != 0; + found = entry != NULL && GET_STATE(entry, chunk_offs) != UNAVAILABLE; } LWLockRelease(lfc_lock); return found; @@ -529,8 +600,7 @@ lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, { for (; chunk_offs < BLOCKS_PER_CHUNK && i < nblocks; chunk_offs++, i++) { - if ((entry->bitmap[chunk_offs >> 5] & - ((uint32)1 << (chunk_offs & 31))) != 0) + if (GET_STATE(entry, chunk_offs) != UNAVAILABLE) { BITMAP_SET(bitmap, i); found++; @@ -541,7 +611,6 @@ lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, { i += this_chunk; } - /* * Break out of the iteration before doing expensive stuff for * a next iteration @@ -577,87 +646,6 @@ lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, return found; } -/* - * Evict a page (if present) from the local file cache - */ -void -lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno) -{ - BufferTag tag; - FileCacheEntry *entry; - bool found; - int chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1); - uint32 hash; - - if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */ - return; - - CopyNRelFileInfoToBufTag(tag, rinfo); - tag.forkNum = forkNum; - tag.blockNum = (blkno & ~(BLOCKS_PER_CHUNK - 1)); - - CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); - hash = get_hash_value(lfc_hash, &tag); - - LWLockAcquire(lfc_lock, LW_EXCLUSIVE); - - if (!LFC_ENABLED()) - { - LWLockRelease(lfc_lock); - return; - } - - entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, &found); - - if (!found) - { - /* nothing to do */ - LWLockRelease(lfc_lock); - return; - } - - /* remove the page from the cache */ - entry->bitmap[chunk_offs >> 5] &= ~((uint32)1 << (chunk_offs & (32 - 1))); - - if (entry->access_count == 0) - { - /* - * If the chunk has no live entries, we can position the chunk to be - * recycled first. - */ - if (entry->bitmap[chunk_offs >> 5] == 0) - { - bool has_remaining_pages = false; - - for (int i = 0; i < CHUNK_BITMAP_SIZE; i++) - { - if (entry->bitmap[i] != 0) - { - has_remaining_pages = true; - break; - } - } - - /* - * Put the entry at the position that is first to be reclaimed when we - * have no cached pages remaining in the chunk - */ - if (!has_remaining_pages) - { - dlist_delete(&entry->list_node); - dlist_push_head(&lfc_ctl->lru, &entry->list_node); - } - } - } - - /* - * Done: apart from empty chunks, we don't move chunks in the LRU when - * they're empty because eviction isn't usage. - */ - - LWLockRelease(lfc_lock); -} - /* * Try to read pages from local cache. 
* Returns the number of pages read from the local cache, and sets bits in @@ -685,17 +673,14 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, int buf_offset = 0; if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */ - return 0; - - if (!lfc_ensure_opened()) - return 0; + return -1; CopyNRelFileInfoToBufTag(tag, rinfo); tag.forkNum = forkNum; CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); - /* + /* * For every chunk that has blocks we're interested in, we * 1. get the chunk header * 2. Check if the chunk actually has the blocks we're interested in @@ -712,22 +697,35 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, int iteration_hits = 0; int iteration_misses = 0; uint64 io_time_us = 0; + int n_blocks_to_read = 0; + ConditionVariable* cv; + Assert(blocks_in_chunk > 0); for (int i = 0; i < blocks_in_chunk; i++) { + n_blocks_to_read += (BITMAP_ISSET(mask, buf_offset + i) != 0); iov[i].iov_base = buffers[buf_offset + i]; iov[i].iov_len = BLCKSZ; + BITMAP_CLR(mask, buf_offset + i); + } + if (n_blocks_to_read == 0) + { + buf_offset += blocks_in_chunk; + nblocks -= blocks_in_chunk; + blkno += blocks_in_chunk; + continue; } tag.blockNum = blkno - chunk_offs; hash = get_hash_value(lfc_hash, &tag); + cv = &lfc_ctl->cv[hash % N_COND_VARS]; LWLockAcquire(lfc_lock, LW_EXCLUSIVE); /* We can return the blocks we've read before LFC got disabled; * assuming we read any. */ - if (!LFC_ENABLED()) + if (!LFC_ENABLED() || !lfc_ensure_opened()) { LWLockRelease(lfc_lock); return blocks_read; @@ -763,15 +761,32 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, generation = lfc_ctl->generation; entry_offset = entry->offset; - LWLockRelease(lfc_lock); - for (int i = 0; i < blocks_in_chunk; i++) { - /* - * If the page is valid, we consider it "read". - * All other pages will be fetched separately by the next cache - */ - if (entry->bitmap[(chunk_offs + i) / 32] & ((uint32)1 << ((chunk_offs + i) % 32))) + FileCacheBlockState state = UNAVAILABLE; + bool sleeping = false; + while (lfc_ctl->generation == generation) + { + state = GET_STATE(entry, chunk_offs + i); + if (state == PENDING) { + SET_STATE(entry, chunk_offs + i, REQUESTED); + } else if (state != REQUESTED) { + break; + } + if (!sleeping) + { + ConditionVariablePrepareToSleep(cv); + sleeping = true; + } + LWLockRelease(lfc_lock); + ConditionVariableTimedSleep(cv, CV_WAIT_TIMEOUT, WAIT_EVENT_NEON_LFC_CV_WAIT); + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + } + if (sleeping) + { + ConditionVariableCancelSleep(); + } + if (state == AVAILABLE) { BITMAP_SET(mask, buf_offset + i); iteration_hits++; @@ -779,6 +794,7 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, else iteration_misses++; } + LWLockRelease(lfc_lock); Assert(iteration_hits + iteration_misses > 0); @@ -820,6 +836,7 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, else { /* generation mismatch, assume error condition */ + lfc_close_file(); LWLockRelease(lfc_lock); return -1; } @@ -835,6 +852,249 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, return blocks_read; } +/* + * Initialize new LFC hash entry, perform eviction if needed. + * Returns false if there are no unpinned entries and chunk can not be added. 
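The reader side of lfc_readv_select() above parks on one of the N_COND_VARS condition variables whenever it finds a block in PENDING or REQUESTED state. Below is a standalone sketch of that handshake using a plain pthread mutex and condition variable instead of the LWLock/ConditionVariable machinery; the thread split, names, and timings are illustrative, and the generation checks and the CV_WAIT_TIMEOUT timed sleep of the real code are omitted. (lfc_init_new_entry itself continues below.)

/* Compile with: cc -pthread -o handshake_demo handshake_demo.c */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

enum block_state { UNAVAILABLE, AVAILABLE, PENDING, REQUESTED };

static pthread_mutex_t lock  = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cv    = PTHREAD_COND_INITIALIZER;
static enum block_state state = UNAVAILABLE;
static char             page[8192];

static void *writer(void *arg)
{
    (void) arg;

    /* Claim the block: state is PENDING while the write is in flight. */
    pthread_mutex_lock(&lock);
    state = PENDING;
    pthread_mutex_unlock(&lock);

    usleep(100 * 1000);          /* simulate the pwrite() done outside the lock */
    page[0] = 'x';

    /* Publish the block and wake anybody who asked for it. */
    pthread_mutex_lock(&lock);
    if (state == REQUESTED)
        pthread_cond_broadcast(&cv);
    state = AVAILABLE;
    pthread_mutex_unlock(&lock);
    return NULL;
}

static void *reader(void *arg)
{
    (void) arg;

    pthread_mutex_lock(&lock);
    while (state == PENDING || state == REQUESTED)
    {
        /* Tell the writer we are waiting, then sleep on the CV. */
        state = REQUESTED;
        pthread_cond_wait(&cv, &lock);
    }
    if (state == AVAILABLE)
        printf("reader: got page, first byte '%c'\n", page[0]);
    else
        printf("reader: block not cached, fall back to the pageserver\n");
    pthread_mutex_unlock(&lock);
    return NULL;
}

int main(void)
{
    pthread_t w, r;

    pthread_create(&w, NULL, writer, NULL);
    usleep(10 * 1000);           /* let the writer set PENDING first */
    pthread_create(&r, NULL, reader, NULL);

    pthread_join(w, NULL);
    pthread_join(r, NULL);
    return 0;
}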
+ */ +static bool +lfc_init_new_entry(FileCacheEntry* entry, uint32 hash) +{ + /*----------- + * If the chunk wasn't already in the LFC then we have these + * options, in order of preference: + * + * Unless there is no space available, we can: + * 1. Use an entry from the `holes` list, and + * 2. Create a new entry. + * We can always, regardless of space in the LFC: + * 3. evict an entry from LRU, and + * 4. ignore the write operation (the least favorite option) + */ + if (lfc_ctl->used < lfc_ctl->limit) + { + if (!dlist_is_empty(&lfc_ctl->holes)) + { + /* We can reuse a hole that was left behind when the LFC was shrunk previously */ + FileCacheEntry *hole = dlist_container(FileCacheEntry, list_node, + dlist_pop_head_node(&lfc_ctl->holes)); + uint32 offset = hole->offset; + bool hole_found; + + hash_search_with_hash_value(lfc_hash, &hole->key, + hole->hash, HASH_REMOVE, &hole_found); + CriticalAssert(hole_found); + + lfc_ctl->used += 1; + entry->offset = offset; /* reuse the hole */ + } + else + { + lfc_ctl->used += 1; + entry->offset = lfc_ctl->size++;/* allocate new chunk at end + * of file */ + } + } + /* + * We've already used up all allocated LFC entries. + * + * If we can clear an entry from the LRU, do that. + * If we can't (e.g. because all other slots are being accessed) + * then we will remove this entry from the hash and continue + * on to the next chunk, as we may not exceed the limit. + */ + else if (!dlist_is_empty(&lfc_ctl->lru)) + { + /* Cache overflow: evict least recently used chunk */ + FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, + dlist_pop_head_node(&lfc_ctl->lru)); + + for (int i = 0; i < BLOCKS_PER_CHUNK; i++) + { + bool is_page_cached = GET_STATE(victim, i) == AVAILABLE; + lfc_ctl->used_pages -= is_page_cached; + lfc_ctl->evicted_pages += is_page_cached; + } + + CriticalAssert(victim->access_count == 0); + entry->offset = victim->offset; /* grab victim's chunk */ + hash_search_with_hash_value(lfc_hash, &victim->key, + victim->hash, HASH_REMOVE, NULL); + neon_log(DEBUG2, "Swap file cache page"); + } + else + { + /* Can't add this chunk - we don't have the space for it */ + hash_search_with_hash_value(lfc_hash, &entry->key, hash, + HASH_REMOVE, NULL); + + return false; + } + + entry->access_count = 1; + entry->hash = hash; + + for (int i = 0; i < BLOCKS_PER_CHUNK; i++) + SET_STATE(entry, i, UNAVAILABLE); + + return true; +} + +/* + * Store received prefetch result in LFC cache. + * Unlike lfc_read/lfc_write this call is is not protected by shared buffer lock. + * So we should be ready that other backends will try to concurrently read or write this block. + * We do not store prefetched block if it already exists in LFC or it's not_modified_since LSN is smaller + * than current last written LSN (LwLSN). + * + * We can enforce correctness of storing page in LFC by the following steps: + * 1. Check under LFC lock that page in not present in LFC. + * 2. Check under LFC lock that LwLSN is not changed since prefetch request time (not_modified_since). + * 3. Change page state to "Pending" under LFC lock to prevent all other backends to read or write this + * pages until this write is completed. + * 4. Assume that some other backend creates new image of the page without reading it + * (because reads will be blocked because of 2). This version of the page is stored in shared buffer. + * Any attempt to throw away this page from shared buffer will be blocked, because Postgres first + * needs to save dirty page and write will be blocked because of 2. 
+ * So any backend trying to access this page, will take it from shared buffer without accessing + * SMGR and LFC. + * 5. After write completion we once again obtain LFC lock and wake-up all waiting backends. + * If there is some backend waiting to write new image of the page (4) then now it will be able to + * do it,overwriting old (prefetched) page image. As far as this write will be completed before + * shared buffer can be reassigned, not other backend can see old page image. +*/ +bool +lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, + const void* buffer, XLogRecPtr lsn) +{ + BufferTag tag; + FileCacheEntry *entry; + ssize_t rc; + bool found; + uint32 hash; + uint64 generation; + uint32 entry_offset; + instr_time io_start, io_end; + ConditionVariable* cv; + FileCacheBlockState state; + XLogRecPtr lwlsn; + + int chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1); + + if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */ + return false; + + CopyNRelFileInfoToBufTag(tag, rinfo); + tag.forkNum = forknum; + + CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); + + tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1); + hash = get_hash_value(lfc_hash, &tag); + cv = &lfc_ctl->cv[hash % N_COND_VARS]; + + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + + if (!LFC_ENABLED() || !lfc_ensure_opened()) + { + LWLockRelease(lfc_lock); + return false; + } + lwlsn = GetLastWrittenLSN(rinfo, forknum, blkno); + if (lwlsn > lsn) + { + elog(DEBUG1, "Skip LFC write for %d because LwLSN=%X/%X is greater than not_nodified_since LSN %X/%X", + blkno, LSN_FORMAT_ARGS(lwlsn), LSN_FORMAT_ARGS(lsn)); + LWLockRelease(lfc_lock); + return false; + } + + entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_ENTER, &found); + + if (found) + { + state = GET_STATE(entry, chunk_offs); + if (state != UNAVAILABLE) { + /* Do not rewrite existed LFC entry */ + LWLockRelease(lfc_lock); + return false; + } + /* + * Unlink entry from LRU list to pin it for the duration of IO + * operation + */ + if (entry->access_count++ == 0) + dlist_delete(&entry->list_node); + } + else + { + if (!lfc_init_new_entry(entry, hash)) + { + /* + * We can't process this chunk due to lack of space in LFC, + * so skip to the next one + */ + LWLockRelease(lfc_lock); + return false; + } + } + + generation = lfc_ctl->generation; + entry_offset = entry->offset; + + SET_STATE(entry, chunk_offs, PENDING); + + LWLockRelease(lfc_lock); + + pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_WRITE); + INSTR_TIME_SET_CURRENT(io_start); + rc = pwrite(lfc_desc, buffer, BLCKSZ, + ((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ); + INSTR_TIME_SET_CURRENT(io_end); + pgstat_report_wait_end(); + + if (rc != BLCKSZ) + { + lfc_disable("write"); + } + else + { + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + + if (lfc_ctl->generation == generation) + { + uint64 time_spent_us; + CriticalAssert(LFC_ENABLED()); + /* Place entry to the head of LRU list */ + CriticalAssert(entry->access_count > 0); + + lfc_ctl->writes += 1; + INSTR_TIME_SUBTRACT(io_start, io_end); + time_spent_us = INSTR_TIME_GET_MICROSEC(io_start); + lfc_ctl->time_write += time_spent_us; + inc_page_cache_write_wait(time_spent_us); + + if (--entry->access_count == 0) + dlist_push_tail(&lfc_ctl->lru, &entry->list_node); + + state = GET_STATE(entry, chunk_offs); + if (state == REQUESTED) { + ConditionVariableBroadcast(cv); + } + if (state != AVAILABLE) + { + lfc_ctl->used_pages += 1; + SET_STATE(entry, chunk_offs, AVAILABLE); + } + } + else + { + lfc_close_file(); + } + 
LWLockRelease(lfc_lock); + } + return true; +} + /* * Put page in local file cache. * If cache is full then evict some other page. @@ -855,15 +1115,21 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */ return; - if (!lfc_ensure_opened()) - return; - CopyNRelFileInfoToBufTag(tag, rinfo); tag.forkNum = forkNum; CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); - /* + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + + if (!LFC_ENABLED() || !lfc_ensure_opened()) + { + LWLockRelease(lfc_lock); + return; + } + generation = lfc_ctl->generation; + + /* * For every chunk that has blocks we're interested in, we * 1. get the chunk header * 2. Check if the chunk actually has the blocks we're interested in @@ -878,6 +1144,8 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, int chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1); int blocks_in_chunk = Min(nblocks, BLOCKS_PER_CHUNK - (blkno % BLOCKS_PER_CHUNK)); instr_time io_start, io_end; + ConditionVariable* cv; + Assert(blocks_in_chunk > 0); for (int i = 0; i < blocks_in_chunk; i++) @@ -888,14 +1156,7 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1); hash = get_hash_value(lfc_hash, &tag); - - LWLockAcquire(lfc_lock, LW_EXCLUSIVE); - - if (!LFC_ENABLED()) - { - LWLockRelease(lfc_lock); - return; - } + cv = &lfc_ctl->cv[hash % N_COND_VARS]; entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_ENTER, &found); @@ -908,92 +1169,50 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, if (entry->access_count++ == 0) dlist_delete(&entry->list_node); } - /*----------- - * If the chunk wasn't already in the LFC then we have these - * options, in order of preference: - * - * Unless there is no space available, we can: - * 1. Use an entry from the `holes` list, and - * 2. Create a new entry. - * We can always, regardless of space in the LFC: - * 3. evict an entry from LRU, and - * 4. ignore the write operation (the least favorite option) - */ - else if (lfc_ctl->used < lfc_ctl->limit) - { - if (!dlist_is_empty(&lfc_ctl->holes)) - { - /* We can reuse a hole that was left behind when the LFC was shrunk previously */ - FileCacheEntry *hole = dlist_container(FileCacheEntry, list_node, - dlist_pop_head_node(&lfc_ctl->holes)); - uint32 offset = hole->offset; - bool hole_found; - - hash_search_with_hash_value(lfc_hash, &hole->key, - hole->hash, HASH_REMOVE, &hole_found); - CriticalAssert(hole_found); - - lfc_ctl->used += 1; - entry->offset = offset; /* reuse the hole */ - } - else - { - lfc_ctl->used += 1; - entry->offset = lfc_ctl->size++;/* allocate new chunk at end - * of file */ - } - } - /* - * We've already used up all allocated LFC entries. - * - * If we can clear an entry from the LRU, do that. - * If we can't (e.g. because all other slots are being accessed) - * then we will remove this entry from the hash and continue - * on to the next chunk, as we may not exceed the limit. 
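lfc_prefetch() above refuses to store a received prefetch result if the page was modified after the prefetch request was issued, which it detects by comparing the last-written LSN against the request's not_modified_since LSN. A tiny standalone sketch of that gate follows; the helper name and the literal LSN values are invented for the example.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t XLogRecPtr;

/* Hypothetical stand-in for the check done under the LFC lock: the page image
 * may only be cached if nothing was written to it after the prefetch request
 * was issued (i.e. the last-written LSN does not exceed not_modified_since). */
static bool
prefetched_page_still_current(XLogRecPtr last_written_lsn, XLogRecPtr not_modified_since)
{
    return last_written_lsn <= not_modified_since;
}

int main(void)
{
    printf("%d\n", prefetched_page_still_current(0x16B3A80, 0x16B3C00)); /* 1: ok to cache   */
    printf("%d\n", prefetched_page_still_current(0x16B3D00, 0x16B3C00)); /* 0: stale, skip it */
    return 0;
}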
- */ - else if (!dlist_is_empty(&lfc_ctl->lru)) - { - /* Cache overflow: evict least recently used chunk */ - FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, - dlist_pop_head_node(&lfc_ctl->lru)); - - for (int i = 0; i < BLOCKS_PER_CHUNK; i++) - { - lfc_ctl->used_pages -= (victim->bitmap[i >> 5] >> (i & 31)) & 1; - } - - CriticalAssert(victim->access_count == 0); - entry->offset = victim->offset; /* grab victim's chunk */ - hash_search_with_hash_value(lfc_hash, &victim->key, - victim->hash, HASH_REMOVE, NULL); - neon_log(DEBUG2, "Swap file cache page"); - } else { - /* Can't add this chunk - we don't have the space for it */ - hash_search_with_hash_value(lfc_hash, &entry->key, hash, - HASH_REMOVE, NULL); - - /* - * We can't process this chunk due to lack of space in LFC, - * so skip to the next one - */ - LWLockRelease(lfc_lock); - blkno += blocks_in_chunk; - buf_offset += blocks_in_chunk; - nblocks -= blocks_in_chunk; - continue; + if (!lfc_init_new_entry(entry, hash)) + { + /* + * We can't process this chunk due to lack of space in LFC, + * so skip to the next one + */ + blkno += blocks_in_chunk; + buf_offset += blocks_in_chunk; + nblocks -= blocks_in_chunk; + continue; + } } - if (!found) - { - entry->access_count = 1; - entry->hash = hash; - memset(entry->bitmap, 0, sizeof entry->bitmap); - } - - generation = lfc_ctl->generation; entry_offset = entry->offset; + + for (int i = 0; i < blocks_in_chunk; i++) + { + FileCacheBlockState state = UNAVAILABLE; + bool sleeping = false; + while (lfc_ctl->generation == generation) + { + state = GET_STATE(entry, chunk_offs + i); + if (state == PENDING) { + SET_STATE(entry, chunk_offs + i, REQUESTED); + } else if (state != REQUESTED) { + SET_STATE(entry, chunk_offs + i, PENDING); + break; + } + if (!sleeping) + { + ConditionVariablePrepareToSleep(cv); + sleeping = true; + } + LWLockRelease(lfc_lock); + ConditionVariableTimedSleep(cv, CV_WAIT_TIMEOUT, WAIT_EVENT_NEON_LFC_CV_WAIT); + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + } + if (sleeping) + { + ConditionVariableCancelSleep(); + } + } LWLockRelease(lfc_lock); pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_WRITE); @@ -1006,6 +1225,7 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, if (rc != BLCKSZ * blocks_in_chunk) { lfc_disable("write"); + return; } else { @@ -1029,18 +1249,30 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, for (int i = 0; i < blocks_in_chunk; i++) { - lfc_ctl->used_pages += 1 - ((entry->bitmap[(chunk_offs + i) >> 5] >> ((chunk_offs + i) & 31)) & 1); - entry->bitmap[(chunk_offs + i) >> 5] |= - ((uint32)1 << ((chunk_offs + i) & 31)); + FileCacheBlockState state = GET_STATE(entry, chunk_offs + i); + if (state == REQUESTED) + { + ConditionVariableBroadcast(cv); + } + if (state != AVAILABLE) + { + lfc_ctl->used_pages += 1; + SET_STATE(entry, chunk_offs + i, AVAILABLE); + } } } - - LWLockRelease(lfc_lock); + else + { + /* stop iteration if LFC was disabled */ + lfc_close_file(); + break; + } } blkno += blocks_in_chunk; buf_offset += blocks_in_chunk; nblocks -= blocks_in_chunk; } + LWLockRelease(lfc_lock); } typedef struct @@ -1127,6 +1359,16 @@ neon_get_lfc_stats(PG_FUNCTION_ARGS) if (lfc_ctl) value = lfc_ctl->used_pages; break; + case 6: + key = "file_cache_evicted_pages"; + if (lfc_ctl) + value = lfc_ctl->evicted_pages; + break; + case 7: + key = "file_cache_limit"; + if (lfc_ctl) + value = lfc_ctl->limit; + break; default: SRF_RETURN_DONE(funcctx); } @@ -1250,8 +1492,8 @@ local_cache_pages(PG_FUNCTION_ARGS) 
hash_seq_init(&status, lfc_hash); while ((entry = hash_seq_search(&status)) != NULL) { - for (int i = 0; i < CHUNK_BITMAP_SIZE; i++) - n_pages += pg_popcount32(entry->bitmap[i]); + for (int i = 0; i < BLOCKS_PER_CHUNK; i++) + n_pages += GET_STATE(entry, i) == AVAILABLE; } } } @@ -1279,7 +1521,7 @@ local_cache_pages(PG_FUNCTION_ARGS) { for (int i = 0; i < BLOCKS_PER_CHUNK; i++) { - if (entry->bitmap[i >> 5] & ((uint32)1 << (i & 31))) + if (GET_STATE(entry, i) == AVAILABLE) { fctx->record[n].pageoffs = entry->offset * BLOCKS_PER_CHUNK + i; fctx->record[n].relfilenode = NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key)); diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index fc1aecd340..f71f11ff93 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -14,6 +14,8 @@ */ #include "postgres.h" +#include + #include "access/xlog.h" #include "common/hashfn.h" #include "fmgr.h" @@ -61,6 +63,9 @@ int neon_protocol_version = 2; static int max_reconnect_attempts = 60; static int stripe_size; +static int pageserver_response_log_timeout = 10000; +static int pageserver_response_disconnect_timeout = 120000; /* 2 minutes */ + typedef struct { char connstring[MAX_SHARDS][MAX_PAGESERVER_CONNSTRING_SIZE]; @@ -129,6 +134,11 @@ typedef struct uint64 nrequests_sent; uint64 nresponses_received; + /* State for the receive timeout mechanism in call_PQgetCopyData() */ + instr_time receive_start_time; /* when we started waiting */ + instr_time receive_last_log_time; /* when we last printed a log message for the wait */ + bool receive_logged; /* has the wait been logged */ + /*--- * WaitEventSet containing: * - WL_SOCKET_READABLE on 'conn' @@ -661,6 +671,9 @@ pageserver_connect(shardno_t shard_no, int elevel) shard->state = PS_Connected; shard->nrequests_sent = 0; shard->nresponses_received = 0; + INSTR_TIME_SET_ZERO(shard->receive_start_time); + INSTR_TIME_SET_ZERO(shard->receive_last_log_time); + shard->receive_logged = false; } /* FALLTHROUGH */ case PS_Connected: @@ -680,6 +693,33 @@ pageserver_connect(shardno_t shard_no, int elevel) Assert(false); } +static void +get_socket_stats(int socketfd, int *sndbuf, int *recvbuf) +{ + *sndbuf = -1; + *recvbuf = -1; + +#ifdef __linux__ + /* + * get kernel's send and recv queue size via ioctl + * https://elixir.bootlin.com/linux/v6.1.128/source/include/uapi/linux/sockios.h#L25-L27 + */ + if (socketfd != -1) + { + int ioctl_err; + + ioctl_err = ioctl(socketfd, SIOCOUTQ, sndbuf); + if (ioctl_err!= 0) { + *sndbuf = -errno; + } + ioctl_err = ioctl(socketfd, FIONREAD, recvbuf); + if (ioctl_err != 0) { + *recvbuf = -errno; + } + } +#endif +} + /* * A wrapper around PQgetCopyData that checks for interrupts while sleeping. */ @@ -690,46 +730,54 @@ call_PQgetCopyData(shardno_t shard_no, char **buffer) PageServer *shard = &page_servers[shard_no]; PGconn *pageserver_conn = shard->conn; instr_time now, - start_ts, since_start, - last_log_ts, since_last_log; - bool logged = false; - - /* - * As a debugging aid, if we don't get a response for a long time, print a - * log message. - * - * 10 s is a very generous threshold, normally we expect a response in a - * few milliseconds. We have metrics to track latencies in normal ranges, - * but in the cases that take exceptionally long, it's useful to log the - * exact timestamps. 
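get_socket_stats() above reports the kernel's unsent and unread byte counts for the pageserver socket. Purely as an illustration, the same two Linux ioctls can be exercised on a throwaway socket with the same negative-errno convention on failure (Linux-only):

#include <errno.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <unistd.h>
#include <linux/sockios.h>   /* SIOCOUTQ */

int main(void)
{
    int fd = socket(AF_INET, SOCK_STREAM, 0);
    int sndbuf = -1, recvbuf = -1;

    if (fd < 0)
    {
        perror("socket");
        return 1;
    }
    /* Bytes queued in the kernel but not yet sent to the peer. */
    if (ioctl(fd, SIOCOUTQ, &sndbuf) != 0)
        sndbuf = -errno;
    /* Bytes received by the kernel but not yet read by this process. */
    if (ioctl(fd, FIONREAD, &recvbuf) != 0)
        recvbuf = -errno;

    printf("sndbuf=%d recvbuf=%d\n", sndbuf, recvbuf);
    close(fd);
    return 0;
}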
- */ -#define LOG_INTERVAL_MS INT64CONST(10 * 1000) - - INSTR_TIME_SET_CURRENT(now); - start_ts = last_log_ts = now; - INSTR_TIME_SET_ZERO(since_last_log); retry: ret = PQgetCopyData(pageserver_conn, buffer, 1 /* async */ ); if (ret == 0) { - WaitEvent event; + WaitEvent occurred_event; + int noccurred; + double log_timeout, + disconnect_timeout; long timeout; - timeout = Max(0, LOG_INTERVAL_MS - INSTR_TIME_GET_MILLISEC(since_last_log)); + /* + * Calculate time elapsed since the start, and since the last progress + * log message. On first call, remember the start time. + */ + INSTR_TIME_SET_CURRENT(now); + if (INSTR_TIME_IS_ZERO(shard->receive_start_time)) + { + shard->receive_start_time = now; + INSTR_TIME_SET_ZERO(since_start); + shard->receive_last_log_time = now; + INSTR_TIME_SET_ZERO(since_last_log); + shard->receive_logged = false; + } + else + { + since_start = now; + INSTR_TIME_SUBTRACT(since_start, shard->receive_start_time); + since_last_log = now; + INSTR_TIME_SUBTRACT(since_last_log, shard->receive_last_log_time); + } - /* Sleep until there's something to do */ - (void) WaitEventSetWait(shard->wes_read, timeout, &event, 1, - WAIT_EVENT_NEON_PS_READ); + /* Sleep until the log or disconnect timeout is reached. */ + log_timeout = Max(0, (double) pageserver_response_log_timeout - INSTR_TIME_GET_MILLISEC(since_last_log)); + disconnect_timeout = Max(0, (double) pageserver_response_disconnect_timeout - INSTR_TIME_GET_MILLISEC(since_start)); + timeout = (long) ceil(Min(log_timeout, disconnect_timeout)); + + noccurred = WaitEventSetWait(shard->wes_read, timeout, &occurred_event, 1, + WAIT_EVENT_NEON_PS_READ); ResetLatch(MyLatch); CHECK_FOR_INTERRUPTS(); /* Data available in socket? */ - if (event.events & WL_SOCKET_READABLE) + if (noccurred > 0 && (occurred_event.events & WL_SOCKET_READABLE) != 0) { if (!PQconsumeInput(pageserver_conn)) { @@ -739,49 +787,61 @@ retry: pfree(msg); return -1; } + goto retry; + } + + /* Timeout was reached, or we were interrupted for some other reason */ + INSTR_TIME_SET_CURRENT(now); + since_last_log = now; + INSTR_TIME_SUBTRACT(since_last_log, shard->receive_last_log_time); + since_start = now; + INSTR_TIME_SUBTRACT(since_start, shard->receive_start_time); + + /* + * As a debugging aid, if we don't get a response to a pageserver request + * for a long time, print a log message. + * + * The default neon.pageserver_response_log_timeout value, 10 s, is + * very generous. Normally we expect a response in a few + * milliseconds. We have metrics to track latencies in normal ranges, + * but in the cases that take exceptionally long, it's useful to log + * the exact timestamps. + */ + if (INSTR_TIME_GET_MILLISEC(since_last_log) >= pageserver_response_log_timeout) + { + int sndbuf; + int recvbuf; + + get_socket_stats(PQsocket(pageserver_conn), &sndbuf, &recvbuf); + + neon_shard_log(shard_no, LOG, + "no response received from pageserver for %0.3f s, still waiting (sent " UINT64_FORMAT " requests, received " UINT64_FORMAT " responses) (socket sndbuf=%d recvbuf=%d)", + INSTR_TIME_GET_DOUBLE(since_start), + shard->nrequests_sent, shard->nresponses_received, sndbuf, recvbuf); + shard->receive_last_log_time = now; + shard->receive_logged = true; } /* - * Print a message to the log if a long time has passed with no - * response. + * If an even longer time has passed without receiving a response from + * the pageserver, disconnect. That triggers a reconnection attempt + * in the caller. 
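The reworked wait loop above sleeps for whichever deadline comes first, the next progress log line or the disconnect cutoff. A standalone sketch of that arithmetic with the defaults of neon.pageserver_response_log_timeout and neon.pageserver_response_disconnect_timeout hard-coded (illustration only; link with -lm for ceil()):

#include <math.h>
#include <stdio.h>

#define Max(a, b) ((a) > (b) ? (a) : (b))
#define Min(a, b) ((a) < (b) ? (a) : (b))

/* GUC defaults, in milliseconds. */
static const double log_timeout_ms        = 10000.0;
static const double disconnect_timeout_ms = 120000.0;

/* How long the next WaitEventSetWait() call should sleep, given how long we
 * have waited overall and since the last progress log line. */
static long
next_wait_ms(double since_start_ms, double since_last_log_ms)
{
    double log_left        = Max(0, log_timeout_ms - since_last_log_ms);
    double disconnect_left = Max(0, disconnect_timeout_ms - since_start_ms);

    return (long) ceil(Min(log_left, disconnect_left));
}

int main(void)
{
    printf("%ld\n", next_wait_ms(0.0, 0.0));          /* 10000: wait until the first log line */
    printf("%ld\n", next_wait_ms(116000.0, 5000.0));  /* 4000: disconnect deadline is closer  */
    printf("%ld\n", next_wait_ms(119500.5, 9500.5));  /* 500: fractional wait rounded up      */
    return 0;
}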
+ * + * If this happens, the pageserver is likely dead and isn't coming + * back, or there's some kind of a network glitch and the connection + * is permanently gone. Without this, if the pageserver or the network + * connection is dead, it could take a very long time (15 minutes or + * more) until the TCP keepalive timeout notices that. Even if we + * would in fact get a response if we just waited a little longer, + * there's a good chance that we'll get the response sooner by + * reconnecting. */ - INSTR_TIME_SET_CURRENT(now); - since_last_log = now; - INSTR_TIME_SUBTRACT(since_last_log, last_log_ts); - if (INSTR_TIME_GET_MILLISEC(since_last_log) >= LOG_INTERVAL_MS) + if (INSTR_TIME_GET_MILLISEC(since_start) >= pageserver_response_disconnect_timeout) { - int sndbuf = -1; - int recvbuf = -1; -#ifdef __linux__ - int socketfd; -#endif - - since_start = now; - INSTR_TIME_SUBTRACT(since_start, start_ts); - -#ifdef __linux__ - /* - * get kernel's send and recv queue size via ioctl - * https://elixir.bootlin.com/linux/v6.1.128/source/include/uapi/linux/sockios.h#L25-L27 - */ - socketfd = PQsocket(pageserver_conn); - if (socketfd != -1) { - int ioctl_err; - ioctl_err = ioctl(socketfd, SIOCOUTQ, &sndbuf); - if (ioctl_err!= 0) { - sndbuf = -errno; - } - ioctl_err = ioctl(socketfd, FIONREAD, &recvbuf); - if (ioctl_err != 0) { - recvbuf = -errno; - } - } -#endif - neon_shard_log(shard_no, LOG, "no response received from pageserver for %0.3f s, still waiting (sent " UINT64_FORMAT " requests, received " UINT64_FORMAT " responses) (socket sndbuf=%d recvbuf=%d)", - INSTR_TIME_GET_DOUBLE(since_start), - shard->nrequests_sent, shard->nresponses_received, sndbuf, recvbuf); - last_log_ts = now; - logged = true; + neon_shard_log(shard_no, LOG, "no response from pageserver for %0.3f s, disconnecting", + INSTR_TIME_GET_DOUBLE(since_start)); + pageserver_disconnect(shard_no); + return -1; } goto retry; @@ -791,14 +851,18 @@ retry: * If we logged earlier that the response is taking a long time, log * another message when the response is finally received. 
*/ - if (logged) + if (shard->receive_logged) { INSTR_TIME_SET_CURRENT(now); since_start = now; - INSTR_TIME_SUBTRACT(since_start, start_ts); - neon_shard_log(shard_no, LOG, "received response from pageserver after %0.3f s", + INSTR_TIME_SUBTRACT(since_start, shard->receive_start_time); + neon_shard_log(shard_no, LOG, + "received response from pageserver after %0.3f s", INSTR_TIME_GET_DOUBLE(since_start)); } + INSTR_TIME_SET_ZERO(shard->receive_start_time); + INSTR_TIME_SET_ZERO(shard->receive_last_log_time); + shard->receive_logged = false; return ret; } @@ -972,9 +1036,17 @@ pageserver_receive(shardno_t shard_no) pfree(msg); } } + else if (rc == -1 && shard->state == PS_Disconnected) + { + /* If the state is 'Disconnected', the disconnection message was already logged */ + resp = NULL; + } else if (rc == -1) { - neon_shard_log(shard_no, LOG, "pageserver_receive disconnect: psql end of copy data: %s", pchomp(PQerrorMessage(pageserver_conn))); + char *msg = pchomp(PQerrorMessage(pageserver_conn)); + + neon_shard_log(shard_no, LOG, "pageserver_receive disconnect: psql end of copy data: %s", msg); + pfree(msg); pageserver_disconnect(shard_no); resp = NULL; } @@ -1260,6 +1332,26 @@ pg_init_libpagestore(void) 0, /* no flags required */ NULL, NULL, NULL); + DefineCustomIntVariable("neon.pageserver_response_log_timeout", + "pageserver response log timeout", + "If the pageserver doesn't respond to a request within this timeout, " + "a message is printed to the log.", + &pageserver_response_log_timeout, + 10000, 100, INT_MAX, + PGC_SUSET, + GUC_UNIT_MS, + NULL, NULL, NULL); + + DefineCustomIntVariable("neon.pageserver_response_disconnect_timeout", + "pageserver response disconnect timeout", + "If the pageserver doesn't respond to a request within this timeout, " + "disconnect and reconnect.", + &pageserver_response_disconnect_timeout, + 120000, 100, INT_MAX, + PGC_SUSET, + GUC_UNIT_MS, + NULL, NULL, NULL); + relsize_hash_init(); if (page_server != NULL) diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index ce2938cfd5..768d7ae9e8 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -12,6 +12,7 @@ #include "fmgr.h" #include "miscadmin.h" +#include "pgstat.h" #include "access/subtrans.h" #include "access/twophase.h" #include "access/xlog.h" @@ -56,6 +57,7 @@ uint32 WAIT_EVENT_NEON_LFC_MAINTENANCE; uint32 WAIT_EVENT_NEON_LFC_READ; uint32 WAIT_EVENT_NEON_LFC_TRUNCATE; uint32 WAIT_EVENT_NEON_LFC_WRITE; +uint32 WAIT_EVENT_NEON_LFC_CV_WAIT; uint32 WAIT_EVENT_NEON_PS_STARTING; uint32 WAIT_EVENT_NEON_PS_CONFIGURING; uint32 WAIT_EVENT_NEON_PS_SEND; @@ -409,6 +411,16 @@ ReportSearchPath(void) } } +#if PG_VERSION_NUM < 150000 +/* + * PG14 uses a separate backend for the stats collector, which has no access to shared memory. + * Since the AUX mechanism requires access to shared memory, persisting the pgstat.stat file + * is not supported in PG14. Hence there is no definition of the neon_pgstat_file_size_limit + * variable, so we have to declare it here.
+ */ +static int neon_pgstat_file_size_limit; +#endif + void _PG_init(void) { @@ -466,6 +478,15 @@ _PG_init(void) 0, NULL, NULL, NULL); + DefineCustomIntVariable("neon.pgstat_file_size_limit", + "Maximal size of pgstat.stat file saved in Neon storage", + "Zero value disables persisting pgstat.stat file", + &neon_pgstat_file_size_limit, + 0, 0, 1000000, /* disabled by default */ + PGC_SIGHUP, + GUC_UNIT_KB, + NULL, NULL, NULL); + /* * Important: This must happen after other parts of the extension are * loaded, otherwise any settings to GUCs that were set before the @@ -538,6 +559,7 @@ neon_shmem_startup_hook(void) WAIT_EVENT_NEON_LFC_READ = WaitEventExtensionNew("Neon/FileCache_Read"); WAIT_EVENT_NEON_LFC_TRUNCATE = WaitEventExtensionNew("Neon/FileCache_Truncate"); WAIT_EVENT_NEON_LFC_WRITE = WaitEventExtensionNew("Neon/FileCache_Write"); + WAIT_EVENT_NEON_LFC_CV_WAIT = WaitEventExtensionNew("Neon/FileCache_CvWait"); WAIT_EVENT_NEON_PS_STARTING = WaitEventExtensionNew("Neon/PS_Starting"); WAIT_EVENT_NEON_PS_CONFIGURING = WaitEventExtensionNew("Neon/PS_Configuring"); WAIT_EVENT_NEON_PS_SEND = WaitEventExtensionNew("Neon/PS_SendIO"); diff --git a/pgxn/neon/neon.h b/pgxn/neon/neon.h index 79aa88b8d3..912e09c3d3 100644 --- a/pgxn/neon/neon.h +++ b/pgxn/neon/neon.h @@ -28,6 +28,7 @@ extern uint32 WAIT_EVENT_NEON_LFC_MAINTENANCE; extern uint32 WAIT_EVENT_NEON_LFC_READ; extern uint32 WAIT_EVENT_NEON_LFC_TRUNCATE; extern uint32 WAIT_EVENT_NEON_LFC_WRITE; +extern uint32 WAIT_EVENT_NEON_LFC_CV_WAIT; extern uint32 WAIT_EVENT_NEON_PS_STARTING; extern uint32 WAIT_EVENT_NEON_PS_CONFIGURING; extern uint32 WAIT_EVENT_NEON_PS_SEND; @@ -38,6 +39,7 @@ extern uint32 WAIT_EVENT_NEON_WAL_DL; #define WAIT_EVENT_NEON_LFC_READ WAIT_EVENT_BUFFILE_READ #define WAIT_EVENT_NEON_LFC_TRUNCATE WAIT_EVENT_BUFFILE_TRUNCATE #define WAIT_EVENT_NEON_LFC_WRITE WAIT_EVENT_BUFFILE_WRITE +#define WAIT_EVENT_NEON_LFC_CV_WAIT WAIT_EVENT_BUFFILE_READ #define WAIT_EVENT_NEON_PS_STARTING PG_WAIT_EXTENSION #define WAIT_EVENT_NEON_PS_CONFIGURING PG_WAIT_EXTENSION #define WAIT_EVENT_NEON_PS_SEND PG_WAIT_EXTENSION diff --git a/pgxn/neon/neon_utils.c b/pgxn/neon/neon_utils.c index 1fb4ed9522..1fad44bd58 100644 --- a/pgxn/neon/neon_utils.c +++ b/pgxn/neon/neon_utils.c @@ -51,6 +51,26 @@ HexDecodeString(uint8 *result, char *input, int nbytes) return true; } +/* -------------------------------- + * pq_getmsgint16 - get a binary 2-byte int from a message buffer + * -------------------------------- + */ +uint16 +pq_getmsgint16(StringInfo msg) +{ + return pq_getmsgint(msg, 2); +} + +/* -------------------------------- + * pq_getmsgint32 - get a binary 4-byte int from a message buffer + * -------------------------------- + */ +uint32 +pq_getmsgint32(StringInfo msg) +{ + return pq_getmsgint(msg, 4); +} + /* -------------------------------- * pq_getmsgint32_le - get a binary 4-byte int from a message buffer in native (LE) order * -------------------------------- diff --git a/pgxn/neon/neon_utils.h b/pgxn/neon/neon_utils.h index 89683714f1..7480ac28cc 100644 --- a/pgxn/neon/neon_utils.h +++ b/pgxn/neon/neon_utils.h @@ -8,6 +8,8 @@ #endif bool HexDecodeString(uint8 *result, char *input, int nbytes); +uint16 pq_getmsgint16(StringInfo msg); +uint32 pq_getmsgint32(StringInfo msg); uint32 pq_getmsgint32_le(StringInfo msg); uint64 pq_getmsgint64_le(StringInfo msg); void pq_sendint32_le(StringInfo buf, uint32 i); diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index 7b748d7252..9faab1e4f0 100644 --- a/pgxn/neon/pagestore_client.h 
+++ b/pgxn/neon/pagestore_client.h @@ -233,6 +233,7 @@ extern char *neon_timeline; extern char *neon_tenant; extern int32 max_cluster_size; extern int neon_protocol_version; +extern bool lfc_store_prefetch_result; extern shardno_t get_shard_number(BufferTag* tag); @@ -301,14 +302,16 @@ extern bool lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno); extern int lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, int nblocks, bits8 *bitmap); -extern void lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno); extern void lfc_init(void); +extern bool lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, + const void* buffer, XLogRecPtr lsn); + static inline bool lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, void *buffer) { - bits8 rv = 0; + bits8 rv = 1; return lfc_readv_select(rinfo, forkNum, blkno, &buffer, 1, &rv) == 1; } diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index f1087a8ccb..091ad555e0 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -162,7 +162,7 @@ static uint32 local_request_counter; * UNUSED ------> REQUESTED --> RECEIVED * ^ : | | * | : v | - * | : TAG_UNUSED | + * | : TAG_REMAINS | * | : | | * +----------------+------------+ * : @@ -181,7 +181,7 @@ typedef enum PrefetchStatus /* must fit in uint8; bits 0x1 are used */ typedef enum { PRFSF_NONE = 0x0, - PRFSF_SEQ = 0x1, + PRFSF_LFC = 0x1 /* received prefetch result is stored in LFC */ } PrefetchRequestFlags; typedef struct PrefetchRequest @@ -305,7 +305,7 @@ GetLastWrittenLSNv(NRelFileInfo relfilenode, ForkNumber forknum, static void neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, neon_request_lsns *output, - BlockNumber nblocks, const bits8 *mask); + BlockNumber nblocks); static bool neon_prefetch_response_usable(neon_request_lsns *request_lsns, PrefetchRequest *slot); @@ -363,6 +363,7 @@ compact_prefetch_buffers(void) target_slot->buftag = source_slot->buftag; target_slot->shard_no = source_slot->shard_no; target_slot->status = source_slot->status; + target_slot->flags = source_slot->flags; target_slot->response = source_slot->response; target_slot->reqid = source_slot->reqid; target_slot->request_lsns = source_slot->request_lsns; @@ -452,6 +453,18 @@ prefetch_pump_state(void) /* update slot state */ slot->status = PRFS_RECEIVED; slot->response = response; + + if (response->tag == T_NeonGetPageResponse && !(slot->flags & PRFSF_LFC) && lfc_store_prefetch_result) + { + /* + * Store prefetched result in LFC (please read comments to lfc_prefetch + * explaining why it can be done without holding shared buffer lock + */ + if (lfc_prefetch(BufTagGetNRelFileInfo(slot->buftag), slot->buftag.forkNum, slot->buftag.blockNum, ((NeonGetPageResponse*)response)->page, slot->request_lsns.not_modified_since)) + { + slot->flags |= PRFSF_LFC; + } + } } } @@ -474,8 +487,7 @@ readahead_buffer_resize(int newsize, void *extra) */ if (MyPState->n_requests_inflight > newsize) { - Assert(MyPState->ring_unused >= MyPState->n_requests_inflight - newsize); - prefetch_wait_for(MyPState->ring_unused - (MyPState->n_requests_inflight - newsize)); + prefetch_wait_for(MyPState->ring_unused - newsize - 1); Assert(MyPState->n_requests_inflight <= newsize); } @@ -714,6 +726,18 @@ prefetch_read(PrefetchRequest *slot) /* update slot state */ slot->status = PRFS_RECEIVED; slot->response = response; + + if (response->tag == T_NeonGetPageResponse && !(slot->flags & PRFSF_LFC) && 
lfc_store_prefetch_result) + { + /* + * Store prefetched result in LFC (please read comments to lfc_prefetch + * explaining why it can be done without holding shared buffer lock + */ + if (lfc_prefetch(BufTagGetNRelFileInfo(buftag), buftag.forkNum, buftag.blockNum, ((NeonGetPageResponse*)response)->page, slot->request_lsns.not_modified_since)) + { + slot->flags |= PRFSF_LFC; + } + } return true; } else @@ -865,7 +889,7 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns else neon_get_request_lsns(BufTagGetNRelFileInfo(slot->buftag), slot->buftag.forkNum, slot->buftag.blockNum, - &slot->request_lsns, 1, NULL); + &slot->request_lsns, 1); request.hdr.lsn = slot->request_lsns.request_lsn; request.hdr.not_modified_since = slot->request_lsns.not_modified_since; @@ -891,6 +915,74 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns Assert(!found); } +/* + * Lookup of already received prefetch requests. Only already received responses matching required LSNs are accepted. + * Present pages are marked in "mask" bitmap and total number of such pages is returned. + */ +static int +prefetch_lookupv(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blocknum, neon_request_lsns *lsns, + BlockNumber nblocks, void **buffers, bits8 *mask) +{ + int hits = 0; + PrefetchRequest hashkey; + + /* + * Use an intermediate PrefetchRequest struct as the hash key to ensure + * correct alignment and that the padding bytes are cleared. + */ + memset(&hashkey.buftag, 0, sizeof(BufferTag)); + CopyNRelFileInfoToBufTag(hashkey.buftag, rinfo); + hashkey.buftag.forkNum = forknum; + + for (int i = 0; i < nblocks; i++) + { + PrfHashEntry *entry; + + hashkey.buftag.blockNum = blocknum + i; + entry = prfh_lookup(MyPState->prf_hash, &hashkey); + + if (entry != NULL) + { + PrefetchRequest *slot = entry->slot; + uint64 ring_index = slot->my_ring_index; + Assert(slot == GetPrfSlot(ring_index)); + + Assert(slot->status != PRFS_UNUSED); + Assert(MyPState->ring_last <= ring_index && + ring_index < MyPState->ring_unused); + Assert(BufferTagsEqual(&slot->buftag, &hashkey.buftag)); + + if (slot->status != PRFS_RECEIVED) + continue; + + /* + * If the caller specified a request LSN to use, only accept + * prefetch responses that satisfy that request. 
+ */ + if (!neon_prefetch_response_usable(&lsns[i], slot)) + continue; + + memcpy(buffers[i], ((NeonGetPageResponse*)slot->response)->page, BLCKSZ); + prefetch_set_unused(ring_index); + BITMAP_SET(mask, i); + + hits += 1; + inc_getpage_wait(0); + } + } + pgBufferUsage.prefetch.hits += hits; + return hits; +} + +#if PG_MAJORVERSION_NUM < 17 +static bool +prefetch_lookup(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkn, neon_request_lsns *lsns, void *buffer) +{ + bits8 present = 0; + return prefetch_lookupv(rinfo, forkNum, blkn, lsns, 1, &buffer, &present) != 0; +} +#endif + /* * prefetch_register_bufferv() - register and prefetch buffers * @@ -1014,8 +1106,6 @@ Retry: /* The buffered request is good enough, return that index */ if (is_prefetch) pgBufferUsage.prefetch.duplicates++; - else - pgBufferUsage.prefetch.hits++; continue; } } @@ -1117,6 +1207,7 @@ Retry: slot->buftag = hashkey.buftag; slot->shard_no = get_shard_number(&tag); slot->my_ring_index = ring_index; + slot->flags = 0; min_ring_index = Min(min_ring_index, ring_index); @@ -2057,8 +2148,7 @@ GetLastWrittenLSNv(NRelFileInfo relfilenode, ForkNumber forknum, */ static void neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, - neon_request_lsns *output, BlockNumber nblocks, - const bits8 *mask) + neon_request_lsns *output, BlockNumber nblocks) { XLogRecPtr last_written_lsns[PG_IOV_MAX]; @@ -2146,9 +2236,6 @@ neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, neon_request_lsns *result = &output[i]; XLogRecPtr last_written_lsn = last_written_lsns[i]; - if (PointerIsValid(mask) && !BITMAP_ISSET(mask, i)) - continue; - if (last_written_lsn > replay_lsn) { /* GetCurrentReplayRecPtr was introduced in v15 */ @@ -2191,8 +2278,6 @@ neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, neon_request_lsns *result = &output[i]; XLogRecPtr last_written_lsn = last_written_lsns[i]; - if (PointerIsValid(mask) && !BITMAP_ISSET(mask, i)) - continue; /* * Use the latest LSN that was evicted from the buffer cache as the * 'not_modified_since' hint. 
Any pages modified by later WAL records @@ -2414,7 +2499,7 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum) } neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, - REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1, NULL); + REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1); { NeonExistsRequest request = { .hdr.tag = T_NeonExistsRequest, @@ -2833,8 +2918,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, while (nblocks > 0) { int iterblocks = Min(nblocks, PG_IOV_MAX); - bits8 lfc_present[PG_IOV_MAX / 8]; - memset(lfc_present, 0, sizeof(lfc_present)); + bits8 lfc_present[PG_IOV_MAX / 8] = {0}; if (lfc_cache_containsv(InfoFromSMgrRel(reln), forknum, blocknum, iterblocks, lfc_present) == iterblocks) @@ -2845,12 +2929,13 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, } tag.blockNum = blocknum; - + for (int i = 0; i < PG_IOV_MAX / 8; i++) lfc_present[i] = ~(lfc_present[i]); ring_index = prefetch_register_bufferv(tag, NULL, iterblocks, lfc_present, true); + nblocks -= iterblocks; blocknum += iterblocks; @@ -3106,7 +3191,8 @@ Retry: } } memcpy(buffer, getpage_resp->page, BLCKSZ); - lfc_write(rinfo, forkNum, blockno, buffer); + if (!lfc_store_prefetch_result) + lfc_write(rinfo, forkNum, blockno, buffer); break; } case T_NeonErrorResponse: @@ -3191,6 +3277,17 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } + /* Try to read PS results if they are available */ + prefetch_pump_state(); + + neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1); + + if (prefetch_lookup(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, buffer)) + { + /* Prefetch hit */ + return; + } + /* Try to read from local file cache */ if (lfc_read(InfoFromSMgrRel(reln), forkNum, blkno, buffer)) { @@ -3198,9 +3295,11 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer return; } - neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1, NULL); neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsns, buffer); + /* + * Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes. 
+ */ prefetch_pump_state(); #ifdef DEBUG_COMPARE_LOCAL @@ -3281,11 +3380,14 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer #if PG_MAJORVERSION_NUM >= 17 static void neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - void **buffers, BlockNumber nblocks) + void **buffers, BlockNumber nblocks) { + bits8 prefetch_hits[PG_IOV_MAX / 8] = {0}; + bits8 lfc_hits[PG_IOV_MAX / 8]; bits8 read[PG_IOV_MAX / 8]; neon_request_lsns request_lsns[PG_IOV_MAX]; int lfc_result; + int prefetch_result; switch (reln->smgr_relpersistence) { @@ -3308,38 +3410,52 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, neon_log(ERROR, "Read request too large: %d is larger than max %d", nblocks, PG_IOV_MAX); - memset(read, 0, sizeof(read)); + /* Try to read PS results if they are available */ + prefetch_pump_state(); + + neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, blocknum, + request_lsns, nblocks); + + + prefetch_result = prefetch_lookupv(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns, nblocks, buffers, prefetch_hits); + + if (prefetch_result == nblocks) + return; + + /* invert the result: exclude prefetched blocks */ + for (int i = 0; i < PG_IOV_MAX / 8; i++) + lfc_hits[i] = ~prefetch_hits[i]; /* Try to read from local file cache */ lfc_result = lfc_readv_select(InfoFromSMgrRel(reln), forknum, blocknum, buffers, - nblocks, read); + nblocks, lfc_hits); if (lfc_result > 0) MyNeonCounters->file_cache_hits_total += lfc_result; /* Read all blocks from LFC, so we're done */ - if (lfc_result == nblocks) + if (prefetch_result + lfc_result == nblocks) return; - if (lfc_result == -1) + if (lfc_result <= 0) { /* can't use the LFC result, so read all blocks from PS */ for (int i = 0; i < PG_IOV_MAX / 8; i++) - read[i] = 0xFF; + read[i] = ~prefetch_hits[i]; } else { /* invert the result: exclude blocks read from lfc */ for (int i = 0; i < PG_IOV_MAX / 8; i++) - read[i] = ~(read[i]); + read[i] = ~(prefetch_hits[i] | lfc_hits[i]); } - neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, blocknum, - request_lsns, nblocks, read); - neon_read_at_lsnv(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns, buffers, nblocks, read); + /* + * Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes. + */ prefetch_pump_state(); #ifdef DEBUG_COMPARE_LOCAL @@ -3611,7 +3727,7 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum) } neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, - REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1, NULL); + REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1); { NeonNblocksRequest request = { @@ -3696,7 +3812,7 @@ neon_dbsize(Oid dbNode) NRelFileInfo dummy_node = {0}; neon_get_request_lsns(dummy_node, MAIN_FORKNUM, - REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1, NULL); + REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1); { NeonDbSizeRequest request = { @@ -4431,7 +4547,12 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id) if (no_redo_needed) { SetLastWrittenLSNForBlock(end_recptr, rinfo, forknum, blkno); - lfc_evict(rinfo, forknum, blkno); + /* + * Redo changes if page exists in LFC. + * We should perform this check after assigning LwLSN to prevent + * prefetching of some older version of the page by some other backend. 
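neon_readv() above splits each batch of blocks across three sources, taking already-received prefetch responses first, then the local file cache, then the pageserver, and it tracks the split with bits8 masks. A standalone model of that mask arithmetic; PG_IOV_MAX is assumed to be 32 here and the BITMAP_* macros are re-declared to mirror the Postgres ones:

#include <stdint.h>
#include <stdio.h>

typedef uint8_t bits8;

#define PG_IOV_MAX 32            /* assumed batch limit, for the sketch only */
#define BITMAP_ISSET(m, i) (((m)[(i) / 8] >> ((i) % 8)) & 1)
#define BITMAP_SET(m, i)   ((m)[(i) / 8] |= (bits8) (1 << ((i) % 8)))

int main(void)
{
    bits8 prefetch_hits[PG_IOV_MAX / 8] = {0};
    bits8 lfc_hits[PG_IOV_MAX / 8] = {0};
    bits8 read_mask[PG_IOV_MAX / 8];
    int   nblocks = 8;

    /* Blocks 1 and 5 were satisfied from already-received prefetch responses. */
    BITMAP_SET(prefetch_hits, 1);
    BITMAP_SET(prefetch_hits, 5);

    /* Of the remaining blocks, the local file cache happened to hold 0 and 2.
     * In the real code lfc_readv_select() is handed ~prefetch_hits as its
     * selector and leaves only the bits it actually satisfied set. */
    BITMAP_SET(lfc_hits, 0);
    BITMAP_SET(lfc_hits, 2);

    /* Whatever neither source provided must be requested from the pageserver. */
    for (int i = 0; i < PG_IOV_MAX / 8; i++)
        read_mask[i] = (bits8) ~(prefetch_hits[i] | lfc_hits[i]);

    for (int i = 0; i < nblocks; i++)
        printf("block %d: %s\n", i,
               BITMAP_ISSET(prefetch_hits, i) ? "prefetch ring" :
               BITMAP_ISSET(lfc_hits, i)      ? "local file cache" :
               BITMAP_ISSET(read_mask, i)     ? "pageserver" : "unexpected");
    return 0;
}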
+ */ + no_redo_needed = !lfc_cache_contains(rinfo, forknum, blkno); } LWLockRelease(partitionLock); diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index 7472fd6afc..356895aa82 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -70,6 +70,7 @@ static bool SendAppendRequests(Safekeeper *sk); static bool RecvAppendResponses(Safekeeper *sk); static XLogRecPtr CalculateMinFlushLsn(WalProposer *wp); static XLogRecPtr GetAcknowledgedByQuorumWALPosition(WalProposer *wp); +static void PAMessageSerialize(WalProposer *wp, ProposerAcceptorMessage *msg, StringInfo buf, int proto_version); static void HandleSafekeeperResponse(WalProposer *wp, Safekeeper *sk); static bool AsyncRead(Safekeeper *sk, char **buf, int *buf_size); static bool AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg); @@ -81,6 +82,8 @@ static char *FormatSafekeeperState(Safekeeper *sk); static void AssertEventsOkForState(uint32 events, Safekeeper *sk); static char *FormatEvents(WalProposer *wp, uint32 events); static void UpdateDonorShmem(WalProposer *wp); +static char *MembershipConfigurationToString(MembershipConfiguration *mconf); +static void MembershipConfigurationFree(MembershipConfiguration *mconf); WalProposer * WalProposerCreate(WalProposerConfig *config, walproposer_api api) @@ -137,25 +140,21 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api) } wp->quorum = wp->n_safekeepers / 2 + 1; + if (wp->config->proto_version != 2 && wp->config->proto_version != 3) + wp_log(FATAL, "unsupported safekeeper protocol version %d", wp->config->proto_version); + wp_log(LOG, "using safekeeper protocol version %d", wp->config->proto_version); + /* Fill the greeting package */ - wp->greetRequest.tag = 'g'; - wp->greetRequest.protocolVersion = SK_PROTOCOL_VERSION; - wp->greetRequest.pgVersion = PG_VERSION_NUM; - wp->api.strong_random(wp, &wp->greetRequest.proposerId, sizeof(wp->greetRequest.proposerId)); - wp->greetRequest.systemId = wp->config->systemId; - if (!wp->config->neon_timeline) - wp_log(FATAL, "neon.timeline_id is not provided"); - if (*wp->config->neon_timeline != '\0' && - !HexDecodeString(wp->greetRequest.timeline_id, wp->config->neon_timeline, 16)) - wp_log(FATAL, "could not parse neon.timeline_id, %s", wp->config->neon_timeline); + wp->greetRequest.pam.tag = 'g'; if (!wp->config->neon_tenant) wp_log(FATAL, "neon.tenant_id is not provided"); - if (*wp->config->neon_tenant != '\0' && - !HexDecodeString(wp->greetRequest.tenant_id, wp->config->neon_tenant, 16)) - wp_log(FATAL, "could not parse neon.tenant_id, %s", wp->config->neon_tenant); - - wp->greetRequest.timeline = wp->config->pgTimeline; - wp->greetRequest.walSegSize = wp->config->wal_segment_size; + wp->greetRequest.tenant_id = wp->config->neon_tenant; + if (!wp->config->neon_timeline) + wp_log(FATAL, "neon.timeline_id is not provided"); + wp->greetRequest.timeline_id = wp->config->neon_timeline; + wp->greetRequest.pg_version = PG_VERSION_NUM; + wp->greetRequest.system_id = wp->config->systemId; + wp->greetRequest.wal_seg_size = wp->config->wal_segment_size; wp->api.init_event_set(wp); @@ -165,12 +164,14 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api) void WalProposerFree(WalProposer *wp) { + MembershipConfigurationFree(&wp->mconf); for (int i = 0; i < wp->n_safekeepers; i++) { Safekeeper *sk = &wp->safekeeper[i]; Assert(sk->outbuf.data != NULL); pfree(sk->outbuf.data); + MembershipConfigurationFree(&sk->greetResponse.mconf); if (sk->voteResponse.termHistory.entries) 
pfree(sk->voteResponse.termHistory.entries); sk->voteResponse.termHistory.entries = NULL; @@ -308,6 +309,7 @@ ShutdownConnection(Safekeeper *sk) sk->state = SS_OFFLINE; sk->streamingAt = InvalidXLogRecPtr; + MembershipConfigurationFree(&sk->greetResponse.mconf); if (sk->voteResponse.termHistory.entries) pfree(sk->voteResponse.termHistory.entries); sk->voteResponse.termHistory.entries = NULL; @@ -598,11 +600,14 @@ static void SendStartWALPush(Safekeeper *sk) { WalProposer *wp = sk->wp; +#define CMD_LEN 512 + char cmd[CMD_LEN]; - if (!wp->api.conn_send_query(sk, "START_WAL_PUSH")) + snprintf(cmd, CMD_LEN, "START_WAL_PUSH (proto_version '%d')", wp->config->proto_version); + if (!wp->api.conn_send_query(sk, cmd)) { - wp_log(WARNING, "failed to send 'START_WAL_PUSH' query to safekeeper %s:%s: %s", - sk->host, sk->port, wp->api.conn_error_message(sk)); + wp_log(WARNING, "failed to send '%s' query to safekeeper %s:%s: %s", + cmd, sk->host, sk->port, wp->api.conn_error_message(sk)); ShutdownConnection(sk); return; } @@ -658,23 +663,33 @@ RecvStartWALPushResult(Safekeeper *sk) /* * Start handshake: first of all send information about the - * safekeeper. After sending, we wait on SS_HANDSHAKE_RECV for + * walproposer. After sending, we wait on SS_HANDSHAKE_RECV for * a response to finish the handshake. */ static void SendProposerGreeting(Safekeeper *sk) { + WalProposer *wp = sk->wp; + char *mconf_toml = MembershipConfigurationToString(&wp->greetRequest.mconf); + + wp_log(LOG, "sending ProposerGreeting to safekeeper %s:%s with mconf = %s", sk->host, sk->port, mconf_toml); + pfree(mconf_toml); + + PAMessageSerialize(wp, (ProposerAcceptorMessage *) &wp->greetRequest, + &sk->outbuf, wp->config->proto_version); + /* * On failure, logging & resetting the connection is handled. We just need * to handle the control flow. */ - BlockingWrite(sk, &sk->wp->greetRequest, sizeof(sk->wp->greetRequest), SS_HANDSHAKE_RECV); + BlockingWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_HANDSHAKE_RECV); } static void RecvAcceptorGreeting(Safekeeper *sk) { WalProposer *wp = sk->wp; + char *mconf_toml; /* * If our reading doesn't immediately succeed, any necessary error @@ -685,7 +700,10 @@ RecvAcceptorGreeting(Safekeeper *sk) if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->greetResponse)) return; - wp_log(LOG, "received AcceptorGreeting from safekeeper %s:%s, term=" INT64_FORMAT, sk->host, sk->port, sk->greetResponse.term); + mconf_toml = MembershipConfigurationToString(&sk->greetResponse.mconf); + wp_log(LOG, "received AcceptorGreeting from safekeeper %s:%s, node_id = %lu, mconf = %s, term=" UINT64_FORMAT, + sk->host, sk->port, sk->greetResponse.nodeId, mconf_toml, sk->greetResponse.term); + pfree(mconf_toml); /* Protocol is all good, move to voting. 
*/ sk->state = SS_VOTING; @@ -707,12 +725,9 @@ RecvAcceptorGreeting(Safekeeper *sk) wp->propTerm++; wp_log(LOG, "proposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT, wp->quorum, wp->propTerm); - wp->voteRequest = (VoteRequest) - { - .tag = 'v', - .term = wp->propTerm - }; - memcpy(wp->voteRequest.proposerId.data, wp->greetRequest.proposerId.data, UUID_LEN); + wp->voteRequest.pam.tag = 'v'; + wp->voteRequest.generation = wp->mconf.generation; + wp->voteRequest.term = wp->propTerm; } } else if (sk->greetResponse.term > wp->propTerm) @@ -759,12 +774,14 @@ SendVoteRequest(Safekeeper *sk) { WalProposer *wp = sk->wp; - /* We have quorum for voting, send our vote request */ - wp_log(LOG, "requesting vote from %s:%s for term " UINT64_FORMAT, sk->host, sk->port, wp->voteRequest.term); - /* On failure, logging & resetting is handled */ - if (!BlockingWrite(sk, &wp->voteRequest, sizeof(wp->voteRequest), SS_WAIT_VERDICT)) - return; + PAMessageSerialize(wp, (ProposerAcceptorMessage *) &wp->voteRequest, + &sk->outbuf, wp->config->proto_version); + /* We have quorum for voting, send our vote request */ + wp_log(LOG, "requesting vote from %s:%s for generation %u term " UINT64_FORMAT, sk->host, sk->port, + wp->voteRequest.generation, wp->voteRequest.term); + /* On failure, logging & resetting is handled */ + BlockingWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_WAIT_VERDICT); /* If successful, wait for read-ready with SS_WAIT_VERDICT */ } @@ -778,11 +795,12 @@ RecvVoteResponse(Safekeeper *sk) return; wp_log(LOG, - "got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X, timelineStartLsn=%X/%X", - sk->host, sk->port, sk->voteResponse.voteGiven, GetHighestTerm(&sk->voteResponse.termHistory), + "got VoteResponse from acceptor %s:%s, generation=%u, term=%lu, voteGiven=%u, last_log_term=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X", + sk->host, sk->port, sk->voteResponse.generation, sk->voteResponse.term, + sk->voteResponse.voteGiven, + GetHighestTerm(&sk->voteResponse.termHistory), LSN_FORMAT_ARGS(sk->voteResponse.flushLsn), - LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn), - LSN_FORMAT_ARGS(sk->voteResponse.timelineStartLsn)); + LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn)); /* * In case of acceptor rejecting our vote, bail out, but only if either it @@ -847,9 +865,9 @@ HandleElectedProposer(WalProposer *wp) * otherwise we must be sync-safekeepers and we have nothing to do then. * * Proceeding is not only pointless but harmful, because we'd give - * safekeepers term history starting with 0/0. These hacks will go away once - * we disable implicit timeline creation on safekeepers and create it with - * non zero LSN from the start. + * safekeepers term history starting with 0/0. These hacks will go away + * once we disable implicit timeline creation on safekeepers and create it + * with non zero LSN from the start. 
*/ if (wp->propEpochStartLsn == InvalidXLogRecPtr) { @@ -942,7 +960,6 @@ DetermineEpochStartLsn(WalProposer *wp) wp->propEpochStartLsn = InvalidXLogRecPtr; wp->donorEpoch = 0; wp->truncateLsn = InvalidXLogRecPtr; - wp->timelineStartLsn = InvalidXLogRecPtr; for (int i = 0; i < wp->n_safekeepers; i++) { @@ -959,20 +976,6 @@ DetermineEpochStartLsn(WalProposer *wp) wp->donor = i; } wp->truncateLsn = Max(wp->safekeeper[i].voteResponse.truncateLsn, wp->truncateLsn); - - if (wp->safekeeper[i].voteResponse.timelineStartLsn != InvalidXLogRecPtr) - { - /* timelineStartLsn should be the same everywhere or unknown */ - if (wp->timelineStartLsn != InvalidXLogRecPtr && - wp->timelineStartLsn != wp->safekeeper[i].voteResponse.timelineStartLsn) - { - wp_log(WARNING, - "inconsistent timelineStartLsn: current %X/%X, received %X/%X", - LSN_FORMAT_ARGS(wp->timelineStartLsn), - LSN_FORMAT_ARGS(wp->safekeeper[i].voteResponse.timelineStartLsn)); - } - wp->timelineStartLsn = wp->safekeeper[i].voteResponse.timelineStartLsn; - } } } @@ -995,22 +998,11 @@ DetermineEpochStartLsn(WalProposer *wp) if (wp->propEpochStartLsn == InvalidXLogRecPtr && !wp->config->syncSafekeepers) { wp->propEpochStartLsn = wp->truncateLsn = wp->api.get_redo_start_lsn(wp); - if (wp->timelineStartLsn == InvalidXLogRecPtr) - { - wp->timelineStartLsn = wp->api.get_redo_start_lsn(wp); - } wp_log(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(wp->propEpochStartLsn)); } pg_atomic_write_u64(&wp->api.get_shmem_state(wp)->propEpochStartLsn, wp->propEpochStartLsn); - /* - * Safekeepers are setting truncateLsn after timelineStartLsn is known, so - * it should never be zero at this point, if we know timelineStartLsn. - * - * timelineStartLsn can be zero only on the first syncSafekeepers run. - */ - Assert((wp->truncateLsn != InvalidXLogRecPtr) || - (wp->config->syncSafekeepers && wp->truncateLsn == wp->timelineStartLsn)); + Assert(wp->truncateLsn != InvalidXLogRecPtr || wp->config->syncSafekeepers); /* * We will be generating WAL since propEpochStartLsn, so we should set @@ -1053,10 +1045,11 @@ DetermineEpochStartLsn(WalProposer *wp) if (SkipXLogPageHeader(wp, wp->propEpochStartLsn) != wp->api.get_redo_start_lsn(wp)) { /* - * However, allow to proceed if last_log_term on the node which gave - * the highest vote (i.e. point where we are going to start writing) - * actually had been won by me; plain restart of walproposer not - * intervened by concurrent compute which wrote WAL is ok. + * However, allow to proceed if last_log_term on the node which + * gave the highest vote (i.e. point where we are going to start + * writing) actually had been won by me; plain restart of + * walproposer not intervened by concurrent compute which wrote + * WAL is ok. * * This avoids compute crash after manual term_bump. 
*/ @@ -1126,14 +1119,8 @@ SendProposerElected(Safekeeper *sk) { /* safekeeper is empty or no common point, start from the beginning */ sk->startStreamingAt = wp->propTermHistory.entries[0].lsn; - wp_log(LOG, "no common point with sk %s:%s, streaming since first term at %X/%X, timelineStartLsn=%X/%X, termHistory.n_entries=%u", - sk->host, sk->port, LSN_FORMAT_ARGS(sk->startStreamingAt), LSN_FORMAT_ARGS(wp->timelineStartLsn), wp->propTermHistory.n_entries); - - /* - * wp->timelineStartLsn == InvalidXLogRecPtr can be only when timeline - * is created manually (test_s3_wal_replay) - */ - Assert(sk->startStreamingAt == wp->timelineStartLsn || wp->timelineStartLsn == InvalidXLogRecPtr); + wp_log(LOG, "no common point with sk %s:%s, streaming since first term at %X/%X, termHistory.n_entries=%u", + sk->host, sk->port, LSN_FORMAT_ARGS(sk->startStreamingAt), wp->propTermHistory.n_entries); } else { @@ -1158,29 +1145,19 @@ SendProposerElected(Safekeeper *sk) Assert(sk->startStreamingAt <= wp->availableLsn); - msg.tag = 'e'; + msg.apm.tag = 'e'; + msg.generation = wp->mconf.generation; msg.term = wp->propTerm; msg.startStreamingAt = sk->startStreamingAt; msg.termHistory = &wp->propTermHistory; - msg.timelineStartLsn = wp->timelineStartLsn; lastCommonTerm = idx >= 0 ? wp->propTermHistory.entries[idx].term : 0; wp_log(LOG, - "sending elected msg to node " UINT64_FORMAT " term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s, timelineStartLsn=%X/%X", - sk->greetResponse.nodeId, msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port, LSN_FORMAT_ARGS(msg.timelineStartLsn)); - - resetStringInfo(&sk->outbuf); - pq_sendint64_le(&sk->outbuf, msg.tag); - pq_sendint64_le(&sk->outbuf, msg.term); - pq_sendint64_le(&sk->outbuf, msg.startStreamingAt); - pq_sendint32_le(&sk->outbuf, msg.termHistory->n_entries); - for (int i = 0; i < msg.termHistory->n_entries; i++) - { - pq_sendint64_le(&sk->outbuf, msg.termHistory->entries[i].term); - pq_sendint64_le(&sk->outbuf, msg.termHistory->entries[i].lsn); - } - pq_sendint64_le(&sk->outbuf, msg.timelineStartLsn); + "sending elected msg to node " UINT64_FORMAT " generation=%u term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s", + sk->greetResponse.nodeId, msg.generation, msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), + lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port); + PAMessageSerialize(wp, (ProposerAcceptorMessage *) &msg, &sk->outbuf, wp->config->proto_version); if (!AsyncWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_SEND_ELECTED_FLUSH)) return; @@ -1246,14 +1223,13 @@ static void PrepareAppendRequest(WalProposer *wp, AppendRequestHeader *req, XLogRecPtr beginLsn, XLogRecPtr endLsn) { Assert(endLsn >= beginLsn); - req->tag = 'a'; + req->apm.tag = 'a'; + req->generation = wp->mconf.generation; req->term = wp->propTerm; - req->epochStartLsn = wp->propEpochStartLsn; req->beginLsn = beginLsn; req->endLsn = endLsn; req->commitLsn = wp->commitLsn; req->truncateLsn = wp->truncateLsn; - req->proposerId = wp->greetRequest.proposerId; } /* @@ -1354,7 +1330,8 @@ SendAppendRequests(Safekeeper *sk) resetStringInfo(&sk->outbuf); /* write AppendRequest header */ - appendBinaryStringInfo(&sk->outbuf, (char *) req, sizeof(AppendRequestHeader)); + PAMessageSerialize(wp, (ProposerAcceptorMessage *) req, &sk->outbuf, wp->config->proto_version); + /* prepare for reading WAL into the outbuf */ 
enlargeStringInfo(&sk->outbuf, req->endLsn - req->beginLsn); sk->active_state = SS_ACTIVE_READ_WAL; } @@ -1367,14 +1344,17 @@ SendAppendRequests(Safekeeper *sk) req = &sk->appendRequest; req_len = req->endLsn - req->beginLsn; - /* We send zero sized AppenRequests as heartbeats; don't wal_read for these. */ + /* + * We send zero sized AppenRequests as heartbeats; don't wal_read + * for these. + */ if (req_len > 0) { switch (wp->api.wal_read(sk, - &sk->outbuf.data[sk->outbuf.len], - req->beginLsn, - req_len, - &errmsg)) + &sk->outbuf.data[sk->outbuf.len], + req->beginLsn, + req_len, + &errmsg)) { case NEON_WALREAD_SUCCESS: break; @@ -1382,7 +1362,7 @@ SendAppendRequests(Safekeeper *sk) return true; case NEON_WALREAD_ERROR: wp_log(WARNING, "WAL reading for node %s:%s failed: %s", - sk->host, sk->port, errmsg); + sk->host, sk->port, errmsg); ShutdownConnection(sk); return false; default: @@ -1470,11 +1450,11 @@ RecvAppendResponses(Safekeeper *sk) * Term has changed to higher one, probably another compute is * running. If this is the case we could PANIC as well because * likely it inserted some data and our basebackup is unsuitable - * anymore. However, we also bump term manually (term_bump endpoint) - * on safekeepers for migration purposes, in this case we do want - * compute to stay alive. So restart walproposer with FATAL instead - * of panicking; if basebackup is spoiled next election will notice - * this. + * anymore. However, we also bump term manually (term_bump + * endpoint) on safekeepers for migration purposes, in this case + * we do want compute to stay alive. So restart walproposer with + * FATAL instead of panicking; if basebackup is spoiled next + * election will notice this. */ wp_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT ", meaning another compute is running at the same time, and it conflicts with us", sk->host, sk->port, @@ -1509,7 +1489,7 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese for (i = 0; i < nkeys; i++) { - const char *key = pq_getmsgstring(reply_message); + const char *key = pq_getmsgrawstring(reply_message); unsigned int value_len = pq_getmsgint(reply_message, sizeof(int32)); if (strcmp(key, "current_timeline_size") == 0) @@ -1750,6 +1730,213 @@ HandleSafekeeperResponse(WalProposer *wp, Safekeeper *fromsk) } } +/* Serialize MembershipConfiguration into buf. */ +static void +MembershipConfigurationSerialize(MembershipConfiguration *mconf, StringInfo buf) +{ + uint32 i; + + pq_sendint32(buf, mconf->generation); + + pq_sendint32(buf, mconf->members.len); + for (i = 0; i < mconf->members.len; i++) + { + pq_sendint64(buf, mconf->members.m[i].node_id); + pq_send_ascii_string(buf, mconf->members.m[i].host); + pq_sendint16(buf, mconf->members.m[i].port); + } + + /* + * There is no special mark for absent new_members; zero members in + * invalid, so zero len means absent. 
+ */ + pq_sendint32(buf, mconf->new_members.len); + for (i = 0; i < mconf->new_members.len; i++) + { + pq_sendint64(buf, mconf->new_members.m[i].node_id); + pq_send_ascii_string(buf, mconf->new_members.m[i].host); + pq_sendint16(buf, mconf->new_members.m[i].port); + } +} + +/* Serialize proposer -> acceptor message into buf using specified version */ +static void +PAMessageSerialize(WalProposer *wp, ProposerAcceptorMessage *msg, StringInfo buf, int proto_version) +{ + /* both version are supported currently until we fully migrate to 3 */ + Assert(proto_version == 3 || proto_version == 2); + + resetStringInfo(buf); + + if (proto_version == 3) + { + /* + * v2 sends structs for some messages as is, so commonly send tag only + * for v3 + */ + pq_sendint8(buf, msg->tag); + + switch (msg->tag) + { + case 'g': + { + ProposerGreeting *m = (ProposerGreeting *) msg; + + pq_send_ascii_string(buf, m->tenant_id); + pq_send_ascii_string(buf, m->timeline_id); + MembershipConfigurationSerialize(&m->mconf, buf); + pq_sendint32(buf, m->pg_version); + pq_sendint64(buf, m->system_id); + pq_sendint32(buf, m->wal_seg_size); + break; + } + case 'v': + { + VoteRequest *m = (VoteRequest *) msg; + + pq_sendint32(buf, m->generation); + pq_sendint64(buf, m->term); + break; + + } + case 'e': + { + ProposerElected *m = (ProposerElected *) msg; + + pq_sendint32(buf, m->generation); + pq_sendint64(buf, m->term); + pq_sendint64(buf, m->startStreamingAt); + pq_sendint32(buf, m->termHistory->n_entries); + for (uint32 i = 0; i < m->termHistory->n_entries; i++) + { + pq_sendint64(buf, m->termHistory->entries[i].term); + pq_sendint64(buf, m->termHistory->entries[i].lsn); + } + break; + } + case 'a': + { + /* + * Note: this serializes only AppendRequestHeader, caller + * is expected to append WAL data later. + */ + AppendRequestHeader *m = (AppendRequestHeader *) msg; + + pq_sendint32(buf, m->generation); + pq_sendint64(buf, m->term); + pq_sendint64(buf, m->beginLsn); + pq_sendint64(buf, m->endLsn); + pq_sendint64(buf, m->commitLsn); + pq_sendint64(buf, m->truncateLsn); + break; + } + default: + wp_log(FATAL, "unexpected message type %c to serialize", msg->tag); + } + return; + } + + if (proto_version == 2) + { + switch (msg->tag) + { + case 'g': + { + /* v2 sent struct as is */ + ProposerGreeting *m = (ProposerGreeting *) msg; + ProposerGreetingV2 greetRequestV2; + + /* Fill also v2 struct. 
*/ + greetRequestV2.tag = 'g'; + greetRequestV2.protocolVersion = proto_version; + greetRequestV2.pgVersion = m->pg_version; + + /* + * v3 removed this field because it's easier to pass as + * libq or START_WAL_PUSH options + */ + memset(&greetRequestV2.proposerId, 0, sizeof(greetRequestV2.proposerId)); + greetRequestV2.systemId = wp->config->systemId; + if (*m->timeline_id != '\0' && + !HexDecodeString(greetRequestV2.timeline_id, m->timeline_id, 16)) + wp_log(FATAL, "could not parse neon.timeline_id, %s", m->timeline_id); + if (*m->tenant_id != '\0' && + !HexDecodeString(greetRequestV2.tenant_id, m->tenant_id, 16)) + wp_log(FATAL, "could not parse neon.tenant_id, %s", m->tenant_id); + + greetRequestV2.timeline = wp->config->pgTimeline; + greetRequestV2.walSegSize = wp->config->wal_segment_size; + + pq_sendbytes(buf, (char *) &greetRequestV2, sizeof(greetRequestV2)); + break; + } + case 'v': + { + /* v2 sent struct as is */ + VoteRequest *m = (VoteRequest *) msg; + VoteRequestV2 voteRequestV2; + + voteRequestV2.tag = m->pam.tag; + voteRequestV2.term = m->term; + /* removed field */ + memset(&voteRequestV2.proposerId, 0, sizeof(voteRequestV2.proposerId)); + pq_sendbytes(buf, (char *) &voteRequestV2, sizeof(voteRequestV2)); + break; + } + case 'e': + { + ProposerElected *m = (ProposerElected *) msg; + + pq_sendint64_le(buf, m->apm.tag); + pq_sendint64_le(buf, m->term); + pq_sendint64_le(buf, m->startStreamingAt); + pq_sendint32_le(buf, m->termHistory->n_entries); + for (int i = 0; i < m->termHistory->n_entries; i++) + { + pq_sendint64_le(buf, m->termHistory->entries[i].term); + pq_sendint64_le(buf, m->termHistory->entries[i].lsn); + } + /* + * Removed timeline_start_lsn. Still send it as a valid + * value until safekeepers taking it from term history are + * deployed. + */ + pq_sendint64_le(buf, m->termHistory->entries[0].lsn); + break; + } + case 'a': + + /* + * Note: this serializes only AppendRequestHeader, caller is + * expected to append WAL data later. + */ + { + /* v2 sent struct as is */ + AppendRequestHeader *m = (AppendRequestHeader *) msg; + AppendRequestHeaderV2 appendRequestHeaderV2; + + appendRequestHeaderV2.tag = m->apm.tag; + appendRequestHeaderV2.term = m->term; + appendRequestHeaderV2.epochStartLsn = 0; /* removed field */ + appendRequestHeaderV2.beginLsn = m->beginLsn; + appendRequestHeaderV2.endLsn = m->endLsn; + appendRequestHeaderV2.commitLsn = m->commitLsn; + appendRequestHeaderV2.truncateLsn = m->truncateLsn; + /* removed field */ + memset(&appendRequestHeaderV2.proposerId, 0, sizeof(appendRequestHeaderV2.proposerId)); + + pq_sendbytes(buf, (char *) &appendRequestHeaderV2, sizeof(appendRequestHeaderV2)); + break; + } + + default: + wp_log(FATAL, "unexpected message type %c to serialize", msg->tag); + } + return; + } + wp_log(FATAL, "unexpected proto_version %d", proto_version); +} + /* * Try to read CopyData message from i'th safekeeper, resetting connection on * failure. @@ -1779,6 +1966,37 @@ AsyncRead(Safekeeper *sk, char **buf, int *buf_size) return false; } +/* Deserialize membership configuration from buf to mconf. 
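As a concrete reference for the v3 framing produced by PAMessageSerialize() above, here is a minimal standalone sketch of the 13-byte VoteRequest payload: a one-byte tag 'v' followed by the generation and term in network byte order. Plain C helpers stand in for pq_sendint32/pq_sendint64, and the generation and term values are made up for illustration.

#include <stdint.h>
#include <stdio.h>

static size_t
put_be32(uint8_t *p, uint32_t v)
{
    for (int i = 0; i < 4; i++)
        p[i] = (uint8_t) (v >> (8 * (3 - i)));
    return 4;
}

static size_t
put_be64(uint8_t *p, uint64_t v)
{
    for (int i = 0; i < 8; i++)
        p[i] = (uint8_t) (v >> (8 * (7 - i)));
    return 8;
}

int
main(void)
{
    uint8_t     buf[13];
    size_t      off = 0;
    uint32_t    generation = 1;      /* example values only */
    uint64_t    term = 42;

    buf[off++] = 'v';                /* message tag */
    off += put_be32(buf + off, generation);
    off += put_be64(buf + off, term);

    printf("payload is %zu bytes:", off);    /* 13 */
    for (size_t i = 0; i < off; i++)
        printf(" %02x", buf[i]);
    printf("\n");
    return 0;
}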
*/ +static void +MembershipConfigurationDeserialize(MembershipConfiguration *mconf, StringInfo buf) +{ + uint32 i; + + mconf->generation = pq_getmsgint32(buf); + mconf->members.len = pq_getmsgint32(buf); + mconf->members.m = palloc0(sizeof(SafekeeperId) * mconf->members.len); + for (i = 0; i < mconf->members.len; i++) + { + const char *buf_host; + + mconf->members.m[i].node_id = pq_getmsgint64(buf); + buf_host = pq_getmsgrawstring(buf); + strlcpy(mconf->members.m[i].host, buf_host, sizeof(mconf->members.m[i].host)); + mconf->members.m[i].port = pq_getmsgint16(buf); + } + mconf->new_members.len = pq_getmsgint32(buf); + mconf->new_members.m = palloc0(sizeof(SafekeeperId) * mconf->new_members.len); + for (i = 0; i < mconf->new_members.len; i++) + { + const char *buf_host; + + mconf->new_members.m[i].node_id = pq_getmsgint64(buf); + buf_host = pq_getmsgrawstring(buf); + strlcpy(mconf->new_members.m[i].host, buf_host, sizeof(mconf->new_members.m[i].host)); + mconf->new_members.m[i].port = pq_getmsgint16(buf); + } +} + /* * Read next message with known type into provided struct, by reading a CopyData * block from the safekeeper's postgres connection, returning whether the read @@ -1787,6 +2005,8 @@ AsyncRead(Safekeeper *sk, char **buf, int *buf_size) * If the read needs more polling, we return 'false' and keep the state * unmodified, waiting until it becomes read-ready to try again. If it fully * failed, a warning is emitted and the connection is reset. + * + * Note: it pallocs if needed, i.e. for AcceptorGreeting and VoteResponse fields. */ static bool AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg) @@ -1795,82 +2015,154 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg) char *buf; int buf_size; - uint64 tag; + uint8 tag; StringInfoData s; if (!(AsyncRead(sk, &buf, &buf_size))) return false; + sk->latestMsgReceivedAt = wp->api.get_current_timestamp(wp); /* parse it */ s.data = buf; s.len = buf_size; + s.maxlen = buf_size; s.cursor = 0; - tag = pq_getmsgint64_le(&s); - if (tag != anymsg->tag) + if (wp->config->proto_version == 3) { - wp_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host, - sk->port, FormatSafekeeperState(sk)); - ResetConnection(sk); - return false; - } - sk->latestMsgReceivedAt = wp->api.get_current_timestamp(wp); - switch (tag) - { - case 'g': - { - AcceptorGreeting *msg = (AcceptorGreeting *) anymsg; - - msg->term = pq_getmsgint64_le(&s); - msg->nodeId = pq_getmsgint64_le(&s); - pq_getmsgend(&s); - return true; - } - - case 'v': - { - VoteResponse *msg = (VoteResponse *) anymsg; - - msg->term = pq_getmsgint64_le(&s); - msg->voteGiven = pq_getmsgint64_le(&s); - msg->flushLsn = pq_getmsgint64_le(&s); - msg->truncateLsn = pq_getmsgint64_le(&s); - msg->termHistory.n_entries = pq_getmsgint32_le(&s); - msg->termHistory.entries = palloc(sizeof(TermSwitchEntry) * msg->termHistory.n_entries); - for (int i = 0; i < msg->termHistory.n_entries; i++) + tag = pq_getmsgbyte(&s); + if (tag != anymsg->tag) + { + wp_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host, + sk->port, FormatSafekeeperState(sk)); + ResetConnection(sk); + return false; + } + switch (tag) + { + case 'g': { - msg->termHistory.entries[i].term = pq_getmsgint64_le(&s); - msg->termHistory.entries[i].lsn = pq_getmsgint64_le(&s); + AcceptorGreeting *msg = (AcceptorGreeting *) anymsg; + + msg->nodeId = pq_getmsgint64(&s); + MembershipConfigurationDeserialize(&msg->mconf, &s); + msg->term = pq_getmsgint64(&s); + 
pq_getmsgend(&s); + return true; } - msg->timelineStartLsn = pq_getmsgint64_le(&s); - pq_getmsgend(&s); - return true; - } + case 'v': + { + VoteResponse *msg = (VoteResponse *) anymsg; - case 'a': - { - AppendResponse *msg = (AppendResponse *) anymsg; + msg->generation = pq_getmsgint32(&s); + msg->term = pq_getmsgint64(&s); + msg->voteGiven = pq_getmsgbyte(&s); + msg->flushLsn = pq_getmsgint64(&s); + msg->truncateLsn = pq_getmsgint64(&s); + msg->termHistory.n_entries = pq_getmsgint32(&s); + msg->termHistory.entries = palloc(sizeof(TermSwitchEntry) * msg->termHistory.n_entries); + for (uint32 i = 0; i < msg->termHistory.n_entries; i++) + { + msg->termHistory.entries[i].term = pq_getmsgint64(&s); + msg->termHistory.entries[i].lsn = pq_getmsgint64(&s); + } + pq_getmsgend(&s); + return true; + } + case 'a': + { + AppendResponse *msg = (AppendResponse *) anymsg; - msg->term = pq_getmsgint64_le(&s); - msg->flushLsn = pq_getmsgint64_le(&s); - msg->commitLsn = pq_getmsgint64_le(&s); - msg->hs.ts = pq_getmsgint64_le(&s); - msg->hs.xmin.value = pq_getmsgint64_le(&s); - msg->hs.catalog_xmin.value = pq_getmsgint64_le(&s); - if (s.len > s.cursor) - ParsePageserverFeedbackMessage(wp, &s, &msg->ps_feedback); - else - msg->ps_feedback.present = false; - pq_getmsgend(&s); - return true; - } - - default: - { - Assert(false); - return false; - } + msg->generation = pq_getmsgint32(&s); + msg->term = pq_getmsgint64(&s); + msg->flushLsn = pq_getmsgint64(&s); + msg->commitLsn = pq_getmsgint64(&s); + msg->hs.ts = pq_getmsgint64(&s); + msg->hs.xmin.value = pq_getmsgint64(&s); + msg->hs.catalog_xmin.value = pq_getmsgint64(&s); + if (s.len > s.cursor) + ParsePageserverFeedbackMessage(wp, &s, &msg->ps_feedback); + else + msg->ps_feedback.present = false; + pq_getmsgend(&s); + return true; + } + default: + { + wp_log(FATAL, "unexpected message tag %c to read", (char) tag); + return false; + } + } } + else if (wp->config->proto_version == 2) + { + tag = pq_getmsgint64_le(&s); + if (tag != anymsg->tag) + { + wp_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host, + sk->port, FormatSafekeeperState(sk)); + ResetConnection(sk); + return false; + } + switch (tag) + { + case 'g': + { + AcceptorGreeting *msg = (AcceptorGreeting *) anymsg; + + msg->term = pq_getmsgint64_le(&s); + msg->nodeId = pq_getmsgint64_le(&s); + pq_getmsgend(&s); + return true; + } + + case 'v': + { + VoteResponse *msg = (VoteResponse *) anymsg; + + msg->term = pq_getmsgint64_le(&s); + msg->voteGiven = pq_getmsgint64_le(&s); + msg->flushLsn = pq_getmsgint64_le(&s); + msg->truncateLsn = pq_getmsgint64_le(&s); + msg->termHistory.n_entries = pq_getmsgint32_le(&s); + msg->termHistory.entries = palloc(sizeof(TermSwitchEntry) * msg->termHistory.n_entries); + for (int i = 0; i < msg->termHistory.n_entries; i++) + { + msg->termHistory.entries[i].term = pq_getmsgint64_le(&s); + msg->termHistory.entries[i].lsn = pq_getmsgint64_le(&s); + } + pq_getmsgint64_le(&s); /* timelineStartLsn */ + pq_getmsgend(&s); + return true; + } + + case 'a': + { + AppendResponse *msg = (AppendResponse *) anymsg; + + msg->term = pq_getmsgint64_le(&s); + msg->flushLsn = pq_getmsgint64_le(&s); + msg->commitLsn = pq_getmsgint64_le(&s); + msg->hs.ts = pq_getmsgint64_le(&s); + msg->hs.xmin.value = pq_getmsgint64_le(&s); + msg->hs.catalog_xmin.value = pq_getmsgint64_le(&s); + if (s.len > s.cursor) + ParsePageserverFeedbackMessage(wp, &s, &msg->ps_feedback); + else + msg->ps_feedback.present = false; + pq_getmsgend(&s); + return true; + } + + default: + 
{ + wp_log(FATAL, "unexpected message tag %c to read", (char) tag); + return false; + } + } + } + wp_log(FATAL, "unsupported proto_version %d", wp->config->proto_version); + return false; /* keep the compiler quiet */ } /* @@ -2246,3 +2538,45 @@ FormatEvents(WalProposer *wp, uint32 events) return (char *) &return_str; } + +/* Dump mconf as toml for observability / debugging. Result is palloc'ed. */ +static char * +MembershipConfigurationToString(MembershipConfiguration *mconf) +{ + StringInfoData s; + uint32 i; + + initStringInfo(&s); + appendStringInfo(&s, "{gen = %u", mconf->generation); + appendStringInfoString(&s, ", members = ["); + for (i = 0; i < mconf->members.len; i++) + { + if (i > 0) + appendStringInfoString(&s, ", "); + appendStringInfo(&s, "{node_id = %lu", mconf->members.m[i].node_id); + appendStringInfo(&s, ", host = %s", mconf->members.m[i].host); + appendStringInfo(&s, ", port = %u }", mconf->members.m[i].port); + } + appendStringInfo(&s, "], new_members = ["); + for (i = 0; i < mconf->new_members.len; i++) + { + if (i > 0) + appendStringInfoString(&s, ", "); + appendStringInfo(&s, "{node_id = %lu", mconf->new_members.m[i].node_id); + appendStringInfo(&s, ", host = %s", mconf->new_members.m[i].host); + appendStringInfo(&s, ", port = %u }", mconf->new_members.m[i].port); + } + appendStringInfoString(&s, "]}"); + return s.data; +} + +static void +MembershipConfigurationFree(MembershipConfiguration *mconf) +{ + if (mconf->members.m) + pfree(mconf->members.m); + mconf->members.m = NULL; + if (mconf->new_members.m) + pfree(mconf->new_members.m); + mconf->new_members.m = NULL; +} diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index d8c44f8182..eee55f924f 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -12,9 +12,6 @@ #include "neon_walreader.h" #include "pagestore_client.h" -#define SK_MAGIC 0xCafeCeefu -#define SK_PROTOCOL_VERSION 2 - #define MAX_SAFEKEEPERS 32 #define MAX_SEND_SIZE (XLOG_BLCKSZ * 16) /* max size of a single* WAL * message */ @@ -143,12 +140,71 @@ typedef uint64 term_t; /* neon storage node id */ typedef uint64 NNodeId; +/* + * Number uniquely identifying safekeeper membership configuration. + * This and following structs pair ones in membership.rs. + */ +typedef uint32 Generation; + +typedef struct SafekeeperId +{ + NNodeId node_id; + char host[MAXCONNINFO]; + uint16 port; +} SafekeeperId; + +/* Set of safekeepers. */ +typedef struct MemberSet +{ + uint32 len; /* number of members */ + SafekeeperId *m; /* ids themselves */ +} MemberSet; + +/* Timeline safekeeper membership configuration. */ +typedef struct MembershipConfiguration +{ + Generation generation; + MemberSet members; + /* Has 0 n_members in non joint conf. */ + MemberSet new_members; +} MembershipConfiguration; + /* * Proposer <-> Acceptor messaging. */ +typedef struct ProposerAcceptorMessage +{ + uint8 tag; +} ProposerAcceptorMessage; + /* Initial Proposer -> Acceptor message */ typedef struct ProposerGreeting +{ + ProposerAcceptorMessage pam; /* message tag */ + + /* + * tenant/timeline ids as C strings with standard hex notation for ease of + * printing. In principle they are not strictly needed as ttid is also + * passed as libpq options. + */ + char *tenant_id; + char *timeline_id; + /* Full conf is carried to allow safekeeper switch */ + MembershipConfiguration mconf; + + /* + * pg_version and wal_seg_size are used for timeline creation until we + * fully migrate to doing externally. systemId is only used as a sanity + * cross check. 
+ */ + uint32 pg_version; /* in PG_VERSION_NUM format */ + uint64 system_id; /* Postgres system identifier. */ + uint32 wal_seg_size; +} ProposerGreeting; + +/* protocol v2 variant, kept while wp supports it */ +typedef struct ProposerGreetingV2 { uint64 tag; /* message tag */ uint32 protocolVersion; /* proposer-safekeeper protocol version */ @@ -159,32 +215,42 @@ typedef struct ProposerGreeting uint8 tenant_id[16]; TimeLineID timeline; uint32 walSegSize; -} ProposerGreeting; +} ProposerGreetingV2; typedef struct AcceptorProposerMessage { - uint64 tag; + uint8 tag; } AcceptorProposerMessage; /* - * Acceptor -> Proposer initial response: the highest term acceptor voted for. + * Acceptor -> Proposer initial response: the highest term acceptor voted for, + * its node id and configuration. */ typedef struct AcceptorGreeting { AcceptorProposerMessage apm; - term_t term; NNodeId nodeId; + MembershipConfiguration mconf; + term_t term; } AcceptorGreeting; /* * Proposer -> Acceptor vote request. */ typedef struct VoteRequest +{ + ProposerAcceptorMessage pam; /* message tag */ + Generation generation; /* membership conf generation */ + term_t term; +} VoteRequest; + +/* protocol v2 variant, kept while wp supports it */ +typedef struct VoteRequestV2 { uint64 tag; term_t term; pg_uuid_t proposerId; /* for monitoring/debugging */ -} VoteRequest; +} VoteRequestV2; /* Element of term switching chain. */ typedef struct TermSwitchEntry @@ -203,8 +269,15 @@ typedef struct TermHistory typedef struct VoteResponse { AcceptorProposerMessage apm; + + /* + * Membership conf generation. It's not strictly required because on + * mismatch safekeeper is expected to ERROR the connection, but let's + * sanity check it. + */ + Generation generation; term_t term; - uint64 voteGiven; + uint8 voteGiven; /* * Safekeeper flush_lsn (end of WAL) + history of term switches allow @@ -214,7 +287,6 @@ typedef struct VoteResponse XLogRecPtr truncateLsn; /* minimal LSN which may be needed for* * recovery of some safekeeper */ TermHistory termHistory; - XLogRecPtr timelineStartLsn; /* timeline globally starts at this LSN */ } VoteResponse; /* @@ -223,20 +295,37 @@ typedef struct VoteResponse */ typedef struct ProposerElected { - uint64 tag; + AcceptorProposerMessage apm; + Generation generation; /* membership conf generation */ term_t term; /* proposer will send since this point */ XLogRecPtr startStreamingAt; /* history of term switches up to this proposer */ TermHistory *termHistory; - /* timeline globally starts at this LSN */ - XLogRecPtr timelineStartLsn; } ProposerElected; /* * Header of request with WAL message sent from proposer to safekeeper. 
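To make the new membership types concrete, here is a hedged fragment (not part of the patch) that fills a one-safekeeper, generation-1 configuration, assuming the walproposer.h definitions above plus PostgreSQL's palloc0 and strlcpy. The trailing comment shows the string the static MembershipConfigurationToString() helper in walproposer.c renders for such a value; host name and port are hypothetical.

MembershipConfiguration mconf = {0};
SafekeeperId *ids = palloc0(sizeof(SafekeeperId) * 1);

ids[0].node_id = 1;
strlcpy(ids[0].host, "sk-1", sizeof(ids[0].host));
ids[0].port = 5454;

mconf.generation = 1;
mconf.members.len = 1;
mconf.members.m = ids;
/* non-joint configuration: new_members stays empty, i.e. len == 0 */

/*
 * MembershipConfigurationToString(&mconf) would produce:
 * {gen = 1, members = [{node_id = 1, host = sk-1, port = 5454 }], new_members = []}
 */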
*/ typedef struct AppendRequestHeader +{ + AcceptorProposerMessage apm; + Generation generation; /* membership conf generation */ + term_t term; /* term of the proposer */ + XLogRecPtr beginLsn; /* start position of message in WAL */ + XLogRecPtr endLsn; /* end position of message in WAL */ + XLogRecPtr commitLsn; /* LSN committed by quorum of safekeepers */ + + /* + * minimal LSN which may be needed for recovery of some safekeeper (end + * lsn + 1 of last chunk streamed to everyone) + */ + XLogRecPtr truncateLsn; + /* in the AppendRequest message, WAL data follows */ +} AppendRequestHeader; + +/* protocol v2 variant, kept while wp supports it */ +typedef struct AppendRequestHeaderV2 { uint64 tag; term_t term; /* term of the proposer */ @@ -256,7 +345,8 @@ typedef struct AppendRequestHeader */ XLogRecPtr truncateLsn; pg_uuid_t proposerId; /* for monitoring/debugging */ -} AppendRequestHeader; + /* in the AppendRequest message, WAL data follows */ +} AppendRequestHeaderV2; /* * Hot standby feedback received from replica @@ -309,6 +399,13 @@ typedef struct AppendResponse { AcceptorProposerMessage apm; + /* + * Membership conf generation. It's not strictly required because on + * mismatch safekeeper is expected to ERROR the connection, but let's + * sanity check it. + */ + Generation generation; + /* * Current term of the safekeeper; if it is higher than proposer's, the * compute is out of date. @@ -644,6 +741,8 @@ typedef struct WalProposerConfig /* Will be passed to safekeepers in greet request. */ TimeLineID pgTimeline; + int proto_version; + #ifdef WALPROPOSER_LIB void *callback_data; #endif @@ -656,11 +755,14 @@ typedef struct WalProposerConfig typedef struct WalProposer { WalProposerConfig *config; - int n_safekeepers; + /* Current walproposer membership configuration */ + MembershipConfiguration mconf; /* (n_safekeepers / 2) + 1 */ int quorum; + /* Number of occupied slots in safekeepers[] */ + int n_safekeepers; Safekeeper safekeeper[MAX_SAFEKEEPERS]; /* WAL has been generated up to this point */ @@ -670,6 +772,7 @@ typedef struct WalProposer XLogRecPtr commitLsn; ProposerGreeting greetRequest; + ProposerGreetingV2 greetRequestV2; /* Vote request for safekeeper */ VoteRequest voteRequest; diff --git a/pgxn/neon/walproposer_compat.c b/pgxn/neon/walproposer_compat.c index 35d984c52e..a986160224 100644 --- a/pgxn/neon/walproposer_compat.c +++ b/pgxn/neon/walproposer_compat.c @@ -117,14 +117,13 @@ pq_getmsgbytes(StringInfo msg, int datalen) } /* -------------------------------- - * pq_getmsgstring - get a null-terminated text string (with conversion) + * pq_getmsgrawstring - get a null-terminated text string - NO conversion * - * May return a pointer directly into the message buffer, or a pointer - * to a palloc'd conversion result. + * Returns a pointer directly into the message buffer. 
* -------------------------------- */ const char * -pq_getmsgstring(StringInfo msg) +pq_getmsgrawstring(StringInfo msg) { char *str; int slen; @@ -155,6 +154,45 @@ pq_getmsgend(StringInfo msg) ExceptionalCondition("invalid msg format", __FILE__, __LINE__); } +/* -------------------------------- + * pq_sendbytes - append raw data to a StringInfo buffer + * -------------------------------- + */ +void +pq_sendbytes(StringInfo buf, const void *data, int datalen) +{ + /* use variant that maintains a trailing null-byte, out of caution */ + appendBinaryStringInfo(buf, data, datalen); +} + +/* -------------------------------- + * pq_send_ascii_string - append a null-terminated text string (without conversion) + * + * This function intentionally bypasses encoding conversion, instead just + * silently replacing any non-7-bit-ASCII characters with question marks. + * It is used only when we are having trouble sending an error message to + * the client with normal localization and encoding conversion. The caller + * should already have taken measures to ensure the string is just ASCII; + * the extra work here is just to make certain we don't send a badly encoded + * string to the client (which might or might not be robust about that). + * + * NB: passed text string must be null-terminated, and so is the data + * sent to the frontend. + * -------------------------------- + */ +void +pq_send_ascii_string(StringInfo buf, const char *str) +{ + while (*str) + { + char ch = *str++; + + if (IS_HIGHBIT_SET(ch)) + ch = '?'; + appendStringInfoCharMacro(buf, ch); + } + appendStringInfoChar(buf, '\0'); +} /* * Produce a C-string representation of a TimestampTz. diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index 86444084ff..b21184de57 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -59,9 +59,11 @@ #define WAL_PROPOSER_SLOT_NAME "wal_proposer_slot" +/* GUCs */ char *wal_acceptors_list = ""; int wal_acceptor_reconnect_timeout = 1000; int wal_acceptor_connection_timeout = 10000; +int safekeeper_proto_version = 2; /* Set to true in the walproposer bgw. 
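For illustration of the helper above: pq_send_ascii_string() copies the bytes as-is except that any byte with the high bit set becomes '?', and it always appends a terminating NUL. A short usage fragment follows (illustrative only, assuming the helper is in scope and the source literals are UTF-8 encoded).

StringInfoData buf;

initStringInfo(&buf);
pq_send_ascii_string(&buf, "sk-1");     /* appends 's' 'k' '-' '1' '\0', so buf.len == 5 */
pq_send_ascii_string(&buf, "café");     /* the two UTF-8 bytes of 'é' become '?': 'c' 'a' 'f' '?' '?' '\0' */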
*/ static bool am_walproposer; @@ -126,6 +128,7 @@ init_walprop_config(bool syncSafekeepers) else walprop_config.systemId = 0; walprop_config.pgTimeline = walprop_pg_get_timeline_id(); + walprop_config.proto_version = safekeeper_proto_version; } /* @@ -219,25 +222,37 @@ nwp_register_gucs(void) PGC_SIGHUP, GUC_UNIT_MS, NULL, NULL, NULL); + + DefineCustomIntVariable( + "neon.safekeeper_proto_version", + "Version of compute <-> safekeeper protocol.", + "Used while migrating from 2 to 3.", + &safekeeper_proto_version, + 2, 0, INT_MAX, + PGC_POSTMASTER, + 0, + NULL, NULL, NULL); } static int split_safekeepers_list(char *safekeepers_list, char *safekeepers[]) { - int n_safekeepers = 0; - char *curr_sk = safekeepers_list; + int n_safekeepers = 0; + char *curr_sk = safekeepers_list; for (char *coma = safekeepers_list; coma != NULL && *coma != '\0'; curr_sk = coma) { - if (++n_safekeepers >= MAX_SAFEKEEPERS) { + if (++n_safekeepers >= MAX_SAFEKEEPERS) + { wpg_log(FATAL, "too many safekeepers"); } coma = strchr(coma, ','); - safekeepers[n_safekeepers-1] = curr_sk; + safekeepers[n_safekeepers - 1] = curr_sk; - if (coma != NULL) { + if (coma != NULL) + { *coma++ = '\0'; } } @@ -252,10 +267,10 @@ split_safekeepers_list(char *safekeepers_list, char *safekeepers[]) static bool safekeepers_cmp(char *old, char *new) { - char *safekeepers_old[MAX_SAFEKEEPERS]; - char *safekeepers_new[MAX_SAFEKEEPERS]; - int len_old = 0; - int len_new = 0; + char *safekeepers_old[MAX_SAFEKEEPERS]; + char *safekeepers_new[MAX_SAFEKEEPERS]; + int len_old = 0; + int len_new = 0; len_old = split_safekeepers_list(old, safekeepers_old); len_new = split_safekeepers_list(new, safekeepers_new); @@ -292,7 +307,8 @@ assign_neon_safekeepers(const char *newval, void *extra) if (!am_walproposer) return; - if (!newval) { + if (!newval) + { /* should never happen */ wpg_log(FATAL, "neon.safekeepers is empty"); } @@ -301,11 +317,11 @@ assign_neon_safekeepers(const char *newval, void *extra) newval_copy = pstrdup(newval); oldval = pstrdup(wal_acceptors_list); - /* + /* * TODO: restarting through FATAL is stupid and introduces 1s delay before - * next bgw start. We should refactor walproposer to allow graceful exit and - * thus remove this delay. - * XXX: If you change anything here, sync with test_safekeepers_reconfigure_reorder. + * next bgw start. We should refactor walproposer to allow graceful exit + * and thus remove this delay. XXX: If you change anything here, sync with + * test_safekeepers_reconfigure_reorder. */ if (!safekeepers_cmp(oldval, newval_copy)) { @@ -454,7 +470,8 @@ backpressure_throttling_impl(void) memcpy(new_status, old_status, len); snprintf(new_status + len, 64, "backpressure throttling: lag %lu", lag); set_ps_display(new_status); - new_status[len] = '\0'; /* truncate off " backpressure ..." to later reset the ps */ + new_status[len] = '\0'; /* truncate off " backpressure ..." 
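For clarity on the list handling above: split_safekeepers_list() tokenizes the comma-separated neon.safekeepers value in place (commas are overwritten with NULs), which is why assign_neon_safekeepers() hands it pstrdup'ed copies. A hedged usage fragment with hypothetical host names, assuming the static functions in this file:

char    list[] = "sk-1:5454,sk-2:5454,sk-3:5454";   /* mutable scratch copy */
char   *safekeepers[MAX_SAFEKEEPERS];
int     n;

n = split_safekeepers_list(list, safekeepers);
/*
 * n == 3, and the list has been split in place, so:
 * safekeepers[0] == "sk-1:5454"
 * safekeepers[1] == "sk-2:5454"
 * safekeepers[2] == "sk-3:5454"
 */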
to later + * reset the ps */ elog(DEBUG2, "backpressure throttling: lag %lu", lag); start = GetCurrentTimestamp(); @@ -621,7 +638,7 @@ walprop_pg_start_streaming(WalProposer *wp, XLogRecPtr startpos) wpg_log(LOG, "WAL proposer starts streaming at %X/%X", LSN_FORMAT_ARGS(startpos)); cmd.slotname = WAL_PROPOSER_SLOT_NAME; - cmd.timeline = wp->greetRequest.timeline; + cmd.timeline = wp->config->pgTimeline; cmd.startpoint = startpos; StartProposerReplication(wp, &cmd); } @@ -1963,10 +1980,11 @@ walprop_pg_process_safekeeper_feedback(WalProposer *wp, Safekeeper *sk) FullTransactionId xmin = hsFeedback.xmin; FullTransactionId catalog_xmin = hsFeedback.catalog_xmin; FullTransactionId next_xid = ReadNextFullTransactionId(); + /* - * Page server is updating nextXid in checkpoint each 1024 transactions, - * so feedback xmin can be actually larger then nextXid and - * function TransactionIdInRecentPast return false in this case, + * Page server is updating nextXid in checkpoint each 1024 + * transactions, so feedback xmin can be actually larger then nextXid + * and function TransactionIdInRecentPast return false in this case, * preventing update of slot's xmin. */ if (FullTransactionIdPrecedes(next_xid, xmin)) diff --git a/poetry.lock b/poetry.lock index d66c3aae7a..ba3b0535e4 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.0.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.1.1 and should not be changed by hand. [[package]] name = "aiohappyeyeballs" @@ -122,7 +122,7 @@ multidict = ">=4.5,<7.0" yarl = ">=1.12.0,<2.0" [package.extras] -speedups = ["Brotli", "aiodns (>=3.2.0)", "brotlicffi"] +speedups = ["Brotli ; platform_python_implementation == \"CPython\"", "aiodns (>=3.2.0) ; sys_platform == \"linux\" or sys_platform == \"darwin\"", "brotlicffi ; platform_python_implementation != \"CPython\""] [[package]] name = "aiopg" @@ -160,30 +160,30 @@ frozenlist = ">=1.1.0" [[package]] name = "allure-pytest" -version = "2.13.2" +version = "2.13.5" description = "Allure pytest integration" optional = false python-versions = "*" groups = ["main"] files = [ - {file = "allure-pytest-2.13.2.tar.gz", hash = "sha256:22243159e8ec81ce2b5254b4013802198821b1b42f118f69d4a289396607c7b3"}, - {file = "allure_pytest-2.13.2-py3-none-any.whl", hash = "sha256:17de9dbee7f61c8e66a5b5e818b00e419dbcea44cb55c24319401ba813220690"}, + {file = "allure-pytest-2.13.5.tar.gz", hash = "sha256:0ef8e1790c44a988db6b83c4d4f5e91451e2c4c8ea10601dfa88528d23afcf6e"}, + {file = "allure_pytest-2.13.5-py3-none-any.whl", hash = "sha256:94130bac32964b78058e62cf4b815ad97a5ac82a065e6dd2d43abac2be7640fc"}, ] [package.dependencies] -allure-python-commons = "2.13.2" +allure-python-commons = "2.13.5" pytest = ">=4.5.0" [[package]] name = "allure-python-commons" -version = "2.13.2" -description = "Common module for integrate allure with python-based frameworks" +version = "2.13.5" +description = "('Contains the API for end users as well as helper functions and classes to build Allure adapters for Python test frameworks',)" optional = false python-versions = ">=3.6" groups = ["main"] files = [ - {file = "allure-python-commons-2.13.2.tar.gz", hash = "sha256:8a03681330231b1deadd86b97ff68841c6591320114ae638570f1ed60d7a2033"}, - {file = "allure_python_commons-2.13.2-py3-none-any.whl", hash = "sha256:2bb3646ec3fbf5b36d178a5e735002bc130ae9f9ba80f080af97d368ba375051"}, + {file = "allure-python-commons-2.13.5.tar.gz", hash = 
"sha256:a232e7955811f988e49a4c1dd6c16cce7e9b81d0ea0422b1e5654d3254e2caf3"}, + {file = "allure_python_commons-2.13.5-py3-none-any.whl", hash = "sha256:8b0e837b6e32d810adec563f49e1d04127a5b6770e0232065b7cb09b9953980d"}, ] [package.dependencies] @@ -232,7 +232,7 @@ sniffio = ">=1.1" [package.extras] doc = ["Sphinx (>=7)", "packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx-rtd-theme"] -test = ["anyio[trio]", "coverage[toml] (>=7)", "exceptiongroup (>=1.2.0)", "hypothesis (>=4.0)", "psutil (>=5.9)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "uvloop (>=0.17)"] +test = ["anyio[trio]", "coverage[toml] (>=7)", "exceptiongroup (>=1.2.0)", "hypothesis (>=4.0)", "psutil (>=5.9)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "uvloop (>=0.17) ; platform_python_implementation == \"CPython\" and platform_system != \"Windows\""] trio = ["trio (>=0.23)"] [[package]] @@ -308,8 +308,8 @@ files = [ [package.extras] docs = ["Sphinx (>=8.1.3,<8.2.0)", "sphinx-rtd-theme (>=1.2.2)"] -gssauth = ["gssapi", "sspilib"] -test = ["distro (>=1.9.0,<1.10.0)", "flake8 (>=6.1,<7.0)", "flake8-pyi (>=24.1.0,<24.2.0)", "gssapi", "k5test", "mypy (>=1.8.0,<1.9.0)", "sspilib", "uvloop (>=0.15.3)"] +gssauth = ["gssapi ; platform_system != \"Windows\"", "sspilib ; platform_system == \"Windows\""] +test = ["distro (>=1.9.0,<1.10.0)", "flake8 (>=6.1,<7.0)", "flake8-pyi (>=24.1.0,<24.2.0)", "gssapi ; platform_system == \"Linux\"", "k5test ; platform_system == \"Linux\"", "mypy (>=1.8.0,<1.9.0)", "sspilib ; platform_system == \"Windows\"", "uvloop (>=0.15.3) ; platform_system != \"Windows\" and python_version < \"3.14.0\""] [[package]] name = "attrs" @@ -324,10 +324,10 @@ files = [ ] [package.extras] -dev = ["cloudpickle", "coverage[toml] (>=5.0.2)", "furo", "hypothesis", "mypy", "pre-commit", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six", "sphinx", "sphinx-notfound-page", "zope.interface"] +dev = ["cloudpickle ; platform_python_implementation == \"CPython\"", "coverage[toml] (>=5.0.2)", "furo", "hypothesis", "mypy", "pre-commit", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six", "sphinx", "sphinx-notfound-page", "zope.interface"] docs = ["furo", "sphinx", "sphinx-notfound-page", "zope.interface"] -tests = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six", "zope.interface"] -tests-no-zope = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six"] +tests = ["cloudpickle ; platform_python_implementation == \"CPython\"", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six", "zope.interface"] +tests-no-zope = ["cloudpickle ; platform_python_implementation == \"CPython\"", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six"] [[package]] name = "aws-sam-translator" @@ -1074,10 +1074,10 @@ files = [ cffi = {version = ">=1.12", markers = "platform_python_implementation != \"PyPy\""} [package.extras] -docs = ["sphinx (>=5.3.0)", "sphinx-rtd-theme (>=3.0.0)"] +docs = ["sphinx (>=5.3.0)", "sphinx-rtd-theme (>=3.0.0) ; python_version >= \"3.8\""] docstest = ["pyenchant (>=3)", "readme-renderer (>=30.0)", "sphinxcontrib-spelling (>=7.3.1)"] -nox = ["nox (>=2024.4.15)", "nox[uv] (>=2024.3.2)"] -pep8test = ["check-sdist", "click (>=8.0.1)", "mypy (>=1.4)", "ruff (>=0.3.6)"] +nox = ["nox (>=2024.4.15)", "nox[uv] (>=2024.3.2) ; python_version 
>= \"3.8\""] +pep8test = ["check-sdist ; python_version >= \"3.8\"", "click (>=8.0.1)", "mypy (>=1.4)", "ruff (>=0.3.6)"] sdist = ["build (>=1.0.0)"] ssh = ["bcrypt (>=3.1.5)"] test = ["certifi (>=2024)", "cryptography-vectors (==44.0.1)", "pretend (>=0.7)", "pytest (>=7.4.0)", "pytest-benchmark (>=4.0)", "pytest-cov (>=2.10.1)", "pytest-xdist (>=3.5.0)"] @@ -1359,7 +1359,7 @@ idna = "*" sniffio = "*" [package.extras] -brotli = ["brotli", "brotlicffi"] +brotli = ["brotli ; platform_python_implementation == \"CPython\"", "brotlicffi ; platform_python_implementation != \"CPython\""] cli = ["click (==8.*)", "pygments (==2.*)", "rich (>=10,<14)"] http2 = ["h2 (>=3,<5)"] socks = ["socksio (==1.*)"] @@ -1545,8 +1545,8 @@ files = [ [package.extras] docs = ["jaraco.packaging (>=3.2)", "rst.linker (>=1.9)", "sphinx"] -testing = ["ecdsa", "enum34", "feedparser", "jsonlib", "numpy", "pandas", "pymongo", "pytest (>=3.5,!=3.7.3)", "pytest-black-multipy", "pytest-checkdocs (>=1.2.3)", "pytest-cov", "pytest-flake8 (<1.1.0)", "pytest-flake8 (>=1.1.1)", "scikit-learn", "sqlalchemy"] -testing-libs = ["simplejson", "ujson", "yajl"] +testing = ["ecdsa", "enum34 ; python_version == \"2.7\"", "feedparser", "jsonlib ; python_version == \"2.7\"", "numpy", "pandas", "pymongo", "pytest (>=3.5,!=3.7.3)", "pytest-black-multipy", "pytest-checkdocs (>=1.2.3)", "pytest-cov", "pytest-flake8 (<1.1.0) ; python_version <= \"3.6\"", "pytest-flake8 (>=1.1.1) ; python_version >= \"3.7\"", "scikit-learn", "sqlalchemy"] +testing-libs = ["simplejson", "ujson", "yajl ; python_version == \"2.7\""] [[package]] name = "jsonpointer" @@ -1867,7 +1867,7 @@ files = [ [package.extras] develop = ["codecov", "pycodestyle", "pytest (>=4.6)", "pytest-cov", "wheel"] docs = ["sphinx"] -gmpy = ["gmpy2 (>=2.1.0a4)"] +gmpy = ["gmpy2 (>=2.1.0a4) ; platform_python_implementation != \"PyPy\""] tests = ["pytest (>=4.6)"] [[package]] @@ -2330,7 +2330,7 @@ files = [ ] [package.extras] -test = ["enum34", "ipaddress", "mock", "pywin32", "wmi"] +test = ["enum34 ; python_version <= \"3.4\"", "ipaddress ; python_version < \"3.0\"", "mock ; python_version < \"3.0\"", "pywin32 ; sys_platform == \"win32\"", "wmi ; sys_platform == \"win32\""] [[package]] name = "psycopg2-binary" @@ -2456,7 +2456,7 @@ typing-extensions = ">=4.12.2" [package.extras] email = ["email-validator (>=2.0.0)"] -timezone = ["tzdata"] +timezone = ["tzdata ; python_version >= \"3.9\" and platform_system == \"Windows\""] [[package]] name = "pydantic-core" @@ -3068,7 +3068,7 @@ requests = ">=2.30.0,<3.0" urllib3 = ">=1.25.10,<3.0" [package.extras] -tests = ["coverage (>=6.0.0)", "flake8", "mypy", "pytest (>=7.0.0)", "pytest-asyncio", "pytest-cov", "pytest-httpserver", "tomli", "tomli-w", "types-PyYAML", "types-requests"] +tests = ["coverage (>=6.0.0)", "flake8", "mypy", "pytest (>=7.0.0)", "pytest-asyncio", "pytest-cov", "pytest-httpserver", "tomli ; python_version < \"3.11\"", "tomli-w", "types-PyYAML", "types-requests"] [[package]] name = "rfc3339-validator" @@ -3161,7 +3161,7 @@ files = [ [package.extras] docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"] -testing = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "importlib-metadata", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path 
(>=3.2.0)", "mypy (==1.9)", "packaging (>=23.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.1)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy", "pytest-perf", "pytest-ruff (>=0.2.1)", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] +testing = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "importlib-metadata", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21) ; python_version >= \"3.9\" and sys_platform != \"cygwin\"", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "mypy (==1.9)", "packaging (>=23.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.1)", "pytest-checkdocs (>=2.4)", "pytest-cov ; platform_python_implementation != \"PyPy\"", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy", "pytest-perf ; sys_platform != \"cygwin\"", "pytest-ruff (>=0.2.1) ; sys_platform != \"cygwin\"", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] [[package]] name = "six" @@ -3407,8 +3407,8 @@ files = [ ] [package.extras] -brotli = ["brotli (==1.0.9)", "brotli (>=1.0.9)", "brotlicffi (>=0.8.0)", "brotlipy (>=0.6.0)"] -secure = ["certifi", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "ipaddress", "pyOpenSSL (>=0.14)", "urllib3-secure-extra"] +brotli = ["brotli (==1.0.9) ; os_name != \"nt\" and python_version < \"3\" and platform_python_implementation == \"CPython\"", "brotli (>=1.0.9) ; python_version >= \"3\" and platform_python_implementation == \"CPython\"", "brotlicffi (>=0.8.0) ; (os_name != \"nt\" or python_version >= \"3\") and platform_python_implementation != \"CPython\"", "brotlipy (>=0.6.0) ; os_name == \"nt\" and python_version < \"3\""] +secure = ["certifi", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "ipaddress ; python_version == \"2.7\"", "pyOpenSSL (>=0.14)", "urllib3-secure-extra"] socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] [[package]] @@ -3820,4 +3820,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.1" python-versions = "^3.11" -content-hash = "00ddc42c32e235b6171845fc066dcab078282ed832cd464d5e8a0afa959dd04a" +content-hash = "9711c5479c867fa614ce3d352f1bbc63dba1cb2376d347f96fbeda6f512ee308" diff --git a/pre-commit.py b/pre-commit.py index c9567e0c50..09139459d5 100755 --- a/pre-commit.py +++ b/pre-commit.py @@ -29,12 +29,12 @@ def colorify( return f"{color.value}{s}{NC}" -def rustfmt(fix_inplace: bool = False, no_color: bool = False) -> str: - cmd = "rustfmt --edition=2021" +def cargo_fmt(fix_inplace: bool = False, no_color: bool = False) -> str: + cmd = "cargo fmt" if not fix_inplace: cmd += " --check" if no_color: - cmd += " --color=never" + cmd += " -- --color=never" return cmd @@ -61,14 +61,23 @@ def get_commit_files() -> list[str]: return files.decode().splitlines() -def check(name: str, suffix: str, cmd: str, changed_files: list[str], no_color: bool = False): +def check( + name: str, + suffix: str, + cmd: str, + changed_files: list[str], + no_color: bool = False, + append_files_to_cmd: bool = True, +): print(f"Checking: {name} ", end="") applicable_files = list(filter(lambda fname: fname.strip().endswith(suffix), changed_files)) if not applicable_files: print(colorify("[NOT APPLICABLE]", Color.CYAN, no_color)) return - cmd = f'{cmd} {" ".join(applicable_files)}' + if append_files_to_cmd: + cmd = f"{cmd} {' '.join(applicable_files)}" + res = subprocess.run(cmd.split(), capture_output=True) if res.returncode 
!= 0: print(colorify("[FAILED]", Color.RED, no_color)) @@ -100,15 +109,13 @@ if __name__ == "__main__": args = parser.parse_args() files = get_commit_files() - # we use rustfmt here because cargo fmt does not accept list of files - # it internally gathers project files and feeds them to rustfmt - # so because we want to check only files included in the commit we use rustfmt directly check( - name="rustfmt", + name="cargo fmt", suffix=".rs", - cmd=rustfmt(fix_inplace=args.fix_inplace, no_color=args.no_color), + cmd=cargo_fmt(fix_inplace=args.fix_inplace, no_color=args.no_color), changed_files=files, no_color=args.no_color, + append_files_to_cmd=False, ) check( name="ruff check", diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 6a381bf094..5964b76ecf 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "proxy" version = "0.1.0" -edition.workspace = true +edition = "2024" license.workspace = true [features] diff --git a/proxy/src/auth/backend/console_redirect.rs b/proxy/src/auth/backend/console_redirect.rs index 7503b4eac9..dd48384c03 100644 --- a/proxy/src/auth/backend/console_redirect.rs +++ b/proxy/src/auth/backend/console_redirect.rs @@ -8,16 +8,16 @@ use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{info, info_span}; use super::ComputeCredentialKeys; -use crate::auth::backend::ComputeUserInfo; use crate::auth::IpPattern; +use crate::auth::backend::ComputeUserInfo; use crate::cache::Cached; use crate::config::AuthenticationConfig; use crate::context::RequestContext; use crate::control_plane::client::cplane_proxy_v1; use crate::control_plane::{self, CachedNodeInfo, NodeInfo}; use crate::error::{ReportableError, UserFacingError}; -use crate::proxy::connect_compute::ComputeConnectBackend; use crate::proxy::NeonOptions; +use crate::proxy::connect_compute::ComputeConnectBackend; use crate::stream::PqStream; use crate::types::RoleName; use crate::{auth, compute, waiters}; diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs index 5d032c0deb..942f1e13d1 100644 --- a/proxy/src/auth/backend/jwt.rs +++ b/proxy/src/auth/backend/jwt.rs @@ -6,9 +6,9 @@ use std::time::{Duration, SystemTime}; use arc_swap::ArcSwapOption; use clashmap::ClashMap; use jose_jwk::crypto::KeyInfo; -use reqwest::{redirect, Client}; -use reqwest_retry::policies::ExponentialBackoff; +use reqwest::{Client, redirect}; use reqwest_retry::RetryTransientMiddleware; +use reqwest_retry::policies::ExponentialBackoff; use serde::de::Visitor; use serde::{Deserialize, Deserializer}; use serde_json::value::RawValue; @@ -498,8 +498,8 @@ fn verify_rsa_signature( alg: &jose_jwa::Algorithm, ) -> Result<(), JwtError> { use jose_jwa::{Algorithm, Signing}; - use rsa::pkcs1v15::{Signature, VerifyingKey}; use rsa::RsaPublicKey; + use rsa::pkcs1v15::{Signature, VerifyingKey}; let key = RsaPublicKey::try_from(key).map_err(JwtError::InvalidRsaKey)?; diff --git a/proxy/src/auth/backend/local.rs b/proxy/src/auth/backend/local.rs index d10f0e82b2..9c3a3772cd 100644 --- a/proxy/src/auth/backend/local.rs +++ b/proxy/src/auth/backend/local.rs @@ -8,8 +8,8 @@ use crate::auth::backend::jwt::FetchAuthRulesError; use crate::compute::ConnCfg; use crate::compute_ctl::ComputeCtlApi; use crate::context::RequestContext; -use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse, MetricsAuxInfo}; use crate::control_plane::NodeInfo; +use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse, MetricsAuxInfo}; use crate::http; use crate::intern::{BranchIdTag, EndpointIdTag, 
InternId, ProjectIdTag}; use crate::types::EndpointId; diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs index dc595844c5..83feed5094 100644 --- a/proxy/src/auth/backend/mod.rs +++ b/proxy/src/auth/backend/mod.rs @@ -18,7 +18,7 @@ use tracing::{debug, info, warn}; use crate::auth::credentials::check_peer_addr_is_in_list; use crate::auth::{ - self, validate_password_and_exchange, AuthError, ComputeUserInfoMaybeEndpoint, IpPattern, + self, AuthError, ComputeUserInfoMaybeEndpoint, IpPattern, validate_password_and_exchange, }; use crate::cache::Cached; use crate::config::AuthenticationConfig; @@ -32,8 +32,8 @@ use crate::control_plane::{ use crate::intern::EndpointIdInt; use crate::metrics::Metrics; use crate::protocol2::ConnectionInfoExtra; -use crate::proxy::connect_compute::ComputeConnectBackend; use crate::proxy::NeonOptions; +use crate::proxy::connect_compute::ComputeConnectBackend; use crate::rate_limiter::{BucketRateLimiter, EndpointRateLimiter}; use crate::stream::Stream; use crate::types::{EndpointCacheKey, EndpointId, RoleName}; @@ -308,10 +308,7 @@ async fn auth_quirks( let incoming_vpc_endpoint_id = match ctx.extra() { None => return Err(AuthError::MissingEndpointName), - Some(ConnectionInfoExtra::Aws { vpce_id }) => { - // Convert the vcpe_id to a string - String::from_utf8(vpce_id.to_vec()).unwrap_or_default() - } + Some(ConnectionInfoExtra::Aws { vpce_id }) => vpce_id.to_string(), Some(ConnectionInfoExtra::Azure { link_id }) => link_id.to_string(), }; let allowed_vpc_endpoint_ids = api.get_allowed_vpc_endpoint_ids(ctx, &info).await?; @@ -451,7 +448,7 @@ impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint> { Ok((Backend::ControlPlane(api, credentials), ip_allowlist)) } Self::Local(_) => { - return Err(auth::AuthError::bad_auth_method("invalid for local proxy")) + return Err(auth::AuthError::bad_auth_method("invalid for local proxy")); } }; @@ -545,7 +542,7 @@ mod tests { use tokio::io::{AsyncRead, AsyncReadExt, AsyncWriteExt}; use super::jwt::JwkCache; - use super::{auth_quirks, AuthRateLimiter}; + use super::{AuthRateLimiter, auth_quirks}; use crate::auth::backend::MaskedIp; use crate::auth::{ComputeUserInfoMaybeEndpoint, IpPattern}; use crate::config::AuthenticationConfig; @@ -556,8 +553,8 @@ mod tests { }; use crate::proxy::NeonOptions; use crate::rate_limiter::{EndpointRateLimiter, RateBucketInfo}; - use crate::scram::threadpool::ThreadPool; use crate::scram::ServerSecret; + use crate::scram::threadpool::ThreadPool; use crate::stream::{PqStream, Stream}; struct Auth { diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index eff49a402a..c1b7718e4f 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -197,7 +197,10 @@ impl<'de> serde::de::Deserialize<'de> for IpPattern { type Value = IpPattern; fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(formatter, "comma separated list with ip address, ip address range, or ip address subnet mask") + write!( + formatter, + "comma separated list with ip address, ip address range, or ip address subnet mask" + ) } fn visit_str(self, v: &str) -> Result @@ -252,8 +255,8 @@ fn project_name_valid(name: &str) -> bool { #[cfg(test)] #[expect(clippy::unwrap_used)] mod tests { - use serde_json::json; use ComputeUserInfoParseError::*; + use serde_json::json; use super::*; diff --git a/proxy/src/auth/mod.rs b/proxy/src/auth/mod.rs index 6082695a6b..5670f8e43d 100644 --- a/proxy/src/auth/mod.rs +++ b/proxy/src/auth/mod.rs @@ -5,13 +5,13 
@@ pub use backend::Backend; mod credentials; pub(crate) use credentials::{ - check_peer_addr_is_in_list, endpoint_sni, ComputeUserInfoMaybeEndpoint, - ComputeUserInfoParseError, IpPattern, + ComputeUserInfoMaybeEndpoint, ComputeUserInfoParseError, IpPattern, check_peer_addr_is_in_list, + endpoint_sni, }; mod password_hack; -pub(crate) use password_hack::parse_endpoint_param; use password_hack::PasswordHackPayload; +pub(crate) use password_hack::parse_endpoint_param; mod flow; use std::io; diff --git a/proxy/src/binary/local_proxy.rs b/proxy/src/binary/local_proxy.rs index 4ab11f828c..dedd225cba 100644 --- a/proxy/src/binary/local_proxy.rs +++ b/proxy/src/binary/local_proxy.rs @@ -4,7 +4,7 @@ use std::str::FromStr; use std::sync::Arc; use std::time::Duration; -use anyhow::{bail, ensure, Context}; +use anyhow::{Context, bail, ensure}; use camino::{Utf8Path, Utf8PathBuf}; use clap::Parser; use compute_api::spec::LocalProxySpec; @@ -19,7 +19,7 @@ use utils::sentry_init::init_sentry; use utils::{pid_file, project_build_tag, project_git_version}; use crate::auth::backend::jwt::JwkCache; -use crate::auth::backend::local::{LocalBackend, JWKS_ROLE_MAP}; +use crate::auth::backend::local::{JWKS_ROLE_MAP, LocalBackend}; use crate::auth::{self}; use crate::cancellation::CancellationHandler; use crate::config::{ diff --git a/proxy/src/binary/pg_sni_router.rs b/proxy/src/binary/pg_sni_router.rs index 94e771a61c..1aa290399c 100644 --- a/proxy/src/binary/pg_sni_router.rs +++ b/proxy/src/binary/pg_sni_router.rs @@ -5,24 +5,24 @@ /// the outside. Similar to an ingress controller for HTTPS. use std::{net::SocketAddr, sync::Arc}; -use anyhow::{anyhow, bail, ensure, Context}; +use anyhow::{Context, anyhow, bail, ensure}; use clap::Arg; -use futures::future::Either; use futures::TryFutureExt; +use futures::future::Either; use itertools::Itertools; use rustls::crypto::ring; use rustls::pki_types::PrivateKeyDer; use tokio::io::{AsyncRead, AsyncWrite}; use tokio::net::TcpListener; use tokio_util::sync::CancellationToken; -use tracing::{error, info, Instrument}; +use tracing::{Instrument, error, info}; use utils::project_git_version; use utils::sentry_init::init_sentry; use crate::context::RequestContext; use crate::metrics::{Metrics, ThreadPoolMetrics}; use crate::protocol2::ConnectionInfo; -use crate::proxy::{copy_bidirectional_client_compute, run_until_cancelled, ErrorSource}; +use crate::proxy::{ErrorSource, copy_bidirectional_client_compute, run_until_cancelled}; use crate::stream::{PqStream, Stream}; use crate::tls::TlsServerEndPoint; diff --git a/proxy/src/binary/proxy.rs b/proxy/src/binary/proxy.rs index b72799df54..eec0bf8f99 100644 --- a/proxy/src/binary/proxy.rs +++ b/proxy/src/binary/proxy.rs @@ -9,16 +9,16 @@ use remote_storage::RemoteStorageConfig; use tokio::net::TcpListener; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; -use tracing::{info, warn, Instrument}; +use tracing::{Instrument, info, warn}; use utils::sentry_init::init_sentry; use utils::{project_build_tag, project_git_version}; use crate::auth::backend::jwt::JwkCache; use crate::auth::backend::{AuthRateLimiter, ConsoleRedirectBackend, MaybeOwned}; -use crate::cancellation::{handle_cancel_messages, CancellationHandler}; +use crate::cancellation::{CancellationHandler, handle_cancel_messages}; use crate::config::{ - self, remote_storage_from_toml, AuthenticationConfig, CacheOptions, ComputeConfig, HttpConfig, - ProjectInfoCacheOptions, ProxyConfig, ProxyProtocolV2, + self, AuthenticationConfig, CacheOptions, ComputeConfig, 
HttpConfig, ProjectInfoCacheOptions, + ProxyConfig, ProxyProtocolV2, remote_storage_from_toml, }; use crate::context::parquet::ParquetUploadArgs; use crate::http::health_server::AppMetrics; @@ -30,8 +30,8 @@ use crate::redis::connection_with_credentials_provider::ConnectionWithCredential use crate::redis::kv_ops::RedisKVClient; use crate::redis::{elasticache, notifications}; use crate::scram::threadpool::ThreadPool; -use crate::serverless::cancel_set::CancelSet; use crate::serverless::GlobalConnPoolOptions; +use crate::serverless::cancel_set::CancelSet; use crate::tls::client_config::compute_client_config_with_root_certs; use crate::{auth, control_plane, http, serverless, usage_metrics}; @@ -331,7 +331,9 @@ pub async fn run() -> anyhow::Result<()> { ), ), (None, None) => { - warn!("irsa auth requires redis-host and redis-port to be set, continuing without regional_redis_client"); + warn!( + "irsa auth requires redis-host and redis-port to be set, continuing without regional_redis_client" + ); None } _ => { diff --git a/proxy/src/cache/project_info.rs b/proxy/src/cache/project_info.rs index 7651eb71a2..e153e9f61f 100644 --- a/proxy/src/cache/project_info.rs +++ b/proxy/src/cache/project_info.rs @@ -1,12 +1,12 @@ use std::collections::HashSet; use std::convert::Infallible; -use std::sync::atomic::AtomicU64; use std::sync::Arc; +use std::sync::atomic::AtomicU64; use std::time::Duration; use async_trait::async_trait; use clashmap::ClashMap; -use rand::{thread_rng, Rng}; +use rand::{Rng, thread_rng}; use smol_str::SmolStr; use tokio::sync::Mutex; use tokio::time::Instant; diff --git a/proxy/src/cache/timed_lru.rs b/proxy/src/cache/timed_lru.rs index 06eaeb9a30..7cfe5100ea 100644 --- a/proxy/src/cache/timed_lru.rs +++ b/proxy/src/cache/timed_lru.rs @@ -11,11 +11,11 @@ use std::time::{Duration, Instant}; // This severely hinders its usage both in terms of creating wrappers and supported key types. // // On the other hand, `hashlink` has good download stats and appears to be maintained. -use hashlink::{linked_hash_map::RawEntryMut, LruCache}; +use hashlink::{LruCache, linked_hash_map::RawEntryMut}; use tracing::debug; use super::common::Cached; -use super::{timed_lru, Cache}; +use super::{Cache, timed_lru}; /// An implementation of timed LRU cache with fixed capacity. 
/// Key properties: diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index 1f9c8a48b7..8263e5aa2a 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -3,8 +3,8 @@ use std::net::{IpAddr, SocketAddr}; use std::sync::Arc; use ipnet::{IpNet, Ipv4Net, Ipv6Net}; -use postgres_client::tls::MakeTlsConnect; use postgres_client::CancelToken; +use postgres_client::tls::MakeTlsConnect; use pq_proto::CancelKeyData; use serde::{Deserialize, Serialize}; use thiserror::Error; @@ -13,7 +13,7 @@ use tokio::sync::{mpsc, oneshot}; use tracing::{debug, info}; use crate::auth::backend::ComputeUserInfo; -use crate::auth::{check_peer_addr_is_in_list, AuthError}; +use crate::auth::{AuthError, check_peer_addr_is_in_list}; use crate::config::ComputeConfig; use crate::context::RequestContext; use crate::control_plane::ControlPlaneApi; @@ -358,10 +358,7 @@ impl CancellationHandler { let incoming_vpc_endpoint_id = match ctx.extra() { None => return Err(CancelError::AuthError(AuthError::MissingVPCEndpointId)), - Some(ConnectionInfoExtra::Aws { vpce_id }) => { - // Convert the vcpe_id to a string - String::from_utf8(vpce_id.to_vec()).unwrap_or_default() - } + Some(ConnectionInfoExtra::Aws { vpce_id }) => vpce_id.to_string(), Some(ConnectionInfoExtra::Azure { link_id }) => link_id.to_string(), }; diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 460e0cff54..1bcd22e98f 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -2,18 +2,18 @@ use std::str::FromStr; use std::sync::Arc; use std::time::Duration; -use anyhow::{bail, ensure, Context, Ok}; +use anyhow::{Context, Ok, bail, ensure}; use clap::ValueEnum; use remote_storage::RemoteStorageConfig; -use crate::auth::backend::jwt::JwkCache; use crate::auth::backend::AuthRateLimiter; +use crate::auth::backend::jwt::JwkCache; use crate::control_plane::locks::ApiLocks; use crate::rate_limiter::{RateBucketInfo, RateLimitAlgorithm, RateLimiterConfig}; use crate::scram::threadpool::ThreadPool; -use crate::serverless::cancel_set::CancelSet; use crate::serverless::GlobalConnPoolOptions; -pub use crate::tls::server_config::{configure_tls, TlsConfig}; +use crate::serverless::cancel_set::CancelSet; +pub use crate::tls::server_config::{TlsConfig, configure_tls}; use crate::types::Host; pub struct ProxyConfig { @@ -97,8 +97,7 @@ pub struct EndpointCacheConfig { impl EndpointCacheConfig { /// Default options for [`crate::control_plane::NodeInfoCache`]. /// Notice that by default the limiter is empty, which means that cache is disabled. - pub const CACHE_DEFAULT_OPTIONS: &'static str = - "initial_batch_size=1000,default_batch_size=10,xread_timeout=5m,stream_name=controlPlane,disable_cache=true,limiter_info=1000@1s,retry_interval=1s"; + pub const CACHE_DEFAULT_OPTIONS: &'static str = "initial_batch_size=1000,default_batch_size=10,xread_timeout=5m,stream_name=controlPlane,disable_cache=true,limiter_info=1000@1s,retry_interval=1s"; /// Parse cache options passed via cmdline. /// Example: [`Self::CACHE_DEFAULT_OPTIONS`]. 
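
For readers unfamiliar with option strings like `CACHE_DEFAULT_OPTIONS` above, here is a minimal, hypothetical sketch of parsing such a comma-separated `key=value` list. It is only an illustration; the proxy's actual option parsing lives in its config module and is not shown in this patch.

    use std::collections::HashMap;

    /// Split a string like "initial_batch_size=1000,disable_cache=true" into a key/value map.
    /// Illustrative only; not the proxy's parser.
    fn parse_options(s: &str) -> Result<HashMap<&str, &str>, String> {
        s.split(',')
            .filter(|kv| !kv.is_empty())
            .map(|kv| {
                kv.split_once('=')
                    .ok_or_else(|| format!("missing '=' in option: {kv}"))
            })
            .collect()
    }

    fn main() {
        let opts = parse_options("initial_batch_size=1000,default_batch_size=10,disable_cache=true")
            .expect("options should parse");
        assert_eq!(opts.get("default_batch_size"), Some(&"10"));
    }
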
diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs index 1044f5f8e2..4662860b3f 100644 --- a/proxy/src/console_redirect_proxy.rs +++ b/proxy/src/console_redirect_proxy.rs @@ -3,7 +3,7 @@ use std::sync::Arc; use futures::{FutureExt, TryFutureExt}; use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt}; use tokio_util::sync::CancellationToken; -use tracing::{debug, error, info, Instrument}; +use tracing::{Instrument, debug, error, info}; use crate::auth::backend::ConsoleRedirectBackend; use crate::cancellation::CancellationHandler; @@ -11,12 +11,12 @@ use crate::config::{ProxyConfig, ProxyProtocolV2}; use crate::context::RequestContext; use crate::error::ReportableError; use crate::metrics::{Metrics, NumClientConnectionsGuard}; -use crate::protocol2::{read_proxy_protocol, ConnectHeader, ConnectionInfo}; -use crate::proxy::connect_compute::{connect_to_compute, TcpMechanism}; -use crate::proxy::handshake::{handshake, HandshakeData}; +use crate::protocol2::{ConnectHeader, ConnectionInfo, read_proxy_protocol}; +use crate::proxy::connect_compute::{TcpMechanism, connect_to_compute}; +use crate::proxy::handshake::{HandshakeData, handshake}; use crate::proxy::passthrough::ProxyPassthrough; use crate::proxy::{ - prepare_client_connection, run_until_cancelled, ClientRequestError, ErrorSource, + ClientRequestError, ErrorSource, prepare_client_connection, run_until_cancelled, }; pub async fn task_main( @@ -64,22 +64,34 @@ pub async fn task_main( debug!("healthcheck received"); return; } - Ok((_socket, ConnectHeader::Missing)) if config.proxy_protocol_v2 == ProxyProtocolV2::Required => { + Ok((_socket, ConnectHeader::Missing)) + if config.proxy_protocol_v2 == ProxyProtocolV2::Required => + { error!("missing required proxy protocol header"); return; } - Ok((_socket, ConnectHeader::Proxy(_))) if config.proxy_protocol_v2 == ProxyProtocolV2::Rejected => { + Ok((_socket, ConnectHeader::Proxy(_))) + if config.proxy_protocol_v2 == ProxyProtocolV2::Rejected => + { error!("proxy protocol header not supported"); return; } Ok((socket, ConnectHeader::Proxy(info))) => (socket, info), - Ok((socket, ConnectHeader::Missing)) => (socket, ConnectionInfo{ addr: peer_addr, extra: None }), + Ok((socket, ConnectHeader::Missing)) => ( + socket, + ConnectionInfo { + addr: peer_addr, + extra: None, + }, + ), }; match socket.inner.set_nodelay(true) { Ok(()) => {} Err(e) => { - error!("per-client task finished with an error: failed to set socket option: {e:#}"); + error!( + "per-client task finished with an error: failed to set socket option: {e:#}" + ); return; } } @@ -118,10 +130,16 @@ pub async fn task_main( match p.proxy_pass(&config.connect_to_compute).await { Ok(()) => {} Err(ErrorSource::Client(e)) => { - error!(?session_id, "per-client task finished with an IO error from the client: {e:#}"); + error!( + ?session_id, + "per-client task finished with an IO error from the client: {e:#}" + ); } Err(ErrorSource::Compute(e)) => { - error!(?session_id, "per-client task finished with an IO error from the compute: {e:#}"); + error!( + ?session_id, + "per-client task finished with an IO error from the compute: {e:#}" + ); } } } @@ -241,6 +259,7 @@ pub(crate) async fn handle_client( Ok(Some(ProxyPassthrough { client: stream, aux: node.aux.clone(), + private_link_id: None, compute: node, session_id: ctx.session_id(), cancel: session, diff --git a/proxy/src/context/mod.rs b/proxy/src/context/mod.rs index 3236b2e1bf..f87f4e9ef8 100644 --- a/proxy/src/context/mod.rs +++ b/proxy/src/context/mod.rs @@ 
-8,7 +8,7 @@ use pq_proto::StartupMessageParams; use smol_str::SmolStr; use tokio::sync::mpsc; use tracing::field::display; -use tracing::{debug, error, info_span, Span}; +use tracing::{Span, debug, error, info_span}; use try_lock::TryLock; use uuid::Uuid; @@ -55,6 +55,7 @@ struct RequestContextInner { dbname: Option, user: Option, application: Option, + user_agent: Option, error_kind: Option, pub(crate) auth_method: Option, jwt_issuer: Option, @@ -100,6 +101,7 @@ impl Clone for RequestContext { dbname: inner.dbname.clone(), user: inner.user.clone(), application: inner.application.clone(), + user_agent: inner.user_agent.clone(), error_kind: inner.error_kind, auth_method: inner.auth_method.clone(), jwt_issuer: inner.jwt_issuer.clone(), @@ -149,6 +151,7 @@ impl RequestContext { dbname: None, user: None, application: None, + user_agent: None, error_kind: None, auth_method: None, jwt_issuer: None, @@ -245,6 +248,13 @@ impl RequestContext { .set_user(user); } + pub(crate) fn set_user_agent(&self, user_agent: Option) { + self.0 + .try_lock() + .expect("should not deadlock") + .set_user_agent(user_agent); + } + pub(crate) fn set_auth_method(&self, auth_method: AuthMethod) { let mut this = self.0.try_lock().expect("should not deadlock"); this.auth_method = Some(auth_method); @@ -384,6 +394,10 @@ impl RequestContextInner { } } + fn set_user_agent(&mut self, user_agent: Option) { + self.user_agent = user_agent; + } + fn set_dbname(&mut self, dbname: DbName) { self.dbname = Some(dbname); } diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index 0537ae6a62..bfab5f34f9 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -8,7 +8,7 @@ use chrono::{Datelike, Timelike}; use futures::{Stream, StreamExt}; use parquet::basic::Compression; use parquet::file::metadata::RowGroupMetaDataPtr; -use parquet::file::properties::{WriterProperties, WriterPropertiesPtr, DEFAULT_PAGE_SIZE}; +use parquet::file::properties::{DEFAULT_PAGE_SIZE, WriterProperties, WriterPropertiesPtr}; use parquet::file::writer::SerializedFileWriter; use parquet::record::RecordWriter; use pq_proto::StartupMessageParams; @@ -17,10 +17,10 @@ use serde::ser::SerializeMap; use tokio::sync::mpsc; use tokio::time; use tokio_util::sync::CancellationToken; -use tracing::{debug, info, Span}; +use tracing::{Span, debug, info}; use utils::backoff; -use super::{RequestContextInner, LOG_CHAN}; +use super::{LOG_CHAN, RequestContextInner}; use crate::config::remote_storage_from_toml; use crate::context::LOG_CHAN_DISCONNECT; use crate::ext::TaskExt; @@ -82,6 +82,7 @@ pub(crate) struct RequestData { peer_addr: String, username: Option, application_name: Option, + user_agent: Option, endpoint_id: Option, database: Option, project: Option, @@ -128,6 +129,7 @@ impl From<&RequestContextInner> for RequestData { timestamp: value.first_packet.naive_utc(), username: value.user.as_deref().map(String::from), application_name: value.application.as_deref().map(String::from), + user_agent: value.user_agent.as_deref().map(String::from), endpoint_id: value.endpoint_id.as_deref().map(String::from), database: value.dbname.as_deref().map(String::from), project: value.project.as_deref().map(String::from), @@ -425,20 +427,20 @@ mod tests { use futures::{Stream, StreamExt}; use itertools::Itertools; use parquet::basic::{Compression, ZstdLevel}; - use parquet::file::properties::{WriterProperties, DEFAULT_PAGE_SIZE}; + use parquet::file::properties::{DEFAULT_PAGE_SIZE, WriterProperties}; use parquet::file::reader::FileReader; use 
parquet::file::serialized_reader::SerializedFileReader; use rand::rngs::StdRng; use rand::{Rng, SeedableRng}; use remote_storage::{ - GenericRemoteStorage, RemoteStorageConfig, RemoteStorageKind, S3Config, DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT, + GenericRemoteStorage, RemoteStorageConfig, RemoteStorageKind, S3Config, }; use tokio::sync::mpsc; use tokio::time; use walkdir::WalkDir; - use super::{worker_inner, ParquetConfig, ParquetUploadArgs, RequestData}; + use super::{ParquetConfig, ParquetUploadArgs, RequestData, worker_inner}; #[derive(Parser)] struct ProxyCliArgs { @@ -514,26 +516,27 @@ mod tests { fn generate_request_data(rng: &mut impl Rng) -> RequestData { RequestData { - session_id: uuid::Builder::from_random_bytes(rng.gen()).into_uuid(), - peer_addr: Ipv4Addr::from(rng.gen::<[u8; 4]>()).to_string(), + session_id: uuid::Builder::from_random_bytes(rng.r#gen()).into_uuid(), + peer_addr: Ipv4Addr::from(rng.r#gen::<[u8; 4]>()).to_string(), timestamp: chrono::DateTime::from_timestamp_millis( rng.gen_range(1703862754..1803862754), ) .unwrap() .naive_utc(), application_name: Some("test".to_owned()), - username: Some(hex::encode(rng.gen::<[u8; 4]>())), - endpoint_id: Some(hex::encode(rng.gen::<[u8; 16]>())), - database: Some(hex::encode(rng.gen::<[u8; 16]>())), - project: Some(hex::encode(rng.gen::<[u8; 16]>())), - branch: Some(hex::encode(rng.gen::<[u8; 16]>())), + user_agent: Some("test-user-agent".to_owned()), + username: Some(hex::encode(rng.r#gen::<[u8; 4]>())), + endpoint_id: Some(hex::encode(rng.r#gen::<[u8; 16]>())), + database: Some(hex::encode(rng.r#gen::<[u8; 16]>())), + project: Some(hex::encode(rng.r#gen::<[u8; 16]>())), + branch: Some(hex::encode(rng.r#gen::<[u8; 16]>())), pg_options: None, auth_method: None, jwt_issuer: None, protocol: ["tcp", "ws", "http"][rng.gen_range(0..3)], region: "us-east-1", error: None, - success: rng.gen(), + success: rng.r#gen(), cold_start_info: "no", duration_us: rng.gen_range(0..30_000_000), disconnect_timestamp: None, @@ -610,15 +613,15 @@ mod tests { assert_eq!( file_stats, [ - (1313105, 3, 6000), - (1313094, 3, 6000), - (1313153, 3, 6000), - (1313110, 3, 6000), - (1313246, 3, 6000), - (1313083, 3, 6000), - (1312877, 3, 6000), - (1313112, 3, 6000), - (438020, 1, 2000) + (1313953, 3, 6000), + (1313942, 3, 6000), + (1314001, 3, 6000), + (1313958, 3, 6000), + (1314094, 3, 6000), + (1313931, 3, 6000), + (1313725, 3, 6000), + (1313960, 3, 6000), + (438318, 1, 2000) ] ); @@ -650,11 +653,11 @@ mod tests { assert_eq!( file_stats, [ - (1204324, 5, 10000), - (1204048, 5, 10000), - (1204349, 5, 10000), - (1204334, 5, 10000), - (1204588, 5, 10000) + (1205810, 5, 10000), + (1205534, 5, 10000), + (1205835, 5, 10000), + (1205820, 5, 10000), + (1206074, 5, 10000) ] ); @@ -679,15 +682,15 @@ mod tests { assert_eq!( file_stats, [ - (1313105, 3, 6000), - (1313094, 3, 6000), - (1313153, 3, 6000), - (1313110, 3, 6000), - (1313246, 3, 6000), - (1313083, 3, 6000), - (1312877, 3, 6000), - (1313112, 3, 6000), - (438020, 1, 2000) + (1313953, 3, 6000), + (1313942, 3, 6000), + (1314001, 3, 6000), + (1313958, 3, 6000), + (1314094, 3, 6000), + (1313931, 3, 6000), + (1313725, 3, 6000), + (1313960, 3, 6000), + (438318, 1, 2000) ] ); @@ -724,7 +727,7 @@ mod tests { // files are smaller than the size threshold, but they took too long to fill so were flushed early assert_eq!( file_stats, - [(658014, 2, 3001), (657728, 2, 3000), (657524, 2, 2999)] + [(658584, 2, 3001), (658298, 2, 3000), (658094, 2, 2999)] ); tmpdir.close().unwrap(); 
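
The `rng.gen()` → `rng.r#gen()` changes in the tests above follow from the switch to `edition = "2024"` in `proxy/Cargo.toml`: `gen` is a reserved keyword in the Rust 2024 edition, so calls to rand's `Rng::gen` must go through raw-identifier syntax. A small self-contained illustration (std only, no rand dependency):

    // Under edition 2024, `gen` is reserved, so a method with that name must be
    // declared and called through a raw identifier. `r#gen` also compiles on
    // older editions, which is why the mechanical rename is safe.
    struct Dice;

    impl Dice {
        fn r#gen(&self) -> u8 {
            4 // placeholder value
        }
    }

    fn main() {
        let d = Dice;
        // `d.gen()` is rejected by the 2024 edition; `d.r#gen()` works everywhere.
        println!("{}", d.r#gen());
    }
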
diff --git a/proxy/src/control_plane/client/cplane_proxy_v1.rs b/proxy/src/control_plane/client/cplane_proxy_v1.rs index ef6621fc59..977fcf4727 100644 --- a/proxy/src/control_plane/client/cplane_proxy_v1.rs +++ b/proxy/src/control_plane/client/cplane_proxy_v1.rs @@ -3,16 +3,16 @@ use std::sync::Arc; use std::time::Duration; -use ::http::header::AUTHORIZATION; use ::http::HeaderName; +use ::http::header::AUTHORIZATION; use futures::TryFutureExt; use postgres_client::config::SslMode; use tokio::time::Instant; -use tracing::{debug, info, info_span, warn, Instrument}; +use tracing::{Instrument, debug, info, info_span, warn}; use super::super::messages::{ControlPlaneErrorMessage, GetEndpointAccessControl, WakeCompute}; -use crate::auth::backend::jwt::AuthRule; use crate::auth::backend::ComputeUserInfo; +use crate::auth::backend::jwt::AuthRule; use crate::cache::Cached; use crate::context::RequestContext; use crate::control_plane::caches::ApiCaches; diff --git a/proxy/src/control_plane/client/mock.rs b/proxy/src/control_plane/client/mock.rs index 1e6cde8fb0..7da5464aa5 100644 --- a/proxy/src/control_plane/client/mock.rs +++ b/proxy/src/control_plane/client/mock.rs @@ -6,11 +6,11 @@ use std::sync::Arc; use futures::TryFutureExt; use thiserror::Error; use tokio_postgres::Client; -use tracing::{error, info, info_span, warn, Instrument}; +use tracing::{Instrument, error, info, info_span, warn}; -use crate::auth::backend::jwt::AuthRule; -use crate::auth::backend::ComputeUserInfo; use crate::auth::IpPattern; +use crate::auth::backend::ComputeUserInfo; +use crate::auth::backend::jwt::AuthRule; use crate::cache::Cached; use crate::context::RequestContext; use crate::control_plane::client::{ diff --git a/proxy/src/control_plane/client/mod.rs b/proxy/src/control_plane/client/mod.rs index c28ff4789d..746595de38 100644 --- a/proxy/src/control_plane/client/mod.rs +++ b/proxy/src/control_plane/client/mod.rs @@ -10,15 +10,15 @@ use clashmap::ClashMap; use tokio::time::Instant; use tracing::{debug, info}; -use crate::auth::backend::jwt::{AuthRule, FetchAuthRules, FetchAuthRulesError}; use crate::auth::backend::ComputeUserInfo; +use crate::auth::backend::jwt::{AuthRule, FetchAuthRules, FetchAuthRulesError}; use crate::cache::endpoints::EndpointsCache; use crate::cache::project_info::ProjectInfoCacheImpl; use crate::config::{CacheOptions, EndpointCacheConfig, ProjectInfoCacheOptions}; use crate::context::RequestContext; use crate::control_plane::{ - errors, CachedAccessBlockerFlags, CachedAllowedIps, CachedAllowedVpcEndpointIds, - CachedNodeInfo, CachedRoleSecret, ControlPlaneApi, NodeInfoCache, + CachedAccessBlockerFlags, CachedAllowedIps, CachedAllowedVpcEndpointIds, CachedNodeInfo, + CachedRoleSecret, ControlPlaneApi, NodeInfoCache, errors, }; use crate::error::ReportableError; use crate::metrics::ApiLockMetrics; diff --git a/proxy/src/control_plane/errors.rs b/proxy/src/control_plane/errors.rs index d6f565e34a..bc30cffd27 100644 --- a/proxy/src/control_plane/errors.rs +++ b/proxy/src/control_plane/errors.rs @@ -2,7 +2,7 @@ use thiserror::Error; use crate::control_plane::client::ApiLockError; use crate::control_plane::messages::{self, ControlPlaneErrorMessage, Reason}; -use crate::error::{io_error, ErrorKind, ReportableError, UserFacingError}; +use crate::error::{ErrorKind, ReportableError, UserFacingError, io_error}; use crate::proxy::retry::CouldRetry; /// A go-to error message which doesn't leak any detail. 
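
Most of the import-only hunks in this patch look like the output of rustfmt's 2024 style edition (picked up along with the edition bump), which sorts `use` paths with an ASCII-aware ordering in which uppercase names such as `Arc` or `Instrument` sort before lowercase module segments. A minimal example of the effect, using std paths only:

    // Pre-2024 style editions sort imports case-insensitively:
    //     use std::sync::atomic::{AtomicUsize, Ordering};
    //     use std::sync::Arc;
    // The 2024 style edition sorts uppercase before lowercase:
    use std::sync::Arc;
    use std::sync::atomic::{AtomicUsize, Ordering};

    fn main() {
        let counter = Arc::new(AtomicUsize::new(0));
        counter.fetch_add(1, Ordering::Relaxed);
        println!("{}", counter.load(Ordering::Relaxed));
    }
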
diff --git a/proxy/src/control_plane/mgmt.rs b/proxy/src/control_plane/mgmt.rs index 2f7359240d..df31abcc8c 100644 --- a/proxy/src/control_plane/mgmt.rs +++ b/proxy/src/control_plane/mgmt.rs @@ -6,7 +6,7 @@ use postgres_backend::{AuthType, PostgresBackend, PostgresBackendTCP, QueryError use pq_proto::{BeMessage, SINGLE_COL_ROWDESC}; use tokio::net::{TcpListener, TcpStream}; use tokio_util::sync::CancellationToken; -use tracing::{error, info, info_span, Instrument}; +use tracing::{Instrument, error, info, info_span}; use crate::control_plane::messages::{DatabaseInfo, KickSession}; use crate::waiters::{self, Waiter, Waiters}; diff --git a/proxy/src/control_plane/mod.rs b/proxy/src/control_plane/mod.rs index 89ec4f9b33..d592223be1 100644 --- a/proxy/src/control_plane/mod.rs +++ b/proxy/src/control_plane/mod.rs @@ -11,9 +11,9 @@ pub(crate) mod errors; use std::sync::Arc; +use crate::auth::IpPattern; use crate::auth::backend::jwt::AuthRule; use crate::auth::backend::{ComputeCredentialKeys, ComputeUserInfo}; -use crate::auth::IpPattern; use crate::cache::project_info::ProjectInfoCacheImpl; use crate::cache::{Cached, TimedLru}; use crate::config::ComputeConfig; diff --git a/proxy/src/http/health_server.rs b/proxy/src/http/health_server.rs index 141f319567..5278fe2a3e 100644 --- a/proxy/src/http/health_server.rs +++ b/proxy/src/http/health_server.rs @@ -9,8 +9,8 @@ use http_utils::json::json_response; use http_utils::{RouterBuilder, RouterService}; use hyper0::header::CONTENT_TYPE; use hyper0::{Body, Request, Response, StatusCode}; -use measured::text::BufferedTextEncoder; use measured::MetricGroup; +use measured::text::BufferedTextEncoder; use metrics::NeonMetrics; use tracing::{info, info_span}; diff --git a/proxy/src/http/mod.rs b/proxy/src/http/mod.rs index ed88c77256..96f600d836 100644 --- a/proxy/src/http/mod.rs +++ b/proxy/src/http/mod.rs @@ -13,8 +13,8 @@ use hyper::body::Body; pub(crate) use reqwest::{Request, Response}; use reqwest_middleware::RequestBuilder; pub(crate) use reqwest_middleware::{ClientWithMiddleware, Error}; -pub(crate) use reqwest_retry::policies::ExponentialBackoff; pub(crate) use reqwest_retry::RetryTransientMiddleware; +pub(crate) use reqwest_retry::policies::ExponentialBackoff; use thiserror::Error; use crate::metrics::{ConsoleRequest, Metrics}; diff --git a/proxy/src/logging.rs b/proxy/src/logging.rs index fbd4811b54..3c34918d84 100644 --- a/proxy/src/logging.rs +++ b/proxy/src/logging.rs @@ -8,7 +8,7 @@ use opentelemetry::trace::TraceContextExt; use scopeguard::defer; use serde::ser::{SerializeMap, Serializer}; use tracing::subscriber::Interest; -use tracing::{callsite, span, Event, Metadata, Span, Subscriber}; +use tracing::{Event, Metadata, Span, Subscriber, callsite, span}; use tracing_opentelemetry::OpenTelemetrySpanExt; use tracing_subscriber::filter::{EnvFilter, LevelFilter}; use tracing_subscriber::fmt::format::{Format, Full}; diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index f3447e063e..db1f096de1 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -543,11 +543,7 @@ impl Drop for LatencyTimer { impl From for Bool { fn from(value: bool) -> Self { - if value { - Bool::True - } else { - Bool::False - } + if value { Bool::True } else { Bool::False } } } diff --git a/proxy/src/protocol2.rs b/proxy/src/protocol2.rs index 74a15d9bf4..41180fa6c1 100644 --- a/proxy/src/protocol2.rs +++ b/proxy/src/protocol2.rs @@ -9,6 +9,7 @@ use std::task::{Context, Poll}; use bytes::{Buf, Bytes, BytesMut}; use pin_project_lite::pin_project; +use 
smol_str::SmolStr; use strum_macros::FromRepr; use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, ReadBuf}; use zerocopy::{FromBytes, FromZeroes}; @@ -99,7 +100,7 @@ impl fmt::Display for ConnectionInfo { #[derive(PartialEq, Eq, Clone, Debug)] pub enum ConnectionInfoExtra { - Aws { vpce_id: Bytes }, + Aws { vpce_id: SmolStr }, Azure { link_id: u32 }, } @@ -193,7 +194,7 @@ fn process_proxy_payload( return Err(io::Error::new( io::ErrorKind::Other, "invalid proxy protocol address family/transport protocol.", - )) + )); } }; @@ -207,9 +208,14 @@ fn process_proxy_payload( } let subtype = tlv.value.get_u8(); match Pp2AwsType::from_repr(subtype) { - Some(Pp2AwsType::VpceId) => { - extra = Some(ConnectionInfoExtra::Aws { vpce_id: tlv.value }); - } + Some(Pp2AwsType::VpceId) => match std::str::from_utf8(&tlv.value) { + Ok(s) => { + extra = Some(ConnectionInfoExtra::Aws { vpce_id: s.into() }); + } + Err(e) => { + tracing::warn!("invalid aws vpce id: {e}"); + } + }, None => { tracing::warn!("unknown aws tlv: subtype={subtype}"); } @@ -401,7 +407,7 @@ mod tests { use tokio::io::AsyncReadExt; use crate::protocol2::{ - read_proxy_protocol, ConnectHeader, LOCAL_V2, PROXY_V2, TCP_OVER_IPV4, UDP_OVER_IPV6, + ConnectHeader, LOCAL_V2, PROXY_V2, TCP_OVER_IPV4, UDP_OVER_IPV6, read_proxy_protocol, }; #[tokio::test] diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index 26fb1754bf..b8b39fa121 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -5,7 +5,7 @@ use tracing::{debug, info, warn}; use super::retry::ShouldRetryWakeCompute; use crate::auth::backend::{ComputeCredentialKeys, ComputeUserInfo}; -use crate::compute::{self, PostgresConnection, COULD_NOT_CONNECT}; +use crate::compute::{self, COULD_NOT_CONNECT, PostgresConnection}; use crate::config::{ComputeConfig, RetryConfig}; use crate::context::RequestContext; use crate::control_plane::errors::WakeComputeError; @@ -15,7 +15,7 @@ use crate::error::ReportableError; use crate::metrics::{ ConnectOutcome, ConnectionFailureKind, Metrics, RetriesMetricGroup, RetryType, }; -use crate::proxy::retry::{retry_after, should_retry, CouldRetry}; +use crate::proxy::retry::{CouldRetry, retry_after, should_retry}; use crate::proxy::wake_compute::wake_compute; use crate::types::Host; diff --git a/proxy/src/proxy/copy_bidirectional.rs b/proxy/src/proxy/copy_bidirectional.rs index 861f1766e8..6f8b972348 100644 --- a/proxy/src/proxy/copy_bidirectional.rs +++ b/proxy/src/proxy/copy_bidirectional.rs @@ -1,7 +1,7 @@ use std::future::poll_fn; use std::io; use std::pin::Pin; -use std::task::{ready, Context, Poll}; +use std::task::{Context, Poll, ready}; use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; use tracing::info; diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index 2a406fcb34..0c6d352600 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -9,28 +9,28 @@ pub(crate) mod retry; pub(crate) mod wake_compute; use std::sync::Arc; -pub use copy_bidirectional::{copy_bidirectional_client_compute, ErrorSource}; +pub use copy_bidirectional::{ErrorSource, copy_bidirectional_client_compute}; use futures::{FutureExt, TryFutureExt}; use itertools::Itertools; use once_cell::sync::OnceCell; use pq_proto::{BeMessage as Be, CancelKeyData, StartupMessageParams}; use regex::Regex; use serde::{Deserialize, Serialize}; -use smol_str::{format_smolstr, SmolStr}; +use smol_str::{SmolStr, ToSmolStr, format_smolstr}; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt}; use 
tokio_util::sync::CancellationToken; -use tracing::{debug, error, info, warn, Instrument}; +use tracing::{Instrument, debug, error, info, warn}; -use self::connect_compute::{connect_to_compute, TcpMechanism}; +use self::connect_compute::{TcpMechanism, connect_to_compute}; use self::passthrough::ProxyPassthrough; use crate::cancellation::{self, CancellationHandler}; use crate::config::{ProxyConfig, ProxyProtocolV2, TlsConfig}; use crate::context::RequestContext; use crate::error::ReportableError; use crate::metrics::{Metrics, NumClientConnectionsGuard}; -use crate::protocol2::{read_proxy_protocol, ConnectHeader, ConnectionInfo}; -use crate::proxy::handshake::{handshake, HandshakeData}; +use crate::protocol2::{ConnectHeader, ConnectionInfo, ConnectionInfoExtra, read_proxy_protocol}; +use crate::proxy::handshake::{HandshakeData, handshake}; use crate::rate_limiter::EndpointRateLimiter; use crate::stream::{PqStream, Stream}; use crate::types::EndpointCacheKey; @@ -100,22 +100,34 @@ pub async fn task_main( debug!("healthcheck received"); return; } - Ok((_socket, ConnectHeader::Missing)) if config.proxy_protocol_v2 == ProxyProtocolV2::Required => { + Ok((_socket, ConnectHeader::Missing)) + if config.proxy_protocol_v2 == ProxyProtocolV2::Required => + { warn!("missing required proxy protocol header"); return; } - Ok((_socket, ConnectHeader::Proxy(_))) if config.proxy_protocol_v2 == ProxyProtocolV2::Rejected => { + Ok((_socket, ConnectHeader::Proxy(_))) + if config.proxy_protocol_v2 == ProxyProtocolV2::Rejected => + { warn!("proxy protocol header not supported"); return; } Ok((socket, ConnectHeader::Proxy(info))) => (socket, info), - Ok((socket, ConnectHeader::Missing)) => (socket, ConnectionInfo { addr: peer_addr, extra: None }), + Ok((socket, ConnectHeader::Missing)) => ( + socket, + ConnectionInfo { + addr: peer_addr, + extra: None, + }, + ), }; match socket.inner.set_nodelay(true) { Ok(()) => {} Err(e) => { - error!("per-client task finished with an error: failed to set socket option: {e:#}"); + error!( + "per-client task finished with an error: failed to set socket option: {e:#}" + ); return; } } @@ -156,10 +168,16 @@ pub async fn task_main( match p.proxy_pass(&config.connect_to_compute).await { Ok(()) => {} Err(ErrorSource::Client(e)) => { - warn!(?session_id, "per-client task finished with an IO error from the client: {e:#}"); + warn!( + ?session_id, + "per-client task finished with an IO error from the client: {e:#}" + ); } Err(ErrorSource::Compute(e)) => { - error!(?session_id, "per-client task finished with an IO error from the compute: {e:#}"); + error!( + ?session_id, + "per-client task finished with an IO error from the compute: {e:#}" + ); } } } @@ -374,9 +392,16 @@ pub(crate) async fn handle_client( let (stream, read_buf) = stream.into_inner(); node.stream.write_all(&read_buf).await?; + let private_link_id = match ctx.extra() { + Some(ConnectionInfoExtra::Aws { vpce_id }) => Some(vpce_id.clone()), + Some(ConnectionInfoExtra::Azure { link_id }) => Some(link_id.to_smolstr()), + None => None, + }; + Ok(Some(ProxyPassthrough { client: stream, aux: node.aux.clone(), + private_link_id, compute: node, session_id: ctx.session_id(), cancel: session, diff --git a/proxy/src/proxy/passthrough.rs b/proxy/src/proxy/passthrough.rs index 08871380d6..23b9897155 100644 --- a/proxy/src/proxy/passthrough.rs +++ b/proxy/src/proxy/passthrough.rs @@ -1,3 +1,4 @@ +use smol_str::SmolStr; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::debug; use utils::measured_stream::MeasuredStream; @@ -9,7 +10,7 @@ 
use crate::config::ComputeConfig; use crate::control_plane::messages::MetricsAuxInfo; use crate::metrics::{Direction, Metrics, NumClientConnectionsGuard, NumConnectionRequestsGuard}; use crate::stream::Stream; -use crate::usage_metrics::{Ids, MetricCounterRecorder, USAGE_METRICS}; +use crate::usage_metrics::{Ids, MetricCounterRecorder, TrafficDirection, USAGE_METRICS}; /// Forward bytes in both directions (client <-> compute). #[tracing::instrument(skip_all)] @@ -17,10 +18,14 @@ pub(crate) async fn proxy_pass( client: impl AsyncRead + AsyncWrite + Unpin, compute: impl AsyncRead + AsyncWrite + Unpin, aux: MetricsAuxInfo, + private_link_id: Option, ) -> Result<(), ErrorSource> { - let usage = USAGE_METRICS.register(Ids { + // we will report ingress at a later date + let usage_tx = USAGE_METRICS.register(Ids { endpoint_id: aux.endpoint_id, branch_id: aux.branch_id, + direction: TrafficDirection::Egress, + private_link_id, }); let metrics = &Metrics::get().proxy.io_bytes; @@ -31,7 +36,7 @@ pub(crate) async fn proxy_pass( |cnt| { // Number of bytes we sent to the client (outbound). metrics.get_metric(m_sent).inc_by(cnt as u64); - usage.record_egress(cnt as u64); + usage_tx.record_egress(cnt as u64); }, ); @@ -61,6 +66,7 @@ pub(crate) struct ProxyPassthrough { pub(crate) compute: PostgresConnection, pub(crate) aux: MetricsAuxInfo, pub(crate) session_id: uuid::Uuid, + pub(crate) private_link_id: Option, pub(crate) cancel: cancellation::Session, pub(crate) _req: NumConnectionRequestsGuard<'static>, @@ -72,7 +78,13 @@ impl ProxyPassthrough { self, compute_config: &ComputeConfig, ) -> Result<(), ErrorSource> { - let res = proxy_pass(self.client, self.compute.stream, self.aux).await; + let res = proxy_pass( + self.client, + self.compute.stream, + self.aux, + self.private_link_id, + ) + .await; if let Err(err) = self .compute .cancel_closure diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index d8c00a9b41..171f539b1e 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -5,12 +5,12 @@ mod mitm; use std::time::Duration; -use anyhow::{bail, Context}; +use anyhow::{Context, bail}; use async_trait::async_trait; use http::StatusCode; use postgres_client::config::SslMode; use postgres_client::tls::{MakeTlsConnect, NoTls}; -use retry::{retry_after, ShouldRetryWakeCompute}; +use retry::{ShouldRetryWakeCompute, retry_after}; use rstest::rstest; use rustls::crypto::ring; use rustls::pki_types; @@ -334,8 +334,8 @@ async fn scram_auth_mock() -> anyhow::Result<()> { generate_tls_config("generic-project-name.localhost", "localhost")?; let proxy = tokio::spawn(dummy_proxy(client, Some(server_config), Scram::mock())); - use rand::distributions::Alphanumeric; use rand::Rng; + use rand::distributions::Alphanumeric; let password: String = rand::thread_rng() .sample_iter(&Alphanumeric) .take(rand::random::() as usize) diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs index 4e9206feff..9d8915e24a 100644 --- a/proxy/src/proxy/wake_compute.rs +++ b/proxy/src/proxy/wake_compute.rs @@ -3,8 +3,8 @@ use tracing::{error, info}; use super::connect_compute::ComputeConnectBackend; use crate::config::RetryConfig; use crate::context::RequestContext; -use crate::control_plane::errors::{ControlPlaneError, WakeComputeError}; use crate::control_plane::CachedNodeInfo; +use crate::control_plane::errors::{ControlPlaneError, WakeComputeError}; use crate::error::ReportableError; use crate::metrics::{ ConnectOutcome, ConnectionFailuresBreakdownGroup, Metrics, 
RetriesMetricGroup, RetryType, diff --git a/proxy/src/rate_limiter/leaky_bucket.rs b/proxy/src/rate_limiter/leaky_bucket.rs index 9645eaf725..b3853d48e4 100644 --- a/proxy/src/rate_limiter/leaky_bucket.rs +++ b/proxy/src/rate_limiter/leaky_bucket.rs @@ -3,7 +3,7 @@ use std::sync::atomic::{AtomicUsize, Ordering}; use ahash::RandomState; use clashmap::ClashMap; -use rand::{thread_rng, Rng}; +use rand::{Rng, thread_rng}; use tokio::time::Instant; use tracing::info; use utils::leaky_bucket::LeakyBucketState; diff --git a/proxy/src/rate_limiter/limit_algorithm.rs b/proxy/src/rate_limiter/limit_algorithm.rs index b74a9ab17e..f8eeb89f05 100644 --- a/proxy/src/rate_limiter/limit_algorithm.rs +++ b/proxy/src/rate_limiter/limit_algorithm.rs @@ -5,8 +5,8 @@ use std::time::Duration; use parking_lot::Mutex; use tokio::sync::Notify; -use tokio::time::error::Elapsed; use tokio::time::Instant; +use tokio::time::error::Elapsed; use self::aimd::Aimd; diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index ef6c39f230..71e2a92da6 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -1,8 +1,8 @@ use std::borrow::Cow; use std::collections::hash_map::RandomState; use std::hash::{BuildHasher, Hash}; -use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Mutex; +use std::sync::atomic::{AtomicUsize, Ordering}; use anyhow::bail; use clashmap::ClashMap; diff --git a/proxy/src/redis/elasticache.rs b/proxy/src/redis/elasticache.rs index bf6dde9332..58e3c889a7 100644 --- a/proxy/src/redis/elasticache.rs +++ b/proxy/src/redis/elasticache.rs @@ -1,6 +1,7 @@ use std::sync::Arc; use std::time::{Duration, SystemTime}; +use aws_config::Region; use aws_config::environment::EnvironmentVariableCredentialsProvider; use aws_config::imds::credentials::ImdsCredentialsProvider; use aws_config::meta::credentials::CredentialsProviderChain; @@ -8,7 +9,6 @@ use aws_config::meta::region::RegionProviderChain; use aws_config::profile::ProfileFileCredentialsProvider; use aws_config::provider_config::ProviderConfig; use aws_config::web_identity_token::WebIdentityTokenCredentialsProvider; -use aws_config::Region; use aws_sdk_iam::config::ProvideCredentials; use aws_sigv4::http_request::{ self, SignableBody, SignableRequest, SignatureLocation, SigningSettings, diff --git a/proxy/src/redis/keys.rs b/proxy/src/redis/keys.rs index dcb9a59f87..7527bca6d0 100644 --- a/proxy/src/redis/keys.rs +++ b/proxy/src/redis/keys.rs @@ -1,7 +1,7 @@ use std::io::ErrorKind; use anyhow::Ok; -use pq_proto::{id_to_cancel_key, CancelKeyData}; +use pq_proto::{CancelKeyData, id_to_cancel_key}; use serde::{Deserialize, Serialize}; pub mod keyspace { diff --git a/proxy/src/sasl/stream.rs b/proxy/src/sasl/stream.rs index ac77556566..46e6a439e5 100644 --- a/proxy/src/sasl/stream.rs +++ b/proxy/src/sasl/stream.rs @@ -5,8 +5,8 @@ use std::io; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::info; -use super::messages::ServerMessage; use super::Mechanism; +use super::messages::ServerMessage; use crate::stream::PqStream; /// Abstracts away all peculiarities of the libpq's protocol. 
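
The rate-limiter files touched above (`leaky_bucket.rs`, `limit_algorithm.rs`, `limiter.rs`) only change import order here, but for orientation the leaky-bucket idea they build on can be sketched in a few lines. This is a generic illustration, not the proxy's or `utils::leaky_bucket`'s actual implementation:

    use std::time::{Duration, Instant};

    /// A toy leaky bucket: `level` drains at `rate` tokens per second and each
    /// admitted request adds one token; requests are shed once `capacity` is hit.
    struct LeakyBucket {
        capacity: f64,
        rate: f64,
        level: f64,
        last: Instant,
    }

    impl LeakyBucket {
        fn new(capacity: f64, rate: f64) -> Self {
            Self { capacity, rate, level: 0.0, last: Instant::now() }
        }

        fn try_acquire(&mut self, now: Instant) -> bool {
            // Drain the bucket for the time that has passed since the last check.
            let elapsed = now.duration_since(self.last).as_secs_f64();
            self.level = (self.level - elapsed * self.rate).max(0.0);
            self.last = now;
            // Admit the request only if it keeps the bucket under capacity.
            if self.level + 1.0 <= self.capacity {
                self.level += 1.0;
                true
            } else {
                false
            }
        }
    }

    fn main() {
        let mut bucket = LeakyBucket::new(5.0, 100.0);
        let start = Instant::now();
        // The first five requests of a burst are admitted, the sixth is shed.
        let admitted = (0..6).filter(|_| bucket.try_acquire(start)).count();
        println!("admitted {admitted} of 6");
        // After enough time has passed the bucket drains and admits again.
        assert!(bucket.try_acquire(start + Duration::from_millis(100)));
    }
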
diff --git a/proxy/src/scram/countmin.rs b/proxy/src/scram/countmin.rs index 87ab6e0d5f..9d56c465ec 100644 --- a/proxy/src/scram/countmin.rs +++ b/proxy/src/scram/countmin.rs @@ -90,7 +90,7 @@ mod tests { // number of insert operations let m = rng.gen_range(1..100); - let id = uuid::Builder::from_random_bytes(rng.gen()).into_uuid(); + let id = uuid::Builder::from_random_bytes(rng.r#gen()).into_uuid(); ids.push((id, n, m)); // N = sum(actual) diff --git a/proxy/src/scram/exchange.rs b/proxy/src/scram/exchange.rs index 77853db3db..abd5aeae5b 100644 --- a/proxy/src/scram/exchange.rs +++ b/proxy/src/scram/exchange.rs @@ -5,6 +5,7 @@ use std::convert::Infallible; use hmac::{Hmac, Mac}; use sha2::Sha256; +use super::ScramKey; use super::messages::{ ClientFinalMessage, ClientFirstMessage, OwnedServerFirstMessage, SCRAM_RAW_NONCE_LEN, }; @@ -12,7 +13,6 @@ use super::pbkdf2::Pbkdf2; use super::secret::ServerSecret; use super::signature::SignatureBuilder; use super::threadpool::ThreadPool; -use super::ScramKey; use crate::intern::EndpointIdInt; use crate::sasl::{self, ChannelBinding, Error as SaslError}; @@ -208,8 +208,8 @@ impl sasl::Mechanism for Exchange<'_> { type Output = super::ScramKey; fn exchange(mut self, input: &str) -> sasl::Result> { - use sasl::Step; use ExchangeState; + use sasl::Step; match &self.state { ExchangeState::Initial(init) => { match init.transition(self.secret, &self.tls_server_end_point, input)? { diff --git a/proxy/src/scram/messages.rs b/proxy/src/scram/messages.rs index 0e54e7ded9..7b0b861ce9 100644 --- a/proxy/src/scram/messages.rs +++ b/proxy/src/scram/messages.rs @@ -4,7 +4,7 @@ use std::fmt; use std::ops::Range; use super::base64_decode_array; -use super::key::{ScramKey, SCRAM_KEY_LEN}; +use super::key::{SCRAM_KEY_LEN, ScramKey}; use super::signature::SignatureBuilder; use crate::sasl::ChannelBinding; diff --git a/proxy/src/scram/mod.rs b/proxy/src/scram/mod.rs index cfa571cbe1..24f991d4d9 100644 --- a/proxy/src/scram/mod.rs +++ b/proxy/src/scram/mod.rs @@ -15,7 +15,7 @@ mod secret; mod signature; pub mod threadpool; -pub(crate) use exchange::{exchange, Exchange}; +pub(crate) use exchange::{Exchange, exchange}; use hmac::{Hmac, Mac}; pub(crate) use key::ScramKey; pub(crate) use secret::ServerSecret; diff --git a/proxy/src/scram/signature.rs b/proxy/src/scram/signature.rs index d3255cf2ca..a5b1c3e9f4 100644 --- a/proxy/src/scram/signature.rs +++ b/proxy/src/scram/signature.rs @@ -1,6 +1,6 @@ //! Tools for client/server signature management. -use super::key::{ScramKey, SCRAM_KEY_LEN}; +use super::key::{SCRAM_KEY_LEN, ScramKey}; /// A collection of message parts needed to derive the client's signature. 
#[derive(Debug)] diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index f35c375ba2..72029102e0 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -7,27 +7,27 @@ use ed25519_dalek::SigningKey; use hyper_util::rt::{TokioExecutor, TokioIo, TokioTimer}; use jose_jwk::jose_b64; use rand::rngs::OsRng; -use tokio::net::{lookup_host, TcpStream}; +use tokio::net::{TcpStream, lookup_host}; use tracing::field::display; use tracing::{debug, info}; use super::conn_pool::poll_client; use super::conn_pool_lib::{Client, ConnInfo, EndpointConnPool, GlobalConnPool}; -use super::http_conn_pool::{self, poll_http2_client, HttpConnPool, Send}; -use super::local_conn_pool::{self, LocalConnPool, EXT_NAME, EXT_SCHEMA, EXT_VERSION}; +use super::http_conn_pool::{self, HttpConnPool, Send, poll_http2_client}; +use super::local_conn_pool::{self, EXT_NAME, EXT_SCHEMA, EXT_VERSION, LocalConnPool}; use crate::auth::backend::local::StaticAuthRules; use crate::auth::backend::{ComputeCredentials, ComputeUserInfo}; -use crate::auth::{self, check_peer_addr_is_in_list, AuthError}; +use crate::auth::{self, AuthError, check_peer_addr_is_in_list}; use crate::compute; use crate::compute_ctl::{ ComputeCtlError, ExtensionInstallRequest, Privilege, SetRoleGrantsRequest, }; use crate::config::{ComputeConfig, ProxyConfig}; use crate::context::RequestContext; +use crate::control_plane::CachedNodeInfo; use crate::control_plane::client::ApiLockError; use crate::control_plane::errors::{GetAuthInfoError, WakeComputeError}; use crate::control_plane::locks::ApiLocks; -use crate::control_plane::CachedNodeInfo; use crate::error::{ErrorKind, ReportableError, UserFacingError}; use crate::intern::EndpointIdInt; use crate::protocol2::ConnectionInfoExtra; @@ -75,10 +75,7 @@ impl PoolingBackend { let extra = ctx.extra(); let incoming_endpoint_id = match extra { None => String::new(), - Some(ConnectionInfoExtra::Aws { vpce_id }) => { - // Convert the vcpe_id to a string - String::from_utf8(vpce_id.to_vec()).unwrap_or_default() - } + Some(ConnectionInfoExtra::Aws { vpce_id }) => vpce_id.to_string(), Some(ConnectionInfoExtra::Azure { link_id }) => link_id.to_string(), }; diff --git a/proxy/src/serverless/cancel_set.rs b/proxy/src/serverless/cancel_set.rs index 6db986f1f7..ba8945afc5 100644 --- a/proxy/src/serverless/cancel_set.rs +++ b/proxy/src/serverless/cancel_set.rs @@ -6,7 +6,7 @@ use std::time::Duration; use indexmap::IndexMap; use parking_lot::Mutex; -use rand::{thread_rng, Rng}; +use rand::{Rng, thread_rng}; use rustc_hash::FxHasher; use tokio::time::Instant; use tokio_util::sync::CancellationToken; @@ -40,7 +40,7 @@ impl CancelSet { pub(crate) fn take(&self) -> Option { for _ in 0..4 { - if let Some(token) = self.take_raw(thread_rng().gen()) { + if let Some(token) = self.take_raw(thread_rng().r#gen()) { return Some(token); } tracing::trace!("failed to get cancel token"); @@ -68,7 +68,7 @@ impl CancelShard { fn take(&mut self, rng: usize) -> Option { NonZeroUsize::new(self.tokens.len()).and_then(|len| { // 10 second grace period so we don't cancel new connections - if self.tokens.get_index(rng % len)?.1 .0.elapsed() < Duration::from_secs(10) { + if self.tokens.get_index(rng % len)?.1.0.elapsed() < Duration::from_secs(10) { return None; } diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index 447103edce..6a9089fc2a 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -1,17 +1,17 @@ use std::fmt; use std::pin::pin; 
use std::sync::{Arc, Weak}; -use std::task::{ready, Poll}; +use std::task::{Poll, ready}; -use futures::future::poll_fn; use futures::Future; -use postgres_client::tls::NoTlsStream; +use futures::future::poll_fn; use postgres_client::AsyncMessage; +use postgres_client::tls::NoTlsStream; use smallvec::SmallVec; use tokio::net::TcpStream; use tokio::time::Instant; use tokio_util::sync::CancellationToken; -use tracing::{error, info, info_span, warn, Instrument}; +use tracing::{Instrument, error, info, info_span, warn}; #[cfg(test)] use { super::conn_pool_lib::GlobalConnPoolOptions, diff --git a/proxy/src/serverless/conn_pool_lib.rs b/proxy/src/serverless/conn_pool_lib.rs index a300198de4..933204994b 100644 --- a/proxy/src/serverless/conn_pool_lib.rs +++ b/proxy/src/serverless/conn_pool_lib.rs @@ -9,7 +9,8 @@ use clashmap::ClashMap; use parking_lot::RwLock; use postgres_client::ReadyForQueryStatus; use rand::Rng; -use tracing::{debug, info, Span}; +use smol_str::ToSmolStr; +use tracing::{Span, debug, info}; use super::backend::HttpConnError; use super::conn_pool::ClientDataRemote; @@ -19,8 +20,9 @@ use crate::auth::backend::ComputeUserInfo; use crate::context::RequestContext; use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::{HttpEndpointPoolsGuard, Metrics}; +use crate::protocol2::ConnectionInfoExtra; use crate::types::{DbName, EndpointCacheKey, RoleName}; -use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; +use crate::usage_metrics::{Ids, MetricCounter, TrafficDirection, USAGE_METRICS}; #[derive(Debug, Clone)] pub(crate) struct ConnInfo { @@ -473,7 +475,9 @@ where .http_pool_opened_connections .get_metric() .dec_by(clients_removed as i64); - info!("pool: performed global pool gc. removed {clients_removed} clients, total number of clients in pool is {size}"); + info!( + "pool: performed global pool gc. 
removed {clients_removed} clients, total number of clients in pool is {size}" + ); } let removed = current_len - new_len; @@ -635,15 +639,28 @@ impl Client { (&mut inner.inner, Discard { conn_info, pool }) } - pub(crate) fn metrics(&self) -> Arc { + pub(crate) fn metrics( + &self, + direction: TrafficDirection, + ctx: &RequestContext, + ) -> Arc { let aux = &self .inner .as_ref() .expect("client inner should not be removed") .aux; + + let private_link_id = match ctx.extra() { + None => None, + Some(ConnectionInfoExtra::Aws { vpce_id }) => Some(vpce_id.clone()), + Some(ConnectionInfoExtra::Azure { link_id }) => Some(link_id.to_smolstr()), + }; + USAGE_METRICS.register(Ids { endpoint_id: aux.endpoint_id, branch_id: aux.branch_id, + direction, + private_link_id, }) } } @@ -700,7 +717,9 @@ impl Discard<'_, C> { pub(crate) fn discard(&mut self) { let conn_info = &self.conn_info; if std::mem::take(self.pool).strong_count() > 0 { - info!("pool: throwing away connection '{conn_info}' because connection is potentially in a broken state"); + info!( + "pool: throwing away connection '{conn_info}' because connection is potentially in a broken state" + ); } } } diff --git a/proxy/src/serverless/http_conn_pool.rs b/proxy/src/serverless/http_conn_pool.rs index fde38d0de3..338a79b4b3 100644 --- a/proxy/src/serverless/http_conn_pool.rs +++ b/proxy/src/serverless/http_conn_pool.rs @@ -5,8 +5,9 @@ use std::sync::{Arc, Weak}; use hyper::client::conn::http2; use hyper_util::rt::{TokioExecutor, TokioIo}; use parking_lot::RwLock; +use smol_str::ToSmolStr; use tokio::net::TcpStream; -use tracing::{debug, error, info, info_span, Instrument}; +use tracing::{Instrument, debug, error, info, info_span}; use super::backend::HttpConnError; use super::conn_pool_lib::{ @@ -16,8 +17,9 @@ use super::conn_pool_lib::{ use crate::context::RequestContext; use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::{HttpEndpointPoolsGuard, Metrics}; +use crate::protocol2::ConnectionInfoExtra; use crate::types::EndpointCacheKey; -use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; +use crate::usage_metrics::{Ids, MetricCounter, TrafficDirection, USAGE_METRICS}; pub(crate) type Send = http2::SendRequest; pub(crate) type Connect = @@ -264,11 +266,24 @@ impl Client { Self { inner } } - pub(crate) fn metrics(&self) -> Arc { + pub(crate) fn metrics( + &self, + direction: TrafficDirection, + ctx: &RequestContext, + ) -> Arc { let aux = &self.inner.aux; + + let private_link_id = match ctx.extra() { + None => None, + Some(ConnectionInfoExtra::Aws { vpce_id }) => Some(vpce_id.clone()), + Some(ConnectionInfoExtra::Azure { link_id }) => Some(link_id.to_smolstr()), + }; + USAGE_METRICS.register(Ids { endpoint_id: aux.endpoint_id, branch_id: aux.branch_id, + direction, + private_link_id, }) } } diff --git a/proxy/src/serverless/json.rs b/proxy/src/serverless/json.rs index ab012bd020..fbd12ad9cb 100644 --- a/proxy/src/serverless/json.rs +++ b/proxy/src/serverless/json.rs @@ -1,5 +1,5 @@ -use postgres_client::types::{Kind, Type}; use postgres_client::Row; +use postgres_client::types::{Kind, Type}; use serde_json::{Map, Value}; // diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs index 137a2d6377..8426a0810e 100644 --- a/proxy/src/serverless/local_conn_pool.rs +++ b/proxy/src/serverless/local_conn_pool.rs @@ -11,24 +11,24 @@ use std::collections::HashMap; use std::pin::pin; -use std::sync::atomic::AtomicUsize; use std::sync::Arc; -use std::task::{ready, Poll}; 
+use std::sync::atomic::AtomicUsize; +use std::task::{Poll, ready}; use std::time::Duration; use ed25519_dalek::{Signature, Signer, SigningKey}; -use futures::future::poll_fn; use futures::Future; +use futures::future::poll_fn; use indexmap::IndexMap; use jose_jwk::jose_b64::base64ct::{Base64UrlUnpadded, Encoding}; use parking_lot::RwLock; -use postgres_client::tls::NoTlsStream; use postgres_client::AsyncMessage; +use postgres_client::tls::NoTlsStream; use serde_json::value::RawValue; use tokio::net::TcpStream; use tokio::time::Instant; use tokio_util::sync::CancellationToken; -use tracing::{debug, error, info, info_span, warn, Instrument}; +use tracing::{Instrument, debug, error, info, info_span, warn}; use super::backend::HttpConnError; use super::conn_pool_lib::{ @@ -389,6 +389,9 @@ mod tests { // }); // println!("{}", serde_json::to_string(&jwk).unwrap()); - assert_eq!(jwt, "eyJhbGciOiJFZERTQSJ9.eyJmb28iOiJiYXIiLCJqdGkiOjIsIm5lc3RlZCI6eyJqdGkiOiJ0cmlja3kgbmVzdGluZyJ9fQ.Cvyc2By33KI0f0obystwdy8PN111L3Sc9_Mr2CU3XshtSqSdxuRxNEZGbb_RvyJf2IzheC_s7aBZ-jLeQ9N0Bg"); + assert_eq!( + jwt, + "eyJhbGciOiJFZERTQSJ9.eyJmb28iOiJiYXIiLCJqdGkiOjIsIm5lc3RlZCI6eyJqdGkiOiJ0cmlja3kgbmVzdGluZyJ9fQ.Cvyc2By33KI0f0obystwdy8PN111L3Sc9_Mr2CU3XshtSqSdxuRxNEZGbb_RvyJf2IzheC_s7aBZ-jLeQ9N0Bg" + ); } } diff --git a/proxy/src/serverless/mod.rs b/proxy/src/serverless/mod.rs index 8289500159..acd6a05718 100644 --- a/proxy/src/serverless/mod.rs +++ b/proxy/src/serverless/mod.rs @@ -15,7 +15,7 @@ mod sql_over_http; mod websocket; use std::net::{IpAddr, SocketAddr}; -use std::pin::{pin, Pin}; +use std::pin::{Pin, pin}; use std::sync::Arc; use anyhow::Context; @@ -23,8 +23,8 @@ use async_trait::async_trait; use atomic_take::AtomicTake; use bytes::Bytes; pub use conn_pool_lib::GlobalConnPoolOptions; -use futures::future::{select, Either}; use futures::TryFutureExt; +use futures::future::{Either, select}; use http::{Method, Response, StatusCode}; use http_body_util::combinators::BoxBody; use http_body_util::{BodyExt, Empty}; @@ -32,23 +32,23 @@ use http_utils::error::ApiError; use hyper::body::Incoming; use hyper_util::rt::TokioExecutor; use hyper_util::server::conn::auto::Builder; -use rand::rngs::StdRng; use rand::SeedableRng; -use sql_over_http::{uuid_to_header_value, NEON_REQUEST_ID}; +use rand::rngs::StdRng; +use sql_over_http::{NEON_REQUEST_ID, uuid_to_header_value}; use tokio::io::{AsyncRead, AsyncWrite}; use tokio::net::{TcpListener, TcpStream}; use tokio::time::timeout; use tokio_rustls::TlsAcceptor; use tokio_util::sync::CancellationToken; use tokio_util::task::TaskTracker; -use tracing::{info, warn, Instrument}; +use tracing::{Instrument, info, warn}; use crate::cancellation::CancellationHandler; use crate::config::{ProxyConfig, ProxyProtocolV2}; use crate::context::RequestContext; use crate::ext::TaskExt; use crate::metrics::Metrics; -use crate::protocol2::{read_proxy_protocol, ChainRW, ConnectHeader, ConnectionInfo}; +use crate::protocol2::{ChainRW, ConnectHeader, ConnectionInfo, read_proxy_protocol}; use crate::proxy::run_until_cancelled; use crate::rate_limiter::EndpointRateLimiter; use crate::serverless::backend::PoolingBackend; @@ -438,6 +438,14 @@ async fn request_handler( &config.region, ); + ctx.set_user_agent( + request + .headers() + .get(hyper::header::USER_AGENT) + .and_then(|h| h.to_str().ok()) + .map(Into::into), + ); + let span = ctx.span(); info!(parent: &span, "performing websocket upgrade"); diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 
5982fe225d..93dd531f70 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -2,23 +2,23 @@ use std::pin::pin; use std::sync::Arc; use bytes::Bytes; -use futures::future::{select, try_join, Either}; +use futures::future::{Either, select, try_join}; use futures::{StreamExt, TryFutureExt}; -use http::header::AUTHORIZATION; use http::Method; +use http::header::AUTHORIZATION; use http_body_util::combinators::BoxBody; use http_body_util::{BodyExt, Full}; use http_utils::error::ApiError; use hyper::body::Incoming; use hyper::http::{HeaderName, HeaderValue}; -use hyper::{header, HeaderMap, Request, Response, StatusCode}; +use hyper::{HeaderMap, Request, Response, StatusCode, header}; use indexmap::IndexMap; use postgres_client::error::{DbError, ErrorPosition, SqlState}; use postgres_client::{GenericClient, IsolationLevel, NoTls, ReadyForQueryStatus, Transaction}; use pq_proto::StartupMessageParamsBuilder; use serde::Serialize; -use serde_json::value::RawValue; use serde_json::Value; +use serde_json::value::RawValue; use tokio::time::{self, Instant}; use tokio_util::sync::CancellationToken; use tracing::{debug, error, info}; @@ -31,18 +31,18 @@ use super::conn_pool::{AuthData, ConnInfoWithAuth}; use super::conn_pool_lib::{self, ConnInfo}; use super::error::HttpCodeError; use super::http_util::json_response; -use super::json::{json_to_pg_text, pg_text_row_to_json, JsonConversionError}; +use super::json::{JsonConversionError, json_to_pg_text, pg_text_row_to_json}; use crate::auth::backend::{ComputeCredentialKeys, ComputeUserInfo}; -use crate::auth::{endpoint_sni, ComputeUserInfoParseError}; +use crate::auth::{ComputeUserInfoParseError, endpoint_sni}; use crate::config::{AuthenticationConfig, HttpConfig, ProxyConfig, TlsConfig}; use crate::context::RequestContext; use crate::error::{ErrorKind, ReportableError, UserFacingError}; -use crate::http::{read_body_with_limit, ReadBodyError}; +use crate::http::{ReadBodyError, read_body_with_limit}; use crate::metrics::{HttpDirection, Metrics}; -use crate::proxy::{run_until_cancelled, NeonOptions}; +use crate::proxy::{NeonOptions, run_until_cancelled}; use crate::serverless::backend::HttpConnError; use crate::types::{DbName, RoleName}; -use crate::usage_metrics::{MetricCounter, MetricCounterRecorder}; +use crate::usage_metrics::{MetricCounter, MetricCounterRecorder, TrafficDirection}; #[derive(serde::Deserialize)] #[serde(rename_all = "camelCase")] @@ -209,7 +209,7 @@ fn get_conn_info( } } Some(url::Host::Ipv4(_) | url::Host::Ipv6(_)) | None => { - return Err(ConnInfoError::MissingHostname) + return Err(ConnInfoError::MissingHostname); } }; ctx.set_endpoint_id(endpoint.clone()); @@ -228,6 +228,13 @@ fn get_conn_info( } } + ctx.set_user_agent( + headers + .get(hyper::header::USER_AGENT) + .and_then(|h| h.to_str().ok()) + .map(Into::into), + ); + let user_info = ComputeUserInfo { endpoint, user: username, @@ -745,7 +752,7 @@ async fn handle_db_inner( } }; - let metrics = client.metrics(); + let metrics = client.metrics(TrafficDirection::Egress, ctx); let len = json_output.len(); let response = response @@ -818,7 +825,7 @@ async fn handle_auth_broker_inner( .expect("all headers and params received via hyper should be valid for request"); // todo: map body to count egress - let _metrics = client.metrics(); + let _metrics = client.metrics(TrafficDirection::Egress, ctx); Ok(client .inner @@ -1021,7 +1028,7 @@ async fn query_to_json( data: QueryData, current_size: &mut usize, parsed_headers: HttpHeaders, -) -> 
Result<(ReadyForQueryStatus, impl Serialize), SqlOverHttpError> { +) -> Result<(ReadyForQueryStatus, impl Serialize + use), SqlOverHttpError> { let query_start = Instant::now(); let query_params = data.params; @@ -1118,10 +1125,10 @@ enum Discard<'a> { } impl Client { - fn metrics(&self) -> Arc { + fn metrics(&self, direction: TrafficDirection, ctx: &RequestContext) -> Arc { match self { - Client::Remote(client) => client.metrics(), - Client::Local(local_client) => local_client.metrics(), + Client::Remote(client) => client.metrics(direction, ctx), + Client::Local(local_client) => local_client.metrics(direction, ctx), } } diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index 585a7d63b2..c4baeeb5cc 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -1,6 +1,6 @@ use std::pin::Pin; use std::sync::Arc; -use std::task::{ready, Context, Poll}; +use std::task::{Context, Poll, ready}; use anyhow::Context as _; use bytes::{Buf, BufMut, Bytes, BytesMut}; @@ -15,9 +15,9 @@ use tracing::warn; use crate::cancellation::CancellationHandler; use crate::config::ProxyConfig; use crate::context::RequestContext; -use crate::error::{io_error, ReportableError}; +use crate::error::{ReportableError, io_error}; use crate::metrics::Metrics; -use crate::proxy::{handle_client, ClientMode, ErrorSource}; +use crate::proxy::{ClientMode, ErrorSource, handle_client}; use crate::rate_limiter::EndpointRateLimiter; pin_project! { @@ -184,11 +184,11 @@ mod tests { use framed_websockets::WebSocketServer; use futures::{SinkExt, StreamExt}; - use tokio::io::{duplex, AsyncReadExt, AsyncWriteExt}; + use tokio::io::{AsyncReadExt, AsyncWriteExt, duplex}; use tokio::task::JoinSet; - use tokio_tungstenite::tungstenite::protocol::Role; - use tokio_tungstenite::tungstenite::Message; use tokio_tungstenite::WebSocketStream; + use tokio_tungstenite::tungstenite::Message; + use tokio_tungstenite::tungstenite::protocol::Role; use super::WebSocketRw; diff --git a/proxy/src/signals.rs b/proxy/src/signals.rs index 0b675683c0..32b2344a1c 100644 --- a/proxy/src/signals.rs +++ b/proxy/src/signals.rs @@ -12,7 +12,7 @@ pub async fn handle( where F: FnMut(), { - use tokio::signal::unix::{signal, SignalKind}; + use tokio::signal::unix::{SignalKind, signal}; let mut hangup = signal(SignalKind::hangup())?; let mut interrupt = signal(SignalKind::interrupt())?; diff --git a/proxy/src/tls/postgres_rustls.rs b/proxy/src/tls/postgres_rustls.rs index 0ad279b635..f09e916a1d 100644 --- a/proxy/src/tls/postgres_rustls.rs +++ b/proxy/src/tls/postgres_rustls.rs @@ -2,8 +2,8 @@ use std::convert::TryFrom; use std::sync::Arc; use postgres_client::tls::MakeTlsConnect; -use rustls::pki_types::ServerName; use rustls::ClientConfig; +use rustls::pki_types::ServerName; use tokio::io::{AsyncRead, AsyncWrite}; mod private { @@ -15,8 +15,8 @@ mod private { use postgres_client::tls::{ChannelBinding, TlsConnect}; use rustls::pki_types::ServerName; use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; - use tokio_rustls::client::TlsStream; use tokio_rustls::TlsConnector; + use tokio_rustls::client::TlsStream; use crate::tls::TlsServerEndPoint; diff --git a/proxy/src/tls/server_config.rs b/proxy/src/tls/server_config.rs index 2cc1657eea..903c0b712b 100644 --- a/proxy/src/tls/server_config.rs +++ b/proxy/src/tls/server_config.rs @@ -1,12 +1,12 @@ use std::collections::{HashMap, HashSet}; use std::sync::Arc; -use anyhow::{bail, Context}; +use anyhow::{Context, bail}; use itertools::Itertools; use 
rustls::crypto::ring::{self, sign}; use rustls::pki_types::{CertificateDer, PrivateKeyDer}; -use super::{TlsServerEndPoint, PG_ALPN_PROTOCOL}; +use super::{PG_ALPN_PROTOCOL, TlsServerEndPoint}; pub struct TlsConfig { pub config: Arc, diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs index d369e3742f..004d268fa1 100644 --- a/proxy/src/usage_metrics.rs +++ b/proxy/src/usage_metrics.rs @@ -2,20 +2,21 @@ //! and push them to a HTTP endpoint. use std::borrow::Cow; use std::convert::Infallible; -use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering}; use std::sync::Arc; +use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering}; use std::time::Duration; -use anyhow::{bail, Context}; +use anyhow::{Context, bail}; use async_compression::tokio::write::GzipEncoder; use bytes::Bytes; use chrono::{DateTime, Datelike, Timelike, Utc}; -use clashmap::mapref::entry::Entry; use clashmap::ClashMap; -use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE}; +use clashmap::mapref::entry::Entry; +use consumption_metrics::{CHUNK_SIZE, Event, EventChunk, EventType, idempotency_key}; use once_cell::sync::Lazy; use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel}; use serde::{Deserialize, Serialize}; +use smol_str::SmolStr; use tokio::io::AsyncWriteExt; use tokio_util::sync::CancellationToken; use tracing::{error, info, instrument, trace, warn}; @@ -43,6 +44,33 @@ const HTTP_REPORTING_RETRY_DURATION: Duration = Duration::from_secs(60); pub(crate) struct Ids { pub(crate) endpoint_id: EndpointIdInt, pub(crate) branch_id: BranchIdInt, + pub(crate) direction: TrafficDirection, + #[serde(with = "none_as_empty_string")] + pub(crate) private_link_id: Option, +} + +mod none_as_empty_string { + use serde::Deserialize; + use smol_str::SmolStr; + + #[allow(clippy::ref_option)] + pub fn serialize(t: &Option, s: S) -> Result { + s.serialize_str(t.as_deref().unwrap_or("")) + } + + pub fn deserialize<'de, D: serde::Deserializer<'de>>( + d: D, + ) -> Result, D::Error> { + let s = SmolStr::deserialize(d)?; + if s.is_empty() { Ok(None) } else { Ok(Some(s)) } + } +} + +#[derive(Eq, Hash, PartialEq, Serialize, Deserialize, Debug, Clone)] +#[serde(rename_all = "lowercase")] +pub(crate) enum TrafficDirection { + Ingress, + Egress, } pub(crate) trait MetricCounterRecorder { @@ -505,6 +533,8 @@ mod tests { let counter = metrics.register(Ids { endpoint_id: (&EndpointId::from("e1")).into(), branch_id: (&BranchId::from("b1")).into(), + direction: TrafficDirection::Egress, + private_link_id: None, }); // the counter should be observed despite 0 egress diff --git a/pyproject.toml b/pyproject.toml index 92a660c233..c6e5073bcd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,7 @@ prometheus-client = "^0.14.1" pytest-timeout = "^2.3.1" Werkzeug = "^3.0.6" pytest-order = "^1.1.0" -allure-pytest = "^2.13.2" +allure-pytest = "^2.13.5" pytest-asyncio = "^0.21.0" toml = "^0.10.2" psutil = "^5.9.4" diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 38a7f202ba..591d60ea79 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,5 +1,5 @@ [toolchain] -channel = "1.84.1" +channel = "1.85.0" profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. 
# https://rust-lang.github.io/rustup/concepts/profiles.html diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index d12ebc1030..bb937ad56a 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "safekeeper" version = "0.1.0" -edition.workspace = true +edition = "2024" license.workspace = true [features] @@ -31,7 +31,6 @@ futures.workspace = true once_cell.workspace = true parking_lot.workspace = true pageserver_api.workspace = true -postgres.workspace = true postgres-protocol.workspace = true pprof.workspace = true rand.workspace = true diff --git a/safekeeper/benches/receive_wal.rs b/safekeeper/benches/receive_wal.rs index 19c6662e74..122630d953 100644 --- a/safekeeper/benches/receive_wal.rs +++ b/safekeeper/benches/receive_wal.rs @@ -4,7 +4,7 @@ use std::io::Write as _; use bytes::BytesMut; use camino_tempfile::tempfile; -use criterion::{criterion_group, criterion_main, BatchSize, Bencher, Criterion}; +use criterion::{BatchSize, Bencher, Criterion, criterion_group, criterion_main}; use itertools::Itertools as _; use postgres_ffi::v17::wal_generator::{LogicalMessageGenerator, WalGenerator}; use pprof::criterion::{Output, PProfProfiler}; @@ -13,6 +13,7 @@ use safekeeper::safekeeper::{ AcceptorProposerMessage, AppendRequest, AppendRequestHeader, ProposerAcceptorMessage, }; use safekeeper::test_utils::Env; +use safekeeper_api::membership::SafekeeperGeneration as Generation; use tokio::io::AsyncWriteExt as _; use utils::id::{NodeId, TenantTimelineId}; use utils::lsn::Lsn; @@ -26,7 +27,7 @@ const GB: usize = 1024 * MB; static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; #[allow(non_upper_case_globals)] -#[export_name = "malloc_conf"] +#[unsafe(export_name = "malloc_conf")] pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0"; // Register benchmarks with Criterion. 
@@ -88,13 +89,12 @@ fn bench_process_msg(c: &mut Criterion) { let (lsn, record) = walgen.next().expect("endless WAL"); ProposerAcceptorMessage::AppendRequest(AppendRequest { h: AppendRequestHeader { + generation: Generation::new(0), term: 1, - term_start_lsn: Lsn(0), begin_lsn: lsn, end_lsn: lsn + record.len() as u64, commit_lsn: if commit { lsn } else { Lsn(0) }, // commit previous record truncate_lsn: Lsn(0), - proposer_uuid: [0; 16], }, wal_data: record, }) @@ -160,13 +160,12 @@ fn bench_wal_acceptor(c: &mut Criterion) { .take(n) .map(|(lsn, record)| AppendRequest { h: AppendRequestHeader { + generation: Generation::new(0), term: 1, - term_start_lsn: Lsn(0), begin_lsn: lsn, end_lsn: lsn + record.len() as u64, commit_lsn: Lsn(0), truncate_lsn: Lsn(0), - proposer_uuid: [0; 16], }, wal_data: record, }) @@ -262,13 +261,12 @@ fn bench_wal_acceptor_throughput(c: &mut Criterion) { runtime.block_on(async { let reqgen = walgen.take(count).map(|(lsn, record)| AppendRequest { h: AppendRequestHeader { + generation: Generation::new(0), term: 1, - term_start_lsn: Lsn(0), begin_lsn: lsn, end_lsn: lsn + record.len() as u64, commit_lsn: if commit { lsn } else { Lsn(0) }, // commit previous record truncate_lsn: Lsn(0), - proposer_uuid: [0; 16], }, wal_data: record, }); diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 6cc53e0d23..10fc4a4b59 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -1,52 +1,41 @@ // // Main entry point for the safekeeper executable // -use anyhow::{bail, Context, Result}; -use camino::{Utf8Path, Utf8PathBuf}; -use clap::{ArgAction, Parser}; -use futures::future::BoxFuture; -use futures::stream::FuturesUnordered; -use futures::{FutureExt, StreamExt}; -use remote_storage::RemoteStorageConfig; -use sd_notify::NotifyState; -use tokio::runtime::Handle; -use tokio::signal::unix::{signal, SignalKind}; -use tokio::task::JoinError; -use utils::logging::SecretString; - -use std::env::{var, VarError}; +use std::env::{VarError, var}; use std::fs::{self, File}; use std::io::{ErrorKind, Write}; use std::str::FromStr; use std::sync::Arc; use std::time::{Duration, Instant}; -use storage_broker::Uri; - -use tracing::*; -use utils::pid_file; +use anyhow::{Context, Result, bail}; +use camino::{Utf8Path, Utf8PathBuf}; +use clap::{ArgAction, Parser}; +use futures::future::BoxFuture; +use futures::stream::FuturesUnordered; +use futures::{FutureExt, StreamExt}; use metrics::set_build_info_metric; +use remote_storage::RemoteStorageConfig; use safekeeper::defaults::{ DEFAULT_CONTROL_FILE_SAVE_INTERVAL, DEFAULT_EVICTION_MIN_RESIDENT, DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES, DEFAULT_PARTIAL_BACKUP_CONCURRENCY, DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR, }; -use safekeeper::http; -use safekeeper::wal_service; -use safekeeper::GlobalTimelines; -use safekeeper::SafeKeeperConf; -use safekeeper::{broker, WAL_SERVICE_RUNTIME}; -use safekeeper::{control_file, BROKER_RUNTIME}; -use safekeeper::{wal_backup, HTTP_RUNTIME}; -use storage_broker::DEFAULT_ENDPOINT; -use utils::auth::{JwtAuth, Scope, SwappableJwtAuth}; -use utils::{ - id::NodeId, - logging::{self, LogFormat}, - project_build_tag, project_git_version, - sentry_init::init_sentry, - tcp_listener, +use safekeeper::{ + BROKER_RUNTIME, GlobalTimelines, HTTP_RUNTIME, SafeKeeperConf, WAL_SERVICE_RUNTIME, broker, + control_file, http, wal_backup, wal_service, }; +use sd_notify::NotifyState; +use storage_broker::{DEFAULT_ENDPOINT, 
Uri}; +use tokio::runtime::Handle; +use tokio::signal::unix::{SignalKind, signal}; +use tokio::task::JoinError; +use tracing::*; +use utils::auth::{JwtAuth, Scope, SwappableJwtAuth}; +use utils::id::NodeId; +use utils::logging::{self, LogFormat, SecretString}; +use utils::sentry_init::init_sentry; +use utils::{pid_file, project_build_tag, project_git_version, tcp_listener}; #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; @@ -55,7 +44,7 @@ static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; /// This adds roughly 3% overhead for allocations on average, which is acceptable considering /// performance-sensitive code will avoid allocations as far as possible anyway. #[allow(non_upper_case_globals)] -#[export_name = "malloc_conf"] +#[unsafe(export_name = "malloc_conf")] pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0"; const PID_FILE_NAME: &str = "safekeeper.pid"; diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs index 4b091e2c29..de6e275124 100644 --- a/safekeeper/src/broker.rs +++ b/safekeeper/src/broker.rs @@ -1,39 +1,25 @@ //! Communication with the broker, providing safekeeper peers and pageserver coordination. -use anyhow::anyhow; -use anyhow::bail; -use anyhow::Context; - -use anyhow::Error; -use anyhow::Result; - -use storage_broker::parse_proto_ttid; - -use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey as ProtoSubscriptionKey; -use storage_broker::proto::FilterTenantTimelineId; -use storage_broker::proto::MessageType; -use storage_broker::proto::SafekeeperDiscoveryResponse; -use storage_broker::proto::SubscribeByFilterRequest; -use storage_broker::proto::SubscribeSafekeeperInfoRequest; -use storage_broker::proto::TypeSubscription; -use storage_broker::proto::TypedMessage; -use storage_broker::Request; - -use std::sync::atomic::AtomicU64; use std::sync::Arc; -use std::time::Duration; -use std::time::Instant; -use std::time::UNIX_EPOCH; +use std::sync::atomic::AtomicU64; +use std::time::{Duration, Instant, UNIX_EPOCH}; + +use anyhow::{Context, Error, Result, anyhow, bail}; +use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey as ProtoSubscriptionKey; +use storage_broker::proto::{ + FilterTenantTimelineId, MessageType, SafekeeperDiscoveryResponse, SubscribeByFilterRequest, + SubscribeSafekeeperInfoRequest, TypeSubscription, TypedMessage, +}; +use storage_broker::{Request, parse_proto_ttid}; use tokio::task::JoinHandle; use tokio::time::sleep; use tracing::*; -use crate::metrics::BROKER_ITERATION_TIMELINES; -use crate::metrics::BROKER_PULLED_UPDATES; -use crate::metrics::BROKER_PUSHED_UPDATES; -use crate::metrics::BROKER_PUSH_ALL_UPDATES_SECONDS; -use crate::GlobalTimelines; -use crate::SafeKeeperConf; +use crate::metrics::{ + BROKER_ITERATION_TIMELINES, BROKER_PULLED_UPDATES, BROKER_PUSH_ALL_UPDATES_SECONDS, + BROKER_PUSHED_UPDATES, +}; +use crate::{GlobalTimelines, SafeKeeperConf}; const RETRY_INTERVAL_MSEC: u64 = 1000; const PUSH_INTERVAL_MSEC: u64 = 1000; diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs index 35aebfd8ad..1bf3e4cac1 100644 --- a/safekeeper/src/control_file.rs +++ b/safekeeper/src/control_file.rs @@ -1,24 +1,23 @@ //! Control file serialization, deserialization and persistence. 
-use anyhow::{bail, ensure, Context, Result}; -use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; -use camino::{Utf8Path, Utf8PathBuf}; -use safekeeper_api::membership::INVALID_GENERATION; -use tokio::fs::File; -use tokio::io::AsyncWriteExt; -use utils::crashsafe::durable_rename; - use std::future::Future; use std::io::Read; use std::ops::Deref; use std::path::Path; use std::time::Instant; -use crate::control_file_upgrade::downgrade_v10_to_v9; -use crate::control_file_upgrade::upgrade_control_file; +use anyhow::{Context, Result, bail, ensure}; +use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; +use camino::{Utf8Path, Utf8PathBuf}; +use safekeeper_api::membership::INVALID_GENERATION; +use tokio::fs::File; +use tokio::io::AsyncWriteExt; +use utils::bin_ser::LeSer; +use utils::crashsafe::durable_rename; + +use crate::control_file_upgrade::{downgrade_v10_to_v9, upgrade_control_file}; use crate::metrics::PERSIST_CONTROL_FILE_SECONDS; use crate::state::{EvictionState, TimelinePersistentState}; -use utils::bin_ser::LeSer; pub const SK_MAGIC: u32 = 0xcafeceefu32; pub const SK_FORMAT_VERSION: u32 = 10; @@ -234,11 +233,12 @@ impl Storage for FileStorage { #[cfg(test)] mod test { - use super::*; use safekeeper_api::membership::{Configuration, MemberSet, SafekeeperGeneration}; use tokio::fs; use utils::lsn::Lsn; + use super::*; + const NO_SYNC: bool = true; #[tokio::test] diff --git a/safekeeper/src/control_file_upgrade.rs b/safekeeper/src/control_file_upgrade.rs index 904e79f976..1ad9e62f9b 100644 --- a/safekeeper/src/control_file_upgrade.rs +++ b/safekeeper/src/control_file_upgrade.rs @@ -1,24 +1,19 @@ //! Code to deal with safekeeper control file upgrades use std::vec; -use crate::{ - safekeeper::{AcceptorState, PgUuid, TermHistory, TermLsn}, - state::{EvictionState, TimelinePersistentState}, - wal_backup_partial, -}; -use anyhow::{bail, Result}; +use anyhow::{Result, bail}; use pq_proto::SystemId; -use safekeeper_api::{ - membership::{Configuration, INVALID_GENERATION}, - ServerInfo, Term, -}; +use safekeeper_api::membership::{Configuration, INVALID_GENERATION}; +use safekeeper_api::{ServerInfo, Term}; use serde::{Deserialize, Serialize}; use tracing::*; -use utils::{ - bin_ser::LeSer, - id::{NodeId, TenantId, TimelineId}, - lsn::Lsn, -}; +use utils::bin_ser::LeSer; +use utils::id::{NodeId, TenantId, TimelineId}; +use utils::lsn::Lsn; + +use crate::safekeeper::{AcceptorState, PgUuid, TermHistory, TermLsn}; +use crate::state::{EvictionState, TimelinePersistentState}; +use crate::wal_backup_partial; /// Persistent consensus state of the acceptor. 
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] @@ -552,11 +547,11 @@ pub fn downgrade_v10_to_v9(state: &TimelinePersistentState) -> TimelinePersisten mod tests { use std::str::FromStr; - use utils::{id::NodeId, Hex}; - - use crate::control_file_upgrade::PersistedPeerInfo; + use utils::Hex; + use utils::id::NodeId; use super::*; + use crate::control_file_upgrade::PersistedPeerInfo; #[test] fn roundtrip_v1() { diff --git a/safekeeper/src/copy_timeline.rs b/safekeeper/src/copy_timeline.rs index 10a761e1f5..11daff22cb 100644 --- a/safekeeper/src/copy_timeline.rs +++ b/safekeeper/src/copy_timeline.rs @@ -1,24 +1,22 @@ -use anyhow::{bail, Result}; +use std::sync::Arc; + +use anyhow::{Result, bail}; use camino::Utf8PathBuf; use postgres_ffi::{MAX_SEND_SIZE, WAL_SEGMENT_SIZE}; use safekeeper_api::membership::Configuration; -use std::sync::Arc; -use tokio::{ - fs::OpenOptions, - io::{AsyncSeekExt, AsyncWriteExt}, -}; +use tokio::fs::OpenOptions; +use tokio::io::{AsyncSeekExt, AsyncWriteExt}; use tracing::{info, warn}; -use utils::{id::TenantTimelineId, lsn::Lsn}; +use utils::id::TenantTimelineId; +use utils::lsn::Lsn; -use crate::{ - control_file::FileStorage, - state::TimelinePersistentState, - timeline::{TimelineError, WalResidentTimeline}, - timelines_global_map::{create_temp_timeline_dir, validate_temp_timeline}, - wal_backup::copy_s3_segments, - wal_storage::{wal_file_paths, WalReader}, - GlobalTimelines, -}; +use crate::GlobalTimelines; +use crate::control_file::FileStorage; +use crate::state::TimelinePersistentState; +use crate::timeline::{TimelineError, WalResidentTimeline}; +use crate::timelines_global_map::{create_temp_timeline_dir, validate_temp_timeline}; +use crate::wal_backup::copy_s3_segments; +use crate::wal_storage::{WalReader, wal_file_paths}; // we don't want to have more than 10 segments on disk after copy, because they take space const MAX_BACKUP_LAG: u64 = 10 * WAL_SEGMENT_SIZE as u64; diff --git a/safekeeper/src/debug_dump.rs b/safekeeper/src/debug_dump.rs index 19362a0992..68a38e1498 100644 --- a/safekeeper/src/debug_dump.rs +++ b/safekeeper/src/debug_dump.rs @@ -2,37 +2,25 @@ use std::fs; use std::fs::DirEntry; -use std::io::BufReader; -use std::io::Read; +use std::io::{BufReader, Read}; use std::path::PathBuf; use std::sync::Arc; -use anyhow::bail; -use anyhow::Result; -use camino::Utf8Path; -use camino::Utf8PathBuf; +use anyhow::{Result, bail}; +use camino::{Utf8Path, Utf8PathBuf}; use chrono::{DateTime, Utc}; -use postgres_ffi::XLogSegNo; -use postgres_ffi::MAX_SEND_SIZE; -use safekeeper_api::models::WalSenderState; -use serde::Deserialize; -use serde::Serialize; - use postgres_ffi::v14::xlog_utils::{IsPartialXLogFileName, IsXLogFileName}; +use postgres_ffi::{MAX_SEND_SIZE, XLogSegNo}; +use safekeeper_api::models::WalSenderState; +use serde::{Deserialize, Serialize}; use sha2::{Digest, Sha256}; -use utils::id::NodeId; -use utils::id::TenantTimelineId; -use utils::id::{TenantId, TimelineId}; +use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId}; use utils::lsn::Lsn; use crate::safekeeper::TermHistory; -use crate::state::TimelineMemState; -use crate::state::TimelinePersistentState; -use crate::timeline::get_timeline_dir; -use crate::timeline::WalResidentTimeline; -use crate::timeline_manager; -use crate::GlobalTimelines; -use crate::SafeKeeperConf; +use crate::state::{TimelineMemState, TimelinePersistentState}; +use crate::timeline::{WalResidentTimeline, get_timeline_dir}; +use crate::{GlobalTimelines, SafeKeeperConf, timeline_manager}; /// Various 
filters that influence the resulting JSON output. #[derive(Debug, Serialize, Deserialize, Clone)] diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index e77eeb4130..5ca3d1b7c2 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -1,35 +1,31 @@ //! Part of Safekeeper pretending to be Postgres, i.e. handling Postgres //! protocol commands. +use std::future::Future; +use std::str::{self, FromStr}; +use std::sync::Arc; + use anyhow::Context; use pageserver_api::models::ShardParameters; use pageserver_api::shard::{ShardIdentity, ShardStripeSize}; -use safekeeper_api::models::ConnectionId; +use postgres_backend::{PostgresBackend, QueryError}; +use postgres_ffi::PG_TLI; +use pq_proto::{BeMessage, FeStartupPacket, INT4_OID, RowDescriptor, TEXT_OID}; +use regex::Regex; use safekeeper_api::Term; -use std::future::Future; -use std::str::{self, FromStr}; -use std::sync::Arc; +use safekeeper_api::models::ConnectionId; use tokio::io::{AsyncRead, AsyncWrite}; -use tracing::{debug, info, info_span, Instrument}; +use tracing::{Instrument, debug, info, info_span}; +use utils::auth::{Claims, JwtAuth, Scope}; +use utils::id::{TenantId, TenantTimelineId, TimelineId}; +use utils::lsn::Lsn; use utils::postgres_client::PostgresClientProtocol; use utils::shard::{ShardCount, ShardNumber}; use crate::auth::check_permission; -use crate::json_ctrl::{handle_json_ctrl, AppendLogicalMessage}; - -use crate::metrics::{TrafficMetrics, PG_QUERIES_GAUGE}; +use crate::metrics::{PG_QUERIES_GAUGE, TrafficMetrics}; use crate::timeline::TimelineError; use crate::{GlobalTimelines, SafeKeeperConf}; -use postgres_backend::PostgresBackend; -use postgres_backend::QueryError; -use postgres_ffi::PG_TLI; -use pq_proto::{BeMessage, FeStartupPacket, RowDescriptor, INT4_OID, TEXT_OID}; -use regex::Regex; -use utils::auth::{Claims, JwtAuth, Scope}; -use utils::{ - id::{TenantId, TenantTimelineId, TimelineId}, - lsn::Lsn, -}; /// Safekeeper handler of postgres commands pub struct SafekeeperPostgresHandler { @@ -65,9 +61,6 @@ enum SafekeeperPostgresCommand { }, IdentifySystem, TimelineStatus, - JSONCtrl { - cmd: AppendLogicalMessage, - }, } fn parse_cmd(cmd: &str) -> anyhow::Result { @@ -137,11 +130,6 @@ fn parse_cmd(cmd: &str) -> anyhow::Result { Ok(SafekeeperPostgresCommand::IdentifySystem) } else if cmd.starts_with("TIMELINE_STATUS") { Ok(SafekeeperPostgresCommand::TimelineStatus) - } else if cmd.starts_with("JSON_CTRL") { - let cmd = cmd.strip_prefix("JSON_CTRL").context("invalid prefix")?; - Ok(SafekeeperPostgresCommand::JSONCtrl { - cmd: serde_json::from_str(cmd)?, - }) } else { anyhow::bail!("unsupported command {cmd}"); } @@ -153,7 +141,6 @@ fn cmd_to_string(cmd: &SafekeeperPostgresCommand) -> &str { SafekeeperPostgresCommand::StartReplication { .. } => "START_REPLICATION", SafekeeperPostgresCommand::TimelineStatus => "TIMELINE_STATUS", SafekeeperPostgresCommand::IdentifySystem => "IDENTIFY_SYSTEM", - SafekeeperPostgresCommand::JSONCtrl { .. 
} => "JSON_CTRL", } } @@ -362,9 +349,6 @@ impl postgres_backend::Handler } SafekeeperPostgresCommand::IdentifySystem => self.handle_identify_system(pgb).await, SafekeeperPostgresCommand::TimelineStatus => self.handle_timeline_status(pgb).await, - SafekeeperPostgresCommand::JSONCtrl { ref cmd } => { - handle_json_ctrl(self, pgb, cmd).await - } } }) } diff --git a/safekeeper/src/http/mod.rs b/safekeeper/src/http/mod.rs index 6e160b7a5e..f162985ef7 100644 --- a/safekeeper/src/http/mod.rs +++ b/safekeeper/src/http/mod.rs @@ -1,9 +1,9 @@ pub mod routes; -pub use routes::make_router; - -pub use safekeeper_api::models; use std::sync::Arc; +pub use routes::make_router; +pub use safekeeper_api::models; + use crate::{GlobalTimelines, SafeKeeperConf}; pub async fn task_main( diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index cd2ac5f44c..4f47331c85 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -1,51 +1,41 @@ -use http_utils::failpoints::failpoints_handler; -use hyper::{Body, Request, Response, StatusCode}; -use safekeeper_api::models; -use safekeeper_api::models::AcceptorStateStatus; -use safekeeper_api::models::PullTimelineRequest; -use safekeeper_api::models::SafekeeperStatus; -use safekeeper_api::models::TermSwitchApiEntry; -use safekeeper_api::models::TimelineStatus; -use safekeeper_api::ServerInfo; use std::collections::HashMap; use std::fmt; use std::io::Write as _; use std::str::FromStr; use std::sync::Arc; -use storage_broker::proto::SafekeeperTimelineInfo; -use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; + +use http_utils::endpoint::{ + self, ChannelWriter, auth_middleware, check_permission_with, profile_cpu_handler, + profile_heap_handler, prometheus_metrics_handler, request_span, +}; +use http_utils::error::ApiError; +use http_utils::failpoints::failpoints_handler; +use http_utils::json::{json_request, json_response}; +use http_utils::request::{ensure_no_body, parse_query_param, parse_request_param}; +use http_utils::{RequestExt, RouterBuilder}; +use hyper::{Body, Request, Response, StatusCode}; +use postgres_ffi::WAL_SEGMENT_SIZE; +use safekeeper_api::models::{ + AcceptorStateStatus, PullTimelineRequest, SafekeeperStatus, SkTimelineInfo, TermSwitchApiEntry, + TimelineCopyRequest, TimelineCreateRequest, TimelineStatus, TimelineTermBumpRequest, +}; +use safekeeper_api::{ServerInfo, membership, models}; +use storage_broker::proto::{SafekeeperTimelineInfo, TenantTimelineId as ProtoTenantTimelineId}; use tokio::sync::mpsc; use tokio::task; use tokio_stream::wrappers::ReceiverStream; use tokio_util::sync::CancellationToken; -use tracing::{info_span, Instrument}; - -use http_utils::endpoint::{ - profile_cpu_handler, profile_heap_handler, prometheus_metrics_handler, request_span, -}; -use http_utils::{ - endpoint::{self, auth_middleware, check_permission_with, ChannelWriter}, - error::ApiError, - json::{json_request, json_response}, - request::{ensure_no_body, parse_query_param, parse_request_param}, - RequestExt, RouterBuilder, -}; - -use postgres_ffi::WAL_SEGMENT_SIZE; -use safekeeper_api::models::{SkTimelineInfo, TimelineCopyRequest}; -use safekeeper_api::models::{TimelineCreateRequest, TimelineTermBumpRequest}; -use utils::{ - auth::SwappableJwtAuth, - id::{TenantId, TenantTimelineId, TimelineId}, - lsn::Lsn, -}; +use tracing::{Instrument, info_span}; +use utils::auth::SwappableJwtAuth; +use utils::id::{TenantId, TenantTimelineId, TimelineId}; +use utils::lsn::Lsn; use 
crate::debug_dump::TimelineDigestRequest; use crate::safekeeper::TermLsn; -use crate::timelines_global_map::TimelineDeleteForceResult; -use crate::GlobalTimelines; -use crate::SafeKeeperConf; -use crate::{copy_timeline, debug_dump, patch_control_file, pull_timeline}; +use crate::timelines_global_map::{DeleteOrExclude, TimelineDeleteResult}; +use crate::{ + GlobalTimelines, SafeKeeperConf, copy_timeline, debug_dump, patch_control_file, pull_timeline, +}; /// Healthcheck handler. async fn status_handler(request: Request) -> Result, ApiError> { @@ -83,10 +73,13 @@ async fn tenant_delete_handler(mut request: Request) -> Result) -> Result>(), + .collect::>(), ) } @@ -218,12 +211,15 @@ async fn timeline_delete_handler(mut request: Request) -> Result) -> Result for ApiError { + fn from(de: DeleteOrExcludeError) -> ApiError { + match de { + DeleteOrExcludeError::Conflict { + requested: _, + current: _, + } => ApiError::Conflict(de.to_string()), + DeleteOrExcludeError::Other(e) => ApiError::InternalServerError(e), + } + } +} + +/// Remove timeline locally after this node has been excluded from the +/// membership configuration. The body is the same as in the membership endpoint +/// -- conf where node is excluded -- and in principle single ep could be used +/// for both actions, but since this is a data deletion op let's keep them +/// separate. +async fn timeline_exclude_handler(mut request: Request) -> Result, ApiError> { + let ttid = TenantTimelineId::new( + parse_request_param(&request, "tenant_id")?, + parse_request_param(&request, "timeline_id")?, + ); + check_permission(&request, Some(ttid.tenant_id))?; + + let global_timelines = get_global_timelines(&request); + let data: models::TimelineMembershipSwitchRequest = json_request(&mut request).await?; + let my_id = get_conf(&request).my_id; + // If request doesn't exclude us, membership switch endpoint should be used + // instead. + if data.mconf.contains(my_id) { + return Err(ApiError::Forbidden(format!( + "refused to switch into {}, node {} is member of it", + data.mconf, my_id + ))); + } + let action = DeleteOrExclude::Exclude(data.mconf); + + let resp = global_timelines + .delete_or_exclude(&ttid, action) + .await + .map_err(ApiError::from)?; + json_response(StatusCode::OK, resp) +} + /// Consider switching timeline membership configuration to the provided one. async fn timeline_membership_handler( mut request: Request, @@ -291,12 +345,29 @@ async fn timeline_membership_handler( let tli = global_timelines.get(ttid).map_err(ApiError::from)?; let data: models::TimelineMembershipSwitchRequest = json_request(&mut request).await?; + let my_id = get_conf(&request).my_id; + // If request excludes us, exclude endpoint should be used instead. + if !data.mconf.contains(my_id) { + return Err(ApiError::Forbidden(format!( + "refused to switch into {}, node {} is not a member of it", + data.mconf, my_id + ))); + } + let req_gen = data.mconf.generation; let response = tli .membership_switch(data.mconf) .await .map_err(ApiError::InternalServerError)?; - json_response(StatusCode::OK, response) + // Return 409 if request was ignored. 
+ if req_gen == response.current_conf.generation { + json_response(StatusCode::OK, response) + } else { + Err(ApiError::Conflict(format!( + "request to switch into {} ignored, current generation {}", + req_gen, response.current_conf.generation + ))) + } } async fn timeline_copy_handler(mut request: Request) -> Result, ApiError> { @@ -647,11 +718,14 @@ pub fn make_router( .post("/v1/pull_timeline", |r| { request_span(r, timeline_pull_handler) }) + .put("/v1/tenant/:tenant_id/timeline/:timeline_id/exclude", |r| { + request_span(r, timeline_exclude_handler) + }) .get( "/v1/tenant/:tenant_id/timeline/:timeline_id/snapshot/:destination_id", |r| request_span(r, timeline_snapshot_handler), ) - .post( + .put( "/v1/tenant/:tenant_id/timeline/:timeline_id/membership", |r| request_span(r, timeline_membership_handler), ) diff --git a/safekeeper/src/json_ctrl.rs b/safekeeper/src/json_ctrl.rs deleted file mode 100644 index 19e17c4a75..0000000000 --- a/safekeeper/src/json_ctrl.rs +++ /dev/null @@ -1,196 +0,0 @@ -//! -//! This module implements JSON_CTRL protocol, which allows exchange -//! JSON messages over psql for testing purposes. -//! -//! Currently supports AppendLogicalMessage, which is used for WAL -//! modifications in tests. -//! - -use anyhow::Context; -use postgres_backend::QueryError; -use safekeeper_api::membership::Configuration; -use safekeeper_api::{ServerInfo, Term}; -use serde::{Deserialize, Serialize}; -use tokio::io::{AsyncRead, AsyncWrite}; -use tracing::*; - -use crate::handler::SafekeeperPostgresHandler; -use crate::safekeeper::{AcceptorProposerMessage, AppendResponse}; -use crate::safekeeper::{ - AppendRequest, AppendRequestHeader, ProposerAcceptorMessage, ProposerElected, -}; -use crate::safekeeper::{TermHistory, TermLsn}; -use crate::state::TimelinePersistentState; -use crate::timeline::WalResidentTimeline; -use postgres_backend::PostgresBackend; -use postgres_ffi::encode_logical_message; -use postgres_ffi::WAL_SEGMENT_SIZE; -use pq_proto::{BeMessage, RowDescriptor, TEXT_OID}; -use utils::lsn::Lsn; - -#[derive(Serialize, Deserialize, Debug)] -pub struct AppendLogicalMessage { - // prefix and message to build LogicalMessage - pub lm_prefix: String, - pub lm_message: String, - - // if true, commit_lsn will match flush_lsn after append - pub set_commit_lsn: bool, - - // if true, ProposerElected will be sent before append - pub send_proposer_elected: bool, - - // fields from AppendRequestHeader - pub term: Term, - #[serde(with = "utils::lsn::serde_as_u64")] - pub epoch_start_lsn: Lsn, - #[serde(with = "utils::lsn::serde_as_u64")] - pub begin_lsn: Lsn, - #[serde(with = "utils::lsn::serde_as_u64")] - pub truncate_lsn: Lsn, - pub pg_version: u32, -} - -#[derive(Debug, Serialize)] -struct AppendResult { - // safekeeper state after append - state: TimelinePersistentState, - // info about new record in the WAL - inserted_wal: InsertedWAL, -} - -/// Handles command to craft logical message WAL record with given -/// content, and then append it with specified term and lsn. This -/// function is used to test safekeepers in different scenarios. 
-pub async fn handle_json_ctrl( - spg: &SafekeeperPostgresHandler, - pgb: &mut PostgresBackend, - append_request: &AppendLogicalMessage, -) -> Result<(), QueryError> { - info!("JSON_CTRL request: {append_request:?}"); - - // need to init safekeeper state before AppendRequest - let tli = prepare_safekeeper(spg, append_request.pg_version).await?; - - // if send_proposer_elected is true, we need to update local history - if append_request.send_proposer_elected { - send_proposer_elected(&tli, append_request.term, append_request.epoch_start_lsn).await?; - } - - let inserted_wal = append_logical_message(&tli, append_request).await?; - let response = AppendResult { - state: tli.get_state().await.1, - inserted_wal, - }; - let response_data = serde_json::to_vec(&response) - .with_context(|| format!("Response {response:?} is not a json array"))?; - - pgb.write_message_noflush(&BeMessage::RowDescription(&[RowDescriptor { - name: b"json", - typoid: TEXT_OID, - typlen: -1, - ..Default::default() - }]))? - .write_message_noflush(&BeMessage::DataRow(&[Some(&response_data)]))? - .write_message_noflush(&BeMessage::CommandComplete(b"JSON_CTRL"))?; - Ok(()) -} - -/// Prepare safekeeper to process append requests without crashes, -/// by sending ProposerGreeting with default server.wal_seg_size. -async fn prepare_safekeeper( - spg: &SafekeeperPostgresHandler, - pg_version: u32, -) -> anyhow::Result { - let tli = spg - .global_timelines - .create( - spg.ttid, - Configuration::empty(), - ServerInfo { - pg_version, - wal_seg_size: WAL_SEGMENT_SIZE as u32, - system_id: 0, - }, - Lsn::INVALID, - Lsn::INVALID, - ) - .await?; - - tli.wal_residence_guard().await -} - -async fn send_proposer_elected( - tli: &WalResidentTimeline, - term: Term, - lsn: Lsn, -) -> anyhow::Result<()> { - // add new term to existing history - let history = tli.get_state().await.1.acceptor_state.term_history; - let history = history.up_to(lsn.checked_sub(1u64).unwrap()); - let mut history_entries = history.0; - history_entries.push(TermLsn { term, lsn }); - let history = TermHistory(history_entries); - - let proposer_elected_request = ProposerAcceptorMessage::Elected(ProposerElected { - term, - start_streaming_at: lsn, - term_history: history, - timeline_start_lsn: lsn, - }); - - tli.process_msg(&proposer_elected_request).await?; - Ok(()) -} - -#[derive(Debug, Serialize)] -pub struct InsertedWAL { - begin_lsn: Lsn, - pub end_lsn: Lsn, - append_response: AppendResponse, -} - -/// Extend local WAL with new LogicalMessage record. To do that, -/// create AppendRequest with new WAL and pass it to safekeeper. 
-pub async fn append_logical_message( - tli: &WalResidentTimeline, - msg: &AppendLogicalMessage, -) -> anyhow::Result { - let wal_data = encode_logical_message(&msg.lm_prefix, &msg.lm_message); - let sk_state = tli.get_state().await.1; - - let begin_lsn = msg.begin_lsn; - let end_lsn = begin_lsn + wal_data.len() as u64; - - let commit_lsn = if msg.set_commit_lsn { - end_lsn - } else { - sk_state.commit_lsn - }; - - let append_request = ProposerAcceptorMessage::AppendRequest(AppendRequest { - h: AppendRequestHeader { - term: msg.term, - term_start_lsn: begin_lsn, - begin_lsn, - end_lsn, - commit_lsn, - truncate_lsn: msg.truncate_lsn, - proposer_uuid: [0u8; 16], - }, - wal_data, - }); - - let response = tli.process_msg(&append_request).await?; - - let append_response = match response { - Some(AcceptorProposerMessage::AppendResponse(resp)) => resp, - _ => anyhow::bail!("not AppendResponse"), - }; - - Ok(InsertedWAL { - begin_lsn, - end_lsn, - append_response, - }) -} diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index e0090c638a..de3b783508 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -2,15 +2,16 @@ extern crate hyper0 as hyper; +use std::time::Duration; + use camino::Utf8PathBuf; use once_cell::sync::Lazy; use remote_storage::RemoteStorageConfig; -use tokio::runtime::Runtime; - -use std::time::Duration; use storage_broker::Uri; - -use utils::{auth::SwappableJwtAuth, id::NodeId, logging::SecretString}; +use tokio::runtime::Runtime; +use utils::auth::SwappableJwtAuth; +use utils::id::NodeId; +use utils::logging::SecretString; mod auth; pub mod broker; @@ -20,7 +21,6 @@ pub mod copy_timeline; pub mod debug_dump; pub mod handler; pub mod http; -pub mod json_ctrl; pub mod metrics; pub mod patch_control_file; pub mod pull_timeline; @@ -48,6 +48,7 @@ pub mod test_utils; mod timelines_global_map; use std::sync::Arc; + pub use timelines_global_map::GlobalTimelines; use utils::auth::JwtAuth; diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index 3ea9e3d674..cb21a5f6d2 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -1,30 +1,28 @@ //! Global safekeeper mertics and per-timeline safekeeper metrics. 
-use std::{ - sync::{Arc, RwLock}, - time::{Instant, SystemTime}, -}; +use std::sync::{Arc, RwLock}; +use std::time::{Instant, SystemTime}; use anyhow::Result; use futures::Future; +use metrics::core::{AtomicU64, Collector, Desc, GenericCounter, GenericGaugeVec, Opts}; +use metrics::proto::MetricFamily; use metrics::{ - core::{AtomicU64, Collector, Desc, GenericCounter, GenericGaugeVec, Opts}, - pow2_buckets, - proto::MetricFamily, + DISK_FSYNC_SECONDS_BUCKETS, Gauge, GaugeVec, Histogram, HistogramVec, IntCounter, + IntCounterPair, IntCounterPairVec, IntCounterVec, IntGauge, IntGaugeVec, pow2_buckets, register_histogram, register_histogram_vec, register_int_counter, register_int_counter_pair, register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge, - register_int_gauge_vec, Gauge, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair, - IntCounterPairVec, IntCounterVec, IntGauge, IntGaugeVec, DISK_FSYNC_SECONDS_BUCKETS, + register_int_gauge_vec, }; use once_cell::sync::Lazy; use postgres_ffi::XLogSegNo; -use utils::{id::TenantTimelineId, lsn::Lsn, pageserver_feedback::PageserverFeedback}; +use utils::id::TenantTimelineId; +use utils::lsn::Lsn; +use utils::pageserver_feedback::PageserverFeedback; -use crate::{ - receive_wal::MSG_QUEUE_SIZE, - state::{TimelineMemState, TimelinePersistentState}, - GlobalTimelines, -}; +use crate::GlobalTimelines; +use crate::receive_wal::MSG_QUEUE_SIZE; +use crate::state::{TimelineMemState, TimelinePersistentState}; // Global metrics across all timelines. pub static WRITE_WAL_BYTES: Lazy = Lazy::new(|| { diff --git a/safekeeper/src/patch_control_file.rs b/safekeeper/src/patch_control_file.rs index 2136d1b5f7..efdbd9b3d7 100644 --- a/safekeeper/src/patch_control_file.rs +++ b/safekeeper/src/patch_control_file.rs @@ -4,7 +4,8 @@ use serde::{Deserialize, Serialize}; use serde_json::Value; use tracing::info; -use crate::{state::TimelinePersistentState, timeline::Timeline}; +use crate::state::TimelinePersistentState; +use crate::timeline::Timeline; #[derive(Deserialize, Debug, Clone)] pub struct Request { diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs index 4827b73074..fc58b8509a 100644 --- a/safekeeper/src/pull_timeline.rs +++ b/safekeeper/src/pull_timeline.rs @@ -1,46 +1,38 @@ -use anyhow::{anyhow, bail, Context, Result}; +use std::cmp::min; +use std::io::{self, ErrorKind}; +use std::sync::Arc; + +use anyhow::{Context, Result, anyhow, bail}; use bytes::Bytes; use camino::Utf8PathBuf; use chrono::{DateTime, Utc}; use futures::{SinkExt, StreamExt, TryStreamExt}; -use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI}; -use safekeeper_api::{ - models::{PullTimelineRequest, PullTimelineResponse, TimelineStatus}, - Term, -}; +use postgres_ffi::{PG_TLI, XLogFileName, XLogSegNo}; +use safekeeper_api::Term; +use safekeeper_api::models::{PullTimelineRequest, PullTimelineResponse, TimelineStatus}; use safekeeper_client::mgmt_api; use safekeeper_client::mgmt_api::Client; use serde::Deserialize; -use std::{ - cmp::min, - io::{self, ErrorKind}, - sync::Arc, -}; -use tokio::{fs::OpenOptions, io::AsyncWrite, sync::mpsc, task}; +use tokio::fs::OpenOptions; +use tokio::io::AsyncWrite; +use tokio::sync::mpsc; +use tokio::task; use tokio_tar::{Archive, Builder, Header}; -use tokio_util::{ - io::{CopyToBytes, SinkWriter}, - sync::PollSender, -}; +use tokio_util::io::{CopyToBytes, SinkWriter}; +use tokio_util::sync::PollSender; use tracing::{error, info, instrument}; +use utils::crashsafe::fsync_async_opt; +use 
utils::id::{NodeId, TenantTimelineId}; +use utils::logging::SecretString; +use utils::lsn::Lsn; +use utils::pausable_failpoint; -use crate::{ - control_file::CONTROL_FILE_NAME, - debug_dump, - state::{EvictionState, TimelinePersistentState}, - timeline::{Timeline, WalResidentTimeline}, - timelines_global_map::{create_temp_timeline_dir, validate_temp_timeline}, - wal_backup, - wal_storage::open_wal_file, - GlobalTimelines, -}; -use utils::{ - crashsafe::fsync_async_opt, - id::{NodeId, TenantTimelineId}, - logging::SecretString, - lsn::Lsn, - pausable_failpoint, -}; +use crate::control_file::CONTROL_FILE_NAME; +use crate::state::{EvictionState, TimelinePersistentState}; +use crate::timeline::{Timeline, WalResidentTimeline}; +use crate::timelines_global_map::{create_temp_timeline_dir, validate_temp_timeline}; +use crate::wal_storage::open_wal_file; +use crate::{GlobalTimelines, debug_dump, wal_backup}; /// Stream tar archive of timeline to tx. #[instrument(name = "snapshot", skip_all, fields(ttid = %tli.ttid))] @@ -374,8 +366,13 @@ impl WalResidentTimeline { // change, but as long as older history is strictly part of new that's // fine), but there is no need to do it. if bctx.term != term || bctx.last_log_term != last_log_term { - bail!("term(s) changed during snapshot: were term={}, last_log_term={}, now term={}, last_log_term={}", - bctx.term, bctx.last_log_term, term, last_log_term); + bail!( + "term(s) changed during snapshot: were term={}, last_log_term={}, now term={}, last_log_term={}", + bctx.term, + bctx.last_log_term, + term, + last_log_term + ); } Ok(()) } diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index cb42f6f414..7967acde3f 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -2,35 +2,21 @@ //! Gets messages from the network, passes them down to consensus module and //! sends replies back. 
-use crate::handler::SafekeeperPostgresHandler; -use crate::metrics::{ - WAL_RECEIVERS, WAL_RECEIVER_QUEUE_DEPTH, WAL_RECEIVER_QUEUE_DEPTH_TOTAL, - WAL_RECEIVER_QUEUE_SIZE_TOTAL, -}; -use crate::safekeeper::AcceptorProposerMessage; -use crate::safekeeper::ProposerAcceptorMessage; -use crate::timeline::WalResidentTimeline; -use crate::GlobalTimelines; -use anyhow::{anyhow, Context}; -use bytes::BytesMut; -use parking_lot::MappedMutexGuard; -use parking_lot::Mutex; -use parking_lot::MutexGuard; -use postgres_backend::CopyStreamHandlerEnd; -use postgres_backend::PostgresBackend; -use postgres_backend::PostgresBackendReader; -use postgres_backend::QueryError; -use pq_proto::BeMessage; -use safekeeper_api::membership::Configuration; -use safekeeper_api::models::{ConnectionId, WalReceiverState, WalReceiverStatus}; -use safekeeper_api::ServerInfo; use std::future; use std::net::SocketAddr; use std::sync::Arc; -use tokio::io::AsyncRead; -use tokio::io::AsyncWrite; + +use anyhow::{Context, anyhow}; +use bytes::BytesMut; +use parking_lot::{MappedMutexGuard, Mutex, MutexGuard}; +use postgres_backend::{CopyStreamHandlerEnd, PostgresBackend, PostgresBackendReader, QueryError}; +use pq_proto::BeMessage; +use safekeeper_api::ServerInfo; +use safekeeper_api::membership::Configuration; +use safekeeper_api::models::{ConnectionId, WalReceiverState, WalReceiverStatus}; +use tokio::io::{AsyncRead, AsyncWrite}; use tokio::sync::mpsc::error::SendTimeoutError; -use tokio::sync::mpsc::{channel, Receiver, Sender}; +use tokio::sync::mpsc::{Receiver, Sender, channel}; use tokio::task; use tokio::task::JoinHandle; use tokio::time::{Duration, Instant, MissedTickBehavior}; @@ -39,6 +25,15 @@ use utils::id::TenantTimelineId; use utils::lsn::Lsn; use utils::pageserver_feedback::PageserverFeedback; +use crate::GlobalTimelines; +use crate::handler::SafekeeperPostgresHandler; +use crate::metrics::{ + WAL_RECEIVER_QUEUE_DEPTH, WAL_RECEIVER_QUEUE_DEPTH_TOTAL, WAL_RECEIVER_QUEUE_SIZE_TOTAL, + WAL_RECEIVERS, +}; +use crate::safekeeper::{AcceptorProposerMessage, ProposerAcceptorMessage}; +use crate::timeline::WalResidentTimeline; + const DEFAULT_FEEDBACK_CAPACITY: usize = 8; /// Registry of WalReceivers (compute connections). Timeline holds it (wrapped @@ -281,7 +276,7 @@ impl SafekeeperPostgresHandler { tokio::select! 
{ // todo: add read|write .context to these errors r = network_reader.run(msg_tx, msg_rx, reply_tx, timeline, next_msg) => r, - r = network_write(pgb, reply_rx, pageserver_feedback_rx) => r, + r = network_write(pgb, reply_rx, pageserver_feedback_rx, proto_version) => r, _ = timeline_cancel.cancelled() => { return Err(CopyStreamHandlerEnd::Cancelled); } @@ -342,8 +337,8 @@ impl NetworkReader<'_, IO> { let tli = match next_msg { ProposerAcceptorMessage::Greeting(ref greeting) => { info!( - "start handshake with walproposer {} sysid {} timeline {}", - self.peer_addr, greeting.system_id, greeting.tli, + "start handshake with walproposer {} sysid {}", + self.peer_addr, greeting.system_id, ); let server_info = ServerInfo { pg_version: greeting.pg_version, @@ -371,7 +366,7 @@ impl NetworkReader<'_, IO> { _ => { return Err(CopyStreamHandlerEnd::Other(anyhow::anyhow!( "unexpected message {next_msg:?} instead of greeting" - ))) + ))); } }; Ok((tli, next_msg)) @@ -459,6 +454,7 @@ async fn network_write( pgb_writer: &mut PostgresBackend, mut reply_rx: Receiver, mut pageserver_feedback_rx: tokio::sync::broadcast::Receiver, + proto_version: u32, ) -> Result<(), CopyStreamHandlerEnd> { let mut buf = BytesMut::with_capacity(128); @@ -496,7 +492,7 @@ async fn network_write( }; buf.clear(); - msg.serialize(&mut buf)?; + msg.serialize(&mut buf, proto_version)?; pgb_writer.write_message(&BeMessage::CopyData(&buf)).await?; } } diff --git a/safekeeper/src/recovery.rs b/safekeeper/src/recovery.rs index 61647c16b0..c2760792b8 100644 --- a/safekeeper/src/recovery.rs +++ b/safekeeper/src/recovery.rs @@ -1,39 +1,36 @@ //! This module implements pulling WAL from peer safekeepers if compute can't //! provide it, i.e. safekeeper lags too much. +use std::fmt; +use std::pin::pin; use std::time::SystemTime; -use std::{fmt, pin::pin}; -use anyhow::{bail, Context}; +use anyhow::{Context, bail}; use futures::StreamExt; use postgres_protocol::message::backend::ReplicationMessage; -use safekeeper_api::models::{PeerInfo, TimelineStatus}; use safekeeper_api::Term; -use tokio::sync::mpsc::{channel, Receiver, Sender}; -use tokio::time::timeout; -use tokio::{ - select, - time::sleep, - time::{self, Duration}, -}; +use safekeeper_api::membership::INVALID_GENERATION; +use safekeeper_api::models::{PeerInfo, TimelineStatus}; +use tokio::select; +use tokio::sync::mpsc::{Receiver, Sender, channel}; +use tokio::time::{self, Duration, sleep, timeout}; use tokio_postgres::replication::ReplicationStream; use tokio_postgres::types::PgLsn; use tracing::*; -use utils::postgres_client::{ConnectionConfigArgs, PostgresClientProtocol}; -use utils::{id::NodeId, lsn::Lsn, postgres_client::wal_stream_connection_config}; - -use crate::receive_wal::{WalAcceptor, REPLY_QUEUE_SIZE}; -use crate::safekeeper::{AppendRequest, AppendRequestHeader}; -use crate::timeline::WalResidentTimeline; -use crate::{ - receive_wal::MSG_QUEUE_SIZE, - safekeeper::{ - AcceptorProposerMessage, ProposerAcceptorMessage, ProposerElected, TermHistory, TermLsn, - VoteRequest, - }, - SafeKeeperConf, +use utils::id::NodeId; +use utils::lsn::Lsn; +use utils::postgres_client::{ + ConnectionConfigArgs, PostgresClientProtocol, wal_stream_connection_config, }; +use crate::SafeKeeperConf; +use crate::receive_wal::{MSG_QUEUE_SIZE, REPLY_QUEUE_SIZE, WalAcceptor}; +use crate::safekeeper::{ + AcceptorProposerMessage, AppendRequest, AppendRequestHeader, ProposerAcceptorMessage, + ProposerElected, TermHistory, TermLsn, VoteRequest, +}; +use crate::timeline::WalResidentTimeline; + /// Entrypoint 
for per timeline task which always runs, checking whether /// recovery for this safekeeper is needed and starting it if so. #[instrument(name = "recovery", skip_all, fields(ttid = %tli.ttid))] @@ -267,7 +264,10 @@ async fn recover( ); // Now understand our term history. - let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { term: donor.term }); + let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { + generation: INVALID_GENERATION, + term: donor.term, + }); let vote_response = match tli .process_msg(&vote_request) .await @@ -302,10 +302,10 @@ async fn recover( // truncate WAL locally let pe = ProposerAcceptorMessage::Elected(ProposerElected { + generation: INVALID_GENERATION, term: donor.term, start_streaming_at: last_common_point.lsn, term_history: donor_th, - timeline_start_lsn: Lsn::INVALID, }); // Successful ProposerElected handling always returns None. If term changed, // we'll find out that during the streaming. Note: it is expected to get @@ -343,12 +343,17 @@ async fn recovery_stream( cfg.replication_mode(tokio_postgres::config::ReplicationMode::Physical); let connect_timeout = Duration::from_millis(10000); - let (client, connection) = match time::timeout(connect_timeout, cfg.connect(postgres::NoTls)) - .await + let (client, connection) = match time::timeout( + connect_timeout, + cfg.connect(tokio_postgres::NoTls), + ) + .await { Ok(client_and_conn) => client_and_conn?, Err(_elapsed) => { - bail!("timed out while waiting {connect_timeout:?} for connection to peer safekeeper to open"); + bail!( + "timed out while waiting {connect_timeout:?} for connection to peer safekeeper to open" + ); } }; trace!("connected to {:?}", donor); @@ -434,13 +439,12 @@ async fn network_io( match msg { ReplicationMessage::XLogData(xlog_data) => { let ar_hdr = AppendRequestHeader { + generation: INVALID_GENERATION, term: donor.term, - term_start_lsn: Lsn::INVALID, // unused begin_lsn: Lsn(xlog_data.wal_start()), end_lsn: Lsn(xlog_data.wal_start()) + xlog_data.data().len() as u64, commit_lsn: Lsn::INVALID, // do not attempt to advance, peer communication anyway does it truncate_lsn: Lsn::INVALID, // do not attempt to advance - proposer_uuid: [0; 16], }; let ar = AppendRequest { h: ar_hdr, diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index f816f8459a..0edac04b97 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -1,35 +1,34 @@ //! Acceptor part of proposer-acceptor consensus algorithm. 
-use anyhow::{bail, Context, Result}; -use byteorder::{LittleEndian, ReadBytesExt}; -use bytes::{Buf, BufMut, Bytes, BytesMut}; - -use postgres_ffi::{TimeLineID, MAX_SEND_SIZE}; -use safekeeper_api::models::HotStandbyFeedback; -use safekeeper_api::Term; -use serde::{Deserialize, Serialize}; -use std::cmp::max; -use std::cmp::min; +use std::cmp::{max, min}; use std::fmt; use std::io::Read; -use storage_broker::proto::SafekeeperTimelineInfo; +use std::str::FromStr; -use tracing::*; - -use crate::control_file; -use crate::metrics::MISC_OPERATION_SECONDS; - -use crate::state::TimelineState; -use crate::wal_storage; +use anyhow::{Context, Result, bail}; +use byteorder::{LittleEndian, ReadBytesExt}; +use bytes::{Buf, BufMut, Bytes, BytesMut}; +use postgres_ffi::{MAX_SEND_SIZE, TimeLineID}; use pq_proto::SystemId; -use utils::pageserver_feedback::PageserverFeedback; -use utils::{ - bin_ser::LeSer, - id::{NodeId, TenantId, TimelineId}, - lsn::Lsn, +use safekeeper_api::membership::{ + INVALID_GENERATION, MemberSet, SafekeeperGeneration as Generation, SafekeeperId, }; +use safekeeper_api::models::HotStandbyFeedback; +use safekeeper_api::{Term, membership}; +use serde::{Deserialize, Serialize}; +use storage_broker::proto::SafekeeperTimelineInfo; +use tracing::*; +use utils::bin_ser::LeSer; +use utils::id::{NodeId, TenantId, TimelineId}; +use utils::lsn::Lsn; +use utils::pageserver_feedback::PageserverFeedback; -pub const SK_PROTOCOL_VERSION: u32 = 2; +use crate::metrics::MISC_OPERATION_SECONDS; +use crate::state::TimelineState; +use crate::{control_file, wal_storage}; + +pub const SK_PROTO_VERSION_2: u32 = 2; +pub const SK_PROTO_VERSION_3: u32 = 3; pub const UNKNOWN_SERVER_VERSION: u32 = 0; #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)] @@ -56,8 +55,28 @@ impl TermHistory { TermHistory(Vec::new()) } - // Parse TermHistory as n_entries followed by TermLsn pairs + // Parse TermHistory as n_entries followed by TermLsn pairs in network order. pub fn from_bytes(bytes: &mut Bytes) -> Result { + let n_entries = bytes + .get_u32_f() + .with_context(|| "TermHistory misses len")?; + let mut res = Vec::with_capacity(n_entries as usize); + for i in 0..n_entries { + let term = bytes + .get_u64_f() + .with_context(|| format!("TermHistory pos {} misses term", i))?; + let lsn = bytes + .get_u64_f() + .with_context(|| format!("TermHistory pos {} misses lsn", i))? + .into(); + res.push(TermLsn { term, lsn }) + } + Ok(TermHistory(res)) + } + + // Parse TermHistory as n_entries followed by TermLsn pairs in LE order. + // TODO remove once v2 protocol is fully dropped. + pub fn from_bytes_le(bytes: &mut Bytes) -> Result { if bytes.remaining() < 4 { bail!("TermHistory misses len"); } @@ -197,6 +216,18 @@ impl AcceptorState { /// Initial Proposer -> Acceptor message #[derive(Debug, Deserialize)] pub struct ProposerGreeting { + pub tenant_id: TenantId, + pub timeline_id: TimelineId, + pub mconf: membership::Configuration, + /// Postgres server version + pub pg_version: u32, + pub system_id: SystemId, + pub wal_seg_size: u32, +} + +/// V2 of the message; exists as a struct because we (de)serialized it as is. +#[derive(Debug, Deserialize)] +pub struct ProposerGreetingV2 { /// proposer-acceptor protocol version pub protocol_version: u32, /// Postgres server version @@ -213,27 +244,35 @@ pub struct ProposerGreeting { /// (acceptor voted for). 
#[derive(Debug, Serialize)] pub struct AcceptorGreeting { - term: u64, node_id: NodeId, + mconf: membership::Configuration, + term: u64, } /// Vote request sent from proposer to safekeepers -#[derive(Debug, Deserialize)] +#[derive(Debug)] pub struct VoteRequest { + pub generation: Generation, + pub term: Term, +} + +/// V2 of the message; exists as a struct because we (de)serialized it as is. +#[derive(Debug, Deserialize)] +pub struct VoteRequestV2 { pub term: Term, } /// Vote itself, sent from safekeeper to proposer #[derive(Debug, Serialize)] pub struct VoteResponse { + generation: Generation, // membership conf generation pub term: Term, // safekeeper's current term; if it is higher than proposer's, the compute is out of date. - vote_given: u64, // fixme u64 due to padding + vote_given: bool, // Safekeeper flush_lsn (end of WAL) + history of term switches allow // proposer to choose the most advanced one. pub flush_lsn: Lsn, truncate_lsn: Lsn, pub term_history: TermHistory, - timeline_start_lsn: Lsn, } /* @@ -242,10 +281,10 @@ pub struct VoteResponse { */ #[derive(Debug)] pub struct ProposerElected { + pub generation: Generation, // membership conf generation pub term: Term, pub start_streaming_at: Lsn, pub term_history: TermHistory, - pub timeline_start_lsn: Lsn, } /// Request with WAL message sent from proposer to safekeeper. Along the way it @@ -257,6 +296,22 @@ pub struct AppendRequest { } #[derive(Debug, Clone, Deserialize)] pub struct AppendRequestHeader { + pub generation: Generation, // membership conf generation + // safekeeper's current term; if it is higher than proposer's, the compute is out of date. + pub term: Term, + /// start position of message in WAL + pub begin_lsn: Lsn, + /// end position of message in WAL + pub end_lsn: Lsn, + /// LSN committed by quorum of safekeepers + pub commit_lsn: Lsn, + /// minimal LSN which may be needed by proposer to perform recovery of some safekeeper + pub truncate_lsn: Lsn, +} + +/// V2 of the message; exists as a struct because we (de)serialized it as is. +#[derive(Debug, Clone, Deserialize)] +pub struct AppendRequestHeaderV2 { // safekeeper's current term; if it is higher than proposer's, the compute is out of date. pub term: Term, // TODO: remove this field from the protocol, it in unused -- LSN of term @@ -277,6 +332,9 @@ pub struct AppendRequestHeader { /// Report safekeeper state to proposer #[derive(Debug, Serialize, Clone)] pub struct AppendResponse { + // Membership conf generation. Not strictly required because on mismatch + // connection is reset, but let's sanity check it. + generation: Generation, // Current term of the safekeeper; if it is higher than proposer's, the // compute is out of date. pub term: Term, @@ -293,8 +351,9 @@ pub struct AppendResponse { } impl AppendResponse { - fn term_only(term: Term) -> AppendResponse { + fn term_only(generation: Generation, term: Term) -> AppendResponse { AppendResponse { + generation, term, flush_lsn: Lsn(0), commit_lsn: Lsn(0), @@ -315,72 +374,322 @@ pub enum ProposerAcceptorMessage { FlushWAL, } -impl ProposerAcceptorMessage { - /// Parse proposer message. - pub fn parse(msg_bytes: Bytes, proto_version: u32) -> Result { - if proto_version != SK_PROTOCOL_VERSION { - bail!( - "incompatible protocol version {}, expected {}", - proto_version, - SK_PROTOCOL_VERSION - ); +/// Augment Bytes with fallible get_uN where N is number of bytes methods. +/// All reads are in network (big endian) order. 
+trait BytesF { + fn get_u8_f(&mut self) -> Result; + fn get_u16_f(&mut self) -> Result; + fn get_u32_f(&mut self) -> Result; + fn get_u64_f(&mut self) -> Result; +} + +impl BytesF for Bytes { + fn get_u8_f(&mut self) -> Result { + if self.is_empty() { + bail!("no bytes left, expected 1"); } - // xxx using Reader is inefficient but easy to work with bincode - let mut stream = msg_bytes.reader(); - // u64 is here to avoid padding; it will be removed once we stop packing C structs into the wire as is - let tag = stream.read_u64::()? as u8 as char; - match tag { - 'g' => { - let msg = ProposerGreeting::des_from(&mut stream)?; - Ok(ProposerAcceptorMessage::Greeting(msg)) - } - 'v' => { - let msg = VoteRequest::des_from(&mut stream)?; - Ok(ProposerAcceptorMessage::VoteRequest(msg)) - } - 'e' => { - let mut msg_bytes = stream.into_inner(); - if msg_bytes.remaining() < 16 { - bail!("ProposerElected message is not complete"); - } - let term = msg_bytes.get_u64_le(); - let start_streaming_at = msg_bytes.get_u64_le().into(); - let term_history = TermHistory::from_bytes(&mut msg_bytes)?; - if msg_bytes.remaining() < 8 { - bail!("ProposerElected message is not complete"); - } - let timeline_start_lsn = msg_bytes.get_u64_le().into(); - let msg = ProposerElected { - term, - start_streaming_at, - timeline_start_lsn, - term_history, + Ok(self.get_u8()) + } + fn get_u16_f(&mut self) -> Result { + if self.remaining() < 2 { + bail!("no bytes left, expected 2"); + } + Ok(self.get_u16()) + } + fn get_u32_f(&mut self) -> Result { + if self.remaining() < 4 { + bail!("only {} bytes left, expected 4", self.remaining()); + } + Ok(self.get_u32()) + } + fn get_u64_f(&mut self) -> Result { + if self.remaining() < 8 { + bail!("only {} bytes left, expected 8", self.remaining()); + } + Ok(self.get_u64()) + } +} + +impl ProposerAcceptorMessage { + /// Read cstring from Bytes. + fn get_cstr(buf: &mut Bytes) -> Result { + let pos = buf + .iter() + .position(|x| *x == 0) + .ok_or_else(|| anyhow::anyhow!("missing cstring terminator"))?; + let result = buf.split_to(pos); + buf.advance(1); // drop the null terminator + match std::str::from_utf8(&result) { + Ok(s) => Ok(s.to_string()), + Err(e) => bail!("invalid utf8 in cstring: {}", e), + } + } + + /// Read membership::Configuration from Bytes. + fn get_mconf(buf: &mut Bytes) -> Result { + let generation = Generation::new(buf.get_u32_f().with_context(|| "reading generation")?); + let members_len = buf.get_u32_f().with_context(|| "reading members_len")?; + // Main member set must have at least someone in valid configuration. + // Empty conf is allowed until we fully migrate. + if generation != INVALID_GENERATION && members_len == 0 { + bail!("empty members_len"); + } + let mut members = MemberSet::empty(); + for i in 0..members_len { + let id = buf + .get_u64_f() + .with_context(|| format!("reading member {} node_id", i))?; + let host = Self::get_cstr(buf).with_context(|| format!("reading member {} host", i))?; + let pg_port = buf + .get_u16_f() + .with_context(|| format!("reading member {} port", i))?; + let sk = SafekeeperId { + id: NodeId(id), + host, + pg_port, + }; + members.add(sk)?; + } + let new_members_len = buf.get_u32_f().with_context(|| "reading new_members_len")?; + // Non joint conf. 
+ if new_members_len == 0 { + Ok(membership::Configuration { + generation, + members, + new_members: None, + }) + } else { + let mut new_members = MemberSet::empty(); + for i in 0..new_members_len { + let id = buf + .get_u64_f() + .with_context(|| format!("reading new member {} node_id", i))?; + let host = Self::get_cstr(buf) + .with_context(|| format!("reading new member {} host", i))?; + let pg_port = buf + .get_u16_f() + .with_context(|| format!("reading new member {} port", i))?; + let sk = SafekeeperId { + id: NodeId(id), + host, + pg_port, }; - Ok(ProposerAcceptorMessage::Elected(msg)) + new_members.add(sk)?; } - 'a' => { - // read header followed by wal data - let hdr = AppendRequestHeader::des_from(&mut stream)?; - let rec_size = hdr - .end_lsn - .checked_sub(hdr.begin_lsn) - .context("begin_lsn > end_lsn in AppendRequest")? - .0 as usize; - if rec_size > MAX_SEND_SIZE { - bail!( - "AppendRequest is longer than MAX_SEND_SIZE ({})", - MAX_SEND_SIZE - ); + Ok(membership::Configuration { + generation, + members, + new_members: Some(new_members), + }) + } + } + + /// Parse proposer message. + pub fn parse(mut msg_bytes: Bytes, proto_version: u32) -> Result { + if proto_version == SK_PROTO_VERSION_3 { + if msg_bytes.is_empty() { + bail!("ProposerAcceptorMessage is not complete: missing tag"); + } + let tag = msg_bytes.get_u8_f().with_context(|| { + "ProposerAcceptorMessage is not complete: missing tag".to_string() + })? as char; + match tag { + 'g' => { + let tenant_id_str = + Self::get_cstr(&mut msg_bytes).with_context(|| "reading tenant_id")?; + let tenant_id = TenantId::from_str(&tenant_id_str)?; + let timeline_id_str = + Self::get_cstr(&mut msg_bytes).with_context(|| "reading timeline_id")?; + let timeline_id = TimelineId::from_str(&timeline_id_str)?; + let mconf = Self::get_mconf(&mut msg_bytes)?; + let pg_version = msg_bytes + .get_u32_f() + .with_context(|| "reading pg_version")?; + let system_id = msg_bytes.get_u64_f().with_context(|| "reading system_id")?; + let wal_seg_size = msg_bytes + .get_u32_f() + .with_context(|| "reading wal_seg_size")?; + let g = ProposerGreeting { + tenant_id, + timeline_id, + mconf, + pg_version, + system_id, + wal_seg_size, + }; + Ok(ProposerAcceptorMessage::Greeting(g)) } + 'v' => { + let generation = Generation::new( + msg_bytes + .get_u32_f() + .with_context(|| "reading generation")?, + ); + let term = msg_bytes.get_u64_f().with_context(|| "reading term")?; + let v = VoteRequest { generation, term }; + Ok(ProposerAcceptorMessage::VoteRequest(v)) + } + 'e' => { + let generation = Generation::new( + msg_bytes + .get_u32_f() + .with_context(|| "reading generation")?, + ); + let term = msg_bytes.get_u64_f().with_context(|| "reading term")?; + let start_streaming_at: Lsn = msg_bytes + .get_u64_f() + .with_context(|| "reading start_streaming_at")? + .into(); + let term_history = TermHistory::from_bytes(&mut msg_bytes)?; + let msg = ProposerElected { + generation, + term, + start_streaming_at, + term_history, + }; + Ok(ProposerAcceptorMessage::Elected(msg)) + } + 'a' => { + let generation = Generation::new( + msg_bytes + .get_u32_f() + .with_context(|| "reading generation")?, + ); + let term = msg_bytes.get_u64_f().with_context(|| "reading term")?; + let begin_lsn: Lsn = msg_bytes + .get_u64_f() + .with_context(|| "reading begin_lsn")? + .into(); + let end_lsn: Lsn = msg_bytes + .get_u64_f() + .with_context(|| "reading end_lsn")? + .into(); + let commit_lsn: Lsn = msg_bytes + .get_u64_f() + .with_context(|| "reading commit_lsn")? 
+ .into(); + let truncate_lsn: Lsn = msg_bytes + .get_u64_f() + .with_context(|| "reading truncate_lsn")? + .into(); + let hdr = AppendRequestHeader { + generation, + term, + begin_lsn, + end_lsn, + commit_lsn, + truncate_lsn, + }; + let rec_size = hdr + .end_lsn + .checked_sub(hdr.begin_lsn) + .context("begin_lsn > end_lsn in AppendRequest")? + .0 as usize; + if rec_size > MAX_SEND_SIZE { + bail!( + "AppendRequest is longer than MAX_SEND_SIZE ({})", + MAX_SEND_SIZE + ); + } + if msg_bytes.remaining() < rec_size { + bail!( + "reading WAL: only {} bytes left, wanted {}", + msg_bytes.remaining(), + rec_size + ); + } + let wal_data = msg_bytes.copy_to_bytes(rec_size); + let msg = AppendRequest { h: hdr, wal_data }; - let mut wal_data_vec: Vec = vec![0; rec_size]; - stream.read_exact(&mut wal_data_vec)?; - let wal_data = Bytes::from(wal_data_vec); - let msg = AppendRequest { h: hdr, wal_data }; - - Ok(ProposerAcceptorMessage::AppendRequest(msg)) + Ok(ProposerAcceptorMessage::AppendRequest(msg)) + } + _ => bail!("unknown proposer-acceptor message tag: {}", tag), } - _ => bail!("unknown proposer-acceptor message tag: {}", tag), + } else if proto_version == SK_PROTO_VERSION_2 { + // xxx using Reader is inefficient but easy to work with bincode + let mut stream = msg_bytes.reader(); + // u64 is here to avoid padding; it will be removed once we stop packing C structs into the wire as is + let tag = stream.read_u64::()? as u8 as char; + match tag { + 'g' => { + let msgv2 = ProposerGreetingV2::des_from(&mut stream)?; + let g = ProposerGreeting { + tenant_id: msgv2.tenant_id, + timeline_id: msgv2.timeline_id, + mconf: membership::Configuration { + generation: INVALID_GENERATION, + members: MemberSet::empty(), + new_members: None, + }, + pg_version: msgv2.pg_version, + system_id: msgv2.system_id, + wal_seg_size: msgv2.wal_seg_size, + }; + Ok(ProposerAcceptorMessage::Greeting(g)) + } + 'v' => { + let msg = VoteRequestV2::des_from(&mut stream)?; + let v = VoteRequest { + generation: INVALID_GENERATION, + term: msg.term, + }; + Ok(ProposerAcceptorMessage::VoteRequest(v)) + } + 'e' => { + let mut msg_bytes = stream.into_inner(); + if msg_bytes.remaining() < 16 { + bail!("ProposerElected message is not complete"); + } + let term = msg_bytes.get_u64_le(); + let start_streaming_at = msg_bytes.get_u64_le().into(); + let term_history = TermHistory::from_bytes_le(&mut msg_bytes)?; + if msg_bytes.remaining() < 8 { + bail!("ProposerElected message is not complete"); + } + let _timeline_start_lsn = msg_bytes.get_u64_le(); + let msg = ProposerElected { + generation: INVALID_GENERATION, + term, + start_streaming_at, + term_history, + }; + Ok(ProposerAcceptorMessage::Elected(msg)) + } + 'a' => { + // read header followed by wal data + let hdrv2 = AppendRequestHeaderV2::des_from(&mut stream)?; + let hdr = AppendRequestHeader { + generation: INVALID_GENERATION, + term: hdrv2.term, + begin_lsn: hdrv2.begin_lsn, + end_lsn: hdrv2.end_lsn, + commit_lsn: hdrv2.commit_lsn, + truncate_lsn: hdrv2.truncate_lsn, + }; + let rec_size = hdr + .end_lsn + .checked_sub(hdr.begin_lsn) + .context("begin_lsn > end_lsn in AppendRequest")? 
+ .0 as usize; + if rec_size > MAX_SEND_SIZE { + bail!( + "AppendRequest is longer than MAX_SEND_SIZE ({})", + MAX_SEND_SIZE + ); + } + + let mut wal_data_vec: Vec = vec![0; rec_size]; + stream.read_exact(&mut wal_data_vec)?; + let wal_data = Bytes::from(wal_data_vec); + + let msg = AppendRequest { h: hdr, wal_data }; + + Ok(ProposerAcceptorMessage::AppendRequest(msg)) + } + _ => bail!("unknown proposer-acceptor message tag: {}", tag), + } + } else { + bail!("unsupported protocol version {}", proto_version); } } @@ -394,36 +703,21 @@ impl ProposerAcceptorMessage { // We explicitly list all fields, to draw attention here when new fields are added. let mut size = BASE_SIZE; size += match self { - Self::Greeting(ProposerGreeting { - protocol_version: _, - pg_version: _, - proposer_id: _, - system_id: _, - timeline_id: _, - tenant_id: _, - tli: _, - wal_seg_size: _, - }) => 0, + Self::Greeting(_) => 0, - Self::VoteRequest(VoteRequest { term: _ }) => 0, + Self::VoteRequest(_) => 0, - Self::Elected(ProposerElected { - term: _, - start_streaming_at: _, - term_history: _, - timeline_start_lsn: _, - }) => 0, + Self::Elected(_) => 0, Self::AppendRequest(AppendRequest { h: AppendRequestHeader { + generation: _, term: _, - term_start_lsn: _, begin_lsn: _, end_lsn: _, commit_lsn: _, truncate_lsn: _, - proposer_uuid: _, }, wal_data, }) => wal_data.len(), @@ -431,13 +725,12 @@ impl ProposerAcceptorMessage { Self::NoFlushAppendRequest(AppendRequest { h: AppendRequestHeader { + generation: _, term: _, - term_start_lsn: _, begin_lsn: _, end_lsn: _, commit_lsn: _, truncate_lsn: _, - proposer_uuid: _, }, wal_data, }) => wal_data.len(), @@ -458,45 +751,118 @@ pub enum AcceptorProposerMessage { } impl AcceptorProposerMessage { - /// Serialize acceptor -> proposer message. - pub fn serialize(&self, buf: &mut BytesMut) -> Result<()> { - match self { - AcceptorProposerMessage::Greeting(msg) => { - buf.put_u64_le('g' as u64); - buf.put_u64_le(msg.term); - buf.put_u64_le(msg.node_id.0); - } - AcceptorProposerMessage::VoteResponse(msg) => { - buf.put_u64_le('v' as u64); - buf.put_u64_le(msg.term); - buf.put_u64_le(msg.vote_given); - buf.put_u64_le(msg.flush_lsn.into()); - buf.put_u64_le(msg.truncate_lsn.into()); - buf.put_u32_le(msg.term_history.0.len() as u32); - for e in &msg.term_history.0 { - buf.put_u64_le(e.term); - buf.put_u64_le(e.lsn.into()); - } - buf.put_u64_le(msg.timeline_start_lsn.into()); - } - AcceptorProposerMessage::AppendResponse(msg) => { - buf.put_u64_le('a' as u64); - buf.put_u64_le(msg.term); - buf.put_u64_le(msg.flush_lsn.into()); - buf.put_u64_le(msg.commit_lsn.into()); - buf.put_i64_le(msg.hs_feedback.ts); - buf.put_u64_le(msg.hs_feedback.xmin); - buf.put_u64_le(msg.hs_feedback.catalog_xmin); + fn put_cstr(buf: &mut BytesMut, s: &str) { + buf.put_slice(s.as_bytes()); + buf.put_u8(0); // null terminator + } - // AsyncReadMessage in walproposer.c will not try to decode pageserver_feedback - // if it is not present. - if let Some(ref msg) = msg.pageserver_feedback { - msg.serialize(buf); - } - } + /// Serialize membership::Configuration into buf. 
+ fn serialize_mconf(buf: &mut BytesMut, mconf: &membership::Configuration) { + buf.put_u32(mconf.generation.into_inner()); + buf.put_u32(mconf.members.m.len() as u32); + for sk in &mconf.members.m { + buf.put_u64(sk.id.0); + Self::put_cstr(buf, &sk.host); + buf.put_u16(sk.pg_port); } + if let Some(ref new_members) = mconf.new_members { + buf.put_u32(new_members.m.len() as u32); + for sk in &new_members.m { + buf.put_u64(sk.id.0); + Self::put_cstr(buf, &sk.host); + buf.put_u16(sk.pg_port); + } + } else { + buf.put_u32(0); + } + } - Ok(()) + /// Serialize acceptor -> proposer message. + pub fn serialize(&self, buf: &mut BytesMut, proto_version: u32) -> Result<()> { + if proto_version == SK_PROTO_VERSION_3 { + match self { + AcceptorProposerMessage::Greeting(msg) => { + buf.put_u8(b'g'); + buf.put_u64(msg.node_id.0); + Self::serialize_mconf(buf, &msg.mconf); + buf.put_u64(msg.term) + } + AcceptorProposerMessage::VoteResponse(msg) => { + buf.put_u8(b'v'); + buf.put_u32(msg.generation.into_inner()); + buf.put_u64(msg.term); + buf.put_u8(msg.vote_given as u8); + buf.put_u64(msg.flush_lsn.into()); + buf.put_u64(msg.truncate_lsn.into()); + buf.put_u32(msg.term_history.0.len() as u32); + for e in &msg.term_history.0 { + buf.put_u64(e.term); + buf.put_u64(e.lsn.into()); + } + } + AcceptorProposerMessage::AppendResponse(msg) => { + buf.put_u8(b'a'); + buf.put_u32(msg.generation.into_inner()); + buf.put_u64(msg.term); + buf.put_u64(msg.flush_lsn.into()); + buf.put_u64(msg.commit_lsn.into()); + buf.put_i64(msg.hs_feedback.ts); + buf.put_u64(msg.hs_feedback.xmin); + buf.put_u64(msg.hs_feedback.catalog_xmin); + + // AsyncReadMessage in walproposer.c will not try to decode pageserver_feedback + // if it is not present. + if let Some(ref msg) = msg.pageserver_feedback { + msg.serialize(buf); + } + } + } + Ok(()) + // TODO remove 3 after converting all msgs + } else if proto_version == SK_PROTO_VERSION_2 { + match self { + AcceptorProposerMessage::Greeting(msg) => { + buf.put_u64_le('g' as u64); + // v2 didn't have mconf and fields were reordered + buf.put_u64_le(msg.term); + buf.put_u64_le(msg.node_id.0); + } + AcceptorProposerMessage::VoteResponse(msg) => { + // v2 didn't have generation, had u64 vote_given and timeline_start_lsn + buf.put_u64_le('v' as u64); + buf.put_u64_le(msg.term); + buf.put_u64_le(msg.vote_given as u64); + buf.put_u64_le(msg.flush_lsn.into()); + buf.put_u64_le(msg.truncate_lsn.into()); + buf.put_u32_le(msg.term_history.0.len() as u32); + for e in &msg.term_history.0 { + buf.put_u64_le(e.term); + buf.put_u64_le(e.lsn.into()); + } + // removed timeline_start_lsn + buf.put_u64_le(0); + } + AcceptorProposerMessage::AppendResponse(msg) => { + // v2 didn't have generation + buf.put_u64_le('a' as u64); + buf.put_u64_le(msg.term); + buf.put_u64_le(msg.flush_lsn.into()); + buf.put_u64_le(msg.commit_lsn.into()); + buf.put_i64_le(msg.hs_feedback.ts); + buf.put_u64_le(msg.hs_feedback.xmin); + buf.put_u64_le(msg.hs_feedback.catalog_xmin); + + // AsyncReadMessage in walproposer.c will not try to decode pageserver_feedback + // if it is not present. 
+ if let Some(ref msg) = msg.pageserver_feedback { + msg.serialize(buf); + } + } + } + Ok(()) + } else { + bail!("unsupported protocol version {}", proto_version); + } } } @@ -593,14 +959,6 @@ where &mut self, msg: &ProposerGreeting, ) -> Result> { - // Check protocol compatibility - if msg.protocol_version != SK_PROTOCOL_VERSION { - bail!( - "incompatible protocol version {}, expected {}", - msg.protocol_version, - SK_PROTOCOL_VERSION - ); - } /* Postgres major version mismatch is treated as fatal error * because safekeepers parse WAL headers and the format * may change between versions. @@ -655,15 +1013,16 @@ where self.state.finish_change(&state).await?; } - info!( - "processed greeting from walproposer {}, sending term {:?}", - msg.proposer_id.map(|b| format!("{:X}", b)).join(""), - self.state.acceptor_state.term - ); - Ok(Some(AcceptorProposerMessage::Greeting(AcceptorGreeting { - term: self.state.acceptor_state.term, + let apg = AcceptorGreeting { node_id: self.node_id, - }))) + mconf: self.state.mconf.clone(), + term: self.state.acceptor_state.term, + }; + info!( + "processed greeting {:?} from walproposer, sending {:?}", + msg, apg + ); + Ok(Some(AcceptorProposerMessage::Greeting(apg))) } /// Give vote for the given term, if we haven't done that previously. @@ -684,12 +1043,12 @@ where self.wal_store.flush_wal().await?; // initialize with refusal let mut resp = VoteResponse { + generation: self.state.mconf.generation, term: self.state.acceptor_state.term, - vote_given: false as u64, + vote_given: false, flush_lsn: self.flush_lsn(), truncate_lsn: self.state.inmem.peer_horizon_lsn, term_history: self.get_term_history(), - timeline_start_lsn: self.state.timeline_start_lsn, }; if self.state.acceptor_state.term < msg.term { let mut state = self.state.start_change(); @@ -698,15 +1057,16 @@ where self.state.finish_change(&state).await?; resp.term = self.state.acceptor_state.term; - resp.vote_given = true as u64; + resp.vote_given = true; } - info!("processed VoteRequest for term {}: {:?}", msg.term, &resp); + info!("processed {:?}: sending {:?}", msg, &resp); Ok(Some(AcceptorProposerMessage::VoteResponse(resp))) } /// Form AppendResponse from current state. fn append_response(&self) -> AppendResponse { let ar = AppendResponse { + generation: self.state.mconf.generation, term: self.state.acceptor_state.term, flush_lsn: self.flush_lsn(), commit_lsn: self.state.commit_lsn, @@ -769,9 +1129,14 @@ where // and walproposer recalculates the streaming point. OTOH repeating // error indicates a serious bug. 
if last_common_point.lsn != msg.start_streaming_at { - bail!("refusing ProposerElected with unexpected truncation point: lcp={:?} start_streaming_at={}, term={}, sk_th={:?} flush_lsn={}, wp_th={:?}", - last_common_point, msg.start_streaming_at, - self.state.acceptor_state.term, sk_th, self.flush_lsn(), msg.term_history, + bail!( + "refusing ProposerElected with unexpected truncation point: lcp={:?} start_streaming_at={}, term={}, sk_th={:?} flush_lsn={}, wp_th={:?}", + last_common_point, + msg.start_streaming_at, + self.state.acceptor_state.term, + sk_th, + self.flush_lsn(), + msg.term_history, ); } @@ -779,8 +1144,12 @@ where assert!( msg.start_streaming_at >= self.state.inmem.commit_lsn, "attempt to truncate committed data: start_streaming_at={}, commit_lsn={}, term={}, sk_th={:?} flush_lsn={}, wp_th={:?}", - msg.start_streaming_at, self.state.inmem.commit_lsn, - self.state.acceptor_state.term, sk_th, self.flush_lsn(), msg.term_history, + msg.start_streaming_at, + self.state.inmem.commit_lsn, + self.state.acceptor_state.term, + sk_th, + self.flush_lsn(), + msg.term_history, ); // Before first WAL write initialize its segment. It makes first segment @@ -805,18 +1174,22 @@ where // Here we learn initial LSN for the first time, set fields // interested in that. - if state.timeline_start_lsn == Lsn(0) { - // Remember point where WAL begins globally. - state.timeline_start_lsn = msg.timeline_start_lsn; - info!( - "setting timeline_start_lsn to {:?}", - state.timeline_start_lsn - ); + if let Some(start_lsn) = msg.term_history.0.first() { + if state.timeline_start_lsn == Lsn(0) { + // Remember point where WAL begins globally. In the future it + // will be initialized immediately on timeline creation. + state.timeline_start_lsn = start_lsn.lsn; + info!( + "setting timeline_start_lsn to {:?}", + state.timeline_start_lsn + ); + } } + if state.peer_horizon_lsn == Lsn(0) { // Update peer_horizon_lsn as soon as we know where timeline starts. // It means that peer_horizon_lsn cannot be zero after we know timeline_start_lsn. - state.peer_horizon_lsn = msg.timeline_start_lsn; + state.peer_horizon_lsn = state.timeline_start_lsn; } if state.local_start_lsn == Lsn(0) { state.local_start_lsn = msg.start_streaming_at; @@ -896,7 +1269,10 @@ where // If our term is higher, immediately refuse the message. if self.state.acceptor_state.term > msg.h.term { - let resp = AppendResponse::term_only(self.state.acceptor_state.term); + let resp = AppendResponse::term_only( + self.state.mconf.generation, + self.state.acceptor_state.term, + ); return Ok(Some(AcceptorProposerMessage::AppendResponse(resp))); } @@ -924,10 +1300,8 @@ where ); } - // Now we know that we are in the same term as the proposer, - // processing the message. - - self.state.inmem.proposer_uuid = msg.h.proposer_uuid; + // Now we know that we are in the same term as the proposer, process the + // message.
// do the job if !msg.wal_data.is_empty() { @@ -1000,21 +1374,19 @@ where #[cfg(test)] mod tests { - use futures::future::BoxFuture; + use std::ops::Deref; + use std::str::FromStr; + use std::time::{Instant, UNIX_EPOCH}; - use postgres_ffi::{XLogSegNo, WAL_SEGMENT_SIZE}; - use safekeeper_api::{ - membership::{Configuration, MemberSet, SafekeeperGeneration, SafekeeperId}, - ServerInfo, + use futures::future::BoxFuture; + use postgres_ffi::{WAL_SEGMENT_SIZE, XLogSegNo}; + use safekeeper_api::ServerInfo; + use safekeeper_api::membership::{ + Configuration, MemberSet, SafekeeperGeneration, SafekeeperId, }; use super::*; use crate::state::{EvictionState, TimelinePersistentState}; - use std::{ - ops::Deref, - str::FromStr, - time::{Instant, UNIX_EPOCH}, - }; // fake storage for tests struct InMemoryState { @@ -1097,10 +1469,13 @@ mod tests { let mut sk = SafeKeeper::new(TimelineState::new(storage), wal_store, NodeId(0)).unwrap(); // check voting for 1 is ok - let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { term: 1 }); + let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { + generation: Generation::new(0), + term: 1, + }); let mut vote_resp = sk.process_msg(&vote_request).await; match vote_resp.unwrap() { - Some(AcceptorProposerMessage::VoteResponse(resp)) => assert!(resp.vote_given != 0), + Some(AcceptorProposerMessage::VoteResponse(resp)) => assert!(resp.vote_given), r => panic!("unexpected response: {:?}", r), } @@ -1115,7 +1490,7 @@ mod tests { // and ensure voting second time for 1 is not ok vote_resp = sk.process_msg(&vote_request).await; match vote_resp.unwrap() { - Some(AcceptorProposerMessage::VoteResponse(resp)) => assert!(resp.vote_given == 0), + Some(AcceptorProposerMessage::VoteResponse(resp)) => assert!(!resp.vote_given), r => panic!("unexpected response: {:?}", r), } } @@ -1130,13 +1505,12 @@ mod tests { let mut sk = SafeKeeper::new(TimelineState::new(storage), wal_store, NodeId(0)).unwrap(); let mut ar_hdr = AppendRequestHeader { + generation: Generation::new(0), term: 2, - term_start_lsn: Lsn(3), begin_lsn: Lsn(1), end_lsn: Lsn(2), commit_lsn: Lsn(0), truncate_lsn: Lsn(0), - proposer_uuid: [0; 16], }; let mut append_request = AppendRequest { h: ar_hdr.clone(), @@ -1144,6 +1518,7 @@ mod tests { }; let pem = ProposerElected { + generation: Generation::new(0), term: 2, start_streaming_at: Lsn(1), term_history: TermHistory(vec![ @@ -1156,7 +1531,6 @@ mod tests { lsn: Lsn(3), }, ]), - timeline_start_lsn: Lsn(1), }; sk.process_msg(&ProposerAcceptorMessage::Elected(pem)) .await @@ -1191,26 +1565,25 @@ mod tests { let mut sk = SafeKeeper::new(TimelineState::new(storage), wal_store, NodeId(0)).unwrap(); let pem = ProposerElected { + generation: Generation::new(0), term: 1, start_streaming_at: Lsn(1), term_history: TermHistory(vec![TermLsn { term: 1, lsn: Lsn(1), }]), - timeline_start_lsn: Lsn(1), }; sk.process_msg(&ProposerAcceptorMessage::Elected(pem)) .await .unwrap(); let ar_hdr = AppendRequestHeader { + generation: Generation::new(0), term: 1, - term_start_lsn: Lsn(3), begin_lsn: Lsn(1), end_lsn: Lsn(2), commit_lsn: Lsn(0), truncate_lsn: Lsn(0), - proposer_uuid: [0; 16], }; let append_request = AppendRequest { h: ar_hdr.clone(), diff --git a/safekeeper/src/send_interpreted_wal.rs b/safekeeper/src/send_interpreted_wal.rs index 5916675c3f..e196f91d3c 100644 --- a/safekeeper/src/send_interpreted_wal.rs +++ b/safekeeper/src/send_interpreted_wal.rs @@ -3,23 +3,22 @@ use std::fmt::Display; use std::sync::Arc; use std::time::Duration; -use 
anyhow::{anyhow, Context}; -use futures::future::Either; +use anyhow::{Context, anyhow}; use futures::StreamExt; +use futures::future::Either; use pageserver_api::shard::ShardIdentity; use postgres_backend::{CopyStreamHandlerEnd, PostgresBackend}; -use postgres_ffi::waldecoder::WalDecodeError; -use postgres_ffi::{get_current_timestamp, waldecoder::WalStreamDecoder}; +use postgres_ffi::get_current_timestamp; +use postgres_ffi::waldecoder::{WalDecodeError, WalStreamDecoder}; use pq_proto::{BeMessage, InterpretedWalRecordsBody, WalSndKeepAlive}; use tokio::io::{AsyncRead, AsyncWrite}; use tokio::sync::mpsc::error::SendError; use tokio::task::JoinHandle; use tokio::time::MissedTickBehavior; -use tracing::{error, info, info_span, Instrument}; +use tracing::{Instrument, error, info, info_span}; use utils::critical; use utils::lsn::Lsn; -use utils::postgres_client::Compression; -use utils::postgres_client::InterpretedFormat; +use utils::postgres_client::{Compression, InterpretedFormat}; use wal_decoder::models::{InterpretedWalRecord, InterpretedWalRecords}; use wal_decoder::wire_format::ToWireFormat; @@ -100,7 +99,12 @@ struct ShardSenderState { /// State of [`InterpretedWalReader`] visible outside of the task running it. #[derive(Debug)] pub(crate) enum InterpretedWalReaderState { - Running { current_position: Lsn }, + Running { + current_position: Lsn, + /// Tracks the start of the PG WAL LSN from which the current batch of + /// interpreted records originated. + current_batch_wal_start: Option, + }, Done, } @@ -122,14 +126,21 @@ pub enum InterpretedWalReaderError { } enum CurrentPositionUpdate { - Reset(Lsn), + Reset { from: Lsn, to: Lsn }, NotReset(Lsn), } impl CurrentPositionUpdate { fn current_position(&self) -> Lsn { match self { - CurrentPositionUpdate::Reset(lsn) => *lsn, + CurrentPositionUpdate::Reset { from: _, to } => *to, + CurrentPositionUpdate::NotReset(lsn) => *lsn, + } + } + + fn previous_position(&self) -> Lsn { + match self { + CurrentPositionUpdate::Reset { from, to: _ } => *from, CurrentPositionUpdate::NotReset(lsn) => *lsn, } } @@ -145,16 +156,33 @@ impl InterpretedWalReaderState { } } + #[cfg(test)] + fn current_batch_wal_start(&self) -> Option { + match self { + InterpretedWalReaderState::Running { + current_batch_wal_start, + .. + } => *current_batch_wal_start, + InterpretedWalReaderState::Done => None, + } + } + // Reset the current position of the WAL reader if the requested starting position // of the new shard is smaller than the current value. fn maybe_reset(&mut self, new_shard_start_pos: Lsn) -> CurrentPositionUpdate { match self { InterpretedWalReaderState::Running { - current_position, .. + current_position, + current_batch_wal_start, } => { if new_shard_start_pos < *current_position { + let from = *current_position; *current_position = new_shard_start_pos; - CurrentPositionUpdate::Reset(*current_position) + *current_batch_wal_start = None; + CurrentPositionUpdate::Reset { + from, + to: *current_position, + } } else { CurrentPositionUpdate::NotReset(*current_position) } @@ -164,6 +192,47 @@ impl InterpretedWalReaderState { } } } + + fn update_current_batch_wal_start(&mut self, lsn: Lsn) { + match self { + InterpretedWalReaderState::Running { + current_batch_wal_start, + .. 
+ } => { + if current_batch_wal_start.is_none() { + *current_batch_wal_start = Some(lsn); + } + } + InterpretedWalReaderState::Done => { + panic!("update_current_batch_wal_start called on finished reader") + } + } + } + + fn take_current_batch_wal_start(&mut self) -> Lsn { + match self { + InterpretedWalReaderState::Running { + current_batch_wal_start, + .. + } => current_batch_wal_start.take().unwrap(), + InterpretedWalReaderState::Done => { + panic!("take_current_batch_wal_start called on finished reader") + } + } + } + + fn update_current_position(&mut self, lsn: Lsn) { + match self { + InterpretedWalReaderState::Running { + current_position, .. + } => { + *current_position = lsn; + } + InterpretedWalReaderState::Done => { + panic!("update_current_position called on finished reader") + } + } + } } pub(crate) struct AttachShardNotification { @@ -184,6 +253,7 @@ impl InterpretedWalReader { ) -> InterpretedWalReaderHandle { let state = Arc::new(std::sync::RwLock::new(InterpretedWalReaderState::Running { current_position: start_pos, + current_batch_wal_start: None, })); let (shard_notification_tx, shard_notification_rx) = tokio::sync::mpsc::unbounded_channel(); @@ -237,9 +307,13 @@ impl InterpretedWalReader { tx: tokio::sync::mpsc::Sender, shard: ShardIdentity, pg_version: u32, + shard_notification_rx: Option< + tokio::sync::mpsc::UnboundedReceiver, + >, ) -> InterpretedWalReader { let state = Arc::new(std::sync::RwLock::new(InterpretedWalReaderState::Running { current_position: start_pos, + current_batch_wal_start: None, })); InterpretedWalReader { @@ -252,7 +326,7 @@ impl InterpretedWalReader { next_record_lsn: start_pos, }], )]), - shard_notification_rx: None, + shard_notification_rx, state: state.clone(), pg_version, } @@ -302,7 +376,7 @@ impl InterpretedWalReader { let wal = wal_or_reset.map(|wor| wor.get_wal().expect("reset handled in select branch below")); let WalBytes { wal, - wal_start_lsn: _, + wal_start_lsn, wal_end_lsn, available_wal_end_lsn, } = match wal { @@ -315,6 +389,8 @@ impl InterpretedWalReader { } }; + self.state.write().unwrap().update_current_batch_wal_start(wal_start_lsn); + wal_decoder.feed_bytes(&wal); // Deserialize and interpret WAL records from this batch of WAL. @@ -363,35 +439,49 @@ impl InterpretedWalReader { let max_next_record_lsn = match max_next_record_lsn { Some(lsn) => lsn, - None => { continue; } + None => { + continue; + } }; // Update the current position such that new receivers can decide // whether to attach to us or spawn a new WAL reader. - match &mut *self.state.write().unwrap() { - InterpretedWalReaderState::Running { current_position, .. } => { - *current_position = max_next_record_lsn; - }, - InterpretedWalReaderState::Done => { - unreachable!() - } - } + let batch_wal_start_lsn = { + let mut guard = self.state.write().unwrap(); + guard.update_current_position(max_next_record_lsn); + guard.take_current_batch_wal_start() + }; // Send interpreted records downstream. Anything that has already been seen // by a shard is filtered out. 
let mut shard_senders_to_remove = Vec::new(); for (shard, states) in &mut self.shard_senders { for state in states { - if max_next_record_lsn <= state.next_record_lsn { - continue; - } - let shard_sender_id = ShardSenderId::new(*shard, state.sender_id); - let records = records_by_sender.remove(&shard_sender_id).unwrap_or_default(); - let batch = InterpretedWalRecords { - records, - next_record_lsn: Some(max_next_record_lsn), + let batch = if max_next_record_lsn > state.next_record_lsn { + // This batch contains at least one record that this shard has not + // seen yet. + let records = records_by_sender.remove(&shard_sender_id).unwrap_or_default(); + + InterpretedWalRecords { + records, + next_record_lsn: max_next_record_lsn, + raw_wal_start_lsn: Some(batch_wal_start_lsn), + } + } else if wal_end_lsn > state.next_record_lsn { + // All the records in this batch were seen by the shard + // However, the batch maps to a chunk of WAL that the + // shard has not yet seen. Notify it of the start LSN + // of the PG WAL chunk such that it doesn't look like a gap. + InterpretedWalRecords { + records: Vec::default(), + next_record_lsn: state.next_record_lsn, + raw_wal_start_lsn: Some(batch_wal_start_lsn), + } + } else { + // The shard has seen this chunk of WAL before. Skip it. + continue; }; let res = state.tx.send(Batch { @@ -403,7 +493,7 @@ impl InterpretedWalReader { if res.is_err() { shard_senders_to_remove.push(shard_sender_id); } else { - state.next_record_lsn = max_next_record_lsn; + state.next_record_lsn = std::cmp::max(state.next_record_lsn, max_next_record_lsn); } } } @@ -451,7 +541,7 @@ impl InterpretedWalReader { // anything outside the select statement. let position_reset = self.state.write().unwrap().maybe_reset(start_pos); match position_reset { - CurrentPositionUpdate::Reset(to) => { + CurrentPositionUpdate::Reset { from: _, to } => { self.wal_stream.reset(to).await; wal_decoder = WalStreamDecoder::new(to, self.pg_version); }, @@ -459,14 +549,22 @@ impl InterpretedWalReader { }; tracing::info!( - "Added shard sender {} with start_pos={} current_pos={}", - ShardSenderId::new(shard_id, new_sender_id), start_pos, position_reset.current_position() + "Added shard sender {} with start_pos={} previous_pos={} current_pos={}", + ShardSenderId::new(shard_id, new_sender_id), + start_pos, + position_reset.previous_position(), + position_reset.current_position(), ); } } } } } + + #[cfg(test)] + fn state(&self) -> Arc> { + self.state.clone() + } } impl InterpretedWalReaderHandle { @@ -592,22 +690,20 @@ impl InterpretedWalSender<'_, IO> { } #[cfg(test)] mod tests { - use std::{collections::HashMap, str::FromStr, time::Duration}; + use std::collections::HashMap; + use std::str::FromStr; + use std::time::Duration; use pageserver_api::shard::{ShardIdentity, ShardStripeSize}; use postgres_ffi::MAX_SEND_SIZE; use tokio::sync::mpsc::error::TryRecvError; - use utils::{ - id::{NodeId, TenantTimelineId}, - lsn::Lsn, - shard::{ShardCount, ShardNumber}, - }; + use utils::id::{NodeId, TenantTimelineId}; + use utils::lsn::Lsn; + use utils::shard::{ShardCount, ShardNumber}; - use crate::{ - send_interpreted_wal::{Batch, InterpretedWalReader}, - test_utils::Env, - wal_reader_stream::StreamingWalReader, - }; + use crate::send_interpreted_wal::{AttachShardNotification, Batch, InterpretedWalReader}; + use crate::test_utils::Env; + use crate::wal_reader_stream::StreamingWalReader; #[tokio::test] async fn test_interpreted_wal_reader_fanout() { @@ -709,9 +805,11 @@ mod tests { // This test uses logical messages. 
Those only go to shard 0. Check that the // filtering worked and shard 1 did not get any. - assert!(shard_1_interpreted_records - .iter() - .all(|recs| recs.records.is_empty())); + assert!( + shard_1_interpreted_records + .iter() + .all(|recs| recs.records.is_empty()) + ); // Shard 0 should not receive anything more since the reader is // going through wal that it has already processed. @@ -884,4 +982,128 @@ mod tests { assert_eq!(sender.received_next_record_lsns, expected); } } + + #[tokio::test] + async fn test_batch_start_tracking_on_reset() { + // When the WAL stream is reset to an older LSN, + // the current batch start LSN should be invalidated. + // This test constructs such a scenario: + // 1. Shard 0 is reading somewhere ahead + // 2. Reader reads some WAL, but does not decode a full record (partial read) + // 3. Shard 1 attaches to the reader and resets it to an older LSN + // 4. Shard 1 should get the correct batch WAL start LSN + let _ = env_logger::builder().is_test(true).try_init(); + + const SIZE: usize = 64 * 1024; + const MSG_COUNT: usize = 10; + const PG_VERSION: u32 = 17; + const SHARD_COUNT: u8 = 2; + const WAL_READER_BATCH_SIZE: usize = 8192; + + let start_lsn = Lsn::from_str("0/149FD18").unwrap(); + let env = Env::new(true).unwrap(); + let mut next_record_lsns = Vec::default(); + let tli = env + .make_timeline(NodeId(1), TenantTimelineId::generate(), start_lsn) + .await + .unwrap(); + + let resident_tli = tli.wal_residence_guard().await.unwrap(); + let end_watch = + Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT, Some(&mut next_record_lsns)) + .await + .unwrap(); + + assert!(next_record_lsns.len() > 3); + let shard_0_start_lsn = next_record_lsns[3]; + + let end_pos = end_watch.get(); + + let streaming_wal_reader = StreamingWalReader::new( + resident_tli, + None, + shard_0_start_lsn, + end_pos, + end_watch, + WAL_READER_BATCH_SIZE, + ); + + let shard_0 = ShardIdentity::new( + ShardNumber(0), + ShardCount(SHARD_COUNT), + ShardStripeSize::default(), + ) + .unwrap(); + + let shard_1 = ShardIdentity::new( + ShardNumber(1), + ShardCount(SHARD_COUNT), + ShardStripeSize::default(), + ) + .unwrap(); + + let mut shards = HashMap::new(); + + for shard_number in 0..SHARD_COUNT { + let shard_id = ShardIdentity::new( + ShardNumber(shard_number), + ShardCount(SHARD_COUNT), + ShardStripeSize::default(), + ) + .unwrap(); + let (tx, rx) = tokio::sync::mpsc::channel::(MSG_COUNT * 2); + shards.insert(shard_id, (Some(tx), Some(rx))); + } + + let shard_0_tx = shards.get_mut(&shard_0).unwrap().0.take().unwrap(); + + let (shard_notification_tx, shard_notification_rx) = tokio::sync::mpsc::unbounded_channel(); + + let reader = InterpretedWalReader::new( + streaming_wal_reader, + shard_0_start_lsn, + shard_0_tx, + shard_0, + PG_VERSION, + Some(shard_notification_rx), + ); + + let reader_state = reader.state(); + let mut reader_fut = std::pin::pin!(reader.run(shard_0_start_lsn, &None)); + loop { + let poll = futures::poll!(reader_fut.as_mut()); + assert!(poll.is_pending()); + + let guard = reader_state.read().unwrap(); + if guard.current_batch_wal_start().is_some() { + break; + } + } + + shard_notification_tx + .send(AttachShardNotification { + shard_id: shard_1, + sender: shards.get_mut(&shard_1).unwrap().0.take().unwrap(), + start_pos: start_lsn, + }) + .unwrap(); + + let mut shard_1_rx = shards.get_mut(&shard_1).unwrap().1.take().unwrap(); + loop { + let poll = futures::poll!(reader_fut.as_mut()); + assert!(poll.is_pending()); + + let try_recv_res = shard_1_rx.try_recv(); + match try_recv_res 
{ + Ok(batch) => { + assert_eq!(batch.records.raw_wal_start_lsn.unwrap(), start_lsn); + break; + } + Err(tokio::sync::mpsc::error::TryRecvError::Empty) => {} + Err(tokio::sync::mpsc::error::TryRecvError::Disconnected) => { + unreachable!(); + } + } + } + } } diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 4a4a74a0fd..33e3d0485c 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -1,6 +1,34 @@ //! This module implements the streaming side of replication protocol, starting //! with the "START_REPLICATION" message, and registry of walsenders. +use std::cmp::{max, min}; +use std::net::SocketAddr; +use std::sync::Arc; +use std::time::Duration; + +use anyhow::{Context as AnyhowContext, bail}; +use bytes::Bytes; +use futures::FutureExt; +use itertools::Itertools; +use parking_lot::Mutex; +use postgres_backend::{CopyStreamHandlerEnd, PostgresBackend, PostgresBackendReader, QueryError}; +use postgres_ffi::{MAX_SEND_SIZE, TimestampTz, get_current_timestamp}; +use pq_proto::{BeMessage, WalSndKeepAlive, XLogDataBody}; +use safekeeper_api::Term; +use safekeeper_api::models::{ + HotStandbyFeedback, INVALID_FULL_TRANSACTION_ID, ReplicationFeedback, StandbyFeedback, + StandbyReply, +}; +use tokio::io::{AsyncRead, AsyncWrite}; +use tokio::sync::watch::Receiver; +use tokio::time::timeout; +use tracing::*; +use utils::bin_ser::BeSer; +use utils::failpoint_support; +use utils::lsn::Lsn; +use utils::pageserver_feedback::PageserverFeedback; +use utils::postgres_client::PostgresClientProtocol; + use crate::handler::SafekeeperPostgresHandler; use crate::metrics::{RECEIVED_PS_FEEDBACKS, WAL_READERS}; use crate::receive_wal::WalReceivers; @@ -11,34 +39,6 @@ use crate::send_interpreted_wal::{ use crate::timeline::WalResidentTimeline; use crate::wal_reader_stream::StreamingWalReader; use crate::wal_storage::WalReader; -use anyhow::{bail, Context as AnyhowContext}; -use bytes::Bytes; -use futures::FutureExt; -use parking_lot::Mutex; -use postgres_backend::PostgresBackend; -use postgres_backend::{CopyStreamHandlerEnd, PostgresBackendReader, QueryError}; -use postgres_ffi::get_current_timestamp; -use postgres_ffi::{TimestampTz, MAX_SEND_SIZE}; -use pq_proto::{BeMessage, WalSndKeepAlive, XLogDataBody}; -use safekeeper_api::models::{ - HotStandbyFeedback, ReplicationFeedback, StandbyFeedback, StandbyReply, - INVALID_FULL_TRANSACTION_ID, -}; -use safekeeper_api::Term; -use tokio::io::{AsyncRead, AsyncWrite}; -use utils::failpoint_support; -use utils::pageserver_feedback::PageserverFeedback; -use utils::postgres_client::PostgresClientProtocol; - -use itertools::Itertools; -use std::cmp::{max, min}; -use std::net::SocketAddr; -use std::sync::Arc; -use std::time::Duration; -use tokio::sync::watch::Receiver; -use tokio::time::timeout; -use tracing::*; -use utils::{bin_ser::BeSer, lsn::Lsn}; // See: https://www.postgresql.org/docs/13/protocol-replication.html const HOT_STANDBY_FEEDBACK_TAG_BYTE: u8 = b'h'; @@ -624,8 +624,9 @@ impl SafekeeperPostgresHandler { MAX_SEND_SIZE, ); - let reader = - InterpretedWalReader::new(wal_reader, start_pos, tx, shard, pg_version); + let reader = InterpretedWalReader::new( + wal_reader, start_pos, tx, shard, pg_version, None, + ); let sender = InterpretedWalSender { format, @@ -905,9 +906,9 @@ impl WalSender<'_, IO> { // pageserver to identify WalReceiverError::SuccessfulCompletion, // do not change this string without updating pageserver. 
return Err(CopyStreamHandlerEnd::ServerInitiated(format!( - "ending streaming to {:?} at {}, receiver is caughtup and there is no computes", - self.appname, self.start_pos, - ))); + "ending streaming to {:?} at {}, receiver is caughtup and there is no computes", + self.appname, self.start_pos, + ))); } } } diff --git a/safekeeper/src/state.rs b/safekeeper/src/state.rs index 4d566b12a0..e437e6d2cd 100644 --- a/safekeeper/src/state.rs +++ b/safekeeper/src/state.rs @@ -1,28 +1,24 @@ //! Defines per timeline data stored persistently (SafeKeeperPersistentState) //! and its wrapper with in memory layer (SafekeeperState). -use std::{cmp::max, ops::Deref, time::SystemTime}; +use std::cmp::max; +use std::ops::Deref; +use std::time::SystemTime; -use anyhow::{bail, Result}; +use anyhow::{Result, bail}; use postgres_ffi::WAL_SEGMENT_SIZE; -use safekeeper_api::{ - membership::Configuration, - models::{TimelineMembershipSwitchResponse, TimelineTermBumpResponse}, - ServerInfo, Term, INITIAL_TERM, -}; +use safekeeper_api::membership::Configuration; +use safekeeper_api::models::{TimelineMembershipSwitchResponse, TimelineTermBumpResponse}; +use safekeeper_api::{INITIAL_TERM, ServerInfo, Term}; use serde::{Deserialize, Serialize}; use tracing::info; -use utils::{ - id::{TenantId, TenantTimelineId, TimelineId}, - lsn::Lsn, -}; +use utils::id::{TenantId, TenantTimelineId, TimelineId}; +use utils::lsn::Lsn; -use crate::{ - control_file, - safekeeper::{AcceptorState, PgUuid, TermHistory, TermLsn, UNKNOWN_SERVER_VERSION}, - timeline::TimelineError, - wal_backup_partial::{self}, -}; +use crate::control_file; +use crate::safekeeper::{AcceptorState, PgUuid, TermHistory, TermLsn, UNKNOWN_SERVER_VERSION}; +use crate::timeline::TimelineError; +use crate::wal_backup_partial::{self}; /// Persistent information stored on safekeeper node about timeline. /// On disk data is prefixed by magic and format version and followed by checksum. diff --git a/safekeeper/src/test_utils.rs b/safekeeper/src/test_utils.rs index 79ceddd366..e6f74185c1 100644 --- a/safekeeper/src/test_utils.rs +++ b/safekeeper/src/test_utils.rs @@ -1,5 +1,12 @@ use std::sync::Arc; +use camino_tempfile::Utf8TempDir; +use postgres_ffi::v17::wal_generator::{LogicalMessageGenerator, WalGenerator}; +use safekeeper_api::membership::SafekeeperGeneration as Generation; +use tokio::fs::create_dir_all; +use utils::id::{NodeId, TenantTimelineId}; +use utils::lsn::Lsn; + use crate::rate_limit::RateLimiter; use crate::receive_wal::WalAcceptor; use crate::safekeeper::{ @@ -8,15 +15,10 @@ use crate::safekeeper::{ }; use crate::send_wal::EndWatch; use crate::state::{TimelinePersistentState, TimelineState}; -use crate::timeline::{get_timeline_dir, SharedState, StateSK, Timeline}; +use crate::timeline::{SharedState, StateSK, Timeline, get_timeline_dir}; use crate::timelines_set::TimelinesSet; use crate::wal_backup::remote_timeline_path; -use crate::{control_file, receive_wal, wal_storage, SafeKeeperConf}; -use camino_tempfile::Utf8TempDir; -use postgres_ffi::v17::wal_generator::{LogicalMessageGenerator, WalGenerator}; -use tokio::fs::create_dir_all; -use utils::id::{NodeId, TenantTimelineId}; -use utils::lsn::Lsn; +use crate::{SafeKeeperConf, control_file, receive_wal, wal_storage}; /// A Safekeeper testing or benchmarking environment. Uses a tempdir for storage, removed on drop. pub struct Env { @@ -73,10 +75,10 @@ impl Env { // Emulate an initial election. 
safekeeper .process_msg(&ProposerAcceptorMessage::Elected(ProposerElected { + generation: Generation::new(0), term: 1, start_streaming_at: start_lsn, term_history: TermHistory(vec![(1, start_lsn).into()]), - timeline_start_lsn: start_lsn, })) .await?; @@ -146,13 +148,12 @@ impl Env { let req = AppendRequest { h: AppendRequestHeader { + generation: Generation::new(0), term: 1, - term_start_lsn: start_lsn, begin_lsn: lsn, end_lsn: lsn + record.len() as u64, commit_lsn: lsn, truncate_lsn: Lsn(0), - proposer_uuid: [0; 16], }, wal_data: record, }; diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 4341f13824..930f66a207 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -1,37 +1,32 @@ //! This module implements Timeline lifecycle management and has all necessary code //! to glue together SafeKeeper and all other background services. -use anyhow::{anyhow, bail, Result}; +use std::cmp::max; +use std::ops::{Deref, DerefMut}; +use std::sync::Arc; +use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; +use std::time::Duration; + +use anyhow::{Result, anyhow, bail}; use camino::{Utf8Path, Utf8PathBuf}; +use http_utils::error::ApiError; use remote_storage::RemotePath; +use safekeeper_api::Term; use safekeeper_api::membership::Configuration; use safekeeper_api::models::{ PeerInfo, TimelineMembershipSwitchResponse, TimelineTermBumpResponse, }; -use safekeeper_api::Term; +use storage_broker::proto::{SafekeeperTimelineInfo, TenantTimelineId as ProtoTenantTimelineId}; use tokio::fs::{self}; +use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard, watch}; +use tokio::time::Instant; use tokio_util::sync::CancellationToken; -use utils::id::TenantId; +use tracing::*; +use utils::id::{NodeId, TenantId, TenantTimelineId}; +use utils::lsn::Lsn; use utils::sync::gate::Gate; -use http_utils::error::ApiError; -use std::cmp::max; -use std::ops::{Deref, DerefMut}; -use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; -use std::sync::Arc; -use std::time::Duration; -use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard}; -use tokio::{sync::watch, time::Instant}; -use tracing::*; -use utils::{ - id::{NodeId, TenantTimelineId}, - lsn::Lsn, -}; - -use storage_broker::proto::SafekeeperTimelineInfo; -use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; - -use crate::control_file; +use crate::metrics::{FullTimelineInfo, MISC_OPERATION_SECONDS, WalStorageMetrics}; use crate::rate_limit::RateLimiter; use crate::receive_wal::WalReceivers; use crate::safekeeper::{AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, TermLsn}; @@ -42,11 +37,8 @@ use crate::timeline_manager::{AtomicStatus, ManagerCtl}; use crate::timelines_set::TimelinesSet; use crate::wal_backup::{self, remote_timeline_path}; use crate::wal_backup_partial::PartialRemoteSegment; - -use crate::metrics::{FullTimelineInfo, WalStorageMetrics, MISC_OPERATION_SECONDS}; use crate::wal_storage::{Storage as wal_storage_iface, WalReader}; -use crate::SafeKeeperConf; -use crate::{debug_dump, timeline_manager, wal_storage}; +use crate::{SafeKeeperConf, control_file, debug_dump, timeline_manager, wal_storage}; fn peer_info_from_sk_info(sk_info: &SafekeeperTimelineInfo, ts: Instant) -> PeerInfo { PeerInfo { @@ -168,7 +160,7 @@ impl StateSK { pub fn state(&self) -> &TimelineState { match self { StateSK::Loaded(sk) => &sk.state, - StateSK::Offloaded(ref s) => s, + StateSK::Offloaded(s) => s, StateSK::Empty => unreachable!(), } } @@ -176,7 +168,7 @@ impl StateSK { pub fn state_mut(&mut 
self) -> &mut TimelineState { match self { StateSK::Loaded(sk) => &mut sk.state, - StateSK::Offloaded(ref mut s) => s, + StateSK::Offloaded(s) => s, StateSK::Empty => unreachable!(), } } @@ -566,11 +558,18 @@ impl Timeline { }); } - /// Background timeline activities (which hold Timeline::gate) will no - /// longer run once this function completes. - pub async fn shutdown(&self) { + /// Cancel the timeline, requesting background activity to stop. Closing + /// the `self.gate` waits for that. + pub async fn cancel(&self) { info!("timeline {} shutting down", self.ttid); self.cancel.cancel(); + } + + /// Background timeline activities (which hold Timeline::gate) will no + /// longer run once this function completes. `Self::cancel` must have been + /// already called. + pub async fn close(&self) { + assert!(self.cancel.is_cancelled()); // Wait for any concurrent tasks to stop using this timeline, to avoid e.g. attempts // to read deleted files. @@ -582,13 +581,13 @@ impl Timeline { /// Also deletes WAL in s3. Might fail if e.g. s3 is unavailable, but /// deletion API endpoint is retriable. /// - /// Timeline must be in shut-down state (i.e. call [`Self::shutdown`] first) + /// Timeline must be in shut-down state (i.e. call [`Self::close`] first) pub async fn delete( &self, shared_state: &mut WriteGuardSharedState<'_>, only_local: bool, ) -> Result { - // Assert that [`Self::shutdown`] was already called + // Assert that [`Self::close`] was already called assert!(self.cancel.is_cancelled()); assert!(self.gate.close_complete()); @@ -1114,7 +1113,7 @@ impl ManagerTimeline { } /// Deletes directory and it's contents. Returns false if directory does not exist. -async fn delete_dir(path: &Utf8PathBuf) -> Result { +pub async fn delete_dir(path: &Utf8PathBuf) -> Result { match fs::remove_dir_all(path).await { Ok(_) => Ok(true), Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(false), diff --git a/safekeeper/src/timeline_eviction.rs b/safekeeper/src/timeline_eviction.rs index 303421c837..06ccb32d03 100644 --- a/safekeeper/src/timeline_eviction.rs +++ b/safekeeper/src/timeline_eviction.rs @@ -7,23 +7,19 @@ use anyhow::Context; use camino::Utf8PathBuf; use remote_storage::RemotePath; -use tokio::{ - fs::File, - io::{AsyncRead, AsyncWriteExt}, -}; +use tokio::fs::File; +use tokio::io::{AsyncRead, AsyncWriteExt}; use tracing::{debug, info, instrument, warn}; use utils::crashsafe::durable_rename; -use crate::{ - metrics::{ - EvictionEvent, EVICTION_EVENTS_COMPLETED, EVICTION_EVENTS_STARTED, NUM_EVICTED_TIMELINES, - }, - rate_limit::rand_duration, - timeline_manager::{Manager, StateSnapshot}, - wal_backup, - wal_backup_partial::{self, PartialRemoteSegment}, - wal_storage::wal_file_paths, +use crate::metrics::{ + EVICTION_EVENTS_COMPLETED, EVICTION_EVENTS_STARTED, EvictionEvent, NUM_EVICTED_TIMELINES, }; +use crate::rate_limit::rand_duration; +use crate::timeline_manager::{Manager, StateSnapshot}; +use crate::wal_backup; +use crate::wal_backup_partial::{self, PartialRemoteSegment}; +use crate::wal_storage::wal_file_paths; impl Manager { /// Returns true if the timeline is ready for eviction. diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs index a33994dcab..71e99a4de7 100644 --- a/safekeeper/src/timeline_manager.rs +++ b/safekeeper/src/timeline_manager.rs @@ -7,41 +7,36 @@ //! Be aware that you need to be extra careful with manager code, because it is not respawned on panic. //! 
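The timeline.rs hunk above splits the old `shutdown()` into `cancel()` (flip the cancellation token) and `close()` (wait for `Timeline::gate` to drain), and `delete()` now asserts that both have already happened. A rough sketch of the ordering a caller is expected to follow, using the crate-internal API as it appears in the diff; the wrapper function itself is illustrative only:

```rust
// Illustrative only: `Timeline`, `write_shared_state` and `delete` are the crate
// APIs shown in the diff; this wrapper is a hypothetical caller, not PR code.
async fn shut_down_and_delete(timeline: &Timeline, only_local: bool) -> anyhow::Result<()> {
    timeline.cancel().await; // request background tasks to stop (sets the CancellationToken)
    timeline.close().await;  // closes the gate; asserts cancel() has already been called
    let mut shared_state = timeline.write_shared_state().await;
    // delete() re-asserts that the token is cancelled and the gate is fully closed.
    timeline.delete(&mut shared_state, only_local).await?;
    Ok(())
}
```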
Also, if it will stuck in some branch, it will prevent any further progress in the timeline. -use std::{ - sync::{atomic::AtomicUsize, Arc}, - time::Duration, -}; +use std::sync::Arc; +use std::sync::atomic::AtomicUsize; +use std::time::Duration; use futures::channel::oneshot; use postgres_ffi::XLogSegNo; -use safekeeper_api::{models::PeerInfo, Term}; +use safekeeper_api::Term; +use safekeeper_api::models::PeerInfo; use serde::{Deserialize, Serialize}; -use tokio::{ - task::{JoinError, JoinHandle}, - time::Instant, -}; +use tokio::task::{JoinError, JoinHandle}; +use tokio::time::Instant; use tokio_util::sync::CancellationToken; -use tracing::{debug, info, info_span, instrument, warn, Instrument}; +use tracing::{Instrument, debug, info, info_span, instrument, warn}; use utils::lsn::Lsn; -use crate::{ - control_file::{FileStorage, Storage}, - metrics::{ - MANAGER_ACTIVE_CHANGES, MANAGER_ITERATIONS_TOTAL, MISC_OPERATION_SECONDS, - NUM_EVICTED_TIMELINES, - }, - rate_limit::{rand_duration, RateLimiter}, - recovery::recovery_main, - remove_wal::calc_horizon_lsn, - send_wal::WalSenders, - state::TimelineState, - timeline::{ManagerTimeline, ReadGuardSharedState, StateSK, WalResidentTimeline}, - timeline_guard::{AccessService, GuardId, ResidenceGuard}, - timelines_set::{TimelineSetGuard, TimelinesSet}, - wal_backup::{self, WalBackupTaskHandle}, - wal_backup_partial::{self, PartialBackup, PartialRemoteSegment}, - SafeKeeperConf, +use crate::SafeKeeperConf; +use crate::control_file::{FileStorage, Storage}; +use crate::metrics::{ + MANAGER_ACTIVE_CHANGES, MANAGER_ITERATIONS_TOTAL, MISC_OPERATION_SECONDS, NUM_EVICTED_TIMELINES, }; +use crate::rate_limit::{RateLimiter, rand_duration}; +use crate::recovery::recovery_main; +use crate::remove_wal::calc_horizon_lsn; +use crate::send_wal::WalSenders; +use crate::state::TimelineState; +use crate::timeline::{ManagerTimeline, ReadGuardSharedState, StateSK, WalResidentTimeline}; +use crate::timeline_guard::{AccessService, GuardId, ResidenceGuard}; +use crate::timelines_set::{TimelineSetGuard, TimelinesSet}; +use crate::wal_backup::{self, WalBackupTaskHandle}; +use crate::wal_backup_partial::{self, PartialBackup, PartialRemoteSegment}; pub(crate) struct StateSnapshot { // inmem values diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs index 1ff6a72bce..858dfce807 100644 --- a/safekeeper/src/timelines_global_map.rs +++ b/safekeeper/src/timelines_global_map.rs @@ -2,31 +2,33 @@ //! All timelines should always be present in this map, this is done by loading them //! all from the disk on startup and keeping them in memory. 
-use crate::defaults::DEFAULT_EVICTION_CONCURRENCY; -use crate::rate_limit::RateLimiter; -use crate::state::TimelinePersistentState; -use crate::timeline::{get_tenant_dir, get_timeline_dir, Timeline, TimelineError}; -use crate::timelines_set::TimelinesSet; -use crate::wal_storage::Storage; -use crate::{control_file, wal_storage, SafeKeeperConf}; -use anyhow::{bail, Context, Result}; +use std::collections::HashMap; +use std::str::FromStr; +use std::sync::{Arc, Mutex}; +use std::time::{Duration, Instant}; + +use anyhow::{Context, Result, bail}; use camino::Utf8PathBuf; use camino_tempfile::Utf8TempDir; use safekeeper_api::membership::Configuration; use safekeeper_api::models::SafekeeperUtilization; -use safekeeper_api::ServerInfo; +use safekeeper_api::{ServerInfo, membership}; use serde::Serialize; -use std::collections::HashMap; -use std::str::FromStr; -use std::sync::atomic::Ordering; -use std::sync::{Arc, Mutex}; -use std::time::{Duration, Instant}; use tokio::fs; use tracing::*; use utils::crashsafe::{durable_rename, fsync_async_opt}; use utils::id::{TenantId, TenantTimelineId, TimelineId}; use utils::lsn::Lsn; +use crate::defaults::DEFAULT_EVICTION_CONCURRENCY; +use crate::http::routes::DeleteOrExcludeError; +use crate::rate_limit::RateLimiter; +use crate::state::TimelinePersistentState; +use crate::timeline::{Timeline, TimelineError, delete_dir, get_tenant_dir, get_timeline_dir}; +use crate::timelines_set::TimelinesSet; +use crate::wal_storage::Storage; +use crate::{SafeKeeperConf, control_file, wal_storage}; + // Timeline entry in the global map: either a ready timeline, or mark that it is // being created. #[derive(Clone)] @@ -446,23 +448,20 @@ impl GlobalTimelines { .collect() } - /// Cancels timeline, then deletes the corresponding data directory. - /// If only_local, doesn't remove WAL segments in remote storage. - pub(crate) async fn delete( + /// Delete timeline, only locally on this node or globally (also cleaning + /// remote storage WAL), depending on `action` value. + pub(crate) async fn delete_or_exclude( &self, ttid: &TenantTimelineId, - only_local: bool, - ) -> Result { + action: DeleteOrExclude, + ) -> Result { let tli_res = { let state = self.state.lock().unwrap(); if state.tombstones.contains_key(ttid) { // Presence of a tombstone guarantees that a previous deletion has completed and there is no work to do. info!("Timeline {ttid} was already deleted"); - return Ok(TimelineDeleteForceResult { - dir_existed: false, - was_active: false, - }); + return Ok(TimelineDeleteResult { dir_existed: false }); } state.get(ttid) @@ -470,32 +469,47 @@ impl GlobalTimelines { let result = match tli_res { Ok(timeline) => { - let was_active = timeline.broker_active.load(Ordering::Relaxed); + info!("deleting timeline {}, action={:?}", ttid, action); - info!("deleting timeline {}, only_local={}", ttid, only_local); - timeline.shutdown().await; + // If node is getting excluded, check the generation first. + // Then, while holding the lock cancel the timeline; it will be + // unusable after this point, and if node is added back first + // deletion must be completed and node seeded anew. + // + // We would like to avoid holding the lock while waiting for the + // gate to finish as this is deadlock prone, so for actual + // deletion will take it second time. 
+ if let DeleteOrExclude::Exclude(ref mconf) = action { + let shared_state = timeline.read_shared_state().await; + if shared_state.sk.state().mconf.generation > mconf.generation { + return Err(DeleteOrExcludeError::Conflict { + requested: mconf.clone(), + current: shared_state.sk.state().mconf.clone(), + }); + } + timeline.cancel().await; + } else { + timeline.cancel().await; + } + + timeline.close().await; info!("timeline {ttid} shut down for deletion"); // Take a lock and finish the deletion holding this mutex. let mut shared_state = timeline.write_shared_state().await; + let only_local = !matches!(action, DeleteOrExclude::Delete); let dir_existed = timeline.delete(&mut shared_state, only_local).await?; - Ok(TimelineDeleteForceResult { - dir_existed, - was_active, // TODO: we probably should remove this field - }) + Ok(TimelineDeleteResult { dir_existed }) } Err(_) => { // Timeline is not memory, but it may still exist on disk in broken state. let dir_path = get_timeline_dir(self.state.lock().unwrap().conf.as_ref(), ttid); - let dir_existed = delete_dir(dir_path)?; + let dir_existed = delete_dir(&dir_path).await?; - Ok(TimelineDeleteForceResult { - dir_existed, - was_active: false, - }) + Ok(TimelineDeleteResult { dir_existed }) } }; @@ -513,11 +527,11 @@ impl GlobalTimelines { /// retry tenant deletion again later. /// /// If only_local, doesn't remove WAL segments in remote storage. - pub async fn delete_force_all_for_tenant( + pub async fn delete_all_for_tenant( &self, tenant_id: &TenantId, - only_local: bool, - ) -> Result> { + action: DeleteOrExclude, + ) -> Result> { info!("deleting all timelines for tenant {}", tenant_id); let to_delete = self.get_all_for_tenant(*tenant_id); @@ -525,7 +539,7 @@ impl GlobalTimelines { let mut deleted = HashMap::new(); for tli in &to_delete { - match self.delete(&tli.ttid, only_local).await { + match self.delete_or_exclude(&tli.ttid, action.clone()).await { Ok(result) => { deleted.insert(tli.ttid, result); } @@ -539,17 +553,15 @@ impl GlobalTimelines { // If there was an error, return it. if let Some(e) = err { - return Err(e); + return Err(anyhow::Error::from(e)); } // There may be broken timelines on disk, so delete the whole tenant dir as well. // Note that we could concurrently create new timelines while we were deleting them, // so the directory may be not empty. In this case timelines will have bad state // and timeline background jobs can panic. - delete_dir(get_tenant_dir( - self.state.lock().unwrap().conf.as_ref(), - tenant_id, - ))?; + let tenant_dir = get_tenant_dir(self.state.lock().unwrap().conf.as_ref(), tenant_id); + delete_dir(&tenant_dir).await?; Ok(deleted) } @@ -568,18 +580,20 @@ impl GlobalTimelines { } #[derive(Clone, Copy, Serialize)] -pub struct TimelineDeleteForceResult { +pub struct TimelineDeleteResult { pub dir_existed: bool, - pub was_active: bool, } -/// Deletes directory and it's contents. Returns false if directory does not exist. -fn delete_dir(path: Utf8PathBuf) -> Result { - match std::fs::remove_dir_all(path) { - Ok(_) => Ok(true), - Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(false), - Err(e) => Err(e.into()), - } +/// Action for delete_or_exclude. +#[derive(Clone, Debug)] +pub enum DeleteOrExclude { + /// Delete timeline globally. + Delete, + /// Legacy mode until we fully migrate to generations: like exclude deletes + /// timeline only locally, but ignores generation number. + DeleteLocal, + /// This node is getting excluded, delete timeline locally. 
+ Exclude(membership::Configuration), } /// Create temp directory for a new timeline. It needs to be located on the same diff --git a/safekeeper/src/timelines_set.rs b/safekeeper/src/timelines_set.rs index 096e348295..1d1abc530f 100644 --- a/safekeeper/src/timelines_set.rs +++ b/safekeeper/src/timelines_set.rs @@ -1,4 +1,5 @@ -use std::{collections::HashMap, sync::Arc}; +use std::collections::HashMap; +use std::sync::Arc; use utils::id::TenantTimelineId; diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 2f6b91cf47..6176e64698 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -1,34 +1,29 @@ -use anyhow::{Context, Result}; - -use camino::{Utf8Path, Utf8PathBuf}; -use futures::stream::FuturesOrdered; -use futures::StreamExt; -use safekeeper_api::models::PeerInfo; -use tokio::task::JoinHandle; -use tokio_util::sync::CancellationToken; -use utils::backoff; -use utils::id::NodeId; - use std::cmp::min; use std::collections::HashSet; use std::num::NonZeroU32; use std::pin::Pin; use std::time::Duration; +use anyhow::{Context, Result}; +use camino::{Utf8Path, Utf8PathBuf}; +use futures::StreamExt; +use futures::stream::FuturesOrdered; use postgres_ffi::v14::xlog_utils::XLogSegNoOffsetToRecPtr; -use postgres_ffi::XLogFileName; -use postgres_ffi::{XLogSegNo, PG_TLI}; +use postgres_ffi::{PG_TLI, XLogFileName, XLogSegNo}; use remote_storage::{ DownloadOpts, GenericRemoteStorage, ListingMode, RemotePath, StorageMetadata, }; +use safekeeper_api::models::PeerInfo; use tokio::fs::File; - use tokio::select; use tokio::sync::mpsc::{self, Receiver, Sender}; -use tokio::sync::{watch, OnceCell}; +use tokio::sync::{OnceCell, watch}; +use tokio::task::JoinHandle; +use tokio_util::sync::CancellationToken; use tracing::*; - -use utils::{id::TenantTimelineId, lsn::Lsn}; +use utils::backoff; +use utils::id::{NodeId, TenantTimelineId}; +use utils::lsn::Lsn; use crate::metrics::{BACKED_UP_SEGMENTS, BACKUP_ERRORS, WAL_BACKUP_TASKS}; use crate::timeline::WalResidentTimeline; diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs index 5ecb23e8e0..049852a048 100644 --- a/safekeeper/src/wal_backup_partial.rs +++ b/safekeeper/src/wal_backup_partial.rs @@ -20,23 +20,23 @@ //! This way control file stores information about all potentially existing //! remote partial segments and can clean them up after uploading a newer version. 
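In the timelines_global_map.rs hunk above, `delete` becomes `delete_or_exclude`, parameterised by the new `DeleteOrExclude` action. A small sketch of what each variant means in practice; only the enum and the `only_local` computation come from the diff, the helper itself is hypothetical:

```rust
// Hypothetical helper summarising the semantics of DeleteOrExclude as implemented
// in the diff: only a global Delete removes WAL from remote storage, and only
// Exclude carries a membership Configuration used for the generation check.
fn describe(action: &DeleteOrExclude) -> (&'static str, bool) {
    let only_local = !matches!(action, DeleteOrExclude::Delete);
    let what = match action {
        DeleteOrExclude::Delete => "delete timeline globally (local dir + remote WAL)",
        DeleteOrExclude::DeleteLocal => "legacy local delete, no generation check",
        DeleteOrExclude::Exclude(_mconf) => {
            "node excluded: delete locally, refused if the current generation is newer"
        }
    };
    (what, only_local)
}
```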
use camino::Utf8PathBuf; -use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI}; +use postgres_ffi::{PG_TLI, XLogFileName, XLogSegNo}; use remote_storage::RemotePath; use safekeeper_api::Term; use serde::{Deserialize, Serialize}; - use tokio_util::sync::CancellationToken; use tracing::{debug, error, info, instrument, warn}; -use utils::{id::NodeId, lsn::Lsn}; +use utils::id::NodeId; +use utils::lsn::Lsn; -use crate::{ - metrics::{MISC_OPERATION_SECONDS, PARTIAL_BACKUP_UPLOADED_BYTES, PARTIAL_BACKUP_UPLOADS}, - rate_limit::{rand_duration, RateLimiter}, - timeline::WalResidentTimeline, - timeline_manager::StateSnapshot, - wal_backup::{self}, - SafeKeeperConf, +use crate::SafeKeeperConf; +use crate::metrics::{ + MISC_OPERATION_SECONDS, PARTIAL_BACKUP_UPLOADED_BYTES, PARTIAL_BACKUP_UPLOADS, }; +use crate::rate_limit::{RateLimiter, rand_duration}; +use crate::timeline::WalResidentTimeline; +use crate::timeline_manager::StateSnapshot; +use crate::wal_backup::{self}; #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] pub enum UploadStatus { diff --git a/safekeeper/src/wal_reader_stream.rs b/safekeeper/src/wal_reader_stream.rs index a0dd571a34..cc9d4e6e3b 100644 --- a/safekeeper/src/wal_reader_stream.rs +++ b/safekeeper/src/wal_reader_stream.rs @@ -1,14 +1,15 @@ -use std::{ - pin::Pin, - task::{Context, Poll}, -}; +use std::pin::Pin; +use std::task::{Context, Poll}; use bytes::Bytes; -use futures::{stream::BoxStream, Stream, StreamExt}; +use futures::stream::BoxStream; +use futures::{Stream, StreamExt}; +use safekeeper_api::Term; use utils::lsn::Lsn; -use crate::{send_wal::EndWatch, timeline::WalResidentTimeline, wal_storage::WalReader}; -use safekeeper_api::Term; +use crate::send_wal::EndWatch; +use crate::timeline::WalResidentTimeline; +use crate::wal_storage::WalReader; #[derive(PartialEq, Eq, Debug)] pub(crate) struct WalBytes { @@ -224,12 +225,11 @@ mod tests { use futures::StreamExt; use postgres_ffi::MAX_SEND_SIZE; - use utils::{ - id::{NodeId, TenantTimelineId}, - lsn::Lsn, - }; + use utils::id::{NodeId, TenantTimelineId}; + use utils::lsn::Lsn; - use crate::{test_utils::Env, wal_reader_stream::StreamingWalReader}; + use crate::test_utils::Env; + use crate::wal_reader_stream::StreamingWalReader; #[tokio::test] async fn test_streaming_wal_reader_reset() { diff --git a/safekeeper/src/wal_service.rs b/safekeeper/src/wal_service.rs index e5ccbb3230..045fa88cb0 100644 --- a/safekeeper/src/wal_service.rs +++ b/safekeeper/src/wal_service.rs @@ -2,23 +2,23 @@ //! WAL service listens for client connections and //! receive WAL from wal_proposer and send it to WAL receivers //! 
-use anyhow::{Context, Result}; -use postgres_backend::QueryError; -use safekeeper_api::models::ConnectionId; +use std::os::fd::AsRawFd; use std::sync::Arc; use std::time::Duration; + +use anyhow::{Context, Result}; +use postgres_backend::{AuthType, PostgresBackend, QueryError}; +use safekeeper_api::models::ConnectionId; use tokio::net::TcpStream; use tokio_io_timeout::TimeoutReader; use tokio_util::sync::CancellationToken; use tracing::*; -use utils::{auth::Scope, measured_stream::MeasuredStream}; - -use std::os::fd::AsRawFd; +use utils::auth::Scope; +use utils::measured_stream::MeasuredStream; +use crate::handler::SafekeeperPostgresHandler; use crate::metrics::TrafficMetrics; -use crate::SafeKeeperConf; -use crate::{handler::SafekeeperPostgresHandler, GlobalTimelines}; -use postgres_backend::{AuthType, PostgresBackend}; +use crate::{GlobalTimelines, SafeKeeperConf}; /// Accept incoming TCP connections and spawn them into a background thread. /// diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index e338d70731..ed197a3f83 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -7,32 +7,32 @@ //! //! Note that last file has `.partial` suffix, that's different from postgres. -use anyhow::{bail, Context, Result}; -use bytes::Bytes; -use camino::{Utf8Path, Utf8PathBuf}; -use futures::future::BoxFuture; -use postgres_ffi::v14::xlog_utils::{IsPartialXLogFileName, IsXLogFileName, XLogFromFileName}; -use postgres_ffi::{dispatch_pgversion, XLogSegNo, PG_TLI}; -use remote_storage::RemotePath; use std::cmp::{max, min}; use std::future::Future; use std::io::{self, SeekFrom}; use std::pin::Pin; -use tokio::fs::{self, remove_file, File, OpenOptions}; -use tokio::io::{AsyncRead, AsyncWriteExt}; -use tokio::io::{AsyncReadExt, AsyncSeekExt}; + +use anyhow::{Context, Result, bail}; +use bytes::Bytes; +use camino::{Utf8Path, Utf8PathBuf}; +use futures::future::BoxFuture; +use postgres_ffi::v14::xlog_utils::{IsPartialXLogFileName, IsXLogFileName, XLogFromFileName}; +use postgres_ffi::waldecoder::WalStreamDecoder; +use postgres_ffi::{PG_TLI, XLogFileName, XLogSegNo, dispatch_pgversion}; +use pq_proto::SystemId; +use remote_storage::RemotePath; +use tokio::fs::{self, File, OpenOptions, remove_file}; +use tokio::io::{AsyncRead, AsyncReadExt, AsyncSeekExt, AsyncWriteExt}; use tracing::*; use utils::crashsafe::durable_rename; +use utils::id::TenantTimelineId; +use utils::lsn::Lsn; use crate::metrics::{ - time_io_closure, WalStorageMetrics, REMOVED_WAL_SEGMENTS, WAL_STORAGE_OPERATION_SECONDS, + REMOVED_WAL_SEGMENTS, WAL_STORAGE_OPERATION_SECONDS, WalStorageMetrics, time_io_closure, }; use crate::state::TimelinePersistentState; use crate::wal_backup::{read_object, remote_timeline_path}; -use postgres_ffi::waldecoder::WalStreamDecoder; -use postgres_ffi::XLogFileName; -use pq_proto::SystemId; -use utils::{id::TenantTimelineId, lsn::Lsn}; pub trait Storage { // Last written LSN. 
@@ -200,7 +200,12 @@ impl PhysicalStorage { ttid.timeline_id, flush_lsn, state.commit_lsn, state.peer_horizon_lsn, ); if flush_lsn < state.commit_lsn { - bail!("timeline {} potential data loss: flush_lsn {} by find_end_of_wal is less than commit_lsn {} from control file", ttid.timeline_id, flush_lsn, state.commit_lsn); + bail!( + "timeline {} potential data loss: flush_lsn {} by find_end_of_wal is less than commit_lsn {} from control file", + ttid.timeline_id, + flush_lsn, + state.commit_lsn + ); } if flush_lsn < state.peer_horizon_lsn { warn!( diff --git a/safekeeper/tests/misc_test.rs b/safekeeper/tests/misc_test.rs index 8e5b17a143..8e54d2bb86 100644 --- a/safekeeper/tests/misc_test.rs +++ b/safekeeper/tests/misc_test.rs @@ -3,9 +3,9 @@ use std::sync::Arc; use tracing::{info, warn}; use utils::lsn::Lsn; -use crate::walproposer_sim::{ - log::{init_logger, init_tracing_logger}, - simulation::{generate_network_opts, generate_schedule, Schedule, TestAction, TestConfig}, +use crate::walproposer_sim::log::{init_logger, init_tracing_logger}; +use crate::walproposer_sim::simulation::{ + Schedule, TestAction, TestConfig, generate_network_opts, generate_schedule, }; pub mod walproposer_sim; diff --git a/safekeeper/tests/random_test.rs b/safekeeper/tests/random_test.rs index 1a932ef699..e29b58836a 100644 --- a/safekeeper/tests/random_test.rs +++ b/safekeeper/tests/random_test.rs @@ -1,11 +1,9 @@ use rand::Rng; use tracing::{info, warn}; -use crate::walproposer_sim::{ - log::{init_logger, init_tracing_logger}, - simulation::{generate_network_opts, generate_schedule, TestConfig}, - simulation_logs::validate_events, -}; +use crate::walproposer_sim::log::{init_logger, init_tracing_logger}; +use crate::walproposer_sim::simulation::{TestConfig, generate_network_opts, generate_schedule}; +use crate::walproposer_sim::simulation_logs::validate_events; pub mod walproposer_sim; @@ -18,7 +16,7 @@ fn test_random_schedules() -> anyhow::Result<()> { let mut config = TestConfig::new(Some(clock)); for _ in 0..500 { - let seed: u64 = rand::thread_rng().gen(); + let seed: u64 = rand::thread_rng().r#gen(); config.network = generate_network_opts(seed); let test = config.start(seed); diff --git a/safekeeper/tests/simple_test.rs b/safekeeper/tests/simple_test.rs index 0be9d0deef..f7b266e39c 100644 --- a/safekeeper/tests/simple_test.rs +++ b/safekeeper/tests/simple_test.rs @@ -1,7 +1,8 @@ use tracing::info; use utils::lsn::Lsn; -use crate::walproposer_sim::{log::init_logger, simulation::TestConfig}; +use crate::walproposer_sim::log::init_logger; +use crate::walproposer_sim::simulation::TestConfig; pub mod walproposer_sim; diff --git a/safekeeper/tests/walproposer_sim/log.rs b/safekeeper/tests/walproposer_sim/log.rs index 870f30de4f..e2ba3282ca 100644 --- a/safekeeper/tests/walproposer_sim/log.rs +++ b/safekeeper/tests/walproposer_sim/log.rs @@ -1,9 +1,11 @@ -use std::{fmt, sync::Arc}; +use std::fmt; +use std::sync::Arc; use desim::time::Timing; use once_cell::sync::OnceCell; use parking_lot::Mutex; -use tracing_subscriber::fmt::{format::Writer, time::FormatTime}; +use tracing_subscriber::fmt::format::Writer; +use tracing_subscriber::fmt::time::FormatTime; /// SimClock can be plugged into tracing logger to print simulation time. #[derive(Clone)] diff --git a/safekeeper/tests/walproposer_sim/safekeeper.rs b/safekeeper/tests/walproposer_sim/safekeeper.rs index 0023a4d22a..6ce1a9940e 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper.rs @@ -2,33 +2,30 @@ //! 
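Several of these crates are being moved to `edition = "2024"` (see the storage_broker and storage_controller Cargo.toml hunks later in the diff), which explains the mechanical changes around them: `gen` is a reserved keyword in the 2024 edition, so rand's `Rng::gen` has to be called through the raw identifier `r#gen` as in the random_test.rs hunk above, and attributes such as `export_name` must be wrapped in `unsafe(...)` further down; the wholesale import regrouping in most hunks is presumably the accompanying 2024 formatting style. A minimal standalone illustration of the raw-identifier call (example code, not from the PR):

```rust
// Example only: under edition 2024, `gen` is reserved, so the rand trait method
// must be invoked via the raw identifier.
use rand::Rng;

fn main() {
    let seed: u64 = rand::thread_rng().r#gen();
    println!("seed = {seed}");
}
```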
Gets messages from the network, passes them down to consensus module and //! sends replies back. -use std::{collections::HashMap, sync::Arc, time::Duration}; +use std::collections::HashMap; +use std::sync::Arc; +use std::time::Duration; -use anyhow::{bail, Result}; +use anyhow::{Result, bail}; use bytes::{Bytes, BytesMut}; use camino::Utf8PathBuf; -use desim::{ - executor::{self, PollSome}, - network::TCP, - node_os::NodeOs, - proto::{AnyMessage, NetEvent, NodeEvent}, -}; +use desim::executor::{self, PollSome}; +use desim::network::TCP; +use desim::node_os::NodeOs; +use desim::proto::{AnyMessage, NetEvent, NodeEvent}; use http::Uri; -use safekeeper::{ - safekeeper::{ - ProposerAcceptorMessage, SafeKeeper, SK_PROTOCOL_VERSION, UNKNOWN_SERVER_VERSION, - }, - state::{TimelinePersistentState, TimelineState}, - timeline::TimelineError, - wal_storage::Storage, - SafeKeeperConf, +use safekeeper::SafeKeeperConf; +use safekeeper::safekeeper::{ + ProposerAcceptorMessage, SK_PROTO_VERSION_3, SafeKeeper, UNKNOWN_SERVER_VERSION, }; -use safekeeper_api::{membership::Configuration, ServerInfo}; +use safekeeper::state::{TimelinePersistentState, TimelineState}; +use safekeeper::timeline::TimelineError; +use safekeeper::wal_storage::Storage; +use safekeeper_api::ServerInfo; +use safekeeper_api::membership::Configuration; use tracing::{debug, info_span, warn}; -use utils::{ - id::{NodeId, TenantId, TenantTimelineId, TimelineId}, - lsn::Lsn, -}; +use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId}; +use utils::lsn::Lsn; use super::safekeeper_disk::{DiskStateStorage, DiskWALStorage, SafekeeperDisk, TimelineDisk}; @@ -287,7 +284,7 @@ impl ConnState { bail!("finished processing START_REPLICATION") } - let msg = ProposerAcceptorMessage::parse(copy_data, SK_PROTOCOL_VERSION)?; + let msg = ProposerAcceptorMessage::parse(copy_data, SK_PROTO_VERSION_3)?; debug!("got msg: {:?}", msg); self.process(msg, global) } else { @@ -403,7 +400,7 @@ impl ConnState { // TODO: if this is AppendResponse, fill in proper hot standby feedback and disk consistent lsn let mut buf = BytesMut::with_capacity(128); - reply.serialize(&mut buf)?; + reply.serialize(&mut buf, SK_PROTO_VERSION_3)?; self.tcp.send(AnyMessage::Bytes(buf.into())); } diff --git a/safekeeper/tests/walproposer_sim/safekeeper_disk.rs b/safekeeper/tests/walproposer_sim/safekeeper_disk.rs index b854754ecf..94a849b5f0 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper_disk.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper_disk.rs @@ -1,22 +1,23 @@ use std::collections::HashMap; +use std::ops::Deref; use std::sync::Arc; - -use parking_lot::Mutex; -use safekeeper::state::TimelinePersistentState; -use utils::id::TenantTimelineId; - -use super::block_storage::BlockStorage; - -use std::{ops::Deref, time::Instant}; +use std::time::Instant; use anyhow::Result; use bytes::{Buf, BytesMut}; use futures::future::BoxFuture; -use postgres_ffi::{waldecoder::WalStreamDecoder, XLogSegNo}; -use safekeeper::{control_file, metrics::WalStorageMetrics, wal_storage}; +use parking_lot::Mutex; +use postgres_ffi::XLogSegNo; +use postgres_ffi::waldecoder::WalStreamDecoder; +use safekeeper::metrics::WalStorageMetrics; +use safekeeper::state::TimelinePersistentState; +use safekeeper::{control_file, wal_storage}; use tracing::{debug, info}; +use utils::id::TenantTimelineId; use utils::lsn::Lsn; +use super::block_storage::BlockStorage; + /// All safekeeper state that is usually saved to disk. 
pub struct SafekeeperDisk { pub timelines: Mutex>>, diff --git a/safekeeper/tests/walproposer_sim/simulation.rs b/safekeeper/tests/walproposer_sim/simulation.rs index fabf450eef..f314143952 100644 --- a/safekeeper/tests/walproposer_sim/simulation.rs +++ b/safekeeper/tests/walproposer_sim/simulation.rs @@ -1,23 +1,24 @@ -use std::{cell::Cell, str::FromStr, sync::Arc}; +use std::cell::Cell; +use std::str::FromStr; +use std::sync::Arc; -use crate::walproposer_sim::{safekeeper::run_server, walproposer_api::SimulationApi}; -use desim::{ - executor::{self, ExternalHandle}, - node_os::NodeOs, - options::{Delay, NetworkOptions}, - proto::{AnyMessage, NodeEvent}, - world::Node, - world::World, -}; +use desim::executor::{self, ExternalHandle}; +use desim::node_os::NodeOs; +use desim::options::{Delay, NetworkOptions}; +use desim::proto::{AnyMessage, NodeEvent}; +use desim::world::{Node, World}; use rand::{Rng, SeedableRng}; use tracing::{debug, info_span, warn}; -use utils::{id::TenantTimelineId, lsn::Lsn}; +use utils::id::TenantTimelineId; +use utils::lsn::Lsn; use walproposer::walproposer::{Config, Wrapper}; -use super::{ - log::SimClock, safekeeper_disk::SafekeeperDisk, walproposer_api, - walproposer_disk::DiskWalProposer, -}; +use super::log::SimClock; +use super::safekeeper_disk::SafekeeperDisk; +use super::walproposer_api; +use super::walproposer_disk::DiskWalProposer; +use crate::walproposer_sim::safekeeper::run_server; +use crate::walproposer_sim::walproposer_api::SimulationApi; /// Simulated safekeeper node. pub struct SafekeeperNode { diff --git a/safekeeper/tests/walproposer_sim/walproposer_api.rs b/safekeeper/tests/walproposer_sim/walproposer_api.rs index 5578c94cf6..6451589e80 100644 --- a/safekeeper/tests/walproposer_sim/walproposer_api.rs +++ b/safekeeper/tests/walproposer_sim/walproposer_api.rs @@ -1,26 +1,20 @@ -use std::{ - cell::{RefCell, RefMut, UnsafeCell}, - ffi::CStr, - sync::Arc, -}; +use std::cell::{RefCell, RefMut, UnsafeCell}; +use std::ffi::CStr; +use std::sync::Arc; use bytes::Bytes; -use desim::{ - executor::{self, PollSome}, - network::TCP, - node_os::NodeOs, - proto::{AnyMessage, NetEvent, NodeEvent}, - world::NodeId, -}; +use desim::executor::{self, PollSome}; +use desim::network::TCP; +use desim::node_os::NodeOs; +use desim::proto::{AnyMessage, NetEvent, NodeEvent}; +use desim::world::NodeId; use tracing::debug; use utils::lsn::Lsn; -use walproposer::{ - api_bindings::Level, - bindings::{ - NeonWALReadResult, SafekeeperStateDesiredEvents, WL_SOCKET_READABLE, WL_SOCKET_WRITEABLE, - }, - walproposer::{ApiImpl, Config}, +use walproposer::api_bindings::Level; +use walproposer::bindings::{ + NeonWALReadResult, SafekeeperStateDesiredEvents, WL_SOCKET_READABLE, WL_SOCKET_WRITEABLE, }; +use walproposer::walproposer::{ApiImpl, Config}; use super::walproposer_disk::DiskWalProposer; @@ -578,7 +572,9 @@ impl ApiImpl for SimulationApi { let disk_lsn = disk.lock().flush_rec_ptr().0; debug!("start_streaming at {} (disk_lsn={})", startpos, disk_lsn); if startpos < disk_lsn { - debug!("startpos < disk_lsn, it means we wrote some transaction even before streaming started"); + debug!( + "startpos < disk_lsn, it means we wrote some transaction even before streaming started" + ); } assert!(startpos <= disk_lsn); let mut broadcasted = Lsn(startpos); diff --git a/safekeeper/tests/walproposer_sim/walproposer_disk.rs b/safekeeper/tests/walproposer_sim/walproposer_disk.rs index 7dc7f48548..fe3eee8a5a 100644 --- a/safekeeper/tests/walproposer_sim/walproposer_disk.rs +++ 
b/safekeeper/tests/walproposer_sim/walproposer_disk.rs @@ -1,4 +1,5 @@ -use std::{ffi::CStr, sync::Arc}; +use std::ffi::CStr; +use std::sync::Arc; use parking_lot::{Mutex, MutexGuard}; use postgres_ffi::v16::wal_generator::{LogicalMessageGenerator, WalGenerator}; diff --git a/storage_broker/Cargo.toml b/storage_broker/Cargo.toml index 17d4aed63b..e4db9a317d 100644 --- a/storage_broker/Cargo.toml +++ b/storage_broker/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "storage_broker" version = "0.1.0" -edition.workspace = true +edition = "2024" license.workspace = true [features] diff --git a/storage_broker/benches/rps.rs b/storage_broker/benches/rps.rs index 1a6fb7fedf..86f2dd9a6c 100644 --- a/storage_broker/benches/rps.rs +++ b/storage_broker/benches/rps.rs @@ -1,18 +1,14 @@ -use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::Arc; +use std::sync::atomic::{AtomicU64, Ordering}; use std::time::{Duration, Instant}; use clap::Parser; - -use storage_broker::proto::SafekeeperTimelineInfo; use storage_broker::proto::{ - FilterTenantTimelineId, MessageType, SubscribeByFilterRequest, + FilterTenantTimelineId, MessageType, SafekeeperTimelineInfo, SubscribeByFilterRequest, TenantTimelineId as ProtoTenantTimelineId, TypeSubscription, TypedMessage, }; - use storage_broker::{BrokerClientChannel, DEFAULT_ENDPOINT}; use tokio::time; - use tonic::Request; const ABOUT: &str = r#" diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs index 9d4c22484c..cc33ec20ff 100644 --- a/storage_broker/src/bin/storage_broker.rs +++ b/storage_broker/src/bin/storage_broker.rs @@ -10,7 +10,14 @@ //! //! Only safekeeper message is supported, but it is not hard to add something //! else with generics. -use clap::{command, Parser}; +use std::collections::HashMap; +use std::convert::Infallible; +use std::net::SocketAddr; +use std::pin::Pin; +use std::sync::Arc; +use std::time::Duration; + +use clap::{Parser, command}; use futures_core::Stream; use futures_util::StreamExt; use http_body_util::Full; @@ -19,27 +26,10 @@ use hyper::header::CONTENT_TYPE; use hyper::service::service_fn; use hyper::{Method, StatusCode}; use hyper_util::rt::{TokioExecutor, TokioIo, TokioTimer}; -use parking_lot::RwLock; -use std::collections::HashMap; -use std::convert::Infallible; -use std::net::SocketAddr; -use std::pin::Pin; -use std::sync::Arc; -use std::time::Duration; -use tokio::net::TcpListener; -use tokio::sync::broadcast; -use tokio::sync::broadcast::error::RecvError; -use tokio::time; -use tonic::body::{self, empty_body, BoxBody}; -use tonic::codegen::Service; -use tonic::Code; -use tonic::{Request, Response, Status}; -use tracing::*; -use utils::signals::ShutdownSignals; - use metrics::{Encoder, TextEncoder}; +use parking_lot::RwLock; use storage_broker::metrics::{ - BROADCASTED_MESSAGES_TOTAL, BROADCAST_DROPPED_MESSAGES_TOTAL, NUM_PUBS, NUM_SUBS_ALL, + BROADCAST_DROPPED_MESSAGES_TOTAL, BROADCASTED_MESSAGES_TOTAL, NUM_PUBS, NUM_SUBS_ALL, NUM_SUBS_TIMELINE, PROCESSED_MESSAGES_TOTAL, PUBLISHED_ONEOFF_MESSAGES_TOTAL, }; use storage_broker::proto::broker_service_server::{BrokerService, BrokerServiceServer}; @@ -48,10 +38,19 @@ use storage_broker::proto::{ FilterTenantTimelineId, MessageType, SafekeeperDiscoveryRequest, SafekeeperDiscoveryResponse, SafekeeperTimelineInfo, SubscribeByFilterRequest, SubscribeSafekeeperInfoRequest, TypedMessage, }; -use storage_broker::{parse_proto_ttid, DEFAULT_KEEPALIVE_INTERVAL, DEFAULT_LISTEN_ADDR}; +use storage_broker::{DEFAULT_KEEPALIVE_INTERVAL, 
DEFAULT_LISTEN_ADDR, parse_proto_ttid}; +use tokio::net::TcpListener; +use tokio::sync::broadcast; +use tokio::sync::broadcast::error::RecvError; +use tokio::time; +use tonic::body::{self, BoxBody, empty_body}; +use tonic::codegen::Service; +use tonic::{Code, Request, Response, Status}; +use tracing::*; use utils::id::TenantTimelineId; use utils::logging::{self, LogFormat}; use utils::sentry_init::init_sentry; +use utils::signals::ShutdownSignals; use utils::{project_build_tag, project_git_version}; project_git_version!(GIT_VERSION); @@ -743,11 +742,12 @@ async fn main() -> Result<(), Box> { #[cfg(test)] mod tests { - use super::*; use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; use tokio::sync::broadcast::error::TryRecvError; use utils::id::{TenantId, TimelineId}; + use super::*; + fn msg(timeline_id: Vec) -> Message { Message::SafekeeperTimelineInfo(SafekeeperTimelineInfo { safekeeper_id: 1, diff --git a/storage_broker/src/lib.rs b/storage_broker/src/lib.rs index 3ac40f6e14..55d411f607 100644 --- a/storage_broker/src/lib.rs +++ b/storage_broker/src/lib.rs @@ -1,12 +1,11 @@ use std::time::Duration; -use tonic::codegen::StdError; -use tonic::transport::{ClientTlsConfig, Endpoint}; -use tonic::{transport::Channel, Status}; -use utils::id::{TenantId, TenantTimelineId, TimelineId}; -use proto::{ - broker_service_client::BrokerServiceClient, TenantTimelineId as ProtoTenantTimelineId, -}; +use proto::TenantTimelineId as ProtoTenantTimelineId; +use proto::broker_service_client::BrokerServiceClient; +use tonic::Status; +use tonic::codegen::StdError; +use tonic::transport::{Channel, ClientTlsConfig, Endpoint}; +use utils::id::{TenantId, TenantTimelineId, TimelineId}; // Code generated by protobuf. pub mod proto { @@ -20,11 +19,8 @@ pub mod proto { pub mod metrics; // Re-exports to avoid direct tonic dependency in user crates. -pub use tonic::Code; -pub use tonic::Request; -pub use tonic::Streaming; - pub use hyper::Uri; +pub use tonic::{Code, Request, Streaming}; pub const DEFAULT_LISTEN_ADDR: &str = "127.0.0.1:50051"; pub const DEFAULT_ENDPOINT: &str = const_format::formatcp!("http://{DEFAULT_LISTEN_ADDR}"); diff --git a/storage_broker/src/metrics.rs b/storage_broker/src/metrics.rs index 1fd3dd5ad6..ecfb594eba 100644 --- a/storage_broker/src/metrics.rs +++ b/storage_broker/src/metrics.rs @@ -1,6 +1,6 @@ //! Broker metrics. 
-use metrics::{register_int_counter, register_int_gauge, IntCounter, IntGauge}; +use metrics::{IntCounter, IntGauge, register_int_counter, register_int_gauge}; use once_cell::sync::Lazy; pub static NUM_PUBS: Lazy = Lazy::new(|| { diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml index 73dc1a5c10..b63ba154da 100644 --- a/storage_controller/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "storage_controller" version = "0.1.0" -edition.workspace = true +edition = "2024" license.workspace = true [[bin]] @@ -18,12 +18,14 @@ anyhow.workspace = true bytes.workspace = true chrono.workspace = true clap.workspace = true +cron.workspace = true fail.workspace = true futures.workspace = true hex.workspace = true hyper0.workspace = true humantime.workspace = true itertools.workspace = true +json-structural-diff.workspace = true lasso.workspace = true once_cell.workspace = true pageserver_api.workspace = true diff --git a/storage_controller/migrations/2025-02-11-144848_pageserver_use_https/down.sql b/storage_controller/migrations/2025-02-11-144848_pageserver_use_https/down.sql new file mode 100644 index 0000000000..0f051d3ac3 --- /dev/null +++ b/storage_controller/migrations/2025-02-11-144848_pageserver_use_https/down.sql @@ -0,0 +1 @@ +ALTER TABLE nodes DROP listen_https_port; diff --git a/storage_controller/migrations/2025-02-11-144848_pageserver_use_https/up.sql b/storage_controller/migrations/2025-02-11-144848_pageserver_use_https/up.sql new file mode 100644 index 0000000000..172237d477 --- /dev/null +++ b/storage_controller/migrations/2025-02-11-144848_pageserver_use_https/up.sql @@ -0,0 +1 @@ +ALTER TABLE nodes ADD listen_https_port INTEGER; diff --git a/storage_controller/src/background_node_operations.rs b/storage_controller/src/background_node_operations.rs index 226d4942e7..a630316f46 100644 --- a/storage_controller/src/background_node_operations.rs +++ b/storage_controller/src/background_node_operations.rs @@ -1,4 +1,5 @@ -use std::{borrow::Cow, fmt::Debug, fmt::Display}; +use std::borrow::Cow; +use std::fmt::{Debug, Display}; use tokio_util::sync::CancellationToken; use utils::id::NodeId; diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs index 5bc3c81f02..b602af362d 100644 --- a/storage_controller/src/compute_hook.rs +++ b/storage_controller/src/compute_hook.rs @@ -1,7 +1,8 @@ use std::borrow::Cow; +use std::collections::HashMap; use std::error::Error as _; use std::sync::Arc; -use std::{collections::HashMap, time::Duration}; +use std::time::Duration; use control_plane::endpoint::{ComputeControlPlane, EndpointStatus}; use control_plane::local_env::LocalEnv; @@ -12,11 +13,9 @@ use pageserver_api::shard::{ShardCount, ShardNumber, ShardStripeSize, TenantShar use postgres_connection::parse_host_port; use serde::{Deserialize, Serialize}; use tokio_util::sync::CancellationToken; -use tracing::{info_span, Instrument}; -use utils::{ - backoff::{self}, - id::{NodeId, TenantId}, -}; +use tracing::{Instrument, info_span}; +use utils::backoff::{self}; +use utils::id::{NodeId, TenantId}; use crate::service::Config; diff --git a/storage_controller/src/drain_utils.rs b/storage_controller/src/drain_utils.rs index 8b7be88078..bd4b8ba38f 100644 --- a/storage_controller/src/drain_utils.rs +++ b/storage_controller/src/drain_utils.rs @@ -1,15 +1,14 @@ -use std::{ - collections::{BTreeMap, HashMap}, - sync::Arc, -}; +use std::collections::{BTreeMap, HashMap}; +use std::sync::Arc; use 
pageserver_api::controller_api::{NodeSchedulingPolicy, ShardSchedulingPolicy}; -use utils::{id::NodeId, shard::TenantShardId}; +use utils::id::NodeId; +use utils::shard::TenantShardId; -use crate::{ - background_node_operations::OperationError, node::Node, scheduler::Scheduler, - tenant_shard::TenantShard, -}; +use crate::background_node_operations::OperationError; +use crate::node::Node; +use crate::scheduler::Scheduler; +use crate::tenant_shard::TenantShard; pub(crate) struct TenantShardIterator { tenants_accessor: F, @@ -188,10 +187,8 @@ impl TenantShardDrain { mod tests { use std::sync::Arc; - use utils::{ - id::TenantId, - shard::{ShardCount, ShardNumber, TenantShardId}, - }; + use utils::id::TenantId; + use utils::shard::{ShardCount, ShardNumber, TenantShardId}; use super::TenantShardIterator; diff --git a/storage_controller/src/heartbeater.rs b/storage_controller/src/heartbeater.rs index 52b6110667..56a331becd 100644 --- a/storage_controller/src/heartbeater.rs +++ b/storage_controller/src/heartbeater.rs @@ -1,24 +1,22 @@ -use futures::{stream::FuturesUnordered, StreamExt}; +use std::collections::HashMap; +use std::fmt::Debug; +use std::future::Future; +use std::sync::Arc; +use std::time::{Duration, Instant}; + +use futures::StreamExt; +use futures::stream::FuturesUnordered; +use pageserver_api::controller_api::{NodeAvailability, SkSchedulingPolicy}; +use pageserver_api::models::PageserverUtilization; use safekeeper_api::models::SafekeeperUtilization; use safekeeper_client::mgmt_api; -use std::{ - collections::HashMap, - fmt::Debug, - future::Future, - sync::Arc, - time::{Duration, Instant}, -}; -use tokio_util::sync::CancellationToken; - -use pageserver_api::{ - controller_api::{NodeAvailability, SkSchedulingPolicy}, - models::PageserverUtilization, -}; - use thiserror::Error; -use utils::{id::NodeId, logging::SecretString}; +use tokio_util::sync::CancellationToken; +use utils::id::NodeId; +use utils::logging::SecretString; -use crate::{node::Node, safekeeper::Safekeeper}; +use crate::node::Node; +use crate::safekeeper::Safekeeper; struct HeartbeaterTask { receiver: tokio::sync::mpsc::UnboundedReceiver>, @@ -223,21 +221,21 @@ impl HeartBeat for HeartbeaterTask Some((*node_id, status)) } }); + } - loop { - let maybe_status = tokio::select! { - next = heartbeat_futs.next() => { - match next { - Some(result) => result, - None => { break; } - } - }, - _ = self.cancel.cancelled() => { return Err(HeartbeaterError::Cancel); } - }; + loop { + let maybe_status = tokio::select! { + next = heartbeat_futs.next() => { + match next { + Some(result) => result, + None => { break; } + } + }, + _ = self.cancel.cancelled() => { return Err(HeartbeaterError::Cancel); } + }; - if let Some((node_id, status)) = maybe_status { - new_state.insert(node_id, status); - } + if let Some((node_id, status)) = maybe_status { + new_state.insert(node_id, status); } } @@ -363,21 +361,21 @@ impl HeartBeat for HeartbeaterTask { - match next { - Some(result) => result, - None => { break; } - } - }, - _ = self.cancel.cancelled() => { return Err(HeartbeaterError::Cancel); } - }; + loop { + let maybe_status = tokio::select! 
{ + next = heartbeat_futs.next() => { + match next { + Some(result) => result, + None => { break; } + } + }, + _ = self.cancel.cancelled() => { return Err(HeartbeaterError::Cancel); } + }; - if let Some((node_id, status)) = maybe_status { - new_state.insert(node_id, status); - } + if let Some((node_id, status)) = maybe_status { + new_state.insert(node_id, status); } } diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 1cc61a12e8..5b5ae80eaf 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -1,32 +1,27 @@ -use crate::http; -use crate::metrics::{ - HttpRequestLatencyLabelGroup, HttpRequestStatusLabelGroup, PageserverRequestLabelGroup, - METRICS_REGISTRY, -}; -use crate::persistence::SafekeeperUpsert; -use crate::reconciler::ReconcileError; -use crate::service::{LeadershipStatus, Service, RECONCILE_TIMEOUT, STARTUP_RECONCILE_TIMEOUT}; +use std::str::FromStr; +use std::sync::Arc; +use std::time::{Duration, Instant}; + use anyhow::Context; +use control_plane::storage_controller::{AttachHookRequest, InspectRequest}; use futures::Future; -use http_utils::{ - endpoint::{ - self, auth_middleware, check_permission_with, profile_cpu_handler, profile_heap_handler, - request_span, - }, - error::ApiError, - failpoints::failpoints_handler, - json::{json_request, json_response}, - request::{must_get_query_param, parse_query_param, parse_request_param}, - RequestExt, RouterBuilder, +use http_utils::endpoint::{ + self, auth_middleware, check_permission_with, profile_cpu_handler, profile_heap_handler, + request_span, }; +use http_utils::error::ApiError; +use http_utils::failpoints::failpoints_handler; +use http_utils::json::{json_request, json_response}; +use http_utils::request::{must_get_query_param, parse_query_param, parse_request_param}; +use http_utils::{RequestExt, RouterBuilder}; use hyper::header::CONTENT_TYPE; -use hyper::{Body, Request, Response}; -use hyper::{StatusCode, Uri}; +use hyper::{Body, Request, Response, StatusCode, Uri}; use metrics::{BuildInfo, NeonMetrics}; use pageserver_api::controller_api::{ MetadataHealthListOutdatedRequest, MetadataHealthListOutdatedResponse, MetadataHealthListUnhealthyResponse, MetadataHealthUpdateRequest, MetadataHealthUpdateResponse, - SafekeeperSchedulingPolicyRequest, ShardsPreferredAzsRequest, TenantCreateRequest, + NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, SafekeeperSchedulingPolicyRequest, + ShardsPreferredAzsRequest, TenantCreateRequest, TenantPolicyRequest, TenantShardMigrateRequest, }; use pageserver_api::models::{ TenantConfigPatchRequest, TenantConfigRequest, TenantLocationConfigRequest, @@ -34,23 +29,21 @@ use pageserver_api::models::{ TimelineCreateRequest, }; use pageserver_api::shard::TenantShardId; -use pageserver_client::{mgmt_api, BlockUnblock}; -use std::str::FromStr; -use std::sync::Arc; -use std::time::{Duration, Instant}; +use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest}; +use pageserver_client::{BlockUnblock, mgmt_api}; +use routerify::Middleware; use tokio_util::sync::CancellationToken; use utils::auth::{Scope, SwappableJwtAuth}; use utils::id::{NodeId, TenantId, TimelineId}; -use pageserver_api::controller_api::{ - NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, TenantPolicyRequest, - TenantShardMigrateRequest, +use crate::http; +use crate::metrics::{ + HttpRequestLatencyLabelGroup, HttpRequestStatusLabelGroup, METRICS_REGISTRY, + PageserverRequestLabelGroup, }; -use pageserver_api::upcall_api::{ReAttachRequest, 
ValidateRequest}; - -use control_plane::storage_controller::{AttachHookRequest, InspectRequest}; - -use routerify::Middleware; +use crate::persistence::SafekeeperUpsert; +use crate::reconciler::ReconcileError; +use crate::service::{LeadershipStatus, RECONCILE_TIMEOUT, STARTUP_RECONCILE_TIMEOUT, Service}; /// State available to HTTP request handlers pub struct HttpState { @@ -598,7 +591,10 @@ async fn handle_tenant_timeline_passthrough( let _timer = latency.start_timer(labels.clone()); - let client = mgmt_api::Client::new(node.base_url(), service.get_config().jwt_token.as_deref()); + let client = mgmt_api::Client::new( + node.base_url(), + service.get_config().pageserver_jwt_token.as_deref(), + ); let resp = client.get_raw(path).await.map_err(|e| // We return 503 here because if we can't successfully send a request to the pageserver, // either we aren't available or the pageserver is unavailable. @@ -1354,10 +1350,7 @@ async fn handle_safekeeper_scheduling_policy( .set_safekeeper_scheduling_policy(id, body.scheduling_policy) .await?; - Ok(Response::builder() - .status(StatusCode::NO_CONTENT) - .body(Body::empty()) - .unwrap()) + json_response(StatusCode::OK, ()) } /// Common wrapper for request handlers that call into Service and will operate on tenants: they must only @@ -1455,8 +1448,8 @@ pub fn prologue_leadership_status_check_middleware< }) } -fn prologue_metrics_middleware( -) -> Middleware { +fn prologue_metrics_middleware() +-> Middleware { Middleware::pre(move |req| async move { let meta = RequestMeta { method: req.method().clone(), @@ -1469,8 +1462,8 @@ fn prologue_metrics_middleware }) } -fn epilogue_metrics_middleware( -) -> Middleware { +fn epilogue_metrics_middleware() +-> Middleware { Middleware::post_with_info(move |resp, req_info| async move { let request_name = match req_info.context::() { Some(name) => name, @@ -1621,8 +1614,8 @@ async fn maybe_forward(req: Request) -> ForwardOutcome { Err(err) => { return ForwardOutcome::Forwarded(Err(ApiError::InternalServerError( anyhow::anyhow!( - "Failed to parse leader uri for forwarding while in stepped down state: {err}" - ), + "Failed to parse leader uri for forwarding while in stepped down state: {err}" + ), ))); } }; @@ -2155,8 +2148,23 @@ mod test { #[test] fn test_path_without_ids() { - assert_eq!(path_without_ids("/v1/tenant/1a2b3344556677881122334455667788/timeline/AA223344556677881122334455667788"), "/v1/tenant//timeline/"); - assert_eq!(path_without_ids("/v1/tenant/1a2b3344556677881122334455667788-0108/timeline/AA223344556677881122334455667788"), "/v1/tenant//timeline/"); - assert_eq!(path_without_ids("/v1/tenant/1a2b3344556677881122334455667788-0108/timeline/AA223344556677881122334455667788?parameter=foo"), "/v1/tenant//timeline/"); + assert_eq!( + path_without_ids( + "/v1/tenant/1a2b3344556677881122334455667788/timeline/AA223344556677881122334455667788" + ), + "/v1/tenant//timeline/" + ); + assert_eq!( + path_without_ids( + "/v1/tenant/1a2b3344556677881122334455667788-0108/timeline/AA223344556677881122334455667788" + ), + "/v1/tenant//timeline/" + ); + assert_eq!( + path_without_ids( + "/v1/tenant/1a2b3344556677881122334455667788-0108/timeline/AA223344556677881122334455667788?parameter=foo" + ), + "/v1/tenant//timeline/" + ); } } diff --git a/storage_controller/src/id_lock_map.rs b/storage_controller/src/id_lock_map.rs index 2d8b674f86..6b0c16f0be 100644 --- a/storage_controller/src/id_lock_map.rs +++ b/storage_controller/src/id_lock_map.rs @@ -1,8 +1,7 @@ +use std::collections::HashMap; use std::fmt::Display; -use 
std::time::Instant; -use std::{collections::HashMap, sync::Arc}; - -use std::time::Duration; +use std::sync::Arc; +use std::time::{Duration, Instant}; use crate::service::RECONCILE_TIMEOUT; diff --git a/storage_controller/src/leadership.rs b/storage_controller/src/leadership.rs index 5fae8991ec..5e1d6f3ec9 100644 --- a/storage_controller/src/leadership.rs +++ b/storage_controller/src/leadership.rs @@ -3,11 +3,9 @@ use std::sync::Arc; use hyper::Uri; use tokio_util::sync::CancellationToken; -use crate::{ - peer_client::{GlobalObservedState, PeerClient}, - persistence::{ControllerPersistence, DatabaseError, DatabaseResult, Persistence}, - service::Config, -}; +use crate::peer_client::{GlobalObservedState, PeerClient}; +use crate::persistence::{ControllerPersistence, DatabaseError, DatabaseResult, Persistence}; +use crate::service::Config; /// Helper for storage controller leadership acquisition pub(crate) struct Leadership { @@ -91,7 +89,9 @@ impl Leadership { // Special case: if this is a brand new storage controller, migrations will not // have run at this point yet, and, hence, the controllers table does not exist. // Detect this case via the error string (diesel doesn't type it) and allow it. - tracing::info!("Detected first storage controller start-up. Allowing missing controllers table ..."); + tracing::info!( + "Detected first storage controller start-up. Allowing missing controllers table ..." + ); return Ok(None); } } diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 9a9958f7a6..04dd3bb3f6 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -1,26 +1,26 @@ -use anyhow::{anyhow, Context}; -use clap::Parser; -use hyper0::Uri; -use metrics::launch_timestamp::LaunchTimestamp; -use metrics::BuildInfo; use std::path::PathBuf; use std::sync::Arc; use std::time::Duration; + +use anyhow::{Context, anyhow}; +use clap::Parser; +use hyper0::Uri; +use metrics::BuildInfo; +use metrics::launch_timestamp::LaunchTimestamp; use storage_controller::http::make_router; use storage_controller::metrics::preinitialize_metrics; use storage_controller::persistence::Persistence; use storage_controller::service::chaos_injector::ChaosInjector; use storage_controller::service::{ - Config, Service, HEARTBEAT_INTERVAL_DEFAULT, LONG_RECONCILE_THRESHOLD_DEFAULT, + Config, HEARTBEAT_INTERVAL_DEFAULT, LONG_RECONCILE_THRESHOLD_DEFAULT, MAX_OFFLINE_INTERVAL_DEFAULT, MAX_WARMING_UP_INTERVAL_DEFAULT, - PRIORITY_RECONCILER_CONCURRENCY_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT, + PRIORITY_RECONCILER_CONCURRENCY_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT, Service, }; use tokio::signal::unix::SignalKind; use tokio_util::sync::CancellationToken; use tracing::Instrument; use utils::auth::{JwtAuth, SwappableJwtAuth}; use utils::logging::{self, LogFormat}; - use utils::sentry_init::init_sentry; use utils::{project_build_tag, project_git_version, tcp_listener}; @@ -34,7 +34,7 @@ static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; /// This adds roughly 3% overhead for allocations on average, which is acceptable considering /// performance-sensitive code will avoid allocations as far as possible anyway. 
#[allow(non_upper_case_globals)] -#[export_name = "malloc_conf"] +#[unsafe(export_name = "malloc_conf")] pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0"; #[derive(Parser)] @@ -53,6 +53,10 @@ struct Cli { #[arg(long)] jwt_token: Option, + /// Token for authenticating this service with the safekeepers it controls + #[arg(long)] + safekeeper_jwt_token: Option, + /// Token for authenticating this service with the control plane, when calling /// the compute notification endpoint #[arg(long)] @@ -111,10 +115,14 @@ struct Cli { #[arg(long)] neon_local_repo_dir: Option, - /// Chaos testing + /// Chaos testing: exercise tenant migrations #[arg(long)] chaos_interval: Option, + /// Chaos testing: exercise an immediate exit + #[arg(long)] + chaos_exit_crontab: Option, + // Maximum acceptable lag for the secondary location while draining // a pageserver #[arg(long)] @@ -126,6 +134,10 @@ struct Cli { #[arg(long)] long_reconcile_threshold: Option, + + // Flag to use https for requests to pageserver API. + #[arg(long, default_value = "false")] + use_https_pageserver_api: bool, } enum StrictMode { @@ -149,7 +161,8 @@ impl Default for StrictMode { struct Secrets { database_url: String, public_key: Option, - jwt_token: Option, + pageserver_jwt_token: Option, + safekeeper_jwt_token: Option, control_plane_jwt_token: Option, peer_jwt_token: Option, } @@ -157,6 +170,7 @@ struct Secrets { impl Secrets { const DATABASE_URL_ENV: &'static str = "DATABASE_URL"; const PAGESERVER_JWT_TOKEN_ENV: &'static str = "PAGESERVER_JWT_TOKEN"; + const SAFEKEEPER_JWT_TOKEN_ENV: &'static str = "SAFEKEEPER_JWT_TOKEN"; const CONTROL_PLANE_JWT_TOKEN_ENV: &'static str = "CONTROL_PLANE_JWT_TOKEN"; const PEER_JWT_TOKEN_ENV: &'static str = "PEER_JWT_TOKEN"; const PUBLIC_KEY_ENV: &'static str = "PUBLIC_KEY"; @@ -180,7 +194,14 @@ impl Secrets { let this = Self { database_url, public_key, - jwt_token: Self::load_secret(&args.jwt_token, Self::PAGESERVER_JWT_TOKEN_ENV), + pageserver_jwt_token: Self::load_secret( + &args.jwt_token, + Self::PAGESERVER_JWT_TOKEN_ENV, + ), + safekeeper_jwt_token: Self::load_secret( + &args.safekeeper_jwt_token, + Self::SAFEKEEPER_JWT_TOKEN_ENV, + ), control_plane_jwt_token: Self::load_secret( &args.control_plane_jwt_token, Self::CONTROL_PLANE_JWT_TOKEN_ENV, @@ -260,18 +281,24 @@ async fn async_main() -> anyhow::Result<()> { let secrets = Secrets::load(&args).await?; + // TODO: once we've rolled out the safekeeper JWT token everywhere, put it into the validation code below + tracing::info!( + "safekeeper_jwt_token set: {:?}", + secrets.safekeeper_jwt_token.is_some() + ); + // Validate required secrets and arguments are provided in strict mode match strict_mode { StrictMode::Strict if (secrets.public_key.is_none() - || secrets.jwt_token.is_none() + || secrets.pageserver_jwt_token.is_none() || secrets.control_plane_jwt_token.is_none()) => { // Production systems should always have secrets configured: if public_key was not set // then we would implicitly disable auth. anyhow::bail!( - "Insecure config! One or more secrets is not set. This is only permitted in `--dev` mode" - ); + "Insecure config! One or more secrets is not set. 
This is only permitted in `--dev` mode" + ); } StrictMode::Strict if args.compute_hook_url.is_none() => { // Production systems should always have a compute hook set, to prevent falling @@ -289,7 +316,8 @@ async fn async_main() -> anyhow::Result<()> { } let config = Config { - jwt_token: secrets.jwt_token, + pageserver_jwt_token: secrets.pageserver_jwt_token, + safekeeper_jwt_token: secrets.safekeeper_jwt_token, control_plane_jwt_token: secrets.control_plane_jwt_token, peer_jwt_token: secrets.peer_jwt_token, compute_hook_url: args.compute_hook_url, @@ -321,6 +349,7 @@ async fn async_main() -> anyhow::Result<()> { address_for_peers: args.address_for_peers, start_as_candidate: args.start_as_candidate, http_service_port: args.listen.port() as i32, + use_https_pageserver_api: args.use_https_pageserver_api, }; // Validate that we can connect to the database @@ -357,10 +386,12 @@ async fn async_main() -> anyhow::Result<()> { let service = service.clone(); let cancel = CancellationToken::new(); let cancel_bg = cancel.clone(); + let chaos_exit_crontab = args.chaos_exit_crontab; ( tokio::task::spawn( async move { - let mut chaos_injector = ChaosInjector::new(service, interval.into()); + let mut chaos_injector = + ChaosInjector::new(service, interval.into(), chaos_exit_crontab); chaos_injector.run(cancel_bg).await } .instrument(tracing::info_span!("chaos_injector")), diff --git a/storage_controller/src/metrics.rs b/storage_controller/src/metrics.rs index 6d67e0d130..f490edb68f 100644 --- a/storage_controller/src/metrics.rs +++ b/storage_controller/src/metrics.rs @@ -7,17 +7,18 @@ //! //! The rest of the code defines label group types and deals with converting outer types to labels. //! +use std::sync::Mutex; + use bytes::Bytes; -use measured::{label::LabelValue, metric::histogram, FixedCardinalityLabel, MetricGroup}; +use measured::label::LabelValue; +use measured::metric::histogram; +use measured::{FixedCardinalityLabel, MetricGroup}; use metrics::NeonMetrics; use once_cell::sync::Lazy; -use std::sync::Mutex; use strum::IntoEnumIterator; -use crate::{ - persistence::{DatabaseError, DatabaseOperation}, - service::LeadershipStatus, -}; +use crate::persistence::{DatabaseError, DatabaseOperation}; +use crate::service::LeadershipStatus; pub(crate) static METRICS_REGISTRY: Lazy = Lazy::new(StorageControllerMetrics::default); diff --git a/storage_controller/src/node.rs b/storage_controller/src/node.rs index f5c2d329e0..bc7fe8802a 100644 --- a/storage_controller/src/node.rs +++ b/storage_controller/src/node.rs @@ -1,21 +1,22 @@ -use std::{str::FromStr, time::Duration}; +use std::str::FromStr; +use std::time::Duration; -use pageserver_api::{ - controller_api::{ - AvailabilityZone, NodeAvailability, NodeDescribeResponse, NodeRegisterRequest, - NodeSchedulingPolicy, TenantLocateResponseShard, - }, - shard::TenantShardId, +use anyhow::anyhow; +use pageserver_api::controller_api::{ + AvailabilityZone, NodeAvailability, NodeDescribeResponse, NodeRegisterRequest, + NodeSchedulingPolicy, TenantLocateResponseShard, }; +use pageserver_api::shard::TenantShardId; use pageserver_client::mgmt_api; use reqwest::StatusCode; use serde::Serialize; use tokio_util::sync::CancellationToken; -use utils::{backoff, id::NodeId}; +use utils::backoff; +use utils::id::NodeId; -use crate::{ - pageserver_client::PageserverClient, persistence::NodePersistence, scheduler::MaySchedule, -}; +use crate::pageserver_client::PageserverClient; +use crate::persistence::NodePersistence; +use crate::scheduler::MaySchedule; /// Represents the 
in-memory description of a Node. /// @@ -32,12 +33,16 @@ pub(crate) struct Node { listen_http_addr: String, listen_http_port: u16, + listen_https_port: Option, listen_pg_addr: String, listen_pg_port: u16, availability_zone_id: AvailabilityZone, + // Flag from storcon's config to use https for pageserver admin API. + // Invariant: if |true|, listen_https_port should contain a value. + use_https: bool, // This cancellation token means "stop any RPCs in flight to this node, and don't start // any more". It is not related to process shutdown. #[serde(skip)] @@ -56,7 +61,16 @@ pub(crate) enum AvailabilityTransition { impl Node { pub(crate) fn base_url(&self) -> String { - format!("http://{}:{}", self.listen_http_addr, self.listen_http_port) + if self.use_https { + format!( + "https://{}:{}", + self.listen_http_addr, + self.listen_https_port + .expect("https port should be specified if use_https is on") + ) + } else { + format!("http://{}:{}", self.listen_http_addr, self.listen_http_port) + } } pub(crate) fn get_id(&self) -> NodeId { @@ -82,11 +96,20 @@ impl Node { self.id == register_req.node_id && self.listen_http_addr == register_req.listen_http_addr && self.listen_http_port == register_req.listen_http_port + // Note: listen_https_port may change. See [`Self::need_update`] for mode details. + // && self.listen_https_port == register_req.listen_https_port && self.listen_pg_addr == register_req.listen_pg_addr && self.listen_pg_port == register_req.listen_pg_port && self.availability_zone_id == register_req.availability_zone_id } + // Do we need to update an existing record in DB on this registration request? + pub(crate) fn need_update(&self, register_req: &NodeRegisterRequest) -> bool { + // listen_https_port is checked here because it may change during migration to https. + // After migration, this check may be moved to registration_match. + self.listen_https_port != register_req.listen_https_port + } + /// For a shard located on this node, populate a response object /// with this node's address information. 
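The new use_https flag and listen_https_port field change how the controller builds pageserver management API URLs. Below is a minimal, standalone sketch of the invariant, not taken from the patch: Endpoint is a stand-in for Node, and the host and ports are invented. When use_https is set, an https port must be present (validated at construction in the real code) and takes precedence over the http port.

// Stand-in for Node: only the fields involved in URL selection.
struct Endpoint {
    host: String,
    http_port: u16,
    https_port: Option<u16>,
    use_https: bool,
}

impl Endpoint {
    // Mirrors the base_url() logic above: https wins when enabled, and the
    // https port is expected to exist because construction validated it.
    fn base_url(&self) -> String {
        if self.use_https {
            let port = self
                .https_port
                .expect("https port should be specified if use_https is on");
            format!("https://{}:{}", self.host, port)
        } else {
            format!("http://{}:{}", self.host, self.http_port)
        }
    }
}

fn main() {
    let node = Endpoint {
        host: "pageserver-1.local".to_string(),
        http_port: 9898,
        https_port: Some(9899),
        use_https: true,
    };
    assert_eq!(node.base_url(), "https://pageserver-1.local:9899");
}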
pub(crate) fn shard_location(&self, shard_id: TenantShardId) -> TenantLocateResponseShard { @@ -95,6 +118,7 @@ impl Node { node_id: self.id, listen_http_addr: self.listen_http_addr.clone(), listen_http_port: self.listen_http_port, + listen_https_port: self.listen_https_port, listen_pg_addr: self.listen_pg_addr.clone(), listen_pg_port: self.listen_pg_port, } @@ -175,25 +199,34 @@ impl Node { } } + #[allow(clippy::too_many_arguments)] pub(crate) fn new( id: NodeId, listen_http_addr: String, listen_http_port: u16, + listen_https_port: Option, listen_pg_addr: String, listen_pg_port: u16, availability_zone_id: AvailabilityZone, - ) -> Self { - Self { + use_https: bool, + ) -> anyhow::Result { + if use_https && listen_https_port.is_none() { + return Err(anyhow!("https is enabled, but node has no https port")); + } + + Ok(Self { id, listen_http_addr, listen_http_port, + listen_https_port, listen_pg_addr, listen_pg_port, scheduling: NodeSchedulingPolicy::Active, availability: NodeAvailability::Offline, availability_zone_id, + use_https, cancel: CancellationToken::new(), - } + }) } pub(crate) fn to_persistent(&self) -> NodePersistence { @@ -202,14 +235,19 @@ impl Node { scheduling_policy: self.scheduling.into(), listen_http_addr: self.listen_http_addr.clone(), listen_http_port: self.listen_http_port as i32, + listen_https_port: self.listen_https_port.map(|x| x as i32), listen_pg_addr: self.listen_pg_addr.clone(), listen_pg_port: self.listen_pg_port as i32, availability_zone_id: self.availability_zone_id.0.clone(), } } - pub(crate) fn from_persistent(np: NodePersistence) -> Self { - Self { + pub(crate) fn from_persistent(np: NodePersistence, use_https: bool) -> anyhow::Result { + if use_https && np.listen_https_port.is_none() { + return Err(anyhow!("https is enabled, but node has no https port")); + } + + Ok(Self { id: NodeId(np.node_id as u64), // At startup we consider a node offline until proven otherwise. 
availability: NodeAvailability::Offline, @@ -217,11 +255,13 @@ impl Node { .expect("Bad scheduling policy in DB"), listen_http_addr: np.listen_http_addr, listen_http_port: np.listen_http_port as u16, + listen_https_port: np.listen_https_port.map(|x| x as u16), listen_pg_addr: np.listen_pg_addr, listen_pg_port: np.listen_pg_port as u16, availability_zone_id: AvailabilityZone(np.availability_zone_id), + use_https, cancel: CancellationToken::new(), - } + }) } /// Wrapper for issuing requests to pageserver management API: takes care of generic @@ -285,8 +325,9 @@ impl Node { warn_threshold, max_retries, &format!( - "Call to node {} ({}:{}) management API", - self.id, self.listen_http_addr, self.listen_http_port + "Call to node {} ({}) management API", + self.id, + self.base_url(), ), cancel, ) @@ -302,6 +343,7 @@ impl Node { availability_zone_id: self.availability_zone_id.0.clone(), listen_http_addr: self.listen_http_addr.clone(), listen_http_port: self.listen_http_port, + listen_https_port: self.listen_https_port, listen_pg_addr: self.listen_pg_addr.clone(), listen_pg_port: self.listen_pg_port, } diff --git a/storage_controller/src/pageserver_client.rs b/storage_controller/src/pageserver_client.rs index 645cbdfce1..e9c54414a3 100644 --- a/storage_controller/src/pageserver_client.rs +++ b/storage_controller/src/pageserver_client.rs @@ -1,17 +1,13 @@ -use pageserver_api::{ - models::{ - detach_ancestor::AncestorDetached, LocationConfig, LocationConfigListResponse, - PageserverUtilization, SecondaryProgress, TenantScanRemoteStorageResponse, - TenantShardSplitRequest, TenantShardSplitResponse, TenantWaitLsnRequest, - TimelineArchivalConfigRequest, TimelineCreateRequest, TimelineInfo, TopTenantShardsRequest, - TopTenantShardsResponse, - }, - shard::TenantShardId, -}; -use pageserver_client::{ - mgmt_api::{Client, Result}, - BlockUnblock, +use pageserver_api::models::detach_ancestor::AncestorDetached; +use pageserver_api::models::{ + LocationConfig, LocationConfigListResponse, PageserverUtilization, SecondaryProgress, + TenantScanRemoteStorageResponse, TenantShardSplitRequest, TenantShardSplitResponse, + TenantWaitLsnRequest, TimelineArchivalConfigRequest, TimelineCreateRequest, TimelineInfo, + TopTenantShardsRequest, TopTenantShardsResponse, }; +use pageserver_api::shard::TenantShardId; +use pageserver_client::BlockUnblock; +use pageserver_client::mgmt_api::{Client, Result}; use reqwest::StatusCode; use utils::id::{NodeId, TenantId, TimelineId}; diff --git a/storage_controller/src/peer_client.rs b/storage_controller/src/peer_client.rs index 1a15bae365..f3f275dee0 100644 --- a/storage_controller/src/peer_client.rs +++ b/storage_controller/src/peer_client.rs @@ -1,16 +1,17 @@ -use crate::tenant_shard::ObservedState; -use pageserver_api::shard::TenantShardId; -use serde::{Deserialize, Serialize}; use std::collections::HashMap; use std::error::Error as _; use std::time::Duration; -use tokio_util::sync::CancellationToken; use http_utils::error::HttpErrorBody; use hyper::Uri; +use pageserver_api::shard::TenantShardId; use reqwest::{StatusCode, Url}; +use serde::{Deserialize, Serialize}; +use tokio_util::sync::CancellationToken; use utils::backoff; +use crate::tenant_shard::ObservedState; + #[derive(Debug, Clone)] pub(crate) struct PeerClient { uri: Uri, diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 67b60eadf3..2e80b48859 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -2,45 +2,40 @@ pub(crate) mod 
split_state; use std::collections::HashMap; use std::str::FromStr; use std::sync::Arc; -use std::time::Duration; -use std::time::Instant; +use std::time::{Duration, Instant}; -use self::split_state::SplitState; +use diesel::deserialize::{FromSql, FromSqlRow}; +use diesel::pg::Pg; use diesel::prelude::*; use diesel_async::async_connection_wrapper::AsyncConnectionWrapper; use diesel_async::pooled_connection::bb8::Pool; -use diesel_async::pooled_connection::AsyncDieselConnectionManager; -use diesel_async::pooled_connection::ManagerConfig; -use diesel_async::AsyncPgConnection; -use diesel_async::RunQueryDsl; -use futures::future::BoxFuture; +use diesel_async::pooled_connection::{AsyncDieselConnectionManager, ManagerConfig}; +use diesel_async::{AsyncPgConnection, RunQueryDsl}; +use diesel_migrations::{EmbeddedMigrations, embed_migrations}; use futures::FutureExt; +use futures::future::BoxFuture; use itertools::Itertools; -use pageserver_api::controller_api::AvailabilityZone; -use pageserver_api::controller_api::MetadataHealthRecord; -use pageserver_api::controller_api::SafekeeperDescribeResponse; -use pageserver_api::controller_api::ShardSchedulingPolicy; -use pageserver_api::controller_api::SkSchedulingPolicy; -use pageserver_api::controller_api::{NodeSchedulingPolicy, PlacementPolicy}; +use pageserver_api::controller_api::{ + AvailabilityZone, MetadataHealthRecord, NodeSchedulingPolicy, PlacementPolicy, + SafekeeperDescribeResponse, ShardSchedulingPolicy, SkSchedulingPolicy, +}; use pageserver_api::models::TenantConfig; -use pageserver_api::shard::ShardConfigError; -use pageserver_api::shard::ShardIdentity; -use pageserver_api::shard::ShardStripeSize; -use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId}; -use rustls::client::danger::{ServerCertVerified, ServerCertVerifier}; +use pageserver_api::shard::{ + ShardConfigError, ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId, +}; use rustls::client::WebPkiServerVerifier; +use rustls::client::danger::{ServerCertVerified, ServerCertVerifier}; use rustls::crypto::ring; use scoped_futures::ScopedBoxFuture; use serde::{Deserialize, Serialize}; use utils::generation::Generation; use utils::id::{NodeId, TenantId}; +use self::split_state::SplitState; use crate::metrics::{ DatabaseQueryErrorLabelGroup, DatabaseQueryLatencyLabelGroup, METRICS_REGISTRY, }; use crate::node::Node; - -use diesel_migrations::{embed_migrations, EmbeddedMigrations}; const MIGRATIONS: EmbeddedMigrations = embed_migrations!("./migrations"); /// ## What do we store? 
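The embed_migrations! constant above compiles the ./migrations directory into the binary. As a hedged sketch, not taken from this patch: embedded migrations of this kind are typically applied through diesel's MigrationHarness. The storage controller drives its pooled connection through the async wrapper imported above; the sketch below uses a plain synchronous connection purely for brevity.

use diesel::Connection;
use diesel::pg::PgConnection;
use diesel_migrations::{EmbeddedMigrations, MigrationHarness, embed_migrations};

const MIGRATIONS: EmbeddedMigrations = embed_migrations!("./migrations");

// Apply any migrations that have not run yet; already-applied ones are skipped.
fn run_pending(database_url: &str) -> anyhow::Result<()> {
    let mut conn = PgConnection::establish(database_url)?;
    conn.run_pending_migrations(MIGRATIONS)
        .map_err(|e| anyhow::anyhow!("failed to run migrations: {e}"))?;
    Ok(())
}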
@@ -375,18 +370,23 @@ impl Persistence { Ok(nodes) } - pub(crate) async fn update_node( + pub(crate) async fn update_node( &self, input_node_id: NodeId, - input_scheduling: NodeSchedulingPolicy, - ) -> DatabaseResult<()> { + values: V, + ) -> DatabaseResult<()> + where + V: diesel::AsChangeset + Clone + Send + Sync, + V::Changeset: diesel::query_builder::QueryFragment + Send, // valid Postgres SQL + { use crate::schema::nodes::dsl::*; let updated = self .with_measured_conn(DatabaseOperation::UpdateNode, move |conn| { + let values = values.clone(); Box::pin(async move { let updated = diesel::update(nodes) .filter(node_id.eq(input_node_id.0 as i64)) - .set((scheduling_policy.eq(String::from(input_scheduling)),)) + .set(values) .execute(conn) .await?; Ok(updated) @@ -403,6 +403,32 @@ impl Persistence { } } + pub(crate) async fn update_node_scheduling_policy( + &self, + input_node_id: NodeId, + input_scheduling: NodeSchedulingPolicy, + ) -> DatabaseResult<()> { + use crate::schema::nodes::dsl::*; + self.update_node( + input_node_id, + scheduling_policy.eq(String::from(input_scheduling)), + ) + .await + } + + pub(crate) async fn update_node_on_registration( + &self, + input_node_id: NodeId, + input_https_port: Option, + ) -> DatabaseResult<()> { + use crate::schema::nodes::dsl::*; + self.update_node( + input_node_id, + listen_https_port.eq(input_https_port.map(|x| x as i32)), + ) + .await + } + /// At startup, load the high level state for shards, such as their config + policy. This will /// be enriched at runtime with state discovered on pageservers. /// @@ -448,8 +474,7 @@ impl Persistence { &self, shards: Vec, ) -> DatabaseResult<()> { - use crate::schema::metadata_health; - use crate::schema::tenant_shards; + use crate::schema::{metadata_health, tenant_shards}; let now = chrono::Utc::now(); @@ -523,8 +548,7 @@ impl Persistence { &self, input_node_id: NodeId, ) -> DatabaseResult> { - use crate::schema::nodes::dsl::scheduling_policy; - use crate::schema::nodes::dsl::*; + use crate::schema::nodes::dsl::{scheduling_policy, *}; use crate::schema::tenant_shards::dsl::*; let updated = self .with_measured_conn(DatabaseOperation::ReAttach, move |conn| { @@ -1452,6 +1476,7 @@ pub(crate) struct NodePersistence { pub(crate) listen_pg_addr: String, pub(crate) listen_pg_port: i32, pub(crate) availability_zone_id: String, + pub(crate) listen_https_port: Option, } /// Tenant metadata health status that are stored durably. @@ -1533,7 +1558,33 @@ pub(crate) struct SafekeeperPersistence { pub(crate) port: i32, pub(crate) http_port: i32, pub(crate) availability_zone_id: String, - pub(crate) scheduling_policy: String, + pub(crate) scheduling_policy: SkSchedulingPolicyFromSql, +} + +/// Wrapper struct around [`SkSchedulingPolicy`] because both it and [`FromSql`] are from foreign crates, +/// and we don't want to make [`safekeeper_api`] depend on [`diesel`]. 
+#[derive(Serialize, Deserialize, FromSqlRow, Eq, PartialEq, Debug, Copy, Clone)] +pub(crate) struct SkSchedulingPolicyFromSql(pub(crate) SkSchedulingPolicy); + +impl From for SkSchedulingPolicyFromSql { + fn from(value: SkSchedulingPolicy) -> Self { + SkSchedulingPolicyFromSql(value) + } +} + +impl FromSql for SkSchedulingPolicyFromSql { + fn from_sql( + bytes: ::RawValue<'_>, + ) -> diesel::deserialize::Result { + let bytes = bytes.as_bytes(); + match core::str::from_utf8(bytes) { + Ok(s) => match SkSchedulingPolicy::from_str(s) { + Ok(policy) => Ok(SkSchedulingPolicyFromSql(policy)), + Err(e) => Err(format!("can't parse: {e}").into()), + }, + Err(e) => Err(format!("invalid UTF-8 for scheduling policy: {e}").into()), + } + } } impl SafekeeperPersistence { @@ -1549,14 +1600,10 @@ impl SafekeeperPersistence { port: upsert.port, http_port: upsert.http_port, availability_zone_id: upsert.availability_zone_id, - scheduling_policy: String::from(scheduling_policy), + scheduling_policy: SkSchedulingPolicyFromSql(scheduling_policy), } } pub(crate) fn as_describe_response(&self) -> Result { - let scheduling_policy = - SkSchedulingPolicy::from_str(&self.scheduling_policy).map_err(|e| { - DatabaseError::Logical(format!("can't construct SkSchedulingPolicy: {e:?}")) - })?; Ok(SafekeeperDescribeResponse { id: NodeId(self.id as u64), region_id: self.region_id.clone(), @@ -1565,7 +1612,7 @@ impl SafekeeperPersistence { port: self.port, http_port: self.http_port, availability_zone_id: self.availability_zone_id.clone(), - scheduling_policy, + scheduling_policy: self.scheduling_policy.0, }) } } diff --git a/storage_controller/src/persistence/split_state.rs b/storage_controller/src/persistence/split_state.rs index bce1a75843..f83191038a 100644 --- a/storage_controller/src/persistence/split_state.rs +++ b/storage_controller/src/persistence/split_state.rs @@ -1,8 +1,8 @@ +use diesel::deserialize::{FromSql, FromSqlRow}; +use diesel::expression::AsExpression; use diesel::pg::{Pg, PgValue}; -use diesel::{ - deserialize::FromSql, deserialize::FromSqlRow, expression::AsExpression, serialize::ToSql, - sql_types::Int2, -}; +use diesel::serialize::ToSql; +use diesel::sql_types::Int2; use serde::{Deserialize, Serialize}; #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord, FromSqlRow, AsExpression)] diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index 48f0804926..a327f6f50f 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -1,6 +1,9 @@ -use crate::pageserver_client::PageserverClient; -use crate::persistence::Persistence; -use crate::{compute_hook, service}; +use std::borrow::Cow; +use std::collections::HashMap; +use std::sync::Arc; +use std::time::{Duration, Instant}; + +use json_structural_diff::JsonDiff; use pageserver_api::controller_api::{AvailabilityZone, MigrationConfig, PlacementPolicy}; use pageserver_api::models::{ LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig, TenantWaitLsnRequest, @@ -8,10 +11,6 @@ use pageserver_api::models::{ use pageserver_api::shard::{ShardIdentity, TenantShardId}; use pageserver_client::mgmt_api; use reqwest::StatusCode; -use std::borrow::Cow; -use std::collections::HashMap; -use std::sync::Arc; -use std::time::{Duration, Instant}; use tokio_util::sync::CancellationToken; use utils::backoff::exponential_backoff; use utils::generation::Generation; @@ -22,9 +21,12 @@ use utils::sync::gate::GateGuard; use crate::compute_hook::{ComputeHook, NotifyError}; use 
crate::node::Node; +use crate::pageserver_client::PageserverClient; +use crate::persistence::Persistence; use crate::tenant_shard::{IntentState, ObservedState, ObservedStateDelta, ObservedStateLocation}; +use crate::{compute_hook, service}; -const DEFAULT_HEATMAP_PERIOD: &str = "60s"; +const DEFAULT_HEATMAP_PERIOD: Duration = Duration::from_secs(60); /// Object with the lifetime of the background reconcile task that is created /// for tenants which have a difference between their intent and observed states. @@ -296,7 +298,7 @@ impl Reconciler { .location_config(tenant_shard_id, config.clone(), flush_ms, lazy) .await }, - &self.service_config.jwt_token, + &self.service_config.pageserver_jwt_token, 1, 3, timeout, @@ -417,7 +419,7 @@ impl Reconciler { let client = PageserverClient::new( node.get_id(), node.base_url(), - self.service_config.jwt_token.as_deref(), + self.service_config.pageserver_jwt_token.as_deref(), ); client @@ -440,7 +442,7 @@ impl Reconciler { let client = PageserverClient::new( node.get_id(), node.base_url(), - self.service_config.jwt_token.as_deref(), + self.service_config.pageserver_jwt_token.as_deref(), ); let timelines = client.timeline_list(&tenant_shard_id).await?; @@ -478,7 +480,7 @@ impl Reconciler { ) .await }, - &self.service_config.jwt_token, + &self.service_config.pageserver_jwt_token, 1, 3, request_download_timeout * 2, @@ -510,7 +512,8 @@ impl Reconciler { } else if status == StatusCode::ACCEPTED { let total_runtime = started_at.elapsed(); if total_runtime > total_download_timeout { - tracing::warn!("Timed out after {}ms downloading layers to {node}. Progress so far: {}/{} layers, {}/{} bytes", + tracing::warn!( + "Timed out after {}ms downloading layers to {node}. Progress so far: {}/{} layers, {}/{} bytes", total_runtime.as_millis(), progress.layers_downloaded, progress.layers_total, @@ -771,7 +774,7 @@ impl Reconciler { let observed_conf = match attached_node .with_client_retries( |client| async move { client.get_location_config(tenant_shard_id).await }, - &self.service_config.jwt_token, + &self.service_config.pageserver_jwt_token, 1, 1, Duration::from_secs(5), @@ -880,7 +883,27 @@ impl Reconciler { self.generation = Some(generation); wanted_conf.generation = generation.into(); } - tracing::info!(node_id=%node.get_id(), "Observed configuration requires update."); + + let diff = match observed { + Some(ObservedStateLocation { + conf: Some(observed), + }) => { + let diff = JsonDiff::diff( + &serde_json::to_value(observed.clone()).unwrap(), + &serde_json::to_value(wanted_conf.clone()).unwrap(), + false, + ); + + if let Some(json_diff) = diff.diff { + serde_json::to_string(&json_diff).unwrap_or("diff err".to_string()) + } else { + "unknown".to_string() + } + } + _ => "full".to_string(), + }; + + tracing::info!(node_id=%node.get_id(), "Observed configuration requires update: {diff}"); // Because `node` comes from a ref to &self, clone it before calling into a &mut self // function: this could be avoided by refactoring the state mutated by location_config into @@ -1099,7 +1122,7 @@ impl Reconciler { match origin .with_client_retries( |client| async move { client.get_location_config(tenant_shard_id).await }, - &self.service_config.jwt_token, + &self.service_config.pageserver_jwt_token, 1, 3, Duration::from_secs(5), @@ -1180,7 +1203,7 @@ fn ha_aware_config(config: &TenantConfig, has_secondaries: bool) -> TenantConfig let mut config = config.clone(); if has_secondaries { if config.heatmap_period.is_none() { - config.heatmap_period = 
Some(DEFAULT_HEATMAP_PERIOD.to_string()); + config.heatmap_period = Some(DEFAULT_HEATMAP_PERIOD); } } else { config.heatmap_period = None; diff --git a/storage_controller/src/safekeeper.rs b/storage_controller/src/safekeeper.rs index 53cd8a908b..9c7e6e0894 100644 --- a/storage_controller/src/safekeeper.rs +++ b/storage_controller/src/safekeeper.rs @@ -1,16 +1,16 @@ -use std::{str::FromStr, time::Duration}; +use std::time::Duration; use pageserver_api::controller_api::{SafekeeperDescribeResponse, SkSchedulingPolicy}; use reqwest::StatusCode; use safekeeper_client::mgmt_api; use tokio_util::sync::CancellationToken; -use utils::{backoff, id::NodeId, logging::SecretString}; +use utils::backoff; +use utils::id::NodeId; +use utils::logging::SecretString; -use crate::{ - heartbeater::SafekeeperState, - persistence::{DatabaseError, SafekeeperPersistence}, - safekeeper_client::SafekeeperClient, -}; +use crate::heartbeater::SafekeeperState; +use crate::persistence::{DatabaseError, SafekeeperPersistence}; +use crate::safekeeper_client::SafekeeperClient; #[derive(Clone)] pub struct Safekeeper { @@ -25,7 +25,7 @@ pub struct Safekeeper { impl Safekeeper { pub(crate) fn from_persistence(skp: SafekeeperPersistence, cancel: CancellationToken) -> Self { - let scheduling_policy = SkSchedulingPolicy::from_str(&skp.scheduling_policy).unwrap(); + let scheduling_policy = skp.scheduling_policy.0; Self { cancel, listen_http_addr: skp.host.clone(), @@ -54,7 +54,7 @@ impl Safekeeper { } pub(crate) fn set_scheduling_policy(&mut self, scheduling_policy: SkSchedulingPolicy) { self.scheduling_policy = scheduling_policy; - self.skp.scheduling_policy = String::from(scheduling_policy); + self.skp.scheduling_policy = scheduling_policy.into(); } /// Perform an operation (which is given a [`SafekeeperClient`]) with retries pub(crate) async fn with_client_retries( diff --git a/storage_controller/src/safekeeper_client.rs b/storage_controller/src/safekeeper_client.rs index f234ab3429..fb5be092a0 100644 --- a/storage_controller/src/safekeeper_client.rs +++ b/storage_controller/src/safekeeper_client.rs @@ -1,13 +1,12 @@ -use crate::metrics::PageserverRequestLabelGroup; use safekeeper_api::models::{ PullTimelineRequest, PullTimelineResponse, SafekeeperUtilization, TimelineCreateRequest, TimelineStatus, }; use safekeeper_client::mgmt_api::{Client, Result}; -use utils::{ - id::{NodeId, TenantId, TimelineId}, - logging::SecretString, -}; +use utils::id::{NodeId, TenantId, TimelineId}; +use utils::logging::SecretString; + +use crate::metrics::PageserverRequestLabelGroup; /// Thin wrapper around [`safekeeper_client::mgmt_api::Client`]. It allows the storage /// controller to collect metrics in a non-intrusive manner. 
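The reconciler change above replaces the bare "Observed configuration requires update" log line with a structural JSON diff of the observed versus wanted location config, via the json_structural_diff crate. A standalone sketch mirroring that call shape follows; the JSON values are invented, while the real code serializes LocationConfig structs.

use json_structural_diff::JsonDiff;
use serde_json::json;

fn main() {
    let observed = json!({ "mode": "AttachedSingle", "generation": 4, "tenant_conf": {} });
    let wanted = json!({ "mode": "AttachedSingle", "generation": 5, "tenant_conf": {} });

    // Same shape as the reconciler: diff the two serde_json values and fall
    // back to a placeholder string when no structural diff is produced.
    let diff = JsonDiff::diff(&observed, &wanted, false);
    let rendered = match diff.diff {
        Some(json_diff) => serde_json::to_string(&json_diff).unwrap_or("diff err".to_string()),
        None => "unknown".to_string(),
    };
    println!("Observed configuration requires update: {rendered}");
}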
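Related to the DEFAULT_HEATMAP_PERIOD change above, heatmap_period is now carried as a typed Duration rather than a "60s" string. A standalone sketch of the ha_aware_config behaviour with the new type, where Config stands in for the relevant TenantConfig field:

use std::time::Duration;

const DEFAULT_HEATMAP_PERIOD: Duration = Duration::from_secs(60);

#[derive(Clone, Debug, PartialEq)]
struct Config {
    heatmap_period: Option<Duration>,
}

// With secondaries present, fill in a default heatmap period if none is set;
// without secondaries, uploading heatmaps is pointless, so clear it.
fn ha_aware_config(config: &Config, has_secondaries: bool) -> Config {
    let mut config = config.clone();
    if has_secondaries {
        if config.heatmap_period.is_none() {
            config.heatmap_period = Some(DEFAULT_HEATMAP_PERIOD);
        }
    } else {
        config.heatmap_period = None;
    }
    config
}

fn main() {
    let base = Config { heatmap_period: None };
    assert_eq!(
        ha_aware_config(&base, true).heatmap_period,
        Some(Duration::from_secs(60))
    );
    assert_eq!(ha_aware_config(&base, false).heatmap_period, None);
}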
diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs index 106a7b2699..817cf04fe1 100644 --- a/storage_controller/src/scheduler.rs +++ b/storage_controller/src/scheduler.rs @@ -1,11 +1,17 @@ -use crate::{metrics::NodeLabelGroup, node::Node, tenant_shard::TenantShard}; +use std::collections::HashMap; +use std::fmt::Debug; + use http_utils::error::ApiError; use itertools::Itertools; -use pageserver_api::{controller_api::AvailabilityZone, models::PageserverUtilization}; +use pageserver_api::controller_api::AvailabilityZone; +use pageserver_api::models::PageserverUtilization; use serde::Serialize; -use std::{collections::HashMap, fmt::Debug}; use utils::id::NodeId; +use crate::metrics::NodeLabelGroup; +use crate::node::Node; +use crate::tenant_shard::TenantShard; + /// Scenarios in which we cannot find a suitable location for a tenant shard #[derive(thiserror::Error, Debug)] pub enum ScheduleError { @@ -775,10 +781,10 @@ impl Scheduler { if !matches!(context.mode, ScheduleMode::Speculative) { tracing::info!( - "scheduler selected node {node_id} (elegible nodes {:?}, hard exclude: {hard_exclude:?}, soft exclude: {context:?}, preferred_az: {:?})", - scores.iter().map(|i| i.node_id().0).collect::>(), - preferred_az, - ); + "scheduler selected node {node_id} (elegible nodes {:?}, hard exclude: {hard_exclude:?}, soft exclude: {context:?}, preferred_az: {:?})", + scores.iter().map(|i| i.node_id().0).collect::>(), + preferred_az, + ); } // Note that we do not update shard count here to reflect the scheduling: that @@ -906,14 +912,14 @@ impl Scheduler { #[cfg(test)] pub(crate) mod test_utils { - use crate::node::Node; - use pageserver_api::{ - controller_api::{AvailabilityZone, NodeAvailability}, - models::utilization::test_utilization, - }; use std::collections::HashMap; + + use pageserver_api::controller_api::{AvailabilityZone, NodeAvailability}; + use pageserver_api::models::utilization::test_utilization; use utils::id::NodeId; + use crate::node::Node; + /// Test helper: synthesize the requested number of nodes, all in active state. /// /// Node IDs start at one. @@ -930,13 +936,16 @@ pub(crate) mod test_utils { NodeId(i), format!("httphost-{i}"), 80 + i as u16, + None, format!("pghost-{i}"), 5432 + i as u16, az_iter .next() .cloned() .unwrap_or(AvailabilityZone("test-az".to_string())), - ); + false, + ) + .unwrap(); node.set_availability(NodeAvailability::Active(test_utilization::simple(0, 0))); assert!(node.is_available()); node @@ -948,17 +957,13 @@ pub(crate) mod test_utils { #[cfg(test)] mod tests { - use pageserver_api::{ - controller_api::NodeAvailability, models::utilization::test_utilization, - shard::ShardIdentity, - }; - use utils::{ - id::TenantId, - shard::{ShardCount, ShardNumber, TenantShardId}, - }; + use pageserver_api::controller_api::NodeAvailability; + use pageserver_api::models::utilization::test_utilization; + use pageserver_api::shard::ShardIdentity; + use utils::id::TenantId; + use utils::shard::{ShardCount, ShardNumber, TenantShardId}; use super::*; - use crate::tenant_shard::IntentState; #[test] fn scheduler_basic() -> anyhow::Result<()> { diff --git a/storage_controller/src/schema.rs b/storage_controller/src/schema.rs index 14c30c296d..361253bd19 100644 --- a/storage_controller/src/schema.rs +++ b/storage_controller/src/schema.rs @@ -26,6 +26,7 @@ diesel::table! 
{ listen_pg_addr -> Varchar, listen_pg_port -> Int4, availability_zone_id -> Varchar, + listen_https_port -> Nullable, } } diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index fc6d2f3d29..8671e340bd 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -1,112 +1,95 @@ pub mod chaos_injector; mod context_iterator; -use hyper::Uri; -use safekeeper_api::models::SafekeeperUtilization; -use std::{ - borrow::Cow, - cmp::Ordering, - collections::{BTreeMap, HashMap, HashSet}, - error::Error, - ops::Deref, - path::PathBuf, - str::FromStr, - sync::Arc, - time::{Duration, Instant}, -}; +use std::borrow::Cow; +use std::cmp::Ordering; +use std::collections::{BTreeMap, HashMap, HashSet}; +use std::error::Error; +use std::ops::Deref; +use std::path::PathBuf; +use std::str::FromStr; +use std::sync::Arc; +use std::time::{Duration, Instant}; -use crate::{ - background_node_operations::{ - Drain, Fill, Operation, OperationError, OperationHandler, MAX_RECONCILES_PER_OPERATION, - }, - compute_hook::{self, NotifyError}, - drain_utils::{self, TenantShardDrain, TenantShardIterator}, - heartbeater::SafekeeperState, - id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, TracingExclusiveGuard}, - leadership::Leadership, - metrics, - peer_client::GlobalObservedState, - persistence::{ - AbortShardSplitStatus, ControllerPersistence, DatabaseResult, MetadataHealthPersistence, - ShardGenerationState, TenantFilter, - }, - reconciler::{ - ReconcileError, ReconcileUnits, ReconcilerConfig, ReconcilerConfigBuilder, - ReconcilerPriority, - }, - safekeeper::Safekeeper, - scheduler::{MaySchedule, ScheduleContext, ScheduleError, ScheduleMode}, - tenant_shard::{ - MigrateAttachment, ObservedStateDelta, ReconcileNeeded, ReconcilerStatus, - ScheduleOptimization, ScheduleOptimizationAction, - }, -}; use anyhow::Context; +use context_iterator::TenantShardContextIterator; use control_plane::storage_controller::{ AttachHookRequest, AttachHookResponse, InspectRequest, InspectResponse, }; use diesel::result::DatabaseErrorKind; -use futures::{stream::FuturesUnordered, StreamExt}; -use itertools::Itertools; -use pageserver_api::{ - controller_api::{ - AvailabilityZone, MetadataHealthRecord, MetadataHealthUpdateRequest, NodeAvailability, - NodeRegisterRequest, NodeSchedulingPolicy, NodeShard, NodeShardResponse, PlacementPolicy, - SafekeeperDescribeResponse, ShardSchedulingPolicy, ShardsPreferredAzsRequest, - ShardsPreferredAzsResponse, SkSchedulingPolicy, TenantCreateRequest, TenantCreateResponse, - TenantCreateResponseShard, TenantDescribeResponse, TenantDescribeResponseShard, - TenantLocateResponse, TenantPolicyRequest, TenantShardMigrateRequest, - TenantShardMigrateResponse, - }, - models::{ - SecondaryProgress, TenantConfigPatchRequest, TenantConfigRequest, - TimelineArchivalConfigRequest, TopTenantShardsRequest, - }, -}; -use reqwest::StatusCode; -use tracing::{instrument, Instrument}; - -use crate::pageserver_client::PageserverClient; +use futures::StreamExt; +use futures::stream::FuturesUnordered; use http_utils::error::ApiError; -use pageserver_api::{ - models::{ - self, LocationConfig, LocationConfigListResponse, LocationConfigMode, - PageserverUtilization, ShardParameters, TenantConfig, TenantLocationConfigRequest, - TenantLocationConfigResponse, TenantShardLocation, TenantShardSplitRequest, - TenantShardSplitResponse, TenantTimeTravelRequest, TimelineCreateRequest, TimelineInfo, - }, - shard::{ShardCount, ShardIdentity, ShardNumber, 
ShardStripeSize, TenantShardId}, - upcall_api::{ - ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest, - ValidateResponse, ValidateResponseTenant, - }, +use hyper::Uri; +use itertools::Itertools; +use pageserver_api::controller_api::{ + AvailabilityZone, MetadataHealthRecord, MetadataHealthUpdateRequest, NodeAvailability, + NodeRegisterRequest, NodeSchedulingPolicy, NodeShard, NodeShardResponse, PlacementPolicy, + SafekeeperDescribeResponse, ShardSchedulingPolicy, ShardsPreferredAzsRequest, + ShardsPreferredAzsResponse, SkSchedulingPolicy, TenantCreateRequest, TenantCreateResponse, + TenantCreateResponseShard, TenantDescribeResponse, TenantDescribeResponseShard, + TenantLocateResponse, TenantPolicyRequest, TenantShardMigrateRequest, + TenantShardMigrateResponse, }; -use pageserver_client::{mgmt_api, BlockUnblock}; -use tokio::sync::{mpsc::error::TrySendError, TryAcquireError}; +use pageserver_api::models::{ + self, LocationConfig, LocationConfigListResponse, LocationConfigMode, PageserverUtilization, + SecondaryProgress, ShardParameters, TenantConfig, TenantConfigPatchRequest, + TenantConfigRequest, TenantLocationConfigRequest, TenantLocationConfigResponse, + TenantShardLocation, TenantShardSplitRequest, TenantShardSplitResponse, + TenantTimeTravelRequest, TimelineArchivalConfigRequest, TimelineCreateRequest, TimelineInfo, + TopTenantShardsRequest, +}; +use pageserver_api::shard::{ + ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId, +}; +use pageserver_api::upcall_api::{ + ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest, ValidateResponse, + ValidateResponseTenant, +}; +use pageserver_client::{BlockUnblock, mgmt_api}; +use reqwest::StatusCode; +use safekeeper_api::models::SafekeeperUtilization; +use tokio::sync::TryAcquireError; +use tokio::sync::mpsc::error::TrySendError; use tokio_util::sync::CancellationToken; -use utils::{ - completion::Barrier, - failpoint_support, - generation::Generation, - id::{NodeId, TenantId, TimelineId}, - pausable_failpoint, - sync::gate::Gate, -}; +use tracing::{Instrument, instrument}; +use utils::completion::Barrier; +use utils::generation::Generation; +use utils::id::{NodeId, TenantId, TimelineId}; +use utils::sync::gate::Gate; +use utils::{failpoint_support, pausable_failpoint}; -use crate::{ - compute_hook::ComputeHook, - heartbeater::{Heartbeater, PageserverState}, - node::{AvailabilityTransition, Node}, - persistence::{split_state::SplitState, DatabaseError, Persistence, TenantShardPersistence}, - reconciler::attached_location_conf, - scheduler::Scheduler, - tenant_shard::{ - IntentState, ObservedState, ObservedStateLocation, ReconcileResult, ReconcileWaitError, - ReconcilerWaiter, TenantShard, - }, +use crate::background_node_operations::{ + Drain, Fill, MAX_RECONCILES_PER_OPERATION, Operation, OperationError, OperationHandler, +}; +use crate::compute_hook::{self, ComputeHook, NotifyError}; +use crate::drain_utils::{self, TenantShardDrain, TenantShardIterator}; +use crate::heartbeater::{Heartbeater, PageserverState, SafekeeperState}; +use crate::id_lock_map::{ + IdLockMap, TracingExclusiveGuard, trace_exclusive_lock, trace_shared_lock, +}; +use crate::leadership::Leadership; +use crate::metrics; +use crate::node::{AvailabilityTransition, Node}; +use crate::pageserver_client::PageserverClient; +use crate::peer_client::GlobalObservedState; +use crate::persistence::split_state::SplitState; +use crate::persistence::{ + AbortShardSplitStatus, ControllerPersistence, DatabaseError, 
DatabaseResult, + MetadataHealthPersistence, Persistence, ShardGenerationState, TenantFilter, + TenantShardPersistence, +}; +use crate::reconciler::{ + ReconcileError, ReconcileUnits, ReconcilerConfig, ReconcilerConfigBuilder, ReconcilerPriority, + attached_location_conf, +}; +use crate::safekeeper::Safekeeper; +use crate::scheduler::{MaySchedule, ScheduleContext, ScheduleError, ScheduleMode, Scheduler}; +use crate::tenant_shard::{ + IntentState, MigrateAttachment, ObservedState, ObservedStateDelta, ObservedStateLocation, + ReconcileNeeded, ReconcileResult, ReconcileWaitError, ReconcilerStatus, ReconcilerWaiter, + ScheduleOptimization, ScheduleOptimizationAction, TenantShard, }; - -use context_iterator::TenantShardContextIterator; const WAITER_FILL_DRAIN_POLL_TIMEOUT: Duration = Duration::from_millis(500); @@ -348,7 +331,12 @@ pub struct Config { // All pageservers managed by one instance of this service must have // the same public key. This JWT token will be used to authenticate // this service to the pageservers it manages. - pub jwt_token: Option, + pub pageserver_jwt_token: Option, + + // All safekeepers managed by one instance of this service must have + // the same public key. This JWT token will be used to authenticate + // this service to the safekeepers it manages. + pub safekeeper_jwt_token: Option, // This JWT token will be used to authenticate this service to the control plane. pub control_plane_jwt_token: Option, @@ -399,6 +387,8 @@ pub struct Config { pub http_service_port: i32, pub long_reconcile_threshold: Duration, + + pub use_https_pageserver_api: bool, } impl From for ApiError { @@ -780,7 +770,9 @@ impl Service { }); } - tracing::info!("Startup complete, spawned {reconcile_tasks} reconciliation tasks ({shard_count} shards total)"); + tracing::info!( + "Startup complete, spawned {reconcile_tasks} reconciliation tasks ({shard_count} shards total)" + ); } async fn initial_heartbeat_round<'a>( @@ -880,7 +872,7 @@ impl Service { let response = node .with_client_retries( |client| async move { client.list_location_config().await }, - &self.config.jwt_token, + &self.config.pageserver_jwt_token, 1, 5, timeout, @@ -981,7 +973,7 @@ impl Service { let client = PageserverClient::new( node.get_id(), node.base_url(), - self.config.jwt_token.as_deref(), + self.config.pageserver_jwt_token.as_deref(), ); match client .location_config( @@ -1175,7 +1167,9 @@ impl Service { let mut safekeepers = (*locked.safekeepers).clone(); for (id, state) in deltas.0 { let Some(sk) = safekeepers.get_mut(&id) else { - tracing::info!("Couldn't update safekeeper safekeeper state for id {id} from heartbeat={state:?}"); + tracing::info!( + "Couldn't update safekeeper safekeeper state for id {id} from heartbeat={state:?}" + ); continue; }; sk.set_availability(state); @@ -1401,8 +1395,8 @@ impl Service { .list_nodes() .await? 
.into_iter() - .map(Node::from_persistent) - .collect::>(); + .map(|x| Node::from_persistent(x, config.use_https_pageserver_api)) + .collect::>>()?; let nodes: HashMap = nodes.into_iter().map(|n| (n.get_id(), n)).collect(); tracing::info!("Loaded {} nodes from database.", nodes.len()); metrics::METRICS_REGISTRY @@ -1501,10 +1495,13 @@ impl Service { NodeId(node_id as u64), "".to_string(), 123, + None, "".to_string(), 123, AvailabilityZone("test_az".to_string()), - ); + false, + ) + .unwrap(); scheduler.node_upsert(&node); } @@ -1527,7 +1524,9 @@ impl Service { // If a node was removed before being completely drained, it is legal for it to leave behind a `generation_pageserver` referring // to a non-existent node, because node deletion doesn't block on completing the reconciliations that will issue new generations // on different pageservers. - tracing::warn!("Tenant shard {tenant_shard_id} references non-existent node {generation_pageserver} in database, will be rescheduled"); + tracing::warn!( + "Tenant shard {tenant_shard_id} references non-existent node {generation_pageserver} in database, will be rescheduled" + ); } } let new_tenant = TenantShard::from_persistent(tsp, intent)?; @@ -1548,14 +1547,14 @@ impl Service { let reconcilers_cancel = cancel.child_token(); let heartbeater_ps = Heartbeater::new( - config.jwt_token.clone(), + config.pageserver_jwt_token.clone(), config.max_offline_interval, config.max_warming_up_interval, cancel.clone(), ); let heartbeater_sk = Heartbeater::new( - config.jwt_token.clone(), + config.safekeeper_jwt_token.clone(), config.max_offline_interval, config.max_warming_up_interval, cancel.clone(), @@ -1857,7 +1856,7 @@ impl Service { } Ok(AttachHookResponse { - gen: attach_req + r#gen: attach_req .node_id .map(|_| tenant_shard.generation.expect("Test hook, not used on tenants that are mid-onboarding with a NULL generation").into().unwrap()), }) @@ -1902,7 +1901,7 @@ impl Service { let configs = match node .with_client_retries( |client| async move { client.list_location_config().await }, - &self.config.jwt_token, + &self.config.pageserver_jwt_token, 1, 5, SHORT_RECONCILE_TIMEOUT, @@ -1960,7 +1959,7 @@ impl Service { .location_config(tenant_shard_id, config, None, false) .await }, - &self.config.jwt_token, + &self.config.pageserver_jwt_token, 1, 5, SHORT_RECONCILE_TIMEOUT, @@ -2029,7 +2028,7 @@ impl Service { let new_gen = *new_gen; response.tenants.push(ReAttachResponseTenant { id: *tenant_shard_id, - gen: Some(new_gen.into().unwrap()), + r#gen: Some(new_gen.into().unwrap()), // A tenant is only put into multi or stale modes in the middle of a [`Reconciler::live_migrate`] // execution. 
If a pageserver is restarted during that process, then the reconcile pass will // fail, and start from scratch, so it doesn't make sense for us to try and preserve @@ -2066,7 +2065,7 @@ impl Service { response.tenants.push(ReAttachResponseTenant { id: *tenant_shard_id, - gen: None, + r#gen: None, mode: LocationConfigMode::Secondary, }); @@ -2128,15 +2127,19 @@ impl Service { let locked = self.inner.read().unwrap(); for req_tenant in validate_req.tenants { if let Some(tenant_shard) = locked.tenants.get(&req_tenant.id) { - let valid = tenant_shard.generation == Some(Generation::new(req_tenant.gen)); + let valid = tenant_shard.generation == Some(Generation::new(req_tenant.r#gen)); tracing::info!( "handle_validate: {}(gen {}): valid={valid} (latest {:?})", req_tenant.id, - req_tenant.gen, + req_tenant.r#gen, tenant_shard.generation ); - in_memory_result.push((req_tenant.id, Generation::new(req_tenant.gen), valid)); + in_memory_result.push(( + req_tenant.id, + Generation::new(req_tenant.r#gen), + valid, + )); } else { // This is legal: for example during a shard split the pageserver may still // have deletions in its queue from the old pre-split shard, or after deletion @@ -2155,13 +2158,11 @@ impl Service { // in case of controller split-brain, where some other controller process might have incremented the generation. let db_generations = self .persistence - .shard_generations(in_memory_result.iter().filter_map(|i| { - if i.2 { - Some(&i.0) - } else { - None - } - })) + .shard_generations( + in_memory_result + .iter() + .filter_map(|i| if i.2 { Some(&i.0) } else { None }), + ) .await?; let db_generations = db_generations.into_iter().collect::>(); @@ -2313,7 +2314,9 @@ impl Service { // Unique key violation: this is probably a retry. Because the shard count is part of the unique key, // if we see a unique key violation it means that the creation request's shard count matches the previous // creation's shard count. - tracing::info!("Tenant shards already present in database, proceeding with idempotent creation..."); + tracing::info!( + "Tenant shards already present in database, proceeding with idempotent creation..." + ); } // Any other database error is unexpected and a bug. 
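The gen-to-r#gen renames above are consistent with the edition-2024 style changes seen elsewhere in this patch: gen is a reserved keyword on the Rust 2024 edition, so struct fields named gen must be written as raw identifiers. serde's derive strips the r# prefix, so the wire format keeps the plain "gen" key. The struct below is illustrative only, not the real upcall API type.

use serde::Serialize;

#[derive(Serialize)]
struct ReAttachTenant {
    id: String,
    r#gen: Option<u32>,
}

fn main() {
    let t = ReAttachTenant {
        id: "tenant-0".to_string(),
        r#gen: Some(7),
    };
    // The raw-identifier prefix does not leak into the serialized name.
    assert_eq!(
        serde_json::to_string(&t).unwrap(),
        r#"{"id":"tenant-0","gen":7}"#
    );
}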
Err(e) => return Err(ApiError::InternalServerError(anyhow::anyhow!(e))), @@ -2916,7 +2919,9 @@ impl Service { first }; - let updated_config = base.apply_patch(patch); + let updated_config = base + .apply_patch(patch) + .map_err(|err| ApiError::BadRequest(anyhow::anyhow!(err)))?; self.set_tenant_config_and_reconcile(tenant_id, updated_config) .await } @@ -2992,7 +2997,7 @@ impl Service { None => { return Err(ApiError::NotFound( anyhow::anyhow!("Tenant not found").into(), - )) + )); } } }; @@ -3059,7 +3064,9 @@ impl Service { }) .find(|(_, _, mode)| *mode != LocationConfigMode::Detached); if let Some((node_id, _observed_location, mode)) = maybe_attached { - return Err(ApiError::InternalServerError(anyhow::anyhow!("We observed attached={mode:?} tenant in node_id={node_id} shard with tenant_shard_id={shard_id}"))); + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "We observed attached={mode:?} tenant in node_id={node_id} shard with tenant_shard_id={shard_id}" + ))); } } let scheduler = &mut locked.scheduler; @@ -3095,7 +3102,7 @@ impl Service { let client = PageserverClient::new( node.get_id(), node.base_url(), - self.config.jwt_token.as_deref(), + self.config.pageserver_jwt_token.as_deref(), ); tracing::info!("Doing time travel recovery for shard {tenant_shard_id}",); @@ -3156,7 +3163,7 @@ impl Service { let client = PageserverClient::new( node.get_id(), node.base_url(), - self.config.jwt_token.as_deref(), + self.config.pageserver_jwt_token.as_deref(), ); futs.push(async move { let result = client @@ -3279,7 +3286,7 @@ impl Service { .tenant_delete(TenantShardId::unsharded(tenant_id)) .await }, - &self.config.jwt_token, + &self.config.pageserver_jwt_token, 1, 3, RECONCILE_TIMEOUT, @@ -3498,7 +3505,7 @@ impl Service { let timeline_info = create_one( shard_zero_tid, shard_zero_locations, - self.config.jwt_token.clone(), + self.config.pageserver_jwt_token.clone(), create_req.clone(), ) .await?; @@ -3514,7 +3521,7 @@ impl Service { // Create timeline on remaining shards with number >0 if !targets.0.is_empty() { // If we had multiple shards, issue requests for the remainder now. - let jwt = &self.config.jwt_token; + let jwt = &self.config.pageserver_jwt_token; self.tenant_for_shards( targets .0 @@ -3597,7 +3604,7 @@ impl Service { tenant_shard_id, timeline_id, node, - self.config.jwt_token.clone(), + self.config.pageserver_jwt_token.clone(), req.clone(), )) }) @@ -3678,7 +3685,7 @@ impl Service { tenant_shard_id, timeline_id, node, - self.config.jwt_token.clone(), + self.config.pageserver_jwt_token.clone(), )) }) .await?; @@ -3752,7 +3759,7 @@ impl Service { tenant_shard_id, timeline_id, node, - self.config.jwt_token.clone(), + self.config.pageserver_jwt_token.clone(), dir, )) }) @@ -3867,7 +3874,7 @@ impl Service { futs.push(async move { node.with_client_retries( |client| op(tenant_shard_id, client), - &self.config.jwt_token, + &self.config.pageserver_jwt_token, warn_threshold, max_retries, timeout, @@ -3932,7 +3939,9 @@ impl Service { // This can only happen if there is a split brain controller modifying the database. This should // never happen when testing, and if it happens in production we can only log the issue. debug_assert!(false); - tracing::error!("Shard {shard_id} not found in generation state! Is another rogue controller running?"); + tracing::error!( + "Shard {shard_id} not found in generation state! Is another rogue controller running?" 
+ ); continue; }; let (generation, generation_pageserver) = generation; @@ -3941,13 +3950,17 @@ impl Service { // This is legitimate only in a very narrow window where the shard was only just configured into // Attached mode after being created in Secondary or Detached mode, and it has had its generation // set but not yet had a Reconciler run (reconciler is the only thing that sets generation_pageserver). - tracing::warn!("Shard {shard_id} generation is set ({generation:?}) but generation_pageserver is None, reconciler not run yet?"); + tracing::warn!( + "Shard {shard_id} generation is set ({generation:?}) but generation_pageserver is None, reconciler not run yet?" + ); } } else { // This should never happen: a shard with no generation is only permitted when it was created in some state // other than PlacementPolicy::Attached (and generation is always written to DB before setting Attached in memory) debug_assert!(false); - tracing::error!("Shard {shard_id} generation is None, but it is in PlacementPolicy::Attached mode!"); + tracing::error!( + "Shard {shard_id} generation is None, but it is in PlacementPolicy::Attached mode!" + ); continue; } } @@ -4116,7 +4129,7 @@ impl Service { tenant_shard_id, timeline_id, node, - self.config.jwt_token.clone(), + self.config.pageserver_jwt_token.clone(), )) }) .await?; @@ -4138,7 +4151,7 @@ impl Service { shard_zero_tid, timeline_id, shard_zero_locations.latest.node, - self.config.jwt_token.clone(), + self.config.pageserver_jwt_token.clone(), ) .await?; Ok(shard_zero_status) @@ -4480,13 +4493,17 @@ impl Service { // if the original attachment location is offline. if let Some(node_id) = shard.intent.get_attached() { if !nodes.get(node_id).unwrap().is_available() { - tracing::info!("Demoting attached intent for {tenant_shard_id} on unavailable node {node_id}"); + tracing::info!( + "Demoting attached intent for {tenant_shard_id} on unavailable node {node_id}" + ); shard.intent.demote_attached(scheduler, *node_id); } } for node_id in shard.intent.get_secondary().clone() { if !nodes.get(&node_id).unwrap().is_available() { - tracing::info!("Dropping secondary intent for {tenant_shard_id} on unavailable node {node_id}"); + tracing::info!( + "Dropping secondary intent for {tenant_shard_id} on unavailable node {node_id}" + ); shard.intent.remove_secondary(scheduler, node_id); } } @@ -4514,7 +4531,9 @@ impl Service { // rely on the reconciliation that happens when a node transitions to Active to clean up. Since we have // removed child shards from our in-memory state and database, the reconciliation will implicitly remove // them from the node. - tracing::warn!("Node {node} unavailable, can't clean up during split abort. It will be cleaned up when it is reactivated."); + tracing::warn!( + "Node {node} unavailable, can't clean up during split abort. It will be cleaned up when it is reactivated." + ); continue; } @@ -4537,7 +4556,7 @@ impl Service { client.location_config(child_id, config, None, false).await }, - &self.config.jwt_token, + &self.config.pageserver_jwt_token, 1, 10, Duration::from_secs(5), @@ -4959,7 +4978,10 @@ impl Service { // applies the new stripe size to the children. 
let mut shard_ident = shard_ident.unwrap(); if shard_ident.count.count() > 1 && shard_ident.stripe_size != new_stripe_size { - return Err(ApiError::BadRequest(anyhow::anyhow!("Attempted to change stripe size ({:?}->{new_stripe_size:?}) on a tenant with multiple shards", shard_ident.stripe_size))); + return Err(ApiError::BadRequest(anyhow::anyhow!( + "Attempted to change stripe size ({:?}->{new_stripe_size:?}) on a tenant with multiple shards", + shard_ident.stripe_size + ))); } shard_ident.stripe_size = new_stripe_size; @@ -5137,7 +5159,7 @@ impl Service { let client = PageserverClient::new( node.get_id(), node.base_url(), - self.config.jwt_token.as_deref(), + self.config.pageserver_jwt_token.as_deref(), ); let response = client .tenant_shard_split( @@ -5214,8 +5236,11 @@ impl Service { ) .await { - tracing::warn!("Failed to update compute of {}->{} during split, proceeding anyway to complete split ({e})", - child_id, child_ps); + tracing::warn!( + "Failed to update compute of {}->{} during split, proceeding anyway to complete split ({e})", + child_id, + child_ps + ); failed_notifications.push(child_id); } } @@ -5271,9 +5296,13 @@ impl Service { match shard.policy { PlacementPolicy::Attached(n) => { // If our new attached node was a secondary, it no longer should be. - shard.intent.remove_secondary(scheduler, migrate_req.node_id); + shard + .intent + .remove_secondary(scheduler, migrate_req.node_id); - shard.intent.set_attached(scheduler, Some(migrate_req.node_id)); + shard + .intent + .set_attached(scheduler, Some(migrate_req.node_id)); // If we were already attached to something, demote that to a secondary if let Some(old_attached) = old_attached { @@ -5294,7 +5323,7 @@ impl Service { PlacementPolicy::Detached => { return Err(ApiError::BadRequest(anyhow::anyhow!( "Cannot migrate a tenant that is PlacementPolicy::Detached: configure it to an attached policy first" - ))) + ))); } } @@ -5355,7 +5384,9 @@ impl Service { shard.intent ); } else if shard.intent.get_attached() == &Some(migrate_req.node_id) { - tracing::info!("Migrating secondary to {node}: already attached where we were asked to create a secondary"); + tracing::info!( + "Migrating secondary to {node}: already attached where we were asked to create a secondary" + ); } else { let old_secondaries = shard.intent.get_secondary().clone(); for secondary in old_secondaries { @@ -5463,7 +5494,7 @@ impl Service { let client = PageserverClient::new( node.get_id(), node.base_url(), - self.config.jwt_token.as_deref(), + self.config.pageserver_jwt_token.as_deref(), ); let scan_result = client @@ -5868,7 +5899,7 @@ impl Service { return Err(ApiError::InternalServerError(anyhow::anyhow!( "{} attached as primary+secondary on the same node", tid - ))) + ))); } (true, false) => Some(false), (false, true) => Some(true), @@ -5907,8 +5938,10 @@ impl Service { ) .await; + #[derive(PartialEq)] enum RegistrationStatus { - Matched, + UpToDate, + NeedUpdate, Mismatched, New, } @@ -5917,7 +5950,11 @@ impl Service { let locked = self.inner.read().unwrap(); if let Some(node) = locked.nodes.get(®ister_req.node_id) { if node.registration_match(®ister_req) { - RegistrationStatus::Matched + if node.need_update(®ister_req) { + RegistrationStatus::NeedUpdate + } else { + RegistrationStatus::UpToDate + } } else { RegistrationStatus::Mismatched } @@ -5927,9 +5964,9 @@ impl Service { }; match registration_status { - RegistrationStatus::Matched => { + RegistrationStatus::UpToDate => { tracing::info!( - "Node {} re-registered with matching address", + "Node {} 
re-registered with matching address and is up to date", register_req.node_id ); @@ -5947,7 +5984,7 @@ impl Service { "Node is already registered with different address".to_string(), )); } - RegistrationStatus::New => { + RegistrationStatus::New | RegistrationStatus::NeedUpdate => { // fallthrough } } @@ -5976,6 +6013,16 @@ impl Service { )); } + if self.config.use_https_pageserver_api && register_req.listen_https_port.is_none() { + return Err(ApiError::PreconditionFailed( + format!( + "Node {} has no https port, but use_https is enabled", + register_req.node_id + ) + .into(), + )); + } + // Ordering: we must persist the new node _before_ adding it to in-memory state. // This ensures that before we use it for anything or expose it via any external // API, it is guaranteed to be available after a restart. @@ -5983,13 +6030,29 @@ impl Service { register_req.node_id, register_req.listen_http_addr, register_req.listen_http_port, + register_req.listen_https_port, register_req.listen_pg_addr, register_req.listen_pg_port, register_req.availability_zone_id.clone(), + self.config.use_https_pageserver_api, ); + let new_node = match new_node { + Ok(new_node) => new_node, + Err(error) => return Err(ApiError::InternalServerError(error)), + }; - // TODO: idempotency if the node already exists in the database - self.persistence.insert_node(&new_node).await?; + match registration_status { + RegistrationStatus::New => self.persistence.insert_node(&new_node).await?, + RegistrationStatus::NeedUpdate => { + self.persistence + .update_node_on_registration( + register_req.node_id, + register_req.listen_https_port, + ) + .await? + } + _ => unreachable!("Other statuses have been processed earlier"), + } let mut locked = self.inner.write().unwrap(); let mut new_nodes = (*locked.nodes).clone(); @@ -6004,12 +6067,24 @@ impl Service { .storage_controller_pageserver_nodes .set(locked.nodes.len() as i64); - tracing::info!( - "Registered pageserver {} ({}), now have {} pageservers", - register_req.node_id, - register_req.availability_zone_id, - locked.nodes.len() - ); + match registration_status { + RegistrationStatus::New => { + tracing::info!( + "Registered pageserver {} ({}), now have {} pageservers", + register_req.node_id, + register_req.availability_zone_id, + locked.nodes.len() + ); + } + RegistrationStatus::NeedUpdate => { + tracing::info!( + "Re-registered and updated node {} ({})", + register_req.node_id, + register_req.availability_zone_id, + ); + } + _ => unreachable!("Other statuses have been processed earlier"), + } Ok(()) } @@ -6027,7 +6102,9 @@ impl Service { if let Some(scheduling) = scheduling { // Scheduling is a persistent part of Node: we must write updates to the database before // applying them in memory - self.persistence.update_node(node_id, scheduling).await?; + self.persistence + .update_node_scheduling_policy(node_id, scheduling) + .await?; } // If we're activating a node, then before setting it active we must reconcile any shard locations @@ -6598,11 +6675,12 @@ impl Service { ) -> Option { let reconcile_needed = shard.get_reconcile_needed(nodes); - match reconcile_needed { + let reconcile_reason = match reconcile_needed { ReconcileNeeded::No => return None, ReconcileNeeded::WaitExisting(waiter) => return Some(waiter), - ReconcileNeeded::Yes => { + ReconcileNeeded::Yes(reason) => { // Fall through to try and acquire units for spawning reconciler + reason } }; @@ -6641,6 +6719,7 @@ impl Service { }; shard.spawn_reconciler( + reconcile_reason, &self.result_tx, nodes, &self.compute_hook, @@ 
-6765,7 +6844,7 @@ impl Service { // with the frequency of background calls, this acts as an implicit rate limit that runs a small // trickle of optimizations in the background, rather than executing a large number in parallel // when a change occurs. - const MAX_OPTIMIZATIONS_EXEC_PER_PASS: usize = 2; + const MAX_OPTIMIZATIONS_EXEC_PER_PASS: usize = 16; // Synchronous prepare: scan shards for possible scheduling optimizations let candidate_work = self.optimize_all_plan(); @@ -6816,7 +6895,7 @@ impl Service { // How many candidate optimizations we will generate, before evaluating them for readniess: setting // this higher than the execution limit gives us a chance to execute some work even if the first // few optimizations we find are not ready. - const MAX_OPTIMIZATIONS_PLAN_PER_PASS: usize = 8; + const MAX_OPTIMIZATIONS_PLAN_PER_PASS: usize = 64; let mut work = Vec::new(); let mut locked = self.inner.write().unwrap(); @@ -6863,12 +6942,16 @@ impl Service { // Check that maybe_optimizable doesn't disagree with the actual optimization functions. // Only do this in testing builds because it is not a correctness-critical check, so we shouldn't // panic in prod if we hit this, or spend cycles on it in prod. - assert!(shard - .optimize_attachment(scheduler, &schedule_context) - .is_none()); - assert!(shard - .optimize_secondary(scheduler, &schedule_context) - .is_none()); + assert!( + shard + .optimize_attachment(scheduler, &schedule_context) + .is_none() + ); + assert!( + shard + .optimize_secondary(scheduler, &schedule_context) + .is_none() + ); } continue; } @@ -6924,7 +7007,9 @@ impl Service { } Some(node) => { if !node.is_available() { - tracing::info!("Skipping optimization migration of {tenant_shard_id} to {new_attached_node_id} because node unavailable"); + tracing::info!( + "Skipping optimization migration of {tenant_shard_id} to {new_attached_node_id} because node unavailable" + ); } else { // Accumulate optimizations that require fetching secondary status, so that we can execute these // remote API requests concurrently. 
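A note on the node registration change above: re-registration is no longer a simple matched/mismatched decision. A node that matches its stored record but advertises something new (here, an https listen port) is classified as needing an in-place update, and only that field is persisted via the new `update_node_on_registration` path. The sketch below is illustrative only; `NodeRecord`, `classify`, and the field set are hypothetical names, not the storage controller's actual types.

```rust
// Illustrative sketch only: `NodeRecord` and `classify` are hypothetical names,
// not the storage controller's real types. The point is the three-way outcome:
// an otherwise-matching node that newly advertises an https port is updated in
// place rather than rejected or re-inserted.
#[derive(Debug, PartialEq)]
enum RegistrationStatus {
    UpToDate,
    NeedUpdate,
    Mismatched,
    New,
}

struct NodeRecord {
    listen_http_port: u16,
    listen_https_port: Option<u16>,
}

fn classify(existing: Option<&NodeRecord>, req: &NodeRecord) -> RegistrationStatus {
    match existing {
        None => RegistrationStatus::New,
        // Core identity (addresses/ports we already track) must match exactly.
        Some(node) if node.listen_http_port != req.listen_http_port => {
            RegistrationStatus::Mismatched
        }
        // Same node, but the request carries information we have not persisted yet.
        Some(node) if node.listen_https_port != req.listen_https_port => {
            RegistrationStatus::NeedUpdate
        }
        Some(_) => RegistrationStatus::UpToDate,
    }
}

fn main() {
    let stored = NodeRecord { listen_http_port: 9898, listen_https_port: None };
    let req = NodeRecord { listen_http_port: 9898, listen_https_port: Some(9899) };
    assert_eq!(classify(Some(&stored), &req), RegistrationStatus::NeedUpdate);
    assert_eq!(classify(None, &req), RegistrationStatus::New);
}
```

The same hunk also adds a precondition: when `use_https_pageserver_api` is enabled, a registration request without an https port is rejected up front with a 412-style error.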
@@ -6970,7 +7055,9 @@ impl Service { { match secondary_status { Err(e) => { - tracing::info!("Skipping migration of {tenant_shard_id} to {node}, error querying secondary: {e}"); + tracing::info!( + "Skipping migration of {tenant_shard_id} to {node}, error querying secondary: {e}" + ); } Ok(progress) => { // We require secondary locations to have less than 10GiB of downloads pending before we will use @@ -6983,7 +7070,9 @@ impl Service { || progress.bytes_total - progress.bytes_downloaded > DOWNLOAD_FRESHNESS_THRESHOLD { - tracing::info!("Skipping migration of {tenant_shard_id} to {node} because secondary isn't ready: {progress:?}"); + tracing::info!( + "Skipping migration of {tenant_shard_id} to {node} because secondary isn't ready: {progress:?}" + ); #[cfg(feature = "testing")] if progress.heatmap_mtime.is_none() { @@ -7043,7 +7132,7 @@ impl Service { match attached_node .with_client_retries( |client| async move { client.tenant_heatmap_upload(tenant_shard_id).await }, - &self.config.jwt_token, + &self.config.pageserver_jwt_token, 3, 10, SHORT_RECONCILE_TIMEOUT, @@ -7079,7 +7168,7 @@ impl Service { ) .await }, - &self.config.jwt_token, + &self.config.pageserver_jwt_token, 3, 10, SHORT_RECONCILE_TIMEOUT, @@ -7089,14 +7178,18 @@ impl Service { { Some(Err(e)) => { tracing::info!( - "Failed to download heatmap from {secondary_node} for {tenant_shard_id}: {e}" - ); + "Failed to download heatmap from {secondary_node} for {tenant_shard_id}: {e}" + ); } None => { - tracing::info!("Cancelled while downloading heatmap from {secondary_node} for {tenant_shard_id}"); + tracing::info!( + "Cancelled while downloading heatmap from {secondary_node} for {tenant_shard_id}" + ); } Some(Ok(progress)) => { - tracing::info!("Successfully downloaded heatmap from {secondary_node} for {tenant_shard_id}: {progress:?}"); + tracing::info!( + "Successfully downloaded heatmap from {secondary_node} for {tenant_shard_id}: {progress:?}" + ); } } } @@ -7134,7 +7227,7 @@ impl Service { let request = request_ref.clone(); client.top_tenant_shards(request.clone()).await }, - &self.config.jwt_token, + &self.config.pageserver_jwt_token, 3, 3, Duration::from_secs(5), @@ -7181,7 +7274,9 @@ impl Service { // We spawn a task to run this, so it's exactly like some external API client requesting it. We don't // want to block the background reconcile loop on this. 
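On the secondary-warmth check above: the controller only migrates an attachment onto a secondary whose heatmap exists and whose pending downloads are below a freshness threshold (the comment in this diff puts it at 10 GiB). A minimal sketch of that predicate follows; the field and constant names mirror what is visible in the diff but should be treated as assumptions rather than the exact pageserver API.

```rust
// Sketch of the "is this secondary warm enough to receive the attachment?" check.
use std::time::SystemTime;

struct SecondaryProgress {
    heatmap_mtime: Option<SystemTime>,
    bytes_total: u64,
    bytes_downloaded: u64,
}

const DOWNLOAD_FRESHNESS_THRESHOLD: u64 = 10 * 1024 * 1024 * 1024; // 10 GiB pending

fn secondary_ready(p: &SecondaryProgress) -> bool {
    // No heatmap yet means the secondary has never warmed up; too many pending
    // bytes means cutting over now would cause a long cold start on the new node.
    p.heatmap_mtime.is_some()
        && p.bytes_total.saturating_sub(p.bytes_downloaded) <= DOWNLOAD_FRESHNESS_THRESHOLD
}

fn main() {
    let cold = SecondaryProgress { heatmap_mtime: None, bytes_total: 1 << 30, bytes_downloaded: 0 };
    let warm = SecondaryProgress {
        heatmap_mtime: Some(SystemTime::now()),
        bytes_total: 1 << 30,
        bytes_downloaded: 1 << 30,
    };
    assert!(!secondary_ready(&cold));
    assert!(secondary_ready(&warm));
}
```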
- tracing::info!("Auto-splitting tenant for size threshold {split_threshold}: current size {split_candidate:?}"); + tracing::info!( + "Auto-splitting tenant for size threshold {split_threshold}: current size {split_candidate:?}" + ); let this = self.clone(); tokio::spawn( @@ -7307,7 +7402,7 @@ impl Service { match node .with_client_retries( |client| async move { client.tenant_secondary_status(tenant_shard_id).await }, - &self.config.jwt_token, + &self.config.pageserver_jwt_token, 1, 3, Duration::from_millis(250), diff --git a/storage_controller/src/service/chaos_injector.rs b/storage_controller/src/service/chaos_injector.rs index aa0ee0df5a..2ff68d7037 100644 --- a/storage_controller/src/service/chaos_injector.rs +++ b/storage_controller/src/service/chaos_injector.rs @@ -1,8 +1,6 @@ -use std::{ - collections::{BTreeMap, HashMap}, - sync::Arc, - time::Duration, -}; +use std::collections::{BTreeMap, HashMap}; +use std::sync::Arc; +use std::time::Duration; use pageserver_api::controller_api::ShardSchedulingPolicy; use rand::seq::SliceRandom; @@ -16,29 +14,80 @@ use super::{Node, Scheduler, Service, TenantShard}; pub struct ChaosInjector { service: Arc, interval: Duration, + chaos_exit_crontab: Option, +} + +fn cron_to_next_duration(cron: &cron::Schedule) -> anyhow::Result { + use chrono::Utc; + let next = cron.upcoming(Utc).next().unwrap(); + let duration = (next - Utc::now()).to_std()?; + Ok(tokio::time::sleep(duration)) +} + +async fn maybe_sleep(sleep: Option) -> Option<()> { + if let Some(sleep) = sleep { + sleep.await; + Some(()) + } else { + None + } } impl ChaosInjector { - pub fn new(service: Arc, interval: Duration) -> Self { - Self { service, interval } + pub fn new( + service: Arc, + interval: Duration, + chaos_exit_crontab: Option, + ) -> Self { + Self { + service, + interval, + chaos_exit_crontab, + } } pub async fn run(&mut self, cancel: CancellationToken) { let mut interval = tokio::time::interval(self.interval); - - loop { - tokio::select! { - _ = interval.tick() => {} - _ = cancel.cancelled() => { - tracing::info!("Shutting down"); - return; + let cron_interval = { + if let Some(ref chaos_exit_crontab) = self.chaos_exit_crontab { + match cron_to_next_duration(chaos_exit_crontab) { + Ok(interval_exit) => Some(interval_exit), + Err(e) => { + tracing::error!("Error processing the cron schedule: {e}"); + None + } } + } else { + None } - - self.inject_chaos().await; - - tracing::info!("Chaos iteration..."); + }; + enum ChaosEvent { + ShuffleTenant, + ForceKill, } + let chaos_type = tokio::select! 
{ + _ = interval.tick() => { + ChaosEvent::ShuffleTenant + } + Some(_) = maybe_sleep(cron_interval) => { + ChaosEvent::ForceKill + } + _ = cancel.cancelled() => { + tracing::info!("Shutting down"); + return; + } + }; + + match chaos_type { + ChaosEvent::ShuffleTenant => { + self.inject_chaos().await; + } + ChaosEvent::ForceKill => { + self.force_kill().await; + } + } + + tracing::info!("Chaos iteration..."); } /// If a shard has a secondary and attached location, then re-assign the secondary to be @@ -95,6 +144,11 @@ impl ChaosInjector { ); } + async fn force_kill(&mut self) { + tracing::warn!("Injecting chaos: force kill"); + std::process::exit(1); + } + async fn inject_chaos(&mut self) { // Pick some shards to interfere with let batch_size = 128; @@ -120,12 +174,19 @@ impl ChaosInjector { let mut victims = Vec::with_capacity(batch_size); if out_of_home_az.len() >= batch_size { - tracing::info!("Injecting chaos: found {batch_size} shards to migrate back to home AZ (total {} out of home AZ)", out_of_home_az.len()); + tracing::info!( + "Injecting chaos: found {batch_size} shards to migrate back to home AZ (total {} out of home AZ)", + out_of_home_az.len() + ); out_of_home_az.shuffle(&mut thread_rng()); victims.extend(out_of_home_az.into_iter().take(batch_size)); } else { - tracing::info!("Injecting chaos: found {} shards to migrate back to home AZ, picking {} random shards to migrate", out_of_home_az.len(), std::cmp::min(batch_size - out_of_home_az.len(), in_home_az.len())); + tracing::info!( + "Injecting chaos: found {} shards to migrate back to home AZ, picking {} random shards to migrate", + out_of_home_az.len(), + std::cmp::min(batch_size - out_of_home_az.len(), in_home_az.len()) + ); victims.extend(out_of_home_az); in_home_az.shuffle(&mut thread_rng()); diff --git a/storage_controller/src/service/context_iterator.rs b/storage_controller/src/service/context_iterator.rs index dd6913e988..c4784e5e36 100644 --- a/storage_controller/src/service/context_iterator.rs +++ b/storage_controller/src/service/context_iterator.rs @@ -54,17 +54,16 @@ impl<'a> Iterator for TenantShardContextIterator<'a> { #[cfg(test)] mod tests { - use std::{collections::BTreeMap, str::FromStr}; + use std::collections::BTreeMap; + use std::str::FromStr; use pageserver_api::controller_api::PlacementPolicy; use utils::shard::{ShardCount, ShardNumber}; - use crate::{ - scheduler::test_utils::make_test_nodes, service::Scheduler, - tenant_shard::tests::make_test_tenant_with_id, - }; - use super::*; + use crate::scheduler::test_utils::make_test_nodes; + use crate::service::Scheduler; + use crate::tenant_shard::tests::make_test_tenant_with_id; #[test] fn test_context_iterator() { diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index 219c0dffe7..34fd244023 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -1,50 +1,39 @@ -use std::{ - collections::{HashMap, HashSet}, - sync::Arc, - time::Duration, -}; +use std::collections::{HashMap, HashSet}; +use std::sync::Arc; +use std::time::Duration; -use crate::{ - metrics::{ - self, ReconcileCompleteLabelGroup, ReconcileLongRunningLabelGroup, ReconcileOutcome, - }, - persistence::TenantShardPersistence, - reconciler::{ReconcileUnits, ReconcilerConfig}, - scheduler::{ - AffinityScore, AttachedShardTag, NodeSchedulingScore, NodeSecondarySchedulingScore, - RefCountUpdate, ScheduleContext, SecondaryShardTag, ShardTag, - }, - service::ReconcileResultRequest, -}; use futures::future::{self, Either}; 
use itertools::Itertools; use pageserver_api::controller_api::{AvailabilityZone, PlacementPolicy, ShardSchedulingPolicy}; -use pageserver_api::{ - models::{LocationConfig, LocationConfigMode, TenantConfig}, - shard::{ShardIdentity, TenantShardId}, -}; +use pageserver_api::models::{LocationConfig, LocationConfigMode, TenantConfig}; +use pageserver_api::shard::{ShardIdentity, TenantShardId}; use serde::{Deserialize, Serialize}; use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; -use tracing::{instrument, Instrument}; -use utils::{ - generation::Generation, - id::NodeId, - seqwait::{SeqWait, SeqWaitError}, - shard::ShardCount, - sync::gate::GateGuard, -}; +use tracing::{Instrument, instrument}; +use utils::generation::Generation; +use utils::id::NodeId; +use utils::seqwait::{SeqWait, SeqWaitError}; +use utils::shard::ShardCount; +use utils::sync::gate::GateGuard; -use crate::{ - compute_hook::ComputeHook, - node::Node, - persistence::{split_state::SplitState, Persistence}, - reconciler::{ - attached_location_conf, secondary_location_conf, ReconcileError, Reconciler, TargetState, - }, - scheduler::{ScheduleError, Scheduler}, - service, Sequence, +use crate::compute_hook::ComputeHook; +use crate::metrics::{ + self, ReconcileCompleteLabelGroup, ReconcileLongRunningLabelGroup, ReconcileOutcome, }; +use crate::node::Node; +use crate::persistence::split_state::SplitState; +use crate::persistence::{Persistence, TenantShardPersistence}; +use crate::reconciler::{ + ReconcileError, ReconcileUnits, Reconciler, ReconcilerConfig, TargetState, + attached_location_conf, secondary_location_conf, +}; +use crate::scheduler::{ + AffinityScore, AttachedShardTag, NodeSchedulingScore, NodeSecondarySchedulingScore, + RefCountUpdate, ScheduleContext, ScheduleError, Scheduler, SecondaryShardTag, ShardTag, +}; +use crate::service::ReconcileResultRequest; +use crate::{Sequence, service}; /// Serialization helper fn read_last_error(v: &std::sync::Mutex>, serializer: S) -> Result @@ -481,7 +470,14 @@ pub(crate) enum ReconcileNeeded { /// spawned: wait for the existing reconciler rather than spawning a new one. WaitExisting(ReconcilerWaiter), /// shard needs reconciliation: call into [`TenantShard::spawn_reconciler`] - Yes, + Yes(ReconcileReason), +} + +#[derive(Debug)] +pub(crate) enum ReconcileReason { + ActiveNodesDirty, + UnknownLocation, + PendingComputeNotification, } /// Pending modification to the observed state of a tenant shard. @@ -828,7 +824,9 @@ impl TenantShard { let current_score = current_score.for_optimization(); if candidate_score < current_score { - tracing::info!("Found a lower scoring location! {candidate} is better than {current} ({candidate_score:?} is better than {current_score:?})"); + tracing::info!( + "Found a lower scoring location! {candidate} is better than {current} ({candidate_score:?} is better than {current_score:?})" + ); Some(true) } else { // The candidate node is no better than our current location, so don't migrate @@ -998,7 +996,7 @@ impl TenantShard { // most cases, even if some nodes are offline or have scheduling=pause set. 
debug_assert!(self.intent.attached.is_some()); // We should not make it here unless attached -- this - // logic presumes we are in a mode where we want secondaries to be in non-home AZ + // logic presumes we are in a mode where we want secondaries to be in non-home AZ if let Some(retain_secondary) = self.intent.get_secondary().iter().find(|n| { let in_home_az = scheduler.get_node_az(n) == self.intent.preferred_az_id; let is_available = secondary_scores @@ -1022,7 +1020,8 @@ impl TenantShard { } // Fall through: we didn't identify one to remove. This ought to be rare. - tracing::warn!("Keeping extra secondaries: can't determine which of {:?} to remove (some nodes offline?)", + tracing::warn!( + "Keeping extra secondaries: can't determine which of {:?} to remove (some nodes offline?)", self.intent.get_secondary() ); } else { @@ -1341,12 +1340,18 @@ impl TenantShard { let active_nodes_dirty = self.dirty(pageservers); - // Even if there is no pageserver work to be done, if we have a pending notification to computes, - // wake up a reconciler to send it. - let do_reconcile = - active_nodes_dirty || dirty_observed || self.pending_compute_notification; + let reconcile_needed = match ( + active_nodes_dirty, + dirty_observed, + self.pending_compute_notification, + ) { + (true, _, _) => ReconcileNeeded::Yes(ReconcileReason::ActiveNodesDirty), + (_, true, _) => ReconcileNeeded::Yes(ReconcileReason::UnknownLocation), + (_, _, true) => ReconcileNeeded::Yes(ReconcileReason::PendingComputeNotification), + _ => ReconcileNeeded::No, + }; - if !do_reconcile { + if matches!(reconcile_needed, ReconcileNeeded::No) { tracing::debug!("Not dirty, no reconciliation needed."); return ReconcileNeeded::No; } @@ -1389,7 +1394,7 @@ impl TenantShard { } } - ReconcileNeeded::Yes + reconcile_needed } /// Ensure the sequence number is set to a value where waiting for this value will make us wait @@ -1479,6 +1484,7 @@ impl TenantShard { #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))] pub(crate) fn spawn_reconciler( &mut self, + reason: ReconcileReason, result_tx: &tokio::sync::mpsc::UnboundedSender, pageservers: &Arc>, compute_hook: &Arc, @@ -1538,7 +1544,7 @@ impl TenantShard { let reconcile_seq = self.sequence; let long_reconcile_threshold = service_config.long_reconcile_threshold; - tracing::info!(seq=%reconcile_seq, "Spawning Reconciler for sequence {}", self.sequence); + tracing::info!(seq=%reconcile_seq, "Spawning Reconciler ({reason:?})"); let must_notify = self.pending_compute_notification; let reconciler_span = tracing::info_span!(parent: None, "reconciler", seq=%reconcile_seq, tenant_id=%reconciler.tenant_shard_id.tenant_id, @@ -1784,8 +1790,8 @@ impl TenantShard { let conf = observed.conf.as_ref()?; match (conf.generation, conf.mode) { - (Some(gen), AttachedMulti | AttachedSingle | AttachedStale) => { - Some((*node_id, gen)) + (Some(gen_), AttachedMulti | AttachedSingle | AttachedStale) => { + Some((*node_id, gen_)) } _ => None, } @@ -1793,7 +1799,7 @@ impl TenantShard { .sorted_by(|(_lhs_node_id, lhs_gen), (_rhs_node_id, rhs_gen)| { lhs_gen.cmp(rhs_gen).reverse() }) - .map(|(node_id, gen)| (node_id, Generation::new(gen))) + .map(|(node_id, gen_)| (node_id, Generation::new(gen_))) .collect() } @@ -1825,7 +1831,10 @@ impl TenantShard { (Some(crnt), Some(new)) if crnt_gen > new_gen => { tracing::warn!( "Skipping observed state update {}: {:?} and using None due to stale generation ({} > {})", - node_id, loc, crnt, new + node_id, + loc, + crnt, 
+ new ); self.observed @@ -1882,18 +1891,17 @@ impl Drop for TenantShard { #[cfg(test)] pub(crate) mod tests { - use std::{cell::RefCell, rc::Rc}; + use std::cell::RefCell; + use std::rc::Rc; - use pageserver_api::{ - controller_api::NodeAvailability, - shard::{ShardCount, ShardNumber}, - }; - use rand::{rngs::StdRng, SeedableRng}; + use pageserver_api::controller_api::NodeAvailability; + use pageserver_api::shard::{ShardCount, ShardNumber}; + use rand::SeedableRng; + use rand::rngs::StdRng; use utils::id::TenantId; - use crate::scheduler::test_utils::make_test_nodes; - use super::*; + use crate::scheduler::test_utils::make_test_nodes; fn make_test_tenant_shard(policy: PlacementPolicy) -> TenantShard { let tenant_id = TenantId::generate(); @@ -2071,16 +2079,20 @@ pub(crate) mod tests { // In pause mode, schedule() shouldn't do anything tenant_shard.scheduling_policy = ShardSchedulingPolicy::Pause; - assert!(tenant_shard - .schedule(&mut scheduler, &mut ScheduleContext::default()) - .is_ok()); + assert!( + tenant_shard + .schedule(&mut scheduler, &mut ScheduleContext::default()) + .is_ok() + ); assert!(tenant_shard.intent.all_pageservers().is_empty()); // In active mode, schedule() works tenant_shard.scheduling_policy = ShardSchedulingPolicy::Active; - assert!(tenant_shard - .schedule(&mut scheduler, &mut ScheduleContext::default()) - .is_ok()); + assert!( + tenant_shard + .schedule(&mut scheduler, &mut ScheduleContext::default()) + .is_ok() + ); assert!(!tenant_shard.intent.all_pageservers().is_empty()); tenant_shard.intent.clear(&mut scheduler); @@ -2607,9 +2619,11 @@ pub(crate) mod tests { ); let mut schedule_context = ScheduleContext::default(); for shard in &mut shards { - assert!(shard - .schedule(&mut scheduler, &mut schedule_context) - .is_ok()); + assert!( + shard + .schedule(&mut scheduler, &mut schedule_context) + .is_ok() + ); } // Initial: attached locations land in the tenant's home AZ. 
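The `ChaosInjector` change earlier in this diff adds an optional crontab-driven exit alongside the regular shuffle interval: it computes a `tokio::time::Sleep` lasting until the next cron occurrence and force-kills the process when that timer fires. Below is a self-contained sketch of the cron-to-sleep pattern, assuming the `cron`, `chrono`, `tokio`, and `anyhow` crates are available; the schedule string is only an example (the `cron` crate expects a leading seconds field).

```rust
// Minimal sketch: parse a crontab, find the next occurrence, sleep until then.
use std::str::FromStr;

use chrono::Utc;
use cron::Schedule;

fn sleep_until_next(schedule: &Schedule) -> anyhow::Result<tokio::time::Sleep> {
    let next = schedule
        .upcoming(Utc)
        .next()
        .ok_or_else(|| anyhow::anyhow!("cron schedule has no upcoming occurrence"))?;
    let duration = (next - Utc::now()).to_std()?;
    Ok(tokio::time::sleep(duration))
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    // Example: every day at 03:00:00 UTC.
    let schedule = Schedule::from_str("0 0 3 * * *")?;
    sleep_until_next(&schedule)?.await;
    println!("chaos exit point reached");
    Ok(())
}
```

Keeping the sleep as an `Option` (as the patch does with `maybe_sleep`) lets the same `tokio::select!` arm be disabled entirely when no crontab is configured.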
diff --git a/storage_scrubber/Cargo.toml b/storage_scrubber/Cargo.toml index 609f3bf009..7f6544b894 100644 --- a/storage_scrubber/Cargo.toml +++ b/storage_scrubber/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "storage_scrubber" version = "0.1.0" -edition.workspace = true +edition = "2024" license.workspace = true [dependencies] diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs index b42709868b..f0ba632fd4 100644 --- a/storage_scrubber/src/checks.rs +++ b/storage_scrubber/src/checks.rs @@ -1,12 +1,19 @@ use std::collections::{HashMap, HashSet}; use std::time::SystemTime; +use futures_util::StreamExt; use itertools::Itertools; +use pageserver::tenant::IndexPart; use pageserver::tenant::checks::check_valid_layermap; use pageserver::tenant::layer_map::LayerMap; use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; use pageserver::tenant::remote_timeline_client::manifest::TenantManifest; +use pageserver::tenant::remote_timeline_client::{ + parse_remote_index_path, parse_remote_tenant_manifest_path, remote_layer_path, +}; +use pageserver::tenant::storage_layer::LayerName; use pageserver_api::shard::ShardIndex; +use remote_storage::{GenericRemoteStorage, ListingObject, RemotePath}; use tokio_util::sync::CancellationToken; use tracing::{info, warn}; use utils::generation::Generation; @@ -15,14 +22,7 @@ use utils::shard::TenantShardId; use crate::cloud_admin_api::BranchData; use crate::metadata_stream::stream_listing; -use crate::{download_object_with_retries, RootTarget, TenantShardTimelineId}; -use futures_util::StreamExt; -use pageserver::tenant::remote_timeline_client::{ - parse_remote_index_path, parse_remote_tenant_manifest_path, remote_layer_path, -}; -use pageserver::tenant::storage_layer::LayerName; -use pageserver::tenant::IndexPart; -use remote_storage::{GenericRemoteStorage, ListingObject, RemotePath}; +use crate::{RootTarget, TenantShardTimelineId, download_object_with_retries}; pub(crate) struct TimelineAnalysis { /// Anomalies detected @@ -329,11 +329,11 @@ pub(crate) enum BlobDataParseResult { pub(crate) fn parse_layer_object_name(name: &str) -> Result<(LayerName, Generation), String> { match name.rsplit_once('-') { // FIXME: this is gross, just use a regex? 
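// Editorial note on the `gen` -> `gen_` renames that follow (an inference from the
// `edition = "2024"` bump in this crate's Cargo.toml above, not a statement made by
// the patch itself): `gen` is a reserved keyword in the Rust 2024 edition, so
// bindings that used to be called `gen` have to be renamed, hence `gen_`.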
- Some((layer_filename, gen)) if gen.len() == 8 => { + Some((layer_filename, gen_)) if gen_.len() == 8 => { let layer = layer_filename.parse::()?; - let gen = - Generation::parse_suffix(gen).ok_or("Malformed generation suffix".to_string())?; - Ok((layer, gen)) + let gen_ = + Generation::parse_suffix(gen_).ok_or("Malformed generation suffix".to_string())?; + Ok((layer, gen_)) } _ => Ok((name.parse::()?, Generation::none())), } @@ -423,9 +423,9 @@ async fn list_timeline_blobs_impl( tracing::info!("initdb archive preserved {key}"); } Some(maybe_layer_name) => match parse_layer_object_name(maybe_layer_name) { - Ok((new_layer, gen)) => { - tracing::debug!("Parsed layer key: {new_layer} {gen:?}"); - s3_layers.insert((new_layer, gen)); + Ok((new_layer, gen_)) => { + tracing::debug!("Parsed layer key: {new_layer} {gen_:?}"); + s3_layers.insert((new_layer, gen_)); } Err(e) => { tracing::info!("Error parsing {maybe_layer_name} as layer name: {e}"); @@ -465,7 +465,7 @@ async fn list_timeline_blobs_impl( .max_by_key(|i| i.1) .map(|(k, g)| (k.clone(), g)) { - Some((key, gen)) => (Some::(key.to_owned()), gen), + Some((key, gen_)) => (Some::(key.to_owned()), gen_), None => { // Legacy/missing case: one or zero index parts, which did not have a generation (index_part_keys.pop(), Generation::none()) @@ -521,7 +521,7 @@ async fn list_timeline_blobs_impl( }, unused_index_keys: index_part_keys, unknown_keys, - })) + })); } Err(index_parse_error) => errors.push(format!( "index_part.json body parsing error: {index_parse_error}" @@ -631,7 +631,7 @@ pub(crate) async fn list_tenant_manifests( .map(|(g, obj)| (*g, obj.clone())) .unwrap(); - manifests.retain(|(gen, _obj)| gen != &latest_generation); + manifests.retain(|(gen_, _obj)| gen_ != &latest_generation); let manifest_bytes = match download_object_with_retries(remote_client, &latest_listing_object.key).await { diff --git a/storage_scrubber/src/cloud_admin_api.rs b/storage_scrubber/src/cloud_admin_api.rs index b1dfe3a53f..5cf286c662 100644 --- a/storage_scrubber/src/cloud_admin_api.rs +++ b/storage_scrubber/src/cloud_admin_api.rs @@ -3,11 +3,9 @@ use std::error::Error as _; use chrono::{DateTime, Utc}; use futures::Future; use hex::FromHex; - -use reqwest::{header, Client, StatusCode, Url}; +use reqwest::{Client, StatusCode, Url, header}; use serde::Deserialize; use tokio::sync::Semaphore; - use tokio_util::sync::CancellationToken; use utils::backoff; use utils::id::{TenantId, TimelineId}; diff --git a/storage_scrubber/src/find_large_objects.rs b/storage_scrubber/src/find_large_objects.rs index 95d3af1453..efb05fb55e 100644 --- a/storage_scrubber/src/find_large_objects.rs +++ b/storage_scrubber/src/find_large_objects.rs @@ -5,10 +5,9 @@ use pageserver::tenant::storage_layer::LayerName; use remote_storage::ListingMode; use serde::{Deserialize, Serialize}; -use crate::{ - checks::parse_layer_object_name, init_remote, metadata_stream::stream_tenants, - stream_objects_with_retries, BucketConfig, NodeKind, -}; +use crate::checks::parse_layer_object_name; +use crate::metadata_stream::stream_tenants; +use crate::{BucketConfig, NodeKind, init_remote, stream_objects_with_retries}; #[derive(Serialize, Deserialize, Clone, Copy, PartialEq, Eq)] enum LargeObjectKind { diff --git a/storage_scrubber/src/garbage.rs b/storage_scrubber/src/garbage.rs index a4e5107e3d..e4f69a1669 100644 --- a/storage_scrubber/src/garbage.rs +++ b/storage_scrubber/src/garbage.rs @@ -3,11 +3,9 @@ //! Garbage means S3 objects which are either not referenced by any metadata, //! 
or are referenced by a control plane tenant/timeline in a deleted state. -use std::{ - collections::{HashMap, HashSet}, - sync::Arc, - time::Duration, -}; +use std::collections::{HashMap, HashSet}; +use std::sync::Arc; +use std::time::Duration; use anyhow::Context; use futures_util::TryStreamExt; @@ -16,13 +14,14 @@ use remote_storage::{GenericRemoteStorage, ListingMode, ListingObject, RemotePat use serde::{Deserialize, Serialize}; use tokio_stream::StreamExt; use tokio_util::sync::CancellationToken; -use utils::{backoff, id::TenantId}; +use utils::backoff; +use utils::id::TenantId; +use crate::cloud_admin_api::{CloudAdminApiClient, MaybeDeleted, ProjectData}; +use crate::metadata_stream::{stream_tenant_timelines, stream_tenants_maybe_prefix}; use crate::{ - cloud_admin_api::{CloudAdminApiClient, MaybeDeleted, ProjectData}, + BucketConfig, ConsoleConfig, MAX_RETRIES, NodeKind, TenantShardTimelineId, TraversingDepth, init_remote, list_objects_with_retries, - metadata_stream::{stream_tenant_timelines, stream_tenants_maybe_prefix}, - BucketConfig, ConsoleConfig, NodeKind, TenantShardTimelineId, TraversingDepth, MAX_RETRIES, }; #[derive(Serialize, Deserialize, Debug)] @@ -259,14 +258,21 @@ async fn find_garbage_inner( .await?; if let Some(object) = tenant_objects.keys.first() { if object.key.get_path().as_str().ends_with("heatmap-v1.json") { - tracing::info!("Tenant {tenant_shard_id}: is missing in console and is only a heatmap (known historic deletion bug)"); + tracing::info!( + "Tenant {tenant_shard_id}: is missing in console and is only a heatmap (known historic deletion bug)" + ); garbage.append_buggy(GarbageEntity::Tenant(tenant_shard_id)); continue; } else { - tracing::info!("Tenant {tenant_shard_id} is missing in console and contains one object: {}", object.key); + tracing::info!( + "Tenant {tenant_shard_id} is missing in console and contains one object: {}", + object.key + ); } } else { - tracing::info!("Tenant {tenant_shard_id} is missing in console appears to have been deleted while we ran"); + tracing::info!( + "Tenant {tenant_shard_id} is missing in console appears to have been deleted while we ran" + ); } } else { // A console-unknown tenant with timelines: check if these timelines only contain initdb.tar.zst, from the initial @@ -295,9 +301,13 @@ async fn find_garbage_inner( } if any_non_initdb { - tracing::info!("Tenant {tenant_shard_id}: is missing in console and contains timelines, one or more of which are more than just initdb"); + tracing::info!( + "Tenant {tenant_shard_id}: is missing in console and contains timelines, one or more of which are more than just initdb" + ); } else { - tracing::info!("Tenant {tenant_shard_id}: is missing in console and contains only timelines that only contain initdb"); + tracing::info!( + "Tenant {tenant_shard_id}: is missing in console and contains only timelines that only contain initdb" + ); garbage.append_buggy(GarbageEntity::Tenant(tenant_shard_id)); continue; } @@ -546,7 +556,9 @@ pub async fn purge_garbage( .any(|g| matches!(g.entity, GarbageEntity::Timeline(_))) && garbage_list.active_timeline_count == 0 { - anyhow::bail!("Refusing to purge a garbage list containing garbage timelines that reports 0 active timelines"); + anyhow::bail!( + "Refusing to purge a garbage list containing garbage timelines that reports 0 active timelines" + ); } let filtered_items = garbage_list diff --git a/storage_scrubber/src/lib.rs b/storage_scrubber/src/lib.rs index 224235098c..34e43fcc0b 100644 --- a/storage_scrubber/src/lib.rs +++ 
b/storage_scrubber/src/lib.rs @@ -17,15 +17,14 @@ use std::time::{Duration, SystemTime}; use anyhow::Context; use aws_config::retry::{RetryConfigBuilder, RetryMode}; +use aws_sdk_s3::Client; use aws_sdk_s3::config::Region; use aws_sdk_s3::error::DisplayErrorContext; -use aws_sdk_s3::Client; - use camino::{Utf8Path, Utf8PathBuf}; use clap::ValueEnum; use futures::{Stream, StreamExt}; -use pageserver::tenant::remote_timeline_client::{remote_tenant_path, remote_timeline_path}; use pageserver::tenant::TENANTS_SEGMENT_NAME; +use pageserver::tenant::remote_timeline_client::{remote_tenant_path, remote_timeline_path}; use pageserver_api::shard::TenantShardId; use remote_storage::{ DownloadOpts, GenericRemoteStorage, Listing, ListingMode, RemotePath, RemoteStorageConfig, @@ -38,7 +37,8 @@ use tokio::io::AsyncReadExt; use tokio_util::sync::CancellationToken; use tracing::{error, warn}; use tracing_appender::non_blocking::WorkerGuard; -use tracing_subscriber::{fmt, prelude::*, EnvFilter}; +use tracing_subscriber::prelude::*; +use tracing_subscriber::{EnvFilter, fmt}; use utils::fs_ext; use utils::id::{TenantId, TenantTimelineId, TimelineId}; @@ -411,10 +411,10 @@ async fn init_remote( let default_prefix = default_prefix_in_bucket(node_kind).to_string(); match &mut storage_config.0.storage { - RemoteStorageKind::AwsS3(ref mut config) => { + RemoteStorageKind::AwsS3(config) => { config.prefix_in_bucket.get_or_insert(default_prefix); } - RemoteStorageKind::AzureContainer(ref mut config) => { + RemoteStorageKind::AzureContainer(config) => { config.prefix_in_container.get_or_insert(default_prefix); } RemoteStorageKind::LocalFs { .. } => (), diff --git a/storage_scrubber/src/main.rs b/storage_scrubber/src/main.rs index fa6ee90b66..fb2ab02565 100644 --- a/storage_scrubber/src/main.rs +++ b/storage_scrubber/src/main.rs @@ -1,24 +1,20 @@ -use anyhow::{anyhow, bail, Context}; +use anyhow::{Context, anyhow, bail}; use camino::Utf8PathBuf; +use clap::{Parser, Subcommand}; use pageserver_api::controller_api::{MetadataHealthUpdateRequest, MetadataHealthUpdateResponse}; use pageserver_api::shard::TenantShardId; use reqwest::{Method, Url}; use storage_controller_client::control_api; -use storage_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode}; -use storage_scrubber::pageserver_physical_gc::GcMode; +use storage_scrubber::garbage::{PurgeMode, find_garbage, purge_garbage}; +use storage_scrubber::pageserver_physical_gc::{GcMode, pageserver_physical_gc}; use storage_scrubber::scan_pageserver_metadata::scan_pageserver_metadata; -use storage_scrubber::scan_safekeeper_metadata::DatabaseOrList; +use storage_scrubber::scan_safekeeper_metadata::{DatabaseOrList, scan_safekeeper_metadata}; use storage_scrubber::tenant_snapshot::SnapshotDownloader; -use storage_scrubber::{find_large_objects, ControllerClientConfig}; use storage_scrubber::{ - init_logging, pageserver_physical_gc::pageserver_physical_gc, - scan_safekeeper_metadata::scan_safekeeper_metadata, BucketConfig, ConsoleConfig, NodeKind, - TraversingDepth, + BucketConfig, ConsoleConfig, ControllerClientConfig, NodeKind, TraversingDepth, + find_large_objects, init_logging, }; - -use clap::{Parser, Subcommand}; use utils::id::TenantId; - use utils::{project_build_tag, project_git_version}; project_git_version!(GIT_VERSION); @@ -173,15 +169,23 @@ async fn main() -> anyhow::Result<()> { if let NodeKind::Safekeeper = node_kind { let db_or_list = match (timeline_lsns, dump_db_connstr) { (Some(timeline_lsns), _) => { - let timeline_lsns = 
serde_json::from_str(&timeline_lsns).context("parsing timeline_lsns")?; + let timeline_lsns = serde_json::from_str(&timeline_lsns) + .context("parsing timeline_lsns")?; DatabaseOrList::List(timeline_lsns) } (None, Some(dump_db_connstr)) => { - let dump_db_table = dump_db_table.ok_or_else(|| anyhow::anyhow!("dump_db_table not specified"))?; + let dump_db_table = dump_db_table + .ok_or_else(|| anyhow::anyhow!("dump_db_table not specified"))?; let tenant_ids = tenant_ids.iter().map(|tshid| tshid.tenant_id).collect(); - DatabaseOrList::Database { tenant_ids, connstr: dump_db_connstr, table: dump_db_table } + DatabaseOrList::Database { + tenant_ids, + connstr: dump_db_connstr, + table: dump_db_table, + } } - (None, None) => anyhow::bail!("neither `timeline_lsns` specified, nor `dump_db_connstr` and `dump_db_table`"), + (None, None) => anyhow::bail!( + "neither `timeline_lsns` specified, nor `dump_db_connstr` and `dump_db_table`" + ), }; let summary = scan_safekeeper_metadata(bucket_config.clone(), db_or_list).await?; if json { @@ -371,7 +375,9 @@ pub async fn scan_pageserver_metadata_cmd( exit_code: bool, ) -> anyhow::Result<()> { if controller_client.is_none() && post_to_storcon { - return Err(anyhow!("Posting pageserver scan health status to storage controller requires `--controller-api` and `--controller-jwt` to run")); + return Err(anyhow!( + "Posting pageserver scan health status to storage controller requires `--controller-api` and `--controller-jwt` to run" + )); } match scan_pageserver_metadata(bucket_config.clone(), tenant_shard_ids, verbose).await { Err(e) => { diff --git a/storage_scrubber/src/metadata_stream.rs b/storage_scrubber/src/metadata_stream.rs index 47447d681c..af2407856d 100644 --- a/storage_scrubber/src/metadata_stream.rs +++ b/storage_scrubber/src/metadata_stream.rs @@ -1,17 +1,17 @@ use std::str::FromStr; -use anyhow::{anyhow, Context}; +use anyhow::{Context, anyhow}; use async_stream::{stream, try_stream}; use futures::StreamExt; +use pageserver_api::shard::TenantShardId; use remote_storage::{GenericRemoteStorage, ListingMode, ListingObject, RemotePath}; use tokio_stream::Stream; +use utils::id::{TenantId, TimelineId}; use crate::{ - list_objects_with_retries, stream_objects_with_retries, RootTarget, S3Target, - TenantShardTimelineId, + RootTarget, S3Target, TenantShardTimelineId, list_objects_with_retries, + stream_objects_with_retries, }; -use pageserver_api::shard::TenantShardId; -use utils::id::{TenantId, TimelineId}; /// Given a remote storage and a target, output a stream of TenantIds discovered via listing prefixes pub fn stream_tenants<'a>( diff --git a/storage_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs index 063c6bcfb9..c956b1abbc 100644 --- a/storage_scrubber/src/pageserver_physical_gc.rs +++ b/storage_scrubber/src/pageserver_physical_gc.rs @@ -2,22 +2,16 @@ use std::collections::{BTreeMap, BTreeSet, HashMap}; use std::sync::Arc; use std::time::Duration; -use crate::checks::{ - list_tenant_manifests, list_timeline_blobs, BlobDataParseResult, ListTenantManifestResult, - RemoteTenantManifestInfo, -}; -use crate::metadata_stream::{stream_tenant_timelines, stream_tenants}; -use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId, MAX_RETRIES}; use async_stream::try_stream; use futures::future::Either; use futures_util::{StreamExt, TryStreamExt}; +use pageserver::tenant::IndexPart; use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; use 
pageserver::tenant::remote_timeline_client::manifest::OffloadedTimelineManifest; use pageserver::tenant::remote_timeline_client::{ parse_remote_index_path, parse_remote_tenant_manifest_path, remote_layer_path, }; use pageserver::tenant::storage_layer::LayerName; -use pageserver::tenant::IndexPart; use pageserver_api::controller_api::TenantDescribeResponse; use pageserver_api::shard::{ShardIndex, TenantShardId}; use remote_storage::{GenericRemoteStorage, ListingObject, RemotePath}; @@ -25,11 +19,18 @@ use reqwest::Method; use serde::Serialize; use storage_controller_client::control_api; use tokio_util::sync::CancellationToken; -use tracing::{info_span, Instrument}; +use tracing::{Instrument, info_span}; use utils::backoff; use utils::generation::Generation; use utils::id::{TenantId, TenantTimelineId}; +use crate::checks::{ + BlobDataParseResult, ListTenantManifestResult, RemoteTenantManifestInfo, list_tenant_manifests, + list_timeline_blobs, +}; +use crate::metadata_stream::{stream_tenant_timelines, stream_tenants}; +use crate::{BucketConfig, MAX_RETRIES, NodeKind, RootTarget, TenantShardTimelineId, init_remote}; + #[derive(Serialize, Default)] pub struct GcSummary { indices_deleted: usize, diff --git a/storage_scrubber/src/scan_pageserver_metadata.rs b/storage_scrubber/src/scan_pageserver_metadata.rs index a31fb5b242..ba75f25984 100644 --- a/storage_scrubber/src/scan_pageserver_metadata.rs +++ b/storage_scrubber/src/scan_pageserver_metadata.rs @@ -1,21 +1,22 @@ use std::collections::{HashMap, HashSet}; -use crate::checks::{ - branch_cleanup_and_check_errors, list_timeline_blobs, BlobDataParseResult, - RemoteTimelineBlobData, TenantObjectListing, TimelineAnalysis, -}; -use crate::metadata_stream::{stream_tenant_timelines, stream_tenants}; -use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId}; use futures_util::{StreamExt, TryStreamExt}; use pageserver::tenant::remote_timeline_client::remote_layer_path; use pageserver_api::controller_api::MetadataHealthUpdateRequest; use pageserver_api::shard::TenantShardId; use remote_storage::GenericRemoteStorage; use serde::Serialize; -use tracing::{info_span, Instrument}; +use tracing::{Instrument, info_span}; use utils::id::TenantId; use utils::shard::ShardCount; +use crate::checks::{ + BlobDataParseResult, RemoteTimelineBlobData, TenantObjectListing, TimelineAnalysis, + branch_cleanup_and_check_errors, list_timeline_blobs, +}; +use crate::metadata_stream::{stream_tenant_timelines, stream_tenants}; +use crate::{BucketConfig, NodeKind, RootTarget, TenantShardTimelineId, init_remote}; + #[derive(Serialize, Default)] pub struct MetadataSummary { tenant_count: usize, diff --git a/storage_scrubber/src/scan_safekeeper_metadata.rs b/storage_scrubber/src/scan_safekeeper_metadata.rs index 0a4d4266a0..f10d758097 100644 --- a/storage_scrubber/src/scan_safekeeper_metadata.rs +++ b/storage_scrubber/src/scan_safekeeper_metadata.rs @@ -1,23 +1,24 @@ -use std::{collections::HashSet, str::FromStr, sync::Arc}; +use std::collections::HashSet; +use std::str::FromStr; +use std::sync::Arc; -use anyhow::{bail, Context}; +use anyhow::{Context, bail}; use futures::stream::{StreamExt, TryStreamExt}; use once_cell::sync::OnceCell; use pageserver_api::shard::TenantShardId; -use postgres_ffi::{XLogFileName, PG_TLI}; +use postgres_ffi::{PG_TLI, XLogFileName}; use remote_storage::GenericRemoteStorage; use rustls::crypto::ring; use serde::Serialize; use tokio_postgres::types::PgLsn; use tracing::{debug, error, info}; -use utils::{ - id::{TenantId, 
TenantTimelineId, TimelineId}, - lsn::Lsn, -}; +use utils::id::{TenantId, TenantTimelineId, TimelineId}; +use utils::lsn::Lsn; +use crate::cloud_admin_api::CloudAdminApiClient; +use crate::metadata_stream::stream_listing; use crate::{ - cloud_admin_api::CloudAdminApiClient, init_remote, metadata_stream::stream_listing, - BucketConfig, ConsoleConfig, NodeKind, RootTarget, TenantShardTimelineId, + BucketConfig, ConsoleConfig, NodeKind, RootTarget, TenantShardTimelineId, init_remote, }; /// Generally we should ask safekeepers, but so far we use everywhere default 16MB. diff --git a/storage_scrubber/src/tenant_snapshot.rs b/storage_scrubber/src/tenant_snapshot.rs index 60e79fb859..e17409c20e 100644 --- a/storage_scrubber/src/tenant_snapshot.rs +++ b/storage_scrubber/src/tenant_snapshot.rs @@ -1,25 +1,26 @@ use std::collections::HashMap; use std::sync::Arc; -use crate::checks::{list_timeline_blobs, BlobDataParseResult, RemoteTimelineBlobData}; -use crate::metadata_stream::{stream_tenant_shards, stream_tenant_timelines}; -use crate::{ - download_object_to_file_s3, init_remote, init_remote_s3, BucketConfig, NodeKind, RootTarget, - TenantShardTimelineId, -}; use anyhow::Context; use async_stream::stream; use aws_sdk_s3::Client; use camino::Utf8PathBuf; use futures::{StreamExt, TryStreamExt}; +use pageserver::tenant::IndexPart; use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; use pageserver::tenant::storage_layer::LayerName; -use pageserver::tenant::IndexPart; use pageserver_api::shard::TenantShardId; use remote_storage::{GenericRemoteStorage, S3Config}; use utils::generation::Generation; use utils::id::TenantId; +use crate::checks::{BlobDataParseResult, RemoteTimelineBlobData, list_timeline_blobs}; +use crate::metadata_stream::{stream_tenant_shards, stream_tenant_timelines}; +use crate::{ + BucketConfig, NodeKind, RootTarget, TenantShardTimelineId, download_object_to_file_s3, + init_remote, init_remote_s3, +}; + pub struct SnapshotDownloader { s3_client: Arc, s3_root: RootTarget, diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 58c5dbfd29..5159ad4e3b 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1167,15 +1167,15 @@ class NeonEnv: "max_batch_size": 32, } - # Concurrent IO (https://github.com/neondatabase/neon/issues/9378): - # enable concurrent IO by default in tests and benchmarks. - # Compat tests are exempt because old versions fail to parse the new config. 
- get_vectored_concurrent_io = self.pageserver_get_vectored_concurrent_io if config.test_may_use_compatibility_snapshot_binaries: log.info( - "Forcing use of binary-built-in default to avoid forward-compatibility related test failures" + "Skipping WAL contiguity validation to avoid forward-compatibility related test failures" ) - get_vectored_concurrent_io = None + else: + # Look for gaps in WAL received from safekeepeers + ps_cfg["validate_wal_contiguity"] = True + + get_vectored_concurrent_io = self.pageserver_get_vectored_concurrent_io if get_vectored_concurrent_io is not None: ps_cfg["get_vectored_concurrent_io"] = { "mode": self.pageserver_get_vectored_concurrent_io, @@ -1630,6 +1630,7 @@ def neon_env_builder( class PageserverPort: pg: int http: int + https: int | None = None class LogUtils: @@ -1886,6 +1887,7 @@ class NeonStorageController(MetricsGetter, LogUtils): "node_id": int(node.id), "listen_http_addr": "localhost", "listen_http_port": node.service_port.http, + "listen_https_port": node.service_port.https, "listen_pg_addr": "localhost", "listen_pg_port": node.service_port.pg, "availability_zone_id": node.az_id, @@ -4521,33 +4523,6 @@ class Safekeeper(LogUtils): for na in not_allowed: assert not self.log_contains(na) - def append_logical_message( - self, tenant_id: TenantId, timeline_id: TimelineId, request: dict[str, Any] - ) -> dict[str, Any]: - """ - Send JSON_CTRL query to append LogicalMessage to WAL and modify - safekeeper state. It will construct LogicalMessage from provided - prefix and message, and then will write it to WAL. - """ - - # "replication=0" hacks psycopg not to send additional queries - # on startup, see https://github.com/psycopg/psycopg2/pull/482 - token = self.env.auth_keys.generate_tenant_token(tenant_id) - connstr = f"host=localhost port={self.port.pg} password={token} replication=0 options='-c timeline_id={timeline_id} tenant_id={tenant_id}'" - - with closing(psycopg2.connect(connstr)) as conn: - # server doesn't support transactions - conn.autocommit = True - with conn.cursor() as cur: - request_json = json.dumps(request) - log.info(f"JSON_CTRL request on port {self.port.pg}: {request_json}") - cur.execute("JSON_CTRL " + request_json) - all = cur.fetchall() - log.info(f"JSON_CTRL response: {all[0][0]}") - res = json.loads(all[0][0]) - assert isinstance(res, dict) - return res - def http_client( self, auth_token: str | None = None, gen_sk_wide_token: bool = True ) -> SafekeeperHttpClient: diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py index 493ce7334e..7038d87aba 100644 --- a/test_runner/fixtures/safekeeper/http.py +++ b/test_runner/fixtures/safekeeper/http.py @@ -273,10 +273,22 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter): assert isinstance(res_json, dict) return res_json + def timeline_exclude( + self, tenant_id: TenantId, timeline_id: TimelineId, to: Configuration + ) -> dict[str, Any]: + res = self.put( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/exclude", + data=to.to_json(), + ) + res.raise_for_status() + res_json = res.json() + assert isinstance(res_json, dict) + return res_json + def membership_switch( self, tenant_id: TenantId, timeline_id: TimelineId, to: Configuration ) -> TimelineMembershipSwitchResponse: - res = self.post( + res = self.put( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/membership", data=to.to_json(), ) diff --git a/test_runner/performance/test_perf_many_relations.py 
b/test_runner/performance/test_perf_many_relations.py index 0ee0efe8b9..2570c55f6c 100644 --- a/test_runner/performance/test_perf_many_relations.py +++ b/test_runner/performance/test_perf_many_relations.py @@ -2,8 +2,10 @@ import os from pathlib import Path import pytest +from fixtures.benchmark_fixture import NeonBenchmarker from fixtures.compare_fixtures import RemoteCompare from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder def get_num_relations(default: int = 1000) -> list[int]: @@ -64,3 +66,52 @@ def test_perf_many_relations(remote_compare: RemoteCompare, num_relations: int): env.pg_bin.run_capture( ["psql", env.pg.connstr(options="-cstatement_timeout=1000s "), "-c", sql] ) + + +def test_perf_simple_many_relations_reldir_v2( + neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker +): + """ + Test creating many relations in a single database. + """ + env = neon_env_builder.init_start(initial_tenant_conf={"rel_size_v2_enabled": "true"}) + ep = env.endpoints.create_start( + "main", + config_lines=[ + "shared_buffers=1000MB", + "max_locks_per_transaction=16384", + ], + ) + + n = 100000 + step = 5000 + # Create many relations + log.info(f"Creating {n} relations...") + begin = 0 + with zenbenchmark.record_duration("create_first_relation"): + ep.safe_psql("CREATE TABLE IF NOT EXISTS table_begin (id SERIAL PRIMARY KEY, data TEXT)") + with zenbenchmark.record_duration("create_many_relations"): + while True: + end = begin + step + ep.safe_psql_many( + [ + "BEGIN", + f"""DO $$ + DECLARE + i INT; + table_name TEXT; + BEGIN + FOR i IN {begin}..{end} LOOP + table_name := 'table_' || i; + EXECUTE 'CREATE TABLE IF NOT EXISTS ' || table_name || ' (id SERIAL PRIMARY KEY, data TEXT)'; + END LOOP; + END $$; + """, + "COMMIT", + ] + ) + begin = end + if begin >= n: + break + with zenbenchmark.record_duration("create_last_relation"): + ep.safe_psql(f"CREATE TABLE IF NOT EXISTS table_{begin} (id SERIAL PRIMARY KEY, data TEXT)") diff --git a/test_runner/regress/test_bad_connection.py b/test_runner/regress/test_bad_connection.py index c0c9537421..bfc5cb174e 100644 --- a/test_runner/regress/test_bad_connection.py +++ b/test_runner/regress/test_bad_connection.py @@ -7,6 +7,7 @@ import psycopg2.errors import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.utils import USE_LFC @pytest.mark.timeout(600) @@ -80,3 +81,193 @@ def test_compute_pageserver_connection_stress(neon_env_builder: NeonEnvBuilder): # do a graceful shutdown which would had caught the allowed_errors before # https://github.com/neondatabase/neon/pull/8632 env.pageserver.stop() + + +def test_compute_pageserver_hung_connections(neon_env_builder: NeonEnvBuilder): + """ + Test timeouts in waiting for response to pageserver request + """ + env = neon_env_builder.init_start() + env.pageserver.allowed_errors.append(".*slow GetPage.*") + pageserver_http = env.pageserver.http_client() + endpoint = env.endpoints.create_start( + "main", + tenant_id=env.initial_tenant, + config_lines=["autovacuum = off"], + ) + pg_conn = endpoint.connect() + cur = pg_conn.cursor() + + # Create table, and insert some rows. Make it big enough that it doesn't fit in + # shared_buffers, otherwise the SELECT after restart will just return answer + # from shared_buffers without hitting the page server, which defeats the point + # of this test. 
+ cur.execute("CREATE TABLE foo (t text)") + cur.execute( + """ + INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 100000) g + """ + ) + + # Verify that the table is larger than shared_buffers + cur.execute( + """ + select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_size + from pg_settings where name = 'shared_buffers' + """ + ) + row = cur.fetchone() + assert row is not None + log.debug(f"shared_buffers is {row[0]}, table size {row[1]}") + assert int(row[0]) < int(row[1]) + + # Print the backend PID so that it can be compared with the logs easily + cur.execute("SELECT pg_backend_pid()") + row = cur.fetchone() + assert row is not None + log.info(f"running test workload in backend PID {row[0]}") + + def run_workload(duration: float): + end_time = time.time() + duration + times_executed = 0 + while time.time() < end_time: + if random.random() < 0.5: + cur.execute("INSERT INTO foo VALUES ('stas'), ('heikki')") + else: + cur.execute("SELECT t FROM foo ORDER BY RANDOM() LIMIT 10") + cur.fetchall() + times_executed += 1 + log.info(f"Workload executed {times_executed} times") + assert times_executed > 0 + + ## Test short connection hiccups + ## + ## This is to exercise the logging timeout. + log.info("running workload with log timeout") + cur.execute("SET neon.pageserver_response_log_timeout = '500ms'") + pageserver_http.configure_failpoints(("before-pagestream-msg-flush", "10%3*return(3000)")) + run_workload(20) + + # check that the message was logged + assert endpoint.log_contains("no response received from pageserver for .* s, still waiting") + assert endpoint.log_contains("received response from pageserver after .* s") + + ## Test connections that are hung for longer + ## + ## This exercises the disconnect timeout. We'll disconnect and + ## reconnect after 500 ms. + log.info("running workload with disconnect timeout") + cur.execute("SET neon.pageserver_response_log_timeout = '250ms'") + cur.execute("SET neon.pageserver_response_disconnect_timeout = '500ms'") + pageserver_http.configure_failpoints(("before-pagestream-msg-flush", "10%3*return(3000)")) + run_workload(15) + + assert endpoint.log_contains("no response from pageserver for .* s, disconnecting") + + # do a graceful shutdown which would had caught the allowed_errors before + # https://github.com/neondatabase/neon/pull/8632 + env.pageserver.stop() + + +def test_compute_pageserver_statement_timeout(neon_env_builder: NeonEnvBuilder): + """ + Test statement_timeout while waiting for response to pageserver request + """ + env = neon_env_builder.init_start() + env.pageserver.allowed_errors.append(".*slow GetPage.*") + pageserver_http = env.pageserver.http_client() + + # Make sure the shared_buffers and LFC are tiny, to ensure the queries + # hit the storage. Disable autovacuum to make the test more deterministic. + config_lines = [ + "shared_buffers='512kB'", + "autovacuum = off", + ] + if USE_LFC: + config_lines = ["neon.max_file_cache_size = 1MB", "neon.file_cache_size_limit = 1MB"] + endpoint = env.endpoints.create_start( + "main", + tenant_id=env.initial_tenant, + config_lines=config_lines, + ) + pg_conn = endpoint.connect() + cur = pg_conn.cursor() + + # Disable parallel query. Parallel workers open their own pageserver connections, + # which messes up the test logic. + cur.execute("SET max_parallel_workers_per_gather=0") + cur.execute("SET effective_io_concurrency=0") + + # Create table, and insert some rows. 
Make it big enough that it doesn't fit in + # shared_buffers, otherwise the SELECT after restart will just return answer + # from shared_buffers without hitting the page server, which defeats the point + # of this test. + cur.execute("CREATE TABLE foo (t text)") + cur.execute( + """ + INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 100000) g + """ + ) + + # Verify that the table is larger than shared_buffers + cur.execute( + """ + select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_size + from pg_settings where name = 'shared_buffers' + """ + ) + row = cur.fetchone() + assert row is not None + log.debug(f"shared_buffers is {row[0]}, table size {row[1]}") + assert int(row[0]) < int(row[1]) + + ## Run a query until the compute->pageserver connection hits the failpoint and + ## get stuck. This tests that the statement_timeout is obeyed while waiting on a + ## GetPage request. + log.info("running workload with statement_timeout") + cur.execute("SET neon.pageserver_response_log_timeout = '2000ms'") + cur.execute("SET neon.pageserver_response_disconnect_timeout = '30000ms'") + cur.execute("SET statement_timeout='10s'") + pageserver_http.configure_failpoints(("before-pagestream-msg-flush", "10%return(60000)")) + + start_time = time.time() + with pytest.raises(psycopg2.errors.QueryCanceled): + cur.execute("SELECT count(*) FROM foo") + cur.fetchall() + log.info("Statement timeout reached") + end_time = time.time() + # Verify that the statement_timeout canceled the query before + # neon.pageserver_response_disconnect_timeout expired + assert end_time - start_time < 40 + times_canceled = 1 + + # Should not have disconnected yet + assert not endpoint.log_contains("no response from pageserver for .* s, disconnecting") + + # Clear the failpoint. This doesn't affect the connection that already hit it. It + # will keep waiting. But subsequent connections will work normally. + pageserver_http.configure_failpoints(("before-pagestream-msg-flush", "off")) + + # If we keep retrying, we should eventually succeed. (This tests that the + # neon.pageserver_response_disconnect_timeout is not reset on query + # cancellation.) 
+ while times_canceled < 10: + try: + cur.execute("SELECT count(*) FROM foo") + cur.fetchall() + log.info("Statement succeeded") + break + except psycopg2.errors.QueryCanceled: + log.info("Statement timed out, retrying") + times_canceled += 1 + assert times_canceled > 1 and times_canceled < 10 + + assert endpoint.log_contains("no response from pageserver for .* s, disconnecting") + + # do a graceful shutdown which would had caught the allowed_errors before + # https://github.com/neondatabase/neon/pull/8632 + env.pageserver.stop() diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index 124e62999a..d49686b57c 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -29,6 +29,8 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder): ".*failed to load metadata.*", ".*load failed.*load local timeline.*", ".*: layer load failed, assuming permanent failure:.*", + ".*failed to get checkpoint bytes.*", + ".*failed to get control bytes.*", ] ) @@ -75,7 +77,7 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder): # (We don't check layer file contents on startup, when loading the timeline) # # This will change when we implement checksums for layers - with pytest.raises(Exception, match="get_values_reconstruct_data for layer ") as err: + with pytest.raises(Exception, match="failed to get checkpoint bytes") as err: pg1.start() log.info( f"As expected, compute startup failed for timeline {tenant1}/{timeline1} with corrupt layers: {err}" diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index c091cd0869..0df88e14c2 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -319,8 +319,12 @@ def test_pageserver_gc_compaction_idempotent( }, ) wait_until(compaction_finished, timeout=60) + workload.validate(env.pageserver.id) + # Ensure all data are uploaded so that the duplicated layer gets into index_part.json + ps_http.timeline_checkpoint(tenant_id, timeline_id, wait_until_flushed=True) if compaction_mode == "after_restart": env.pageserver.restart(True) + workload.validate(env.pageserver.id) ps_http.timeline_gc( tenant_id, timeline_id, None ) # Force refresh gc info to have gc_cutoff generated @@ -335,6 +339,7 @@ def test_pageserver_gc_compaction_idempotent( "sub_compaction_max_job_size_mb": 16, }, ) + workload.validate(env.pageserver.id) wait_until(compaction_finished, timeout=60) # ensure gc_compaction is scheduled and it's actually running (instead of skipping due to no layers picked) @@ -466,6 +471,59 @@ def test_pageserver_gc_compaction_interrupt(neon_env_builder: NeonEnvBuilder): ps_http.timeline_gc(tenant_id, timeline_id, None) +@skip_in_debug_build("only run with release build") +def test_pageserver_gc_compaction_trigger(neon_env_builder: NeonEnvBuilder): + SMOKE_CONF = { + # Run both gc and gc-compaction. 
+ "gc_period": "5s", + "compaction_period": "5s", + # No PiTR interval and small GC horizon + "pitr_interval": "0s", + "gc_horizon": f"{1024 * 16}", + "lsn_lease_length": "0s", + "gc_compaction_enabled": "true", + "gc_compaction_initial_threshold_kb": "16", + "gc_compaction_ratio_percent": "50", + # Do not generate image layers with create_image_layers + "image_layer_creation_check_threshold": "100", + } + + env = neon_env_builder.init_start(initial_tenant_conf=SMOKE_CONF) + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + row_count = 10000 + churn_rounds = 20 + + ps_http = env.pageserver.http_client() + + workload = Workload(env, tenant_id, timeline_id) + workload.init(env.pageserver.id) + + log.info("Writing initial data ...") + workload.write_rows(row_count, env.pageserver.id) + + ps_http.timeline_gc( + tenant_id, timeline_id, None + ) # Force refresh gc info to have gc_cutoff generated + + def compaction_finished(): + queue_depth = len(ps_http.timeline_compact_info(tenant_id, timeline_id)) + assert queue_depth == 0 + + for i in range(1, churn_rounds + 1): + log.info(f"Running churn round {i}/{churn_rounds} ...") + workload.churn_rows(row_count, env.pageserver.id, upload=True) + wait_until(compaction_finished, timeout=60) + workload.validate(env.pageserver.id) + + # ensure gc_compaction is scheduled and it's actually running (instead of skipping due to no layers picked) + env.pageserver.assert_log_contains("gc_compact_timeline.*picked .* layers for compaction") + + log.info("Validating at workload end ...") + workload.validate(env.pageserver.id) + + # Stripe sizes in number of pages. TINY_STRIPES = 16 LARGE_STRIPES = 32768 diff --git a/test_runner/regress/test_compute_metrics.py b/test_runner/regress/test_compute_metrics.py index 99d41e410a..b360162dc1 100644 --- a/test_runner/regress/test_compute_metrics.py +++ b/test_runner/regress/test_compute_metrics.py @@ -501,19 +501,31 @@ def test_compute_installed_extensions_metric(neon_simple_env: NeonEnv): """ Test that the compute_installed_extensions properly reports accurate results. Important to note that currently this metric is only gathered on - compute start. + compute start. We install the neon extension into a database other than + postgres because compute_ctl will run `ALTER EXTENSION neon UPDATE` during + Postgres startup in the postgres database, creating a race condition. """ + DB_NAME = "test" + env = neon_simple_env endpoint = env.endpoints.create_start("main") + endpoint.safe_psql(f"CREATE DATABASE {DB_NAME}") + + # The metric is only gathered on compute start, so restart to check that + # plpgsql is now in 3 databases, instead of its regular 2, template1 and + # postgres. + endpoint.stop() + endpoint.start() client = endpoint.http_client() def __has_plpgsql(samples: list[Sample]) -> bool: """ - Check that plpgsql is installed in the template1 and postgres databases + Check that plpgsql is installed in the template1, postgres, and test + databases """ - return len(samples) == 1 and samples[0].value == 2 + return len(samples) == 1 and samples[0].value == 3 wait_until( collect_metric( @@ -525,8 +537,8 @@ def test_compute_installed_extensions_metric(neon_simple_env: NeonEnv): name="compute_installed_extensions", ) - # Install the neon extension, so we can check for it on the restart - endpoint.safe_psql("CREATE EXTENSION neon VERSION '1.0'") + # Install the neon extension, so we can check for it on the restart. 
+    endpoint.safe_psql("CREATE EXTENSION neon VERSION '1.0'", dbname=DB_NAME)

     # The metric is only gathered on compute start, so restart to check if the
     # neon extension will now be there.
diff --git a/test_runner/regress/test_lfc_prefetch.py b/test_runner/regress/test_lfc_prefetch.py
new file mode 100644
index 0000000000..dd422d996e
--- /dev/null
+++ b/test_runner/regress/test_lfc_prefetch.py
@@ -0,0 +1,101 @@
+from __future__ import annotations
+
+import time
+
+import pytest
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import NeonEnv
+from fixtures.utils import USE_LFC
+
+
+@pytest.mark.timeout(600)
+@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping")
+def test_lfc_prefetch(neon_simple_env: NeonEnv):
+    """
+    Test that storing prefetch results in the Local File Cache avoids redundant prefetch requests
+    """
+    env = neon_simple_env
+    endpoint = env.endpoints.create_start(
+        "main",
+        config_lines=[
+            "neon.max_file_cache_size=1GB",
+            "neon.file_cache_size_limit=1GB",
+            "effective_io_concurrency=100",
+            "shared_buffers=1MB",
+            "enable_bitmapscan=off",
+            "enable_seqscan=off",
+            "autovacuum=off",
+        ],
+    )
+    conn = endpoint.connect()
+    cur = conn.cursor()
+    cur.execute("create extension neon")
+    cur.execute("create table t(pk integer, sk integer, filler text default repeat('x',200))")
+    cur.execute("set statement_timeout=0")
+    cur.execute("select setseed(0.5)")
+    cur.execute("insert into t values (generate_series(1,1000000),random()*1000000)")
+    cur.execute("create index on t(sk)")
+    cur.execute("vacuum t")
+
+    # reset LFC
+    cur.execute("alter system set neon.file_cache_size_limit=0")
+    cur.execute("select pg_reload_conf()")
+    time.sleep(1)
+    cur.execute("alter system set neon.file_cache_size_limit='1GB'")
+    cur.execute("select pg_reload_conf()")
+
+    cur.execute(
+        "explain (analyze,prefetch,format json) select sum(pk) from (select pk from t where sk between 100000 and 200000 limit 100) s1"
+    )
+    prefetch_expired = cur.fetchall()[0][0][0]["Plan"]["Prefetch Expired Requests"]
+    log.info(f"Unused prefetches: {prefetch_expired}")
+
+    cur.execute(
+        "explain (analyze,prefetch,format json) select sum(pk) from (select pk from t where sk between 200000 and 300000 limit 100) s2"
+    )
+    prefetch_expired = cur.fetchall()[0][0][0]["Plan"]["Prefetch Expired Requests"]
+    log.info(f"Unused prefetches: {prefetch_expired}")
+
+    cur.execute(
+        "explain (analyze,prefetch,format json) select sum(pk) from (select pk from t where sk between 300000 and 400000 limit 100) s3"
+    )
+    prefetch_expired = cur.fetchall()[0][0][0]["Plan"]["Prefetch Expired Requests"]
+    log.info(f"Unused prefetches: {prefetch_expired}")
+
+    cur.execute(
+        "explain (analyze,prefetch,format json) select sum(pk) from (select pk from t where sk between 100000 and 200000 limit 100) s4"
+    )
+    prefetch_expired = cur.fetchall()[0][0][0]["Plan"]["Prefetch Expired Requests"]
+    log.info(f"Unused prefetches: {prefetch_expired}")
+
+    # If prefetch results are not stored in the LFC, we keep sending unused prefetch requests to the pageserver
+    assert prefetch_expired > 0
+
+    cur.execute("set neon.store_prefetch_result_in_lfc=on")
+
+    cur.execute(
+        "explain (analyze,prefetch,format json) select sum(pk) from (select pk from t where sk between 500000 and 600000 limit 100) s5"
+    )
+    prefetch_expired = cur.fetchall()[0][0][0]["Plan"]["Prefetch Expired Requests"]
+    log.info(f"Unused prefetches: {prefetch_expired}")
+
+    cur.execute(
+        "explain (analyze,prefetch,format json) select sum(pk) from (select pk from t where sk between 600000 and 700000 limit 100) s6"
+    )
+    prefetch_expired = cur.fetchall()[0][0][0]["Plan"]["Prefetch Expired Requests"]
+    log.info(f"Unused prefetches: {prefetch_expired}")
+
+    cur.execute(
+        "explain (analyze,prefetch,format json) select sum(pk) from (select pk from t where sk between 700000 and 800000 limit 100) s7"
+    )
+    prefetch_expired = cur.fetchall()[0][0][0]["Plan"]["Prefetch Expired Requests"]
+    log.info(f"Unused prefetches: {prefetch_expired}")
+
+    cur.execute(
+        "explain (analyze,prefetch,format json) select sum(pk) from (select pk from t where sk between 500000 and 600000 limit 100) s8"
+    )
+    prefetch_expired = cur.fetchall()[0][0][0]["Plan"]["Prefetch Expired Requests"]
+    log.info(f"Unused prefetches: {prefetch_expired}")
+
+    # No redundant prefetch requests if prefetch results are stored in the LFC
+    assert prefetch_expired == 0
diff --git a/test_runner/regress/test_normal_work.py b/test_runner/regress/test_normal_work.py
index ae2d171058..c8458b963e 100644
--- a/test_runner/regress/test_normal_work.py
+++ b/test_runner/regress/test_normal_work.py
@@ -6,9 +6,14 @@ from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder
 from fixtures.pageserver.http import PageserverHttpClient


-def check_tenant(env: NeonEnv, pageserver_http: PageserverHttpClient):
+def check_tenant(
+    env: NeonEnv, pageserver_http: PageserverHttpClient, safekeeper_proto_version: int
+):
     tenant_id, timeline_id = env.create_tenant()
-    endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
+    config_lines = [
+        f"neon.safekeeper_proto_version = {safekeeper_proto_version}",
+    ]
+    endpoint = env.endpoints.create_start("main", tenant_id=tenant_id, config_lines=config_lines)
     # we rely upon autocommit after each statement
     res_1 = endpoint.safe_psql_many(
         queries=[
@@ -33,7 +38,14 @@ def check_tenant(env: NeonEnv, pageserver_http: PageserverHttpClient):


 @pytest.mark.parametrize("num_timelines,num_safekeepers", [(3, 1)])
-def test_normal_work(neon_env_builder: NeonEnvBuilder, num_timelines: int, num_safekeepers: int):
+# Test both proto versions until we fully migrate.
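+# The proto version is plumbed to each endpoint via the neon.safekeeper_proto_version GUC (see check_tenant).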
+@pytest.mark.parametrize("safekeeper_proto_version", [2, 3]) +def test_normal_work( + neon_env_builder: NeonEnvBuilder, + num_timelines: int, + num_safekeepers: int, + safekeeper_proto_version: int, +): """ Basic test: * create new tenant with a timeline @@ -52,4 +64,4 @@ def test_normal_work(neon_env_builder: NeonEnvBuilder, num_timelines: int, num_s pageserver_http = env.pageserver.http_client() for _ in range(num_timelines): - check_tenant(env, pageserver_http) + check_tenant(env, pageserver_http, safekeeper_proto_version) diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 602d493ae6..a9b897b741 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -8,9 +8,10 @@ from pathlib import Path from typing import TYPE_CHECKING import pytest -from fixtures.common_types import TenantId, TenantShardId, TimelineId +from fixtures.common_types import TenantId, TenantShardId, TimelineArchivalState, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( + DEFAULT_BRANCH_NAME, NeonEnvBuilder, NeonPageserver, StorageControllerMigrationConfig, @@ -927,8 +928,12 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder): workload.write_rows(128, upload=True) workload.write_rows(128, upload=True) workload.write_rows(128, upload=True) + + child_timeline_id = env.create_branch( + "foo", tenant_id, ancestor_branch_name=DEFAULT_BRANCH_NAME + ) + workload.write_rows(128, upload=True) - workload.stop() # Expect lots of layers assert len(ps_attached.list_layers(tenant_id, timeline_id)) > 10 @@ -937,9 +942,19 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder): for ps in env.pageservers: ps.http_client().configure_failpoints([("secondary-layer-download-sleep", "return(1000)")]) + def timeline_heatmap(tlid): + assert env.pageserver_remote_storage is not None + + heatmap = env.pageserver_remote_storage.heatmap_content(tenant_id) + for htl in heatmap["timelines"]: + if htl["timeline_id"] == str(tlid): + return htl + + raise RuntimeError(f"No heatmap for timeline: {tlid}") + # Upload a heatmap, so that secondaries have something to download ps_attached.http_client().tenant_heatmap_upload(tenant_id) - heatmap_before_migration = env.pageserver_remote_storage.heatmap_content(tenant_id) + heatmap_before_migration = timeline_heatmap(timeline_id) # This has no chance to succeed: we have lots of layers and each one takes at least 1000ms. # However, it pulls the heatmap, which will be important later. 
@@ -971,17 +986,12 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder): assert env.storage_controller.locate(tenant_id)[0]["node_id"] == ps_secondary.id ps_secondary.http_client().tenant_heatmap_upload(tenant_id) - heatmap_after_migration = env.pageserver_remote_storage.heatmap_content(tenant_id) + heatmap_after_migration = timeline_heatmap(timeline_id) - assert len(heatmap_before_migration["timelines"][0]["layers"]) > 0 + assert len(heatmap_before_migration["layers"]) > 0 - # The new layer map should contain all the layers in the pre-migration one - # and a new in memory layer - after_migration_heatmap_layers_count = len(heatmap_after_migration["timelines"][0]["layers"]) - assert ( - len(heatmap_before_migration["timelines"][0]["layers"]) + 1 - == after_migration_heatmap_layers_count - ) + after_migration_heatmap_layers_count = len(heatmap_after_migration["layers"]) + assert len(heatmap_before_migration["layers"]) <= after_migration_heatmap_layers_count log.info(f"Heatmap size after cold migration is {after_migration_heatmap_layers_count}") @@ -989,10 +999,71 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder): TenantShardId(tenant_id, shard_number=0, shard_count=0), timeline_id ) - def all_layers_downloaded(): + # Now simulate the case where a child timeline is archived, parent layers + # are evicted and the child is unarchived. When the child is unarchived, + # itself and the parent update their heatmaps to contain layers needed by the + # child. One can warm up the timeline hierarchy since the heatmaps are ready. + + def all_layers_downloaded(expected_layer_count: int): local_layers_count = len(ps_secondary.list_layers(tenant_id, timeline_id)) log.info(f"{local_layers_count=} {after_migration_heatmap_layers_count=}") - assert local_layers_count == after_migration_heatmap_layers_count + assert local_layers_count >= expected_layer_count - wait_until(all_layers_downloaded) + wait_until(lambda: all_layers_downloaded(after_migration_heatmap_layers_count)) + ps_secondary.http_client().tenant_heatmap_upload(tenant_id) + + before = ( + ps_secondary.http_client() + .get_metrics() + .query_one("pageserver_remote_ondemand_downloaded_layers_total") + .value + ) + workload.validate() + after = ( + ps_secondary.http_client() + .get_metrics() + .query_one("pageserver_remote_ondemand_downloaded_layers_total") + .value + ) + + workload.stop() + assert before == after + + def check_archival_state(state: TimelineArchivalState, tline): + timelines = ( + timeline["timeline_id"] + for timeline in ps_secondary.http_client().timeline_list(tenant_id=tenant_id) + ) + + if state == TimelineArchivalState.ARCHIVED: + assert str(tline) not in timelines + elif state == TimelineArchivalState.UNARCHIVED: + assert str(tline) in timelines + + ps_secondary.http_client().timeline_archival_config( + tenant_id, child_timeline_id, TimelineArchivalState.ARCHIVED + ) + ps_secondary.http_client().timeline_offload(tenant_id, child_timeline_id) + wait_until(lambda: check_archival_state(TimelineArchivalState.ARCHIVED, child_timeline_id)) + + ps_secondary.http_client().evict_all_layers(tenant_id, timeline_id) + ps_secondary.http_client().tenant_heatmap_upload(tenant_id) + assert len(timeline_heatmap(timeline_id)["layers"]) == 0 + + ps_secondary.http_client().timeline_archival_config( + tenant_id, child_timeline_id, TimelineArchivalState.UNARCHIVED + ) + wait_until(lambda: check_archival_state(TimelineArchivalState.UNARCHIVED, child_timeline_id)) + + 
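+    # Re-upload the heatmap: after unarchiving, the parent and child heatmaps
+    # should again include the layers the child needs, so the hierarchy can be warmed up.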
ps_secondary.http_client().tenant_heatmap_upload(tenant_id) + log.info(f"Parent timeline heatmap size: {len(timeline_heatmap(timeline_id)['layers'])}") + log.info(f"Child timeline heatmap size: {len(timeline_heatmap(child_timeline_id)['layers'])}") + + expected_locally = len(timeline_heatmap(timeline_id)["layers"]) + assert expected_locally > 0 + + env.storage_controller.download_heatmap_layers( + TenantShardId(tenant_id, shard_number=0, shard_count=0), timeline_id + ) + wait_until(lambda: all_layers_downloaded(expected_locally)) diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index 411888efbc..6a76ad5ca8 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -332,8 +332,10 @@ def test_sql_regress( @skip_in_debug_build("only run with release build") +@pytest.mark.parametrize("reldir_type", ["v1", "v2"]) def test_tx_abort_with_many_relations( neon_env_builder: NeonEnvBuilder, + reldir_type: str, ): """ This is not a pg_regress test as such, but perhaps it should be -- this test exercises postgres @@ -342,7 +344,11 @@ def test_tx_abort_with_many_relations( Reproducer for https://github.com/neondatabase/neon/issues/9505 """ - env = neon_env_builder.init_start() + env = neon_env_builder.init_start( + initial_tenant_conf={ + "rel_size_v2_enabled": "true" if reldir_type == "v2" else "false", + } + ) ep = env.endpoints.create_start( "main", tenant_id=env.initial_tenant, @@ -354,48 +360,65 @@ def test_tx_abort_with_many_relations( # How many relations: this number is tuned to be long enough to take tens of seconds # if the rollback code path is buggy, tripping the test's timeout. - n = 4000 + if reldir_type == "v1": + n = 4000 + step = 4000 + else: + n = 20000 + step = 5000 def create(): # Create many relations log.info(f"Creating {n} relations...") - ep.safe_psql_many( - [ - "BEGIN", - f"""DO $$ - DECLARE - i INT; - table_name TEXT; - BEGIN - FOR i IN 1..{n} LOOP - table_name := 'table_' || i; - EXECUTE 'CREATE TABLE IF NOT EXISTS ' || table_name || ' (id SERIAL PRIMARY KEY, data TEXT)'; - END LOOP; - END $$; - """, - "COMMIT", - ] - ) + begin = 0 + while True: + end = begin + step + ep.safe_psql_many( + [ + "BEGIN", + f"""DO $$ + DECLARE + i INT; + table_name TEXT; + BEGIN + FOR i IN {begin}..{end} LOOP + table_name := 'table_' || i; + EXECUTE 'CREATE TABLE IF NOT EXISTS ' || table_name || ' (id SERIAL PRIMARY KEY, data TEXT)'; + END LOOP; + END $$; + """, + "COMMIT", + ] + ) + begin = end + if begin >= n: + break def truncate(): # Truncate relations, then roll back the transaction containing the truncations log.info(f"Truncating {n} relations...") - ep.safe_psql_many( - [ - "BEGIN", - f"""DO $$ - DECLARE - i INT; - table_name TEXT; - BEGIN - FOR i IN 1..{n} LOOP - table_name := 'table_' || i; - EXECUTE 'TRUNCATE ' || table_name ; - END LOOP; - END $$; - """, - ] - ) + begin = 0 + while True: + end = begin + step + ep.safe_psql_many( + [ + "BEGIN", + f"""DO $$ + DECLARE + i INT; + table_name TEXT; + BEGIN + FOR i IN {begin}..{end} LOOP + table_name := 'table_' || i; + EXECUTE 'TRUNCATE ' || table_name ; + END LOOP; + END $$; + """, + ] + ) + begin = end + if begin >= n: + break def rollback_and_wait(): log.info(f"Rolling back after truncating {n} relations...") diff --git a/test_runner/regress/test_pgstat.py b/test_runner/regress/test_pgstat.py new file mode 100644 index 0000000000..bf9b982e14 --- /dev/null +++ b/test_runner/regress/test_pgstat.py @@ -0,0 +1,83 @@ +import pytest +from fixtures.neon_fixtures 
import NeonEnv
+from fixtures.pg_version import PgVersion
+
+
+#
+# Test that pgstat statistics are preserved across endpoint restarts
+#
+def test_pgstat(neon_simple_env: NeonEnv):
+    env = neon_simple_env
+    if env.pg_version == PgVersion.V14:
+        pytest.skip("PG14 doesn't support pgstat statistics persistence")
+
+    n = 10000
+    endpoint = env.endpoints.create_start(
+        "main", config_lines=["neon.pgstat_file_size_limit=100kB", "autovacuum=off"]
+    )
+
+    con = endpoint.connect()
+    cur = con.cursor()
+
+    cur.execute("create table t(x integer)")
+    cur.execute(f"insert into t values (generate_series(1,{n}))")
+    cur.execute("vacuum analyze t")
+    cur.execute("select sum(x) from t")
+    cur.execute("update t set x=x+1")
+
+    cur.execute("select pg_stat_force_next_flush()")
+
+    cur.execute(
+        "select seq_scan,seq_tup_read,n_tup_ins,n_tup_upd,n_live_tup,n_dead_tup, vacuum_count,analyze_count from pg_stat_user_tables"
+    )
+    rec = cur.fetchall()[0]
+    assert rec == (2, n * 2, n, n, n * 2, n, 1, 1)
+
+    endpoint.stop()
+    endpoint.start()
+
+    con = endpoint.connect()
+    cur = con.cursor()
+
+    cur.execute(
+        "select seq_scan,seq_tup_read,n_tup_ins,n_tup_upd,n_live_tup,n_dead_tup, vacuum_count,analyze_count from pg_stat_user_tables"
+    )
+    rec = cur.fetchall()[0]
+    assert rec == (2, n * 2, n, n, n * 2, n, 1, 1)
+
+    cur.execute("update t set x=x+1")
+
+    # stop without checkpoint
+    endpoint.stop(mode="immediate")
+    endpoint.start()
+
+    con = endpoint.connect()
+    cur = con.cursor()
+
+    cur.execute(
+        "select seq_scan,seq_tup_read,n_tup_ins,n_tup_upd,n_live_tup,n_dead_tup, vacuum_count,analyze_count from pg_stat_user_tables"
+    )
+    rec = cur.fetchall()[0]
+    # pgstat information should be discarded in case of abnormal termination
+    assert rec == (0, 0, 0, 0, 0, 0, 0, 0)
+
+    cur.execute("select sum(x) from t")
+
+    # create more relations to increase the size of the statistics
+    for i in range(1, 1000):
+        cur.execute(f"create table t{i}(pk integer primary key)")
+
+    cur.execute("select pg_stat_force_next_flush()")
+
+    endpoint.stop()
+    endpoint.start()
+
+    con = endpoint.connect()
+    cur = con.cursor()
+
+    cur.execute(
+        "select seq_scan,seq_tup_read,n_tup_ins,n_tup_upd,n_live_tup,n_dead_tup, vacuum_count,analyze_count from pg_stat_user_tables"
+    )
+    rec = cur.fetchall()[0]
+    # pgstat information is not restored because its size exceeds the 100kB threshold
+    assert rec == (0, 0, 0, 0, 0, 0, 0, 0)
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index 1d95312140..d5acc257b2 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -182,6 +182,13 @@ def test_storage_controller_smoke(neon_env_builder: NeonEnvBuilder, combination)
         time.sleep(1)
     assert get_node_shard_counts(env, tenant_ids)[env.pageservers[0].id] == 0

+    # Exercise live migration of a tenant back to the original pageserver
+    migrate_tenant = env.pageservers[1].http_client().tenant_list_locations()["tenant_shards"][0][0]
+    env.storage_controller.tenant_shard_migrate(
+        TenantShardId.parse(migrate_tenant), env.pageservers[0].id
+    )
+    assert get_node_shard_counts(env, tenant_ids)[env.pageservers[0].id] == 1
+
     # Restarting a pageserver should not detach any tenants (i.e.
/re-attach works) before_restart = env.pageservers[1].http_client().tenant_list_locations() env.pageservers[1].stop() @@ -2139,8 +2146,9 @@ def test_tenant_import(neon_env_builder: NeonEnvBuilder, shard_count, remote_sto workload.validate() +@pytest.mark.parametrize(**fixtures.utils.allpairs_versions()) @pytest.mark.parametrize("num_azs", [1, 2]) -def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder, num_azs: int): +def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder, num_azs: int, combination): """ Graceful reststart of storage controller clusters use the drain and fill hooks in order to migrate attachments away from pageservers before @@ -3764,3 +3772,96 @@ def test_storage_controller_node_flap_detach_race( assert len(locs) == 1, f"{shard} has {len(locs)} attached locations" wait_until(validate_locations, timeout=10) + + +def test_update_node_on_registration(neon_env_builder: NeonEnvBuilder): + """ + Check that storage controller handles node_register requests with updated fields correctly. + 1. Run storage controller and register 1 pageserver without https port. + 2. Register the same pageserver with https port. Check that port has been updated. + 3. Restart the storage controller. Check that https port is persistent. + 4. Register the same pageserver without https port again (rollback). Check that port has been removed. + """ + neon_env_builder.num_pageservers = 1 + env = neon_env_builder.init_configs() + + env.storage_controller.start() + env.storage_controller.wait_until_ready() + + pageserver = env.pageservers[0] + + # Step 1. Register pageserver without https port. + env.storage_controller.node_register(pageserver) + env.storage_controller.consistency_check() + + nodes = env.storage_controller.node_list() + assert len(nodes) == 1 + assert nodes[0]["listen_https_port"] is None + + # Step 2. Register pageserver with https port. + pageserver.service_port.https = 1234 + env.storage_controller.node_register(pageserver) + env.storage_controller.consistency_check() + + nodes = env.storage_controller.node_list() + assert len(nodes) == 1 + assert nodes[0]["listen_https_port"] == 1234 + + # Step 3. Restart storage controller. + env.storage_controller.stop() + env.storage_controller.start() + env.storage_controller.wait_until_ready() + env.storage_controller.consistency_check() + + nodes = env.storage_controller.node_list() + assert len(nodes) == 1 + assert nodes[0]["listen_https_port"] == 1234 + + # Step 4. Register pageserver with no https port again. + pageserver.service_port.https = None + env.storage_controller.node_register(pageserver) + env.storage_controller.consistency_check() + + nodes = env.storage_controller.node_list() + assert len(nodes) == 1 + assert nodes[0]["listen_https_port"] is None + + +def test_storage_controller_location_conf_equivalence(neon_env_builder: NeonEnvBuilder): + """ + Validate that a storage controller restart with no shards in a transient state + performs zero reconciliations at start-up. Implicitly, this means that the location + configs returned by the pageserver are identical to the persisted state in the + storage controller database. 
+ """ + neon_env_builder.num_pageservers = 1 + neon_env_builder.storage_controller_config = { + "start_as_candidate": False, + } + + env = neon_env_builder.init_configs() + env.start() + + tenant_id = TenantId.generate() + env.storage_controller.tenant_create( + tenant_id, shard_count=2, tenant_config={"pitr_interval": "1h2m3s"} + ) + + env.storage_controller.reconcile_until_idle() + + reconciles_before_restart = env.storage_controller.get_metric_value( + "storage_controller_reconcile_complete_total", filter={"status": "ok"} + ) + + assert reconciles_before_restart != 0 + + env.storage_controller.stop() + env.storage_controller.start() + + env.storage_controller.reconcile_until_idle() + + reconciles_after_restart = env.storage_controller.get_metric_value( + "storage_controller_reconcile_complete_total", filter={"status": "ok"} + ) + + assert reconciles_after_restart == 0 diff --git a/test_runner/regress/test_subscriber_branching.py b/test_runner/regress/test_subscriber_branching.py index 849d4f024d..6175643389 100644 --- a/test_runner/regress/test_subscriber_branching.py +++ b/test_runner/regress/test_subscriber_branching.py @@ -1,9 +1,10 @@ from __future__ import annotations +import threading import time from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv +from fixtures.neon_fixtures import NeonEnv, logical_replication_sync from fixtures.utils import query_scalar, wait_until @@ -239,3 +240,173 @@ def test_subscriber_branching(neon_simple_env: NeonEnv): res = scur_postgres.fetchall() assert len(res) == 1 assert str(sub_child_2_timeline_id) == res[0][0] + + +def test_multiple_subscription_branching(neon_simple_env: NeonEnv): + """ + Test that compute_ctl can handle concurrent deletion of subscriptions in a multiple databases + """ + env = neon_simple_env + + NUMBER_OF_DBS = 5 + + # Create and start endpoint so that neon_local put all the generated + # stuff into the spec.json file. + endpoint = env.endpoints.create_start( + "main", + config_lines=[ + "max_replication_slots = 10", + "max_logical_replication_workers=10", + "max_worker_processes=10", + ], + ) + + TEST_DB_NAMES = [ + { + "name": "neondb", + "owner": "cloud_admin", + }, + { + "name": "publisher_db", + "owner": "cloud_admin", + }, + ] + + for i in range(NUMBER_OF_DBS): + TEST_DB_NAMES.append( + { + "name": f"db{i}", + "owner": "cloud_admin", + } + ) + + # Update the spec.json file to create the databases + # and reconfigure the endpoint to apply the changes. 
+ endpoint.respec_deep( + **{ + "skip_pg_catalog_updates": False, + "cluster": { + "databases": TEST_DB_NAMES, + }, + } + ) + endpoint.reconfigure() + + connstr = endpoint.connstr(dbname="publisher_db").replace("'", "''") + + # create table, replication and subscription for each of the databases + with endpoint.cursor(dbname="publisher_db") as publisher_cursor: + for i in range(NUMBER_OF_DBS): + publisher_cursor.execute(f"CREATE TABLE t{i}(a int)") + publisher_cursor.execute(f"CREATE PUBLICATION mypub{i} FOR TABLE t{i}") + publisher_cursor.execute( + f"select pg_catalog.pg_create_logical_replication_slot('mysub{i}', 'pgoutput');" + ) + publisher_cursor.execute(f"INSERT INTO t{i} VALUES ({i})") + + with endpoint.cursor(dbname=f"db{i}") as cursor: + cursor.execute(f"CREATE TABLE t{i}(a int)") + cursor.execute( + f"CREATE SUBSCRIPTION mysub{i} CONNECTION '{connstr}' PUBLICATION mypub{i} WITH (create_slot = false) " + ) + + # wait for the subscription to be active + for i in range(NUMBER_OF_DBS): + logical_replication_sync( + endpoint, + endpoint, + f"mysub{i}", + sub_dbname=f"db{i}", + pub_dbname="publisher_db", + ) + + # Check that replication is working + for i in range(NUMBER_OF_DBS): + with endpoint.cursor(dbname=f"db{i}") as cursor: + cursor.execute(f"SELECT * FROM t{i}") + rows = cursor.fetchall() + assert len(rows) == 1 + assert rows[0][0] == i + + last_insert_lsn = query_scalar(cursor, "select pg_current_wal_insert_lsn();") + + def start_publisher_workload(table_num: int, duration: int): + start = time.time() + with endpoint.cursor(dbname="publisher_db") as cur: + while time.time() - start < duration: + cur.execute(f"INSERT INTO t{i} SELECT FROM generate_series(1,1000)") + + LOAD_DURATION = 5 + threads = [ + threading.Thread(target=start_publisher_workload, args=(i, LOAD_DURATION)) + for i in range(NUMBER_OF_DBS) + ] + + for thread in threads: + thread.start() + + sub_child_1_timeline_id = env.create_branch( + "subscriber_child_1", + ancestor_branch_name="main", + ancestor_start_lsn=last_insert_lsn, + ) + + sub_child_1 = env.endpoints.create("subscriber_child_1") + + sub_child_1.respec( + skip_pg_catalog_updates=False, + reconfigure_concurrency=5, + drop_subscriptions_before_start=True, + cluster={ + "databases": TEST_DB_NAMES, + "roles": [], + }, + ) + + sub_child_1.start() + + # ensure that subscription deletion happened on this timeline + with sub_child_1.cursor() as scur_postgres: + scur_postgres.execute("SELECT timeline_id from neon.drop_subscriptions_done") + res = scur_postgres.fetchall() + log.info(f"res = {res}") + assert len(res) == 1 + assert str(sub_child_1_timeline_id) == res[0][0] + + # ensure that there are no subscriptions in the databases + for i in range(NUMBER_OF_DBS): + with sub_child_1.cursor(dbname=f"db{i}") as cursor: + cursor.execute("SELECT * FROM pg_catalog.pg_subscription") + res = cursor.fetchall() + assert len(res) == 0 + + # ensure that there are no unexpected rows in the tables + cursor.execute(f"SELECT * FROM t{i}") + rows = cursor.fetchall() + assert len(rows) == 1 + assert rows[0][0] == i + + for thread in threads: + thread.join() + + # ensure that logical replication is still working in main endpoint + # wait for it to catch up + for i in range(NUMBER_OF_DBS): + logical_replication_sync( + endpoint, + endpoint, + f"mysub{i}", + sub_dbname=f"db{i}", + pub_dbname="publisher_db", + ) + + # verify that the data is the same in publisher and subscriber tables + with endpoint.cursor(dbname="publisher_db") as publisher_cursor: + for i in 
range(NUMBER_OF_DBS): + with endpoint.cursor(dbname=f"db{i}") as cursor: + publisher_cursor.execute(f"SELECT count(*) FROM t{i}") + cursor.execute(f"SELECT count(*) FROM t{i}") + pub_res = publisher_cursor.fetchone() + sub_res = cursor.fetchone() + log.info(f"for table t{i}: pub_res = {pub_res}, sub_res = {sub_res}") + assert pub_res == sub_res diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py index 2706ddf2f0..c17840d31c 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -823,6 +823,8 @@ def test_timeline_retain_lsn( [ ".*initial size calculation failed: PageRead.MissingKey.could not find data for key.*", ".*page_service_conn_main.*could not find data for key.*", + ".*failed to get checkpoint bytes.*", + ".*failed to get control bytes.*", ] ) if offload_child is None or "no-restart" not in offload_child: diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index c5045fe4a4..0a05189bfb 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -811,60 +811,6 @@ class ProposerPostgres(PgProtocol): self.pg_bin.run(args) -# insert wal in all safekeepers and run sync on proposer -def test_sync_safekeepers( - neon_env_builder: NeonEnvBuilder, - pg_bin: PgBin, - port_distributor: PortDistributor, -): - # We don't really need the full environment for this test, just the - # safekeepers would be enough. - neon_env_builder.num_safekeepers = 3 - env = neon_env_builder.init_start() - - tenant_id = TenantId.generate() - timeline_id = TimelineId.generate() - - # write config for proposer - pgdata_dir = os.path.join(env.repo_dir, "proposer_pgdata") - pg = ProposerPostgres( - pgdata_dir, pg_bin, tenant_id, timeline_id, "127.0.0.1", port_distributor.get_port() - ) - pg.create_dir_config(env.get_safekeeper_connstrs()) - - # valid lsn, which is not in the segment start, nor in zero segment - epoch_start_lsn = Lsn("0/16B9188") - begin_lsn = epoch_start_lsn - - # append and commit WAL - lsn_after_append = [] - for i in range(3): - res = env.safekeepers[i].append_logical_message( - tenant_id, - timeline_id, - { - "lm_prefix": "prefix", - "lm_message": "message", - "set_commit_lsn": True, - "send_proposer_elected": True, - "term": 2, - "begin_lsn": int(begin_lsn), - "epoch_start_lsn": int(epoch_start_lsn), - "truncate_lsn": int(epoch_start_lsn), - "pg_version": int(env.pg_version) * 10000, - }, - ) - lsn = Lsn(res["inserted_wal"]["end_lsn"]) - lsn_after_append.append(lsn) - log.info(f"safekeeper[{i}] lsn after append: {lsn}") - - # run sync safekeepers - lsn_after_sync = pg.sync_safekeepers() - log.info(f"lsn after sync = {lsn_after_sync}") - - assert all(lsn_after_sync == lsn for lsn in lsn_after_append) - - @pytest.mark.parametrize("auth_enabled", [False, True]) def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): neon_env_builder.auth_enabled = auth_enabled @@ -1740,7 +1686,7 @@ def test_replace_safekeeper(neon_env_builder: NeonEnvBuilder): @pytest.mark.parametrize("auth_enabled", [False, True]) -def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): +def test_delete(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): neon_env_builder.auth_enabled = auth_enabled env = neon_env_builder.init_start() @@ -2269,13 +2215,21 @@ def test_membership_api(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 1 env = neon_env_builder.init_start() + # 
These are expected after timeline deletion on safekeepers. + env.pageserver.allowed_errors.extend( + [ + ".*Timeline .* was not found in global map.*", + ".*Timeline .* was cancelled and cannot be used anymore.*", + ] + ) + tenant_id = env.initial_tenant timeline_id = env.initial_timeline sk = env.safekeepers[0] http_cli = sk.http_client() - sk_id_1 = SafekeeperId(env.safekeepers[0].id, "localhost", sk.port.pg_tenant_only) + sk_id_1 = SafekeeperId(sk.id, "localhost", sk.port.pg_tenant_only) sk_id_2 = SafekeeperId(11, "localhost", 5434) # just a mock # Request to switch before timeline creation should fail. @@ -2303,19 +2257,28 @@ def test_membership_api(neon_env_builder: NeonEnvBuilder): log.info(f"conf after restart: {after_restart}") assert after_restart.generation == 4 - # Switch into disjoint conf. - non_joint = Configuration(generation=5, members=[sk_id_2], new_members=None) + # Switch into non joint conf of which sk is not a member, must fail. + non_joint_not_member = Configuration(generation=5, members=[sk_id_2], new_members=None) + with pytest.raises(requests.exceptions.HTTPError): + resp = http_cli.membership_switch(tenant_id, timeline_id, non_joint_not_member) + + # Switch into good non joint conf. + non_joint = Configuration(generation=6, members=[sk_id_1], new_members=None) resp = http_cli.membership_switch(tenant_id, timeline_id, non_joint) log.info(f"non joint switch resp: {resp}") assert resp.previous_conf.generation == 4 - assert resp.current_conf.generation == 5 + assert resp.current_conf.generation == 6 - # Switch request to lower conf should be ignored. - lower_conf = Configuration(generation=3, members=[], new_members=None) - resp = http_cli.membership_switch(tenant_id, timeline_id, lower_conf) - log.info(f"lower switch resp: {resp}") - assert resp.previous_conf.generation == 5 - assert resp.current_conf.generation == 5 + # Switch request to lower conf should be rejected. + lower_conf = Configuration(generation=3, members=[sk_id_1], new_members=None) + with pytest.raises(requests.exceptions.HTTPError): + http_cli.membership_switch(tenant_id, timeline_id, lower_conf) + + # Now, exclude sk from the membership, timeline should be deleted. 
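+    # Once excluded, the safekeeper deletes its copy of the timeline, so subsequent
+    # timeline_status calls against it are expected to fail.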
+ excluded_conf = Configuration(generation=7, members=[sk_id_2], new_members=None) + http_cli.timeline_exclude(tenant_id, timeline_id, excluded_conf) + with pytest.raises(requests.exceptions.HTTPError): + http_cli.timeline_status(tenant_id, timeline_id) # In this test we check for excessive START_REPLICATION and START_WAL_PUSH queries diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py index 936c774657..56539a0a08 100644 --- a/test_runner/regress/test_wal_acceptor_async.py +++ b/test_runner/regress/test_wal_acceptor_async.py @@ -539,13 +539,16 @@ def test_recovery_uncommitted(neon_env_builder: NeonEnvBuilder): asyncio.run(run_recovery_uncommitted(env)) -async def run_wal_truncation(env: NeonEnv): +async def run_wal_truncation(env: NeonEnv, safekeeper_proto_version: int): tenant_id = env.initial_tenant timeline_id = env.initial_timeline (sk1, sk2, sk3) = env.safekeepers - ep = env.endpoints.create_start("main") + config_lines = [ + f"neon.safekeeper_proto_version = {safekeeper_proto_version}", + ] + ep = env.endpoints.create_start("main", config_lines=config_lines) ep.safe_psql("create table t (key int, value text)") ep.safe_psql("insert into t select generate_series(1, 100), 'payload'") @@ -572,6 +575,7 @@ async def run_wal_truncation(env: NeonEnv): sk2.start() ep = env.endpoints.create_start( "main", + config_lines=config_lines, ) ep.safe_psql("insert into t select generate_series(1, 200), 'payload'") @@ -590,11 +594,13 @@ async def run_wal_truncation(env: NeonEnv): # Simple deterministic test creating tail of WAL on safekeeper which is # truncated when majority without this sk elects walproposer starting earlier. -def test_wal_truncation(neon_env_builder: NeonEnvBuilder): +# Test both proto versions until we fully migrate. +@pytest.mark.parametrize("safekeeper_proto_version", [2, 3]) +def test_wal_truncation(neon_env_builder: NeonEnvBuilder, safekeeper_proto_version: int): neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() - asyncio.run(run_wal_truncation(env)) + asyncio.run(run_wal_truncation(env, safekeeper_proto_version)) async def run_segment_init_failure(env: NeonEnv): diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 81e2eef061..9b118b1cff 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 81e2eef0616c65c2233c75b06f25766ae4c080c4 +Subproject commit 9b118b1cffa6e4ca0d63389b57b54d11e207e9a8 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 9422247c58..799e7a08dd 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 9422247c582e7c1a08a4855d04af0874f8df2f34 +Subproject commit 799e7a08dd171aa06a7395dd326f4243aaeb9f93 diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index a8fea8b4be..517b8dc244 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit a8fea8b4be43039f0782347c88a9b9b25f50c9d8 +Subproject commit 517b8dc244abf3e56f0089849e464af76f70b94e diff --git a/vendor/revisions.json b/vendor/revisions.json index 72d97d7f6a..8dde46a01e 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,15 +1,15 @@ { "v17": [ "17.4", - "a8fea8b4be43039f0782347c88a9b9b25f50c9d8" + "517b8dc244abf3e56f0089849e464af76f70b94e" ], "v16": [ "16.8", - "9422247c582e7c1a08a4855d04af0874f8df2f34" + "799e7a08dd171aa06a7395dd326f4243aaeb9f93" ], "v15": [ "15.12", - "81e2eef0616c65c2233c75b06f25766ae4c080c4" + "9b118b1cffa6e4ca0d63389b57b54d11e207e9a8" ], "v14": [ "14.17",