diff --git a/.dockerignore b/.dockerignore index 7ead48db7c..9fafc2e4ba 100644 --- a/.dockerignore +++ b/.dockerignore @@ -14,6 +14,7 @@ !compute/ !compute_tools/ !control_plane/ +!docker-compose/ext-src !libs/ !pageserver/ !pgxn/ diff --git a/scripts/generate_image_maps.py b/.github/scripts/generate_image_maps.py similarity index 98% rename from scripts/generate_image_maps.py rename to .github/scripts/generate_image_maps.py index 915eb33673..39ece5b38f 100644 --- a/scripts/generate_image_maps.py +++ b/.github/scripts/generate_image_maps.py @@ -27,6 +27,7 @@ components = { registries = { "dev": [ "docker.io/neondatabase", + "ghcr.io/neondatabase", f"{dev_aws}.dkr.ecr.{aws_region}.amazonaws.com", f"{dev_acr}.azurecr.io/neondatabase", ], diff --git a/.github/scripts/previous-releases.jq b/.github/scripts/previous-releases.jq new file mode 100644 index 0000000000..b0b00bce18 --- /dev/null +++ b/.github/scripts/previous-releases.jq @@ -0,0 +1,25 @@ +# Expects response from https://docs.github.com/en/rest/releases/releases?apiVersion=2022-11-28#list-releases as input, +# with tag names `release` for storage, `release-compute` for compute and `release-proxy` for proxy releases. +# Extract only the `tag_name` field from each release object +[ .[].tag_name ] + +# Transform each tag name into a structured object using regex capture +| reduce map( + capture("^(?release(-(?proxy|compute))?-(?\\d+))$") + | { + component: (.component // "storage"), # Default to "storage" if no component is specified + version: (.version | tonumber), # Convert the version number to an integer + full: .full # Store the full tag name for final output + } + )[] as $entry # Loop over the transformed list + +# Accumulate the latest (highest-numbered) version for each component +({}; + .[$entry.component] |= (if . == null or $entry.version > .version then $entry else . end)) + +# Convert the resulting object into an array of formatted strings +| to_entries +| map("\(.key)=\(.value.full)") + +# Output each string separately +| .[] diff --git a/scripts/push_with_image_map.py b/.github/scripts/push_with_image_map.py similarity index 100% rename from scripts/push_with_image_map.py rename to .github/scripts/push_with_image_map.py diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index 3740e6dc9c..30fde127b0 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -337,7 +337,7 @@ jobs: - name: Pytest regression tests continue-on-error: ${{ matrix.lfc_state == 'with-lfc' && inputs.build-type == 'debug' }} uses: ./.github/actions/run-python-test-set - timeout-minutes: ${{ inputs.sanitizers != 'enabled' && 60 || 180 }} + timeout-minutes: ${{ inputs.sanitizers != 'enabled' && 75 || 180 }} with: build_type: ${{ inputs.build-type }} test_selection: regress diff --git a/.github/workflows/_meta.yml b/.github/workflows/_meta.yml new file mode 100644 index 0000000000..9454533fbb --- /dev/null +++ b/.github/workflows/_meta.yml @@ -0,0 +1,103 @@ +name: Generate run metadata +on: + workflow_call: + inputs: + github-event-name: + type: string + required: true + outputs: + build-tag: + description: "Tag for the current workflow run" + value: ${{ jobs.tags.outputs.build-tag }} + previous-storage-release: + description: "Tag of the last storage release" + value: ${{ jobs.tags.outputs.storage }} + previous-proxy-release: + description: "Tag of the last proxy release" + value: ${{ jobs.tags.outputs.proxy }} + previous-compute-release: + description: "Tag of the last compute release" + value: ${{ jobs.tags.outputs.compute }} + run-kind: + description: "The kind of run we're currently in. Will be one of `pr`, `push-main`, `storage-rc`, `storage-release`, `proxy-rc`, `proxy-release`, `compute-rc`, `compute-release` or `merge_queue`" + value: ${{ jobs.tags.outputs.run-kind }} + +permissions: {} + +jobs: + tags: + runs-on: ubuntu-22.04 + outputs: + build-tag: ${{ steps.build-tag.outputs.tag }} + compute: ${{ steps.previous-releases.outputs.compute }} + proxy: ${{ steps.previous-releases.outputs.proxy }} + storage: ${{ steps.previous-releases.outputs.storage }} + run-kind: ${{ steps.run-kind.outputs.run-kind }} + permissions: + contents: read + steps: + # Need `fetch-depth: 0` to count the number of commits in the branch + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Get run kind + id: run-kind + env: + RUN_KIND: >- + ${{ + false + || (inputs.github-event-name == 'push' && github.ref_name == 'main') && 'push-main' + || (inputs.github-event-name == 'push' && github.ref_name == 'release') && 'storage-release' + || (inputs.github-event-name == 'push' && github.ref_name == 'release-compute') && 'compute-release' + || (inputs.github-event-name == 'push' && github.ref_name == 'release-proxy') && 'proxy-release' + || (inputs.github-event-name == 'pull_request' && github.base_ref == 'release') && 'storage-rc-pr' + || (inputs.github-event-name == 'pull_request' && github.base_ref == 'release-compute') && 'compute-rc-pr' + || (inputs.github-event-name == 'pull_request' && github.base_ref == 'release-proxy') && 'proxy-rc-pr' + || (inputs.github-event-name == 'pull_request') && 'pr' + || 'unknown' + }} + run: | + echo "run-kind=$RUN_KIND" | tee -a $GITHUB_OUTPUT + + - name: Get build tag + id: build-tag + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + CURRENT_BRANCH: ${{ github.head_ref || github.ref_name }} + CURRENT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} + RUN_KIND: ${{ steps.run-kind.outputs.run-kind }} + run: | + case $RUN_KIND in + push-main) + echo "tag=$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT + ;; + storage-release) + echo "tag=release-$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT + ;; + proxy-release) + echo "tag=release-proxy-$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT + ;; + compute-release) + echo "tag=release-compute-$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT + ;; + pr|storage-rc-pr|compute-rc-pr|proxy-rc-pr) + BUILD_AND_TEST_RUN_ID=$(gh run list -b $CURRENT_BRANCH -c $CURRENT_SHA -w 'Build and Test' -L 1 --json databaseId --jq '.[].databaseId') + echo "tag=$BUILD_AND_TEST_RUN_ID" | tee -a $GITHUB_OUTPUT + ;; + *) + echo "Unexpected RUN_KIND ('${RUN_KIND}'), failing to assign build-tag!" + exit 1 + esac + + - name: Get the previous release-tags + id: previous-releases + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + gh api --paginate \ + -H "Accept: application/vnd.github+json" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + "/repos/${GITHUB_REPOSITORY}/releases" \ + | jq -f .github/scripts/previous-releases.jq -r \ + | tee -a "${GITHUB_OUTPUT}" diff --git a/.github/workflows/_push-to-container-registry.yml b/.github/workflows/_push-to-container-registry.yml index c938f62ad5..2dab665f40 100644 --- a/.github/workflows/_push-to-container-registry.yml +++ b/.github/workflows/_push-to-container-registry.yml @@ -11,8 +11,12 @@ on: description: AWS region to log in to. Required when pushing to ECR. required: false type: string - aws-account-ids: - description: Comma separated AWS account IDs to log in to for pushing to ECR. Required when pushing to ECR. + aws-account-id: + description: AWS account ID to log in to for pushing to ECR. Required when pushing to ECR. + required: false + type: string + aws-role-to-assume: + description: AWS role to assume to for pushing to ECR. Required when pushing to ECR. required: false type: string azure-client-id: @@ -31,16 +35,6 @@ on: description: ACR registry name. Required when pushing to ACR. required: false type: string - secrets: - docker-hub-username: - description: Docker Hub username. Required when pushing to Docker Hub. - required: false - docker-hub-password: - description: Docker Hub password. Required when pushing to Docker Hub. - required: false - aws-role-to-assume: - description: AWS role to assume. Required when pushing to ECR. - required: false permissions: {} @@ -53,10 +47,11 @@ jobs: runs-on: ubuntu-22.04 permissions: id-token: write # Required for aws/azure login + packages: write # required for pushing to GHCR steps: - uses: actions/checkout@v4 with: - sparse-checkout: scripts/push_with_image_map.py + sparse-checkout: .github/scripts/push_with_image_map.py sparse-checkout-cone-mode: false - name: Print image-map @@ -67,14 +62,14 @@ jobs: uses: aws-actions/configure-aws-credentials@v4 with: aws-region: "${{ inputs.aws-region }}" - role-to-assume: "${{ secrets.aws-role-to-assume }}" + role-to-assume: "arn:aws:iam::${{ inputs.aws-account-id }}:role/${{ inputs.aws-role-to-assume }}" role-duration-seconds: 3600 - name: Login to ECR if: contains(inputs.image-map, 'amazonaws.com/') uses: aws-actions/amazon-ecr-login@v2 with: - registries: "${{ inputs.aws-account-ids }}" + registries: "${{ inputs.aws-account-id }}" - name: Configure Azure credentials if: contains(inputs.image-map, 'azurecr.io/') @@ -89,13 +84,21 @@ jobs: run: | az acr login --name=${{ inputs.acr-registry-name }} + - name: Login to GHCR + if: contains(inputs.image-map, 'ghcr.io/') + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} + - name: Log in to Docker Hub uses: docker/login-action@v3 with: - username: ${{ secrets.docker-hub-username }} - password: ${{ secrets.docker-hub-password }} + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - name: Copy docker images to target registries - run: python scripts/push_with_image_map.py + run: python3 .github/scripts/push_with_image_map.py env: IMAGE_MAP: ${{ inputs.image-map }} diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index b36ac46f35..ffb6c65af9 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -140,6 +140,7 @@ jobs: --ignore test_runner/performance/test_logical_replication.py --ignore test_runner/performance/test_physical_replication.py --ignore test_runner/performance/test_perf_ingest_using_pgcopydb.py + --ignore test_runner/performance/test_cumulative_statistics_persistence.py env: BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -171,6 +172,61 @@ jobs: env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} + cumstats-test: + if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} + permissions: + contents: write + statuses: write + id-token: write # aws-actions/configure-aws-credentials + env: + POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install + DEFAULT_PG_VERSION: 17 + TEST_OUTPUT: /tmp/test_output + BUILD_TYPE: remote + SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} + PLATFORM: "neon-staging" + + runs-on: [ self-hosted, us-east-2, x64 ] + container: + image: neondatabase/build-tools:pinned-bookworm + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + options: --init + + steps: + - uses: actions/checkout@v4 + + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: eu-central-1 + role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + role-duration-seconds: 18000 # 5 hours + + - name: Download Neon artifact + uses: ./.github/actions/download + with: + name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact + path: /tmp/neon/ + prefix: latest + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + + - name: Verify that cumulative statistics are preserved + uses: ./.github/actions/run-python-test-set + with: + build_type: ${{ env.BUILD_TYPE }} + test_selection: performance/test_cumulative_statistics_persistence.py + run_in_parallel: false + save_perf_report: ${{ env.SAVE_PERF_REPORT }} + extra_params: -m remote_cluster --timeout 3600 + pg_version: ${{ env.DEFAULT_PG_VERSION }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + env: + VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" + PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" + NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }} + replication-tests: if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} permissions: @@ -398,6 +454,9 @@ jobs: runs-on: ${{ matrix.runner }} container: image: ${{ matrix.image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} options: --init # Increase timeout to 8h, default timeout is 6h diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 8f3392ceea..fb6da2f173 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -65,38 +65,11 @@ jobs: token: ${{ secrets.GITHUB_TOKEN }} filters: .github/file-filters.yaml - tag: + meta: needs: [ check-permissions ] - runs-on: [ self-hosted, small ] - container: ${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/base:pinned - outputs: - build-tag: ${{steps.build-tag.outputs.tag}} - - steps: - # Need `fetch-depth: 0` to count the number of commits in the branch - - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Get build tag - run: | - echo run:$GITHUB_RUN_ID - echo ref:$GITHUB_REF_NAME - echo rev:$(git rev-list --count HEAD) - if [[ "$GITHUB_REF_NAME" == "main" ]]; then - echo "tag=$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT - elif [[ "$GITHUB_REF_NAME" == "release" ]]; then - echo "tag=release-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT - elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then - echo "tag=release-proxy-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT - elif [[ "$GITHUB_REF_NAME" == "release-compute" ]]; then - echo "tag=release-compute-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT - else - echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release', 'release-proxy', 'release-compute'" - echo "tag=$GITHUB_RUN_ID" >> $GITHUB_OUTPUT - fi - shell: bash - id: build-tag + uses: ./.github/workflows/_meta.yml + with: + github-event-name: ${{ github.event_name }} build-build-tools-image: needs: [ check-permissions ] @@ -199,7 +172,7 @@ jobs: secrets: inherit build-and-test-locally: - needs: [ tag, build-build-tools-image ] + needs: [ meta, build-build-tools-image ] strategy: fail-fast: false matrix: @@ -213,7 +186,7 @@ jobs: with: arch: ${{ matrix.arch }} build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm - build-tag: ${{ needs.tag.outputs.build-tag }} + build-tag: ${{ needs.meta.outputs.build-tag }} build-type: ${{ matrix.build-type }} # Run tests on all Postgres versions in release builds and only on the latest version in debug builds. # Run without LFC on v17 release and debug builds only. For all the other cases LFC is enabled. @@ -497,13 +470,24 @@ jobs: }) trigger-e2e-tests: - if: ${{ !github.event.pull_request.draft || contains( github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft') || github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' }} - needs: [ check-permissions, push-neon-image-dev, push-compute-image-dev, tag ] + # Depends on jobs that can get skipped + if: >- + ${{ + ( + !github.event.pull_request.draft + || contains( github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft') + || contains(fromJSON('["push-main", "storage-release", "proxy-release", "compute-release"]'), needs.meta.outputs.run-kind) + ) && !failure() && !cancelled() + }} + needs: [ check-permissions, push-neon-image-dev, push-compute-image-dev, meta ] uses: ./.github/workflows/trigger-e2e-tests.yml + with: + github-event-name: ${{ github.event_name }} secrets: inherit neon-image-arch: - needs: [ check-permissions, build-build-tools-image, tag ] + needs: [ check-permissions, build-build-tools-image, meta ] + if: ${{ contains(fromJSON('["push-main", "pr", "storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) }} strategy: matrix: arch: [ x64, arm64 ] @@ -539,7 +523,7 @@ jobs: build-args: | ADDITIONAL_RUSTFLAGS=${{ matrix.arch == 'arm64' && '-Ctarget-feature=+lse -Ctarget-cpu=neoverse-n1' || '' }} GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} - BUILD_TAG=${{ needs.tag.outputs.build-tag }} + BUILD_TAG=${{ needs.meta.outputs.build-tag }} TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-bookworm DEBIAN_VERSION=bookworm provenance: false @@ -549,10 +533,11 @@ jobs: cache-from: type=registry,ref=cache.neon.build/neon:cache-bookworm-${{ matrix.arch }} cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon:cache-{0}-{1},mode=max', 'bookworm', matrix.arch) || '' }} tags: | - neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm-${{ matrix.arch }} + neondatabase/neon:${{ needs.meta.outputs.build-tag }}-bookworm-${{ matrix.arch }} neon-image: - needs: [ neon-image-arch, tag ] + needs: [ neon-image-arch, meta ] + if: ${{ contains(fromJSON('["push-main", "pr", "storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) }} runs-on: ubuntu-22.04 permissions: id-token: write # aws-actions/configure-aws-credentials @@ -567,13 +552,14 @@ jobs: - name: Create multi-arch image run: | - docker buildx imagetools create -t neondatabase/neon:${{ needs.tag.outputs.build-tag }} \ - -t neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm \ - neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm-x64 \ - neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm-arm64 + docker buildx imagetools create -t neondatabase/neon:${{ needs.meta.outputs.build-tag }} \ + -t neondatabase/neon:${{ needs.meta.outputs.build-tag }}-bookworm \ + neondatabase/neon:${{ needs.meta.outputs.build-tag }}-bookworm-x64 \ + neondatabase/neon:${{ needs.meta.outputs.build-tag }}-bookworm-arm64 compute-node-image-arch: - needs: [ check-permissions, build-build-tools-image, tag ] + needs: [ check-permissions, build-build-tools-image, meta ] + if: ${{ contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} permissions: id-token: write # aws-actions/configure-aws-credentials statuses: write @@ -631,7 +617,7 @@ jobs: build-args: | GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} PG_VERSION=${{ matrix.version.pg }} - BUILD_TAG=${{ needs.tag.outputs.build-tag }} + BUILD_TAG=${{ needs.meta.outputs.build-tag }} TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-${{ matrix.version.debian }} DEBIAN_VERSION=${{ matrix.version.debian }} provenance: false @@ -641,7 +627,7 @@ jobs: cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }} cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-node-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }} tags: | - neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-${{ matrix.arch }} + neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-${{ matrix.arch }} - name: Build neon extensions test image if: matrix.version.pg >= 'v16' @@ -651,7 +637,7 @@ jobs: build-args: | GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} PG_VERSION=${{ matrix.version.pg }} - BUILD_TAG=${{ needs.tag.outputs.build-tag }} + BUILD_TAG=${{ needs.meta.outputs.build-tag }} TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-${{ matrix.version.debian }} DEBIAN_VERSION=${{ matrix.version.debian }} provenance: false @@ -661,10 +647,11 @@ jobs: target: extension-tests cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }} tags: | - neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{needs.tag.outputs.build-tag}}-${{ matrix.version.debian }}-${{ matrix.arch }} + neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{needs.meta.outputs.build-tag}}-${{ matrix.version.debian }}-${{ matrix.arch }} compute-node-image: - needs: [ compute-node-image-arch, tag ] + needs: [ compute-node-image-arch, meta ] + if: ${{ contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} permissions: id-token: write # aws-actions/configure-aws-credentials statuses: write @@ -692,21 +679,22 @@ jobs: - name: Create multi-arch compute-node image run: | - docker buildx imagetools create -t neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \ - -t neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }} \ - neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \ - neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64 + docker buildx imagetools create -t neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} \ + -t neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }} \ + neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \ + neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-arm64 - name: Create multi-arch neon-test-extensions image if: matrix.version.pg >= 'v16' run: | - docker buildx imagetools create -t neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \ - -t neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }} \ - neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \ - neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64 + docker buildx imagetools create -t neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} \ + -t neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }} \ + neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \ + neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-arm64 vm-compute-node-image: - needs: [ check-permissions, tag, compute-node-image ] + needs: [ check-permissions, meta, compute-node-image ] + if: ${{ contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} runs-on: [ self-hosted, large ] strategy: fail-fast: false @@ -722,14 +710,14 @@ jobs: - pg: v17 debian: bookworm env: - VM_BUILDER_VERSION: v0.37.1 + VM_BUILDER_VERSION: v0.42.2 steps: - uses: actions/checkout@v4 - name: Downloading vm-builder run: | - curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder -o vm-builder + curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder-amd64 -o vm-builder chmod +x vm-builder - uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193 @@ -742,22 +730,25 @@ jobs: # it won't have the proper authentication (written at v0.6.0) - name: Pulling compute-node image run: | - docker pull neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} + docker pull neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} - name: Build vm image run: | ./vm-builder \ -size=2G \ -spec=compute/vm-image-spec-${{ matrix.version.debian }}.yaml \ - -src=neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \ - -dst=neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} + -src=neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} \ + -dst=neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} \ + -target-arch=linux/amd64 - name: Pushing vm-compute-node image run: | - docker push neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} + docker push neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} test-images: - needs: [ check-permissions, tag, neon-image, compute-node-image ] + needs: [ check-permissions, meta, neon-image, compute-node-image ] + # Depends on jobs that can get skipped + if: "!failure() && !cancelled()" strategy: fail-fast: false matrix: @@ -775,17 +766,6 @@ jobs: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - - name: Get the last compute release tag - id: get-last-compute-release-tag - env: - GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} - run: | - tag=$(gh api -q '[.[].tag_name | select(startswith("release-compute"))][0]'\ - -H "Accept: application/vnd.github+json" \ - -H "X-GitHub-Api-Version: 2022-11-28" \ - "/repos/${{ github.repository }}/releases") - echo tag=${tag} >> ${GITHUB_OUTPUT} - # `neondatabase/neon` contains multiple binaries, all of them use the same input for the version into the same version formatting library. # Pick pageserver as currently the only binary with extra "version" features printed in the string to verify. # Regular pageserver version string looks like @@ -795,8 +775,9 @@ jobs: # Ensure that we don't have bad versions. - name: Verify image versions shell: bash # ensure no set -e for better error messages + if: ${{ contains(fromJSON('["push-main", "pr", "storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) }} run: | - pageserver_version=$(docker run --rm neondatabase/neon:${{ needs.tag.outputs.build-tag }} "/bin/sh" "-c" "/usr/local/bin/pageserver --version") + pageserver_version=$(docker run --rm neondatabase/neon:${{ needs.meta.outputs.build-tag }} "/bin/sh" "-c" "/usr/local/bin/pageserver --version") echo "Pageserver version string: $pageserver_version" @@ -813,7 +794,24 @@ jobs: - name: Verify docker-compose example and test extensions timeout-minutes: 20 env: - TAG: ${{needs.tag.outputs.build-tag}} + TAG: >- + ${{ + contains(fromJSON('["compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) + && needs.meta.outputs.previous-storage-release + || needs.meta.outputs.build-tag + }} + COMPUTE_TAG: >- + ${{ + contains(fromJSON('["storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) + && needs.meta.outputs.previous-compute-release + || needs.meta.outputs.build-tag + }} + TEST_EXTENSIONS_TAG: >- + ${{ + contains(fromJSON('["storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) + && 'latest' + || needs.meta.outputs.build-tag + }} TEST_VERSION_ONLY: ${{ matrix.pg_version }} run: ./docker-compose/docker_compose_test.sh @@ -825,10 +823,17 @@ jobs: - name: Test extension upgrade timeout-minutes: 20 - if: ${{ needs.tag.outputs.build-tag == github.run_id }} + if: ${{ contains(fromJSON('["pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} env: - NEWTAG: ${{ needs.tag.outputs.build-tag }} - OLDTAG: ${{ steps.get-last-compute-release-tag.outputs.tag }} + TAG: >- + ${{ + false + || needs.meta.outputs.run-kind == 'pr' && needs.meta.outputs.build-tag + || needs.meta.outputs.run-kind == 'compute-rc-pr' && needs.meta.outputs.previous-storage-release + }} + TEST_EXTENSIONS_TAG: ${{ needs.meta.outputs.previous-compute-release }} + NEW_COMPUTE_TAG: ${{ needs.meta.outputs.build-tag }} + OLD_COMPUTE_TAG: ${{ needs.meta.outputs.previous-compute-release }} run: ./docker-compose/test_extensions_upgrade.sh - name: Print logs and clean up @@ -838,7 +843,7 @@ jobs: docker compose --profile test-extensions -f ./docker-compose/docker-compose.yml down generate-image-maps: - needs: [ tag ] + needs: [ meta ] runs-on: ubuntu-22.04 outputs: neon-dev: ${{ steps.generate.outputs.neon-dev }} @@ -848,14 +853,14 @@ jobs: steps: - uses: actions/checkout@v4 with: - sparse-checkout: scripts/generate_image_maps.py + sparse-checkout: .github/scripts/generate_image_maps.py sparse-checkout-cone-mode: false - name: Generate Image Maps id: generate - run: python scripts/generate_image_maps.py + run: python3 .github/scripts/generate_image_maps.py env: - BUILD_TAG: "${{ needs.tag.outputs.build-tag }}" + BUILD_TAG: "${{ needs.meta.outputs.build-tag }}" BRANCH: "${{ github.ref_name }}" DEV_ACR: "${{ vars.AZURE_DEV_REGISTRY_NAME }}" PROD_ACR: "${{ vars.AZURE_PROD_REGISTRY_NAME }}" @@ -864,88 +869,95 @@ jobs: AWS_REGION: "${{ vars.AWS_ECR_REGION }}" push-neon-image-dev: - needs: [ generate-image-maps, neon-image ] + needs: [ meta, generate-image-maps, neon-image ] + if: ${{ contains(fromJSON('["push-main", "pr", "storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) }} uses: ./.github/workflows/_push-to-container-registry.yml + permissions: + id-token: write # Required for aws/azure login + packages: write # required for pushing to GHCR with: image-map: '${{ needs.generate-image-maps.outputs.neon-dev }}' aws-region: ${{ vars.AWS_ECR_REGION }} - aws-account-ids: "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}" + aws-account-id: "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}" + aws-role-to-assume: "gha-oidc-neon-admin" azure-client-id: ${{ vars.AZURE_DEV_CLIENT_ID }} azure-subscription-id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }} azure-tenant-id: ${{ vars.AZURE_TENANT_ID }} acr-registry-name: ${{ vars.AZURE_DEV_REGISTRY_NAME }} - secrets: - aws-role-to-assume: "${{ vars.DEV_AWS_OIDC_ROLE_ARN }}" - docker-hub-username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + secrets: inherit push-compute-image-dev: - needs: [ generate-image-maps, vm-compute-node-image ] + needs: [ meta, generate-image-maps, vm-compute-node-image ] + if: ${{ contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} uses: ./.github/workflows/_push-to-container-registry.yml + permissions: + id-token: write # Required for aws/azure login + packages: write # required for pushing to GHCR with: image-map: '${{ needs.generate-image-maps.outputs.compute-dev }}' aws-region: ${{ vars.AWS_ECR_REGION }} - aws-account-ids: "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}" + aws-account-id: "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}" + aws-role-to-assume: "gha-oidc-neon-admin" azure-client-id: ${{ vars.AZURE_DEV_CLIENT_ID }} azure-subscription-id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }} azure-tenant-id: ${{ vars.AZURE_TENANT_ID }} acr-registry-name: ${{ vars.AZURE_DEV_REGISTRY_NAME }} - secrets: - aws-role-to-assume: "${{ vars.DEV_AWS_OIDC_ROLE_ARN }}" - docker-hub-username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + secrets: inherit push-neon-image-prod: - if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' - needs: [ generate-image-maps, neon-image, test-images ] + needs: [ meta, generate-image-maps, neon-image, test-images ] + # Depends on jobs that can get skipped + if: ${{ !failure() && !cancelled() && contains(fromJSON('["storage-release", "proxy-release"]'), needs.meta.outputs.run-kind) }} uses: ./.github/workflows/_push-to-container-registry.yml + permissions: + id-token: write # Required for aws/azure login + packages: write # required for pushing to GHCR with: image-map: '${{ needs.generate-image-maps.outputs.neon-prod }}' aws-region: ${{ vars.AWS_ECR_REGION }} - aws-account-ids: "${{ vars.NEON_PROD_AWS_ACCOUNT_ID }}" + aws-account-id: "${{ vars.NEON_PROD_AWS_ACCOUNT_ID }}" + aws-role-to-assume: "gha-oidc-neon-admin" azure-client-id: ${{ vars.AZURE_PROD_CLIENT_ID }} azure-subscription-id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }} azure-tenant-id: ${{ vars.AZURE_TENANT_ID }} acr-registry-name: ${{ vars.AZURE_PROD_REGISTRY_NAME }} - secrets: - aws-role-to-assume: "${{ secrets.PROD_GHA_OIDC_ROLE }}" - docker-hub-username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + secrets: inherit push-compute-image-prod: - if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' - needs: [ generate-image-maps, vm-compute-node-image, test-images ] + needs: [ meta, generate-image-maps, vm-compute-node-image, test-images ] + # Depends on jobs that can get skipped + if: ${{ !failure() && !cancelled() && needs.meta.outputs.run-kind == 'compute-release' }} uses: ./.github/workflows/_push-to-container-registry.yml + permissions: + id-token: write # Required for aws/azure login + packages: write # required for pushing to GHCR with: image-map: '${{ needs.generate-image-maps.outputs.compute-prod }}' aws-region: ${{ vars.AWS_ECR_REGION }} - aws-account-ids: "${{ vars.NEON_PROD_AWS_ACCOUNT_ID }}" + aws-account-id: "${{ vars.NEON_PROD_AWS_ACCOUNT_ID }}" + aws-role-to-assume: "gha-oidc-neon-admin" azure-client-id: ${{ vars.AZURE_PROD_CLIENT_ID }} azure-subscription-id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }} azure-tenant-id: ${{ vars.AZURE_TENANT_ID }} acr-registry-name: ${{ vars.AZURE_PROD_REGISTRY_NAME }} - secrets: - aws-role-to-assume: "${{ secrets.PROD_GHA_OIDC_ROLE }}" - docker-hub-username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + secrets: inherit # This is a bit of a special case so we're not using a generated image map. add-latest-tag-to-neon-extensions-test-image: if: github.ref_name == 'main' - needs: [ tag, compute-node-image ] + needs: [ meta, compute-node-image ] uses: ./.github/workflows/_push-to-container-registry.yml with: image-map: | { - "docker.io/neondatabase/neon-test-extensions-v16:${{ needs.tag.outputs.build-tag }}": ["docker.io/neondatabase/neon-test-extensions-v16:latest"], - "docker.io/neondatabase/neon-test-extensions-v17:${{ needs.tag.outputs.build-tag }}": ["docker.io/neondatabase/neon-test-extensions-v17:latest"] + "docker.io/neondatabase/neon-test-extensions-v16:${{ needs.meta.outputs.build-tag }}": ["docker.io/neondatabase/neon-test-extensions-v16:latest"], + "docker.io/neondatabase/neon-test-extensions-v17:${{ needs.meta.outputs.build-tag }}": ["docker.io/neondatabase/neon-test-extensions-v17:latest"] } - secrets: - docker-hub-username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + secrets: inherit trigger-custom-extensions-build-and-wait: - needs: [ check-permissions, tag ] + needs: [ check-permissions, meta ] + if: ${{ contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} runs-on: ubuntu-22.04 permissions: id-token: write # aws-actions/configure-aws-credentials @@ -980,7 +992,7 @@ jobs: \"ci_job_name\": \"build-and-upload-extensions\", \"commit_hash\": \"$COMMIT_SHA\", \"remote_repo\": \"${{ github.repository }}\", - \"compute_image_tag\": \"${{ needs.tag.outputs.build-tag }}\", + \"compute_image_tag\": \"${{ needs.meta.outputs.build-tag }}\", \"remote_branch_name\": \"${{ github.ref_name }}\" } }" @@ -1024,9 +1036,9 @@ jobs: exit 1 deploy: - needs: [ check-permissions, push-neon-image-prod, push-compute-image-prod, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait ] - # `!failure() && !cancelled()` is required because the workflow depends on the job that can be skipped: `push-to-acr-dev` and `push-to-acr-prod` - if: (github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute') && !failure() && !cancelled() + needs: [ check-permissions, push-neon-image-prod, push-compute-image-prod, meta, build-and-test-locally, trigger-custom-extensions-build-and-wait ] + # `!failure() && !cancelled()` is required because the workflow depends on the job that can be skipped: `push-neon-image-prod` and `push-compute-image-prod` + if: ${{ contains(fromJSON('["push-main", "storage-release", "proxy-release", "compute-release"]'), needs.meta.outputs.run-kind) && !failure() && !cancelled() }} permissions: id-token: write # aws-actions/configure-aws-credentials statuses: write @@ -1037,108 +1049,103 @@ jobs: - uses: actions/checkout@v4 - name: Create git tag and GitHub release - if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' + if: ${{ contains(fromJSON('["storage-release", "proxy-release", "compute-release"]'), needs.meta.outputs.run-kind) }} uses: actions/github-script@v7 + env: + TAG: "${{ needs.meta.outputs.build-tag }}" + BRANCH: "${{ github.ref_name }}" + PREVIOUS_RELEASE: >- + ${{ + false + || needs.meta.outputs.run-kind == 'storage-release' && needs.meta.outputs.previous-storage-release + || needs.meta.outputs.run-kind == 'proxy-release' && needs.meta.outputs.previous-proxy-release + || needs.meta.outputs.run-kind == 'compute-release' && needs.meta.outputs.previous-compute-release + || 'unknown' + }} with: retries: 5 script: | - const tag = "${{ needs.tag.outputs.build-tag }}"; - const branch = "${{ github.ref_name }}"; + const { TAG, BRANCH, PREVIOUS_RELEASE } = process.env try { const existingRef = await github.rest.git.getRef({ owner: context.repo.owner, repo: context.repo.repo, - ref: `tags/${tag}`, + ref: `tags/${TAG}`, }); if (existingRef.data.object.sha !== context.sha) { - throw new Error(`Tag ${tag} already exists but points to a different commit (expected: ${context.sha}, actual: ${existingRef.data.object.sha}).`); + throw new Error(`Tag ${TAG} already exists but points to a different commit (expected: ${context.sha}, actual: ${existingRef.data.object.sha}).`); } - console.log(`Tag ${tag} already exists and points to ${context.sha} as expected.`); + console.log(`Tag ${TAG} already exists and points to ${context.sha} as expected.`); } catch (error) { if (error.status !== 404) { throw error; } - console.log(`Tag ${tag} does not exist. Creating it...`); + console.log(`Tag ${TAG} does not exist. Creating it...`); await github.rest.git.createRef({ owner: context.repo.owner, repo: context.repo.repo, - ref: `refs/tags/${tag}`, + ref: `refs/tags/${TAG}`, sha: context.sha, }); - console.log(`Tag ${tag} created successfully.`); + console.log(`Tag ${TAG} created successfully.`); } try { const existingRelease = await github.rest.repos.getReleaseByTag({ owner: context.repo.owner, repo: context.repo.repo, - tag: tag, + tag: TAG, }); - console.log(`Release for tag ${tag} already exists (ID: ${existingRelease.data.id}).`); + console.log(`Release for tag ${TAG} already exists (ID: ${existingRelease.data.id}).`); } catch (error) { if (error.status !== 404) { throw error; } - console.log(`Release for tag ${tag} does not exist. Creating it...`); + console.log(`Release for tag ${TAG} does not exist. Creating it...`); // Find the PR number using the commit SHA const pullRequests = await github.rest.pulls.list({ owner: context.repo.owner, repo: context.repo.repo, state: 'closed', - base: branch, + base: BRANCH, }); const pr = pullRequests.data.find(pr => pr.merge_commit_sha === context.sha); const prNumber = pr ? pr.number : null; - // Find the previous release on the branch - const releases = await github.rest.repos.listReleases({ - owner: context.repo.owner, - repo: context.repo.repo, - per_page: 100, - }); - - const branchReleases = releases.data - .filter((release) => { - const regex = new RegExp(`^${branch}-\\d+$`); - return regex.test(release.tag_name) && !release.draft && !release.prerelease; - }) - .sort((a, b) => new Date(b.created_at) - new Date(a.created_at)); - - const previousTag = branchReleases.length > 0 ? branchReleases[0].tag_name : null; - const releaseNotes = [ prNumber ? `Release PR https://github.com/${context.repo.owner}/${context.repo.repo}/pull/${prNumber}.` : 'Release PR not found.', - previousTag - ? `Diff with the previous release https://github.com/${context.repo.owner}/${context.repo.repo}/compare/${previousTag}...${tag}.` - : `No previous release found on branch ${branch}.`, + `Diff with the previous release https://github.com/${context.repo.owner}/${context.repo.repo}/compare/${PREVIOUS_RELEASE}...${TAG}.` ].join('\n\n'); await github.rest.repos.createRelease({ owner: context.repo.owner, repo: context.repo.repo, - tag_name: tag, + tag_name: TAG, body: releaseNotes, }); - console.log(`Release for tag ${tag} created successfully.`); + console.log(`Release for tag ${TAG} created successfully.`); } - name: Trigger deploy workflow env: GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} + RUN_KIND: ${{ needs.meta.outputs.run-kind }} run: | - if [[ "$GITHUB_REF_NAME" == "main" ]]; then - gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false - elif [[ "$GITHUB_REF_NAME" == "release" ]]; then + case ${RUN_KIND} in + push-main) + gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.meta.outputs.build-tag}} -f deployPreprodRegion=false + ;; + storage-release) gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main \ -f deployPgSniRouter=false \ -f deployProxy=false \ @@ -1146,7 +1153,7 @@ jobs: -f deployStorageBroker=true \ -f deployStorageController=true \ -f branch=main \ - -f dockerTag=${{needs.tag.outputs.build-tag}} \ + -f dockerTag=${{needs.meta.outputs.build-tag}} \ -f deployPreprodRegion=true gh workflow --repo neondatabase/infra run deploy-prod.yml --ref main \ @@ -1154,8 +1161,9 @@ jobs: -f deployStorageBroker=true \ -f deployStorageController=true \ -f branch=main \ - -f dockerTag=${{needs.tag.outputs.build-tag}} - elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then + -f dockerTag=${{needs.meta.outputs.build-tag}} + ;; + proxy-release) gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main \ -f deployPgSniRouter=true \ -f deployProxy=true \ @@ -1163,7 +1171,7 @@ jobs: -f deployStorageBroker=false \ -f deployStorageController=false \ -f branch=main \ - -f dockerTag=${{needs.tag.outputs.build-tag}} \ + -f dockerTag=${{needs.meta.outputs.build-tag}} \ -f deployPreprodRegion=true gh workflow --repo neondatabase/infra run deploy-proxy-prod.yml --ref main \ @@ -1173,13 +1181,16 @@ jobs: -f deployProxyScram=true \ -f deployProxyAuthBroker=true \ -f branch=main \ - -f dockerTag=${{needs.tag.outputs.build-tag}} - elif [[ "$GITHUB_REF_NAME" == "release-compute" ]]; then - gh workflow --repo neondatabase/infra run deploy-compute-dev.yml --ref main -f dockerTag=${{needs.tag.outputs.build-tag}} - else - echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main', 'release', 'release-proxy' or 'release-compute'" + -f dockerTag=${{needs.meta.outputs.build-tag}} + ;; + compute-release) + gh workflow --repo neondatabase/infra run deploy-compute-dev.yml --ref main -f dockerTag=${{needs.meta.outputs.build-tag}} + ;; + *) + echo "RUN_KIND (value '${RUN_KIND}') is not set to either 'push-main', 'storage-release', 'proxy-release' or 'compute-release'" exit 1 - fi + ;; + esac notify-storage-release-deploy-failure: needs: [ deploy ] @@ -1204,7 +1215,7 @@ jobs: id-token: write # aws-actions/configure-aws-credentials statuses: write contents: read - # `!failure() && !cancelled()` is required because the workflow transitively depends on the job that can be skipped: `push-to-acr-dev` and `push-to-acr-prod` + # `!failure() && !cancelled()` is required because the workflow transitively depends on the job that can be skipped: `push-neon-image-prod` and `push-compute-image-prod` if: github.ref_name == 'release' && !failure() && !cancelled() runs-on: ubuntu-22.04 @@ -1294,7 +1305,8 @@ jobs: pin-build-tools-image: needs: [ build-build-tools-image, test-images, build-and-test-locally ] - if: github.ref_name == 'main' + # `!failure() && !cancelled()` is required because the job (transitively) depends on jobs that can be skipped + if: github.ref_name == 'main' && !failure() && !cancelled() uses: ./.github/workflows/pin-build-tools-image.yml with: from-tag: ${{ needs.build-build-tools-image.outputs.image-tag }} @@ -1313,6 +1325,7 @@ jobs: # Format `needs` differently to make the list more readable. # Usually we do `needs: [...]` needs: + - meta - build-and-test-locally - check-codestyle-python - check-codestyle-rust @@ -1336,7 +1349,7 @@ jobs: || needs.check-codestyle-python.result == 'skipped' || needs.check-codestyle-rust.result == 'skipped' || needs.files-changed.result == 'skipped' - || needs.push-compute-image-dev.result == 'skipped' - || needs.push-neon-image-dev.result == 'skipped' + || (needs.push-compute-image-dev.result == 'skipped' && contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind)) + || (needs.push-neon-image-dev.result == 'skipped' && contains(fromJSON('["push-main", "pr", "storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind)) || needs.test-images.result == 'skipped' - || needs.trigger-custom-extensions-build-and-wait.result == 'skipped' + || (needs.trigger-custom-extensions-build-and-wait.result == 'skipped' && contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind)) diff --git a/.github/workflows/cloud-regress.yml b/.github/workflows/cloud-regress.yml index 09d6acd325..606e1c0862 100644 --- a/.github/workflows/cloud-regress.yml +++ b/.github/workflows/cloud-regress.yml @@ -38,6 +38,9 @@ jobs: runs-on: us-east-2 container: image: neondatabase/build-tools:pinned-bookworm + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} options: --init steps: diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index f077e04d1c..90318747b3 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -71,7 +71,7 @@ jobs: uses: ./.github/workflows/build-macos.yml with: pg_versions: ${{ needs.files-changed.outputs.postgres_changes }} - rebuild_rust_code: ${{ needs.files-changed.outputs.rebuild_rust_code }} + rebuild_rust_code: ${{ fromJson(needs.files-changed.outputs.rebuild_rust_code) }} rebuild_everything: ${{ fromJson(needs.files-changed.outputs.rebuild_everything) }} gather-rust-build-stats: diff --git a/.github/workflows/pin-build-tools-image.yml b/.github/workflows/pin-build-tools-image.yml index b305b662ee..d2588ba0bf 100644 --- a/.github/workflows/pin-build-tools-image.yml +++ b/.github/workflows/pin-build-tools-image.yml @@ -65,6 +65,7 @@ jobs: permissions: id-token: write # Required for aws/azure login + packages: write # required for pushing to GHCR uses: ./.github/workflows/_push-to-container-registry.yml with: @@ -72,12 +73,15 @@ jobs: { "docker.io/neondatabase/build-tools:${{ inputs.from-tag }}-bullseye": [ "docker.io/neondatabase/build-tools:pinned-bullseye", + "ghcr.io/neondatabase/build-tools:pinned-bullseye", "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/build-tools:pinned-bullseye", "${{ vars.AZURE_DEV_REGISTRY_NAME }}.azurecr.io/neondatabase/build-tools:pinned-bullseye" ], "docker.io/neondatabase/build-tools:${{ inputs.from-tag }}-bookworm": [ "docker.io/neondatabase/build-tools:pinned-bookworm", "docker.io/neondatabase/build-tools:pinned", + "ghcr.io/neondatabase/build-tools:pinned-bookworm", + "ghcr.io/neondatabase/build-tools:pinned", "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/build-tools:pinned-bookworm", "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/build-tools:pinned", "${{ vars.AZURE_DEV_REGISTRY_NAME }}.azurecr.io/neondatabase/build-tools:pinned-bookworm", @@ -85,12 +89,10 @@ jobs: ] } aws-region: ${{ vars.AWS_ECR_REGION }} - aws-account-ids: "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}" + aws-account-id: "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}" + aws-role-to-assume: "gha-oidc-neon-admin" azure-client-id: ${{ vars.AZURE_DEV_CLIENT_ID }} azure-subscription-id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }} azure-tenant-id: ${{ vars.AZURE_TENANT_ID }} acr-registry-name: ${{ vars.AZURE_DEV_REGISTRY_NAME }} - secrets: - aws-role-to-assume: "${{ vars.DEV_AWS_OIDC_ROLE_ARN }}" - docker-hub-username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + secrets: inherit diff --git a/.github/workflows/trigger-e2e-tests.yml b/.github/workflows/trigger-e2e-tests.yml index be6a7a7901..a30da35502 100644 --- a/.github/workflows/trigger-e2e-tests.yml +++ b/.github/workflows/trigger-e2e-tests.yml @@ -5,6 +5,10 @@ on: types: - ready_for_review workflow_call: + inputs: + github-event-name: + type: string + required: true defaults: run: @@ -19,7 +23,7 @@ jobs: if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }} uses: ./.github/workflows/check-permissions.yml with: - github-event-name: ${{ github.event_name }} + github-event-name: ${{ inputs.github-event-name || github.event_name }} cancel-previous-e2e-tests: needs: [ check-permissions ] @@ -35,46 +39,29 @@ jobs: run cancel-previous-in-concurrency-group.yml \ --field concurrency_group="${{ env.E2E_CONCURRENCY_GROUP }}" - tag: - needs: [ check-permissions ] - runs-on: ubuntu-22.04 - outputs: - build-tag: ${{ steps.build-tag.outputs.tag }} - - steps: - # Need `fetch-depth: 0` to count the number of commits in the branch - - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Get build tag - env: - GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} - CURRENT_BRANCH: ${{ github.head_ref || github.ref_name }} - CURRENT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} - run: | - if [[ "$GITHUB_REF_NAME" == "main" ]]; then - echo "tag=$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT - elif [[ "$GITHUB_REF_NAME" == "release" ]]; then - echo "tag=release-$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT - elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then - echo "tag=release-proxy-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT - elif [[ "$GITHUB_REF_NAME" == "release-compute" ]]; then - echo "tag=release-compute-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT - else - echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" - BUILD_AND_TEST_RUN_ID=$(gh run list -b $CURRENT_BRANCH -c $CURRENT_SHA -w 'Build and Test' -L 1 --json databaseId --jq '.[].databaseId') - echo "tag=$BUILD_AND_TEST_RUN_ID" | tee -a $GITHUB_OUTPUT - fi - id: build-tag + meta: + uses: ./.github/workflows/_meta.yml + with: + github-event-name: ${{ inputs.github-event-name || github.event_name }} trigger-e2e-tests: - needs: [ tag ] + needs: [ meta ] runs-on: ubuntu-22.04 env: EVENT_ACTION: ${{ github.event.action }} GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} - TAG: ${{ needs.tag.outputs.build-tag }} + TAG: >- + ${{ + contains(fromJSON('["compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) + && needs.meta.outputs.previous-storage-release + || needs.meta.outputs.build-tag + }} + COMPUTE_TAG: >- + ${{ + contains(fromJSON('["storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) + && needs.meta.outputs.previous-compute-release + || needs.meta.outputs.build-tag + }} steps: - name: Wait for `push-{neon,compute}-image-dev` job to finish # It's important to have a timeout here, the script in the step can run infinitely @@ -157,6 +144,6 @@ jobs: --raw-field "commit_hash=$COMMIT_SHA" \ --raw-field "remote_repo=${GITHUB_REPOSITORY}" \ --raw-field "storage_image_tag=${TAG}" \ - --raw-field "compute_image_tag=${TAG}" \ + --raw-field "compute_image_tag=${COMPUTE_TAG}" \ --raw-field "concurrency_group=${E2E_CONCURRENCY_GROUP}" \ --raw-field "e2e-platforms=${E2E_PLATFORMS}" diff --git a/Cargo.lock b/Cargo.lock index 038727f1a8..293ed465ff 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -984,9 +984,9 @@ dependencies = [ [[package]] name = "bindgen" -version = "0.70.1" +version = "0.71.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f49d8fed880d473ea71efb9bf597651e77201bdd4893efe54c9e5d65ae04ce6f" +checksum = "5f58bf3d7db68cfbac37cfc485a8d711e87e064c3d0fe0435b92f7a407f9d6b3" dependencies = [ "bitflags 2.8.0", "cexpr", @@ -997,7 +997,7 @@ dependencies = [ "proc-macro2", "quote", "regex", - "rustc-hash", + "rustc-hash 2.1.1", "shlex", "syn 2.0.90", ] @@ -1342,7 +1342,9 @@ dependencies = [ "tokio-util", "tower 0.5.2", "tower-http", + "tower-otel", "tracing", + "tracing-opentelemetry", "tracing-subscriber", "tracing-utils", "url", @@ -1546,6 +1548,17 @@ dependencies = [ "itertools 0.10.5", ] +[[package]] +name = "cron" +version = "0.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5877d3fbf742507b66bc2a1945106bd30dd8504019d596901ddd012a4dd01740" +dependencies = [ + "chrono", + "once_cell", + "winnow", +] + [[package]] name = "crossbeam-channel" version = "0.5.8" @@ -3524,7 +3537,7 @@ dependencies = [ "measured-derive", "memchr", "parking_lot 0.12.1", - "rustc-hash", + "rustc-hash 1.1.0", "ryu", ] @@ -4473,18 +4486,18 @@ dependencies = [ [[package]] name = "pin-project" -version = "1.1.0" +version = "1.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c95a7476719eab1e366eaf73d0260af3021184f18177925b07f54b30089ceead" +checksum = "dfe2e71e1471fe07709406bf725f710b02927c9c54b2b5b2ec0e8087d97c327d" dependencies = [ "pin-project-internal", ] [[package]] name = "pin-project-internal" -version = "1.1.0" +version = "1.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "39407670928234ebc5e6e580247dd567ad73a3578460c5990f9503df207e8f07" +checksum = "f6e859e6e5bd50440ab63c47e3ebabc90f26251f7c73c3d3e837b74a1cc3fa67" dependencies = [ "proc-macro2", "quote", @@ -4999,7 +5012,7 @@ dependencies = [ "reqwest-tracing", "rsa", "rstest", - "rustc-hash", + "rustc-hash 1.1.0", "rustls 0.23.18", "rustls-native-certs 0.8.0", "rustls-pemfile 2.1.1", @@ -5617,6 +5630,12 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" +[[package]] +name = "rustc-hash" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" + [[package]] name = "rustc_version" version = "0.4.0" @@ -6446,6 +6465,7 @@ dependencies = [ "chrono", "clap", "control_plane", + "cron", "diesel", "diesel-async", "diesel_migrations", @@ -7282,6 +7302,20 @@ version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e" +[[package]] +name = "tower-otel" +version = "0.2.0" +source = "git+https://github.com/mattiapenati/tower-otel?rev=56a7321053bcb72443888257b622ba0d43a11fcd#56a7321053bcb72443888257b622ba0d43a11fcd" +dependencies = [ + "http 1.1.0", + "opentelemetry", + "pin-project", + "tower-layer", + "tower-service", + "tracing", + "tracing-opentelemetry", +] + [[package]] name = "tower-service" version = "0.3.3" @@ -8138,9 +8172,9 @@ checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" [[package]] name = "winnow" -version = "0.6.13" +version = "0.6.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59b5e5f6c299a3c7890b876a2a587f3115162487e704907d9b6cd29473052ba1" +checksum = "1e90edd2ac1aa278a5c4599b1d89cf03074b610800f866d4026dc199d7929a28" dependencies = [ "memchr", ] diff --git a/Cargo.toml b/Cargo.toml index 21310ce6ec..ff45d46a47 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -43,7 +43,7 @@ members = [ ] [workspace.package] -edition = "2021" +edition = "2024" license = "Apache-2.0" ## All dependency versions, used in the project @@ -70,13 +70,14 @@ aws-types = "1.3" axum = { version = "0.8.1", features = ["ws"] } base64 = "0.13.0" bincode = "1.3" -bindgen = "0.70" +bindgen = "0.71" bit_field = "0.10.2" bstr = "1.0" byteorder = "1.4" bytes = "1.9" camino = "1.1.6" cfg-if = "1.0.0" +cron = "0.15" chrono = { version = "0.4", default-features = false, features = ["clock"] } clap = { version = "4.0", features = ["derive", "env"] } clashmap = { version = "1.0", features = ["raw-api"] } @@ -192,6 +193,10 @@ toml_edit = "0.22" tonic = {version = "0.12.3", default-features = false, features = ["channel", "tls", "tls-roots"]} tower = { version = "0.5.2", default-features = false } tower-http = { version = "0.6.2", features = ["request-id", "trace"] } + +# This revision uses opentelemetry 0.27. There's no tag for it. +tower-otel = { git = "https://github.com/mattiapenati/tower-otel", rev = "56a7321053bcb72443888257b622ba0d43a11fcd" } + tower-service = "0.3.3" tracing = "0.1" tracing-error = "0.2" diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index ef4c22612d..0cdb44853f 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1458,9 +1458,11 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) && \ FROM build-deps AS pg_mooncake-src ARG PG_VERSION WORKDIR /ext-src +COPY compute/patches/duckdb_v113.patch . RUN wget https://github.com/Mooncake-Labs/pg_mooncake/releases/download/v0.1.2/pg_mooncake-0.1.2.tar.gz -O pg_mooncake.tar.gz && \ echo "4550473784fcdd2e1e18062bc01eb9c286abd27cdf5e11a4399be6c0a426ba90 pg_mooncake.tar.gz" | sha256sum --check && \ mkdir pg_mooncake-src && cd pg_mooncake-src && tar xzf ../pg_mooncake.tar.gz --strip-components=1 -C . && \ + cd third_party/duckdb && patch -p1 < /ext-src/duckdb_v113.patch && cd ../.. && \ echo "make -f pg_mooncake-src/Makefile.build installcheck TEST_DIR=./test SQL_DIR=./sql SRC_DIR=./src" > neon-test.sh && \ chmod a+x neon-test.sh @@ -1480,6 +1482,7 @@ RUN make release -j $(getconf _NPROCESSORS_ONLN) && \ FROM build-deps AS pg_duckdb-src WORKDIR /ext-src COPY compute/patches/pg_duckdb_v031.patch . +COPY compute/patches/duckdb_v120.patch . # pg_duckdb build requires source dir to be a git repo to get submodules # allow neon_superuser to execute some functions that in pg_duckdb are available to superuser only: # - extension management function duckdb.install_extension() @@ -1487,7 +1490,9 @@ COPY compute/patches/pg_duckdb_v031.patch . RUN git clone --depth 1 --branch v0.3.1 https://github.com/duckdb/pg_duckdb.git pg_duckdb-src && \ cd pg_duckdb-src && \ git submodule update --init --recursive && \ - patch -p1 < /ext-src/pg_duckdb_v031.patch + patch -p1 < /ext-src/pg_duckdb_v031.patch && \ + cd third_party/duckdb && \ + patch -p1 < /ext-src/duckdb_v120.patch FROM pg-build AS pg_duckdb-build ARG PG_VERSION @@ -1676,11 +1681,7 @@ COPY --from=pg_anon-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_ivm-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_partman-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_mooncake-build /usr/local/pgsql/ /usr/local/pgsql/ - -# Disabled temporarily, because it clashed with pg_mooncake. pg_mooncake -# also depends on libduckdb, but a different version. -#COPY --from=pg_duckdb-build /usr/local/pgsql/ /usr/local/pgsql/ - +COPY --from=pg_duckdb-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_repack-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pgaudit-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pgauditlogtofile-build /usr/local/pgsql/ /usr/local/pgsql/ @@ -1817,7 +1818,7 @@ RUN make PG_VERSION="${PG_VERSION:?}" -C compute FROM pg-build AS extension-tests ARG PG_VERSION -RUN mkdir /ext-src +COPY docker-compose/ext-src/ /ext-src/ COPY --from=pg-build /postgres /postgres #COPY --from=postgis-src /ext-src/ /ext-src/ diff --git a/compute/patches/duckdb_v113.patch b/compute/patches/duckdb_v113.patch new file mode 100644 index 0000000000..b7b43b88bf --- /dev/null +++ b/compute/patches/duckdb_v113.patch @@ -0,0 +1,25 @@ +diff --git a/libduckdb.map b/libduckdb.map +new file mode 100644 +index 0000000000..3b56f00cd7 +--- /dev/null ++++ b/libduckdb.map +@@ -0,0 +1,6 @@ ++DUCKDB_1.1.3 { ++ global: ++ *duckdb*; ++ local: ++ *; ++}; +diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt +index 3e757a4bcc..88ab4005b9 100644 +--- a/src/CMakeLists.txt ++++ b/src/CMakeLists.txt +@@ -135,6 +135,8 @@ else() + target_link_libraries(duckdb ${DUCKDB_LINK_LIBS}) + link_threads(duckdb) + link_extension_libraries(duckdb) ++ target_link_options(duckdb PRIVATE ++ -Wl,--version-script=${CMAKE_SOURCE_DIR}/libduckdb.map) + + add_library(duckdb_static STATIC ${ALL_OBJECT_FILES}) + target_link_libraries(duckdb_static ${DUCKDB_LINK_LIBS}) diff --git a/compute/patches/duckdb_v120.patch b/compute/patches/duckdb_v120.patch new file mode 100644 index 0000000000..cf317736a5 --- /dev/null +++ b/compute/patches/duckdb_v120.patch @@ -0,0 +1,67 @@ +diff --git a/libduckdb_pg_duckdb.map b/libduckdb_pg_duckdb.map +new file mode 100644 +index 0000000000..0872978b48 +--- /dev/null ++++ b/libduckdb_pg_duckdb.map +@@ -0,0 +1,6 @@ ++DUCKDB_1.2.0 { ++ global: ++ *duckdb*; ++ local: ++ *; ++}; +diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt +index 58adef3fc0..2c522f91be 100644 +--- a/src/CMakeLists.txt ++++ b/src/CMakeLists.txt +@@ -59,7 +59,7 @@ endfunction() + + if(AMALGAMATION_BUILD) + +- add_library(duckdb SHARED "${PROJECT_SOURCE_DIR}/src/amalgamation/duckdb.cpp") ++ add_library(duckdb_pg_duckdb SHARED "${PROJECT_SOURCE_DIR}/src/amalgamation/duckdb.cpp") + target_link_libraries(duckdb ${DUCKDB_SYSTEM_LIBS}) + link_threads(duckdb) + link_extension_libraries(duckdb) +@@ -109,7 +109,7 @@ else() + duckdb_yyjson + duckdb_zstd) + +- add_library(duckdb SHARED ${ALL_OBJECT_FILES}) ++ add_library(duckdb_pg_duckdb SHARED ${ALL_OBJECT_FILES}) + + if(WIN32 AND NOT MINGW) + ensure_variable_is_number(DUCKDB_MAJOR_VERSION RC_MAJOR_VERSION) +@@ -131,9 +131,11 @@ else() + target_sources(duckdb PRIVATE version.rc) + endif() + +- target_link_libraries(duckdb ${DUCKDB_LINK_LIBS}) +- link_threads(duckdb) +- link_extension_libraries(duckdb) ++ target_link_libraries(duckdb_pg_duckdb ${DUCKDB_LINK_LIBS}) ++ link_threads(duckdb_pg_duckdb) ++ link_extension_libraries(duckdb_pg_duckdb) ++ target_link_options(duckdb_pg_duckdb PRIVATE ++ -Wl,--version-script=${CMAKE_SOURCE_DIR}/libduckdb_pg_duckdb.map) + + add_library(duckdb_static STATIC ${ALL_OBJECT_FILES}) + target_link_libraries(duckdb_static ${DUCKDB_LINK_LIBS}) +@@ -141,7 +143,7 @@ else() + link_extension_libraries(duckdb_static) + + target_include_directories( +- duckdb PUBLIC $ ++ duckdb_pg_duckdb PUBLIC $ + $) + + target_include_directories( +@@ -161,7 +163,7 @@ else() + endif() + + install( +- TARGETS duckdb duckdb_static ++ TARGETS duckdb_pg_duckdb duckdb_static + EXPORT "${DUCKDB_EXPORT_SET}" + LIBRARY DESTINATION "${INSTALL_LIB_DIR}" + ARCHIVE DESTINATION "${INSTALL_LIB_DIR}" diff --git a/compute/patches/pg_duckdb_v031.patch b/compute/patches/pg_duckdb_v031.patch index a7e188d69e..edc7fbf69d 100644 --- a/compute/patches/pg_duckdb_v031.patch +++ b/compute/patches/pg_duckdb_v031.patch @@ -1,3 +1,25 @@ +diff --git a/Makefile b/Makefile +index 3235cc8..6b892bc 100644 +--- a/Makefile ++++ b/Makefile +@@ -32,7 +32,7 @@ else + DUCKDB_BUILD_TYPE = release + endif + +-DUCKDB_LIB = libduckdb$(DLSUFFIX) ++DUCKDB_LIB = libduckdb_pg_duckdb$(DLSUFFIX) + FULL_DUCKDB_LIB = third_party/duckdb/build/$(DUCKDB_BUILD_TYPE)/src/$(DUCKDB_LIB) + + ERROR_ON_WARNING ?= +@@ -54,7 +54,7 @@ override PG_CXXFLAGS += -std=c++17 ${DUCKDB_BUILD_CXX_FLAGS} ${COMPILER_FLAGS} - + # changes to the vendored code in one place. + override PG_CFLAGS += -Wno-declaration-after-statement + +-SHLIB_LINK += -Wl,-rpath,$(PG_LIB)/ -lpq -Lthird_party/duckdb/build/$(DUCKDB_BUILD_TYPE)/src -L$(PG_LIB) -lduckdb -lstdc++ -llz4 ++SHLIB_LINK += -Wl,-rpath,$(PG_LIB)/ -lpq -Lthird_party/duckdb/build/$(DUCKDB_BUILD_TYPE)/src -L$(PG_LIB) -lduckdb_pg_duckdb -lstdc++ -llz4 + + include Makefile.global + diff --git a/sql/pg_duckdb--0.2.0--0.3.0.sql b/sql/pg_duckdb--0.2.0--0.3.0.sql index d777d76..af60106 100644 --- a/sql/pg_duckdb--0.2.0--0.3.0.sql diff --git a/compute/vm-image-spec-bookworm.yaml b/compute/vm-image-spec-bookworm.yaml index 568f0b0444..ff4c3387d9 100644 --- a/compute/vm-image-spec-bookworm.yaml +++ b/compute/vm-image-spec-bookworm.yaml @@ -44,6 +44,11 @@ shutdownHook: | files: - filename: compute_ctl-sudoers content: | + # Reverse hostname lookup doesn't currently work, and isn't needed anyway when all + # the rules use ALL as the hostname. Avoid the pointless lookups and the "unable to + # resolve host" log messages that they generate. + Defaults !fqdn + # Allow postgres user (which is what compute_ctl runs as) to run /neonvm/bin/resize-swap # and /neonvm/bin/set-disk-quota as root without requiring entering a password (NOPASSWD), # regardless of hostname (ALL) diff --git a/compute/vm-image-spec-bullseye.yaml b/compute/vm-image-spec-bullseye.yaml index 6617c98599..c001040bc9 100644 --- a/compute/vm-image-spec-bullseye.yaml +++ b/compute/vm-image-spec-bullseye.yaml @@ -44,6 +44,11 @@ shutdownHook: | files: - filename: compute_ctl-sudoers content: | + # Reverse hostname lookup doesn't currently work, and isn't needed anyway when all + # the rules use ALL as the hostname. Avoid the pointless lookups and the "unable to + # resolve host" log messages that they generate. + Defaults !fqdn + # Allow postgres user (which is what compute_ctl runs as) to run /neonvm/bin/resize-swap # and /neonvm/bin/set-disk-quota as root without requiring entering a password (NOPASSWD), # regardless of hostname (ALL) diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index c276996df5..8f3bcbeef8 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "compute_tools" version = "0.1.0" -edition.workspace = true +edition = "2024" license.workspace = true [features] @@ -46,7 +46,9 @@ tokio = { workspace = true, features = ["rt", "rt-multi-thread"] } tokio-postgres.workspace = true tokio-util.workspace = true tokio-stream.workspace = true +tower-otel.workspace = true tracing.workspace = true +tracing-opentelemetry.workspace = true tracing-subscriber.workspace = true tracing-utils.workspace = true thiserror.workspace = true diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 1cdae718fe..6dae1a2753 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -40,34 +40,33 @@ use std::path::Path; use std::process::exit; use std::str::FromStr; use std::sync::atomic::Ordering; -use std::sync::{mpsc, Arc, Condvar, Mutex, RwLock}; -use std::{thread, time::Duration}; +use std::sync::{Arc, Condvar, Mutex, RwLock, mpsc}; +use std::thread; +use std::time::Duration; use anyhow::{Context, Result}; use chrono::Utc; use clap::Parser; -use compute_tools::disk_quota::set_disk_quota; -use compute_tools::http::server::Server; -use compute_tools::lsn_lease::launch_lsn_lease_bg_task_for_static; -use signal_hook::consts::{SIGQUIT, SIGTERM}; -use signal_hook::{consts::SIGINT, iterator::Signals}; -use tracing::{error, info, warn}; -use url::Url; - use compute_api::responses::{ComputeCtlConfig, ComputeStatus}; use compute_api::spec::ComputeSpec; - use compute_tools::compute::{ - forward_termination_signal, ComputeNode, ComputeState, ParsedSpec, PG_PID, + ComputeNode, ComputeState, PG_PID, ParsedSpec, forward_termination_signal, }; use compute_tools::configurator::launch_configurator; +use compute_tools::disk_quota::set_disk_quota; use compute_tools::extension_server::get_pg_version_string; +use compute_tools::http::server::Server; use compute_tools::logger::*; +use compute_tools::lsn_lease::launch_lsn_lease_bg_task_for_static; use compute_tools::monitor::launch_monitor; use compute_tools::params::*; use compute_tools::spec::*; use compute_tools::swap::resize_swap; -use rlimit::{setrlimit, Resource}; +use rlimit::{Resource, setrlimit}; +use signal_hook::consts::{SIGINT, SIGQUIT, SIGTERM}; +use signal_hook::iterator::Signals; +use tracing::{error, info, warn}; +use url::Url; use utils::failpoint_support; // this is an arbitrary build tag. Fine as a default / for testing purposes @@ -149,6 +148,8 @@ struct Cli { fn main() -> Result<()> { let cli = Cli::parse(); + let scenario = failpoint_support::init(); + // For historical reasons, the main thread that processes the spec and launches postgres // is synchronous, but we always have this tokio runtime available and we "enter" it so // that you can use tokio::spawn() and tokio::runtime::Handle::current().block_on(...) @@ -160,8 +161,6 @@ fn main() -> Result<()> { let build_tag = runtime.block_on(init())?; - let scenario = failpoint_support::init(); - // enable core dumping for all child processes setrlimit(Resource::CORE, rlimit::INFINITY, rlimit::INFINITY)?; @@ -407,6 +406,21 @@ fn start_postgres( ) -> Result<(Option, StartPostgresResult)> { // We got all we need, update the state. let mut state = compute.state.lock().unwrap(); + + // Create a tracing span for the startup operation. + // + // We could otherwise just annotate the function with #[instrument], but if + // we're being configured from a /configure HTTP request, we want the + // startup to be considered part of the /configure request. + let _this_entered = { + // Temporarily enter the /configure request's span, so that the new span + // becomes its child. + let _parent_entered = state.startup_span.take().map(|p| p.entered()); + + tracing::info_span!("start_postgres") + } + .entered(); + state.set_status(ComputeStatus::Init, &compute.state_changed); info!( diff --git a/compute_tools/src/bin/fast_import.rs b/compute_tools/src/bin/fast_import.rs index 585f3e4e1d..47558be7a0 100644 --- a/compute_tools/src/bin/fast_import.rs +++ b/compute_tools/src/bin/fast_import.rs @@ -25,13 +25,13 @@ //! docker push localhost:3030/localregistry/compute-node-v14:latest //! ``` -use anyhow::{bail, Context}; +use anyhow::{Context, bail}; use aws_config::BehaviorVersion; use camino::{Utf8Path, Utf8PathBuf}; use clap::{Parser, Subcommand}; -use compute_tools::extension_server::{get_pg_version, PostgresMajorVersion}; +use compute_tools::extension_server::{PostgresMajorVersion, get_pg_version}; use nix::unistd::Pid; -use tracing::{error, info, info_span, warn, Instrument}; +use tracing::{Instrument, error, info, info_span, warn}; use utils::fs_ext::is_directory_empty; #[path = "fast_import/aws_s3_sync.rs"] @@ -558,7 +558,9 @@ async fn cmd_dumprestore( decode_connstring(kms_client.as_ref().unwrap(), &key_id, dest_ciphertext) .await? } else { - bail!("destination connection string must be provided in spec for dump_restore command"); + bail!( + "destination connection string must be provided in spec for dump_restore command" + ); }; (source, dest) diff --git a/compute_tools/src/bin/fast_import/aws_s3_sync.rs b/compute_tools/src/bin/fast_import/aws_s3_sync.rs index 1be10b36d6..d8d007da71 100644 --- a/compute_tools/src/bin/fast_import/aws_s3_sync.rs +++ b/compute_tools/src/bin/fast_import/aws_s3_sync.rs @@ -1,11 +1,10 @@ use camino::{Utf8Path, Utf8PathBuf}; use tokio::task::JoinSet; +use tracing::{info, warn}; use walkdir::WalkDir; use super::s3_uri::S3Uri; -use tracing::{info, warn}; - const MAX_PARALLEL_UPLOADS: usize = 10; /// Upload all files from 'local' to 'remote' diff --git a/compute_tools/src/bin/fast_import/s3_uri.rs b/compute_tools/src/bin/fast_import/s3_uri.rs index 52bbef420f..cf4dab7c02 100644 --- a/compute_tools/src/bin/fast_import/s3_uri.rs +++ b/compute_tools/src/bin/fast_import/s3_uri.rs @@ -1,6 +1,7 @@ -use anyhow::Result; use std::str::FromStr; +use anyhow::Result; + /// Struct to hold parsed S3 components #[derive(Debug, Clone, PartialEq, Eq)] pub struct S3Uri { diff --git a/compute_tools/src/catalog.rs b/compute_tools/src/catalog.rs index 28b10ce21c..2a7f56e6fc 100644 --- a/compute_tools/src/catalog.rs +++ b/compute_tools/src/catalog.rs @@ -1,18 +1,20 @@ +use std::path::Path; +use std::process::Stdio; +use std::result::Result; +use std::sync::Arc; + +use compute_api::responses::CatalogObjects; use futures::Stream; use postgres::NoTls; -use std::{path::Path, process::Stdio, result::Result, sync::Arc}; -use tokio::{ - io::{AsyncBufReadExt, BufReader}, - process::Command, - spawn, -}; +use tokio::io::{AsyncBufReadExt, BufReader}; +use tokio::process::Command; +use tokio::spawn; use tokio_stream::{self as stream, StreamExt}; use tokio_util::codec::{BytesCodec, FramedRead}; use tracing::warn; use crate::compute::ComputeNode; use crate::pg_helpers::{get_existing_dbs_async, get_existing_roles_async, postgres_conf_for_db}; -use compute_api::responses::CatalogObjects; pub async fn get_dbs_and_roles(compute: &Arc) -> anyhow::Result { let conf = compute.get_tokio_conn_conf(Some("compute_ctl:get_dbs_and_roles")); @@ -55,7 +57,7 @@ pub enum SchemaDumpError { pub async fn get_database_schema( compute: &Arc, dbname: &str, -) -> Result>, SchemaDumpError> { +) -> Result> + use<>, SchemaDumpError> { let pgbin = &compute.pgbin; let basepath = Path::new(pgbin).parent().unwrap(); let pgdump = basepath.join("pg_dump"); diff --git a/compute_tools/src/checker.rs b/compute_tools/src/checker.rs index 62d61a8bc9..e4207876ac 100644 --- a/compute_tools/src/checker.rs +++ b/compute_tools/src/checker.rs @@ -1,4 +1,4 @@ -use anyhow::{anyhow, Ok, Result}; +use anyhow::{Ok, Result, anyhow}; use tokio_postgres::NoTls; use tracing::{error, instrument, warn}; diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index d323ea3dcd..c0e28790d6 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -1,57 +1,38 @@ -use std::collections::{HashMap, HashSet}; -use std::env; -use std::fs; -use std::iter::once; -use std::os::unix::fs::{symlink, PermissionsExt}; +use std::collections::HashMap; +use std::os::unix::fs::{PermissionsExt, symlink}; use std::path::Path; use std::process::{Command, Stdio}; use std::str::FromStr; -use std::sync::atomic::AtomicU32; -use std::sync::atomic::Ordering; +use std::sync::atomic::{AtomicU32, Ordering}; use std::sync::{Arc, Condvar, Mutex, RwLock}; -use std::time::Duration; -use std::time::Instant; +use std::time::{Duration, Instant}; +use std::{env, fs}; use anyhow::{Context, Result}; use chrono::{DateTime, Utc}; -use compute_api::spec::{Database, PgIdent, Role}; +use compute_api::privilege::Privilege; +use compute_api::responses::{ComputeMetrics, ComputeStatus}; +use compute_api::spec::{ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PgIdent}; +use futures::StreamExt; use futures::future::join_all; use futures::stream::FuturesUnordered; -use futures::StreamExt; +use nix::sys::signal::{Signal, kill}; use nix::unistd::Pid; use postgres; -use postgres::error::SqlState; use postgres::NoTls; +use postgres::error::SqlState; +use remote_storage::{DownloadError, RemotePath}; +use tokio::spawn; use tracing::{debug, error, info, instrument, warn}; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; - -use compute_api::privilege::Privilege; -use compute_api::responses::{ComputeMetrics, ComputeStatus}; -use compute_api::spec::{ComputeFeature, ComputeMode, ComputeSpec, ExtVersion}; use utils::measured_stream::MeasuredReader; -use nix::sys::signal::{kill, Signal}; -use remote_storage::{DownloadError, RemotePath}; -use tokio::spawn; - use crate::installed_extensions::get_installed_extensions; -use crate::local_proxy; use crate::pg_helpers::*; use crate::spec::*; -use crate::spec_apply::ApplySpecPhase::{ - CreateAndAlterDatabases, CreateAndAlterRoles, CreateAvailabilityCheck, CreateSchemaNeon, - CreateSuperUser, DropInvalidDatabases, DropRoles, FinalizeDropLogicalSubscriptions, - HandleNeonExtension, HandleOtherExtensions, RenameAndDeleteDatabases, RenameRoles, - RunInEachDatabase, -}; -use crate::spec_apply::PerDatabasePhase; -use crate::spec_apply::PerDatabasePhase::{ - ChangeSchemaPerms, DeleteDBRoleReferences, DropLogicalSubscriptions, HandleAnonExtension, -}; -use crate::spec_apply::{apply_operations, MutableApplyContext, DB}; use crate::sync_sk::{check_if_synced, ping_safekeeper}; -use crate::{config, extension_server}; +use crate::{config, extension_server, local_proxy}; pub static SYNC_SAFEKEEPERS_PID: AtomicU32 = AtomicU32::new(0); pub static PG_PID: AtomicU32 = AtomicU32::new(0); @@ -116,7 +97,23 @@ pub struct ComputeState { /// compute wasn't used since start. pub last_active: Option>, pub error: Option, + + /// Compute spec. This can be received from the CLI or - more likely - + /// passed by the control plane with a /configure HTTP request. pub pspec: Option, + + /// If the spec is passed by a /configure request, 'startup_span' is the + /// /configure request's tracing span. The main thread enters it when it + /// processes the compute startup, so that the compute startup is considered + /// to be part of the /configure request for tracing purposes. + /// + /// If the request handling thread/task called startup_compute() directly, + /// it would automatically be a child of the request handling span, and we + /// wouldn't need this. But because we use the main thread to perform the + /// startup, and the /configure task just waits for it to finish, we need to + /// set up the span relationship ourselves. + pub startup_span: Option, + pub metrics: ComputeMetrics, } @@ -128,6 +125,7 @@ impl ComputeState { last_active: None, error: None, pspec: None, + startup_span: None, metrics: ComputeMetrics::default(), } } @@ -546,6 +544,7 @@ impl ComputeNode { // Fast path for sync_safekeepers. If they're already synced we get the lsn // in one roundtrip. If not, we should do a full sync_safekeepers. + #[instrument(skip_all)] pub fn check_safekeepers_synced(&self, compute_state: &ComputeState) -> Result> { let start_time = Utc::now(); @@ -776,8 +775,9 @@ impl ComputeNode { Ok(()) } - /// Start Postgres as a child process and manage DBs/roles. - /// After that this will hang waiting on the postmaster process to exit. + /// Start Postgres as a child process and wait for it to start accepting + /// connections. + /// /// Returns a handle to the child process and a handle to the logs thread. #[instrument(skip_all)] pub fn start_postgres( @@ -915,388 +915,6 @@ impl ComputeNode { Ok(client) } - /// Apply the spec to the running PostgreSQL instance. - /// The caller can decide to run with multiple clients in parallel, or - /// single mode. Either way, the commands executed will be the same, and - /// only commands run in different databases are parallelized. - #[instrument(skip_all)] - pub fn apply_spec_sql( - &self, - spec: Arc, - conf: Arc, - concurrency: usize, - ) -> Result<()> { - info!("Applying config with max {} concurrency", concurrency); - debug!("Config: {:?}", spec); - - let rt = tokio::runtime::Handle::current(); - rt.block_on(async { - // Proceed with post-startup configuration. Note, that order of operations is important. - let client = Self::get_maintenance_client(&conf).await?; - let spec = spec.clone(); - - let databases = get_existing_dbs_async(&client).await?; - let roles = get_existing_roles_async(&client) - .await? - .into_iter() - .map(|role| (role.name.clone(), role)) - .collect::>(); - - // Check if we need to drop subscriptions before starting the endpoint. - // - // It is important to do this operation exactly once when endpoint starts on a new branch. - // Otherwise, we may drop not inherited, but newly created subscriptions. - // - // We cannot rely only on spec.drop_subscriptions_before_start flag, - // because if for some reason compute restarts inside VM, - // it will start again with the same spec and flag value. - // - // To handle this, we save the fact of the operation in the database - // in the neon.drop_subscriptions_done table. - // If the table does not exist, we assume that the operation was never performed, so we must do it. - // If table exists, we check if the operation was performed on the current timelilne. - // - let mut drop_subscriptions_done = false; - - if spec.drop_subscriptions_before_start { - let timeline_id = self.get_timeline_id().context("timeline_id must be set")?; - let query = format!("select 1 from neon.drop_subscriptions_done where timeline_id = '{}'", timeline_id); - - info!("Checking if drop subscription operation was already performed for timeline_id: {}", timeline_id); - - drop_subscriptions_done = match - client.simple_query(&query).await { - Ok(result) => { - matches!(&result[0], postgres::SimpleQueryMessage::Row(_)) - }, - Err(e) => - { - match e.code() { - Some(&SqlState::UNDEFINED_TABLE) => false, - _ => { - // We don't expect any other error here, except for the schema/table not existing - error!("Error checking if drop subscription operation was already performed: {}", e); - return Err(e.into()); - } - } - } - } - }; - - - let jwks_roles = Arc::new( - spec.as_ref() - .local_proxy_config - .iter() - .flat_map(|it| &it.jwks) - .flatten() - .flat_map(|setting| &setting.role_names) - .cloned() - .collect::>(), - ); - - let ctx = Arc::new(tokio::sync::RwLock::new(MutableApplyContext { - roles, - dbs: databases, - })); - - // Apply special pre drop database phase. - // NOTE: we use the code of RunInEachDatabase phase for parallelism - // and connection management, but we don't really run it in *each* database, - // only in databases, we're about to drop. - info!("Applying PerDatabase (pre-dropdb) phase"); - let concurrency_token = Arc::new(tokio::sync::Semaphore::new(concurrency)); - - // Run the phase for each database that we're about to drop. - let db_processes = spec - .delta_operations - .iter() - .flatten() - .filter_map(move |op| { - if op.action.as_str() == "delete_db" { - Some(op.name.clone()) - } else { - None - } - }) - .map(|dbname| { - let spec = spec.clone(); - let ctx = ctx.clone(); - let jwks_roles = jwks_roles.clone(); - let mut conf = conf.as_ref().clone(); - let concurrency_token = concurrency_token.clone(); - // We only need dbname field for this phase, so set other fields to dummy values - let db = DB::UserDB(Database { - name: dbname.clone(), - owner: "cloud_admin".to_string(), - options: None, - restrict_conn: false, - invalid: false, - }); - - debug!("Applying per-database phases for Database {:?}", &db); - - match &db { - DB::SystemDB => {} - DB::UserDB(db) => { - conf.dbname(db.name.as_str()); - } - } - - let conf = Arc::new(conf); - let fut = Self::apply_spec_sql_db( - spec.clone(), - conf, - ctx.clone(), - jwks_roles.clone(), - concurrency_token.clone(), - db, - [DropLogicalSubscriptions].to_vec(), - ); - - Ok(spawn(fut)) - }) - .collect::>>(); - - for process in db_processes.into_iter() { - let handle = process?; - if let Err(e) = handle.await? { - // Handle the error case where the database does not exist - // We do not check whether the DB exists or not in the deletion phase, - // so we shouldn't be strict about it in pre-deletion cleanup as well. - if e.to_string().contains("does not exist") { - warn!("Error dropping subscription: {}", e); - } else { - return Err(e); - } - }; - } - - for phase in [ - CreateSuperUser, - DropInvalidDatabases, - RenameRoles, - CreateAndAlterRoles, - RenameAndDeleteDatabases, - CreateAndAlterDatabases, - CreateSchemaNeon, - ] { - info!("Applying phase {:?}", &phase); - apply_operations( - spec.clone(), - ctx.clone(), - jwks_roles.clone(), - phase, - || async { Ok(&client) }, - ) - .await?; - } - - info!("Applying RunInEachDatabase2 phase"); - let concurrency_token = Arc::new(tokio::sync::Semaphore::new(concurrency)); - - let db_processes = spec - .cluster - .databases - .iter() - .map(|db| DB::new(db.clone())) - // include - .chain(once(DB::SystemDB)) - .map(|db| { - let spec = spec.clone(); - let ctx = ctx.clone(); - let jwks_roles = jwks_roles.clone(); - let mut conf = conf.as_ref().clone(); - let concurrency_token = concurrency_token.clone(); - let db = db.clone(); - - debug!("Applying per-database phases for Database {:?}", &db); - - match &db { - DB::SystemDB => {} - DB::UserDB(db) => { - conf.dbname(db.name.as_str()); - } - } - - let conf = Arc::new(conf); - let mut phases = vec![ - DeleteDBRoleReferences, - ChangeSchemaPerms, - HandleAnonExtension, - ]; - - if spec.drop_subscriptions_before_start && !drop_subscriptions_done { - info!("Adding DropLogicalSubscriptions phase because drop_subscriptions_before_start is set"); - phases.push(DropLogicalSubscriptions); - } - - let fut = Self::apply_spec_sql_db( - spec.clone(), - conf, - ctx.clone(), - jwks_roles.clone(), - concurrency_token.clone(), - db, - phases, - ); - - Ok(spawn(fut)) - }) - .collect::>>(); - - for process in db_processes.into_iter() { - let handle = process?; - handle.await??; - } - - let mut phases = vec![ - HandleOtherExtensions, - HandleNeonExtension, // This step depends on CreateSchemaNeon - CreateAvailabilityCheck, - DropRoles, - ]; - - // This step depends on CreateSchemaNeon - if spec.drop_subscriptions_before_start && !drop_subscriptions_done { - info!("Adding FinalizeDropLogicalSubscriptions phase because drop_subscriptions_before_start is set"); - phases.push(FinalizeDropLogicalSubscriptions); - } - - for phase in phases { - debug!("Applying phase {:?}", &phase); - apply_operations( - spec.clone(), - ctx.clone(), - jwks_roles.clone(), - phase, - || async { Ok(&client) }, - ) - .await?; - } - - Ok::<(), anyhow::Error>(()) - })?; - - Ok(()) - } - - /// Apply SQL migrations of the RunInEachDatabase phase. - /// - /// May opt to not connect to databases that don't have any scheduled - /// operations. The function is concurrency-controlled with the provided - /// semaphore. The caller has to make sure the semaphore isn't exhausted. - async fn apply_spec_sql_db( - spec: Arc, - conf: Arc, - ctx: Arc>, - jwks_roles: Arc>, - concurrency_token: Arc, - db: DB, - subphases: Vec, - ) -> Result<()> { - let _permit = concurrency_token.acquire().await?; - - let mut client_conn = None; - - for subphase in subphases { - apply_operations( - spec.clone(), - ctx.clone(), - jwks_roles.clone(), - RunInEachDatabase { - db: db.clone(), - subphase, - }, - // Only connect if apply_operation actually wants a connection. - // It's quite possible this database doesn't need any queries, - // so by not connecting we save time and effort connecting to - // that database. - || async { - if client_conn.is_none() { - let db_client = Self::get_maintenance_client(&conf).await?; - client_conn.replace(db_client); - } - let client = client_conn.as_ref().unwrap(); - Ok(client) - }, - ) - .await?; - } - - drop(client_conn); - - Ok::<(), anyhow::Error>(()) - } - - /// Choose how many concurrent connections to use for applying the spec changes. - pub fn max_service_connections( - &self, - compute_state: &ComputeState, - spec: &ComputeSpec, - ) -> usize { - // If the cluster is in Init state we don't have to deal with user connections, - // and can thus use all `max_connections` connection slots. However, that's generally not - // very efficient, so we generally still limit it to a smaller number. - if compute_state.status == ComputeStatus::Init { - // If the settings contain 'max_connections', use that as template - if let Some(config) = spec.cluster.settings.find("max_connections") { - config.parse::().ok() - } else { - // Otherwise, try to find the setting in the postgresql_conf string - spec.cluster - .postgresql_conf - .iter() - .flat_map(|conf| conf.split("\n")) - .filter_map(|line| { - if !line.contains("max_connections") { - return None; - } - - let (key, value) = line.split_once("=")?; - let key = key - .trim_start_matches(char::is_whitespace) - .trim_end_matches(char::is_whitespace); - - let value = value - .trim_start_matches(char::is_whitespace) - .trim_end_matches(char::is_whitespace); - - if key != "max_connections" { - return None; - } - - value.parse::().ok() - }) - .next() - } - // If max_connections is present, use at most 1/3rd of that. - // When max_connections is lower than 30, try to use at least 10 connections, but - // never more than max_connections. - .map(|limit| match limit { - 0..10 => limit, - 10..30 => 10, - 30.. => limit / 3, - }) - // If we didn't find max_connections, default to 10 concurrent connections. - .unwrap_or(10) - } else { - // state == Running - // Because the cluster is already in the Running state, we should assume users are - // already connected to the cluster, and high concurrency could negatively - // impact user connectivity. Therefore, we can limit concurrency to the number of - // reserved superuser connections, which users wouldn't be able to use anyway. - spec.cluster - .settings - .find("superuser_reserved_connections") - .iter() - .filter_map(|val| val.parse::().ok()) - .map(|val| if val > 1 { val - 1 } else { 1 }) - .last() - .unwrap_or(3) - } - } - /// Do initial configuration of the already started Postgres. #[instrument(skip_all)] pub fn apply_config(&self, compute_state: &ComputeState) -> Result<()> { @@ -1317,7 +935,7 @@ impl ComputeNode { // Merge-apply spec & changes to PostgreSQL state. self.apply_spec_sql(spec.clone(), conf.clone(), max_concurrent_connections)?; - if let Some(ref local_proxy) = &spec.clone().local_proxy_config { + if let Some(local_proxy) = &spec.clone().local_proxy_config { info!("configuring local_proxy"); local_proxy::configure(local_proxy).context("apply_config local_proxy")?; } @@ -1537,7 +1155,9 @@ impl ComputeNode { &postgresql_conf_path, "neon.disable_logical_replication_subscribers=false", )? { - info!("updated postgresql.conf to set neon.disable_logical_replication_subscribers=false"); + info!( + "updated postgresql.conf to set neon.disable_logical_replication_subscribers=false" + ); } self.pg_reload_conf()?; } @@ -1764,7 +1384,9 @@ LIMIT 100", info!("extension already downloaded, skipping re-download"); return Ok(0); } else if start_time_delta < HANG_TIMEOUT && !first_try { - info!("download {ext_archive_name} already started by another process, hanging untill completion or timeout"); + info!( + "download {ext_archive_name} already started by another process, hanging untill completion or timeout" + ); let mut interval = tokio::time::interval(tokio::time::Duration::from_millis(500)); loop { info!("waiting for download"); diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index e1bdfffa54..e8056ec7eb 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -4,11 +4,10 @@ use std::io::prelude::*; use std::path::Path; use anyhow::Result; - -use crate::pg_helpers::escape_conf_value; -use crate::pg_helpers::{GenericOptionExt, PgOptionsSerialize}; use compute_api::spec::{ComputeMode, ComputeSpec, GenericOption}; +use crate::pg_helpers::{GenericOptionExt, PgOptionsSerialize, escape_conf_value}; + /// Check that `line` is inside a text file and put it there if it is not. /// Create file if it doesn't exist. pub fn line_in_file(path: &Path, line: &str) -> Result { diff --git a/compute_tools/src/configurator.rs b/compute_tools/src/configurator.rs index d88f26ca20..d97bd37285 100644 --- a/compute_tools/src/configurator.rs +++ b/compute_tools/src/configurator.rs @@ -1,9 +1,8 @@ use std::sync::Arc; use std::thread; -use tracing::{error, info, instrument}; - use compute_api::responses::ComputeStatus; +use tracing::{error, info, instrument}; use crate::compute::ComputeNode; diff --git a/compute_tools/src/disk_quota.rs b/compute_tools/src/disk_quota.rs index e838c5b9fd..1353ab938d 100644 --- a/compute_tools/src/disk_quota.rs +++ b/compute_tools/src/disk_quota.rs @@ -1,9 +1,11 @@ use anyhow::Context; +use tracing::instrument; pub const DISK_QUOTA_BIN: &str = "/neonvm/bin/set-disk-quota"; /// If size_bytes is 0, it disables the quota. Otherwise, it sets filesystem quota to size_bytes. /// `fs_mountpoint` should point to the mountpoint of the filesystem where the quota should be set. +#[instrument] pub fn set_disk_quota(size_bytes: u64, fs_mountpoint: &str) -> anyhow::Result<()> { let size_kb = size_bytes / 1024; // run `/neonvm/bin/set-disk-quota {size_kb} {mountpoint}` diff --git a/compute_tools/src/extension_server.rs b/compute_tools/src/extension_server.rs index 00f46386e7..77e98359ab 100644 --- a/compute_tools/src/extension_server.rs +++ b/compute_tools/src/extension_server.rs @@ -71,15 +71,15 @@ More specifically, here is an example ext_index.json } } */ -use anyhow::Result; -use anyhow::{bail, Context}; +use std::path::Path; +use std::str; + +use anyhow::{Context, Result, bail}; use bytes::Bytes; use compute_api::spec::RemoteExtSpec; use regex::Regex; use remote_storage::*; use reqwest::StatusCode; -use std::path::Path; -use std::str; use tar::Archive; use tracing::info; use tracing::log::warn; @@ -244,7 +244,10 @@ pub fn create_control_files(remote_extensions: &RemoteExtSpec, pgbin: &str) { info!("writing file {:?}{:?}", control_path, control_content); std::fs::write(control_path, control_content).unwrap(); } else { - warn!("control file {:?} exists both locally and remotely. ignoring the remote version.", control_path); + warn!( + "control file {:?} exists both locally and remotely. ignoring the remote version.", + control_path + ); } } } diff --git a/compute_tools/src/http/extract/json.rs b/compute_tools/src/http/extract/json.rs index 104cc25d5f..1d32e4ff37 100644 --- a/compute_tools/src/http/extract/json.rs +++ b/compute_tools/src/http/extract/json.rs @@ -1,6 +1,7 @@ use std::ops::{Deref, DerefMut}; -use axum::extract::{rejection::JsonRejection, FromRequest, Request}; +use axum::extract::rejection::JsonRejection; +use axum::extract::{FromRequest, Request}; use compute_api::responses::GenericAPIError; use http::StatusCode; diff --git a/compute_tools/src/http/extract/path.rs b/compute_tools/src/http/extract/path.rs index 09637a96a4..45970cff3d 100644 --- a/compute_tools/src/http/extract/path.rs +++ b/compute_tools/src/http/extract/path.rs @@ -1,8 +1,10 @@ use std::ops::{Deref, DerefMut}; -use axum::extract::{rejection::PathRejection, FromRequestParts}; +use axum::extract::FromRequestParts; +use axum::extract::rejection::PathRejection; use compute_api::responses::GenericAPIError; -use http::{request::Parts, StatusCode}; +use http::StatusCode; +use http::request::Parts; /// Custom `Path` extractor, so that we can format errors into /// `JsonResponse`. diff --git a/compute_tools/src/http/extract/query.rs b/compute_tools/src/http/extract/query.rs index 9dec3642cf..b8079ea770 100644 --- a/compute_tools/src/http/extract/query.rs +++ b/compute_tools/src/http/extract/query.rs @@ -1,8 +1,10 @@ use std::ops::{Deref, DerefMut}; -use axum::extract::{rejection::QueryRejection, FromRequestParts}; +use axum::extract::FromRequestParts; +use axum::extract::rejection::QueryRejection; use compute_api::responses::GenericAPIError; -use http::{request::Parts, StatusCode}; +use http::StatusCode; +use http::request::Parts; /// Custom `Query` extractor, so that we can format errors into /// `JsonResponse`. diff --git a/compute_tools/src/http/mod.rs b/compute_tools/src/http/mod.rs index 93eb6ef5b7..d182278174 100644 --- a/compute_tools/src/http/mod.rs +++ b/compute_tools/src/http/mod.rs @@ -1,6 +1,8 @@ -use axum::{body::Body, response::Response}; +use axum::body::Body; +use axum::response::Response; use compute_api::responses::{ComputeStatus, GenericAPIError}; -use http::{header::CONTENT_TYPE, StatusCode}; +use http::StatusCode; +use http::header::CONTENT_TYPE; use serde::Serialize; use tracing::error; diff --git a/compute_tools/src/http/routes/check_writability.rs b/compute_tools/src/http/routes/check_writability.rs index d7feb055e9..5a12686fa8 100644 --- a/compute_tools/src/http/routes/check_writability.rs +++ b/compute_tools/src/http/routes/check_writability.rs @@ -1,10 +1,13 @@ use std::sync::Arc; -use axum::{extract::State, response::Response}; +use axum::extract::State; +use axum::response::Response; use compute_api::responses::ComputeStatus; use http::StatusCode; -use crate::{checker::check_writability, compute::ComputeNode, http::JsonResponse}; +use crate::checker::check_writability; +use crate::compute::ComputeNode; +use crate::http::JsonResponse; /// Check that the compute is currently running. pub(in crate::http) async fn is_writable(State(compute): State>) -> Response { diff --git a/compute_tools/src/http/routes/configure.rs b/compute_tools/src/http/routes/configure.rs index 2546cbc344..63d428fff4 100644 --- a/compute_tools/src/http/routes/configure.rs +++ b/compute_tools/src/http/routes/configure.rs @@ -1,18 +1,16 @@ use std::sync::Arc; -use axum::{extract::State, response::Response}; -use compute_api::{ - requests::ConfigurationRequest, - responses::{ComputeStatus, ComputeStatusResponse}, -}; +use axum::extract::State; +use axum::response::Response; +use compute_api::requests::ConfigurationRequest; +use compute_api::responses::{ComputeStatus, ComputeStatusResponse}; use http::StatusCode; use tokio::task; use tracing::info; -use crate::{ - compute::{ComputeNode, ParsedSpec}, - http::{extract::Json, JsonResponse}, -}; +use crate::compute::{ComputeNode, ParsedSpec}; +use crate::http::JsonResponse; +use crate::http::extract::Json; // Accept spec in JSON format and request compute configuration. If anything // goes wrong after we set the compute status to `ConfigurationPending` and @@ -47,13 +45,18 @@ pub(in crate::http) async fn configure( return JsonResponse::invalid_status(state.status); } + // Pass the tracing span to the main thread that performs the startup, + // so that the start_compute operation is considered a child of this + // configure request for tracing purposes. + state.startup_span = Some(tracing::Span::current()); + state.pspec = Some(pspec); state.set_status(ComputeStatus::ConfigurationPending, &compute.state_changed); drop(state); } // Spawn a blocking thread to wait for compute to become Running. This is - // needed to do not block the main pool of workers and be able to serve + // needed to not block the main pool of workers and to be able to serve // other requests while some particular request is waiting for compute to // finish configuration. let c = compute.clone(); diff --git a/compute_tools/src/http/routes/database_schema.rs b/compute_tools/src/http/routes/database_schema.rs index fd716272dc..1f6ca4b79d 100644 --- a/compute_tools/src/http/routes/database_schema.rs +++ b/compute_tools/src/http/routes/database_schema.rs @@ -1,14 +1,16 @@ use std::sync::Arc; -use axum::{body::Body, extract::State, response::Response}; -use http::{header::CONTENT_TYPE, StatusCode}; +use axum::body::Body; +use axum::extract::State; +use axum::response::Response; +use http::StatusCode; +use http::header::CONTENT_TYPE; use serde::Deserialize; -use crate::{ - catalog::{get_database_schema, SchemaDumpError}, - compute::ComputeNode, - http::{extract::Query, JsonResponse}, -}; +use crate::catalog::{SchemaDumpError, get_database_schema}; +use crate::compute::ComputeNode; +use crate::http::JsonResponse; +use crate::http::extract::Query; #[derive(Debug, Clone, Deserialize)] pub(in crate::http) struct DatabaseSchemaParams { diff --git a/compute_tools/src/http/routes/dbs_and_roles.rs b/compute_tools/src/http/routes/dbs_and_roles.rs index 4843c3fab4..790fe0dfe3 100644 --- a/compute_tools/src/http/routes/dbs_and_roles.rs +++ b/compute_tools/src/http/routes/dbs_and_roles.rs @@ -1,9 +1,12 @@ use std::sync::Arc; -use axum::{extract::State, response::Response}; +use axum::extract::State; +use axum::response::Response; use http::StatusCode; -use crate::{catalog::get_dbs_and_roles, compute::ComputeNode, http::JsonResponse}; +use crate::catalog::get_dbs_and_roles; +use crate::compute::ComputeNode; +use crate::http::JsonResponse; /// Get the databases and roles from the compute. pub(in crate::http) async fn get_catalog_objects( diff --git a/compute_tools/src/http/routes/extension_server.rs b/compute_tools/src/http/routes/extension_server.rs index 5cc9b6d277..b0265d1e99 100644 --- a/compute_tools/src/http/routes/extension_server.rs +++ b/compute_tools/src/http/routes/extension_server.rs @@ -1,19 +1,13 @@ use std::sync::Arc; -use axum::{ - extract::State, - response::{IntoResponse, Response}, -}; +use axum::extract::State; +use axum::response::{IntoResponse, Response}; use http::StatusCode; use serde::Deserialize; -use crate::{ - compute::ComputeNode, - http::{ - extract::{Path, Query}, - JsonResponse, - }, -}; +use crate::compute::ComputeNode; +use crate::http::JsonResponse; +use crate::http::extract::{Path, Query}; #[derive(Debug, Clone, Deserialize)] pub(in crate::http) struct ExtensionServerParams { diff --git a/compute_tools/src/http/routes/extensions.rs b/compute_tools/src/http/routes/extensions.rs index 1fc03b9109..910e1fa155 100644 --- a/compute_tools/src/http/routes/extensions.rs +++ b/compute_tools/src/http/routes/extensions.rs @@ -1,16 +1,14 @@ use std::sync::Arc; -use axum::{extract::State, response::Response}; -use compute_api::{ - requests::ExtensionInstallRequest, - responses::{ComputeStatus, ExtensionInstallResponse}, -}; +use axum::extract::State; +use axum::response::Response; +use compute_api::requests::ExtensionInstallRequest; +use compute_api::responses::{ComputeStatus, ExtensionInstallResponse}; use http::StatusCode; -use crate::{ - compute::ComputeNode, - http::{extract::Json, JsonResponse}, -}; +use crate::compute::ComputeNode; +use crate::http::JsonResponse; +use crate::http::extract::Json; /// Install a extension. pub(in crate::http) async fn install_extension( diff --git a/compute_tools/src/http/routes/failpoints.rs b/compute_tools/src/http/routes/failpoints.rs index 836417d784..8f5da99963 100644 --- a/compute_tools/src/http/routes/failpoints.rs +++ b/compute_tools/src/http/routes/failpoints.rs @@ -17,7 +17,8 @@ pub struct FailpointConfig { pub actions: String, } -use crate::http::{extract::Json, JsonResponse}; +use crate::http::JsonResponse; +use crate::http::extract::Json; /// Configure failpoints for testing purposes. pub(in crate::http) async fn configure_failpoints( diff --git a/compute_tools/src/http/routes/grants.rs b/compute_tools/src/http/routes/grants.rs index 3f67f011e5..267dcbb27e 100644 --- a/compute_tools/src/http/routes/grants.rs +++ b/compute_tools/src/http/routes/grants.rs @@ -1,16 +1,14 @@ use std::sync::Arc; -use axum::{extract::State, response::Response}; -use compute_api::{ - requests::SetRoleGrantsRequest, - responses::{ComputeStatus, SetRoleGrantsResponse}, -}; +use axum::extract::State; +use axum::response::Response; +use compute_api::requests::SetRoleGrantsRequest; +use compute_api::responses::{ComputeStatus, SetRoleGrantsResponse}; use http::StatusCode; -use crate::{ - compute::ComputeNode, - http::{extract::Json, JsonResponse}, -}; +use crate::compute::ComputeNode; +use crate::http::JsonResponse; +use crate::http::extract::Json; /// Add grants for a role. pub(in crate::http) async fn add_grant( diff --git a/compute_tools/src/http/routes/insights.rs b/compute_tools/src/http/routes/insights.rs index 6b03a461c3..b1ba67161e 100644 --- a/compute_tools/src/http/routes/insights.rs +++ b/compute_tools/src/http/routes/insights.rs @@ -1,10 +1,12 @@ use std::sync::Arc; -use axum::{extract::State, response::Response}; +use axum::extract::State; +use axum::response::Response; use compute_api::responses::ComputeStatus; use http::StatusCode; -use crate::{compute::ComputeNode, http::JsonResponse}; +use crate::compute::ComputeNode; +use crate::http::JsonResponse; /// Collect current Postgres usage insights. pub(in crate::http) async fn get_insights(State(compute): State>) -> Response { diff --git a/compute_tools/src/http/routes/metrics.rs b/compute_tools/src/http/routes/metrics.rs index 13150a7588..da8d8b20a5 100644 --- a/compute_tools/src/http/routes/metrics.rs +++ b/compute_tools/src/http/routes/metrics.rs @@ -1,10 +1,12 @@ -use axum::{body::Body, response::Response}; -use http::header::CONTENT_TYPE; +use axum::body::Body; +use axum::response::Response; use http::StatusCode; +use http::header::CONTENT_TYPE; use metrics::proto::MetricFamily; use metrics::{Encoder, TextEncoder}; -use crate::{http::JsonResponse, metrics::collect}; +use crate::http::JsonResponse; +use crate::metrics::collect; /// Expose Prometheus metrics. pub(in crate::http) async fn get_metrics() -> Response { diff --git a/compute_tools/src/http/routes/metrics_json.rs b/compute_tools/src/http/routes/metrics_json.rs index 0709db5011..bc35ee2645 100644 --- a/compute_tools/src/http/routes/metrics_json.rs +++ b/compute_tools/src/http/routes/metrics_json.rs @@ -1,9 +1,11 @@ use std::sync::Arc; -use axum::{extract::State, response::Response}; +use axum::extract::State; +use axum::response::Response; use http::StatusCode; -use crate::{compute::ComputeNode, http::JsonResponse}; +use crate::compute::ComputeNode; +use crate::http::JsonResponse; /// Get startup metrics. pub(in crate::http) async fn get_metrics(State(compute): State>) -> Response { diff --git a/compute_tools/src/http/routes/status.rs b/compute_tools/src/http/routes/status.rs index d64d53a58f..8ed1299d6b 100644 --- a/compute_tools/src/http/routes/status.rs +++ b/compute_tools/src/http/routes/status.rs @@ -1,9 +1,13 @@ -use std::{ops::Deref, sync::Arc}; +use std::ops::Deref; +use std::sync::Arc; -use axum::{extract::State, http::StatusCode, response::Response}; +use axum::extract::State; +use axum::http::StatusCode; +use axum::response::Response; use compute_api::responses::ComputeStatusResponse; -use crate::{compute::ComputeNode, http::JsonResponse}; +use crate::compute::ComputeNode; +use crate::http::JsonResponse; /// Retrieve the state of the comute. pub(in crate::http) async fn get_status(State(compute): State>) -> Response { diff --git a/compute_tools/src/http/routes/terminate.rs b/compute_tools/src/http/routes/terminate.rs index 7acd84f236..2c24d4ad6b 100644 --- a/compute_tools/src/http/routes/terminate.rs +++ b/compute_tools/src/http/routes/terminate.rs @@ -1,18 +1,14 @@ use std::sync::Arc; -use axum::{ - extract::State, - response::{IntoResponse, Response}, -}; +use axum::extract::State; +use axum::response::{IntoResponse, Response}; use compute_api::responses::ComputeStatus; use http::StatusCode; use tokio::task; use tracing::info; -use crate::{ - compute::{forward_termination_signal, ComputeNode}, - http::JsonResponse, -}; +use crate::compute::{ComputeNode, forward_termination_signal}; +use crate::http::JsonResponse; /// Terminate the compute. pub(in crate::http) async fn terminate(State(compute): State>) -> Response { diff --git a/compute_tools/src/http/server.rs b/compute_tools/src/http/server.rs index a523ecd96f..7283401bb5 100644 --- a/compute_tools/src/http/server.rs +++ b/compute_tools/src/http/server.rs @@ -1,23 +1,20 @@ -use std::{ - fmt::Display, - net::{IpAddr, Ipv6Addr, SocketAddr}, - sync::Arc, - time::Duration, -}; +use std::fmt::Display; +use std::net::{IpAddr, Ipv6Addr, SocketAddr}; +use std::sync::Arc; +use std::time::Duration; use anyhow::Result; -use axum::{ - extract::Request, - middleware::{self, Next}, - response::{IntoResponse, Response}, - routing::{get, post}, - Router, -}; +use axum::Router; +use axum::extract::Request; +use axum::middleware::{self, Next}; +use axum::response::{IntoResponse, Response}; +use axum::routing::{get, post}; use http::StatusCode; use tokio::net::TcpListener; use tower::ServiceBuilder; -use tower_http::{request_id::PropagateRequestIdLayer, trace::TraceLayer}; -use tracing::{debug, error, info, Span}; +use tower_http::request_id::PropagateRequestIdLayer; +use tower_http::trace::TraceLayer; +use tracing::{Span, debug, error, info}; use uuid::Uuid; use super::routes::{ @@ -124,6 +121,7 @@ impl From for Router> { ) .layer(PropagateRequestIdLayer::x_request_id()), ) + .layer(tower_otel::trace::HttpLayer::server(tracing::Level::INFO)) } } diff --git a/compute_tools/src/installed_extensions.rs b/compute_tools/src/installed_extensions.rs index 173dbf40b0..6921505466 100644 --- a/compute_tools/src/installed_extensions.rs +++ b/compute_tools/src/installed_extensions.rs @@ -1,7 +1,7 @@ -use compute_api::responses::{InstalledExtension, InstalledExtensions}; use std::collections::HashMap; use anyhow::Result; +use compute_api::responses::{InstalledExtension, InstalledExtensions}; use postgres::{Client, NoTls}; use crate::metrics::INSTALLED_EXTENSIONS; diff --git a/compute_tools/src/lsn_lease.rs b/compute_tools/src/lsn_lease.rs index 3061d387a5..b4ec675ff4 100644 --- a/compute_tools/src/lsn_lease.rs +++ b/compute_tools/src/lsn_lease.rs @@ -1,17 +1,15 @@ -use anyhow::bail; -use anyhow::Result; -use postgres::{NoTls, SimpleQueryMessage}; -use std::time::SystemTime; -use std::{str::FromStr, sync::Arc, thread, time::Duration}; -use utils::id::TenantId; -use utils::id::TimelineId; +use std::str::FromStr; +use std::sync::Arc; +use std::thread; +use std::time::{Duration, SystemTime}; +use anyhow::{Result, bail}; use compute_api::spec::ComputeMode; +use postgres::{NoTls, SimpleQueryMessage}; use tracing::{info, warn}; -use utils::{ - lsn::Lsn, - shard::{ShardCount, ShardNumber, TenantShardId}, -}; +use utils::id::{TenantId, TimelineId}; +use utils::lsn::Lsn; +use utils::shard::{ShardCount, ShardNumber, TenantShardId}; use crate::compute::ComputeNode; diff --git a/compute_tools/src/metrics.rs b/compute_tools/src/metrics.rs index 870b294d08..bc96e5074c 100644 --- a/compute_tools/src/metrics.rs +++ b/compute_tools/src/metrics.rs @@ -1,6 +1,6 @@ use metrics::core::Collector; use metrics::proto::MetricFamily; -use metrics::{register_int_counter_vec, register_uint_gauge_vec, IntCounterVec, UIntGaugeVec}; +use metrics::{IntCounterVec, UIntGaugeVec, register_int_counter_vec, register_uint_gauge_vec}; use once_cell::sync::Lazy; pub(crate) static INSTALLED_EXTENSIONS: Lazy = Lazy::new(|| { diff --git a/compute_tools/src/monitor.rs b/compute_tools/src/monitor.rs index 184f380a8d..248505e473 100644 --- a/compute_tools/src/monitor.rs +++ b/compute_tools/src/monitor.rs @@ -1,13 +1,14 @@ use std::sync::Arc; -use std::{thread, time::Duration}; +use std::thread; +use std::time::Duration; use chrono::{DateTime, Utc}; +use compute_api::responses::ComputeStatus; +use compute_api::spec::ComputeFeature; use postgres::{Client, NoTls}; use tracing::{debug, error, info, warn}; use crate::compute::ComputeNode; -use compute_api::responses::ComputeStatus; -use compute_api::spec::ComputeFeature; const MONITOR_CHECK_INTERVAL: Duration = Duration::from_millis(500); diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index 86fcf99085..5a2e305e1d 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -9,7 +9,8 @@ use std::process::Child; use std::str::FromStr; use std::time::{Duration, Instant}; -use anyhow::{bail, Result}; +use anyhow::{Result, bail}; +use compute_api::spec::{Database, GenericOption, GenericOptions, PgIdent, Role}; use futures::StreamExt; use ini::Ini; use notify::{RecursiveMode, Watcher}; @@ -21,8 +22,6 @@ use tokio_postgres; use tokio_postgres::NoTls; use tracing::{debug, error, info, instrument}; -use compute_api::spec::{Database, GenericOption, GenericOptions, PgIdent, Role}; - const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // milliseconds /// Escape a string for including it in a SQL literal. diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 6f28bd9733..1d19f2738d 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -1,20 +1,20 @@ -use anyhow::{anyhow, bail, Result}; -use reqwest::StatusCode; use std::fs::File; use std::path::Path; -use tokio_postgres::Client; -use tracing::{error, info, instrument, warn}; - -use crate::config; -use crate::metrics::{CPlaneRequestRPC, CPLANE_REQUESTS_TOTAL, UNKNOWN_HTTP_STATUS}; -use crate::migration::MigrationRunner; -use crate::params::PG_HBA_ALL_MD5; -use crate::pg_helpers::*; +use anyhow::{Result, anyhow, bail}; use compute_api::responses::{ ComputeCtlConfig, ControlPlaneComputeStatus, ControlPlaneSpecResponse, }; use compute_api::spec::ComputeSpec; +use reqwest::StatusCode; +use tokio_postgres::Client; +use tracing::{error, info, instrument, warn}; + +use crate::config; +use crate::metrics::{CPLANE_REQUESTS_TOTAL, CPlaneRequestRPC, UNKNOWN_HTTP_STATUS}; +use crate::migration::MigrationRunner; +use crate::params::PG_HBA_ALL_MD5; +use crate::pg_helpers::*; // Do control plane request and return response if any. In case of error it // returns a bool flag indicating whether it makes sense to retry the request @@ -141,7 +141,6 @@ pub fn get_spec_from_control_plane( /// Check `pg_hba.conf` and update if needed to allow external connections. pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> { // XXX: consider making it a part of spec.json - info!("checking pg_hba.conf"); let pghba_path = pgdata_path.join("pg_hba.conf"); if config::line_in_file(&pghba_path, PG_HBA_ALL_MD5)? { @@ -156,12 +155,11 @@ pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> { /// Create a standby.signal file pub fn add_standby_signal(pgdata_path: &Path) -> Result<()> { // XXX: consider making it a part of spec.json - info!("adding standby.signal"); let signalfile = pgdata_path.join("standby.signal"); if !signalfile.exists() { - info!("created standby.signal"); File::create(signalfile)?; + info!("created standby.signal"); } else { info!("reused pre-existing standby.signal"); } @@ -170,7 +168,6 @@ pub fn add_standby_signal(pgdata_path: &Path) -> Result<()> { #[instrument(skip_all)] pub async fn handle_neon_extension_upgrade(client: &mut Client) -> Result<()> { - info!("handle neon extension upgrade"); let query = "ALTER EXTENSION neon UPDATE"; info!("update neon extension version with query: {}", query); client.simple_query(query).await?; diff --git a/compute_tools/src/spec_apply.rs b/compute_tools/src/spec_apply.rs index c4416480d8..f9a37c5c98 100644 --- a/compute_tools/src/spec_apply.rs +++ b/compute_tools/src/spec_apply.rs @@ -1,18 +1,416 @@ use std::collections::{HashMap, HashSet}; use std::fmt::{Debug, Formatter}; use std::future::Future; -use std::iter::empty; -use std::iter::once; +use std::iter::{empty, once}; use std::sync::Arc; -use crate::compute::construct_superuser_query; -use crate::pg_helpers::{escape_literal, DatabaseExt, Escaping, GenericOptionsSearch, RoleExt}; -use anyhow::Result; +use anyhow::{Context, Result}; +use compute_api::responses::ComputeStatus; use compute_api::spec::{ComputeFeature, ComputeSpec, Database, PgIdent, Role}; use futures::future::join_all; use tokio::sync::RwLock; use tokio_postgres::Client; -use tracing::{debug, info_span, warn, Instrument}; +use tokio_postgres::error::SqlState; +use tracing::{Instrument, debug, error, info, info_span, instrument, warn}; + +use crate::compute::{ComputeNode, ComputeState, construct_superuser_query}; +use crate::pg_helpers::{ + DatabaseExt, Escaping, GenericOptionsSearch, RoleExt, escape_literal, get_existing_dbs_async, + get_existing_roles_async, +}; +use crate::spec_apply::ApplySpecPhase::{ + CreateAndAlterDatabases, CreateAndAlterRoles, CreateAvailabilityCheck, CreateSchemaNeon, + CreateSuperUser, DropInvalidDatabases, DropRoles, FinalizeDropLogicalSubscriptions, + HandleNeonExtension, HandleOtherExtensions, RenameAndDeleteDatabases, RenameRoles, + RunInEachDatabase, +}; +use crate::spec_apply::PerDatabasePhase::{ + ChangeSchemaPerms, DeleteDBRoleReferences, DropLogicalSubscriptions, HandleAnonExtension, +}; + +impl ComputeNode { + /// Apply the spec to the running PostgreSQL instance. + /// The caller can decide to run with multiple clients in parallel, or + /// single mode. Either way, the commands executed will be the same, and + /// only commands run in different databases are parallelized. + #[instrument(skip_all)] + pub fn apply_spec_sql( + &self, + spec: Arc, + conf: Arc, + concurrency: usize, + ) -> Result<()> { + info!("Applying config with max {} concurrency", concurrency); + debug!("Config: {:?}", spec); + + let rt = tokio::runtime::Handle::current(); + rt.block_on(async { + // Proceed with post-startup configuration. Note, that order of operations is important. + let client = Self::get_maintenance_client(&conf).await?; + let spec = spec.clone(); + + let databases = get_existing_dbs_async(&client).await?; + let roles = get_existing_roles_async(&client) + .await? + .into_iter() + .map(|role| (role.name.clone(), role)) + .collect::>(); + + // Check if we need to drop subscriptions before starting the endpoint. + // + // It is important to do this operation exactly once when endpoint starts on a new branch. + // Otherwise, we may drop not inherited, but newly created subscriptions. + // + // We cannot rely only on spec.drop_subscriptions_before_start flag, + // because if for some reason compute restarts inside VM, + // it will start again with the same spec and flag value. + // + // To handle this, we save the fact of the operation in the database + // in the neon.drop_subscriptions_done table. + // If the table does not exist, we assume that the operation was never performed, so we must do it. + // If table exists, we check if the operation was performed on the current timelilne. + // + let mut drop_subscriptions_done = false; + + if spec.drop_subscriptions_before_start { + let timeline_id = self.get_timeline_id().context("timeline_id must be set")?; + let query = format!("select 1 from neon.drop_subscriptions_done where timeline_id = '{}'", timeline_id); + + info!("Checking if drop subscription operation was already performed for timeline_id: {}", timeline_id); + + drop_subscriptions_done = match + client.simple_query(&query).await { + Ok(result) => { + matches!(&result[0], postgres::SimpleQueryMessage::Row(_)) + }, + Err(e) => + { + match e.code() { + Some(&SqlState::UNDEFINED_TABLE) => false, + _ => { + // We don't expect any other error here, except for the schema/table not existing + error!("Error checking if drop subscription operation was already performed: {}", e); + return Err(e.into()); + } + } + } + } + }; + + + let jwks_roles = Arc::new( + spec.as_ref() + .local_proxy_config + .iter() + .flat_map(|it| &it.jwks) + .flatten() + .flat_map(|setting| &setting.role_names) + .cloned() + .collect::>(), + ); + + let ctx = Arc::new(tokio::sync::RwLock::new(MutableApplyContext { + roles, + dbs: databases, + })); + + // Apply special pre drop database phase. + // NOTE: we use the code of RunInEachDatabase phase for parallelism + // and connection management, but we don't really run it in *each* database, + // only in databases, we're about to drop. + info!("Applying PerDatabase (pre-dropdb) phase"); + let concurrency_token = Arc::new(tokio::sync::Semaphore::new(concurrency)); + + // Run the phase for each database that we're about to drop. + let db_processes = spec + .delta_operations + .iter() + .flatten() + .filter_map(move |op| { + if op.action.as_str() == "delete_db" { + Some(op.name.clone()) + } else { + None + } + }) + .map(|dbname| { + let spec = spec.clone(); + let ctx = ctx.clone(); + let jwks_roles = jwks_roles.clone(); + let mut conf = conf.as_ref().clone(); + let concurrency_token = concurrency_token.clone(); + // We only need dbname field for this phase, so set other fields to dummy values + let db = DB::UserDB(Database { + name: dbname.clone(), + owner: "cloud_admin".to_string(), + options: None, + restrict_conn: false, + invalid: false, + }); + + debug!("Applying per-database phases for Database {:?}", &db); + + match &db { + DB::SystemDB => {} + DB::UserDB(db) => { + conf.dbname(db.name.as_str()); + } + } + + let conf = Arc::new(conf); + let fut = Self::apply_spec_sql_db( + spec.clone(), + conf, + ctx.clone(), + jwks_roles.clone(), + concurrency_token.clone(), + db, + [DropLogicalSubscriptions].to_vec(), + ); + + Ok(tokio::spawn(fut)) + }) + .collect::>>(); + + for process in db_processes.into_iter() { + let handle = process?; + if let Err(e) = handle.await? { + // Handle the error case where the database does not exist + // We do not check whether the DB exists or not in the deletion phase, + // so we shouldn't be strict about it in pre-deletion cleanup as well. + if e.to_string().contains("does not exist") { + warn!("Error dropping subscription: {}", e); + } else { + return Err(e); + } + }; + } + + for phase in [ + CreateSuperUser, + DropInvalidDatabases, + RenameRoles, + CreateAndAlterRoles, + RenameAndDeleteDatabases, + CreateAndAlterDatabases, + CreateSchemaNeon, + ] { + info!("Applying phase {:?}", &phase); + apply_operations( + spec.clone(), + ctx.clone(), + jwks_roles.clone(), + phase, + || async { Ok(&client) }, + ) + .await?; + } + + info!("Applying RunInEachDatabase2 phase"); + let concurrency_token = Arc::new(tokio::sync::Semaphore::new(concurrency)); + + let db_processes = spec + .cluster + .databases + .iter() + .map(|db| DB::new(db.clone())) + // include + .chain(once(DB::SystemDB)) + .map(|db| { + let spec = spec.clone(); + let ctx = ctx.clone(); + let jwks_roles = jwks_roles.clone(); + let mut conf = conf.as_ref().clone(); + let concurrency_token = concurrency_token.clone(); + let db = db.clone(); + + debug!("Applying per-database phases for Database {:?}", &db); + + match &db { + DB::SystemDB => {} + DB::UserDB(db) => { + conf.dbname(db.name.as_str()); + } + } + + let conf = Arc::new(conf); + let mut phases = vec![ + DeleteDBRoleReferences, + ChangeSchemaPerms, + HandleAnonExtension, + ]; + + if spec.drop_subscriptions_before_start && !drop_subscriptions_done { + info!("Adding DropLogicalSubscriptions phase because drop_subscriptions_before_start is set"); + phases.push(DropLogicalSubscriptions); + } + + let fut = Self::apply_spec_sql_db( + spec.clone(), + conf, + ctx.clone(), + jwks_roles.clone(), + concurrency_token.clone(), + db, + phases, + ); + + Ok(tokio::spawn(fut)) + }) + .collect::>>(); + + for process in db_processes.into_iter() { + let handle = process?; + handle.await??; + } + + let mut phases = vec![ + HandleOtherExtensions, + HandleNeonExtension, // This step depends on CreateSchemaNeon + CreateAvailabilityCheck, + DropRoles, + ]; + + // This step depends on CreateSchemaNeon + if spec.drop_subscriptions_before_start && !drop_subscriptions_done { + info!("Adding FinalizeDropLogicalSubscriptions phase because drop_subscriptions_before_start is set"); + phases.push(FinalizeDropLogicalSubscriptions); + } + + for phase in phases { + debug!("Applying phase {:?}", &phase); + apply_operations( + spec.clone(), + ctx.clone(), + jwks_roles.clone(), + phase, + || async { Ok(&client) }, + ) + .await?; + } + + Ok::<(), anyhow::Error>(()) + })?; + + Ok(()) + } + + /// Apply SQL migrations of the RunInEachDatabase phase. + /// + /// May opt to not connect to databases that don't have any scheduled + /// operations. The function is concurrency-controlled with the provided + /// semaphore. The caller has to make sure the semaphore isn't exhausted. + async fn apply_spec_sql_db( + spec: Arc, + conf: Arc, + ctx: Arc>, + jwks_roles: Arc>, + concurrency_token: Arc, + db: DB, + subphases: Vec, + ) -> Result<()> { + let _permit = concurrency_token.acquire().await?; + + let mut client_conn = None; + + for subphase in subphases { + apply_operations( + spec.clone(), + ctx.clone(), + jwks_roles.clone(), + RunInEachDatabase { + db: db.clone(), + subphase, + }, + // Only connect if apply_operation actually wants a connection. + // It's quite possible this database doesn't need any queries, + // so by not connecting we save time and effort connecting to + // that database. + || async { + if client_conn.is_none() { + let db_client = Self::get_maintenance_client(&conf).await?; + client_conn.replace(db_client); + } + let client = client_conn.as_ref().unwrap(); + Ok(client) + }, + ) + .await?; + } + + drop(client_conn); + + Ok::<(), anyhow::Error>(()) + } + + /// Choose how many concurrent connections to use for applying the spec changes. + pub fn max_service_connections( + &self, + compute_state: &ComputeState, + spec: &ComputeSpec, + ) -> usize { + // If the cluster is in Init state we don't have to deal with user connections, + // and can thus use all `max_connections` connection slots. However, that's generally not + // very efficient, so we generally still limit it to a smaller number. + if compute_state.status == ComputeStatus::Init { + // If the settings contain 'max_connections', use that as template + if let Some(config) = spec.cluster.settings.find("max_connections") { + config.parse::().ok() + } else { + // Otherwise, try to find the setting in the postgresql_conf string + spec.cluster + .postgresql_conf + .iter() + .flat_map(|conf| conf.split("\n")) + .filter_map(|line| { + if !line.contains("max_connections") { + return None; + } + + let (key, value) = line.split_once("=")?; + let key = key + .trim_start_matches(char::is_whitespace) + .trim_end_matches(char::is_whitespace); + + let value = value + .trim_start_matches(char::is_whitespace) + .trim_end_matches(char::is_whitespace); + + if key != "max_connections" { + return None; + } + + value.parse::().ok() + }) + .next() + } + // If max_connections is present, use at most 1/3rd of that. + // When max_connections is lower than 30, try to use at least 10 connections, but + // never more than max_connections. + .map(|limit| match limit { + 0..10 => limit, + 10..30 => 10, + 30.. => limit / 3, + }) + // If we didn't find max_connections, default to 10 concurrent connections. + .unwrap_or(10) + } else { + // state == Running + // Because the cluster is already in the Running state, we should assume users are + // already connected to the cluster, and high concurrency could negatively + // impact user connectivity. Therefore, we can limit concurrency to the number of + // reserved superuser connections, which users wouldn't be able to use anyway. + spec.cluster + .settings + .find("superuser_reserved_connections") + .iter() + .filter_map(|val| val.parse::().ok()) + .map(|val| if val > 1 { val - 1 } else { 1 }) + .last() + .unwrap_or(3) + } + } +} #[derive(Clone)] pub enum DB { @@ -474,7 +872,10 @@ async fn get_operations<'a>( let edb = match databases.get(&db.name) { Some(edb) => edb, None => { - warn!("skipping RunInEachDatabase phase {:?}, database {} doesn't exist in PostgreSQL", subphase, db.name); + warn!( + "skipping RunInEachDatabase phase {:?}, database {} doesn't exist in PostgreSQL", + subphase, db.name + ); return Ok(Box::new(empty())); } }; diff --git a/compute_tools/src/swap.rs b/compute_tools/src/swap.rs index 024c5b338e..ed27a7cba4 100644 --- a/compute_tools/src/swap.rs +++ b/compute_tools/src/swap.rs @@ -1,10 +1,11 @@ use std::path::Path; -use anyhow::{anyhow, Context}; -use tracing::warn; +use anyhow::{Context, anyhow}; +use tracing::{instrument, warn}; pub const RESIZE_SWAP_BIN: &str = "/neonvm/bin/resize-swap"; +#[instrument] pub fn resize_swap(size_bytes: u64) -> anyhow::Result<()> { // run `/neonvm/bin/resize-swap --once {size_bytes}` // diff --git a/compute_tools/tests/config_test.rs b/compute_tools/tests/config_test.rs index 9ab16b1930..7b2bff23d5 100644 --- a/compute_tools/tests/config_test.rs +++ b/compute_tools/tests/config_test.rs @@ -1,7 +1,7 @@ #[cfg(test)] mod config_tests { - use std::fs::{remove_file, File}; + use std::fs::{File, remove_file}; use std::io::{Read, Write}; use std::path::Path; diff --git a/control_plane/src/background_process.rs b/control_plane/src/background_process.rs index c668e68402..1eac4f7ff0 100644 --- a/control_plane/src/background_process.rs +++ b/control_plane/src/background_process.rs @@ -25,7 +25,7 @@ use anyhow::Context; use camino::{Utf8Path, Utf8PathBuf}; use nix::errno::Errno; use nix::fcntl::{FcntlArg, FdFlag}; -use nix::sys::signal::{kill, Signal}; +use nix::sys::signal::{Signal, kill}; use nix::unistd::Pid; use utils::pid_file::{self, PidFileRead}; diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 02d793400a..f258025428 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -5,7 +5,16 @@ //! easier to work with locally. The python tests in `test_runner` //! rely on `neon_local` to set up the environment for each test. //! -use anyhow::{anyhow, bail, Context, Result}; +use std::borrow::Cow; +use std::collections::{BTreeSet, HashMap}; +use std::fs::File; +use std::os::fd::AsRawFd; +use std::path::PathBuf; +use std::process::exit; +use std::str::FromStr; +use std::time::Duration; + +use anyhow::{Context, Result, anyhow, bail}; use clap::Parser; use compute_api::spec::ComputeMode; use control_plane::endpoint::ComputeControlPlane; @@ -19,7 +28,7 @@ use control_plane::storage_controller::{ NeonStorageControllerStartArgs, NeonStorageControllerStopArgs, StorageController, }; use control_plane::{broker, local_env}; -use nix::fcntl::{flock, FlockArg}; +use nix::fcntl::{FlockArg, flock}; use pageserver_api::config::{ DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT, DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT, @@ -35,23 +44,13 @@ use safekeeper_api::{ DEFAULT_HTTP_LISTEN_PORT as DEFAULT_SAFEKEEPER_HTTP_PORT, DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT, }; -use std::borrow::Cow; -use std::collections::{BTreeSet, HashMap}; -use std::fs::File; -use std::os::fd::AsRawFd; -use std::path::PathBuf; -use std::process::exit; -use std::str::FromStr; -use std::time::Duration; use storage_broker::DEFAULT_LISTEN_ADDR as DEFAULT_BROKER_ADDR; use tokio::task::JoinSet; use url::Host; -use utils::{ - auth::{Claims, Scope}, - id::{NodeId, TenantId, TenantTimelineId, TimelineId}, - lsn::Lsn, - project_git_version, -}; +use utils::auth::{Claims, Scope}; +use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId}; +use utils::lsn::Lsn; +use utils::project_git_version; // Default id of a safekeeper node, if not specified on the command line. const DEFAULT_SAFEKEEPER_ID: NodeId = NodeId(1); @@ -887,20 +886,6 @@ fn print_timeline( Ok(()) } -/// Returns a map of timeline IDs to timeline_id@lsn strings. -/// Connects to the pageserver to query this information. -async fn get_timeline_infos( - env: &local_env::LocalEnv, - tenant_shard_id: &TenantShardId, -) -> Result> { - Ok(get_default_pageserver(env) - .timeline_list(tenant_shard_id) - .await? - .into_iter() - .map(|timeline_info| (timeline_info.timeline_id, timeline_info)) - .collect()) -} - /// Helper function to get tenant id from an optional --tenant_id option or from the config file fn get_tenant_id( tenant_id_arg: Option, @@ -935,7 +920,9 @@ fn handle_init(args: &InitCmdArgs) -> anyhow::Result { let init_conf: NeonLocalInitConf = if let Some(config_path) = &args.config { // User (likely the Python test suite) provided a description of the environment. if args.num_pageservers.is_some() { - bail!("Cannot specify both --num-pageservers and --config, use key `pageservers` in the --config file instead"); + bail!( + "Cannot specify both --num-pageservers and --config, use key `pageservers` in the --config file instead" + ); } // load and parse the file let contents = std::fs::read_to_string(config_path).with_context(|| { @@ -1251,12 +1238,6 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res // TODO(sharding): this command shouldn't have to specify a shard ID: we should ask the storage controller // where shard 0 is attached, and query there. let tenant_shard_id = get_tenant_shard_id(args.tenant_shard_id, env)?; - let timeline_infos = get_timeline_infos(env, &tenant_shard_id) - .await - .unwrap_or_else(|e| { - eprintln!("Failed to load timeline info: {}", e); - HashMap::new() - }); let timeline_name_mappings = env.timeline_name_mappings(); @@ -1285,12 +1266,9 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res lsn.to_string() } _ => { - // -> primary endpoint or hot replica - // Use the LSN at the end of the timeline. - timeline_infos - .get(&endpoint.timeline_id) - .map(|bi| bi.last_record_lsn.to_string()) - .unwrap_or_else(|| "?".to_string()) + // As the LSN here refers to the one that the compute is started with, + // we display nothing as it is a primary/hot standby compute. + "---".to_string() } }; @@ -1338,10 +1316,14 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res match (mode, args.hot_standby) { (ComputeMode::Static(_), true) => { - bail!("Cannot start a node in hot standby mode when it is already configured as a static replica") + bail!( + "Cannot start a node in hot standby mode when it is already configured as a static replica" + ) } (ComputeMode::Primary, true) => { - bail!("Cannot start a node as a hot standby replica, it is already configured as primary node") + bail!( + "Cannot start a node as a hot standby replica, it is already configured as primary node" + ) } _ => {} } diff --git a/control_plane/src/broker.rs b/control_plane/src/broker.rs index c8ac5d8981..1b507bb384 100644 --- a/control_plane/src/broker.rs +++ b/control_plane/src/broker.rs @@ -8,7 +8,6 @@ use std::time::Duration; use anyhow::Context; - use camino::Utf8PathBuf; use crate::{background_process, local_env}; diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 407578abb8..50ccca36fe 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -37,27 +37,20 @@ //! ``` //! use std::collections::BTreeMap; -use std::net::IpAddr; -use std::net::Ipv4Addr; -use std::net::SocketAddr; -use std::net::TcpStream; +use std::net::{IpAddr, Ipv4Addr, SocketAddr, TcpStream}; use std::path::PathBuf; use std::process::Command; use std::str::FromStr; use std::sync::Arc; -use std::time::Duration; -use std::time::SystemTime; -use std::time::UNIX_EPOCH; +use std::time::{Duration, SystemTime, UNIX_EPOCH}; -use anyhow::{anyhow, bail, Context, Result}; +use anyhow::{Context, Result, anyhow, bail}; use compute_api::requests::ConfigurationRequest; -use compute_api::responses::ComputeCtlConfig; -use compute_api::spec::Database; -use compute_api::spec::PgIdent; -use compute_api::spec::RemoteExtSpec; -use compute_api::spec::Role; -use nix::sys::signal::kill; -use nix::sys::signal::Signal; +use compute_api::responses::{ComputeCtlConfig, ComputeStatus, ComputeStatusResponse}; +use compute_api::spec::{ + Cluster, ComputeFeature, ComputeMode, ComputeSpec, Database, PgIdent, RemoteExtSpec, Role, +}; +use nix::sys::signal::{Signal, kill}; use pageserver_api::shard::ShardStripeSize; use reqwest::header::CONTENT_TYPE; use serde::{Deserialize, Serialize}; @@ -69,9 +62,6 @@ use crate::local_env::LocalEnv; use crate::postgresql_conf::PostgresConf; use crate::storage_controller::StorageController; -use compute_api::responses::{ComputeStatus, ComputeStatusResponse}; -use compute_api::spec::{Cluster, ComputeFeature, ComputeMode, ComputeSpec}; - // contents of a endpoint.json file #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] pub struct EndpointConf { @@ -237,7 +227,9 @@ impl ComputeControlPlane { }); if let Some((key, _)) = duplicates.next() { - bail!("attempting to create a duplicate primary endpoint on tenant {tenant_id}, timeline {timeline_id}: endpoint {key:?} exists already. please don't do this, it is not supported."); + bail!( + "attempting to create a duplicate primary endpoint on tenant {tenant_id}, timeline {timeline_id}: endpoint {key:?} exists already. please don't do this, it is not supported." + ); } } Ok(()) diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 2fe4cd5202..f4026efbbf 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -3,28 +3,22 @@ //! Now it also provides init method which acts like a stub for proper installation //! script which will use local paths. -use anyhow::{bail, Context}; +use std::collections::HashMap; +use std::net::{IpAddr, Ipv4Addr, SocketAddr}; +use std::path::{Path, PathBuf}; +use std::process::{Command, Stdio}; +use std::time::Duration; +use std::{env, fs}; +use anyhow::{Context, bail}; use clap::ValueEnum; use postgres_backend::AuthType; use reqwest::Url; use serde::{Deserialize, Serialize}; -use std::collections::HashMap; -use std::env; -use std::fs; -use std::net::IpAddr; -use std::net::Ipv4Addr; -use std::net::SocketAddr; -use std::path::{Path, PathBuf}; -use std::process::{Command, Stdio}; -use std::time::Duration; -use utils::{ - auth::{encode_from_key_file, Claims}, - id::{NodeId, TenantId, TenantTimelineId, TimelineId}, -}; +use utils::auth::{Claims, encode_from_key_file}; +use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId}; -use crate::pageserver::PageServerNode; -use crate::pageserver::PAGESERVER_REMOTE_STORAGE_DIR; +use crate::pageserver::{PAGESERVER_REMOTE_STORAGE_DIR, PageServerNode}; use crate::safekeeper::SafekeeperNode; pub const DEFAULT_PG_VERSION: u32 = 16; @@ -465,7 +459,9 @@ impl LocalEnv { if old_timeline_id == &timeline_id { Ok(()) } else { - bail!("branch '{branch_name}' is already mapped to timeline {old_timeline_id}, cannot map to another timeline {timeline_id}"); + bail!( + "branch '{branch_name}' is already mapped to timeline {old_timeline_id}, cannot map to another timeline {timeline_id}" + ); } } else { existing_values.push((tenant_id, timeline_id)); diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 2bf89b7bfa..39656bdbbe 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -7,7 +7,6 @@ //! ``` //! use std::collections::HashMap; - use std::io; use std::io::Write; use std::num::NonZeroU64; @@ -15,22 +14,19 @@ use std::path::PathBuf; use std::str::FromStr; use std::time::Duration; -use anyhow::{bail, Context}; +use anyhow::{Context, bail}; use camino::Utf8PathBuf; use pageserver_api::models::{self, TenantInfo, TimelineInfo}; use pageserver_api::shard::TenantShardId; use pageserver_client::mgmt_api; use postgres_backend::AuthType; -use postgres_connection::{parse_host_port, PgConnectionConfig}; +use postgres_connection::{PgConnectionConfig, parse_host_port}; use utils::auth::{Claims, Scope}; -use utils::id::NodeId; -use utils::{ - id::{TenantId, TimelineId}, - lsn::Lsn, -}; +use utils::id::{NodeId, TenantId, TimelineId}; +use utils::lsn::Lsn; -use crate::local_env::{NeonLocalInitPageserverConf, PageServerConf}; -use crate::{background_process, local_env::LocalEnv}; +use crate::background_process; +use crate::local_env::{LocalEnv, NeonLocalInitPageserverConf, PageServerConf}; /// Directory within .neon which will be used by default for LocalFs remote storage. pub const PAGESERVER_REMOTE_STORAGE_DIR: &str = "local_fs_remote_storage/pageserver"; @@ -81,7 +77,11 @@ impl PageServerNode { &self, conf: NeonLocalInitPageserverConf, ) -> anyhow::Result { - assert_eq!(&PageServerConf::from(&conf), &self.conf, "during neon_local init, we derive the runtime state of ps conf (self.conf) from the --config flag fully"); + assert_eq!( + &PageServerConf::from(&conf), + &self.conf, + "during neon_local init, we derive the runtime state of ps conf (self.conf) from the --config flag fully" + ); // TODO(christian): instead of what we do here, create a pageserver_api::config::ConfigToml (PR #7656) diff --git a/control_plane/src/postgresql_conf.rs b/control_plane/src/postgresql_conf.rs index 5aee12dc97..a824af9490 100644 --- a/control_plane/src/postgresql_conf.rs +++ b/control_plane/src/postgresql_conf.rs @@ -1,3 +1,6 @@ +use std::collections::HashMap; +use std::fmt; + /// /// Module for parsing postgresql.conf file. /// @@ -6,8 +9,6 @@ /// funny stuff like include-directives or funny escaping. use once_cell::sync::Lazy; use regex::Regex; -use std::collections::HashMap; -use std::fmt; /// In-memory representation of a postgresql.conf file #[derive(Default, Debug)] diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index ce7751fb14..70915d5aaf 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -14,18 +14,15 @@ use std::{io, result}; use anyhow::Context; use camino::Utf8PathBuf; +use http_utils::error::HttpErrorBody; use postgres_connection::PgConnectionConfig; use reqwest::{IntoUrl, Method}; use thiserror::Error; - -use http_utils::error::HttpErrorBody; use utils::auth::{Claims, Scope}; use utils::id::NodeId; -use crate::{ - background_process, - local_env::{LocalEnv, SafekeeperConf}, -}; +use crate::background_process; +use crate::local_env::{LocalEnv, SafekeeperConf}; #[derive(Error, Debug)] pub enum SafekeeperHttpError { diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index 0fadb9c5fe..16e12f4e02 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -1,44 +1,39 @@ -use crate::{ - background_process, - local_env::{LocalEnv, NeonStorageControllerConf}, -}; +use std::ffi::OsStr; +use std::fs; +use std::net::SocketAddr; +use std::path::PathBuf; +use std::process::ExitStatus; +use std::str::FromStr; +use std::sync::OnceLock; +use std::time::{Duration, Instant}; + use camino::{Utf8Path, Utf8PathBuf}; use hyper0::Uri; use nix::unistd::Pid; -use pageserver_api::{ - controller_api::{ - NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, TenantCreateRequest, - TenantCreateResponse, TenantLocateResponse, TenantShardMigrateRequest, - TenantShardMigrateResponse, - }, - models::{ - TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo, - }, - shard::{ShardStripeSize, TenantShardId}, +use pageserver_api::controller_api::{ + NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, TenantCreateRequest, + TenantCreateResponse, TenantLocateResponse, TenantShardMigrateRequest, + TenantShardMigrateResponse, }; +use pageserver_api::models::{ + TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo, +}; +use pageserver_api::shard::{ShardStripeSize, TenantShardId}; use pageserver_client::mgmt_api::ResponseErrorMessageExt; use postgres_backend::AuthType; use reqwest::Method; -use serde::{de::DeserializeOwned, Deserialize, Serialize}; -use std::{ - ffi::OsStr, - fs, - net::SocketAddr, - path::PathBuf, - process::ExitStatus, - str::FromStr, - sync::OnceLock, - time::{Duration, Instant}, -}; +use serde::de::DeserializeOwned; +use serde::{Deserialize, Serialize}; use tokio::process::Command; use tracing::instrument; use url::Url; -use utils::{ - auth::{encode_from_key_file, Claims, Scope}, - id::{NodeId, TenantId}, -}; +use utils::auth::{Claims, Scope, encode_from_key_file}; +use utils::id::{NodeId, TenantId}; use whoami::username; +use crate::background_process; +use crate::local_env::{LocalEnv, NeonStorageControllerConf}; + pub struct StorageController { env: LocalEnv, private_key: Option>, @@ -96,7 +91,8 @@ pub struct AttachHookRequest { #[derive(Serialize, Deserialize)] pub struct AttachHookResponse { - pub gen: Option, + #[serde(rename = "gen")] + pub generation: Option, } #[derive(Serialize, Deserialize)] @@ -779,7 +775,7 @@ impl StorageController { ) .await?; - Ok(response.gen) + Ok(response.generation) } #[instrument(skip(self))] diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index 40b86e4110..2e2c22c791 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -1,34 +1,27 @@ -use futures::StreamExt; -use std::{ - collections::{HashMap, HashSet}, - str::FromStr, - time::Duration, -}; +use std::collections::{HashMap, HashSet}; +use std::str::FromStr; +use std::time::Duration; use clap::{Parser, Subcommand}; -use pageserver_api::{ - controller_api::{ - AvailabilityZone, NodeAvailabilityWrapper, NodeDescribeResponse, NodeShardResponse, - SafekeeperDescribeResponse, SafekeeperSchedulingPolicyRequest, ShardSchedulingPolicy, - ShardsPreferredAzsRequest, ShardsPreferredAzsResponse, SkSchedulingPolicy, - TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest, - }, - models::{ - EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary, - ShardParameters, TenantConfig, TenantConfigPatchRequest, TenantConfigRequest, - TenantShardSplitRequest, TenantShardSplitResponse, - }, - shard::{ShardStripeSize, TenantShardId}, +use futures::StreamExt; +use pageserver_api::controller_api::{ + AvailabilityZone, NodeAvailabilityWrapper, NodeConfigureRequest, NodeDescribeResponse, + NodeRegisterRequest, NodeSchedulingPolicy, NodeShardResponse, PlacementPolicy, + SafekeeperDescribeResponse, SafekeeperSchedulingPolicyRequest, ShardSchedulingPolicy, + ShardsPreferredAzsRequest, ShardsPreferredAzsResponse, SkSchedulingPolicy, TenantCreateRequest, + TenantDescribeResponse, TenantPolicyRequest, TenantShardMigrateRequest, + TenantShardMigrateResponse, }; +use pageserver_api::models::{ + EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary, ShardParameters, + TenantConfig, TenantConfigPatchRequest, TenantConfigRequest, TenantShardSplitRequest, + TenantShardSplitResponse, +}; +use pageserver_api::shard::{ShardStripeSize, TenantShardId}; use pageserver_client::mgmt_api::{self}; use reqwest::{Method, StatusCode, Url}; -use utils::id::{NodeId, TenantId, TimelineId}; - -use pageserver_api::controller_api::{ - NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy, - TenantShardMigrateRequest, TenantShardMigrateResponse, -}; use storage_controller_client::control_api::Client; +use utils::id::{NodeId, TenantId, TimelineId}; #[derive(Subcommand, Debug)] enum Command { @@ -921,7 +914,9 @@ async fn main() -> anyhow::Result<()> { } Command::TenantDrop { tenant_id, unclean } => { if !unclean { - anyhow::bail!("This command is not a tenant deletion, and uncleanly drops all controller state for the tenant. If you know what you're doing, add `--unclean` to proceed.") + anyhow::bail!( + "This command is not a tenant deletion, and uncleanly drops all controller state for the tenant. If you know what you're doing, add `--unclean` to proceed." + ) } storcon_client .dispatch::<(), ()>( @@ -933,7 +928,9 @@ async fn main() -> anyhow::Result<()> { } Command::NodeDrop { node_id, unclean } => { if !unclean { - anyhow::bail!("This command is not a clean node decommission, and uncleanly drops all controller state for the node, without checking if any tenants still refer to it. If you know what you're doing, add `--unclean` to proceed.") + anyhow::bail!( + "This command is not a clean node decommission, and uncleanly drops all controller state for the node, without checking if any tenants still refer to it. If you know what you're doing, add `--unclean` to proceed." + ) } storcon_client .dispatch::<(), ()>(Method::POST, format!("debug/v1/node/{node_id}/drop"), None) diff --git a/docker-compose/docker-compose.yml b/docker-compose/docker-compose.yml index 489d60f38c..95d4ff7b2a 100644 --- a/docker-compose/docker-compose.yml +++ b/docker-compose/docker-compose.yml @@ -186,7 +186,7 @@ services: neon-test-extensions: profiles: ["test-extensions"] - image: ${REPOSITORY:-neondatabase}/neon-test-extensions-v${PG_TEST_VERSION:-16}:${TAG:-latest} + image: ${REPOSITORY:-neondatabase}/neon-test-extensions-v${PG_TEST_VERSION:-16}:${TEST_EXTENSIONS_TAG:-${TAG:-latest}} environment: - PGPASSWORD=cloud_admin entrypoint: diff --git a/docker-compose/docker_compose_test.sh b/docker-compose/docker_compose_test.sh index 5b3cfc74eb..0f03d600a3 100755 --- a/docker-compose/docker_compose_test.sh +++ b/docker-compose/docker_compose_test.sh @@ -51,8 +51,6 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do done if [ $pg_version -ge 16 ]; then - docker cp ext-src $TEST_CONTAINER_NAME:/ - docker exec $TEST_CONTAINER_NAME bash -c "apt update && apt install -y libtap-parser-sourcehandler-pgtap-perl" # This is required for the pg_hint_plan test, to prevent flaky log message causing the test to fail # It cannot be moved to Dockerfile now because the database directory is created after the start of the container echo Adding dummy config diff --git a/docker-compose/ext-src/pg_semver-src/test-upgrade-16.patch b/docker-compose/ext-src/pg_semver-src/test-upgrade-v16.patch similarity index 100% rename from docker-compose/ext-src/pg_semver-src/test-upgrade-16.patch rename to docker-compose/ext-src/pg_semver-src/test-upgrade-v16.patch diff --git a/docker-compose/ext-src/pg_semver-src/test-upgrade-17.patch b/docker-compose/ext-src/pg_semver-src/test-upgrade-v17.patch similarity index 100% rename from docker-compose/ext-src/pg_semver-src/test-upgrade-17.patch rename to docker-compose/ext-src/pg_semver-src/test-upgrade-v17.patch diff --git a/docker-compose/ext-src/pgtap-src/test-upgrade.patch b/docker-compose/ext-src/pgtap-src/test-upgrade.patch index a4c46e93ce..c050ab8d00 100644 --- a/docker-compose/ext-src/pgtap-src/test-upgrade.patch +++ b/docker-compose/ext-src/pgtap-src/test-upgrade.patch @@ -7,7 +7,7 @@ index f255fe6..0a0fa65 100644 GENERATED_SCHEDULE_DEPS = $(TB_DIR)/all_tests $(TB_DIR)/exclude_tests REGRESS = --schedule $(TB_DIR)/run.sch # Set this again just to be safe -REGRESS_OPTS = --inputdir=test --max-connections=$(PARALLEL_CONN) --schedule $(SETUP_SCH) $(REGRESS_CONF) -+REGRESS_OPTS = --use-existing --dbname=pgtap_regression --inputdir=test --max-connections=$(PARALLEL_CONN) --schedule $(SETUP_SCH) $(REGRESS_CONF) ++REGRESS_OPTS = --use-existing --dbname=contrib_regression --inputdir=test --max-connections=$(PARALLEL_CONN) --schedule $(SETUP_SCH) $(REGRESS_CONF) SETUP_SCH = test/schedule/main.sch # schedule to use for test setup; this can be forcibly changed by some targets! IGNORE_TESTS = $(notdir $(EXCLUDE_TEST_FILES:.sql=)) PARALLEL_TESTS = $(filter-out $(IGNORE_TESTS),$(filter-out $(SERIAL_TESTS),$(ALL_TESTS))) diff --git a/docker-compose/test_extensions_upgrade.sh b/docker-compose/test_extensions_upgrade.sh index 06d351b496..57c0182162 100755 --- a/docker-compose/test_extensions_upgrade.sh +++ b/docker-compose/test_extensions_upgrade.sh @@ -6,12 +6,13 @@ generate_id() { local -n resvar=$1 printf -v resvar '%08x%08x%08x%08x' $SRANDOM $SRANDOM $SRANDOM $SRANDOM } -if [ -z ${OLDTAG+x} ] || [ -z ${NEWTAG+x} ] || [ -z "${OLDTAG}" ] || [ -z "${NEWTAG}" ]; then - echo OLDTAG and NEWTAG must be defined +if [ -z ${OLD_COMPUTE_TAG+x} ] || [ -z ${NEW_COMPUTE_TAG+x} ] || [ -z "${OLD_COMPUTE_TAG}" ] || [ -z "${NEW_COMPUTE_TAG}" ]; then + echo OLD_COMPUTE_TAG and NEW_COMPUTE_TAG must be defined exit 1 fi export PG_VERSION=${PG_VERSION:-16} export PG_TEST_VERSION=${PG_VERSION} +# Waits for compute node is ready function wait_for_ready { TIME=0 while ! docker compose logs compute_is_ready | grep -q "accepting connections" && [ ${TIME} -le 300 ] ; do @@ -23,11 +24,45 @@ function wait_for_ready { exit 2 fi } +# Creates extensions. Gets a string with space-separated extensions as a parameter function create_extensions() { for ext in ${1}; do docker compose exec neon-test-extensions psql -X -v ON_ERROR_STOP=1 -d contrib_regression -c "CREATE EXTENSION IF NOT EXISTS ${ext} CASCADE" done } +# Creates a new timeline. Gets the parent ID and an extension name as parameters. +# Saves the timeline ID in the variable EXT_TIMELINE +function create_timeline() { + generate_id new_timeline_id + + PARAMS=( + -sbf + -X POST + -H "Content-Type: application/json" + -d "{\"new_timeline_id\": \"${new_timeline_id}\", \"pg_version\": ${PG_VERSION}, \"ancestor_timeline_id\": \"${1}\"}" + "http://127.0.0.1:9898/v1/tenant/${tenant_id}/timeline/" + ) + result=$(curl "${PARAMS[@]}") + echo $result | jq . + EXT_TIMELINE[${2}]=${new_timeline_id} +} +# Checks if the timeline ID of the compute node is expected. Gets the timeline ID as a parameter +function check_timeline() { + TID=$(docker compose exec neon-test-extensions psql -Aqt -c "SHOW neon.timeline_id") + if [ "${TID}" != "${1}" ]; then + echo Timeline mismatch + exit 1 + fi +} +# Restarts the compute node with the required compute tag and timeline. +# Accepts the tag for the compute node and the timeline as parameters. +function restart_compute() { + docker compose down compute compute_is_ready + COMPUTE_TAG=${1} TENANT_ID=${tenant_id} TIMELINE_ID=${2} docker compose up --quiet-pull -d --build compute compute_is_ready + wait_for_ready + check_timeline ${2} +} +declare -A EXT_TIMELINE EXTENSIONS='[ {"extname": "plv8", "extdir": "plv8-src"}, {"extname": "vector", "extdir": "pgvector-src"}, @@ -47,7 +82,7 @@ EXTENSIONS='[ {"extname": "pg_repack", "extdir": "pg_repack-src"} ]' EXTNAMES=$(echo ${EXTENSIONS} | jq -r '.[].extname' | paste -sd ' ' -) -TAG=${NEWTAG} docker compose --profile test-extensions up --quiet-pull --build -d +COMPUTE_TAG=${NEW_COMPUTE_TAG} TEST_EXTENSIONS_TAG=${NEW_COMPUTE_TAG} docker compose --profile test-extensions up --quiet-pull --build -d wait_for_ready docker compose exec neon-test-extensions psql -c "DROP DATABASE IF EXISTS contrib_regression" docker compose exec neon-test-extensions psql -c "CREATE DATABASE contrib_regression" @@ -55,13 +90,14 @@ create_extensions "${EXTNAMES}" query="select json_object_agg(extname,extversion) from pg_extension where extname in ('${EXTNAMES// /\',\'}')" new_vers=$(docker compose exec neon-test-extensions psql -Aqt -d contrib_regression -c "$query") docker compose --profile test-extensions down -TAG=${OLDTAG} docker compose --profile test-extensions up --quiet-pull --build -d --force-recreate +COMPUTE_TAG=${OLD_COMPUTE_TAG} TEST_EXTENSIONS_TAG=${NEW_COMPUTE_TAG} docker compose --profile test-extensions up --quiet-pull --build -d --force-recreate wait_for_ready -docker compose cp ext-src neon-test-extensions:/ docker compose exec neon-test-extensions psql -c "DROP DATABASE IF EXISTS contrib_regression" docker compose exec neon-test-extensions psql -c "CREATE DATABASE contrib_regression" -docker compose exec neon-test-extensions psql -c "CREATE DATABASE pgtap_regression" -docker compose exec neon-test-extensions psql -d pgtap_regression -c "CREATE EXTENSION pgtap" +tenant_id=$(docker compose exec neon-test-extensions psql -Aqt -c "SHOW neon.tenant_id") +EXT_TIMELINE["main"]=$(docker compose exec neon-test-extensions psql -Aqt -c "SHOW neon.timeline_id") +create_timeline "${EXT_TIMELINE["main"]}" init +restart_compute "${OLD_COMPUTE_TAG}" "${EXT_TIMELINE["init"]}" create_extensions "${EXTNAMES}" if [ "${FORCE_ALL_UPGRADE_TESTS:-false}" = true ]; then exts="${EXTNAMES}" @@ -72,29 +108,13 @@ fi if [ -z "${exts}" ]; then echo "No extensions were upgraded" else - tenant_id=$(docker compose exec neon-test-extensions psql -Aqt -c "SHOW neon.tenant_id") - timeline_id=$(docker compose exec neon-test-extensions psql -Aqt -c "SHOW neon.timeline_id") for ext in ${exts}; do echo Testing ${ext}... + create_timeline "${EXT_TIMELINE["main"]}" ${ext} EXTDIR=$(echo ${EXTENSIONS} | jq -r '.[] | select(.extname=="'${ext}'") | .extdir') - generate_id new_timeline_id - PARAMS=( - -sbf - -X POST - -H "Content-Type: application/json" - -d "{\"new_timeline_id\": \"${new_timeline_id}\", \"pg_version\": ${PG_VERSION}, \"ancestor_timeline_id\": \"${timeline_id}\"}" - "http://127.0.0.1:9898/v1/tenant/${tenant_id}/timeline/" - ) - result=$(curl "${PARAMS[@]}") - echo $result | jq . - TENANT_ID=${tenant_id} TIMELINE_ID=${new_timeline_id} TAG=${OLDTAG} docker compose down compute compute_is_ready - COMPUTE_TAG=${NEWTAG} TAG=${OLDTAG} TENANT_ID=${tenant_id} TIMELINE_ID=${new_timeline_id} docker compose up --quiet-pull -d --build compute compute_is_ready - wait_for_ready - TID=$(docker compose exec neon-test-extensions psql -Aqt -c "SHOW neon.timeline_id") - if [ ${TID} != ${new_timeline_id} ]; then - echo Timeline mismatch - exit 1 - fi + restart_compute "${OLD_COMPUTE_TAG}" "${EXT_TIMELINE[${ext}]}" + docker compose exec neon-test-extensions psql -d contrib_regression -c "CREATE EXTENSION ${ext} CASCADE" + restart_compute "${NEW_COMPUTE_TAG}" "${EXT_TIMELINE[${ext}]}" docker compose exec neon-test-extensions psql -d contrib_regression -c "\dx ${ext}" if ! docker compose exec neon-test-extensions sh -c /ext-src/${EXTDIR}/test-upgrade.sh; then docker compose exec neon-test-extensions cat /ext-src/${EXTDIR}/regression.diffs diff --git a/libs/compute_api/Cargo.toml b/libs/compute_api/Cargo.toml index c11a1b6688..0d1618c1b2 100644 --- a/libs/compute_api/Cargo.toml +++ b/libs/compute_api/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "compute_api" version = "0.1.0" -edition.workspace = true +edition = "2024" license.workspace = true [dependencies] diff --git a/libs/compute_api/src/requests.rs b/libs/compute_api/src/requests.rs index 0c256cae2e..3fbdfcf83f 100644 --- a/libs/compute_api/src/requests.rs +++ b/libs/compute_api/src/requests.rs @@ -1,11 +1,10 @@ //! Structs representing the JSON formats used in the compute_ctl's HTTP API. -use crate::{ - privilege::Privilege, - responses::ComputeCtlConfig, - spec::{ComputeSpec, ExtVersion, PgIdent}, -}; use serde::{Deserialize, Serialize}; +use crate::privilege::Privilege; +use crate::responses::ComputeCtlConfig; +use crate::spec::{ComputeSpec, ExtVersion, PgIdent}; + /// Request of the /configure API /// /// We now pass only `spec` in the configuration request, but later we can diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs index a6248019d9..35c580bd37 100644 --- a/libs/compute_api/src/responses.rs +++ b/libs/compute_api/src/responses.rs @@ -6,10 +6,8 @@ use chrono::{DateTime, Utc}; use jsonwebtoken::jwk::JwkSet; use serde::{Deserialize, Serialize, Serializer}; -use crate::{ - privilege::Privilege, - spec::{ComputeSpec, Database, ExtVersion, PgIdent, Role}, -}; +use crate::privilege::Privilege; +use crate::spec::{ComputeSpec, Database, ExtVersion, PgIdent, Role}; #[derive(Serialize, Debug, Deserialize)] pub struct GenericAPIError { diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 8fffae92fb..d02bfd6814 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -5,13 +5,12 @@ //! and connect it to the storage nodes. use std::collections::HashMap; +use regex::Regex; +use remote_storage::RemotePath; use serde::{Deserialize, Serialize}; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; -use regex::Regex; -use remote_storage::RemotePath; - /// String type alias representing Postgres identifier and /// intended to be used for DB / role names. pub type PgIdent = String; @@ -339,9 +338,10 @@ pub struct JwksSettings { #[cfg(test)] mod tests { - use super::*; use std::fs::File; + use super::*; + #[test] fn allow_installing_remote_extensions() { let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({ diff --git a/libs/consumption_metrics/Cargo.toml b/libs/consumption_metrics/Cargo.toml index 0e517e3856..77f130950e 100644 --- a/libs/consumption_metrics/Cargo.toml +++ b/libs/consumption_metrics/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "consumption_metrics" version = "0.1.0" -edition = "2021" +edition = "2024" license = "Apache-2.0" [dependencies] diff --git a/libs/desim/src/chan.rs b/libs/desim/src/chan.rs index 6661d59871..8882cd3b56 100644 --- a/libs/desim/src/chan.rs +++ b/libs/desim/src/chan.rs @@ -1,4 +1,5 @@ -use std::{collections::VecDeque, sync::Arc}; +use std::collections::VecDeque; +use std::sync::Arc; use parking_lot::{Mutex, MutexGuard}; diff --git a/libs/desim/src/executor.rs b/libs/desim/src/executor.rs index 9d44bd7741..df8b071c06 100644 --- a/libs/desim/src/executor.rs +++ b/libs/desim/src/executor.rs @@ -1,11 +1,7 @@ -use std::{ - panic::AssertUnwindSafe, - sync::{ - atomic::{AtomicBool, AtomicU32, AtomicU8, Ordering}, - mpsc, Arc, OnceLock, - }, - thread::JoinHandle, -}; +use std::panic::AssertUnwindSafe; +use std::sync::atomic::{AtomicBool, AtomicU8, AtomicU32, Ordering}; +use std::sync::{Arc, OnceLock, mpsc}; +use std::thread::JoinHandle; use tracing::{debug, error, trace}; diff --git a/libs/desim/src/network.rs b/libs/desim/src/network.rs index e15a714daa..cf096dba80 100644 --- a/libs/desim/src/network.rs +++ b/libs/desim/src/network.rs @@ -1,26 +1,19 @@ -use std::{ - cmp::Ordering, - collections::{BinaryHeap, VecDeque}, - fmt::{self, Debug}, - ops::DerefMut, - sync::{mpsc, Arc}, -}; +use std::cmp::Ordering; +use std::collections::{BinaryHeap, VecDeque}; +use std::fmt::{self, Debug}; +use std::ops::DerefMut; +use std::sync::{Arc, mpsc}; -use parking_lot::{ - lock_api::{MappedMutexGuard, MutexGuard}, - Mutex, RawMutex, -}; +use parking_lot::lock_api::{MappedMutexGuard, MutexGuard}; +use parking_lot::{Mutex, RawMutex}; use rand::rngs::StdRng; use tracing::debug; -use crate::{ - executor::{self, ThreadContext}, - options::NetworkOptions, - proto::NetEvent, - proto::NodeEvent, -}; - -use super::{chan::Chan, proto::AnyMessage}; +use super::chan::Chan; +use super::proto::AnyMessage; +use crate::executor::{self, ThreadContext}; +use crate::options::NetworkOptions; +use crate::proto::{NetEvent, NodeEvent}; pub struct NetworkTask { options: Arc, diff --git a/libs/desim/src/node_os.rs b/libs/desim/src/node_os.rs index 7744a9f5e1..e0cde7b284 100644 --- a/libs/desim/src/node_os.rs +++ b/libs/desim/src/node_os.rs @@ -2,14 +2,11 @@ use std::sync::Arc; use rand::Rng; +use super::chan::Chan; +use super::network::TCP; +use super::world::{Node, NodeId, World}; use crate::proto::NodeEvent; -use super::{ - chan::Chan, - network::TCP, - world::{Node, NodeId, World}, -}; - /// Abstraction with all functions (aka syscalls) available to the node. #[derive(Clone)] pub struct NodeOs { diff --git a/libs/desim/src/options.rs b/libs/desim/src/options.rs index 5da7c2c482..9b1a42fd28 100644 --- a/libs/desim/src/options.rs +++ b/libs/desim/src/options.rs @@ -1,4 +1,5 @@ -use rand::{rngs::StdRng, Rng}; +use rand::Rng; +use rand::rngs::StdRng; /// Describes random delays and failures. Delay will be uniformly distributed in [min, max]. /// Connection failure will occur with the probablity fail_prob. diff --git a/libs/desim/src/proto.rs b/libs/desim/src/proto.rs index 92a7e8a27d..31bc29e6a6 100644 --- a/libs/desim/src/proto.rs +++ b/libs/desim/src/proto.rs @@ -3,7 +3,8 @@ use std::fmt::Debug; use bytes::Bytes; use utils::lsn::Lsn; -use crate::{network::TCP, world::NodeId}; +use crate::network::TCP; +use crate::world::NodeId; /// Internal node events. #[derive(Debug)] diff --git a/libs/desim/src/time.rs b/libs/desim/src/time.rs index 7ce605bda8..350d182cc3 100644 --- a/libs/desim/src/time.rs +++ b/libs/desim/src/time.rs @@ -1,12 +1,8 @@ -use std::{ - cmp::Ordering, - collections::BinaryHeap, - ops::DerefMut, - sync::{ - atomic::{AtomicU32, AtomicU64}, - Arc, - }, -}; +use std::cmp::Ordering; +use std::collections::BinaryHeap; +use std::ops::DerefMut; +use std::sync::Arc; +use std::sync::atomic::{AtomicU32, AtomicU64}; use parking_lot::Mutex; use tracing::trace; diff --git a/libs/desim/src/world.rs b/libs/desim/src/world.rs index 7d60be04b5..576ba89cd7 100644 --- a/libs/desim/src/world.rs +++ b/libs/desim/src/world.rs @@ -1,19 +1,18 @@ +use std::ops::DerefMut; +use std::sync::{Arc, mpsc}; + use parking_lot::Mutex; -use rand::{rngs::StdRng, SeedableRng}; -use std::{ - ops::DerefMut, - sync::{mpsc, Arc}, -}; +use rand::SeedableRng; +use rand::rngs::StdRng; -use crate::{ - executor::{ExternalHandle, Runtime}, - network::NetworkTask, - options::NetworkOptions, - proto::{NodeEvent, SimEvent}, - time::Timing, -}; - -use super::{chan::Chan, network::TCP, node_os::NodeOs}; +use super::chan::Chan; +use super::network::TCP; +use super::node_os::NodeOs; +use crate::executor::{ExternalHandle, Runtime}; +use crate::network::NetworkTask; +use crate::options::NetworkOptions; +use crate::proto::{NodeEvent, SimEvent}; +use crate::time::Timing; pub type NodeId = u32; diff --git a/libs/desim/tests/reliable_copy_test.rs b/libs/desim/tests/reliable_copy_test.rs index cf7bff8f5a..1ddf9844de 100644 --- a/libs/desim/tests/reliable_copy_test.rs +++ b/libs/desim/tests/reliable_copy_test.rs @@ -1,14 +1,15 @@ //! Simple test to verify that simulator is working. #[cfg(test)] mod reliable_copy_test { + use std::sync::Arc; + use anyhow::Result; use desim::executor::{self, PollSome}; + use desim::node_os::NodeOs; use desim::options::{Delay, NetworkOptions}; - use desim::proto::{NetEvent, NodeEvent, ReplCell}; + use desim::proto::{AnyMessage, NetEvent, NodeEvent, ReplCell}; use desim::world::{NodeId, World}; - use desim::{node_os::NodeOs, proto::AnyMessage}; use parking_lot::Mutex; - use std::sync::Arc; use tracing::info; /// Disk storage trait and implementation. diff --git a/libs/http-utils/src/endpoint.rs b/libs/http-utils/src/endpoint.rs index be97b341d1..6128113580 100644 --- a/libs/http-utils/src/endpoint.rs +++ b/libs/http-utils/src/endpoint.rs @@ -1,30 +1,30 @@ -use crate::error::{api_error_handler, route_error_handler, ApiError}; -use crate::pprof; -use crate::request::{get_query_param, parse_query_param}; -use ::pprof::protos::Message as _; -use ::pprof::ProfilerGuardBuilder; -use anyhow::{anyhow, Context}; -use bytes::{Bytes, BytesMut}; -use hyper::header::{HeaderName, AUTHORIZATION, CONTENT_DISPOSITION}; -use hyper::http::HeaderValue; -use hyper::Method; -use hyper::{header::CONTENT_TYPE, Body, Request, Response}; -use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder}; -use once_cell::sync::Lazy; -use regex::Regex; -use routerify::ext::RequestExt; -use routerify::{Middleware, RequestInfo, Router, RouterBuilder}; -use tokio::sync::{mpsc, Mutex, Notify}; -use tokio_stream::wrappers::ReceiverStream; -use tokio_util::io::ReaderStream; -use tracing::{debug, info, info_span, warn, Instrument}; -use utils::auth::{AuthError, Claims, SwappableJwtAuth}; - use std::future::Future; use std::io::Write as _; use std::str::FromStr; use std::time::Duration; +use ::pprof::ProfilerGuardBuilder; +use ::pprof::protos::Message as _; +use anyhow::{Context, anyhow}; +use bytes::{Bytes, BytesMut}; +use hyper::header::{AUTHORIZATION, CONTENT_DISPOSITION, CONTENT_TYPE, HeaderName}; +use hyper::http::HeaderValue; +use hyper::{Body, Method, Request, Response}; +use metrics::{Encoder, IntCounter, TextEncoder, register_int_counter}; +use once_cell::sync::Lazy; +use regex::Regex; +use routerify::ext::RequestExt; +use routerify::{Middleware, RequestInfo, Router, RouterBuilder}; +use tokio::sync::{Mutex, Notify, mpsc}; +use tokio_stream::wrappers::ReceiverStream; +use tokio_util::io::ReaderStream; +use tracing::{Instrument, debug, info, info_span, warn}; +use utils::auth::{AuthError, Claims, SwappableJwtAuth}; + +use crate::error::{ApiError, api_error_handler, route_error_handler}; +use crate::pprof; +use crate::request::{get_query_param, parse_query_param}; + static SERVE_METRICS_COUNT: Lazy = Lazy::new(|| { register_int_counter!( "libmetrics_metric_handler_requests_total", @@ -375,7 +375,7 @@ pub async fn profile_cpu_handler(req: Request) -> Result, A Err(_) => { return Err(ApiError::Conflict( "profiler already running (use ?force=true to cancel it)".into(), - )) + )); } } tokio::time::sleep(Duration::from_millis(1)).await; // don't busy-wait @@ -539,8 +539,8 @@ pub async fn profile_heap_handler(req: Request) -> Result, } } -pub fn add_request_id_middleware( -) -> Middleware { +pub fn add_request_id_middleware() +-> Middleware { Middleware::pre(move |req| async move { let request_id = match req.headers().get(&X_REQUEST_ID_HEADER) { Some(request_id) => request_id @@ -664,7 +664,7 @@ pub fn auth_middleware( None => { return Err(ApiError::Unauthorized( "missing authorization header".to_string(), - )) + )); } } } @@ -717,12 +717,14 @@ pub fn check_permission_with( #[cfg(test)] mod tests { - use super::*; - use hyper::service::Service; - use routerify::RequestServiceBuilder; use std::future::poll_fn; use std::net::{IpAddr, SocketAddr}; + use hyper::service::Service; + use routerify::RequestServiceBuilder; + + use super::*; + #[tokio::test] async fn test_request_id_returned() { let builder = RequestServiceBuilder::new(make_router().build().unwrap()).unwrap(); diff --git a/libs/http-utils/src/error.rs b/libs/http-utils/src/error.rs index 746305caec..f790dc26ca 100644 --- a/libs/http-utils/src/error.rs +++ b/libs/http-utils/src/error.rs @@ -1,10 +1,10 @@ -use hyper::{header, Body, Response, StatusCode}; -use serde::{Deserialize, Serialize}; use std::borrow::Cow; use std::error::Error as StdError; + +use hyper::{Body, Response, StatusCode, header}; +use serde::{Deserialize, Serialize}; use thiserror::Error; use tracing::{error, info, warn}; - use utils::auth::AuthError; #[derive(Debug, Error)] diff --git a/libs/http-utils/src/failpoints.rs b/libs/http-utils/src/failpoints.rs index 8a1e0c8cf0..984823f4a9 100644 --- a/libs/http-utils/src/failpoints.rs +++ b/libs/http-utils/src/failpoints.rs @@ -1,12 +1,11 @@ -use crate::error::ApiError; -use crate::json::{json_request, json_response}; - use hyper::{Body, Request, Response, StatusCode}; use serde::{Deserialize, Serialize}; use tokio_util::sync::CancellationToken; - use utils::failpoint_support::apply_failpoint; +use crate::error::ApiError; +use crate::json::{json_request, json_response}; + pub type ConfigureFailpointsRequest = Vec; /// Information for configuring a single fail point diff --git a/libs/http-utils/src/json.rs b/libs/http-utils/src/json.rs index e53231f313..14ebac91e6 100644 --- a/libs/http-utils/src/json.rs +++ b/libs/http-utils/src/json.rs @@ -1,6 +1,6 @@ use anyhow::Context; use bytes::Buf; -use hyper::{header, Body, Request, Response, StatusCode}; +use hyper::{Body, Request, Response, StatusCode, header}; use serde::{Deserialize, Serialize}; use super::error::ApiError; diff --git a/libs/http-utils/src/lib.rs b/libs/http-utils/src/lib.rs index ae6a27aaa8..c692a54257 100644 --- a/libs/http-utils/src/lib.rs +++ b/libs/http-utils/src/lib.rs @@ -9,4 +9,4 @@ extern crate hyper0 as hyper; /// Current fast way to apply simple http routing in various Neon binaries. /// Re-exported for sake of uniform approach, that could be later replaced with better alternatives, if needed. -pub use routerify::{ext::RequestExt, RouterBuilder, RouterService}; +pub use routerify::{RouterBuilder, RouterService, ext::RequestExt}; diff --git a/libs/http-utils/src/pprof.rs b/libs/http-utils/src/pprof.rs index fe1cc10838..529017f350 100644 --- a/libs/http-utils/src/pprof.rs +++ b/libs/http-utils/src/pprof.rs @@ -1,15 +1,15 @@ -use anyhow::bail; -use flate2::write::{GzDecoder, GzEncoder}; -use flate2::Compression; -use itertools::Itertools as _; -use pprof::protos::{Function, Line, Location, Message as _, Profile}; -use regex::Regex; - use std::borrow::Cow; use std::collections::{HashMap, HashSet}; use std::ffi::c_void; use std::io::Write as _; +use anyhow::bail; +use flate2::Compression; +use flate2::write::{GzDecoder, GzEncoder}; +use itertools::Itertools as _; +use pprof::protos::{Function, Line, Location, Message as _, Profile}; +use regex::Regex; + /// Decodes a gzip-compressed Protobuf-encoded pprof profile. pub fn decode(bytes: &[u8]) -> anyhow::Result { let mut gz = GzDecoder::new(Vec::new()); diff --git a/libs/http-utils/src/request.rs b/libs/http-utils/src/request.rs index 7ea71685ec..9024a90a82 100644 --- a/libs/http-utils/src/request.rs +++ b/libs/http-utils/src/request.rs @@ -1,10 +1,13 @@ use core::fmt; -use std::{borrow::Cow, str::FromStr}; +use std::borrow::Cow; +use std::str::FromStr; + +use anyhow::anyhow; +use hyper::body::HttpBody; +use hyper::{Body, Request}; +use routerify::ext::RequestExt; use super::error::ApiError; -use anyhow::anyhow; -use hyper::{body::HttpBody, Body, Request}; -use routerify::ext::RequestExt; pub fn get_request_param<'a>( request: &'a Request, diff --git a/libs/metrics/src/hll.rs b/libs/metrics/src/hll.rs index 723916a742..93f6a2b7cc 100644 --- a/libs/metrics/src/hll.rs +++ b/libs/metrics/src/hll.rs @@ -6,17 +6,15 @@ //! Probabilistic cardinality estimators, such as the HyperLogLog algorithm, //! use significantly less memory than this, but can only approximate the cardinality. -use std::{ - hash::{BuildHasher, BuildHasherDefault, Hash}, - sync::atomic::AtomicU8, -}; +use std::hash::{BuildHasher, BuildHasherDefault, Hash}; +use std::sync::atomic::AtomicU8; -use measured::{ - label::{LabelGroupVisitor, LabelName, LabelValue, LabelVisitor}, - metric::{counter::CounterState, name::MetricNameEncoder, Metric, MetricType, MetricVec}, - text::TextEncoder, - LabelGroup, -}; +use measured::LabelGroup; +use measured::label::{LabelGroupVisitor, LabelName, LabelValue, LabelVisitor}; +use measured::metric::counter::CounterState; +use measured::metric::name::MetricNameEncoder; +use measured::metric::{Metric, MetricType, MetricVec}; +use measured::text::TextEncoder; use twox_hash::xxh3; /// Create an [`HyperLogLogVec`] and registers to default registry. @@ -27,9 +25,7 @@ macro_rules! register_hll_vec { $crate::register(Box::new(hll_vec.clone())).map(|_| hll_vec) }}; - ($N:literal, $NAME:expr, $HELP:expr, $LABELS_NAMES:expr $(,)?) => {{ - $crate::register_hll_vec!($N, $crate::opts!($NAME, $HELP), $LABELS_NAMES) - }}; + ($N:literal, $NAME:expr, $HELP:expr, $LABELS_NAMES:expr $(,)?) => {{ $crate::register_hll_vec!($N, $crate::opts!($NAME, $HELP), $LABELS_NAMES) }}; } /// Create an [`HyperLogLog`] and registers to default registry. @@ -40,9 +36,7 @@ macro_rules! register_hll { $crate::register(Box::new(hll.clone())).map(|_| hll) }}; - ($N:literal, $NAME:expr, $HELP:expr $(,)?) => {{ - $crate::register_hll!($N, $crate::opts!($NAME, $HELP)) - }}; + ($N:literal, $NAME:expr, $HELP:expr $(,)?) => {{ $crate::register_hll!($N, $crate::opts!($NAME, $HELP)) }}; } /// HLL is a probabilistic cardinality measure. @@ -195,8 +189,10 @@ impl measured::metric::MetricEncoding); diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs index 0f6c2a0937..4df8d7bc51 100644 --- a/libs/metrics/src/lib.rs +++ b/libs/metrics/src/lib.rs @@ -4,38 +4,26 @@ //! a default registry. #![deny(clippy::undocumented_unsafe_blocks)] -use measured::{ - label::{LabelGroupSet, LabelGroupVisitor, LabelName, NoLabels}, - metric::{ - counter::CounterState, - gauge::GaugeState, - group::Encoding, - name::{MetricName, MetricNameEncoder}, - MetricEncoding, MetricFamilyEncoding, - }, - FixedCardinalityLabel, LabelGroup, MetricGroup, -}; +use measured::label::{LabelGroupSet, LabelGroupVisitor, LabelName, NoLabels}; +use measured::metric::counter::CounterState; +use measured::metric::gauge::GaugeState; +use measured::metric::group::Encoding; +use measured::metric::name::{MetricName, MetricNameEncoder}; +use measured::metric::{MetricEncoding, MetricFamilyEncoding}; +use measured::{FixedCardinalityLabel, LabelGroup, MetricGroup}; use once_cell::sync::Lazy; +use prometheus::Registry; use prometheus::core::{ Atomic, AtomicU64, Collector, GenericCounter, GenericCounterVec, GenericGauge, GenericGaugeVec, }; pub use prometheus::local::LocalHistogram; -pub use prometheus::opts; -pub use prometheus::register; -pub use prometheus::Error; -use prometheus::Registry; -pub use prometheus::{core, default_registry, proto}; -pub use prometheus::{exponential_buckets, linear_buckets}; -pub use prometheus::{register_counter_vec, Counter, CounterVec}; -pub use prometheus::{register_gauge, Gauge}; -pub use prometheus::{register_gauge_vec, GaugeVec}; -pub use prometheus::{register_histogram, Histogram}; -pub use prometheus::{register_histogram_vec, HistogramVec}; -pub use prometheus::{register_int_counter, IntCounter}; -pub use prometheus::{register_int_counter_vec, IntCounterVec}; -pub use prometheus::{register_int_gauge, IntGauge}; -pub use prometheus::{register_int_gauge_vec, IntGaugeVec}; -pub use prometheus::{Encoder, TextEncoder}; +pub use prometheus::{ + Counter, CounterVec, Encoder, Error, Gauge, GaugeVec, Histogram, HistogramVec, IntCounter, + IntCounterVec, IntGauge, IntGaugeVec, TextEncoder, core, default_registry, exponential_buckets, + linear_buckets, opts, proto, register, register_counter_vec, register_gauge, + register_gauge_vec, register_histogram, register_histogram_vec, register_int_counter, + register_int_counter_vec, register_int_gauge, register_int_gauge_vec, +}; pub mod launch_timestamp; mod wrappers; diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml index 79da05da6c..87dfdfb5ec 100644 --- a/libs/pageserver_api/Cargo.toml +++ b/libs/pageserver_api/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "pageserver_api" version = "0.1.0" -edition.workspace = true +edition = "2024" license.workspace = true [features] diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 1aff5a7012..039cc1319e 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -9,19 +9,18 @@ pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898; pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}"); +use std::collections::HashMap; +use std::num::{NonZeroU64, NonZeroUsize}; +use std::str::FromStr; +use std::time::Duration; + use postgres_backend::AuthType; use remote_storage::RemoteStorageConfig; use serde_with::serde_as; -use std::{ - collections::HashMap, - num::{NonZeroU64, NonZeroUsize}, - str::FromStr, - time::Duration, -}; -use utils::{logging::LogFormat, postgres_client::PostgresClientProtocol}; +use utils::logging::LogFormat; +use utils::postgres_client::PostgresClientProtocol; -use crate::models::ImageCompressionAlgorithm; -use crate::models::LsnLease; +use crate::models::{ImageCompressionAlgorithm, LsnLease}; // Certain metadata (e.g. externally-addressable name, AZ) is delivered // as a separate structure. This information is not neeed by the pageserver @@ -367,10 +366,10 @@ pub struct TenantConfigToml { } pub mod defaults { - use crate::models::ImageCompressionAlgorithm; - pub use storage_broker::DEFAULT_ENDPOINT as BROKER_DEFAULT_ENDPOINT; + use crate::models::ImageCompressionAlgorithm; + pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "300 s"; pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s"; @@ -584,7 +583,7 @@ pub mod tenant_conf_defaults { // image layers should be created. pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2; pub const DEFAULT_GC_COMPACTION_ENABLED: bool = false; - pub const DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB: u64 = 10240000; + pub const DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB: u64 = 5 * 1024 * 1024; // 5GB pub const DEFAULT_GC_COMPACTION_RATIO_PERCENT: u64 = 100; } diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index f94bfab581..2cfe1a85f9 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -9,11 +9,8 @@ use std::time::{Duration, Instant}; use serde::{Deserialize, Serialize}; use utils::id::{NodeId, TenantId}; -use crate::models::PageserverUtilization; -use crate::{ - models::{ShardParameters, TenantConfig}, - shard::{ShardStripeSize, TenantShardId}, -}; +use crate::models::{PageserverUtilization, ShardParameters, TenantConfig}; +use crate::shard::{ShardStripeSize, TenantShardId}; #[derive(Serialize, Deserialize, Debug)] #[serde(deny_unknown_fields)] @@ -354,7 +351,7 @@ impl FromStr for SkSchedulingPolicy { _ => { return Err(anyhow::anyhow!( "Unknown scheduling policy '{s}', try active,pause,decomissioned" - )) + )); } }) } @@ -457,9 +454,10 @@ pub struct SafekeeperSchedulingPolicyRequest { #[cfg(test)] mod test { - use super::*; use serde_json; + use super::*; + /// Check stability of PlacementPolicy's serialization #[test] fn placement_policy_encoding() -> anyhow::Result<()> { diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index b88a2e46a1..8836e7ec87 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -1,11 +1,12 @@ -use anyhow::{bail, Result}; -use byteorder::{ByteOrder, BE}; +use std::fmt; +use std::ops::Range; + +use anyhow::{Result, bail}; +use byteorder::{BE, ByteOrder}; use bytes::Bytes; use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; -use postgres_ffi::Oid; -use postgres_ffi::RepOriginId; +use postgres_ffi::{Oid, RepOriginId}; use serde::{Deserialize, Serialize}; -use std::{fmt, ops::Range}; use utils::const_assert; use crate::reltag::{BlockNumber, RelTag, SlruKind}; @@ -954,25 +955,22 @@ impl std::str::FromStr for Key { mod tests { use std::str::FromStr; - use crate::key::is_metadata_key_slice; - use crate::key::Key; - - use rand::Rng; - use rand::SeedableRng; + use rand::{Rng, SeedableRng}; use super::AUX_KEY_PREFIX; + use crate::key::{Key, is_metadata_key_slice}; #[test] fn display_fromstr_bijection() { let mut rng = rand::rngs::StdRng::seed_from_u64(42); let key = Key { - field1: rng.gen(), - field2: rng.gen(), - field3: rng.gen(), - field4: rng.gen(), - field5: rng.gen(), - field6: rng.gen(), + field1: rng.r#gen(), + field2: rng.r#gen(), + field3: rng.r#gen(), + field4: rng.r#gen(), + field5: rng.r#gen(), + field6: rng.r#gen(), }; assert_eq!(key, Key::from_str(&format!("{key}")).unwrap()); diff --git a/libs/pageserver_api/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs index c55b9e9484..e505f23e49 100644 --- a/libs/pageserver_api/src/keyspace.rs +++ b/libs/pageserver_api/src/keyspace.rs @@ -1,11 +1,10 @@ -use postgres_ffi::BLCKSZ; use std::ops::Range; -use crate::{ - key::Key, - shard::{ShardCount, ShardIdentity}, -}; use itertools::Itertools; +use postgres_ffi::BLCKSZ; + +use crate::key::Key; +use crate::shard::{ShardCount, ShardIdentity}; /// /// Represents a set of Keys, in a compact form. @@ -609,15 +608,13 @@ pub fn singleton_range(key: Key) -> Range { #[cfg(test)] mod tests { + use std::fmt::Write; + use rand::{RngCore, SeedableRng}; - use crate::{ - models::ShardParameters, - shard::{ShardCount, ShardNumber}, - }; - use super::*; - use std::fmt::Write; + use crate::models::ShardParameters; + use crate::shard::{ShardCount, ShardNumber}; // Helper function to create a key range. // diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 1164048229..ea565e7769 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -2,38 +2,30 @@ pub mod detach_ancestor; pub mod partitioning; pub mod utilization; -#[cfg(feature = "testing")] -use camino::Utf8PathBuf; -pub use utilization::PageserverUtilization; - use core::ops::Range; -use std::{ - collections::HashMap, - fmt::Display, - io::{BufRead, Read}, - num::{NonZeroU32, NonZeroU64, NonZeroUsize}, - str::FromStr, - time::{Duration, SystemTime}, -}; +use std::collections::HashMap; +use std::fmt::Display; +use std::io::{BufRead, Read}; +use std::num::{NonZeroU32, NonZeroU64, NonZeroUsize}; +use std::str::FromStr; +use std::time::{Duration, SystemTime}; use byteorder::{BigEndian, ReadBytesExt}; +use bytes::{Buf, BufMut, Bytes, BytesMut}; +#[cfg(feature = "testing")] +use camino::Utf8PathBuf; use postgres_ffi::BLCKSZ; use serde::{Deserialize, Deserializer, Serialize, Serializer}; use serde_with::serde_as; -use utils::{ - completion, - id::{NodeId, TenantId, TimelineId}, - lsn::Lsn, - postgres_client::PostgresClientProtocol, - serde_system_time, -}; +pub use utilization::PageserverUtilization; +use utils::id::{NodeId, TenantId, TimelineId}; +use utils::lsn::Lsn; +use utils::postgres_client::PostgresClientProtocol; +use utils::{completion, serde_system_time}; -use crate::{ - key::{CompactKey, Key}, - reltag::RelTag, - shard::{ShardCount, ShardStripeSize, TenantShardId}, -}; -use bytes::{Buf, BufMut, Bytes, BytesMut}; +use crate::key::{CompactKey, Key}; +use crate::reltag::RelTag; +use crate::shard::{ShardCount, ShardStripeSize, TenantShardId}; /// The state of a tenant in this pageserver. /// @@ -332,7 +324,8 @@ pub struct ImportPgdataIdempotencyKey(pub String); impl ImportPgdataIdempotencyKey { pub fn random() -> Self { - use rand::{distributions::Alphanumeric, Rng}; + use rand::Rng; + use rand::distributions::Alphanumeric; Self( rand::thread_rng() .sample_iter(&Alphanumeric) @@ -2288,9 +2281,10 @@ impl Default for PageTraceEvent { #[cfg(test)] mod tests { - use serde_json::json; use std::str::FromStr; + use serde_json::json; + use super::*; #[test] diff --git a/libs/pageserver_api/src/models/utilization.rs b/libs/pageserver_api/src/models/utilization.rs index 641aa51989..69c240ff3c 100644 --- a/libs/pageserver_api/src/models/utilization.rs +++ b/libs/pageserver_api/src/models/utilization.rs @@ -1,5 +1,7 @@ use std::time::SystemTime; -use utils::{serde_percent::Percent, serde_system_time}; + +use utils::serde_percent::Percent; +use utils::serde_system_time; /// Pageserver current utilization and scoring for how good candidate the pageserver would be for /// the next tenant. @@ -131,12 +133,12 @@ impl PageserverUtilization { /// Test helper pub mod test_utilization { - use super::PageserverUtilization; use std::time::SystemTime; - use utils::{ - serde_percent::Percent, - serde_system_time::{self}, - }; + + use utils::serde_percent::Percent; + use utils::serde_system_time::{self}; + + use super::PageserverUtilization; // Parameters of the imaginary node used for test utilization instances const TEST_DISK_SIZE: u64 = 1024 * 1024 * 1024 * 1024; diff --git a/libs/pageserver_api/src/record.rs b/libs/pageserver_api/src/record.rs index bb62b35d36..fda504a26e 100644 --- a/libs/pageserver_api/src/record.rs +++ b/libs/pageserver_api/src/record.rs @@ -1,7 +1,7 @@ //! This module defines the WAL record format used within the pageserver. use bytes::Bytes; -use postgres_ffi::walrecord::{describe_postgres_wal_record, MultiXactMember}; +use postgres_ffi::walrecord::{MultiXactMember, describe_postgres_wal_record}; use postgres_ffi::{MultiXactId, MultiXactOffset, TimestampTz, TransactionId}; use serde::{Deserialize, Serialize}; use utils::bin_ser::DeserializeError; diff --git a/libs/pageserver_api/src/reltag.rs b/libs/pageserver_api/src/reltag.rs index 09d1fae221..473a44dbf9 100644 --- a/libs/pageserver_api/src/reltag.rs +++ b/libs/pageserver_api/src/reltag.rs @@ -1,10 +1,10 @@ -use serde::{Deserialize, Serialize}; use std::cmp::Ordering; use std::fmt; -use postgres_ffi::pg_constants::GLOBALTABLESPACE_OID; -use postgres_ffi::relfile_utils::{forkname_to_number, forknumber_to_name, MAIN_FORKNUM}; use postgres_ffi::Oid; +use postgres_ffi::pg_constants::GLOBALTABLESPACE_OID; +use postgres_ffi::relfile_utils::{MAIN_FORKNUM, forkname_to_number, forknumber_to_name}; +use serde::{Deserialize, Serialize}; /// /// Relation data file segment id throughout the Postgres cluster. diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index e03df02afb..eca04b1f3d 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -33,12 +33,13 @@ use std::hash::{Hash, Hasher}; -use crate::{key::Key, models::ShardParameters}; +#[doc(inline)] +pub use ::utils::shard::*; use postgres_ffi::relfile_utils::INIT_FORKNUM; use serde::{Deserialize, Serialize}; -#[doc(inline)] -pub use ::utils::shard::*; +use crate::key::Key; +use crate::models::ShardParameters; /// The ShardIdentity contains enough information to map a [`Key`] to a [`ShardNumber`], /// and to check whether that [`ShardNumber`] is the same as the current shard. @@ -337,7 +338,8 @@ pub fn describe( mod tests { use std::str::FromStr; - use utils::{id::TenantId, Hex}; + use utils::Hex; + use utils::id::TenantId; use super::*; diff --git a/libs/pageserver_api/src/upcall_api.rs b/libs/pageserver_api/src/upcall_api.rs index 2e88836bd0..647d01c3c2 100644 --- a/libs/pageserver_api/src/upcall_api.rs +++ b/libs/pageserver_api/src/upcall_api.rs @@ -6,9 +6,9 @@ use serde::{Deserialize, Serialize}; use utils::id::NodeId; -use crate::{ - controller_api::NodeRegisterRequest, models::LocationConfigMode, shard::TenantShardId, -}; +use crate::controller_api::NodeRegisterRequest; +use crate::models::LocationConfigMode; +use crate::shard::TenantShardId; /// Upcall message sent by the pageserver to the configured `control_plane_api` on /// startup. @@ -30,7 +30,7 @@ fn default_mode() -> LocationConfigMode { pub struct ReAttachResponseTenant { pub id: TenantShardId, /// Mandatory if LocationConfigMode is None or set to an Attached* mode - pub gen: Option, + pub r#gen: Option, /// Default value only for backward compat: this field should be set #[serde(default = "default_mode")] @@ -44,7 +44,7 @@ pub struct ReAttachResponse { #[derive(Serialize, Deserialize)] pub struct ValidateRequestTenant { pub id: TenantShardId, - pub gen: u32, + pub r#gen: u32, } #[derive(Serialize, Deserialize)] diff --git a/libs/pageserver_api/src/value.rs b/libs/pageserver_api/src/value.rs index 1f8ed30a9a..883d903ff3 100644 --- a/libs/pageserver_api/src/value.rs +++ b/libs/pageserver_api/src/value.rs @@ -7,10 +7,11 @@ //! Note that the [`Value`] type is used for the permananent storage format, so any //! changes to it must be backwards compatible. -use crate::record::NeonWalRecord; use bytes::Bytes; use serde::{Deserialize, Serialize}; +use crate::record::NeonWalRecord; + #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] pub enum Value { /// An Image value contains a full copy of the value @@ -83,11 +84,11 @@ impl ValueBytes { #[cfg(test)] mod test { - use super::*; - use bytes::Bytes; use utils::bin_ser::BeSer; + use super::*; + macro_rules! roundtrip { ($orig:expr, $expected:expr) => {{ let orig: Value = $orig; diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index f74b229ac4..a0a891f0dc 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -4,28 +4,28 @@ //! is rather narrow, but we can extend it once required. #![deny(unsafe_code)] #![deny(clippy::undocumented_unsafe_blocks)] -use anyhow::Context; -use bytes::Bytes; -use serde::{Deserialize, Serialize}; +use std::future::Future; use std::io::ErrorKind; use std::net::SocketAddr; -use std::os::fd::AsRawFd; -use std::os::fd::RawFd; +use std::os::fd::{AsRawFd, RawFd}; use std::pin::Pin; +use std::str::FromStr; use std::sync::Arc; -use std::task::{ready, Poll}; +use std::task::{Poll, ready}; use std::{fmt, io}; -use std::{future::Future, str::FromStr}; -use tokio::io::{AsyncRead, AsyncWrite}; -use tokio_rustls::TlsAcceptor; -use tokio_util::sync::CancellationToken; -use tracing::{debug, error, info, trace, warn}; +use anyhow::Context; +use bytes::Bytes; use pq_proto::framed::{ConnectionError, Framed, FramedReader, FramedWriter}; use pq_proto::{ BeMessage, FeMessage, FeStartupPacket, ProtocolError, SQLSTATE_ADMIN_SHUTDOWN, SQLSTATE_INTERNAL_ERROR, SQLSTATE_SUCCESSFUL_COMPLETION, }; +use serde::{Deserialize, Serialize}; +use tokio::io::{AsyncRead, AsyncWrite}; +use tokio_rustls::TlsAcceptor; +use tokio_util::sync::CancellationToken; +use tracing::{debug, error, info, trace, warn}; /// An error, occurred during query processing: /// either during the connection ([`ConnectionError`]) or before/after it. @@ -746,7 +746,7 @@ impl PostgresBackend { match e { QueryError::Shutdown => return Ok(ProcessMsgResult::Break), QueryError::SimulatedConnectionError => { - return Err(QueryError::SimulatedConnectionError) + return Err(QueryError::SimulatedConnectionError); } err @ QueryError::Reconnect => { // Instruct the client to reconnect, stop processing messages @@ -1020,7 +1020,9 @@ fn log_query_error(query: &str, e: &QueryError) { } } QueryError::Disconnected(other_connection_error) => { - error!("query handler for '{query}' failed with connection error: {other_connection_error:?}") + error!( + "query handler for '{query}' failed with connection error: {other_connection_error:?}" + ) } QueryError::SimulatedConnectionError => { error!("query handler for query '{query}' failed due to a simulated connection error") diff --git a/libs/postgres_backend/tests/simple_select.rs b/libs/postgres_backend/tests/simple_select.rs index 3fcfbf4a03..907ef9eed3 100644 --- a/libs/postgres_backend/tests/simple_select.rs +++ b/libs/postgres_backend/tests/simple_select.rs @@ -1,10 +1,11 @@ +use std::io::Cursor; +use std::sync::Arc; + /// Test postgres_backend_async with tokio_postgres use once_cell::sync::Lazy; use postgres_backend::{AuthType, Handler, PostgresBackend, QueryError}; use pq_proto::{BeMessage, RowDescriptor}; use rustls::crypto::ring; -use std::io::Cursor; -use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite}; use tokio::net::{TcpListener, TcpStream}; use tokio_postgres::config::SslMode; diff --git a/libs/postgres_connection/src/lib.rs b/libs/postgres_connection/src/lib.rs index e3d31c6cfc..cd981b3729 100644 --- a/libs/postgres_connection/src/lib.rs +++ b/libs/postgres_connection/src/lib.rs @@ -1,9 +1,10 @@ #![deny(unsafe_code)] #![deny(clippy::undocumented_unsafe_blocks)] -use anyhow::{bail, Context}; -use itertools::Itertools; use std::borrow::Cow; use std::fmt; + +use anyhow::{Context, bail}; +use itertools::Itertools; use url::Host; /// Parses a string of format either `host:port` or `host` into a corresponding pair. @@ -29,9 +30,10 @@ pub fn parse_host_port>(host_port: S) -> Result<(Host, Option #[cfg(test)] mod tests_parse_host_port { - use crate::parse_host_port; use url::Host; + use crate::parse_host_port; + #[test] fn test_normal() { let (host, port) = parse_host_port("hello:123").unwrap(); @@ -207,10 +209,11 @@ impl fmt::Debug for PgConnectionConfig { #[cfg(test)] mod tests_pg_connection_config { - use crate::PgConnectionConfig; use once_cell::sync::Lazy; use url::Host; + use crate::PgConnectionConfig; + static STUB_HOST: Lazy = Lazy::new(|| Host::Domain("stub.host.example".to_owned())); #[test] diff --git a/libs/postgres_ffi/benches/waldecoder.rs b/libs/postgres_ffi/benches/waldecoder.rs index c8cf0d322a..2e1d62e452 100644 --- a/libs/postgres_ffi/benches/waldecoder.rs +++ b/libs/postgres_ffi/benches/waldecoder.rs @@ -1,6 +1,6 @@ use std::ffi::CStr; -use criterion::{criterion_group, criterion_main, Bencher, Criterion}; +use criterion::{Bencher, Criterion, criterion_group, criterion_main}; use postgres_ffi::v17::wal_generator::LogicalMessageGenerator; use postgres_ffi::v17::waldecoder_handler::WalStreamDecoderHandler; use postgres_ffi::waldecoder::WalStreamDecoder; diff --git a/libs/postgres_ffi/build.rs b/libs/postgres_ffi/build.rs index d3a85f2683..cdebd43f6f 100644 --- a/libs/postgres_ffi/build.rs +++ b/libs/postgres_ffi/build.rs @@ -4,7 +4,7 @@ use std::env; use std::path::PathBuf; use std::process::Command; -use anyhow::{anyhow, Context}; +use anyhow::{Context, anyhow}; use bindgen::callbacks::{DeriveInfo, ParseCallbacks}; #[derive(Debug)] diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs index 301bc2f16e..8dfd8d8750 100644 --- a/libs/postgres_ffi/src/lib.rs +++ b/libs/postgres_ffi/src/lib.rs @@ -21,7 +21,9 @@ macro_rules! postgres_ffi { pub mod bindings { // bindgen generates bindings for a lot of stuff we don't need #![allow(dead_code)] + #![allow(unsafe_op_in_unsafe_fn)] #![allow(clippy::undocumented_unsafe_blocks)] + #![allow(clippy::ptr_offset_with_cast)] use serde::{Deserialize, Serialize}; include!(concat!( @@ -43,8 +45,7 @@ macro_rules! postgres_ffi { pub const PG_MAJORVERSION: &str = stringify!($version); // Re-export some symbols from bindings - pub use bindings::DBState_DB_SHUTDOWNED; - pub use bindings::{CheckPoint, ControlFileData, XLogRecord}; + pub use bindings::{CheckPoint, ControlFileData, DBState_DB_SHUTDOWNED, XLogRecord}; pub const ZERO_CHECKPOINT: bytes::Bytes = bytes::Bytes::from_static(&[0u8; xlog_utils::SIZEOF_CHECKPOINT]); @@ -221,21 +222,17 @@ pub mod relfile_utils; pub mod walrecord; // Export some widely used datatypes that are unlikely to change across Postgres versions -pub use v14::bindings::RepOriginId; -pub use v14::bindings::{uint32, uint64, Oid}; -pub use v14::bindings::{BlockNumber, OffsetNumber}; -pub use v14::bindings::{MultiXactId, TransactionId}; -pub use v14::bindings::{TimeLineID, TimestampTz, XLogRecPtr, XLogSegNo}; - +pub use v14::bindings::{ + BlockNumber, CheckPoint, ControlFileData, MultiXactId, OffsetNumber, Oid, PageHeaderData, + RepOriginId, TimeLineID, TimestampTz, TransactionId, XLogRecPtr, XLogRecord, XLogSegNo, uint32, + uint64, +}; // Likewise for these, although the assumption that these don't change is a little more iffy. pub use v14::bindings::{MultiXactOffset, MultiXactStatus}; -pub use v14::bindings::{PageHeaderData, XLogRecord}; pub use v14::xlog_utils::{ XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD, }; -pub use v14::bindings::{CheckPoint, ControlFileData}; - // from pg_config.h. These can be changed with configure options --with-blocksize=BLOCKSIZE and // --with-segsize=SEGSIZE, but assume the defaults for now. pub const BLCKSZ: u16 = 8192; @@ -246,13 +243,11 @@ pub const WAL_SEGMENT_SIZE: usize = 16 * 1024 * 1024; pub const MAX_SEND_SIZE: usize = XLOG_BLCKSZ * 16; // Export some version independent functions that are used outside of this mod -pub use v14::xlog_utils::encode_logical_message; -pub use v14::xlog_utils::get_current_timestamp; -pub use v14::xlog_utils::to_pg_timestamp; -pub use v14::xlog_utils::try_from_pg_timestamp; -pub use v14::xlog_utils::XLogFileName; - pub use v14::bindings::DBState_DB_SHUTDOWNED; +pub use v14::xlog_utils::{ + XLogFileName, encode_logical_message, get_current_timestamp, to_pg_timestamp, + try_from_pg_timestamp, +}; pub fn bkpimage_is_compressed(bimg_info: u8, version: u32) -> bool { dispatch_pgversion!(version, pgv::bindings::bkpimg_is_compressed(bimg_info)) @@ -355,8 +350,9 @@ pub fn fsm_logical_to_physical(addr: BlockNumber) -> BlockNumber { } pub mod waldecoder { - use bytes::{Buf, Bytes, BytesMut}; use std::num::NonZeroU32; + + use bytes::{Buf, Bytes, BytesMut}; use thiserror::Error; use utils::lsn::Lsn; diff --git a/libs/postgres_ffi/src/pg_constants.rs b/libs/postgres_ffi/src/pg_constants.rs index e343473d77..b0bdd8a8da 100644 --- a/libs/postgres_ffi/src/pg_constants.rs +++ b/libs/postgres_ffi/src/pg_constants.rs @@ -9,8 +9,7 @@ //! comments on them. //! -use crate::PageHeaderData; -use crate::BLCKSZ; +use crate::{BLCKSZ, PageHeaderData}; // // From pg_tablespace_d.h diff --git a/libs/postgres_ffi/src/walrecord.rs b/libs/postgres_ffi/src/walrecord.rs index fce37e2fdd..1ccf4590a9 100644 --- a/libs/postgres_ffi/src/walrecord.rs +++ b/libs/postgres_ffi/src/walrecord.rs @@ -3,18 +3,16 @@ //! //! TODO: Generate separate types for each supported PG version -use crate::pg_constants; -use crate::XLogRecord; -use crate::{ - BlockNumber, MultiXactId, MultiXactOffset, MultiXactStatus, Oid, RepOriginId, TimestampTz, - TransactionId, -}; -use crate::{BLCKSZ, XLOG_SIZE_OF_XLOG_RECORD}; use bytes::{Buf, Bytes}; use serde::{Deserialize, Serialize}; use utils::bin_ser::DeserializeError; use utils::lsn::Lsn; +use crate::{ + BLCKSZ, BlockNumber, MultiXactId, MultiXactOffset, MultiXactStatus, Oid, RepOriginId, + TimestampTz, TransactionId, XLOG_SIZE_OF_XLOG_RECORD, XLogRecord, pg_constants, +}; + #[repr(C)] #[derive(Clone, Debug, Serialize, Deserialize)] pub struct XlMultiXactCreate { @@ -508,9 +506,10 @@ pub fn decode_wal_record( } pub mod v14 { - use crate::{OffsetNumber, TransactionId}; use bytes::{Buf, Bytes}; + use crate::{OffsetNumber, TransactionId}; + #[repr(C)] #[derive(Debug)] pub struct XlHeapInsert { @@ -678,9 +677,10 @@ pub mod v15 { } pub mod v16 { + use bytes::{Buf, Bytes}; + pub use super::v14::{XlHeapInsert, XlHeapLockUpdated, XlHeapMultiInsert, XlParameterChange}; use crate::{OffsetNumber, TransactionId}; - use bytes::{Buf, Bytes}; pub struct XlHeapDelete { pub xmax: TransactionId, @@ -746,9 +746,10 @@ pub mod v16 { /* Since PG16, we have the Neon RMGR (RM_NEON_ID) to manage Neon-flavored WAL. */ pub mod rm_neon { - use crate::{OffsetNumber, TransactionId}; use bytes::{Buf, Bytes}; + use crate::{OffsetNumber, TransactionId}; + #[repr(C)] #[derive(Debug)] pub struct XlNeonHeapInsert { @@ -858,14 +859,14 @@ pub mod v16 { } pub mod v17 { - pub use super::v14::XlHeapLockUpdated; - pub use crate::{TimeLineID, TimestampTz}; use bytes::{Buf, Bytes}; - pub use super::v16::rm_neon; + pub use super::v14::XlHeapLockUpdated; pub use super::v16::{ XlHeapDelete, XlHeapInsert, XlHeapLock, XlHeapMultiInsert, XlHeapUpdate, XlParameterChange, + rm_neon, }; + pub use crate::{TimeLineID, TimestampTz}; #[repr(C)] #[derive(Debug)] diff --git a/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs b/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs index 41afcea6c2..6151ce34ac 100644 --- a/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs +++ b/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs @@ -1,7 +1,9 @@ +use std::path::PathBuf; +use std::str::FromStr; + use anyhow::*; -use clap::{value_parser, Arg, ArgMatches, Command}; +use clap::{Arg, ArgMatches, Command, value_parser}; use postgres::Client; -use std::{path::PathBuf, str::FromStr}; use wal_craft::*; fn main() -> Result<()> { diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs index 77dff4ac99..ca9530faef 100644 --- a/libs/postgres_ffi/wal_craft/src/lib.rs +++ b/libs/postgres_ffi/wal_craft/src/lib.rs @@ -1,17 +1,18 @@ -use anyhow::{bail, ensure}; -use camino_tempfile::{tempdir, Utf8TempDir}; -use log::*; -use postgres::types::PgLsn; -use postgres::Client; -use postgres_ffi::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ}; -use postgres_ffi::{ - XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD, -}; use std::ffi::OsStr; use std::path::{Path, PathBuf}; use std::process::Command; use std::time::{Duration, Instant}; +use anyhow::{bail, ensure}; +use camino_tempfile::{Utf8TempDir, tempdir}; +use log::*; +use postgres::Client; +use postgres::types::PgLsn; +use postgres_ffi::{ + WAL_SEGMENT_SIZE, XLOG_BLCKSZ, XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, + XLOG_SIZE_OF_XLOG_SHORT_PHD, +}; + macro_rules! xlog_utils_test { ($version:ident) => { #[path = "."] diff --git a/libs/pq_proto/src/framed.rs b/libs/pq_proto/src/framed.rs index ccbb90e384..8e216d0f44 100644 --- a/libs/pq_proto/src/framed.rs +++ b/libs/pq_proto/src/framed.rs @@ -10,11 +10,10 @@ //! calls. //! //! [Box]: https://docs.rs/futures-util/0.3.26/src/futures_util/lock/bilock.rs.html#107 +use std::future::Future; +use std::io::{self, ErrorKind}; + use bytes::{Buf, BytesMut}; -use std::{ - future::Future, - io::{self, ErrorKind}, -}; use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt, ReadHalf, WriteHalf}; use crate::{BeMessage, FeMessage, FeStartupPacket, ProtocolError}; diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index f99128b76a..e435ffbf7e 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -5,14 +5,15 @@ pub mod framed; +use std::borrow::Cow; +use std::{fmt, io, str}; + use byteorder::{BigEndian, ReadBytesExt}; use bytes::{Buf, BufMut, Bytes, BytesMut}; use itertools::Itertools; -use serde::{Deserialize, Serialize}; -use std::{borrow::Cow, fmt, io, str}; - // re-export for use in utils pageserver_feedback.rs pub use postgres_protocol::PG_EPOCH; +use serde::{Deserialize, Serialize}; pub type Oid = u32; pub type SystemId = u64; @@ -206,8 +207,8 @@ use rand::distributions::{Distribution, Standard}; impl Distribution for Standard { fn sample(&self, rng: &mut R) -> CancelKeyData { CancelKeyData { - backend_pid: rng.gen(), - cancel_key: rng.gen(), + backend_pid: rng.r#gen(), + cancel_key: rng.r#gen(), } } } @@ -1035,7 +1036,7 @@ impl BeMessage<'_> { buf.put_u8(b'd'); write_body(buf, |buf| { buf.put_u8(b'0'); // matches INTERPRETED_WAL_RECORD_TAG in postgres-protocol - // dependency + // dependency buf.put_u64(rec.streaming_lsn); buf.put_u64(rec.commit_lsn); buf.put_slice(rec.data); diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index 33fa6e89f5..7bdf340f74 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "remote_storage" version = "0.1.0" -edition.workspace = true +edition = "2024" license.workspace = true [dependencies] diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index 9027a8bf55..dee61a410d 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -2,33 +2,26 @@ use std::borrow::Cow; use std::collections::HashMap; -use std::env; use std::fmt::Display; -use std::io; use std::num::NonZeroU32; use std::pin::Pin; use std::str::FromStr; use std::sync::Arc; -use std::time::Duration; -use std::time::SystemTime; +use std::time::{Duration, SystemTime}; +use std::{env, io}; -use super::REMOTE_STORAGE_PREFIX_SEPARATOR; -use anyhow::Context; -use anyhow::Result; +use anyhow::{Context, Result}; use azure_core::request_options::{IfMatchCondition, MaxResults, Metadata, Range}; -use azure_core::HttpClient; -use azure_core::TransportOptions; -use azure_core::{Continuable, RetryOptions}; +use azure_core::{Continuable, HttpClient, RetryOptions, TransportOptions}; use azure_storage::StorageCredentials; use azure_storage_blobs::blob::CopyStatus; -use azure_storage_blobs::prelude::ClientBuilder; -use azure_storage_blobs::{blob::operations::GetBlobBuilder, prelude::ContainerClient}; +use azure_storage_blobs::blob::operations::GetBlobBuilder; +use azure_storage_blobs::prelude::{ClientBuilder, ContainerClient}; use bytes::Bytes; +use futures::FutureExt; use futures::future::Either; use futures::stream::Stream; -use futures::FutureExt; -use futures_util::StreamExt; -use futures_util::TryStreamExt; +use futures_util::{StreamExt, TryStreamExt}; use http_types::{StatusCode, Url}; use scopeguard::ScopeGuard; use tokio_util::sync::CancellationToken; @@ -36,12 +29,13 @@ use tracing::debug; use utils::backoff; use utils::backoff::exponential_backoff_duration_seconds; -use crate::metrics::{start_measuring_requests, AttemptOutcome, RequestKind}; -use crate::DownloadKind; +use super::REMOTE_STORAGE_PREFIX_SEPARATOR; +use crate::config::AzureConfig; +use crate::error::Cancelled; +use crate::metrics::{AttemptOutcome, RequestKind, start_measuring_requests}; use crate::{ - config::AzureConfig, error::Cancelled, ConcurrencyLimiter, Download, DownloadError, - DownloadOpts, Listing, ListingMode, ListingObject, RemotePath, RemoteStorage, StorageMetadata, - TimeTravelError, TimeoutOrCancel, + ConcurrencyLimiter, Download, DownloadError, DownloadKind, DownloadOpts, Listing, ListingMode, + ListingObject, RemotePath, RemoteStorage, StorageMetadata, TimeTravelError, TimeoutOrCancel, }; pub struct AzureBlobStorage { diff --git a/libs/remote_storage/src/config.rs b/libs/remote_storage/src/config.rs index ff34158c9c..52978be5b4 100644 --- a/libs/remote_storage/src/config.rs +++ b/libs/remote_storage/src/config.rs @@ -1,8 +1,10 @@ -use std::{fmt::Debug, num::NonZeroUsize, str::FromStr, time::Duration}; +use std::fmt::Debug; +use std::num::NonZeroUsize; +use std::str::FromStr; +use std::time::Duration; use aws_sdk_s3::types::StorageClass; use camino::Utf8PathBuf; - use serde::{Deserialize, Serialize}; use crate::{ diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 69b522d63e..6eb5570d9b 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -18,40 +18,35 @@ mod s3_bucket; mod simulate_failures; mod support; -use std::{ - collections::HashMap, - fmt::Debug, - num::NonZeroU32, - ops::Bound, - pin::{pin, Pin}, - sync::Arc, - time::SystemTime, -}; +use std::collections::HashMap; +use std::fmt::Debug; +use std::num::NonZeroU32; +use std::ops::Bound; +use std::pin::{Pin, pin}; +use std::sync::Arc; +use std::time::SystemTime; use anyhow::Context; -use camino::{Utf8Path, Utf8PathBuf}; - +/// Azure SDK's ETag type is a simple String wrapper: we use this internally instead of repeating it here. +pub use azure_core::Etag; use bytes::Bytes; -use futures::{stream::Stream, StreamExt}; +use camino::{Utf8Path, Utf8PathBuf}; +pub use error::{DownloadError, TimeTravelError, TimeoutOrCancel}; +use futures::StreamExt; +use futures::stream::Stream; use itertools::Itertools as _; +use s3_bucket::RequestKind; use serde::{Deserialize, Serialize}; use tokio::sync::Semaphore; use tokio_util::sync::CancellationToken; use tracing::info; -pub use self::{ - azure_blob::AzureBlobStorage, local_fs::LocalFs, s3_bucket::S3Bucket, - simulate_failures::UnreliableWrapper, -}; -use s3_bucket::RequestKind; - +pub use self::azure_blob::AzureBlobStorage; +pub use self::local_fs::LocalFs; +pub use self::s3_bucket::S3Bucket; +pub use self::simulate_failures::UnreliableWrapper; pub use crate::config::{AzureConfig, RemoteStorageConfig, RemoteStorageKind, S3Config}; -/// Azure SDK's ETag type is a simple String wrapper: we use this internally instead of repeating it here. -pub use azure_core::Etag; - -pub use error::{DownloadError, TimeTravelError, TimeoutOrCancel}; - /// Default concurrency limit for S3 operations /// /// Currently, sync happens with AWS S3, that has two limits on requests per second: @@ -640,8 +635,13 @@ impl GenericRemoteStorage { let profile = std::env::var("AWS_PROFILE").unwrap_or_else(|_| "".into()); let access_key_id = std::env::var("AWS_ACCESS_KEY_ID").unwrap_or_else(|_| "".into()); - info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}', profile: {profile}, access_key_id: {access_key_id}", - s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint); + info!( + "Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}', profile: {profile}, access_key_id: {access_key_id}", + s3_config.bucket_name, + s3_config.bucket_region, + s3_config.prefix_in_bucket, + s3_config.endpoint + ); Self::AwsS3(Arc::new(S3Bucket::new(s3_config, timeout).await?)) } RemoteStorageKind::AzureContainer(azure_config) => { @@ -649,8 +649,12 @@ impl GenericRemoteStorage { .storage_account .as_deref() .unwrap_or(""); - info!("Using azure container '{}' in account '{storage_account}' in region '{}' as a remote storage, prefix in container: '{:?}'", - azure_config.container_name, azure_config.container_region, azure_config.prefix_in_container); + info!( + "Using azure container '{}' in account '{storage_account}' in region '{}' as a remote storage, prefix in container: '{:?}'", + azure_config.container_name, + azure_config.container_region, + azure_config.prefix_in_container + ); Self::AzureBlob(Arc::new(AzureBlobStorage::new( azure_config, timeout, diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index a8b00173ba..f03d6ac8ee 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -4,31 +4,26 @@ //! This storage used in tests, but can also be used in cases when a certain persistent //! volume is mounted to the local FS. -use std::{ - collections::HashSet, - io::ErrorKind, - num::NonZeroU32, - time::{Duration, SystemTime, UNIX_EPOCH}, -}; +use std::collections::HashSet; +use std::io::ErrorKind; +use std::num::NonZeroU32; +use std::time::{Duration, SystemTime, UNIX_EPOCH}; -use anyhow::{bail, ensure, Context}; +use anyhow::{Context, bail, ensure}; use bytes::Bytes; use camino::{Utf8Path, Utf8PathBuf}; use futures::stream::Stream; -use tokio::{ - fs, - io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt}, -}; -use tokio_util::{io::ReaderStream, sync::CancellationToken}; +use tokio::fs; +use tokio::io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt}; +use tokio_util::io::ReaderStream; +use tokio_util::sync::CancellationToken; use utils::crashsafe::path_with_suffix_extension; -use crate::{ - Download, DownloadError, DownloadOpts, Listing, ListingMode, ListingObject, RemotePath, - TimeTravelError, TimeoutOrCancel, REMOTE_STORAGE_PREFIX_SEPARATOR, -}; - use super::{RemoteStorage, StorageMetadata}; -use crate::Etag; +use crate::{ + Download, DownloadError, DownloadOpts, Etag, Listing, ListingMode, ListingObject, + REMOTE_STORAGE_PREFIX_SEPARATOR, RemotePath, TimeTravelError, TimeoutOrCancel, +}; const LOCAL_FS_TEMP_FILE_SUFFIX: &str = "___temp"; @@ -91,7 +86,8 @@ impl LocalFs { #[cfg(test)] async fn list_all(&self) -> anyhow::Result> { - use std::{future::Future, pin::Pin}; + use std::future::Future; + use std::pin::Pin; fn get_all_files<'a, P>( directory_path: P, ) -> Pin>> + Send + Sync + 'a>> @@ -284,7 +280,9 @@ impl LocalFs { })?; if bytes_read < from_size_bytes { - bail!("Provided stream was shorter than expected: {bytes_read} vs {from_size_bytes} bytes"); + bail!( + "Provided stream was shorter than expected: {bytes_read} vs {from_size_bytes} bytes" + ); } // Check if there is any extra data after the given size. let mut from = buffer_to_read.into_inner(); @@ -642,10 +640,13 @@ fn mock_etag(meta: &std::fs::Metadata) -> Etag { #[cfg(test)] mod fs_tests { - use super::*; + use std::collections::HashMap; + use std::io::Write; + use std::ops::Bound; use camino_tempfile::tempdir; - use std::{collections::HashMap, io::Write, ops::Bound}; + + use super::*; async fn read_and_check_metadata( storage: &LocalFs, @@ -736,9 +737,14 @@ mod fs_tests { ); let non_existing_path = RemotePath::new(Utf8Path::new("somewhere/else"))?; - match storage.download(&non_existing_path, &DownloadOpts::default(), &cancel).await { + match storage + .download(&non_existing_path, &DownloadOpts::default(), &cancel) + .await + { Err(DownloadError::NotFound) => {} // Should get NotFound for non existing keys - other => panic!("Should get a NotFound error when downloading non-existing storage files, but got: {other:?}"), + other => panic!( + "Should get a NotFound error when downloading non-existing storage files, but got: {other:?}" + ), } Ok(()) } diff --git a/libs/remote_storage/src/metrics.rs b/libs/remote_storage/src/metrics.rs index 48c121fbc8..81e68e9a29 100644 --- a/libs/remote_storage/src/metrics.rs +++ b/libs/remote_storage/src/metrics.rs @@ -1,5 +1,5 @@ use metrics::{ - register_histogram_vec, register_int_counter, register_int_counter_vec, Histogram, IntCounter, + Histogram, IntCounter, register_histogram_vec, register_int_counter, register_int_counter_vec, }; use once_cell::sync::Lazy; @@ -16,8 +16,8 @@ pub(crate) enum RequestKind { Head = 6, } -use scopeguard::ScopeGuard; use RequestKind::*; +use scopeguard::ScopeGuard; impl RequestKind { const fn as_str(&self) -> &'static str { diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index d3f19f0b11..ba7ce9e1e7 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -4,56 +4,50 @@ //! allowing multiple api users to independently work with the same S3 bucket, if //! their bucket prefixes are both specified and different. -use std::{ - borrow::Cow, - collections::HashMap, - num::NonZeroU32, - pin::Pin, - sync::Arc, - task::{Context, Poll}, - time::{Duration, SystemTime}, -}; +use std::borrow::Cow; +use std::collections::HashMap; +use std::num::NonZeroU32; +use std::pin::Pin; +use std::sync::Arc; +use std::task::{Context, Poll}; +use std::time::{Duration, SystemTime}; -use anyhow::{anyhow, Context as _}; -use aws_config::{ - default_provider::credentials::DefaultCredentialsChain, - retry::{RetryConfigBuilder, RetryMode}, - BehaviorVersion, -}; -use aws_sdk_s3::{ - config::{AsyncSleep, IdentityCache, Region, SharedAsyncSleep}, - error::SdkError, - operation::{get_object::GetObjectError, head_object::HeadObjectError}, - types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion, StorageClass}, - Client, -}; +use anyhow::{Context as _, anyhow}; +use aws_config::BehaviorVersion; +use aws_config::default_provider::credentials::DefaultCredentialsChain; +use aws_config::retry::{RetryConfigBuilder, RetryMode}; +use aws_sdk_s3::Client; +use aws_sdk_s3::config::{AsyncSleep, IdentityCache, Region, SharedAsyncSleep}; +use aws_sdk_s3::error::SdkError; +use aws_sdk_s3::operation::get_object::GetObjectError; +use aws_sdk_s3::operation::head_object::HeadObjectError; +use aws_sdk_s3::types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion, StorageClass}; use aws_smithy_async::rt::sleep::TokioSleep; -use http_body_util::StreamBody; -use http_types::StatusCode; - -use aws_smithy_types::{body::SdkBody, DateTime}; -use aws_smithy_types::{byte_stream::ByteStream, date_time::ConversionError}; +use aws_smithy_types::DateTime; +use aws_smithy_types::body::SdkBody; +use aws_smithy_types::byte_stream::ByteStream; +use aws_smithy_types::date_time::ConversionError; use bytes::Bytes; use futures::stream::Stream; use futures_util::StreamExt; +use http_body_util::StreamBody; +use http_types::StatusCode; use hyper::body::Frame; use scopeguard::ScopeGuard; use tokio_util::sync::CancellationToken; use utils::backoff; use super::StorageMetadata; -use crate::{ - config::S3Config, - error::Cancelled, - metrics::{start_counting_cancelled_wait, start_measuring_requests}, - support::PermitCarrying, - ConcurrencyLimiter, Download, DownloadError, DownloadOpts, Listing, ListingMode, ListingObject, - RemotePath, RemoteStorage, TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE_S3, - REMOTE_STORAGE_PREFIX_SEPARATOR, -}; - -use crate::metrics::AttemptOutcome; +use crate::config::S3Config; +use crate::error::Cancelled; pub(super) use crate::metrics::RequestKind; +use crate::metrics::{AttemptOutcome, start_counting_cancelled_wait, start_measuring_requests}; +use crate::support::PermitCarrying; +use crate::{ + ConcurrencyLimiter, Download, DownloadError, DownloadOpts, Listing, ListingMode, ListingObject, + MAX_KEYS_PER_DELETE_S3, REMOTE_STORAGE_PREFIX_SEPARATOR, RemotePath, RemoteStorage, + TimeTravelError, TimeoutOrCancel, +}; /// AWS S3 storage. pub struct S3Bucket { @@ -958,8 +952,10 @@ impl RemoteStorage for S3Bucket { version_id, key, .. } = &vd; if version_id == "null" { - return Err(TimeTravelError::Other(anyhow!("Received ListVersions response for key={key} with version_id='null', \ - indicating either disabled versioning, or legacy objects with null version id values"))); + return Err(TimeTravelError::Other(anyhow!( + "Received ListVersions response for key={key} with version_id='null', \ + indicating either disabled versioning, or legacy objects with null version id values" + ))); } tracing::trace!( "Parsing version key={key} version_id={version_id} kind={:?}", @@ -1126,9 +1122,10 @@ impl VerOrDelete { #[cfg(test)] mod tests { - use camino::Utf8Path; use std::num::NonZeroUsize; + use camino::Utf8Path; + use crate::{RemotePath, S3Bucket, S3Config}; #[tokio::test] diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs index 63c24beb51..f56be873c4 100644 --- a/libs/remote_storage/src/simulate_failures.rs +++ b/libs/remote_storage/src/simulate_failures.rs @@ -1,14 +1,15 @@ //! This module provides a wrapper around a real RemoteStorage implementation that //! causes the first N attempts at each upload or download operatio to fail. For //! testing purposes. -use bytes::Bytes; -use futures::stream::Stream; -use futures::StreamExt; use std::collections::HashMap; +use std::collections::hash_map::Entry; use std::num::NonZeroU32; -use std::sync::Mutex; +use std::sync::{Arc, Mutex}; use std::time::SystemTime; -use std::{collections::hash_map::Entry, sync::Arc}; + +use bytes::Bytes; +use futures::StreamExt; +use futures::stream::Stream; use tokio_util::sync::CancellationToken; use crate::{ diff --git a/libs/remote_storage/src/support.rs b/libs/remote_storage/src/support.rs index 1ed9ed9305..07da38cf77 100644 --- a/libs/remote_storage/src/support.rs +++ b/libs/remote_storage/src/support.rs @@ -1,9 +1,7 @@ -use std::{ - future::Future, - pin::Pin, - task::{Context, Poll}, - time::Duration, -}; +use std::future::Future; +use std::pin::Pin; +use std::task::{Context, Poll}; +use std::time::Duration; use bytes::Bytes; use futures_util::Stream; @@ -114,9 +112,10 @@ pub(crate) fn cancel_or_timeout( #[cfg(test)] mod tests { + use futures::stream::StreamExt; + use super::*; use crate::DownloadError; - use futures::stream::StreamExt; #[tokio::test(start_paused = true)] async fn cancelled_download_stream() { diff --git a/libs/remote_storage/tests/common/tests.rs b/libs/remote_storage/tests/common/tests.rs index d5da1d48e9..6a78ddc01e 100644 --- a/libs/remote_storage/tests/common/tests.rs +++ b/libs/remote_storage/tests/common/tests.rs @@ -1,19 +1,20 @@ +use std::collections::HashSet; +use std::num::NonZeroU32; +use std::ops::Bound; +use std::sync::Arc; + use anyhow::Context; use camino::Utf8Path; use futures::StreamExt; use remote_storage::{DownloadError, DownloadOpts, ListingMode, ListingObject, RemotePath}; -use std::ops::Bound; -use std::sync::Arc; -use std::{collections::HashSet, num::NonZeroU32}; use test_context::test_context; use tokio_util::sync::CancellationToken; use tracing::debug; -use crate::common::{download_to_vec, upload_stream, wrap_stream}; - use super::{ MaybeEnabledStorage, MaybeEnabledStorageWithSimpleTestBlobs, MaybeEnabledStorageWithTestBlobs, }; +use crate::common::{download_to_vec, upload_stream, wrap_stream}; /// Tests that S3 client can list all prefixes, even if the response come paginated and requires multiple S3 queries. /// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified. @@ -62,7 +63,8 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a .into_iter() .collect::>(); assert_eq!( - root_remote_prefixes, HashSet::from([base_prefix.clone()]), + root_remote_prefixes, + HashSet::from([base_prefix.clone()]), "remote storage root prefixes list mismatches with the uploads. Returned prefixes: {root_remote_prefixes:?}" ); @@ -84,7 +86,8 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a .difference(&nested_remote_prefixes) .collect::>(); assert_eq!( - remote_only_prefixes.len() + missing_uploaded_prefixes.len(), 0, + remote_only_prefixes.len() + missing_uploaded_prefixes.len(), + 0, "remote storage nested prefixes list mismatches with the uploads. Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}", ); @@ -119,7 +122,8 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a .difference(&nested_remote_prefixes_combined) .collect::>(); assert_eq!( - remote_only_prefixes.len() + missing_uploaded_prefixes.len(), 0, + remote_only_prefixes.len() + missing_uploaded_prefixes.len(), + 0, "remote storage nested prefixes list mismatches with the uploads. Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}", ); diff --git a/libs/remote_storage/tests/test_real_azure.rs b/libs/remote_storage/tests/test_real_azure.rs index 15004dbf83..31c9ca3200 100644 --- a/libs/remote_storage/tests/test_real_azure.rs +++ b/libs/remote_storage/tests/test_real_azure.rs @@ -1,9 +1,9 @@ +use std::collections::HashSet; use std::env; use std::num::NonZeroUsize; use std::ops::ControlFlow; use std::sync::Arc; -use std::time::UNIX_EPOCH; -use std::{collections::HashSet, time::Duration}; +use std::time::{Duration, UNIX_EPOCH}; use anyhow::Context; use remote_storage::{ @@ -208,7 +208,7 @@ async fn create_azure_client( .as_millis(); // because nanos can be the same for two threads so can millis, add randomness - let random = rand::thread_rng().gen::(); + let random = rand::thread_rng().r#gen::(); let remote_storage_config = RemoteStorageConfig { storage: RemoteStorageKind::AzureContainer(AzureConfig { diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs index e60ec18c93..6996bb27ae 100644 --- a/libs/remote_storage/tests/test_real_s3.rs +++ b/libs/remote_storage/tests/test_real_s3.rs @@ -1,13 +1,12 @@ +use std::collections::HashSet; use std::env; use std::fmt::{Debug, Display}; use std::future::Future; use std::num::NonZeroUsize; use std::ops::ControlFlow; use std::sync::Arc; -use std::time::{Duration, UNIX_EPOCH}; -use std::{collections::HashSet, time::SystemTime}; +use std::time::{Duration, SystemTime, UNIX_EPOCH}; -use crate::common::{download_to_vec, upload_stream}; use anyhow::Context; use camino::Utf8Path; use futures_util::StreamExt; @@ -15,12 +14,13 @@ use remote_storage::{ DownloadError, DownloadOpts, GenericRemoteStorage, ListingMode, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config, }; -use test_context::test_context; -use test_context::AsyncTestContext; +use test_context::{AsyncTestContext, test_context}; use tokio::io::AsyncBufReadExt; use tokio_util::sync::CancellationToken; use tracing::info; +use crate::common::{download_to_vec, upload_stream}; + mod common; #[path = "common/tests.rs"] @@ -128,8 +128,10 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow: let t0_hwt = t0 + half_wt; let t1_hwt = t1 - half_wt; if !(t0_hwt..=t1_hwt).contains(&last_modified) { - panic!("last_modified={last_modified:?} is not between t0_hwt={t0_hwt:?} and t1_hwt={t1_hwt:?}. \ - This likely means a large lock discrepancy between S3 and the local clock."); + panic!( + "last_modified={last_modified:?} is not between t0_hwt={t0_hwt:?} and t1_hwt={t1_hwt:?}. \ + This likely means a large lock discrepancy between S3 and the local clock." + ); } } @@ -383,7 +385,7 @@ async fn create_s3_client( .as_millis(); // because nanos can be the same for two threads so can millis, add randomness - let random = rand::thread_rng().gen::(); + let random = rand::thread_rng().r#gen::(); let remote_storage_config = RemoteStorageConfig { storage: RemoteStorageKind::AwsS3(S3Config { diff --git a/libs/safekeeper_api/Cargo.toml b/libs/safekeeper_api/Cargo.toml index 6b72ace019..d9d080e8fe 100644 --- a/libs/safekeeper_api/Cargo.toml +++ b/libs/safekeeper_api/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "safekeeper_api" version = "0.1.0" -edition.workspace = true +edition = "2024" license.workspace = true [dependencies] diff --git a/libs/safekeeper_api/src/membership.rs b/libs/safekeeper_api/src/membership.rs index 8b14a4f290..bb8934744a 100644 --- a/libs/safekeeper_api/src/membership.rs +++ b/libs/safekeeper_api/src/membership.rs @@ -2,7 +2,8 @@ //! rfcs/035-safekeeper-dynamic-membership-change.md //! for details. -use std::{collections::HashSet, fmt::Display}; +use std::collections::HashSet; +use std::fmt::Display; use anyhow; use anyhow::bail; @@ -68,14 +69,12 @@ impl Display for SafekeeperId { #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] #[serde(transparent)] pub struct MemberSet { - pub members: Vec, + pub m: Vec, } impl MemberSet { pub fn empty() -> Self { - MemberSet { - members: Vec::new(), - } + MemberSet { m: Vec::new() } } pub fn new(members: Vec) -> anyhow::Result { @@ -83,21 +82,21 @@ impl MemberSet { if hs.len() != members.len() { bail!("duplicate safekeeper id in the set {:?}", members); } - Ok(MemberSet { members }) + Ok(MemberSet { m: members }) } - pub fn contains(&self, sk: &SafekeeperId) -> bool { - self.members.iter().any(|m| m.id == sk.id) + pub fn contains(&self, sk: NodeId) -> bool { + self.m.iter().any(|m| m.id == sk) } pub fn add(&mut self, sk: SafekeeperId) -> anyhow::Result<()> { - if self.contains(&sk) { + if self.contains(sk.id) { bail!(format!( "sk {} is already member of the set {}", sk.id, self )); } - self.members.push(sk); + self.m.push(sk); Ok(()) } } @@ -105,11 +104,7 @@ impl MemberSet { impl Display for MemberSet { /// Display as a comma separated list of members. fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let sks_str = self - .members - .iter() - .map(|m| m.to_string()) - .collect::>(); + let sks_str = self.m.iter().map(|sk| sk.to_string()).collect::>(); write!(f, "({})", sks_str.join(", ")) } } @@ -135,6 +130,11 @@ impl Configuration { new_members: None, } } + + /// Is `sk_id` member of the configuration? + pub fn contains(&self, sk_id: NodeId) -> bool { + self.members.contains(sk_id) || self.new_members.as_ref().is_some_and(|m| m.contains(sk_id)) + } } impl Display for Configuration { @@ -154,9 +154,10 @@ impl Display for Configuration { #[cfg(test)] mod tests { - use super::{MemberSet, SafekeeperId}; use utils::id::NodeId; + use super::{MemberSet, SafekeeperId}; + #[test] fn test_member_set() { let mut members = MemberSet::empty(); diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs index 41ccdaa428..2f2aeaa429 100644 --- a/libs/safekeeper_api/src/models.rs +++ b/libs/safekeeper_api/src/models.rs @@ -1,18 +1,17 @@ //! Types used in safekeeper http API. Many of them are also reused internally. +use std::net::SocketAddr; + use pageserver_api::shard::ShardIdentity; use postgres_ffi::TimestampTz; use serde::{Deserialize, Serialize}; -use std::net::SocketAddr; use tokio::time::Instant; +use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId}; +use utils::lsn::Lsn; +use utils::pageserver_feedback::PageserverFeedback; -use utils::{ - id::{NodeId, TenantId, TenantTimelineId, TimelineId}, - lsn::Lsn, - pageserver_feedback::PageserverFeedback, -}; - -use crate::{membership::Configuration, ServerInfo, Term}; +use crate::membership::Configuration; +use crate::{ServerInfo, Term}; #[derive(Debug, Serialize)] pub struct SafekeeperStatus { diff --git a/libs/tenant_size_model/src/calculation.rs b/libs/tenant_size_model/src/calculation.rs index be00562219..d54876ba2c 100644 --- a/libs/tenant_size_model/src/calculation.rs +++ b/libs/tenant_size_model/src/calculation.rs @@ -130,11 +130,7 @@ impl StorageModel { break; } } - if possible { - Some(snapshot_later) - } else { - None - } + if possible { Some(snapshot_later) } else { None } } else { None }; diff --git a/libs/tenant_size_model/src/svg.rs b/libs/tenant_size_model/src/svg.rs index 25ebb1c3d8..a3bc937f52 100644 --- a/libs/tenant_size_model/src/svg.rs +++ b/libs/tenant_size_model/src/svg.rs @@ -76,7 +76,10 @@ pub fn draw_svg( let mut result = String::new(); - writeln!(result, "")?; + writeln!( + result, + "" + )?; draw.calculate_svg_layout(); diff --git a/libs/tracing-utils/src/http.rs b/libs/tracing-utils/src/http.rs index 2168beee88..8560d0718c 100644 --- a/libs/tracing-utils/src/http.rs +++ b/libs/tracing-utils/src/http.rs @@ -1,8 +1,8 @@ //! Tracing wrapper for Hyper HTTP server -use hyper0::HeaderMap; -use hyper0::{Body, Request, Response}; use std::future::Future; + +use hyper0::{Body, HeaderMap, Request, Response}; use tracing::Instrument; use tracing_opentelemetry::OpenTelemetrySpanExt; diff --git a/libs/tracing-utils/src/lib.rs b/libs/tracing-utils/src/lib.rs index 818d759eac..72f94d61e4 100644 --- a/libs/tracing-utils/src/lib.rs +++ b/libs/tracing-utils/src/lib.rs @@ -36,11 +36,11 @@ pub mod http; -use opentelemetry::trace::TracerProvider; use opentelemetry::KeyValue; +use opentelemetry::trace::TracerProvider; use tracing::Subscriber; -use tracing_subscriber::registry::LookupSpan; use tracing_subscriber::Layer; +use tracing_subscriber::registry::LookupSpan; /// Set up OpenTelemetry exporter, using configuration from environment variables. /// diff --git a/libs/utils/benches/README.md b/libs/utils/benches/README.md index e23ec268c2..5afbe3cf2b 100644 --- a/libs/utils/benches/README.md +++ b/libs/utils/benches/README.md @@ -10,14 +10,14 @@ cargo bench --package utils cargo bench --package utils --bench benchmarks # Specific benchmark. -cargo bench --package utils --bench benchmarks warn_slow/enabled=true +cargo bench --package utils --bench benchmarks log_slow/enabled=true # List available benchmarks. cargo bench --package utils --benches -- --list # Generate flamegraph profiles using pprof-rs, profiling for 10 seconds. # Output in target/criterion/*/profile/flamegraph.svg. -cargo bench --package utils --bench benchmarks warn_slow/enabled=true --profile-time 10 +cargo bench --package utils --bench benchmarks log_slow/enabled=true --profile-time 10 ``` Additional charts and statistics are available in `target/criterion/report/index.html`. diff --git a/libs/utils/benches/benchmarks.rs b/libs/utils/benches/benchmarks.rs index cff3792f3a..12c620ec87 100644 --- a/libs/utils/benches/benchmarks.rs +++ b/libs/utils/benches/benchmarks.rs @@ -1,16 +1,16 @@ use std::time::Duration; -use criterion::{criterion_group, criterion_main, Bencher, Criterion}; +use criterion::{Bencher, Criterion, criterion_group, criterion_main}; use pprof::criterion::{Output, PProfProfiler}; use utils::id; -use utils::logging::warn_slow; +use utils::logging::log_slow; // Register benchmarks with Criterion. criterion_group!( name = benches; config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None))); targets = bench_id_stringify, - bench_warn_slow, + bench_log_slow, ); criterion_main!(benches); @@ -29,9 +29,9 @@ pub fn bench_id_stringify(c: &mut Criterion) { }); } -pub fn bench_warn_slow(c: &mut Criterion) { +pub fn bench_log_slow(c: &mut Criterion) { for enabled in [false, true] { - c.bench_function(&format!("warn_slow/enabled={enabled}"), |b| { + c.bench_function(&format!("log_slow/enabled={enabled}"), |b| { run_bench(b, enabled).unwrap() }); } @@ -45,11 +45,11 @@ pub fn bench_warn_slow(c: &mut Criterion) { .enable_all() .build()?; - // Test both with and without warn_slow, since we're essentially measuring Tokio scheduling + // Test both with and without log_slow, since we're essentially measuring Tokio scheduling // performance too. Use a simple noop future that yields once, to avoid any scheduler fast // paths for a ready future. if enabled { - b.iter(|| runtime.block_on(warn_slow("ready", THRESHOLD, tokio::task::yield_now()))); + b.iter(|| runtime.block_on(log_slow("ready", THRESHOLD, tokio::task::yield_now()))); } else { b.iter(|| runtime.block_on(tokio::task::yield_now())); } diff --git a/libs/utils/src/auth.rs b/libs/utils/src/auth.rs index 4bfd0ab055..cc5b0b1d13 100644 --- a/libs/utils/src/auth.rs +++ b/libs/utils/src/auth.rs @@ -1,12 +1,15 @@ // For details about authentication see docs/authentication.md -use arc_swap::ArcSwap; -use std::{borrow::Cow, fmt::Display, fs, sync::Arc}; +use std::borrow::Cow; +use std::fmt::Display; +use std::fs; +use std::sync::Arc; use anyhow::Result; +use arc_swap::ArcSwap; use camino::Utf8Path; use jsonwebtoken::{ - decode, encode, Algorithm, DecodingKey, EncodingKey, Header, TokenData, Validation, + Algorithm, DecodingKey, EncodingKey, Header, TokenData, Validation, decode, encode, }; use serde::{Deserialize, Serialize}; @@ -129,7 +132,9 @@ impl JwtAuth { anyhow::bail!("path is neither a directory or a file") }; if decoding_keys.is_empty() { - anyhow::bail!("Configured for JWT auth with zero decoding keys. All JWT gated requests would be rejected."); + anyhow::bail!( + "Configured for JWT auth with zero decoding keys. All JWT gated requests would be rejected." + ); } Ok(Self::new(decoding_keys)) } @@ -175,9 +180,10 @@ pub fn encode_from_key_file(claims: &Claims, key_data: &[u8]) -> Result #[cfg(test)] mod tests { - use super::*; use std::str::FromStr; + use super::*; + // Generated with: // // openssl genpkey -algorithm ed25519 -out ed25519-priv.pem @@ -215,7 +221,9 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH let encoded_eddsa = "eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJpYXQiOjE2Nzg0NDI0Nzl9.rNheBnluMJNgXzSTTJoTNIGy4P_qe0JUHl_nVEGuDCTgHOThPVr552EnmKccrCKquPeW3c2YUk0Y9Oh4KyASAw"; // Check it can be validated with the public key - let auth = JwtAuth::new(vec![DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519).unwrap()]); + let auth = JwtAuth::new(vec![ + DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519).unwrap(), + ]); let claims_from_token = auth.decode(encoded_eddsa).unwrap().claims; assert_eq!(claims_from_token, expected_claims); } @@ -230,7 +238,9 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH let encoded = encode_from_key_file(&claims, TEST_PRIV_KEY_ED25519).unwrap(); // decode it back - let auth = JwtAuth::new(vec![DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519).unwrap()]); + let auth = JwtAuth::new(vec![ + DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519).unwrap(), + ]); let decoded = auth.decode(&encoded).unwrap(); assert_eq!(decoded.claims, claims); diff --git a/libs/utils/src/backoff.rs b/libs/utils/src/backoff.rs index e6503fe377..4a4c4eedbb 100644 --- a/libs/utils/src/backoff.rs +++ b/libs/utils/src/backoff.rs @@ -121,10 +121,12 @@ where #[cfg(test)] mod tests { - use super::*; use std::io; + use tokio::sync::Mutex; + use super::*; + #[test] fn backoff_defaults_produce_growing_backoff_sequence() { let mut current_backoff_value = None; diff --git a/libs/utils/src/bin_ser.rs b/libs/utils/src/bin_ser.rs index 4d173d0726..2861baeee5 100644 --- a/libs/utils/src/bin_ser.rs +++ b/libs/utils/src/bin_ser.rs @@ -13,9 +13,11 @@ #![warn(missing_docs)] -use bincode::Options; -use serde::{de::DeserializeOwned, Serialize}; use std::io::{self, Read, Write}; + +use bincode::Options; +use serde::Serialize; +use serde::de::DeserializeOwned; use thiserror::Error; /// An error that occurred during a deserialize operation @@ -261,10 +263,12 @@ impl LeSer for T {} #[cfg(test)] mod tests { - use super::DeserializeError; - use serde::{Deserialize, Serialize}; use std::io::Cursor; + use serde::{Deserialize, Serialize}; + + use super::DeserializeError; + #[derive(Debug, PartialEq, Eq, Serialize, Deserialize)] pub struct ShortStruct { a: u8, diff --git a/libs/utils/src/circuit_breaker.rs b/libs/utils/src/circuit_breaker.rs index e1ddfd8650..46a6584d66 100644 --- a/libs/utils/src/circuit_breaker.rs +++ b/libs/utils/src/circuit_breaker.rs @@ -1,7 +1,5 @@ -use std::{ - fmt::Display, - time::{Duration, Instant}, -}; +use std::fmt::Display; +use std::time::{Duration, Instant}; use metrics::IntCounter; diff --git a/libs/utils/src/completion.rs b/libs/utils/src/completion.rs index f65c080ad4..973d754715 100644 --- a/libs/utils/src/completion.rs +++ b/libs/utils/src/completion.rs @@ -1,4 +1,5 @@ -use tokio_util::task::{task_tracker::TaskTrackerToken, TaskTracker}; +use tokio_util::task::TaskTracker; +use tokio_util::task::task_tracker::TaskTrackerToken; /// While a reference is kept around, the associated [`Barrier::wait`] will wait. /// diff --git a/libs/utils/src/crashsafe.rs b/libs/utils/src/crashsafe.rs index 5241ab183c..290a5b2686 100644 --- a/libs/utils/src/crashsafe.rs +++ b/libs/utils/src/crashsafe.rs @@ -1,9 +1,7 @@ +use std::borrow::Cow; +use std::fs::{self, File}; +use std::io::{self, Write}; use std::os::fd::AsRawFd; -use std::{ - borrow::Cow, - fs::{self, File}, - io::{self, Write}, -}; use camino::{Utf8Path, Utf8PathBuf}; diff --git a/libs/utils/src/env.rs b/libs/utils/src/env.rs index a1bcec9229..2a85f54a01 100644 --- a/libs/utils/src/env.rs +++ b/libs/utils/src/env.rs @@ -1,6 +1,7 @@ //! Wrapper around `std::env::var` for parsing environment variables. -use std::{fmt::Display, str::FromStr}; +use std::fmt::Display; +use std::str::FromStr; /// For types `V` that implement [`FromStr`]. pub fn var(varname: &str) -> Option diff --git a/libs/utils/src/failpoint_support.rs b/libs/utils/src/failpoint_support.rs index fc998ad9a9..ce014eb0ac 100644 --- a/libs/utils/src/failpoint_support.rs +++ b/libs/utils/src/failpoint_support.rs @@ -127,6 +127,9 @@ pub async fn failpoint_sleep_cancellable_helper( tracing::info!("failpoint {:?}: sleep done", name); } +/// Initialize the configured failpoints +/// +/// You must call this function before any concurrent threads do operations. pub fn init() -> fail::FailScenario<'static> { // The failpoints lib provides support for parsing the `FAILPOINTS` env var. // We want non-default behavior for `exit`, though, so, we handle it separately. @@ -134,7 +137,10 @@ pub fn init() -> fail::FailScenario<'static> { // Format for FAILPOINTS is "name=actions" separated by ";". let actions = std::env::var("FAILPOINTS"); if actions.is_ok() { - std::env::remove_var("FAILPOINTS"); + // SAFETY: this function should before any threads start and access env vars concurrently + unsafe { + std::env::remove_var("FAILPOINTS"); + } } else { // let the library handle non-utf8, or nothing for not present } diff --git a/libs/utils/src/fs_ext.rs b/libs/utils/src/fs_ext.rs index 8e53d2c79b..a406ab0378 100644 --- a/libs/utils/src/fs_ext.rs +++ b/libs/utils/src/fs_ext.rs @@ -58,9 +58,8 @@ where #[cfg(test)] mod test { - use crate::fs_ext::{is_directory_empty, list_dir}; - use super::ignore_absent_files; + use crate::fs_ext::{is_directory_empty, list_dir}; #[test] fn is_empty_dir() { diff --git a/libs/utils/src/fs_ext/rename_noreplace.rs b/libs/utils/src/fs_ext/rename_noreplace.rs index 897e30d7f1..fc6f794b57 100644 --- a/libs/utils/src/fs_ext/rename_noreplace.rs +++ b/libs/utils/src/fs_ext/rename_noreplace.rs @@ -38,7 +38,8 @@ pub fn rename_noreplace( #[cfg(test)] mod test { - use std::{fs, path::PathBuf}; + use std::fs; + use std::path::PathBuf; use super::*; diff --git a/libs/utils/src/generation.rs b/libs/utils/src/generation.rs index 44565ee6a2..b5e4a4644a 100644 --- a/libs/utils/src/generation.rs +++ b/libs/utils/src/generation.rs @@ -169,9 +169,9 @@ mod test { ]; let mut s = String::new(); - for (line, gen, expected) in examples { + for (line, gen_, expected) in examples { s.clear(); - write!(s, "{}", &gen.get_suffix()).expect("string grows"); + write!(s, "{}", &gen_.get_suffix()).expect("string grows"); assert_eq!(s, expected, "example on {line}"); } } diff --git a/libs/utils/src/guard_arc_swap.rs b/libs/utils/src/guard_arc_swap.rs index cec5202460..26cd640d3b 100644 --- a/libs/utils/src/guard_arc_swap.rs +++ b/libs/utils/src/guard_arc_swap.rs @@ -1,8 +1,9 @@ //! A wrapper around `ArcSwap` that ensures there is only one writer at a time and writes //! don't block reads. -use arc_swap::ArcSwap; use std::sync::Arc; + +use arc_swap::ArcSwap; use tokio::sync::TryLockError; pub struct GuardArcSwap { diff --git a/libs/utils/src/id.rs b/libs/utils/src/id.rs index eb91839504..6016c23a01 100644 --- a/libs/utils/src/id.rs +++ b/libs/utils/src/id.rs @@ -1,5 +1,6 @@ +use std::fmt; use std::num::ParseIntError; -use std::{fmt, str::FromStr}; +use std::str::FromStr; use anyhow::Context; use hex::FromHex; @@ -215,7 +216,7 @@ macro_rules! id_newtype { impl AsRef<[u8]> for $t { fn as_ref(&self) -> &[u8] { - &self.0 .0 + &self.0.0 } } @@ -367,9 +368,8 @@ impl FromStr for NodeId { mod tests { use serde_assert::{Deserializer, Serializer, Token, Tokens}; - use crate::bin_ser::BeSer; - use super::*; + use crate::bin_ser::BeSer; #[test] fn test_id_serde_non_human_readable() { diff --git a/libs/utils/src/leaky_bucket.rs b/libs/utils/src/leaky_bucket.rs index 0cc58738c0..2398f92766 100644 --- a/libs/utils/src/leaky_bucket.rs +++ b/libs/utils/src/leaky_bucket.rs @@ -21,15 +21,12 @@ //! //! Another explaination can be found here: -use std::{ - sync::{ - atomic::{AtomicU64, Ordering}, - Mutex, - }, - time::Duration, -}; +use std::sync::Mutex; +use std::sync::atomic::{AtomicU64, Ordering}; +use std::time::Duration; -use tokio::{sync::Notify, time::Instant}; +use tokio::sync::Notify; +use tokio::time::Instant; pub struct LeakyBucketConfig { /// This is the "time cost" of a single request unit. diff --git a/libs/utils/src/linux_socket_ioctl.rs b/libs/utils/src/linux_socket_ioctl.rs index 5ae0e86af8..766529838c 100644 --- a/libs/utils/src/linux_socket_ioctl.rs +++ b/libs/utils/src/linux_socket_ioctl.rs @@ -2,21 +2,23 @@ //! //! -use std::{ - io, - mem::MaybeUninit, - os::{fd::RawFd, raw::c_int}, -}; +use std::io; +use std::mem::MaybeUninit; +use std::os::fd::RawFd; +use std::os::raw::c_int; use nix::libc::{FIONREAD, TIOCOUTQ}; unsafe fn do_ioctl(socket_fd: RawFd, cmd: nix::libc::Ioctl) -> io::Result { let mut inq: MaybeUninit = MaybeUninit::uninit(); - let err = nix::libc::ioctl(socket_fd, cmd, inq.as_mut_ptr()); - if err == 0 { - Ok(inq.assume_init()) - } else { - Err(io::Error::last_os_error()) + // SAFETY: encapsulating fn is unsafe, we require `socket_fd` to be a valid file descriptor + unsafe { + let err = nix::libc::ioctl(socket_fd, cmd, inq.as_mut_ptr()); + if err == 0 { + Ok(inq.assume_init()) + } else { + Err(io::Error::last_os_error()) + } } } @@ -24,12 +26,14 @@ unsafe fn do_ioctl(socket_fd: RawFd, cmd: nix::libc::Ioctl) -> io::Result /// /// Caller must ensure that `socket_fd` is a valid TCP socket file descriptor. pub unsafe fn inq(socket_fd: RawFd) -> io::Result { - do_ioctl(socket_fd, FIONREAD) + // SAFETY: encapsulating fn is unsafe + unsafe { do_ioctl(socket_fd, FIONREAD) } } /// # Safety /// /// Caller must ensure that `socket_fd` is a valid TCP socket file descriptor. pub unsafe fn outq(socket_fd: RawFd) -> io::Result { - do_ioctl(socket_fd, TIOCOUTQ) + // SAFETY: encapsulating fn is unsafe + unsafe { do_ioctl(socket_fd, TIOCOUTQ) } } diff --git a/libs/utils/src/lock_file.rs b/libs/utils/src/lock_file.rs index 3a2ed3e830..6aeeeca021 100644 --- a/libs/utils/src/lock_file.rs +++ b/libs/utils/src/lock_file.rs @@ -6,16 +6,15 @@ //! there for potential pitfalls with lock files that are used //! to store PIDs (pidfiles). -use std::{ - fs, - io::{Read, Write}, - ops::Deref, - os::unix::prelude::AsRawFd, -}; +use std::fs; +use std::io::{Read, Write}; +use std::ops::Deref; +use std::os::unix::prelude::AsRawFd; use anyhow::Context; use camino::{Utf8Path, Utf8PathBuf}; -use nix::{errno::Errno::EAGAIN, fcntl}; +use nix::errno::Errno::EAGAIN; +use nix::fcntl; use crate::crashsafe; diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs index 95c69ac8ba..881f1e765d 100644 --- a/libs/utils/src/logging.rs +++ b/libs/utils/src/logging.rs @@ -7,7 +7,7 @@ use metrics::{IntCounter, IntCounterVec}; use once_cell::sync::Lazy; use strum_macros::{EnumString, VariantNames}; use tokio::time::Instant; -use tracing::warn; +use tracing::info; /// Logs a critical error, similarly to `tracing::error!`. This will: /// @@ -273,7 +273,9 @@ fn log_panic_to_stderr( location: Option>, backtrace: &std::backtrace::Backtrace, ) { - eprintln!("panic while tracing is unconfigured: thread '{thread}' panicked at '{msg}', {location:?}\nStack backtrace:\n{backtrace}"); + eprintln!( + "panic while tracing is unconfigured: thread '{thread}' panicked at '{msg}', {location:?}\nStack backtrace:\n{backtrace}" + ); } struct PrettyLocation<'a, 'b>(&'a std::panic::Location<'b>); @@ -322,11 +324,13 @@ impl std::fmt::Debug for SecretString { } } -/// Logs a periodic warning if a future is slow to complete. +/// Logs a periodic message if a future is slow to complete. /// /// This is performance-sensitive as it's used on the GetPage read path. +/// +/// TODO: consider upgrading this to a warning, but currently it fires too often. #[inline] -pub async fn warn_slow(name: &str, threshold: Duration, f: impl Future) -> O { +pub async fn log_slow(name: &str, threshold: Duration, f: impl Future) -> O { // TODO: we unfortunately have to pin the future on the heap, since GetPage futures are huge and // won't fit on the stack. let mut f = Box::pin(f); @@ -345,13 +349,13 @@ pub async fn warn_slow(name: &str, threshold: Duration, f: impl Future= threshold { - warn!("slow {name} completed after {:.3}s", elapsed.as_secs_f64()); + info!("slow {name} completed after {:.3}s", elapsed.as_secs_f64()); } return output; } let elapsed = started.elapsed().as_secs_f64(); - warn!("slow {name} still running after {elapsed:.3}s",); + info!("slow {name} still running after {elapsed:.3}s",); attempt += 1; } @@ -359,7 +363,8 @@ pub async fn warn_slow(name: &str, threshold: Duration, f: impl Future= num { - Ok(()) - } else { - Err(cnt) - } + if cnt >= num { Ok(()) } else { Err(cnt) } } /// Register and return a channel that will be notified when a number arrives, @@ -325,9 +322,10 @@ where #[cfg(test)] mod tests { - use super::*; use std::sync::Arc; + use super::*; + impl MonotonicCounter for i32 { fn cnt_advance(&mut self, val: i32) { assert!(*self <= val); diff --git a/libs/utils/src/serde_percent.rs b/libs/utils/src/serde_percent.rs index 36e874a161..ca1e7aa25b 100644 --- a/libs/utils/src/serde_percent.rs +++ b/libs/utils/src/serde_percent.rs @@ -12,11 +12,7 @@ pub struct Percent(#[serde(deserialize_with = "deserialize_pct_0_to_100")] u8); impl Percent { pub const fn new(pct: u8) -> Option { - if pct <= 100 { - Some(Percent(pct)) - } else { - None - } + if pct <= 100 { Some(Percent(pct)) } else { None } } pub fn get(&self) -> u8 { diff --git a/libs/utils/src/shard.rs b/libs/utils/src/shard.rs index d98284f969..c8c410a725 100644 --- a/libs/utils/src/shard.rs +++ b/libs/utils/src/shard.rs @@ -1,6 +1,7 @@ //! See `pageserver_api::shard` for description on sharding. -use std::{ops::RangeInclusive, str::FromStr}; +use std::ops::RangeInclusive; +use std::str::FromStr; use hex::FromHex; use serde::{Deserialize, Serialize}; @@ -59,11 +60,7 @@ impl ShardCount { /// This method returns the actual number of shards, i.e. if our internal value is /// zero, we return 1 (unsharded tenants have 1 shard). pub fn count(&self) -> u8 { - if self.0 > 0 { - self.0 - } else { - 1 - } + if self.0 > 0 { self.0 } else { 1 } } /// The literal internal value: this is **not** the number of shards in the diff --git a/libs/utils/src/signals.rs b/libs/utils/src/signals.rs index c37e9aea58..f2be1957c4 100644 --- a/libs/utils/src/signals.rs +++ b/libs/utils/src/signals.rs @@ -1,7 +1,7 @@ +pub use signal_hook::consts::TERM_SIGNALS; +pub use signal_hook::consts::signal::*; use signal_hook::iterator::Signals; -pub use signal_hook::consts::{signal::*, TERM_SIGNALS}; - pub enum Signal { Quit, Interrupt, diff --git a/libs/utils/src/simple_rcu.rs b/libs/utils/src/simple_rcu.rs index 6700f86e4a..fabdf9df46 100644 --- a/libs/utils/src/simple_rcu.rs +++ b/libs/utils/src/simple_rcu.rs @@ -44,8 +44,7 @@ #![warn(missing_docs)] use std::ops::Deref; -use std::sync::{Arc, Weak}; -use std::sync::{RwLock, RwLockWriteGuard}; +use std::sync::{Arc, RwLock, RwLockWriteGuard, Weak}; use tokio::sync::watch; @@ -219,10 +218,11 @@ impl RcuWaitList { #[cfg(test)] mod tests { - use super::*; use std::sync::Mutex; use std::time::Duration; + use super::*; + #[tokio::test] async fn two_writers() { let rcu = Rcu::new(1); diff --git a/libs/utils/src/sync/gate.rs b/libs/utils/src/sync/gate.rs index 0a1ed81621..93460785bf 100644 --- a/libs/utils/src/sync/gate.rs +++ b/libs/utils/src/sync/gate.rs @@ -1,10 +1,6 @@ -use std::{ - sync::{ - atomic::{AtomicBool, Ordering}, - Arc, - }, - time::Duration, -}; +use std::sync::Arc; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::time::Duration; /// Gates are a concurrency helper, primarily used for implementing safe shutdown. /// diff --git a/libs/utils/src/sync/heavier_once_cell.rs b/libs/utils/src/sync/heavier_once_cell.rs index 66c2065554..8f8401b35d 100644 --- a/libs/utils/src/sync/heavier_once_cell.rs +++ b/libs/utils/src/sync/heavier_once_cell.rs @@ -1,7 +1,6 @@ -use std::sync::{ - atomic::{AtomicUsize, Ordering}, - Arc, Mutex, MutexGuard, -}; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::{Arc, Mutex, MutexGuard}; + use tokio::sync::Semaphore; /// Custom design like [`tokio::sync::OnceCell`] but using [`OwnedSemaphorePermit`] instead of @@ -301,14 +300,13 @@ impl Drop for InitPermit { #[cfg(test)] mod tests { + use std::convert::Infallible; + use std::pin::{Pin, pin}; + use std::time::Duration; + use futures::Future; use super::*; - use std::{ - convert::Infallible, - pin::{pin, Pin}, - time::Duration, - }; #[tokio::test] async fn many_initializers() { diff --git a/libs/utils/src/sync/spsc_fold.rs b/libs/utils/src/sync/spsc_fold.rs index 0cab291d51..7dfbf40411 100644 --- a/libs/utils/src/sync/spsc_fold.rs +++ b/libs/utils/src/sync/spsc_fold.rs @@ -1,4 +1,5 @@ -use core::{future::poll_fn, task::Poll}; +use core::future::poll_fn; +use core::task::Poll; use std::sync::{Arc, Mutex}; use diatomic_waker::DiatomicWaker; diff --git a/libs/utils/src/tcp_listener.rs b/libs/utils/src/tcp_listener.rs index 6b35d3d63a..6a4a77127d 100644 --- a/libs/utils/src/tcp_listener.rs +++ b/libs/utils/src/tcp_listener.rs @@ -1,9 +1,8 @@ -use std::{ - io, - net::{TcpListener, ToSocketAddrs}, -}; +use std::io; +use std::net::{TcpListener, ToSocketAddrs}; -use nix::sys::socket::{setsockopt, sockopt::ReuseAddr}; +use nix::sys::socket::setsockopt; +use nix::sys::socket::sockopt::ReuseAddr; /// Bind a [`TcpListener`] to addr with `SO_REUSEADDR` set to true. pub fn bind(addr: A) -> io::Result { diff --git a/libs/utils/src/tracing_span_assert.rs b/libs/utils/src/tracing_span_assert.rs index add2fa7920..3d15e08400 100644 --- a/libs/utils/src/tracing_span_assert.rs +++ b/libs/utils/src/tracing_span_assert.rs @@ -172,16 +172,14 @@ fn tracing_subscriber_configured() -> bool { #[cfg(test)] mod tests { + use std::collections::HashSet; + use std::fmt::{self}; + use std::hash::{Hash, Hasher}; + use tracing_subscriber::prelude::*; use super::*; - use std::{ - collections::HashSet, - fmt::{self}, - hash::{Hash, Hasher}, - }; - struct MemoryIdentity<'a>(&'a dyn Extractor); impl MemoryIdentity<'_> { diff --git a/libs/utils/src/try_rcu.rs b/libs/utils/src/try_rcu.rs index 6b53ab1316..30540c27d0 100644 --- a/libs/utils/src/try_rcu.rs +++ b/libs/utils/src/try_rcu.rs @@ -44,10 +44,12 @@ where #[cfg(test)] mod tests { - use super::*; - use arc_swap::ArcSwap; use std::sync::Arc; + use arc_swap::ArcSwap; + + use super::*; + #[test] fn test_try_rcu_success() { let swap = ArcSwap::from(Arc::new(42)); diff --git a/libs/utils/src/vec_map.rs b/libs/utils/src/vec_map.rs index 1fe048c6f0..eded86af3e 100644 --- a/libs/utils/src/vec_map.rs +++ b/libs/utils/src/vec_map.rs @@ -1,4 +1,6 @@ -use std::{alloc::Layout, cmp::Ordering, ops::RangeBounds}; +use std::alloc::Layout; +use std::cmp::Ordering; +use std::ops::RangeBounds; #[derive(Clone, Copy, Debug, PartialEq, Eq)] pub enum VecMapOrdering { @@ -214,7 +216,8 @@ fn extract_key(entry: &(K, V)) -> &K { #[cfg(test)] mod tests { - use std::{collections::BTreeMap, ops::Bound}; + use std::collections::BTreeMap; + use std::ops::Bound; use super::{VecMap, VecMapOrdering}; diff --git a/libs/utils/src/zstd.rs b/libs/utils/src/zstd.rs index be2dcc00f5..96c2a83951 100644 --- a/libs/utils/src/zstd.rs +++ b/libs/utils/src/zstd.rs @@ -1,19 +1,14 @@ use std::io::SeekFrom; use anyhow::{Context, Result}; -use async_compression::{ - tokio::{bufread::ZstdDecoder, write::ZstdEncoder}, - zstd::CParameter, - Level, -}; +use async_compression::Level; +use async_compression::tokio::bufread::ZstdDecoder; +use async_compression::tokio::write::ZstdEncoder; +use async_compression::zstd::CParameter; use camino::Utf8Path; use nix::NixPath; -use tokio::{ - fs::{File, OpenOptions}, - io::AsyncBufRead, - io::AsyncSeekExt, - io::AsyncWriteExt, -}; +use tokio::fs::{File, OpenOptions}; +use tokio::io::{AsyncBufRead, AsyncSeekExt, AsyncWriteExt}; use tokio_tar::{Archive, Builder, HeaderMode}; use walkdir::WalkDir; diff --git a/libs/utils/tests/bin_ser_test.rs b/libs/utils/tests/bin_ser_test.rs index b995b61b78..e0c8cdde00 100644 --- a/libs/utils/tests/bin_ser_test.rs +++ b/libs/utils/tests/bin_ser_test.rs @@ -1,7 +1,8 @@ +use std::io::Read; + use bytes::{Buf, BytesMut}; use hex_literal::hex; use serde::Deserialize; -use std::io::Read; use utils::bin_ser::LeSer; #[derive(Debug, PartialEq, Eq, Deserialize)] diff --git a/libs/wal_decoder/benches/bench_interpret_wal.rs b/libs/wal_decoder/benches/bench_interpret_wal.rs index 846904cf87..ed6ba4d267 100644 --- a/libs/wal_decoder/benches/bench_interpret_wal.rs +++ b/libs/wal_decoder/benches/bench_interpret_wal.rs @@ -1,23 +1,25 @@ -use anyhow::Context; -use criterion::{criterion_group, criterion_main, Criterion}; -use futures::{stream::FuturesUnordered, StreamExt}; -use pageserver_api::shard::{ShardIdentity, ShardStripeSize}; -use postgres_ffi::{waldecoder::WalStreamDecoder, MAX_SEND_SIZE, WAL_SEGMENT_SIZE}; -use pprof::criterion::{Output, PProfProfiler}; -use serde::Deserialize; -use std::{env, num::NonZeroUsize, sync::Arc}; +use std::env; +use std::num::NonZeroUsize; +use std::sync::Arc; +use anyhow::Context; use camino::{Utf8Path, Utf8PathBuf}; use camino_tempfile::Utf8TempDir; +use criterion::{Criterion, criterion_group, criterion_main}; +use futures::StreamExt; +use futures::stream::FuturesUnordered; +use pageserver_api::shard::{ShardIdentity, ShardStripeSize}; +use postgres_ffi::waldecoder::WalStreamDecoder; +use postgres_ffi::{MAX_SEND_SIZE, WAL_SEGMENT_SIZE}; +use pprof::criterion::{Output, PProfProfiler}; use remote_storage::{ DownloadOpts, GenericRemoteStorage, ListingMode, RemoteStorageConfig, RemoteStorageKind, S3Config, }; +use serde::Deserialize; use tokio_util::sync::CancellationToken; -use utils::{ - lsn::Lsn, - shard::{ShardCount, ShardNumber}, -}; +use utils::lsn::Lsn; +use utils::shard::{ShardCount, ShardNumber}; use wal_decoder::models::InterpretedWalRecord; const S3_BUCKET: &str = "neon-github-public-dev"; @@ -31,7 +33,7 @@ const METADATA_FILENAME: &str = "metadata.json"; static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; #[allow(non_upper_case_globals)] -#[export_name = "malloc_conf"] +#[unsafe(export_name = "malloc_conf")] pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0"; async fn create_s3_client() -> anyhow::Result> { diff --git a/libs/wal_decoder/src/decoder.rs b/libs/wal_decoder/src/decoder.rs index ebb38ceb52..cb0835e894 100644 --- a/libs/wal_decoder/src/decoder.rs +++ b/libs/wal_decoder/src/decoder.rs @@ -3,8 +3,6 @@ use std::collections::HashMap; -use crate::models::*; -use crate::serialized_batch::SerializedValueBatch; use bytes::{Buf, Bytes}; use pageserver_api::key::rel_block_to_key; use pageserver_api::reltag::{RelTag, SlruKind}; @@ -14,6 +12,9 @@ use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM; use postgres_ffi::walrecord::*; use utils::lsn::Lsn; +use crate::models::*; +use crate::serialized_batch::SerializedValueBatch; + impl InterpretedWalRecord { /// Decode and interpreted raw bytes which represent one Postgres WAL record. /// Data blocks which do not match any of the provided shard identities are filtered out. diff --git a/libs/wal_decoder/src/serialized_batch.rs b/libs/wal_decoder/src/serialized_batch.rs index d76f75f51f..b451d6d8e0 100644 --- a/libs/wal_decoder/src/serialized_batch.rs +++ b/libs/wal_decoder/src/serialized_batch.rs @@ -8,20 +8,18 @@ use std::collections::{BTreeSet, HashMap}; use bytes::{Bytes, BytesMut}; -use pageserver_api::key::rel_block_to_key; +use pageserver_api::key::{CompactKey, Key, rel_block_to_key}; use pageserver_api::keyspace::KeySpace; use pageserver_api::record::NeonWalRecord; use pageserver_api::reltag::RelTag; use pageserver_api::shard::ShardIdentity; -use pageserver_api::{key::CompactKey, value::Value}; +use pageserver_api::value::Value; use postgres_ffi::walrecord::{DecodedBkpBlock, DecodedWALRecord}; -use postgres_ffi::{page_is_new, page_set_lsn, pg_constants, BLCKSZ}; +use postgres_ffi::{BLCKSZ, page_is_new, page_set_lsn, pg_constants}; use serde::{Deserialize, Serialize}; use utils::bin_ser::BeSer; use utils::lsn::Lsn; -use pageserver_api::key::Key; - use crate::models::InterpretedWalRecord; static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]); @@ -515,10 +513,11 @@ impl SerializedValueBatch { let empty = self.raw.is_empty(); if cfg!(debug_assertions) && empty { - assert!(self - .metadata - .iter() - .all(|meta| matches!(meta, ValueMeta::Observed(_)))); + assert!( + self.metadata + .iter() + .all(|meta| matches!(meta, ValueMeta::Observed(_))) + ); } !empty diff --git a/libs/wal_decoder/src/wire_format.rs b/libs/wal_decoder/src/wire_format.rs index 52ed5c70b5..5a28128dd8 100644 --- a/libs/wal_decoder/src/wire_format.rs +++ b/libs/wal_decoder/src/wire_format.rs @@ -7,15 +7,12 @@ use utils::lsn::Lsn; use utils::postgres_client::{Compression, InterpretedFormat}; use crate::models::{ - FlushUncommittedRecords, InterpretedWalRecord, InterpretedWalRecords, MetadataRecord, + FlushUncommittedRecords, InterpretedWalRecord, InterpretedWalRecords, MetadataRecord, proto, }; - use crate::serialized_batch::{ ObservedValueMeta, SerializedValueBatch, SerializedValueMeta, ValueMeta, }; -use crate::models::proto; - #[derive(Debug, thiserror::Error)] pub enum ToWireFormatError { #[error("{0}")] @@ -83,8 +80,8 @@ impl ToWireFormat for InterpretedWalRecords { format: InterpretedFormat, compression: Option, ) -> Result { - use async_compression::tokio::write::ZstdEncoder; use async_compression::Level; + use async_compression::tokio::write::ZstdEncoder; let encode_res: Result = match format { InterpretedFormat::Bincode => { diff --git a/libs/walproposer/build.rs b/libs/walproposer/build.rs index 8d5b1ade35..530ceb1327 100644 --- a/libs/walproposer/build.rs +++ b/libs/walproposer/build.rs @@ -1,9 +1,11 @@ //! Links with walproposer, pgcommon, pgport and runs bindgen on walproposer.h //! to generate Rust bindings for it. -use std::{env, path::PathBuf, process::Command}; +use std::env; +use std::path::PathBuf; +use std::process::Command; -use anyhow::{anyhow, Context}; +use anyhow::{Context, anyhow}; const WALPROPOSER_PG_VERSION: &str = "v17"; diff --git a/libs/walproposer/src/api_bindings.rs b/libs/walproposer/src/api_bindings.rs index 2fbea3fe45..d660602149 100644 --- a/libs/walproposer/src/api_bindings.rs +++ b/libs/walproposer/src/api_bindings.rs @@ -3,27 +3,14 @@ #![allow(dead_code)] -use std::ffi::CStr; -use std::ffi::CString; +use std::ffi::{CStr, CString}; -use crate::bindings::uint32; -use crate::bindings::walproposer_api; -use crate::bindings::NeonWALReadResult; -use crate::bindings::PGAsyncReadResult; -use crate::bindings::PGAsyncWriteResult; -use crate::bindings::Safekeeper; -use crate::bindings::Size; -use crate::bindings::StringInfoData; -use crate::bindings::TimestampTz; -use crate::bindings::WalProposer; -use crate::bindings::WalProposerConnStatusType; -use crate::bindings::WalProposerConnectPollStatusType; -use crate::bindings::WalProposerExecStatusType; -use crate::bindings::WalproposerShmemState; -use crate::bindings::XLogRecPtr; -use crate::walproposer::ApiImpl; -use crate::walproposer::StreamingCallback; -use crate::walproposer::WaitResult; +use crate::bindings::{ + NeonWALReadResult, PGAsyncReadResult, PGAsyncWriteResult, Safekeeper, Size, StringInfoData, + TimestampTz, WalProposer, WalProposerConnStatusType, WalProposerConnectPollStatusType, + WalProposerExecStatusType, WalproposerShmemState, XLogRecPtr, uint32, walproposer_api, +}; +use crate::walproposer::{ApiImpl, StreamingCallback, WaitResult}; extern "C" fn get_shmem_state(wp: *mut WalProposer) -> *mut WalproposerShmemState { unsafe { diff --git a/libs/walproposer/src/walproposer.rs b/libs/walproposer/src/walproposer.rs index ba75171db2..4e50c21fca 100644 --- a/libs/walproposer/src/walproposer.rs +++ b/libs/walproposer/src/walproposer.rs @@ -2,15 +2,15 @@ use std::ffi::CString; -use crate::{ - api_bindings::{create_api, take_vec_u8, Level}, - bindings::{ - NeonWALReadResult, Safekeeper, WalProposer, WalProposerBroadcast, WalProposerConfig, - WalProposerCreate, WalProposerFree, WalProposerPoll, WalProposerStart, - }, -}; use postgres_ffi::WAL_SEGMENT_SIZE; -use utils::{id::TenantTimelineId, lsn::Lsn}; +use utils::id::TenantTimelineId; +use utils::lsn::Lsn; + +use crate::api_bindings::{Level, create_api, take_vec_u8}; +use crate::bindings::{ + NeonWALReadResult, Safekeeper, WalProposer, WalProposerBroadcast, WalProposerConfig, + WalProposerCreate, WalProposerFree, WalProposerPoll, WalProposerStart, +}; /// Rust high-level wrapper for C walproposer API. Many methods are not required /// for simple cases, hence todo!() in default implementations. @@ -215,6 +215,7 @@ impl Wrapper { syncSafekeepers: config.sync_safekeepers, systemId: 0, pgTimeline: 1, + proto_version: 3, callback_data, }; let c_config = Box::into_raw(Box::new(c_config)); @@ -274,21 +275,17 @@ impl StreamingCallback { #[cfg(test)] mod tests { use core::panic; - use std::{ - cell::Cell, - sync::{atomic::AtomicUsize, mpsc::sync_channel}, - }; + use std::cell::{Cell, UnsafeCell}; + use std::ffi::CString; + use std::sync::atomic::AtomicUsize; + use std::sync::mpsc::sync_channel; - use std::cell::UnsafeCell; use utils::id::TenantTimelineId; - use crate::{ - api_bindings::Level, - bindings::{NeonWALReadResult, PG_VERSION_NUM}, - walproposer::Wrapper, - }; - use super::ApiImpl; + use crate::api_bindings::Level; + use crate::bindings::{NeonWALReadResult, PG_VERSION_NUM}; + use crate::walproposer::Wrapper; #[derive(Clone, Copy, Debug)] struct WaitEventsData { @@ -496,57 +493,64 @@ mod tests { // Messages definitions are at walproposer.h // xxx: it would be better to extract them from safekeeper crate and // use serialization/deserialization here. - let greeting_tag = (b'g' as u64).to_ne_bytes(); - let proto_version = 2_u32.to_ne_bytes(); - let pg_version: [u8; 4] = PG_VERSION_NUM.to_ne_bytes(); - let proposer_id = [0; 16]; - let system_id = 0_u64.to_ne_bytes(); - let tenant_id = ttid.tenant_id.as_arr(); - let timeline_id = ttid.timeline_id.as_arr(); - let pg_tli = 1_u32.to_ne_bytes(); - let wal_seg_size = 16777216_u32.to_ne_bytes(); + let greeting_tag = (b'g').to_be_bytes(); + let tenant_id = CString::new(ttid.tenant_id.to_string()) + .unwrap() + .into_bytes_with_nul(); + let timeline_id = CString::new(ttid.timeline_id.to_string()) + .unwrap() + .into_bytes_with_nul(); + let mconf_gen = 0_u32.to_be_bytes(); + let mconf_members_len = 0_u32.to_be_bytes(); + let mconf_members_new_len = 0_u32.to_be_bytes(); + let pg_version: [u8; 4] = PG_VERSION_NUM.to_be_bytes(); + let system_id = 0_u64.to_be_bytes(); + let wal_seg_size = 16777216_u32.to_be_bytes(); + let proposer_greeting = [ greeting_tag.as_slice(), - proto_version.as_slice(), - pg_version.as_slice(), - proposer_id.as_slice(), - system_id.as_slice(), tenant_id.as_slice(), timeline_id.as_slice(), - pg_tli.as_slice(), + mconf_gen.as_slice(), + mconf_members_len.as_slice(), + mconf_members_new_len.as_slice(), + pg_version.as_slice(), + system_id.as_slice(), wal_seg_size.as_slice(), ] .concat(); - let voting_tag = (b'v' as u64).to_ne_bytes(); - let vote_request_term = 3_u64.to_ne_bytes(); - let proposer_id = [0; 16]; + let voting_tag = (b'v').to_be_bytes(); + let vote_request_term = 3_u64.to_be_bytes(); let vote_request = [ voting_tag.as_slice(), + mconf_gen.as_slice(), vote_request_term.as_slice(), - proposer_id.as_slice(), ] .concat(); - let acceptor_greeting_term = 2_u64.to_ne_bytes(); - let acceptor_greeting_node_id = 1_u64.to_ne_bytes(); + let acceptor_greeting_term = 2_u64.to_be_bytes(); + let acceptor_greeting_node_id = 1_u64.to_be_bytes(); let acceptor_greeting = [ greeting_tag.as_slice(), - acceptor_greeting_term.as_slice(), acceptor_greeting_node_id.as_slice(), + mconf_gen.as_slice(), + mconf_members_len.as_slice(), + mconf_members_new_len.as_slice(), + acceptor_greeting_term.as_slice(), ] .concat(); - let vote_response_term = 3_u64.to_ne_bytes(); - let vote_given = 1_u64.to_ne_bytes(); - let flush_lsn = 0x539_u64.to_ne_bytes(); - let truncate_lsn = 0x539_u64.to_ne_bytes(); - let th_len = 1_u32.to_ne_bytes(); - let th_term = 2_u64.to_ne_bytes(); - let th_lsn = 0x539_u64.to_ne_bytes(); - let timeline_start_lsn = 0x539_u64.to_ne_bytes(); + let vote_response_term = 3_u64.to_be_bytes(); + let vote_given = 1_u8.to_be_bytes(); + let flush_lsn = 0x539_u64.to_be_bytes(); + let truncate_lsn = 0x539_u64.to_be_bytes(); + let th_len = 1_u32.to_be_bytes(); + let th_term = 2_u64.to_be_bytes(); + let th_lsn = 0x539_u64.to_be_bytes(); let vote_response = [ voting_tag.as_slice(), + mconf_gen.as_slice(), vote_response_term.as_slice(), vote_given.as_slice(), flush_lsn.as_slice(), @@ -554,7 +558,6 @@ mod tests { th_len.as_slice(), th_term.as_slice(), th_lsn.as_slice(), - timeline_start_lsn.as_slice(), ] .concat(); diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 9d4463d595..7330856be4 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "pageserver" version = "0.1.0" -edition.workspace = true +edition = "2024" license.workspace = true [features] diff --git a/pageserver/benches/bench_ingest.rs b/pageserver/benches/bench_ingest.rs index b67a9cc479..b1103948d6 100644 --- a/pageserver/benches/bench_ingest.rs +++ b/pageserver/benches/bench_ingest.rs @@ -1,22 +1,20 @@ -use std::{env, num::NonZeroUsize}; +use std::env; +use std::num::NonZeroUsize; use bytes::Bytes; use camino::Utf8PathBuf; -use criterion::{criterion_group, criterion_main, Criterion}; -use pageserver::{ - config::PageServerConf, - context::{DownloadBehavior, RequestContext}, - l0_flush::{L0FlushConfig, L0FlushGlobalState}, - page_cache, - task_mgr::TaskKind, - tenant::storage_layer::InMemoryLayer, - virtual_file, -}; -use pageserver_api::{key::Key, shard::TenantShardId, value::Value}; -use utils::{ - bin_ser::BeSer, - id::{TenantId, TimelineId}, -}; +use criterion::{Criterion, criterion_group, criterion_main}; +use pageserver::config::PageServerConf; +use pageserver::context::{DownloadBehavior, RequestContext}; +use pageserver::l0_flush::{L0FlushConfig, L0FlushGlobalState}; +use pageserver::task_mgr::TaskKind; +use pageserver::tenant::storage_layer::InMemoryLayer; +use pageserver::{page_cache, virtual_file}; +use pageserver_api::key::Key; +use pageserver_api::shard::TenantShardId; +use pageserver_api::value::Value; +use utils::bin_ser::BeSer; +use utils::id::{TenantId, TimelineId}; use wal_decoder::serialized_batch::SerializedValueBatch; // A very cheap hash for generating non-sequential keys. diff --git a/pageserver/benches/bench_layer_map.rs b/pageserver/benches/bench_layer_map.rs index 5c5b52db44..e11af49449 100644 --- a/pageserver/benches/bench_layer_map.rs +++ b/pageserver/benches/bench_layer_map.rs @@ -1,23 +1,21 @@ -use criterion::measurement::WallTime; -use pageserver::keyspace::{KeyPartitioning, KeySpace}; -use pageserver::tenant::layer_map::LayerMap; -use pageserver::tenant::storage_layer::LayerName; -use pageserver::tenant::storage_layer::PersistentLayerDesc; -use pageserver_api::key::Key; -use pageserver_api::shard::TenantShardId; -use rand::prelude::{SeedableRng, SliceRandom, StdRng}; use std::cmp::{max, min}; use std::fs::File; use std::io::{BufRead, BufReader}; use std::path::PathBuf; use std::str::FromStr; use std::time::Instant; + +use criterion::measurement::WallTime; +use criterion::{BenchmarkGroup, Criterion, black_box, criterion_group, criterion_main}; +use pageserver::keyspace::{KeyPartitioning, KeySpace}; +use pageserver::tenant::layer_map::LayerMap; +use pageserver::tenant::storage_layer::{LayerName, PersistentLayerDesc}; +use pageserver_api::key::Key; +use pageserver_api::shard::TenantShardId; +use rand::prelude::{SeedableRng, SliceRandom, StdRng}; use utils::id::{TenantId, TimelineId}; - use utils::lsn::Lsn; -use criterion::{black_box, criterion_group, criterion_main, BenchmarkGroup, Criterion}; - fn fixture_path(relative: &str) -> PathBuf { PathBuf::from(env!("CARGO_MANIFEST_DIR")).join(relative) } diff --git a/pageserver/benches/bench_walredo.rs b/pageserver/benches/bench_walredo.rs index d3551b56e1..77b3f90b3e 100644 --- a/pageserver/benches/bench_walredo.rs +++ b/pageserver/benches/bench_walredo.rs @@ -56,20 +56,23 @@ //! medium/128 time: [10.412 ms 10.574 ms 10.718 ms] //! ``` +use std::future::Future; +use std::sync::Arc; +use std::time::{Duration, Instant}; + use anyhow::Context; use bytes::{Buf, Bytes}; use criterion::{BenchmarkId, Criterion}; use once_cell::sync::Lazy; -use pageserver::{config::PageServerConf, walredo::PostgresRedoManager}; +use pageserver::config::PageServerConf; +use pageserver::walredo::PostgresRedoManager; +use pageserver_api::key::Key; use pageserver_api::record::NeonWalRecord; -use pageserver_api::{key::Key, shard::TenantShardId}; -use std::{ - future::Future, - sync::Arc, - time::{Duration, Instant}, -}; -use tokio::{sync::Barrier, task::JoinSet}; -use utils::{id::TenantId, lsn::Lsn}; +use pageserver_api::shard::TenantShardId; +use tokio::sync::Barrier; +use tokio::task::JoinSet; +use utils::id::TenantId; +use utils::lsn::Lsn; fn bench(c: &mut Criterion) { macro_rules! bench_group { diff --git a/pageserver/benches/upload_queue.rs b/pageserver/benches/upload_queue.rs index ed5daa8ae1..8de06a6c25 100644 --- a/pageserver/benches/upload_queue.rs +++ b/pageserver/benches/upload_queue.rs @@ -1,15 +1,15 @@ //! Upload queue benchmarks. use std::str::FromStr as _; -use std::sync::atomic::AtomicU32; use std::sync::Arc; +use std::sync::atomic::AtomicU32; -use criterion::{criterion_group, criterion_main, Bencher, Criterion}; +use criterion::{Bencher, Criterion, criterion_group, criterion_main}; +use pageserver::tenant::IndexPart; use pageserver::tenant::metadata::TimelineMetadata; use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; use pageserver::tenant::storage_layer::LayerName; use pageserver::tenant::upload_queue::{Delete, UploadOp, UploadQueue, UploadTask}; -use pageserver::tenant::IndexPart; use pprof::criterion::{Output, PProfProfiler}; use utils::generation::Generation; use utils::shard::{ShardCount, ShardIndex, ShardNumber}; diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index bb0f64ca32..37c914c4e9 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -1,17 +1,15 @@ -use std::{collections::HashMap, error::Error as _}; +use std::collections::HashMap; +use std::error::Error as _; use bytes::Bytes; -use reqwest::{IntoUrl, Method, StatusCode}; - use detach_ancestor::AncestorDetached; use http_utils::error::HttpErrorBody; -use pageserver_api::{models::*, shard::TenantShardId}; -use utils::{ - id::{TenantId, TimelineId}, - lsn::Lsn, -}; - +use pageserver_api::models::*; +use pageserver_api::shard::TenantShardId; pub use reqwest::Body as ReqwestBody; +use reqwest::{IntoUrl, Method, StatusCode}; +use utils::id::{TenantId, TimelineId}; +use utils::lsn::Lsn; use crate::BlockUnblock; @@ -482,6 +480,7 @@ impl Client { tenant_shard_id: TenantShardId, timeline_id: TimelineId, concurrency: Option, + recurse: bool, ) -> Result<()> { let mut path = reqwest::Url::parse(&format!( "{}/v1/tenant/{}/timeline/{}/download_heatmap_layers", @@ -489,6 +488,9 @@ impl Client { )) .expect("Cannot build URL"); + path.query_pairs_mut() + .append_pair("recurse", &format!("{}", recurse)); + if let Some(concurrency) = concurrency { path.query_pairs_mut() .append_pair("concurrency", &format!("{}", concurrency)); diff --git a/pageserver/client/src/page_service.rs b/pageserver/client/src/page_service.rs index 47da83b0eb..ef35ac2f48 100644 --- a/pageserver/client/src/page_service.rs +++ b/pageserver/client/src/page_service.rs @@ -1,23 +1,16 @@ use std::sync::{Arc, Mutex}; -use futures::{ - stream::{SplitSink, SplitStream}, - SinkExt, StreamExt, -}; -use pageserver_api::{ - models::{ - PagestreamBeMessage, PagestreamFeMessage, PagestreamGetPageRequest, - PagestreamGetPageResponse, - }, - reltag::RelTag, +use futures::stream::{SplitSink, SplitStream}; +use futures::{SinkExt, StreamExt}; +use pageserver_api::models::{ + PagestreamBeMessage, PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse, }; +use pageserver_api::reltag::RelTag; use tokio::task::JoinHandle; use tokio_postgres::CopyOutStream; use tokio_util::sync::CancellationToken; -use utils::{ - id::{TenantId, TimelineId}, - lsn::Lsn, -}; +use utils::id::{TenantId, TimelineId}; +use utils::lsn::Lsn; pub struct Client { client: tokio_postgres::Client, diff --git a/pageserver/compaction/src/bin/compaction-simulator.rs b/pageserver/compaction/src/bin/compaction-simulator.rs index c308694ae1..dd35417333 100644 --- a/pageserver/compaction/src/bin/compaction-simulator.rs +++ b/pageserver/compaction/src/bin/compaction-simulator.rs @@ -1,11 +1,11 @@ -use clap::{Parser, Subcommand}; -use pageserver_compaction::helpers::PAGE_SZ; -use pageserver_compaction::simulator::MockTimeline; -use rand::Rng; use std::io::Write; use std::path::{Path, PathBuf}; use std::sync::OnceLock; +use clap::{Parser, Subcommand}; +use pageserver_compaction::helpers::PAGE_SZ; +use pageserver_compaction::simulator::MockTimeline; +use rand::Rng; use utils::project_git_version; project_git_version!(GIT_VERSION); @@ -157,8 +157,9 @@ async fn run_suite() -> anyhow::Result<()> { use std::fs::File; use std::io::Stdout; use std::sync::Mutex; -use tracing_subscriber::fmt::writer::EitherWriter; + use tracing_subscriber::fmt::MakeWriter; +use tracing_subscriber::fmt::writer::EitherWriter; static LOG_FILE: OnceLock>> = OnceLock::new(); fn get_log_output() -> &'static Mutex> { diff --git a/pageserver/compaction/src/compact_tiered.rs b/pageserver/compaction/src/compact_tiered.rs index 7779ffaf8b..75f43d7ff7 100644 --- a/pageserver/compaction/src/compact_tiered.rs +++ b/pageserver/compaction/src/compact_tiered.rs @@ -17,20 +17,19 @@ //! distance of image layers in LSN dimension is roughly equal to the logical //! database size. For example, if the logical database size is 10 GB, we would //! generate new image layers every 10 GB of WAL. -use futures::StreamExt; -use pageserver_api::shard::ShardIdentity; -use tracing::{debug, info}; - use std::collections::{HashSet, VecDeque}; use std::ops::Range; -use crate::helpers::{ - accum_key_values, keyspace_total_size, merge_delta_keys_buffered, overlaps_with, PAGE_SZ, -}; -use crate::interface::*; +use futures::StreamExt; +use pageserver_api::shard::ShardIdentity; +use tracing::{debug, info}; use utils::lsn::Lsn; +use crate::helpers::{ + PAGE_SZ, accum_key_values, keyspace_total_size, merge_delta_keys_buffered, overlaps_with, +}; use crate::identify_levels::identify_level; +use crate::interface::*; /// Main entry point to compaction. /// @@ -307,7 +306,7 @@ where let mut layer_ids: Vec = Vec::new(); for layer_id in &job.input_layers { let layer = &self.layers[layer_id.0].layer; - if let Some(dl) = self.executor.downcast_delta_layer(layer).await? { + if let Some(dl) = self.executor.downcast_delta_layer(layer, ctx).await? { deltas.push(dl.clone()); layer_ids.push(*layer_id); } @@ -536,15 +535,16 @@ where let mut deltas: Vec = Vec::new(); for layer_id in &job.input_layers { let l = &self.layers[layer_id.0]; - if let Some(dl) = self.executor.downcast_delta_layer(&l.layer).await? { + if let Some(dl) = self.executor.downcast_delta_layer(&l.layer, ctx).await? { deltas.push(dl.clone()); } } // Open stream - let key_value_stream = - std::pin::pin!(merge_delta_keys_buffered::(deltas.as_slice(), ctx) + let key_value_stream = std::pin::pin!( + merge_delta_keys_buffered::(deltas.as_slice(), ctx) .await? - .map(Result::<_, anyhow::Error>::Ok)); + .map(Result::<_, anyhow::Error>::Ok) + ); let mut new_jobs = Vec::new(); // Slide a window through the keyspace diff --git a/pageserver/compaction/src/helpers.rs b/pageserver/compaction/src/helpers.rs index 6b739d85a7..421802eef3 100644 --- a/pageserver/compaction/src/helpers.rs +++ b/pageserver/compaction/src/helpers.rs @@ -1,21 +1,21 @@ //! This file contains generic utility functions over the interface types, //! which could be handy for any compaction implementation. -use crate::interface::*; +use std::collections::{BinaryHeap, VecDeque}; +use std::fmt::Display; +use std::future::Future; +use std::ops::{DerefMut, Range}; +use std::pin::Pin; +use std::task::{Poll, ready}; use futures::future::BoxFuture; use futures::{Stream, StreamExt}; use itertools::Itertools; use pageserver_api::shard::ShardIdentity; use pin_project_lite::pin_project; -use std::collections::BinaryHeap; -use std::collections::VecDeque; -use std::fmt::Display; -use std::future::Future; -use std::ops::{DerefMut, Range}; -use std::pin::Pin; -use std::task::{ready, Poll}; use utils::lsn::Lsn; +use crate::interface::*; + pub const PAGE_SZ: u64 = 8192; pub fn keyspace_total_size( @@ -221,12 +221,12 @@ where // performed implicitly when `top` is dropped). if let Some(mut top) = this.heap.peek_mut() { match top.deref_mut() { - LazyLoadLayer::Unloaded(ref mut l) => { + LazyLoadLayer::Unloaded(l) => { let fut = l.load_keys(this.ctx); this.load_future.set(Some(Box::pin(fut))); continue; } - LazyLoadLayer::Loaded(ref mut entries) => { + LazyLoadLayer::Loaded(entries) => { let result = entries.pop_front().unwrap(); if entries.is_empty() { std::collections::binary_heap::PeekMut::pop(top); diff --git a/pageserver/compaction/src/identify_levels.rs b/pageserver/compaction/src/identify_levels.rs index e04bd15396..61575e3992 100644 --- a/pageserver/compaction/src/identify_levels.rs +++ b/pageserver/compaction/src/identify_levels.rs @@ -26,15 +26,15 @@ //! file size, the file will still be considered to be part of L0 at the next //! iteration. -use anyhow::bail; use std::collections::BTreeSet; use std::ops::Range; + +use anyhow::bail; +use tracing::{info, trace}; use utils::lsn::Lsn; use crate::interface::*; -use tracing::{info, trace}; - pub struct Level { pub lsn_range: Range, pub layers: Vec, @@ -60,7 +60,11 @@ where if l.lsn_range().start < end_lsn && l.lsn_range().end > end_lsn { // shouldn't happen. Indicates that the caller passed a bogus // end_lsn. - bail!("identify_level() called with end_lsn that does not partition the LSN space: end_lsn {} intersects with layer {}", end_lsn, l.short_id()); + bail!( + "identify_level() called with end_lsn that does not partition the LSN space: end_lsn {} intersects with layer {}", + end_lsn, + l.short_id() + ); } // include image layers sitting exacty at `end_lsn`. let is_image = !l.is_delta(); @@ -246,9 +250,10 @@ impl Level { #[cfg(test)] mod tests { + use std::sync::{Arc, Mutex}; + use super::*; use crate::simulator::{Key, MockDeltaLayer, MockImageLayer, MockLayer}; - use std::sync::{Arc, Mutex}; fn delta(key_range: Range, lsn_range: Range) -> MockLayer { MockLayer::Delta(Arc::new(MockDeltaLayer { diff --git a/pageserver/compaction/src/interface.rs b/pageserver/compaction/src/interface.rs index 8ed393a645..63fbc565cc 100644 --- a/pageserver/compaction/src/interface.rs +++ b/pageserver/compaction/src/interface.rs @@ -3,9 +3,12 @@ //! //! All the heavy lifting is done by the create_image and create_delta //! functions that the implementor provides. -use futures::Future; -use pageserver_api::{key::Key, keyspace::ShardedRange, shard::ShardIdentity}; use std::ops::Range; + +use futures::Future; +use pageserver_api::key::Key; +use pageserver_api::keyspace::ShardedRange; +use pageserver_api::shard::ShardIdentity; use utils::lsn::Lsn; /// Public interface. This is the main thing that the implementor needs to provide @@ -55,6 +58,7 @@ pub trait CompactionJobExecutor { fn downcast_delta_layer( &self, layer: &Self::Layer, + ctx: &Self::RequestContext, ) -> impl Future>> + Send; // ---- diff --git a/pageserver/compaction/src/simulator.rs b/pageserver/compaction/src/simulator.rs index 673b80c313..bf9f6f2658 100644 --- a/pageserver/compaction/src/simulator.rs +++ b/pageserver/compaction/src/simulator.rs @@ -1,22 +1,17 @@ mod draw; -use draw::{LayerTraceEvent, LayerTraceFile, LayerTraceOp}; +use std::fmt::Write; +use std::ops::Range; +use std::sync::{Arc, Mutex}; +use draw::{LayerTraceEvent, LayerTraceFile, LayerTraceOp}; use futures::StreamExt; use pageserver_api::shard::ShardIdentity; use rand::Rng; use tracing::info; - use utils::lsn::Lsn; -use std::fmt::Write; -use std::ops::Range; -use std::sync::Arc; -use std::sync::Mutex; - -use crate::helpers::PAGE_SZ; -use crate::helpers::{merge_delta_keys, overlaps_with}; - +use crate::helpers::{PAGE_SZ, merge_delta_keys, overlaps_with}; use crate::interface; use crate::interface::CompactionLayer; @@ -487,6 +482,7 @@ impl interface::CompactionJobExecutor for MockTimeline { async fn downcast_delta_layer( &self, layer: &MockLayer, + _ctx: &MockRequestContext, ) -> anyhow::Result>> { Ok(match layer { MockLayer::Delta(l) => Some(l.clone()), diff --git a/pageserver/compaction/src/simulator/draw.rs b/pageserver/compaction/src/simulator/draw.rs index 4559db09f1..3d35d1b91e 100644 --- a/pageserver/compaction/src/simulator/draw.rs +++ b/pageserver/compaction/src/simulator/draw.rs @@ -1,14 +1,14 @@ -use super::Key; -use anyhow::Result; use std::cmp::Ordering; -use std::{ - collections::{BTreeMap, BTreeSet, HashSet}, - fmt::Write, - ops::Range, -}; -use svg_fmt::{rgb, BeginSvg, EndSvg, Fill, Stroke, Style}; +use std::collections::{BTreeMap, BTreeSet, HashSet}; +use std::fmt::Write; +use std::ops::Range; + +use anyhow::Result; +use svg_fmt::{BeginSvg, EndSvg, Fill, Stroke, Style, rgb}; use utils::lsn::Lsn; +use super::Key; + // Map values to their compressed coordinate - the index the value // would have in a sorted and deduplicated list of all values. struct CoordinateMap { diff --git a/pageserver/ctl/src/draw_timeline_dir.rs b/pageserver/ctl/src/draw_timeline_dir.rs index 177e65ef79..80ca414543 100644 --- a/pageserver/ctl/src/draw_timeline_dir.rs +++ b/pageserver/ctl/src/draw_timeline_dir.rs @@ -50,18 +50,18 @@ //! ``` //! -use anyhow::{Context, Result}; -use pageserver_api::key::Key; use std::cmp::Ordering; +use std::collections::{BTreeMap, BTreeSet}; use std::io::{self, BufRead}; +use std::ops::Range; use std::path::PathBuf; use std::str::FromStr; -use std::{ - collections::{BTreeMap, BTreeSet}, - ops::Range, -}; -use svg_fmt::{rectangle, rgb, BeginSvg, EndSvg, Fill, Stroke}; -use utils::{lsn::Lsn, project_git_version}; + +use anyhow::{Context, Result}; +use pageserver_api::key::Key; +use svg_fmt::{BeginSvg, EndSvg, Fill, Stroke, rectangle, rgb}; +use utils::lsn::Lsn; +use utils::project_git_version; project_git_version!(GIT_VERSION); diff --git a/pageserver/ctl/src/key.rs b/pageserver/ctl/src/key.rs index c7f0719c41..600f7c412e 100644 --- a/pageserver/ctl/src/key.rs +++ b/pageserver/ctl/src/key.rs @@ -1,11 +1,10 @@ +use std::str::FromStr; + use anyhow::Context; use clap::Parser; -use pageserver_api::{ - key::Key, - reltag::{BlockNumber, RelTag, SlruKind}, - shard::{ShardCount, ShardStripeSize}, -}; -use std::str::FromStr; +use pageserver_api::key::Key; +use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; +use pageserver_api::shard::{ShardCount, ShardStripeSize}; #[derive(Parser)] pub(super) struct DescribeKeyCommand { @@ -394,7 +393,10 @@ mod tests { fn single_positional_spanalike_is_key_material() { // why is this needed? if you are checking many, then copypaste starts to appeal let strings = [ - (line!(), "2024-05-15T15:33:49.873906Z ERROR page_service_conn_main{peer_addr=A:B}:process_query{tenant_id=C timeline_id=D}:handle_pagerequests:handle_get_page_at_lsn_request{rel=1663/208101/2620_fsm blkno=2 req_lsn=0/238D98C8}: error reading relation or page version: Read error: could not find data for key 000000067F00032CE5000000000000000001 (shard ShardNumber(0)) at LSN 0/1D0A16C1, request LSN 0/238D98C8, ancestor 0/0"), + ( + line!(), + "2024-05-15T15:33:49.873906Z ERROR page_service_conn_main{peer_addr=A:B}:process_query{tenant_id=C timeline_id=D}:handle_pagerequests:handle_get_page_at_lsn_request{rel=1663/208101/2620_fsm blkno=2 req_lsn=0/238D98C8}: error reading relation or page version: Read error: could not find data for key 000000067F00032CE5000000000000000001 (shard ShardNumber(0)) at LSN 0/1D0A16C1, request LSN 0/238D98C8, ancestor 0/0", + ), (line!(), "rel=1663/208101/2620_fsm blkno=2"), (line!(), "rel=1663/208101/2620.1 blkno=2"), ]; @@ -420,7 +422,15 @@ mod tests { #[test] fn multiple_spanlike_args() { let strings = [ - (line!(), &["process_query{tenant_id=C", "timeline_id=D}:handle_pagerequests:handle_get_page_at_lsn_request{rel=1663/208101/2620_fsm", "blkno=2", "req_lsn=0/238D98C8}"][..]), + ( + line!(), + &[ + "process_query{tenant_id=C", + "timeline_id=D}:handle_pagerequests:handle_get_page_at_lsn_request{rel=1663/208101/2620_fsm", + "blkno=2", + "req_lsn=0/238D98C8}", + ][..], + ), (line!(), &["rel=1663/208101/2620_fsm", "blkno=2"][..]), (line!(), &["1663/208101/2620_fsm", "2"][..]), ]; diff --git a/pageserver/ctl/src/layer_map_analyzer.rs b/pageserver/ctl/src/layer_map_analyzer.rs index 2c350d6d86..b426f977cf 100644 --- a/pageserver/ctl/src/layer_map_analyzer.rs +++ b/pageserver/ctl/src/layer_map_analyzer.rs @@ -2,27 +2,27 @@ //! //! Currently it only analyzes holes, which are regions within the layer range that the layer contains no updates for. In the future it might do more analysis (maybe key quantiles?) but it should never return sensitive data. -use anyhow::{anyhow, Result}; -use camino::{Utf8Path, Utf8PathBuf}; -use pageserver::context::{DownloadBehavior, RequestContext}; -use pageserver::task_mgr::TaskKind; -use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; -use pageserver::virtual_file::api::IoMode; use std::cmp::Ordering; use std::collections::BinaryHeap; use std::ops::Range; use std::str::FromStr; use std::{fs, str}; +use anyhow::{Result, anyhow}; +use camino::{Utf8Path, Utf8PathBuf}; +use pageserver::context::{DownloadBehavior, RequestContext}; use pageserver::page_cache::{self, PAGE_SZ}; +use pageserver::task_mgr::TaskKind; use pageserver::tenant::block_io::FileBlockReader; use pageserver::tenant::disk_btree::{DiskBtreeReader, VisitDirection}; -use pageserver::tenant::storage_layer::delta_layer::{Summary, DELTA_KEY_SIZE}; -use pageserver::tenant::storage_layer::{range_overlaps, LayerName}; +use pageserver::tenant::storage_layer::delta_layer::{DELTA_KEY_SIZE, Summary}; +use pageserver::tenant::storage_layer::{LayerName, range_overlaps}; +use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; +use pageserver::virtual_file::api::IoMode; use pageserver::virtual_file::{self, VirtualFile}; -use pageserver_api::key::{Key, KEY_SIZE}; - -use utils::{bin_ser::BeSer, lsn::Lsn}; +use pageserver_api::key::{KEY_SIZE, Key}; +use utils::bin_ser::BeSer; +use utils::lsn::Lsn; use crate::AnalyzeLayerMapCmd; diff --git a/pageserver/ctl/src/layers.rs b/pageserver/ctl/src/layers.rs index 4c2c3ab30e..05fb35ff09 100644 --- a/pageserver/ctl/src/layers.rs +++ b/pageserver/ctl/src/layers.rs @@ -1,3 +1,4 @@ +use std::fs::{self, File}; use std::path::{Path, PathBuf}; use anyhow::Result; @@ -5,12 +6,10 @@ use camino::{Utf8Path, Utf8PathBuf}; use clap::Subcommand; use pageserver::context::{DownloadBehavior, RequestContext}; use pageserver::task_mgr::TaskKind; -use pageserver::tenant::storage_layer::{delta_layer, image_layer}; -use pageserver::tenant::storage_layer::{DeltaLayer, ImageLayer}; +use pageserver::tenant::storage_layer::{DeltaLayer, ImageLayer, delta_layer, image_layer}; use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; use pageserver::virtual_file::api::IoMode; use pageserver::{page_cache, virtual_file}; -use std::fs::{self, File}; use utils::id::{TenantId, TimelineId}; use crate::layer_map_analyzer::parse_filename; diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs index 353b4bd2f9..72a120a69b 100644 --- a/pageserver/ctl/src/main.rs +++ b/pageserver/ctl/src/main.rs @@ -11,33 +11,29 @@ mod layer_map_analyzer; mod layers; mod page_trace; -use page_trace::PageTraceCmd; -use std::{ - str::FromStr, - time::{Duration, SystemTime}, -}; +use std::str::FromStr; +use std::time::{Duration, SystemTime}; use camino::{Utf8Path, Utf8PathBuf}; use clap::{Parser, Subcommand}; use index_part::IndexPartCmd; use layers::LayerCmd; -use pageserver::{ - context::{DownloadBehavior, RequestContext}, - page_cache, - task_mgr::TaskKind, - tenant::{dump_layerfile_from_path, metadata::TimelineMetadata}, - virtual_file::{self, api::IoMode}, -}; +use page_trace::PageTraceCmd; +use pageserver::context::{DownloadBehavior, RequestContext}; +use pageserver::page_cache; +use pageserver::task_mgr::TaskKind; +use pageserver::tenant::dump_layerfile_from_path; +use pageserver::tenant::metadata::TimelineMetadata; +use pageserver::virtual_file::api::IoMode; +use pageserver::virtual_file::{self}; use pageserver_api::shard::TenantShardId; use postgres_ffi::ControlFileData; use remote_storage::{RemotePath, RemoteStorageConfig}; use tokio_util::sync::CancellationToken; -use utils::{ - id::TimelineId, - logging::{self, LogFormat, TracingErrorLayerEnablement}, - lsn::Lsn, - project_git_version, -}; +use utils::id::TimelineId; +use utils::logging::{self, LogFormat, TracingErrorLayerEnablement}; +use utils::lsn::Lsn; +use utils::project_git_version; project_git_version!(GIT_VERSION); @@ -355,7 +351,9 @@ mod tests { assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines"); assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001-0004/timelines"); assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines/"); - assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines/641e5342083b2235ee3deb8066819683"); + assert_valid( + "pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines/641e5342083b2235ee3deb8066819683", + ); assert_eq!(validate_prefix("pageserver/v1/tenants/"), None); } } diff --git a/pageserver/pagebench/src/cmd/aux_files.rs b/pageserver/pagebench/src/cmd/aux_files.rs index b869a0c6c7..bab17540f5 100644 --- a/pageserver/pagebench/src/cmd/aux_files.rs +++ b/pageserver/pagebench/src/cmd/aux_files.rs @@ -1,12 +1,12 @@ +use std::collections::HashMap; +use std::sync::Arc; +use std::time::Instant; + use pageserver_api::models::{TenantConfig, TenantConfigRequest}; use pageserver_api::shard::TenantShardId; use utils::id::TenantTimelineId; use utils::lsn::Lsn; -use std::collections::HashMap; -use std::sync::Arc; -use std::time::Instant; - /// Ingest aux files into the pageserver. #[derive(clap::Parser)] pub(crate) struct Args { diff --git a/pageserver/pagebench/src/cmd/basebackup.rs b/pageserver/pagebench/src/cmd/basebackup.rs index 3ae6d99aa7..51d7d5df89 100644 --- a/pageserver/pagebench/src/cmd/basebackup.rs +++ b/pageserver/pagebench/src/cmd/basebackup.rs @@ -1,16 +1,3 @@ -use anyhow::Context; -use pageserver_api::shard::TenantShardId; -use pageserver_client::mgmt_api::ForceAwaitLogicalSize; -use pageserver_client::page_service::BasebackupRequest; - -use utils::id::TenantTimelineId; -use utils::lsn::Lsn; - -use rand::prelude::*; -use tokio::sync::Barrier; -use tokio::task::JoinSet; -use tracing::{info, instrument}; - use std::collections::HashMap; use std::num::NonZeroUsize; use std::ops::Range; @@ -18,6 +5,17 @@ use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering}; use std::sync::{Arc, Mutex}; use std::time::Instant; +use anyhow::Context; +use pageserver_api::shard::TenantShardId; +use pageserver_client::mgmt_api::ForceAwaitLogicalSize; +use pageserver_client::page_service::BasebackupRequest; +use rand::prelude::*; +use tokio::sync::Barrier; +use tokio::task::JoinSet; +use tracing::{info, instrument}; +use utils::id::TenantTimelineId; +use utils::lsn::Lsn; + use crate::util::tokio_thread_local_stats::AllThreadLocalStats; use crate::util::{request_stats, tokio_thread_local_stats}; diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs index a60efc7567..617676c079 100644 --- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs +++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs @@ -1,18 +1,3 @@ -use anyhow::Context; -use camino::Utf8PathBuf; -use pageserver_api::key::Key; -use pageserver_api::keyspace::KeySpaceAccum; -use pageserver_api::models::{PagestreamGetPageRequest, PagestreamRequest}; - -use pageserver_api::shard::TenantShardId; -use tokio_util::sync::CancellationToken; -use utils::id::TenantTimelineId; -use utils::lsn::Lsn; - -use rand::prelude::*; -use tokio::task::JoinSet; -use tracing::info; - use std::collections::{HashSet, VecDeque}; use std::future::Future; use std::num::NonZeroUsize; @@ -21,6 +6,19 @@ use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::{Arc, Mutex}; use std::time::{Duration, Instant}; +use anyhow::Context; +use camino::Utf8PathBuf; +use pageserver_api::key::Key; +use pageserver_api::keyspace::KeySpaceAccum; +use pageserver_api::models::{PagestreamGetPageRequest, PagestreamRequest}; +use pageserver_api::shard::TenantShardId; +use rand::prelude::*; +use tokio::task::JoinSet; +use tokio_util::sync::CancellationToken; +use tracing::info; +use utils::id::TenantTimelineId; +use utils::lsn::Lsn; + use crate::util::tokio_thread_local_stats::AllThreadLocalStats; use crate::util::{request_stats, tokio_thread_local_stats}; diff --git a/pageserver/pagebench/src/cmd/ondemand_download_churn.rs b/pageserver/pagebench/src/cmd/ondemand_download_churn.rs index 1bb71b9353..3194e2e753 100644 --- a/pageserver/pagebench/src/cmd/ondemand_download_churn.rs +++ b/pageserver/pagebench/src/cmd/ondemand_download_churn.rs @@ -1,23 +1,19 @@ -use pageserver_api::{models::HistoricLayerInfo, shard::TenantShardId}; +use std::f64; +use std::num::NonZeroUsize; +use std::sync::Arc; +use std::sync::atomic::{AtomicU64, Ordering}; +use std::time::{Duration, Instant}; +use pageserver_api::models::HistoricLayerInfo; +use pageserver_api::shard::TenantShardId; use pageserver_client::mgmt_api; use rand::seq::SliceRandom; +use tokio::sync::{OwnedSemaphorePermit, mpsc}; +use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use tracing::{debug, info}; use utils::id::{TenantTimelineId, TimelineId}; -use std::{f64, sync::Arc}; -use tokio::{ - sync::{mpsc, OwnedSemaphorePermit}, - task::JoinSet, -}; - -use std::{ - num::NonZeroUsize, - sync::atomic::{AtomicU64, Ordering}, - time::{Duration, Instant}, -}; - /// Evict & on-demand download random layers. #[derive(clap::Parser)] pub(crate) struct Args { diff --git a/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs b/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs index f07beeecfd..16abbf9ffd 100644 --- a/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs +++ b/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs @@ -2,11 +2,10 @@ use std::sync::Arc; use humantime::Duration; use pageserver_api::shard::TenantShardId; +use pageserver_client::mgmt_api::ForceAwaitLogicalSize; use tokio::task::JoinSet; use utils::id::TenantTimelineId; -use pageserver_client::mgmt_api::ForceAwaitLogicalSize; - #[derive(clap::Parser)] pub(crate) struct Args { #[clap(long, default_value = "http://localhost:9898")] diff --git a/pageserver/pagebench/src/util/request_stats.rs b/pageserver/pagebench/src/util/request_stats.rs index 4aa6950782..ebe7bc031d 100644 --- a/pageserver/pagebench/src/util/request_stats.rs +++ b/pageserver/pagebench/src/util/request_stats.rs @@ -40,9 +40,7 @@ impl Stats { } } pub(crate) fn add(&mut self, other: &Self) { - let Self { - ref mut latency_histo, - } = self; + let Self { latency_histo } = self; latency_histo.add(&other.latency_histo).unwrap(); } } diff --git a/pageserver/src/assert_u64_eq_usize.rs b/pageserver/src/assert_u64_eq_usize.rs index 66ca7fd057..c4b8d9acba 100644 --- a/pageserver/src/assert_u64_eq_usize.rs +++ b/pageserver/src/assert_u64_eq_usize.rs @@ -2,7 +2,9 @@ pub(crate) const _ASSERT_U64_EQ_USIZE: () = { if std::mem::size_of::() != std::mem::size_of::() { - panic!("the traits defined in this module assume that usize and u64 can be converted to each other without loss of information"); + panic!( + "the traits defined in this module assume that usize and u64 can be converted to each other without loss of information" + ); } }; diff --git a/pageserver/src/aux_file.rs b/pageserver/src/aux_file.rs index 5cc20a70b2..b76c0e045f 100644 --- a/pageserver/src/aux_file.rs +++ b/pageserver/src/aux_file.rs @@ -2,7 +2,7 @@ use std::sync::Arc; use ::metrics::IntGauge; use bytes::{Buf, BufMut, Bytes}; -use pageserver_api::key::{Key, AUX_KEY_PREFIX, METADATA_KEY_SIZE}; +use pageserver_api::key::{AUX_KEY_PREFIX, Key, METADATA_KEY_SIZE}; use tracing::warn; // BEGIN Copyright (c) 2017 Servo Contributors diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 99b0775316..ce54bd9c1c 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -10,33 +10,31 @@ //! This module is responsible for creation of such tarball //! from data stored in object storage. //! -use anyhow::{anyhow, Context}; -use bytes::{BufMut, Bytes, BytesMut}; -use fail::fail_point; -use pageserver_api::key::{rel_block_to_key, Key}; -use postgres_ffi::pg_constants; use std::fmt::Write as FmtWrite; use std::time::{Instant, SystemTime}; + +use anyhow::{Context, anyhow}; +use bytes::{BufMut, Bytes, BytesMut}; +use fail::fail_point; +use pageserver_api::key::{Key, rel_block_to_key}; +use pageserver_api::reltag::{RelTag, SlruKind}; +use postgres_ffi::pg_constants::{ + DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID, PG_HBA, PGDATA_SPECIAL_FILES, +}; +use postgres_ffi::relfile_utils::{INIT_FORKNUM, MAIN_FORKNUM}; +use postgres_ffi::{ + BLCKSZ, PG_TLI, RELSEG_SIZE, WAL_SEGMENT_SIZE, XLogFileName, dispatch_pgversion, pg_constants, +}; use tokio::io; use tokio::io::AsyncWrite; -use tracing::*; - use tokio_tar::{Builder, EntryType, Header}; +use tracing::*; +use utils::lsn::Lsn; use crate::context::RequestContext; use crate::pgdatadir_mapping::Version; -use crate::tenant::storage_layer::IoConcurrency; use crate::tenant::Timeline; -use pageserver_api::reltag::{RelTag, SlruKind}; - -use postgres_ffi::dispatch_pgversion; -use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID}; -use postgres_ffi::pg_constants::{PGDATA_SPECIAL_FILES, PG_HBA}; -use postgres_ffi::relfile_utils::{INIT_FORKNUM, MAIN_FORKNUM}; -use postgres_ffi::XLogFileName; -use postgres_ffi::PG_TLI; -use postgres_ffi::{BLCKSZ, RELSEG_SIZE, WAL_SEGMENT_SIZE}; -use utils::lsn::Lsn; +use crate::tenant::storage_layer::IoConcurrency; #[derive(Debug, thiserror::Error)] pub enum BasebackupError { diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index e2b9a7f073..703629aed5 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -3,49 +3,42 @@ //! Main entry point for the Page Server executable. use std::env; -use std::env::{var, VarError}; +use std::env::{VarError, var}; use std::io::Read; use std::str::FromStr; use std::sync::Arc; use std::time::Duration; -use anyhow::{anyhow, Context}; +use anyhow::{Context, anyhow}; use camino::Utf8Path; use clap::{Arg, ArgAction, Command}; - -use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp}; -use pageserver::config::PageserverIdentity; +use metrics::launch_timestamp::{LaunchTimestamp, set_launch_timestamp_metric}; +use metrics::set_build_info_metric; +use nix::sys::socket::{setsockopt, sockopt}; +use pageserver::config::{PageServerConf, PageserverIdentity}; use pageserver::controller_upcall_client::ControllerUpcallClient; +use pageserver::deletion_queue::DeletionQueue; use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task}; use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING}; -use pageserver::task_mgr::{COMPUTE_REQUEST_RUNTIME, WALRECEIVER_RUNTIME}; -use pageserver::tenant::{secondary, TenantSharedResources}; -use pageserver::{CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener}; +use pageserver::task_mgr::{ + BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME, WALRECEIVER_RUNTIME, +}; +use pageserver::tenant::{TenantSharedResources, mgr, secondary}; +use pageserver::{ + CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener, http, page_cache, page_service, + task_mgr, virtual_file, +}; +use postgres_backend::AuthType; use remote_storage::GenericRemoteStorage; use tokio::signal::unix::SignalKind; use tokio::time::Instant; use tokio_util::sync::CancellationToken; use tracing::*; - -use metrics::set_build_info_metric; -use pageserver::{ - config::PageServerConf, - deletion_queue::DeletionQueue, - http, page_cache, page_service, task_mgr, - task_mgr::{BACKGROUND_RUNTIME, MGMT_REQUEST_RUNTIME}, - tenant::mgr, - virtual_file, -}; -use postgres_backend::AuthType; +use utils::auth::{JwtAuth, SwappableJwtAuth}; use utils::crashsafe::syncfs; -use utils::failpoint_support; use utils::logging::TracingErrorLayerEnablement; -use utils::{ - auth::{JwtAuth, SwappableJwtAuth}, - logging, project_build_tag, project_git_version, - sentry_init::init_sentry, - tcp_listener, -}; +use utils::sentry_init::init_sentry; +use utils::{failpoint_support, logging, project_build_tag, project_git_version, tcp_listener}; project_git_version!(GIT_VERSION); project_build_tag!(BUILD_TAG); @@ -57,7 +50,7 @@ static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; /// This adds roughly 3% overhead for allocations on average, which is acceptable considering /// performance-sensitive code will avoid allocations as far as possible anyway. #[allow(non_upper_case_globals)] -#[export_name = "malloc_conf"] +#[unsafe(export_name = "malloc_conf")] pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0"; const PID_FILE_NAME: &str = "pageserver.pid"; @@ -85,6 +78,9 @@ fn main() -> anyhow::Result<()> { return Ok(()); } + // Initialize up failpoints support + let scenario = failpoint_support::init(); + let workdir = arg_matches .get_one::("workdir") .map(Utf8Path::new) @@ -178,9 +174,6 @@ fn main() -> anyhow::Result<()> { } } - // Initialize up failpoints support - let scenario = failpoint_support::init(); - // Basic initialization of things that don't change after startup tracing::info!("Initializing virtual_file..."); virtual_file::init( @@ -217,7 +210,9 @@ fn initialize_config( Ok(mut f) => { let md = f.metadata().context("stat config file")?; if !md.is_file() { - anyhow::bail!("Pageserver found identity file but it is a dir entry: {identity_file_path}. Aborting start up ..."); + anyhow::bail!( + "Pageserver found identity file but it is a dir entry: {identity_file_path}. Aborting start up ..." + ); } let mut s = String::new(); @@ -225,7 +220,9 @@ fn initialize_config( toml_edit::de::from_str::(&s)? } Err(e) => { - anyhow::bail!("Pageserver could not read identity file: {identity_file_path}: {e}. Aborting start up ..."); + anyhow::bail!( + "Pageserver could not read identity file: {identity_file_path}: {e}. Aborting start up ..." + ); } }; @@ -351,6 +348,13 @@ fn start_pageserver( info!("Starting pageserver pg protocol handler on {pg_addr}"); let pageserver_listener = tcp_listener::bind(pg_addr)?; + // Enable SO_KEEPALIVE on the socket, to detect dead connections faster. + // These are configured via net.ipv4.tcp_keepalive_* sysctls. + // + // TODO: also set this on the walreceiver socket, but tokio-postgres doesn't + // support enabling keepalives while using the default OS sysctls. + setsockopt(&pageserver_listener, sockopt::KeepAlive, &true)?; + // Launch broker client // The storage_broker::connect call needs to happen inside a tokio runtime thread. let broker_client = WALRECEIVER_RUNTIME @@ -401,11 +405,9 @@ fn start_pageserver( Err(VarError::NotPresent) => { info!("No JWT token for authentication with Safekeeper detected"); } - Err(e) => { - return Err(e).with_context(|| { - "Failed to either load to detect non-present NEON_AUTH_TOKEN environment variable" - }) - } + Err(e) => return Err(e).with_context( + || "Failed to either load to detect non-present NEON_AUTH_TOKEN environment variable", + ), }; // Top-level cancellation token for the process @@ -711,7 +713,9 @@ async fn create_remote_storage_client( // wrapper that simulates failures. if conf.test_remote_failures > 0 { if !cfg!(feature = "testing") { - anyhow::bail!("test_remote_failures option is not available because pageserver was compiled without the 'testing' feature"); + anyhow::bail!( + "test_remote_failures option is not available because pageserver was compiled without the 'testing' feature" + ); } info!( "Simulating remote failures for first {} attempts of each op", diff --git a/pageserver/src/bin/test_helper_slow_client_reads.rs b/pageserver/src/bin/test_helper_slow_client_reads.rs index c1ce332b6c..0215dd06fb 100644 --- a/pageserver/src/bin/test_helper_slow_client_reads.rs +++ b/pageserver/src/bin/test_helper_slow_client_reads.rs @@ -1,14 +1,10 @@ -use std::{ - io::{stdin, stdout, Read, Write}, - time::Duration, -}; +use std::io::{Read, Write, stdin, stdout}; +use std::time::Duration; use clap::Parser; use pageserver_api::models::{PagestreamRequest, PagestreamTestRequest}; -use utils::{ - id::{TenantId, TimelineId}, - lsn::Lsn, -}; +use utils::id::{TenantId, TimelineId}; +use utils::lsn::Lsn; #[derive(clap::Parser)] struct Args { diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 09d9444dd5..64d00882b9 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -4,36 +4,29 @@ //! file, or on the command line. //! See also `settings.md` for better description on every parameter. -use anyhow::{bail, ensure, Context}; -use pageserver_api::models::ImageCompressionAlgorithm; -use pageserver_api::{ - config::{DiskUsageEvictionTaskConfig, MaxVectoredReadBytes}, - shard::TenantShardId, -}; -use remote_storage::{RemotePath, RemoteStorageConfig}; use std::env; -use storage_broker::Uri; -use utils::logging::SecretString; -use utils::postgres_client::PostgresClientProtocol; - -use once_cell::sync::OnceCell; -use reqwest::Url; use std::num::NonZeroUsize; use std::sync::Arc; use std::time::Duration; +use anyhow::{Context, bail, ensure}; use camino::{Utf8Path, Utf8PathBuf}; +use once_cell::sync::OnceCell; +use pageserver_api::config::{DiskUsageEvictionTaskConfig, MaxVectoredReadBytes}; +use pageserver_api::models::ImageCompressionAlgorithm; +use pageserver_api::shard::TenantShardId; use postgres_backend::AuthType; -use utils::{ - id::{NodeId, TimelineId}, - logging::LogFormat, -}; +use remote_storage::{RemotePath, RemoteStorageConfig}; +use reqwest::Url; +use storage_broker::Uri; +use utils::id::{NodeId, TimelineId}; +use utils::logging::{LogFormat, SecretString}; +use utils::postgres_client::PostgresClientProtocol; use crate::tenant::storage_layer::inmemory_layer::IndexEntry; use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; -use crate::virtual_file; use crate::virtual_file::io_engine; -use crate::{TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME}; +use crate::{TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME, virtual_file}; /// Global state of pageserver. /// @@ -440,7 +433,9 @@ impl PageServerConf { io_engine::FeatureTestResult::PlatformPreferred(v) => v, // make no noise io_engine::FeatureTestResult::Worse { engine, remark } => { // TODO: bubble this up to the caller so we can tracing::warn! it. - eprintln!("auto-detected IO engine is not platform-preferred: engine={engine:?} remark={remark:?}"); + eprintln!( + "auto-detected IO engine is not platform-preferred: engine={engine:?} remark={remark:?}" + ); engine } }, diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index 7e8c00c293..0231190e69 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -1,13 +1,9 @@ //! Periodically collect consumption metrics for all active tenants //! and push them to a HTTP endpoint. -use crate::config::PageServerConf; -use crate::consumption_metrics::metrics::MetricsKey; -use crate::consumption_metrics::upload::KeyGen as _; -use crate::context::{DownloadBehavior, RequestContext}; -use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}; -use crate::tenant::size::CalculateSyntheticSizeError; -use crate::tenant::tasks::BackgroundLoopKind; -use crate::tenant::{mgr::TenantManager, LogicalSizeCalculationCause, Tenant}; +use std::collections::HashMap; +use std::sync::Arc; +use std::time::{Duration, SystemTime}; + use camino::Utf8PathBuf; use consumption_metrics::EventType; use itertools::Itertools as _; @@ -15,14 +11,21 @@ use pageserver_api::models::TenantState; use remote_storage::{GenericRemoteStorage, RemoteStorageConfig}; use reqwest::Url; use serde::{Deserialize, Serialize}; -use std::collections::HashMap; -use std::sync::Arc; -use std::time::{Duration, SystemTime}; use tokio::time::Instant; use tokio_util::sync::CancellationToken; use tracing::*; use utils::id::NodeId; +use crate::config::PageServerConf; +use crate::consumption_metrics::metrics::MetricsKey; +use crate::consumption_metrics::upload::KeyGen as _; +use crate::context::{DownloadBehavior, RequestContext}; +use crate::task_mgr::{self, BACKGROUND_RUNTIME, TaskKind}; +use crate::tenant::mgr::TenantManager; +use crate::tenant::size::CalculateSyntheticSizeError; +use crate::tenant::tasks::BackgroundLoopKind; +use crate::tenant::{LogicalSizeCalculationCause, Tenant}; + mod disk_cache; mod metrics; mod upload; diff --git a/pageserver/src/consumption_metrics/disk_cache.rs b/pageserver/src/consumption_metrics/disk_cache.rs index 54a505a134..f1dad8793d 100644 --- a/pageserver/src/consumption_metrics/disk_cache.rs +++ b/pageserver/src/consumption_metrics/disk_cache.rs @@ -1,10 +1,10 @@ -use anyhow::Context; -use camino::{Utf8Path, Utf8PathBuf}; use std::sync::Arc; -use crate::consumption_metrics::NewMetricsRefRoot; +use anyhow::Context; +use camino::{Utf8Path, Utf8PathBuf}; use super::{NewMetricsRoot, NewRawMetric, RawMetric}; +use crate::consumption_metrics::NewMetricsRefRoot; pub(super) fn read_metrics_from_serde_value( json_value: serde_json::Value, diff --git a/pageserver/src/consumption_metrics/metrics.rs b/pageserver/src/consumption_metrics/metrics.rs index 07fac09f6f..71910011ea 100644 --- a/pageserver/src/consumption_metrics/metrics.rs +++ b/pageserver/src/consumption_metrics/metrics.rs @@ -1,15 +1,16 @@ -use crate::tenant::mgr::TenantManager; -use crate::{context::RequestContext, tenant::timeline::logical_size::CurrentLogicalSize}; +use std::sync::Arc; +use std::time::SystemTime; + use chrono::{DateTime, Utc}; use consumption_metrics::EventType; use futures::stream::StreamExt; -use std::{sync::Arc, time::SystemTime}; -use utils::{ - id::{TenantId, TimelineId}, - lsn::Lsn, -}; +use utils::id::{TenantId, TimelineId}; +use utils::lsn::Lsn; use super::{Cache, NewRawMetric}; +use crate::context::RequestContext; +use crate::tenant::mgr::TenantManager; +use crate::tenant::timeline::logical_size::CurrentLogicalSize; /// Name of the metric, used by `MetricsKey` factory methods and `deserialize_cached_events` /// instead of static str. diff --git a/pageserver/src/consumption_metrics/metrics/tests.rs b/pageserver/src/consumption_metrics/metrics/tests.rs index 3ed7b44123..52b4fb8680 100644 --- a/pageserver/src/consumption_metrics/metrics/tests.rs +++ b/pageserver/src/consumption_metrics/metrics/tests.rs @@ -1,7 +1,7 @@ -use crate::consumption_metrics::RawMetric; +use std::collections::HashMap; use super::*; -use std::collections::HashMap; +use crate::consumption_metrics::RawMetric; #[test] fn startup_collected_timeline_metrics_before_advancing() { diff --git a/pageserver/src/consumption_metrics/upload.rs b/pageserver/src/consumption_metrics/upload.rs index 448bf47525..59e0145a5b 100644 --- a/pageserver/src/consumption_metrics/upload.rs +++ b/pageserver/src/consumption_metrics/upload.rs @@ -2,15 +2,16 @@ use std::error::Error as _; use std::time::SystemTime; use chrono::{DateTime, Utc}; -use consumption_metrics::{Event, EventChunk, IdempotencyKey, CHUNK_SIZE}; +use consumption_metrics::{CHUNK_SIZE, Event, EventChunk, IdempotencyKey}; use remote_storage::{GenericRemoteStorage, RemotePath}; use tokio::io::AsyncWriteExt; use tokio_util::sync::CancellationToken; use tracing::Instrument; - -use super::{metrics::Name, Cache, MetricsKey, NewRawMetric, RawMetric}; use utils::id::{TenantId, TimelineId}; +use super::metrics::Name; +use super::{Cache, MetricsKey, NewRawMetric, RawMetric}; + /// How the metrics from pageserver are identified. #[derive(serde::Serialize, serde::Deserialize, Debug, Clone, Copy, PartialEq)] struct Ids { @@ -438,14 +439,13 @@ async fn upload( #[cfg(test)] mod tests { - use crate::consumption_metrics::{ - disk_cache::read_metrics_from_serde_value, NewMetricsRefRoot, - }; - - use super::*; use chrono::{DateTime, Utc}; use once_cell::sync::Lazy; + use super::*; + use crate::consumption_metrics::NewMetricsRefRoot; + use crate::consumption_metrics::disk_cache::read_metrics_from_serde_value; + #[test] fn chunked_serialization() { let examples = metric_samples(); diff --git a/pageserver/src/controller_upcall_client.rs b/pageserver/src/controller_upcall_client.rs index 4990f17b40..6d5c727958 100644 --- a/pageserver/src/controller_upcall_client.rs +++ b/pageserver/src/controller_upcall_client.rs @@ -1,21 +1,23 @@ use std::collections::HashMap; use futures::Future; -use pageserver_api::{ - controller_api::{AvailabilityZone, NodeRegisterRequest}, - shard::TenantShardId, - upcall_api::{ - ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest, - ValidateRequestTenant, ValidateResponse, - }, +use pageserver_api::config::NodeMetadata; +use pageserver_api::controller_api::{AvailabilityZone, NodeRegisterRequest}; +use pageserver_api::shard::TenantShardId; +use pageserver_api::upcall_api::{ + ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest, + ValidateRequestTenant, ValidateResponse, }; -use serde::{de::DeserializeOwned, Serialize}; +use serde::Serialize; +use serde::de::DeserializeOwned; use tokio_util::sync::CancellationToken; use url::Url; -use utils::{backoff, failpoint_support, generation::Generation, id::NodeId}; +use utils::generation::Generation; +use utils::id::NodeId; +use utils::{backoff, failpoint_support}; -use crate::{config::PageServerConf, virtual_file::on_fatal_io_error}; -use pageserver_api::config::NodeMetadata; +use crate::config::PageServerConf; +use crate::virtual_file::on_fatal_io_error; /// The Pageserver's client for using the storage controller upcall API: this is a small API /// for dealing with generations (see docs/rfcs/025-generation-numbers.md). @@ -82,6 +84,7 @@ impl ControllerUpcallClient { }) } + #[tracing::instrument(skip_all)] async fn retry_http_forever( &self, url: &url::Url, @@ -106,7 +109,7 @@ impl ControllerUpcallClient { |_| false, 3, u32::MAX, - "calling control plane generation validation API", + "storage controller upcall", &self.cancel, ) .await @@ -123,11 +126,12 @@ impl ControllerUpcallClient { impl ControlPlaneGenerationsApi for ControllerUpcallClient { /// Block until we get a successful response, or error out if we are shut down + #[tracing::instrument(skip_all)] // so that warning logs from retry_http_forever have context async fn re_attach( &self, conf: &PageServerConf, ) -> Result, RetryForeverError> { - let re_attach_path = self + let url = self .base_url .join("re-attach") .expect("Failed to build re-attach path"); @@ -157,14 +161,18 @@ impl ControlPlaneGenerationsApi for ControllerUpcallClient { match az_id_from_metadata { Some(az_id) => Some(AvailabilityZone(az_id)), None => { - tracing::warn!("metadata.json does not contain an 'availability_zone_id' field"); + tracing::warn!( + "metadata.json does not contain an 'availability_zone_id' field" + ); conf.availability_zone.clone().map(AvailabilityZone) } } }; if az_id.is_none() { - panic!("Availablity zone id could not be inferred from metadata.json or pageserver config"); + panic!( + "Availablity zone id could not be inferred from metadata.json or pageserver config" + ); } Some(NodeRegisterRequest { @@ -199,7 +207,7 @@ impl ControlPlaneGenerationsApi for ControllerUpcallClient { register: register.clone(), }; - let response: ReAttachResponse = self.retry_http_forever(&re_attach_path, request).await?; + let response: ReAttachResponse = self.retry_http_forever(&url, request).await?; tracing::info!( "Received re-attach response with {} tenants (node {}, register: {:?})", response.tenants.len(), @@ -217,11 +225,12 @@ impl ControlPlaneGenerationsApi for ControllerUpcallClient { } /// Block until we get a successful response, or error out if we are shut down + #[tracing::instrument(skip_all)] // so that warning logs from retry_http_forever have context async fn validate( &self, tenants: Vec<(TenantShardId, Generation)>, ) -> Result, RetryForeverError> { - let re_attach_path = self + let url = self .base_url .join("validate") .expect("Failed to build validate path"); @@ -236,7 +245,7 @@ impl ControlPlaneGenerationsApi for ControllerUpcallClient { .iter() .map(|(id, generation)| ValidateRequestTenant { id: *id, - gen: (*generation).into().expect( + r#gen: (*generation).into().expect( "Generation should always be valid for a Tenant doing deletions", ), }) @@ -251,8 +260,7 @@ impl ControlPlaneGenerationsApi for ControllerUpcallClient { return Err(RetryForeverError::ShuttingDown); } - let response: ValidateResponse = - self.retry_http_forever(&re_attach_path, request).await?; + let response: ValidateResponse = self.retry_http_forever(&url, request).await?; for rt in response.tenants { result.insert(rt.id, rt.valid); } diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index a2395b0dca..8118f66252 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -6,38 +6,31 @@ use std::collections::HashMap; use std::sync::Arc; use std::time::Duration; -use crate::controller_upcall_client::ControlPlaneGenerationsApi; -use crate::metrics; -use crate::tenant::remote_timeline_client::remote_timeline_path; -use crate::tenant::remote_timeline_client::LayerFileMetadata; -use crate::virtual_file::MaybeFatalIo; -use crate::virtual_file::VirtualFile; use anyhow::Context; use camino::Utf8PathBuf; +use deleter::DeleterMessage; +use list_writer::ListWriterQueueMessage; use pageserver_api::shard::TenantShardId; use remote_storage::{GenericRemoteStorage, RemotePath}; -use serde::Deserialize; -use serde::Serialize; +use serde::{Deserialize, Serialize}; use thiserror::Error; use tokio_util::sync::CancellationToken; -use tracing::Instrument; -use tracing::{debug, error}; +use tracing::{Instrument, debug, error}; use utils::crashsafe::path_with_suffix_extension; use utils::generation::Generation; use utils::id::TimelineId; -use utils::lsn::AtomicLsn; -use utils::lsn::Lsn; - -use self::deleter::Deleter; -use self::list_writer::DeletionOp; -use self::list_writer::ListWriter; -use self::list_writer::RecoverOp; -use self::validator::Validator; -use deleter::DeleterMessage; -use list_writer::ListWriterQueueMessage; +use utils::lsn::{AtomicLsn, Lsn}; use validator::ValidatorQueueMessage; -use crate::{config::PageServerConf, tenant::storage_layer::LayerName}; +use self::deleter::Deleter; +use self::list_writer::{DeletionOp, ListWriter, RecoverOp}; +use self::validator::Validator; +use crate::config::PageServerConf; +use crate::controller_upcall_client::ControlPlaneGenerationsApi; +use crate::metrics; +use crate::tenant::remote_timeline_client::{LayerFileMetadata, remote_timeline_path}; +use crate::tenant::storage_layer::LayerName; +use crate::virtual_file::{MaybeFatalIo, VirtualFile}; // TODO: configurable for how long to wait before executing deletions @@ -664,21 +657,22 @@ impl DeletionQueue { #[cfg(test)] mod test { + use std::io::ErrorKind; + use std::time::Duration; + use camino::Utf8Path; use hex_literal::hex; - use pageserver_api::{key::Key, shard::ShardIndex, upcall_api::ReAttachResponseTenant}; - use std::{io::ErrorKind, time::Duration}; - use tracing::info; - + use pageserver_api::key::Key; + use pageserver_api::shard::ShardIndex; + use pageserver_api::upcall_api::ReAttachResponseTenant; use remote_storage::{RemoteStorageConfig, RemoteStorageKind}; use tokio::task::JoinHandle; - - use crate::{ - controller_upcall_client::RetryForeverError, - tenant::{harness::TenantHarness, storage_layer::DeltaLayerName}, - }; + use tracing::info; use super::*; + use crate::controller_upcall_client::RetryForeverError; + use crate::tenant::harness::TenantHarness; + use crate::tenant::storage_layer::DeltaLayerName; pub const TIMELINE_ID: TimelineId = TimelineId::from_array(hex!("11223344556677881122334455667788")); @@ -724,26 +718,26 @@ mod test { .expect("Failed to join workers for previous deletion queue"); } - fn set_latest_generation(&self, gen: Generation) { + fn set_latest_generation(&self, gen_: Generation) { let tenant_shard_id = self.harness.tenant_shard_id; self.mock_control_plane .latest_generation .lock() .unwrap() - .insert(tenant_shard_id, gen); + .insert(tenant_shard_id, gen_); } /// Returns remote layer file name, suitable for use in assert_remote_files fn write_remote_layer( &self, file_name: LayerName, - gen: Generation, + gen_: Generation, ) -> anyhow::Result { let tenant_shard_id = self.harness.tenant_shard_id; let relative_remote_path = remote_timeline_path(&tenant_shard_id, &TIMELINE_ID); let remote_timeline_path = self.remote_fs_dir.join(relative_remote_path.get_path()); std::fs::create_dir_all(&remote_timeline_path)?; - let remote_layer_file_name = format!("{}{}", file_name, gen.get_suffix()); + let remote_layer_file_name = format!("{}{}", file_name, gen_.get_suffix()); let content: Vec = format!("placeholder contents of {file_name}").into(); @@ -1098,11 +1092,12 @@ mod test { /// or coalescing, and doesn't actually execute any deletions unless you call pump() to kick it. #[cfg(test)] pub(crate) mod mock { + use std::sync::atomic::{AtomicUsize, Ordering}; + use tracing::info; use super::*; use crate::tenant::remote_timeline_client::remote_layer_path; - use std::sync::atomic::{AtomicUsize, Ordering}; pub struct ConsumerState { rx: tokio::sync::mpsc::UnboundedReceiver, diff --git a/pageserver/src/deletion_queue/deleter.rs b/pageserver/src/deletion_queue/deleter.rs index ef1dfbac19..691ba75cc7 100644 --- a/pageserver/src/deletion_queue/deleter.rs +++ b/pageserver/src/deletion_queue/deleter.rs @@ -6,21 +6,16 @@ //! number of full-sized DeleteObjects requests, rather than a larger number of //! smaller requests. -use remote_storage::GenericRemoteStorage; -use remote_storage::RemotePath; -use remote_storage::TimeoutOrCancel; use std::time::Duration; + +use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel}; use tokio_util::sync::CancellationToken; -use tracing::info; -use tracing::warn; -use utils::backoff; -use utils::pausable_failpoint; +use tracing::{info, warn}; +use utils::{backoff, pausable_failpoint}; +use super::{DeletionQueueError, FlushOp}; use crate::metrics; -use super::DeletionQueueError; -use super::FlushOp; - const AUTOFLUSH_INTERVAL: Duration = Duration::from_secs(10); pub(super) enum DeleterMessage { diff --git a/pageserver/src/deletion_queue/list_writer.rs b/pageserver/src/deletion_queue/list_writer.rs index ae3b2c9180..a385e35a02 100644 --- a/pageserver/src/deletion_queue/list_writer.rs +++ b/pageserver/src/deletion_queue/list_writer.rs @@ -10,11 +10,6 @@ //! //! DeletionLists are passed onwards to the Validator. -use super::DeletionHeader; -use super::DeletionList; -use super::FlushOp; -use super::ValidatorQueueMessage; - use std::collections::HashMap; use std::fs::create_dir_all; use std::time::Duration; @@ -23,20 +18,17 @@ use pageserver_api::shard::TenantShardId; use regex::Regex; use remote_storage::RemotePath; use tokio_util::sync::CancellationToken; -use tracing::debug; -use tracing::info; -use tracing::warn; +use tracing::{debug, info, warn}; use utils::generation::Generation; use utils::id::TimelineId; +use super::{DeletionHeader, DeletionList, FlushOp, ValidatorQueueMessage}; use crate::config::PageServerConf; use crate::deletion_queue::TEMP_SUFFIX; use crate::metrics; -use crate::tenant::remote_timeline_client::remote_layer_path; -use crate::tenant::remote_timeline_client::LayerFileMetadata; +use crate::tenant::remote_timeline_client::{LayerFileMetadata, remote_layer_path}; use crate::tenant::storage_layer::LayerName; -use crate::virtual_file::on_fatal_io_error; -use crate::virtual_file::MaybeFatalIo; +use crate::virtual_file::{MaybeFatalIo, on_fatal_io_error}; // The number of keys in a DeletionList before we will proactively persist it // (without reaching a flush deadline). This aims to deliver objects of the order diff --git a/pageserver/src/deletion_queue/validator.rs b/pageserver/src/deletion_queue/validator.rs index 1d55581ebd..b0ce2b80b4 100644 --- a/pageserver/src/deletion_queue/validator.rs +++ b/pageserver/src/deletion_queue/validator.rs @@ -20,22 +20,14 @@ use std::time::Duration; use camino::Utf8PathBuf; use tokio_util::sync::CancellationToken; -use tracing::debug; -use tracing::info; -use tracing::warn; - -use crate::config::PageServerConf; -use crate::controller_upcall_client::ControlPlaneGenerationsApi; -use crate::controller_upcall_client::RetryForeverError; -use crate::metrics; -use crate::virtual_file::MaybeFatalIo; +use tracing::{debug, info, warn}; use super::deleter::DeleterMessage; -use super::DeletionHeader; -use super::DeletionList; -use super::DeletionQueueError; -use super::FlushOp; -use super::VisibleLsnUpdates; +use super::{DeletionHeader, DeletionList, DeletionQueueError, FlushOp, VisibleLsnUpdates}; +use crate::config::PageServerConf; +use crate::controller_upcall_client::{ControlPlaneGenerationsApi, RetryForeverError}; +use crate::metrics; +use crate::virtual_file::MaybeFatalIo; // After this length of time, do any validation work that is pending, // even if we haven't accumulated many keys to delete. @@ -190,7 +182,10 @@ where } } else { // If we failed validation, then do not apply any of the projected updates - info!("Dropped remote consistent LSN updates for tenant {tenant_id} in stale generation {:?}", tenant_lsn_state.generation); + info!( + "Dropped remote consistent LSN updates for tenant {tenant_id} in stale generation {:?}", + tenant_lsn_state.generation + ); metrics::DELETION_QUEUE.dropped_lsn_updates.inc(); } } diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index 738a783813..13252037e5 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -41,30 +41,31 @@ // - The `#[allow(dead_code)]` above various structs are to suppress warnings about only the Debug impl // reading these fields. We use the Debug impl for semi-structured logging, though. -use std::{sync::Arc, time::SystemTime}; +use std::sync::Arc; +use std::time::SystemTime; use anyhow::Context; -use pageserver_api::{config::DiskUsageEvictionTaskConfig, shard::TenantShardId}; +use pageserver_api::config::DiskUsageEvictionTaskConfig; +use pageserver_api::shard::TenantShardId; use remote_storage::GenericRemoteStorage; use serde::Serialize; use tokio::time::Instant; use tokio_util::sync::CancellationToken; -use tracing::{debug, error, info, instrument, warn, Instrument}; -use utils::{completion, id::TimelineId}; +use tracing::{Instrument, debug, error, info, instrument, warn}; +use utils::completion; +use utils::id::TimelineId; -use crate::{ - config::PageServerConf, - metrics::disk_usage_based_eviction::METRICS, - task_mgr::{self, BACKGROUND_RUNTIME}, - tenant::{ - mgr::TenantManager, - remote_timeline_client::LayerFileMetadata, - secondary::SecondaryTenant, - storage_layer::{AsLayerDesc, EvictionError, Layer, LayerName, LayerVisibilityHint}, - tasks::sleep_random, - }, - CancellableTask, DiskUsageEvictionTask, +use crate::config::PageServerConf; +use crate::metrics::disk_usage_based_eviction::METRICS; +use crate::task_mgr::{self, BACKGROUND_RUNTIME}; +use crate::tenant::mgr::TenantManager; +use crate::tenant::remote_timeline_client::LayerFileMetadata; +use crate::tenant::secondary::SecondaryTenant; +use crate::tenant::storage_layer::{ + AsLayerDesc, EvictionError, Layer, LayerName, LayerVisibilityHint, }; +use crate::tenant::tasks::sleep_random; +use crate::{CancellableTask, DiskUsageEvictionTask}; /// Selects the sort order for eviction candidates *after* per tenant `min_resident_size` /// partitioning. @@ -1007,10 +1008,14 @@ async fn collect_eviction_candidates( } } - debug_assert!(EvictionPartition::Above < EvictionPartition::Below, - "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first"); - debug_assert!(EvictionPartition::EvictNow < EvictionPartition::Above, - "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first"); + debug_assert!( + EvictionPartition::Above < EvictionPartition::Below, + "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first" + ); + debug_assert!( + EvictionPartition::EvictNow < EvictionPartition::Above, + "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first" + ); eviction_order.sort(&mut candidates); @@ -1157,9 +1162,8 @@ mod filesystem_level_usage { use anyhow::Context; use camino::Utf8Path; - use crate::statvfs::Statvfs; - use super::DiskUsageEvictionTaskConfig; + use crate::statvfs::Statvfs; #[derive(Debug, Clone, Copy)] pub struct Usage<'a> { @@ -1224,10 +1228,12 @@ mod filesystem_level_usage { #[test] fn max_usage_pct_pressure() { - use super::Usage as _; use std::time::Duration; + use utils::serde_percent::Percent; + use super::Usage as _; + let mut usage = Usage { config: &DiskUsageEvictionTaskConfig { max_usage_pct: Percent::new(85).unwrap(), diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 56a84a98a8..a3ee31d6e6 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -2,125 +2,83 @@ //! Management HTTP API //! use std::cmp::Reverse; -use std::collections::BinaryHeap; -use std::collections::HashMap; +use std::collections::{BinaryHeap, HashMap}; use std::str::FromStr; use std::sync::Arc; use std::time::Duration; -use anyhow::{anyhow, Context, Result}; +use anyhow::{Context, Result, anyhow}; use enumset::EnumSet; use futures::future::join_all; -use futures::StreamExt; -use futures::TryFutureExt; +use futures::{StreamExt, TryFutureExt}; use http_utils::endpoint::{ - profile_cpu_handler, profile_heap_handler, prometheus_metrics_handler, request_span, + self, attach_openapi_ui, auth_middleware, check_permission_with, profile_cpu_handler, + profile_heap_handler, prometheus_metrics_handler, request_span, }; +use http_utils::error::{ApiError, HttpErrorBody}; use http_utils::failpoints::failpoints_handler; -use http_utils::request::must_parse_query_param; -use http_utils::request::{get_request_param, must_get_query_param, parse_query_param}; +use http_utils::json::{json_request, json_request_maybe, json_response}; +use http_utils::request::{ + get_request_param, must_get_query_param, must_parse_query_param, parse_query_param, + parse_request_param, +}; +use http_utils::{RequestExt, RouterBuilder}; use humantime::format_rfc3339; -use hyper::header; -use hyper::StatusCode; -use hyper::{Body, Request, Response, Uri}; +use hyper::{Body, Request, Response, StatusCode, Uri, header}; use metrics::launch_timestamp::LaunchTimestamp; use pageserver_api::models::virtual_file::IoMode; -use pageserver_api::models::DownloadRemoteLayersTaskSpawnRequest; -use pageserver_api::models::IngestAuxFilesRequest; -use pageserver_api::models::ListAuxFilesRequest; -use pageserver_api::models::LocationConfig; -use pageserver_api::models::LocationConfigListResponse; -use pageserver_api::models::LocationConfigMode; -use pageserver_api::models::LsnLease; -use pageserver_api::models::LsnLeaseRequest; -use pageserver_api::models::OffloadedTimelineInfo; -use pageserver_api::models::PageTraceEvent; -use pageserver_api::models::ShardParameters; -use pageserver_api::models::TenantConfigPatchRequest; -use pageserver_api::models::TenantDetails; -use pageserver_api::models::TenantLocationConfigRequest; -use pageserver_api::models::TenantLocationConfigResponse; -use pageserver_api::models::TenantScanRemoteStorageResponse; -use pageserver_api::models::TenantScanRemoteStorageShard; -use pageserver_api::models::TenantShardLocation; -use pageserver_api::models::TenantShardSplitRequest; -use pageserver_api::models::TenantShardSplitResponse; -use pageserver_api::models::TenantSorting; -use pageserver_api::models::TenantState; -use pageserver_api::models::TenantWaitLsnRequest; -use pageserver_api::models::TimelineArchivalConfigRequest; -use pageserver_api::models::TimelineCreateRequestMode; -use pageserver_api::models::TimelineCreateRequestModeImportPgdata; -use pageserver_api::models::TimelinesInfoAndOffloaded; -use pageserver_api::models::TopTenantShardItem; -use pageserver_api::models::TopTenantShardsRequest; -use pageserver_api::models::TopTenantShardsResponse; -use pageserver_api::shard::ShardCount; -use pageserver_api::shard::TenantShardId; -use remote_storage::DownloadError; -use remote_storage::GenericRemoteStorage; -use remote_storage::TimeTravelError; +use pageserver_api::models::{ + DownloadRemoteLayersTaskSpawnRequest, IngestAuxFilesRequest, ListAuxFilesRequest, + LocationConfig, LocationConfigListResponse, LocationConfigMode, LsnLease, LsnLeaseRequest, + OffloadedTimelineInfo, PageTraceEvent, ShardParameters, StatusResponse, + TenantConfigPatchRequest, TenantConfigRequest, TenantDetails, TenantInfo, + TenantLocationConfigRequest, TenantLocationConfigResponse, TenantScanRemoteStorageResponse, + TenantScanRemoteStorageShard, TenantShardLocation, TenantShardSplitRequest, + TenantShardSplitResponse, TenantSorting, TenantState, TenantWaitLsnRequest, + TimelineArchivalConfigRequest, TimelineCreateRequest, TimelineCreateRequestMode, + TimelineCreateRequestModeImportPgdata, TimelineGcRequest, TimelineInfo, + TimelinesInfoAndOffloaded, TopTenantShardItem, TopTenantShardsRequest, TopTenantShardsResponse, +}; +use pageserver_api::shard::{ShardCount, TenantShardId}; +use remote_storage::{DownloadError, GenericRemoteStorage, TimeTravelError}; use scopeguard::defer; -use tenant_size_model::{svg::SvgBranchKind, SizeResult, StorageModel}; +use tenant_size_model::svg::SvgBranchKind; +use tenant_size_model::{SizeResult, StorageModel}; use tokio::time::Instant; use tokio_util::io::StreamReader; use tokio_util::sync::CancellationToken; use tracing::*; +use utils::auth::SwappableJwtAuth; +use utils::generation::Generation; +use utils::id::{TenantId, TimelineId}; +use utils::lsn::Lsn; use crate::config::PageServerConf; -use crate::context::RequestContextBuilder; -use crate::context::{DownloadBehavior, RequestContext}; +use crate::context::{DownloadBehavior, RequestContext, RequestContextBuilder}; use crate::deletion_queue::DeletionQueueClient; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::task_mgr::TaskKind; use crate::tenant::config::{LocationConf, TenantConfOpt}; -use crate::tenant::mgr::GetActiveTenantError; use crate::tenant::mgr::{ - GetTenantError, TenantManager, TenantMapError, TenantMapInsertError, TenantSlotError, - TenantSlotUpsertError, TenantStateError, + GetActiveTenantError, GetTenantError, TenantManager, TenantMapError, TenantMapInsertError, + TenantSlot, TenantSlotError, TenantSlotUpsertError, TenantStateError, UpsertLocationError, +}; +use crate::tenant::remote_timeline_client::{ + download_index_part, list_remote_tenant_shards, list_remote_timelines, }; -use crate::tenant::mgr::{TenantSlot, UpsertLocationError}; -use crate::tenant::remote_timeline_client; -use crate::tenant::remote_timeline_client::download_index_part; -use crate::tenant::remote_timeline_client::list_remote_tenant_shards; -use crate::tenant::remote_timeline_client::list_remote_timelines; use crate::tenant::secondary::SecondaryController; use crate::tenant::size::ModelInputs; -use crate::tenant::storage_layer::IoConcurrency; -use crate::tenant::storage_layer::LayerAccessStatsReset; -use crate::tenant::storage_layer::LayerName; -use crate::tenant::timeline::import_pgdata; -use crate::tenant::timeline::offload::offload_timeline; -use crate::tenant::timeline::offload::OffloadError; -use crate::tenant::timeline::CompactFlags; -use crate::tenant::timeline::CompactOptions; -use crate::tenant::timeline::CompactRequest; -use crate::tenant::timeline::CompactionError; -use crate::tenant::timeline::Timeline; -use crate::tenant::timeline::WaitLsnTimeout; -use crate::tenant::timeline::WaitLsnWaiter; -use crate::tenant::GetTimelineError; -use crate::tenant::OffloadedTimeline; -use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError}; -use crate::DEFAULT_PG_VERSION; -use crate::{disk_usage_eviction_task, tenant}; -use http_utils::{ - endpoint::{self, attach_openapi_ui, auth_middleware, check_permission_with}, - error::{ApiError, HttpErrorBody}, - json::{json_request, json_request_maybe, json_response}, - request::parse_request_param, - RequestExt, RouterBuilder, +use crate::tenant::storage_layer::{IoConcurrency, LayerAccessStatsReset, LayerName}; +use crate::tenant::timeline::offload::{OffloadError, offload_timeline}; +use crate::tenant::timeline::{ + CompactFlags, CompactOptions, CompactRequest, CompactionError, Timeline, WaitLsnTimeout, + WaitLsnWaiter, import_pgdata, }; -use pageserver_api::models::{ - StatusResponse, TenantConfigRequest, TenantInfo, TimelineCreateRequest, TimelineGcRequest, - TimelineInfo, -}; -use utils::{ - auth::SwappableJwtAuth, - generation::Generation, - id::{TenantId, TimelineId}, - lsn::Lsn, +use crate::tenant::{ + GetTimelineError, LogicalSizeCalculationCause, OffloadedTimeline, PageReconstructError, + remote_timeline_client, }; +use crate::{DEFAULT_PG_VERSION, disk_usage_eviction_task, tenant}; // For APIs that require an Active tenant, how long should we block waiting for that state? // This is not functionally necessary (clients will retry), but avoids generating a lot of @@ -969,11 +927,10 @@ async fn get_lsn_by_timestamp_handler( let with_lease = parse_query_param(&request, "with_lease")?.unwrap_or(false); - let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) .await?; + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); let result = timeline .find_lsn_for_timestamp(timestamp_pg, &cancel, &ctx) .await?; @@ -1042,10 +999,10 @@ async fn get_timestamp_of_lsn_handler( .with_context(|| format!("Invalid LSN: {lsn_str:?}")) .map_err(ApiError::BadRequest)?; - let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) .await?; + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); let result = timeline.get_timestamp_for_lsn(lsn, &ctx).await?; match result { @@ -1128,12 +1085,12 @@ async fn tenant_list_handler( ApiError::ResourceUnavailable("Tenant map is initializing or shutting down".into()) })? .iter() - .map(|(id, state, gen)| TenantInfo { + .map(|(id, state, gen_)| TenantInfo { id: *id, state: state.clone(), current_physical_size: None, attachment_status: state.attachment_status(), - generation: (*gen) + generation: (*gen_) .into() .expect("Tenants are always attached with a generation"), gc_blocking: None, @@ -1410,7 +1367,7 @@ async fn timeline_layer_scan_disposable_keys( }; let resident_layer = layer - .download_and_keep_resident() + .download_and_keep_resident(&ctx) .await .map_err(|err| match err { tenant::storage_layer::layer::DownloadError::TimelineShutdown @@ -1478,6 +1435,7 @@ async fn timeline_download_heatmap_layers_handler( let desired_concurrency = parse_query_param(&request, "concurrency")?.unwrap_or(DEFAULT_CONCURRENCY); + let recurse = parse_query_param(&request, "recurse")?.unwrap_or(false); check_permission(&request, Some(tenant_shard_id.tenant_id))?; @@ -1485,6 +1443,7 @@ async fn timeline_download_heatmap_layers_handler( let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) .await?; + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); let max_concurrency = get_config(&request) .remote_storage_config @@ -1493,7 +1452,7 @@ async fn timeline_download_heatmap_layers_handler( .unwrap_or(DEFAULT_MAX_CONCURRENCY); let concurrency = std::cmp::min(max_concurrency, desired_concurrency); - timeline.start_heatmap_layers_download(concurrency).await?; + timeline.start_heatmap_layers_download(concurrency, recurse, &ctx)?; json_response(StatusCode::ACCEPTED, ()) } @@ -1532,8 +1491,9 @@ async fn layer_download_handler( let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) .await?; + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); let downloaded = timeline - .download_layer(&layer_name) + .download_layer(&layer_name, &ctx) .await .map_err(|e| match e { tenant::storage_layer::layer::DownloadError::TimelineShutdown @@ -1670,9 +1630,8 @@ async fn block_or_unblock_gc( request: Request, block: bool, ) -> Result, ApiError> { - use crate::tenant::{ - remote_timeline_client::WaitCompletionError, upload_queue::NotInitialized, - }; + use crate::tenant::remote_timeline_client::WaitCompletionError; + use crate::tenant::upload_queue::NotInitialized; let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; @@ -2058,7 +2017,9 @@ async fn tenant_time_travel_remote_storage_handler( ))); } - tracing::info!("Issuing time travel request internally. timestamp={timestamp_raw}, done_if_after={done_if_after_raw}"); + tracing::info!( + "Issuing time travel request internally. timestamp={timestamp_raw}, done_if_after={done_if_after_raw}" + ); remote_timeline_client::upload::time_travel_recover_tenant( &state.remote_storage, @@ -2396,7 +2357,8 @@ async fn timeline_checkpoint_handler( CompactionError::ShuttingDown => ApiError::ShuttingDown, CompactionError::Offload(e) => ApiError::InternalServerError(anyhow::anyhow!(e)), CompactionError::CollectKeySpaceError(e) => ApiError::InternalServerError(anyhow::anyhow!(e)), - CompactionError::Other(e) => ApiError::InternalServerError(e) + CompactionError::Other(e) => ApiError::InternalServerError(e), + CompactionError::AlreadyRunning(_) => ApiError::InternalServerError(anyhow::anyhow!(e)), } )?; } @@ -2429,7 +2391,8 @@ async fn timeline_download_remote_layers_handler_post( let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) .await?; - match timeline.spawn_download_all_remote_layers(body).await { + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); + match timeline.spawn_download_all_remote_layers(body, &ctx).await { Ok(st) => json_response(StatusCode::ACCEPTED, st), Err(st) => json_response(StatusCode::CONFLICT, st), } @@ -2458,9 +2421,10 @@ async fn timeline_detach_ancestor_handler( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { - use crate::tenant::timeline::detach_ancestor; use pageserver_api::models::detach_ancestor::AncestorDetached; + use crate::tenant::timeline::detach_ancestor; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; @@ -2805,14 +2769,19 @@ async fn tenant_scan_remote_handler( .await { Ok((index_part, index_generation, _index_mtime)) => { - tracing::info!("Found timeline {tenant_shard_id}/{timeline_id} metadata (gen {index_generation:?}, {} layers, {} consistent LSN)", - index_part.layer_metadata.len(), index_part.metadata.disk_consistent_lsn()); + tracing::info!( + "Found timeline {tenant_shard_id}/{timeline_id} metadata (gen {index_generation:?}, {} layers, {} consistent LSN)", + index_part.layer_metadata.len(), + index_part.metadata.disk_consistent_lsn() + ); generation = std::cmp::max(generation, index_generation); } Err(DownloadError::NotFound) => { // This is normal for tenants that were created with multiple shards: they have an unsharded path // containing the timeline's initdb tarball but no index. Otherwise it is a bit strange. - tracing::info!("Timeline path {tenant_shard_id}/{timeline_id} exists in remote storage but has no index, skipping"); + tracing::info!( + "Timeline path {tenant_shard_id}/{timeline_id} exists in remote storage but has no index, skipping" + ); continue; } Err(e) => { @@ -3431,7 +3400,9 @@ async fn read_tar_eof(mut reader: (impl tokio::io::AsyncRead + Unpin)) -> anyhow anyhow::bail!("unexpected non-zero bytes after the tar archive"); } if trailing_bytes % 512 != 0 { - anyhow::bail!("unexpected number of zeros ({trailing_bytes}), not divisible by tar block size (512 bytes), after the tar archive"); + anyhow::bail!( + "unexpected number of zeros ({trailing_bytes}), not divisible by tar block size (512 bytes), after the tar archive" + ); } Ok(()) } diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index a73fa5cec8..6dd005de50 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -4,14 +4,22 @@ //! use std::path::{Path, PathBuf}; -use anyhow::{bail, ensure, Context, Result}; +use anyhow::{Context, Result, bail, ensure}; use bytes::Bytes; use camino::Utf8Path; use futures::StreamExt; use pageserver_api::key::rel_block_to_key; +use pageserver_api::reltag::{RelTag, SlruKind}; +use postgres_ffi::relfile_utils::*; +use postgres_ffi::waldecoder::WalStreamDecoder; +use postgres_ffi::{ + BLCKSZ, ControlFileData, DBState_DB_SHUTDOWNED, Oid, WAL_SEGMENT_SIZE, XLogFileName, + pg_constants, +}; use tokio::io::{AsyncRead, AsyncReadExt}; use tokio_tar::Archive; use tracing::*; +use utils::lsn::Lsn; use wal_decoder::models::InterpretedWalRecord; use walkdir::WalkDir; @@ -20,16 +28,6 @@ use crate::metrics::WAL_INGEST; use crate::pgdatadir_mapping::*; use crate::tenant::Timeline; use crate::walingest::WalIngest; -use pageserver_api::reltag::{RelTag, SlruKind}; -use postgres_ffi::pg_constants; -use postgres_ffi::relfile_utils::*; -use postgres_ffi::waldecoder::WalStreamDecoder; -use postgres_ffi::ControlFileData; -use postgres_ffi::DBState_DB_SHUTDOWNED; -use postgres_ffi::Oid; -use postgres_ffi::XLogFileName; -use postgres_ffi::{BLCKSZ, WAL_SEGMENT_SIZE}; -use utils::lsn::Lsn; // Returns checkpoint LSN from controlfile pub fn get_lsn_from_controlfile(path: &Utf8Path) -> Result { diff --git a/pageserver/src/l0_flush.rs b/pageserver/src/l0_flush.rs index 491c9fb96c..6cfecef0cf 100644 --- a/pageserver/src/l0_flush.rs +++ b/pageserver/src/l0_flush.rs @@ -1,4 +1,5 @@ -use std::{num::NonZeroUsize, sync::Arc}; +use std::num::NonZeroUsize; +use std::sync::Arc; #[derive(Debug, PartialEq, Eq, Clone)] pub enum L0FlushConfig { diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index f43cd08cf7..02767055fb 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -15,7 +15,8 @@ pub mod l0_flush; extern crate hyper0 as hyper; -use futures::{stream::FuturesUnordered, StreamExt}; +use futures::StreamExt; +use futures::stream::FuturesUnordered; pub use pageserver_api::keyspace; use tokio_util::sync::CancellationToken; mod assert_u64_eq_usize; @@ -35,10 +36,8 @@ pub mod walredo; use camino::Utf8Path; use deletion_queue::DeletionQueue; -use tenant::{ - mgr::{BackgroundPurges, TenantManager}, - secondary, -}; +use tenant::mgr::{BackgroundPurges, TenantManager}; +use tenant::secondary; use tracing::{info, info_span}; /// Current storage format version @@ -350,9 +349,10 @@ async fn timed_after_cancellation( #[cfg(test)] mod timed_tests { - use super::timed; use std::time::Duration; + use super::timed; + #[tokio::test] async fn timed_completes_when_inner_future_completes() { // A future that completes on time should have its result returned diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index e1c26b0684..eb8a9b8e24 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -10,11 +10,11 @@ use std::time::{Duration, Instant}; use enum_map::{Enum as _, EnumMap}; use futures::Future; use metrics::{ + Counter, CounterVec, Gauge, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair, + IntCounterPairVec, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec, register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec, register_int_counter, register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge, register_uint_gauge_vec, - Counter, CounterVec, Gauge, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair, - IntCounterPairVec, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec, }; use once_cell::sync::Lazy; use pageserver_api::config::{ @@ -24,9 +24,8 @@ use pageserver_api::config::{ use pageserver_api::models::InMemoryLayerInfo; use pageserver_api::shard::TenantShardId; use pin_project_lite::pin_project; -use postgres_backend::{is_expected_io_error, QueryError}; +use postgres_backend::{QueryError, is_expected_io_error}; use pq_proto::framed::ConnectionError; - use strum::{EnumCount, IntoEnumIterator as _, VariantNames}; use strum_macros::{IntoStaticStr, VariantNames}; use utils::id::TimelineId; @@ -35,12 +34,12 @@ use crate::config::PageServerConf; use crate::context::{PageContentKind, RequestContext}; use crate::pgdatadir_mapping::DatadirModificationStats; use crate::task_mgr::TaskKind; +use crate::tenant::Timeline; use crate::tenant::layer_map::LayerMap; use crate::tenant::mgr::TenantSlot; use crate::tenant::storage_layer::{InMemoryLayer, PersistentLayerDesc}; use crate::tenant::tasks::BackgroundLoopKind; use crate::tenant::throttle::ThrottleResult; -use crate::tenant::Timeline; /// Prometheus histogram buckets (in seconds) for operations in the critical /// path. In other words, operations that directly affect that latency of user @@ -363,7 +362,7 @@ pub(crate) static PAGE_CACHE_SIZE: Lazy = pub(crate) mod page_cache_eviction_metrics { use std::num::NonZeroUsize; - use metrics::{register_int_counter_vec, IntCounter, IntCounterVec}; + use metrics::{IntCounter, IntCounterVec, register_int_counter_vec}; use once_cell::sync::Lazy; #[derive(Clone, Copy)] @@ -722,7 +721,7 @@ pub(crate) static RELSIZE_CACHE_MISSES_OLD: Lazy = Lazy::new(|| { }); pub(crate) mod initial_logical_size { - use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec}; + use metrics::{IntCounter, IntCounterVec, register_int_counter, register_int_counter_vec}; use once_cell::sync::Lazy; pub(crate) struct StartCalculation(IntCounterVec); @@ -1105,12 +1104,17 @@ impl EvictionsWithLowResidenceDuration { // - future "drop panick => abort" // // so just nag: (the error has the labels) - tracing::warn!("failed to remove EvictionsWithLowResidenceDuration, it was already removed? {e:#?}"); + tracing::warn!( + "failed to remove EvictionsWithLowResidenceDuration, it was already removed? {e:#?}" + ); } Ok(()) => { // to help identify cases where we double-remove the same values, let's log all // deletions? - tracing::info!("removed EvictionsWithLowResidenceDuration with {tenant_id}, {timeline_id}, {}, {threshold}", self.data_source); + tracing::info!( + "removed EvictionsWithLowResidenceDuration with {tenant_id}, {timeline_id}, {}, {threshold}", + self.data_source + ); } } } @@ -3574,12 +3578,10 @@ impl>, O, E> Future for MeasuredRemoteOp { } pub mod tokio_epoll_uring { - use std::{ - collections::HashMap, - sync::{Arc, Mutex}, - }; + use std::collections::HashMap; + use std::sync::{Arc, Mutex}; - use metrics::{register_histogram, register_int_counter, Histogram, LocalHistogram, UIntGauge}; + use metrics::{Histogram, LocalHistogram, UIntGauge, register_histogram, register_int_counter}; use once_cell::sync::Lazy; /// Shared storage for tokio-epoll-uring thread local metrics. @@ -3588,7 +3590,9 @@ pub mod tokio_epoll_uring { let slots_submission_queue_depth = register_histogram!( "pageserver_tokio_epoll_uring_slots_submission_queue_depth", "The slots waiters queue depth of each tokio_epoll_uring system", - vec![1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0], + vec![ + 1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0 + ], ) .expect("failed to define a metric"); ThreadLocalMetricsStorage { @@ -3765,7 +3769,7 @@ pub mod tokio_epoll_uring { } pub(crate) mod tenant_throttling { - use metrics::{register_int_counter_vec, IntCounter}; + use metrics::{IntCounter, register_int_counter_vec}; use once_cell::sync::Lazy; use utils::shard::TenantShardId; diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs index 45bf02362a..984dd125a9 100644 --- a/pageserver/src/page_cache.rs +++ b/pageserver/src/page_cache.rs @@ -67,23 +67,18 @@ //! mapping is automatically removed and the slot is marked free. //! -use std::{ - collections::{hash_map::Entry, HashMap}, - sync::{ - atomic::{AtomicU64, AtomicU8, AtomicUsize, Ordering}, - Arc, Weak, - }, - time::Duration, -}; +use std::collections::HashMap; +use std::collections::hash_map::Entry; +use std::sync::atomic::{AtomicU8, AtomicU64, AtomicUsize, Ordering}; +use std::sync::{Arc, Weak}; +use std::time::Duration; use anyhow::Context; use once_cell::sync::OnceCell; -use crate::{ - context::RequestContext, - metrics::{page_cache_eviction_metrics, PageCacheSizeMetrics}, - virtual_file::{IoBufferMut, IoPageSlice}, -}; +use crate::context::RequestContext; +use crate::metrics::{PageCacheSizeMetrics, page_cache_eviction_metrics}; +use crate::virtual_file::{IoBufferMut, IoPageSlice}; static PAGE_CACHE: OnceCell = OnceCell::new(); const TEST_PAGE_CACHE_SIZE: usize = 50; @@ -168,11 +163,7 @@ impl Slot { let count_res = self.usage_count .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |val| { - if val == 0 { - None - } else { - Some(val - 1) - } + if val == 0 { None } else { Some(val - 1) } }); match count_res { diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 7285697040..8972515163 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1,7 +1,15 @@ //! The Page Service listens for client connections and serves their GetPage@LSN //! requests. -use anyhow::{bail, Context}; +use std::borrow::Cow; +use std::num::NonZeroUsize; +use std::os::fd::AsRawFd; +use std::str::FromStr; +use std::sync::Arc; +use std::time::{Duration, Instant, SystemTime}; +use std::{io, str}; + +use anyhow::{Context, bail}; use async_compression::tokio::write::GzipEncoder; use bytes::Buf; use futures::FutureExt; @@ -11,71 +19,57 @@ use pageserver_api::config::{ PageServicePipeliningConfig, PageServicePipeliningConfigPipelined, PageServiceProtocolPipelinedExecutionStrategy, }; -use pageserver_api::models::{self, TenantState}; +use pageserver_api::key::rel_block_to_key; use pageserver_api::models::{ - PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse, + self, PageTraceEvent, PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse, PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse, PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetSlruSegmentRequest, PagestreamGetSlruSegmentResponse, PagestreamNblocksRequest, PagestreamNblocksResponse, - PagestreamProtocolVersion, PagestreamRequest, + PagestreamProtocolVersion, PagestreamRequest, TenantState, }; +use pageserver_api::reltag::SlruKind; use pageserver_api::shard::TenantShardId; use postgres_backend::{ - is_expected_io_error, AuthType, PostgresBackend, PostgresBackendReader, QueryError, + AuthType, PostgresBackend, PostgresBackendReader, QueryError, is_expected_io_error, }; +use postgres_ffi::BLCKSZ; +use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID; use pq_proto::framed::ConnectionError; -use pq_proto::FeStartupPacket; -use pq_proto::{BeMessage, FeMessage, RowDescriptor}; -use std::borrow::Cow; -use std::io; -use std::num::NonZeroUsize; -use std::str; -use std::str::FromStr; -use std::sync::Arc; -use std::time::SystemTime; -use std::time::{Duration, Instant}; +use pq_proto::{BeMessage, FeMessage, FeStartupPacket, RowDescriptor}; use strum_macros::IntoStaticStr; -use tokio::io::{AsyncRead, AsyncWrite}; -use tokio::io::{AsyncWriteExt, BufWriter}; +use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt, BufWriter}; use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; use tracing::*; -use utils::logging::warn_slow; +use utils::auth::{Claims, Scope, SwappableJwtAuth}; +use utils::failpoint_support; +use utils::id::{TenantId, TimelineId}; +use utils::logging::log_slow; +use utils::lsn::Lsn; +use utils::simple_rcu::RcuReadGuard; use utils::sync::gate::{Gate, GateGuard}; use utils::sync::spsc_fold; -use utils::{ - auth::{Claims, Scope, SwappableJwtAuth}, - id::{TenantId, TimelineId}, - lsn::Lsn, - simple_rcu::RcuReadGuard, -}; use crate::auth::check_permission; use crate::basebackup::BasebackupError; use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; -use crate::metrics::{self, SmgrOpTimer}; -use crate::metrics::{ComputeCommandKind, COMPUTE_COMMANDS_COUNTERS, LIVE_CONNECTIONS}; +use crate::metrics::{ + self, COMPUTE_COMMANDS_COUNTERS, ComputeCommandKind, LIVE_CONNECTIONS, SmgrOpTimer, +}; use crate::pgdatadir_mapping::Version; -use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; -use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id; -use crate::task_mgr::TaskKind; -use crate::task_mgr::{self, COMPUTE_REQUEST_RUNTIME}; -use crate::tenant::mgr::ShardSelector; -use crate::tenant::mgr::TenantManager; -use crate::tenant::mgr::{GetActiveTenantError, GetTenantError, ShardResolveResult}; +use crate::span::{ + debug_assert_current_span_has_tenant_and_timeline_id, + debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id, +}; +use crate::task_mgr::{self, COMPUTE_REQUEST_RUNTIME, TaskKind}; +use crate::tenant::mgr::{ + GetActiveTenantError, GetTenantError, ShardResolveResult, ShardSelector, TenantManager, +}; use crate::tenant::storage_layer::IoConcurrency; use crate::tenant::timeline::{self, WaitLsnError}; -use crate::tenant::GetTimelineError; -use crate::tenant::PageReconstructError; -use crate::tenant::Timeline; +use crate::tenant::{GetTimelineError, PageReconstructError, Timeline}; use crate::{basebackup, timed_after_cancellation}; -use pageserver_api::key::rel_block_to_key; -use pageserver_api::models::PageTraceEvent; -use pageserver_api::reltag::SlruKind; -use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID; -use postgres_ffi::BLCKSZ; -use std::os::fd::AsRawFd; /// How long we may wait for a [`crate::tenant::mgr::TenantSlot::InProgress`]` and/or a [`crate::tenant::Tenant`] which /// is not yet in state [`TenantState::Active`]. @@ -83,8 +77,8 @@ use std::os::fd::AsRawFd; /// NB: this is a different value than [`crate::http::routes::ACTIVE_TENANT_TIMEOUT`]. const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000); -/// Threshold at which to log a warning about slow GetPage requests. -const WARN_SLOW_GETPAGE_THRESHOLD: Duration = Duration::from_secs(30); +/// Threshold at which to log slow GetPage requests. +const LOG_SLOW_GETPAGE_THRESHOLD: Duration = Duration::from_secs(30); /////////////////////////////////////////////////////////////////////////////// @@ -985,7 +979,7 @@ impl PageServerHandler { Ok(BatchedFeMessage::GetPage { span: _, shard: accum_shard, - pages: ref mut accum_pages, + pages: accum_pages, effective_request_lsn: accum_lsn, }), BatchedFeMessage::GetPage { @@ -1086,133 +1080,13 @@ impl PageServerHandler { batch }; - // invoke handler function - let (mut handler_results, span): ( - Vec>, - _, - ) = match batch { - BatchedFeMessage::Exists { - span, - timer, - shard, - req, - } => { - fail::fail_point!("ps::handle-pagerequest-message::exists"); - ( - vec![self - .handle_get_rel_exists_request(&*shard.upgrade()?, &req, ctx) - .instrument(span.clone()) - .await - .map(|msg| (msg, timer)) - .map_err(|err| BatchedPageStreamError { err, req: req.hdr })], - span, - ) - } - BatchedFeMessage::Nblocks { - span, - timer, - shard, - req, - } => { - fail::fail_point!("ps::handle-pagerequest-message::nblocks"); - ( - vec![self - .handle_get_nblocks_request(&*shard.upgrade()?, &req, ctx) - .instrument(span.clone()) - .await - .map(|msg| (msg, timer)) - .map_err(|err| BatchedPageStreamError { err, req: req.hdr })], - span, - ) - } - BatchedFeMessage::GetPage { - span, - shard, - effective_request_lsn, - pages, - } => { - fail::fail_point!("ps::handle-pagerequest-message::getpage"); - ( - { - let npages = pages.len(); - trace!(npages, "handling getpage request"); - let res = self - .handle_get_page_at_lsn_request_batched( - &*shard.upgrade()?, - effective_request_lsn, - pages, - io_concurrency, - ctx, - ) - .instrument(span.clone()) - .await; - assert_eq!(res.len(), npages); - res - }, - span, - ) - } - BatchedFeMessage::DbSize { - span, - timer, - shard, - req, - } => { - fail::fail_point!("ps::handle-pagerequest-message::dbsize"); - ( - vec![self - .handle_db_size_request(&*shard.upgrade()?, &req, ctx) - .instrument(span.clone()) - .await - .map(|msg| (msg, timer)) - .map_err(|err| BatchedPageStreamError { err, req: req.hdr })], - span, - ) - } - BatchedFeMessage::GetSlruSegment { - span, - timer, - shard, - req, - } => { - fail::fail_point!("ps::handle-pagerequest-message::slrusegment"); - ( - vec![self - .handle_get_slru_segment_request(&*shard.upgrade()?, &req, ctx) - .instrument(span.clone()) - .await - .map(|msg| (msg, timer)) - .map_err(|err| BatchedPageStreamError { err, req: req.hdr })], - span, - ) - } - #[cfg(feature = "testing")] - BatchedFeMessage::Test { - span, - shard, - requests, - } => { - fail::fail_point!("ps::handle-pagerequest-message::test"); - ( - { - let npages = requests.len(); - trace!(npages, "handling getpage request"); - let res = self - .handle_test_request_batch(&*shard.upgrade()?, requests, ctx) - .instrument(span.clone()) - .await; - assert_eq!(res.len(), npages); - res - }, - span, - ) - } - BatchedFeMessage::RespondError { span, error } => { - // We've already decided to respond with an error, so we don't need to - // call the handler. - (vec![Err(error)], span) - } - }; + // Dispatch the batch to the appropriate request handler. + let (mut handler_results, span) = log_slow( + batch.as_static_str(), + LOG_SLOW_GETPAGE_THRESHOLD, + self.pagestream_dispatch_batched_message(batch, io_concurrency, ctx), + ) + .await?; // We purposefully don't count flush time into the smgr operation timer. // @@ -1298,6 +1172,8 @@ impl PageServerHandler { &response_msg.serialize(protocol_version), ))?; + failpoint_support::sleep_millis_async!("before-pagestream-msg-flush", cancel); + // what we want to do let socket_fd = pgb_writer.socket_fd; let flush_fut = pgb_writer.flush(); @@ -1330,6 +1206,149 @@ impl PageServerHandler { Ok(()) } + /// Helper which dispatches a batched message to the appropriate handler. + /// Returns a vec of results, along with the extracted trace span. + async fn pagestream_dispatch_batched_message( + &mut self, + batch: BatchedFeMessage, + io_concurrency: IoConcurrency, + ctx: &RequestContext, + ) -> Result< + ( + Vec>, + Span, + ), + QueryError, + > { + Ok(match batch { + BatchedFeMessage::Exists { + span, + timer, + shard, + req, + } => { + fail::fail_point!("ps::handle-pagerequest-message::exists"); + ( + vec![ + self.handle_get_rel_exists_request(&*shard.upgrade()?, &req, ctx) + .instrument(span.clone()) + .await + .map(|msg| (msg, timer)) + .map_err(|err| BatchedPageStreamError { err, req: req.hdr }), + ], + span, + ) + } + BatchedFeMessage::Nblocks { + span, + timer, + shard, + req, + } => { + fail::fail_point!("ps::handle-pagerequest-message::nblocks"); + ( + vec![ + self.handle_get_nblocks_request(&*shard.upgrade()?, &req, ctx) + .instrument(span.clone()) + .await + .map(|msg| (msg, timer)) + .map_err(|err| BatchedPageStreamError { err, req: req.hdr }), + ], + span, + ) + } + BatchedFeMessage::GetPage { + span, + shard, + effective_request_lsn, + pages, + } => { + fail::fail_point!("ps::handle-pagerequest-message::getpage"); + ( + { + let npages = pages.len(); + trace!(npages, "handling getpage request"); + let res = self + .handle_get_page_at_lsn_request_batched( + &*shard.upgrade()?, + effective_request_lsn, + pages, + io_concurrency, + ctx, + ) + .instrument(span.clone()) + .await; + assert_eq!(res.len(), npages); + res + }, + span, + ) + } + BatchedFeMessage::DbSize { + span, + timer, + shard, + req, + } => { + fail::fail_point!("ps::handle-pagerequest-message::dbsize"); + ( + vec![ + self.handle_db_size_request(&*shard.upgrade()?, &req, ctx) + .instrument(span.clone()) + .await + .map(|msg| (msg, timer)) + .map_err(|err| BatchedPageStreamError { err, req: req.hdr }), + ], + span, + ) + } + BatchedFeMessage::GetSlruSegment { + span, + timer, + shard, + req, + } => { + fail::fail_point!("ps::handle-pagerequest-message::slrusegment"); + ( + vec![ + self.handle_get_slru_segment_request(&*shard.upgrade()?, &req, ctx) + .instrument(span.clone()) + .await + .map(|msg| (msg, timer)) + .map_err(|err| BatchedPageStreamError { err, req: req.hdr }), + ], + span, + ) + } + #[cfg(feature = "testing")] + BatchedFeMessage::Test { + span, + shard, + requests, + } => { + fail::fail_point!("ps::handle-pagerequest-message::test"); + ( + { + let npages = requests.len(); + trace!(npages, "handling getpage request"); + let res = self + .handle_test_request_batch(&*shard.upgrade()?, requests, ctx) + .instrument(span.clone()) + .await; + assert_eq!(res.len(), npages); + res + }, + span, + ) + } + BatchedFeMessage::RespondError { span, error } => { + // We've already decided to respond with an error, so we don't need to + // call the handler. + (vec![Err(error)], span) + } + }) + } + /// Pagestream sub-protocol handler. /// /// It is a simple request-response protocol inside a COPYBOTH session. @@ -1473,19 +1492,16 @@ impl PageServerHandler { } }; - let result = warn_slow( - msg.as_static_str(), - WARN_SLOW_GETPAGE_THRESHOLD, - self.pagesteam_handle_batched_message( + let result = self + .pagesteam_handle_batched_message( pgb_writer, msg, io_concurrency.clone(), &cancel, protocol_version, ctx, - ), - ) - .await; + ) + .await; match result { Ok(()) => {} Err(e) => break e, @@ -1649,17 +1665,13 @@ impl PageServerHandler { return Err(e); } }; - warn_slow( - batch.as_static_str(), - WARN_SLOW_GETPAGE_THRESHOLD, - self.pagesteam_handle_batched_message( - pgb_writer, - batch, - io_concurrency.clone(), - &cancel, - protocol_version, - &ctx, - ), + self.pagesteam_handle_batched_message( + pgb_writer, + batch, + io_concurrency.clone(), + &cancel, + protocol_version, + &ctx, ) .await?; } @@ -2097,9 +2109,10 @@ impl PageServerHandler { set_tracing_field_shard_id(&timeline); if timeline.is_archived() == Some(true) { - // TODO after a grace period, turn this log line into a hard error - tracing::warn!("timeline {tenant_id}/{timeline_id} is archived, but got basebackup request for it."); - //return Err(QueryError::NotFound("timeline is archived".into())) + tracing::info!( + "timeline {tenant_id}/{timeline_id} is archived, but got basebackup request for it." + ); + return Err(QueryError::NotFound("timeline is archived".into())); } let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn(); diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index d0e2dab042..c10dfb4542 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -6,6 +6,36 @@ //! walingest.rs handles a few things like implicit relation creation and extension. //! Clarify that) //! +use std::collections::{BTreeMap, HashMap, HashSet, hash_map}; +use std::ops::{ControlFlow, Range}; + +use anyhow::{Context, ensure}; +use bytes::{Buf, Bytes, BytesMut}; +use enum_map::Enum; +use itertools::Itertools; +use pageserver_api::key::{ + AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, CompactKey, DBDIR_KEY, Key, RelDirExists, + TWOPHASEDIR_KEY, dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, + rel_size_to_key, rel_tag_sparse_key, rel_tag_sparse_key_range, relmap_file_key, + repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key, + slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range, +}; +use pageserver_api::keyspace::SparseKeySpace; +use pageserver_api::record::NeonWalRecord; +use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; +use pageserver_api::shard::ShardIdentity; +use pageserver_api::value::Value; +use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; +use postgres_ffi::{BLCKSZ, Oid, RepOriginId, TimestampTz, TransactionId}; +use serde::{Deserialize, Serialize}; +use strum::IntoEnumIterator; +use tokio_util::sync::CancellationToken; +use tracing::{debug, info, trace, warn}; +use utils::bin_ser::{BeSer, DeserializeError}; +use utils::lsn::Lsn; +use utils::pausable_failpoint; +use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta}; + use super::tenant::{PageReconstructError, Timeline}; use crate::aux_file; use crate::context::RequestContext; @@ -19,37 +49,6 @@ use crate::span::{ }; use crate::tenant::storage_layer::IoConcurrency; use crate::tenant::timeline::GetVectoredError; -use anyhow::{ensure, Context}; -use bytes::{Buf, Bytes, BytesMut}; -use enum_map::Enum; -use itertools::Itertools; -use pageserver_api::key::{ - dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key, - rel_tag_sparse_key_range, relmap_file_key, repl_origin_key, repl_origin_key_range, - slru_block_to_key, slru_dir_to_key, slru_segment_key_range, slru_segment_size_to_key, - twophase_file_key, twophase_key_range, CompactKey, RelDirExists, AUX_FILES_KEY, CHECKPOINT_KEY, - CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY, -}; -use pageserver_api::key::{rel_tag_sparse_key, Key}; -use pageserver_api::keyspace::SparseKeySpace; -use pageserver_api::record::NeonWalRecord; -use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; -use pageserver_api::shard::ShardIdentity; -use pageserver_api::value::Value; -use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; -use postgres_ffi::BLCKSZ; -use postgres_ffi::{Oid, RepOriginId, TimestampTz, TransactionId}; -use serde::{Deserialize, Serialize}; -use std::collections::{hash_map, BTreeMap, HashMap, HashSet}; -use std::ops::ControlFlow; -use std::ops::Range; -use strum::IntoEnumIterator; -use tokio_util::sync::CancellationToken; -use tracing::{debug, info, trace, warn}; -use utils::bin_ser::DeserializeError; -use utils::pausable_failpoint; -use utils::{bin_ser::BeSer, lsn::Lsn}; -use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta}; /// Max delta records appended to the AUX_FILES_KEY (for aux v1). The write path will write a full image once this threshold is reached. pub const MAX_AUX_FILE_DELTAS: usize = 1024; @@ -327,16 +326,16 @@ impl Timeline { let clone = match &res { Ok(buf) => Ok(buf.clone()), Err(err) => Err(match err { - PageReconstructError::Cancelled => { - PageReconstructError::Cancelled - } + PageReconstructError::Cancelled => PageReconstructError::Cancelled, - x @ PageReconstructError::Other(_) | - x @ PageReconstructError::AncestorLsnTimeout(_) | - x @ PageReconstructError::WalRedo(_) | - x @ PageReconstructError::MissingKey(_) => { - PageReconstructError::Other(anyhow::anyhow!("there was more than one request for this key in the batch, error logged once: {x:?}")) - }, + x @ PageReconstructError::Other(_) + | x @ PageReconstructError::AncestorLsnTimeout(_) + | x @ PageReconstructError::WalRedo(_) + | x @ PageReconstructError::MissingKey(_) => { + PageReconstructError::Other(anyhow::anyhow!( + "there was more than one request for this key in the batch, error logged once: {x:?}" + )) + } }), }; @@ -355,23 +354,23 @@ impl Timeline { // this whole `match` is a lot like `From for PageReconstructError` // but without taking ownership of the GetVectoredError let err = match &err { - GetVectoredError::Cancelled => { - Err(PageReconstructError::Cancelled) - } + GetVectoredError::Cancelled => Err(PageReconstructError::Cancelled), // TODO: restructure get_vectored API to make this error per-key GetVectoredError::MissingKey(err) => { - Err(PageReconstructError::Other(anyhow::anyhow!("whole vectored get request failed because one or more of the requested keys were missing: {err:?}"))) + Err(PageReconstructError::Other(anyhow::anyhow!( + "whole vectored get request failed because one or more of the requested keys were missing: {err:?}" + ))) } // TODO: restructure get_vectored API to make this error per-key GetVectoredError::GetReadyAncestorError(err) => { - Err(PageReconstructError::Other(anyhow::anyhow!("whole vectored get request failed because one or more key required ancestor that wasn't ready: {err:?}"))) + Err(PageReconstructError::Other(anyhow::anyhow!( + "whole vectored get request failed because one or more key required ancestor that wasn't ready: {err:?}" + ))) } // TODO: restructure get_vectored API to make this error per-key - GetVectoredError::Other(err) => { - Err(PageReconstructError::Other( - anyhow::anyhow!("whole vectored get request failed: {err:?}"), - )) - } + GetVectoredError::Other(err) => Err(PageReconstructError::Other( + anyhow::anyhow!("whole vectored get request failed: {err:?}"), + )), // TODO: we can prevent this error class by moving this check into the type system GetVectoredError::InvalidLsn(e) => { Err(anyhow::anyhow!("invalid LSN: {e:?}").into()) @@ -379,10 +378,7 @@ impl Timeline { // NB: this should never happen in practice because we limit MAX_GET_VECTORED_KEYS // TODO: we can prevent this error class by moving this check into the type system GetVectoredError::Oversized(err) => { - Err(anyhow::anyhow!( - "batching oversized: {err:?}" - ) - .into()) + Err(anyhow::anyhow!("batching oversized: {err:?}").into()) } }; @@ -715,7 +711,10 @@ impl Timeline { { Ok(res) => res, Err(PageReconstructError::MissingKey(e)) => { - warn!("Missing key while find_lsn_for_timestamp. Either we might have already garbage-collected that data or the key is really missing. Last error: {:#}", e); + warn!( + "Missing key while find_lsn_for_timestamp. Either we might have already garbage-collected that data or the key is really missing. Last error: {:#}", + e + ); // Return that we didn't find any requests smaller than the LSN, and logging the error. return Ok(LsnForTimestamp::Past(min_lsn)); } @@ -1053,6 +1052,8 @@ impl Timeline { ) -> Result { debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id(); + fail::fail_point!("skip-logical-size-calculation", |_| { Ok(0) }); + // Fetch list of database dirs and iterate them let buf = self.get(DBDIR_KEY, lsn, ctx).await?; let dbdir = DbDirectory::des(&buf)?; @@ -2464,10 +2465,12 @@ impl DatadirModification<'_> { // modifications before ingesting DB create operations, which are the only kind that reads // data pages during ingest. if cfg!(debug_assertions) { - assert!(!self - .pending_data_batch - .as_ref() - .is_some_and(|b| b.updates_key(&key))); + assert!( + !self + .pending_data_batch + .as_ref() + .is_some_and(|b| b.updates_key(&key)) + ); } } @@ -2666,15 +2669,14 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]); #[cfg(test)] mod tests { use hex_literal::hex; - use pageserver_api::{models::ShardParameters, shard::ShardStripeSize}; - use utils::{ - id::TimelineId, - shard::{ShardCount, ShardNumber}, - }; + use pageserver_api::models::ShardParameters; + use pageserver_api::shard::ShardStripeSize; + use utils::id::TimelineId; + use utils::shard::{ShardCount, ShardNumber}; use super::*; - - use crate::{tenant::harness::TenantHarness, DEFAULT_PG_VERSION}; + use crate::DEFAULT_PG_VERSION; + use crate::tenant::harness::TenantHarness; /// Test a round trip of aux file updates, from DatadirModification to reading back from the Timeline #[tokio::test] diff --git a/pageserver/src/statvfs.rs b/pageserver/src/statvfs.rs index 4e8be58d58..85c2ed8499 100644 --- a/pageserver/src/statvfs.rs +++ b/pageserver/src/statvfs.rs @@ -73,11 +73,10 @@ impl Statvfs { pub mod mock { use camino::Utf8Path; + pub use pageserver_api::config::statvfs::mock::Behavior; use regex::Regex; use tracing::log::info; - pub use pageserver_api::config::statvfs::mock::Behavior; - pub fn get(tenants_dir: &Utf8Path, behavior: &Behavior) -> nix::Result { info!("running mocked statvfs"); @@ -85,7 +84,7 @@ pub mod mock { Behavior::Success { blocksize, total_blocks, - ref name_filter, + name_filter, } => { let used_bytes = walk_dir_disk_usage(tenants_dir, name_filter.as_deref()).unwrap(); @@ -134,7 +133,7 @@ pub mod mock { } Err(e) => { return Err(anyhow::Error::new(e) - .context(format!("get metadata of {:?}", entry.path()))) + .context(format!("get metadata of {:?}", entry.path()))); } }; total += m.len(); diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index cc93a06ccd..0b71b2cf5b 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -40,15 +40,12 @@ use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::{Arc, Mutex}; use futures::FutureExt; +use once_cell::sync::Lazy; use pageserver_api::shard::TenantShardId; use tokio::task::JoinHandle; use tokio::task_local; use tokio_util::sync::CancellationToken; - use tracing::{debug, error, info, warn}; - -use once_cell::sync::Lazy; - use utils::env; use utils::id::TimelineId; diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 56718f5294..11d656eb25 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -12,149 +12,99 @@ //! parent timeline, and the last LSN that has been written to disk. //! -use anyhow::{bail, Context}; +use std::collections::hash_map::Entry; +use std::collections::{BTreeMap, HashMap, HashSet}; +use std::fmt::{Debug, Display}; +use std::fs::File; +use std::future::Future; +use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; +use std::sync::{Arc, Mutex, Weak}; +use std::time::{Duration, Instant, SystemTime}; +use std::{fmt, fs}; + +use anyhow::{Context, bail}; use arc_swap::ArcSwap; -use camino::Utf8Path; -use camino::Utf8PathBuf; +use camino::{Utf8Path, Utf8PathBuf}; use chrono::NaiveDateTime; use enumset::EnumSet; -use futures::stream::FuturesUnordered; use futures::StreamExt; +use futures::stream::FuturesUnordered; use itertools::Itertools as _; +use once_cell::sync::Lazy; use pageserver_api::models; -use pageserver_api::models::CompactInfoResponse; -use pageserver_api::models::LsnLease; -use pageserver_api::models::TimelineArchivalState; -use pageserver_api::models::TimelineState; -use pageserver_api::models::TopTenantShardItem; -use pageserver_api::models::WalRedoManagerStatus; -use pageserver_api::shard::ShardIdentity; -use pageserver_api::shard::ShardStripeSize; -use pageserver_api::shard::TenantShardId; -use remote_storage::DownloadError; -use remote_storage::GenericRemoteStorage; -use remote_storage::TimeoutOrCancel; -use remote_timeline_client::manifest::{ - OffloadedTimelineManifest, TenantManifest, LATEST_TENANT_MANIFEST_VERSION, +pub use pageserver_api::models::TenantState; +use pageserver_api::models::{ + CompactInfoResponse, LsnLease, TimelineArchivalState, TimelineState, TopTenantShardItem, + WalRedoManagerStatus, }; -use remote_timeline_client::UploadQueueNotReadyError; -use remote_timeline_client::FAILED_REMOTE_OP_RETRIES; -use remote_timeline_client::FAILED_UPLOAD_WARN_THRESHOLD; -use secondary::heatmap::HeatMapTenant; -use secondary::heatmap::HeatMapTimeline; -use std::collections::BTreeMap; -use std::fmt; -use std::future::Future; -use std::sync::atomic::AtomicBool; -use std::sync::Weak; -use std::time::SystemTime; +use pageserver_api::shard::{ShardIdentity, ShardStripeSize, TenantShardId}; +use remote_storage::{DownloadError, GenericRemoteStorage, TimeoutOrCancel}; +use remote_timeline_client::index::GcCompactionState; +use remote_timeline_client::manifest::{ + LATEST_TENANT_MANIFEST_VERSION, OffloadedTimelineManifest, TenantManifest, +}; +use remote_timeline_client::{ + FAILED_REMOTE_OP_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD, UploadQueueNotReadyError, +}; +use secondary::heatmap::{HeatMapTenant, HeatMapTimeline}; use storage_broker::BrokerClientChannel; -use timeline::compaction::CompactionOutcome; -use timeline::compaction::GcCompactionQueue; -use timeline::import_pgdata; -use timeline::offload::offload_timeline; -use timeline::offload::OffloadError; -use timeline::CompactFlags; -use timeline::CompactOptions; -use timeline::CompactionError; -use timeline::PreviousHeatmap; -use timeline::ShutdownMode; +use timeline::compaction::{CompactionOutcome, GcCompactionQueue}; +use timeline::offload::{OffloadError, offload_timeline}; +use timeline::{ + CompactFlags, CompactOptions, CompactionError, PreviousHeatmap, ShutdownMode, import_pgdata, +}; use tokio::io::BufReader; -use tokio::sync::watch; -use tokio::sync::Notify; +use tokio::sync::{Notify, Semaphore, watch}; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use tracing::*; use upload_queue::NotInitialized; -use utils::backoff; use utils::circuit_breaker::CircuitBreaker; -use utils::completion; use utils::crashsafe::path_with_suffix_extension; -use utils::failpoint_support; -use utils::fs_ext; -use utils::pausable_failpoint; -use utils::sync::gate::Gate; -use utils::sync::gate::GateGuard; -use utils::timeout::timeout_cancellable; -use utils::timeout::TimeoutCancellableError; +use utils::sync::gate::{Gate, GateGuard}; +use utils::timeout::{TimeoutCancellableError, timeout_cancellable}; use utils::try_rcu::ArcSwapExt; -use utils::zstd::create_zst_tarball; -use utils::zstd::extract_zst_tarball; +use utils::zstd::{create_zst_tarball, extract_zst_tarball}; +use utils::{backoff, completion, failpoint_support, fs_ext, pausable_failpoint}; -use self::config::AttachedLocationConfig; -use self::config::AttachmentMode; -use self::config::LocationConf; -use self::config::TenantConf; +use self::config::{AttachedLocationConfig, AttachmentMode, LocationConf, TenantConf}; use self::metadata::TimelineMetadata; -use self::mgr::GetActiveTenantError; -use self::mgr::GetTenantError; +use self::mgr::{GetActiveTenantError, GetTenantError}; use self::remote_timeline_client::upload::{upload_index_part, upload_tenant_manifest}; use self::remote_timeline_client::{RemoteTimelineClient, WaitCompletionError}; -use self::timeline::uninit::TimelineCreateGuard; -use self::timeline::uninit::TimelineExclusionError; -use self::timeline::uninit::UninitializedTimeline; -use self::timeline::EvictionTaskTenantState; -use self::timeline::GcCutoffs; -use self::timeline::TimelineDeleteProgress; -use self::timeline::TimelineResources; -use self::timeline::WaitLsnError; +use self::timeline::uninit::{TimelineCreateGuard, TimelineExclusionError, UninitializedTimeline}; +use self::timeline::{ + EvictionTaskTenantState, GcCutoffs, TimelineDeleteProgress, TimelineResources, WaitLsnError, +}; use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; -use crate::deletion_queue::DeletionQueueClient; -use crate::deletion_queue::DeletionQueueError; -use crate::import_datadir; +use crate::deletion_queue::{DeletionQueueClient, DeletionQueueError}; use crate::l0_flush::L0FlushGlobalState; -use crate::metrics::CONCURRENT_INITDBS; -use crate::metrics::INITDB_RUN_TIME; -use crate::metrics::INITDB_SEMAPHORE_ACQUISITION_TIME; -use crate::metrics::TENANT; use crate::metrics::{ - remove_tenant_metrics, BROKEN_TENANTS_SET, CIRCUIT_BREAKERS_BROKEN, CIRCUIT_BREAKERS_UNBROKEN, - TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC, + BROKEN_TENANTS_SET, CIRCUIT_BREAKERS_BROKEN, CIRCUIT_BREAKERS_UNBROKEN, CONCURRENT_INITDBS, + INITDB_RUN_TIME, INITDB_SEMAPHORE_ACQUISITION_TIME, TENANT, TENANT_STATE_METRIC, + TENANT_SYNTHETIC_SIZE_METRIC, remove_tenant_metrics, }; -use crate::task_mgr; use crate::task_mgr::TaskKind; -use crate::tenant::config::LocationMode; -use crate::tenant::config::TenantConfOpt; +use crate::tenant::config::{LocationMode, TenantConfOpt}; use crate::tenant::gc_result::GcResult; pub use crate::tenant::remote_timeline_client::index::IndexPart; -use crate::tenant::remote_timeline_client::remote_initdb_archive_path; -use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart; -use crate::tenant::remote_timeline_client::INITDB_PATH; -use crate::tenant::storage_layer::DeltaLayer; -use crate::tenant::storage_layer::ImageLayer; -use crate::walingest::WalLagCooldown; -use crate::walredo; -use crate::InitializationOrder; -use std::collections::hash_map::Entry; -use std::collections::HashMap; -use std::collections::HashSet; -use std::fmt::Debug; -use std::fmt::Display; -use std::fs; -use std::fs::File; -use std::sync::atomic::{AtomicU64, Ordering}; -use std::sync::Arc; -use std::sync::Mutex; -use std::time::{Duration, Instant}; - -use crate::span; +use crate::tenant::remote_timeline_client::{ + INITDB_PATH, MaybeDeletedIndexPart, remote_initdb_archive_path, +}; +use crate::tenant::storage_layer::{DeltaLayer, ImageLayer}; use crate::tenant::timeline::delete::DeleteTimelineFlow; use crate::tenant::timeline::uninit::cleanup_timeline_directory; use crate::virtual_file::VirtualFile; +use crate::walingest::WalLagCooldown; use crate::walredo::PostgresRedoManager; -use crate::TEMP_FILE_SUFFIX; -use once_cell::sync::Lazy; -pub use pageserver_api::models::TenantState; -use tokio::sync::Semaphore; +use crate::{InitializationOrder, TEMP_FILE_SUFFIX, import_datadir, span, task_mgr, walredo}; static INIT_DB_SEMAPHORE: Lazy = Lazy::new(|| Semaphore::new(8)); -use utils::{ - crashsafe, - generation::Generation, - id::TimelineId, - lsn::{Lsn, RecordLsn}, -}; +use utils::crashsafe; +use utils::generation::Generation; +use utils::id::TimelineId; +use utils::lsn::{Lsn, RecordLsn}; pub mod blob_io; pub mod block_io; @@ -183,9 +133,9 @@ mod gc_block; mod gc_result; pub(crate) mod throttle; -pub(crate) use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; pub(crate) use timeline::{LogicalSizeCalculationCause, PageReconstructError, Timeline}; +pub(crate) use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; // re-export for use in walreceiver pub use crate::tenant::timeline::WalReceiverInfo; @@ -250,7 +200,9 @@ impl AttachedTenantConf { Ok(Self::new(location_conf.tenant_conf, *attach_conf)) } LocationMode::Secondary(_) => { - anyhow::bail!("Attempted to construct AttachedTenantConf from a LocationConf in secondary mode") + anyhow::bail!( + "Attempted to construct AttachedTenantConf from a LocationConf in secondary mode" + ) } } } @@ -464,7 +416,9 @@ impl WalredoManagerId { static NEXT: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(1); let id = NEXT.fetch_add(1, std::sync::atomic::Ordering::Relaxed); if id == 0 { - panic!("WalredoManagerId::new() returned 0, indicating wraparound, risking it's no longer unique"); + panic!( + "WalredoManagerId::new() returned 0, indicating wraparound, risking it's no longer unique" + ); } Self(id) } @@ -1168,6 +1122,7 @@ impl Tenant { resources, CreateTimelineCause::Load, idempotency.clone(), + index_part.gc_compaction.clone(), )?; let disk_consistent_lsn = timeline.get_disk_consistent_lsn(); anyhow::ensure!( @@ -1198,12 +1153,15 @@ impl Tenant { let mut tline_ending_at = Some((&timeline, timeline.get_last_record_lsn())); while let Some((tline, end_lsn)) = tline_ending_at { let unarchival_heatmap = tline.generate_unarchival_heatmap(end_lsn).await; - if !tline.is_previous_heatmap_active() { + // Another unearchived timeline might have generated a heatmap for this ancestor. + // If the current branch point greater than the previous one use the the heatmap + // we just generated - it should include more layers. + if !tline.should_keep_previous_heatmap(end_lsn) { tline .previous_heatmap .store(Some(Arc::new(unarchival_heatmap))); } else { - tracing::info!("Previous heatmap still active. Dropping unarchival heatmap.") + tracing::info!("Previous heatmap preferred. Dropping unarchival heatmap.") } match tline.ancestor_timeline() { @@ -1227,7 +1185,9 @@ impl Tenant { match cause { LoadTimelineCause::Attach | LoadTimelineCause::Unoffload => (), LoadTimelineCause::ImportPgdata { .. } => { - unreachable!("ImportPgdata should not be reloading timeline import is done and persisted as such in s3") + unreachable!( + "ImportPgdata should not be reloading timeline import is done and persisted as such in s3" + ) } } let mut guard = self.timelines_creating.lock().unwrap(); @@ -1260,8 +1220,8 @@ impl Tenant { // We should never try and load the same timeline twice during startup Entry::Occupied(_) => { unreachable!( - "Timeline {tenant_id}/{timeline_id} already exists in the tenant map" - ); + "Timeline {tenant_id}/{timeline_id} already exists in the tenant map" + ); } Entry::Vacant(v) => { v.insert(Arc::clone(&timeline)); @@ -1655,7 +1615,9 @@ impl Tenant { failpoint_support::sleep_millis_async!("before-attaching-tenant"); let Some(preload) = preload else { - anyhow::bail!("local-only deployment is no longer supported, https://github.com/neondatabase/neon/issues/5624"); + anyhow::bail!( + "local-only deployment is no longer supported, https://github.com/neondatabase/neon/issues/5624" + ); }; let mut offloaded_timeline_ids = HashSet::new(); @@ -1980,6 +1942,7 @@ impl Tenant { hs.0.remove(&timeline_id).map(|h| PreviousHeatmap::Active { heatmap: h, read_at: hs.1, + end_lsn: None, }) }); part_downloads.spawn( @@ -2039,7 +2002,7 @@ impl Tenant { remote_storage: GenericRemoteStorage, previous_heatmap: Option, cancel: CancellationToken, - ) -> impl Future { + ) -> impl Future + use<> { let client = self.build_timeline_client(timeline_id, remote_storage); async move { debug_assert_current_span_has_tenant_and_timeline_id(); @@ -2734,7 +2697,9 @@ impl Tenant { timeline } CreateTimelineResult::ImportSpawned(timeline) => { - info!("import task spawned, timeline will become visible and activated once the import is done"); + info!( + "import task spawned, timeline will become visible and activated once the import is done" + ); timeline } }; @@ -2780,7 +2745,7 @@ impl Tenant { { StartCreatingTimelineResult::CreateGuard(guard) => guard, StartCreatingTimelineResult::Idempotent(timeline) => { - return Ok(CreateTimelineResult::Idempotent(timeline)) + return Ok(CreateTimelineResult::Idempotent(timeline)); } }; @@ -2914,7 +2879,9 @@ impl Tenant { let index_part = match index_part { MaybeDeletedIndexPart::Deleted(_) => { // likely concurrent delete call, cplane should prevent this - anyhow::bail!("index part says deleted but we are not done creating yet, this should not happen but") + anyhow::bail!( + "index part says deleted but we are not done creating yet, this should not happen but" + ) } MaybeDeletedIndexPart::IndexPart(p) => p, }; @@ -3125,20 +3092,19 @@ impl Tenant { // If we're done compacting, check the scheduled GC compaction queue for more work. if outcome == CompactionOutcome::Done { - let queue = self - .scheduled_compaction_tasks - .lock() - .unwrap() - .get(&timeline.timeline_id) - .cloned(); - if let Some(queue) = queue { - outcome = queue - .iteration(cancel, ctx, &self.gc_block, &timeline) - .instrument( - info_span!("gc_compact_timeline", timeline_id = %timeline.timeline_id), - ) - .await?; - } + let queue = { + let mut guard = self.scheduled_compaction_tasks.lock().unwrap(); + guard + .entry(timeline.timeline_id) + .or_insert_with(|| Arc::new(GcCompactionQueue::new())) + .clone() + }; + outcome = queue + .iteration(cancel, ctx, &self.gc_block, &timeline) + .instrument( + info_span!("gc_compact_timeline", timeline_id = %timeline.timeline_id), + ) + .await?; } // If we're done compacting, offload the timeline if requested. @@ -3179,11 +3145,13 @@ impl Tenant { /// Trips the compaction circuit breaker if appropriate. pub(crate) fn maybe_trip_compaction_breaker(&self, err: &CompactionError) { match err { + err if err.is_cancel() => {} CompactionError::ShuttingDown => (), // Offload failures don't trip the circuit breaker, since they're cheap to retry and // shouldn't block compaction. CompactionError::Offload(_) => {} CompactionError::CollectKeySpaceError(err) => { + // CollectKeySpaceError::Cancelled and PageRead::Cancelled are handled in `err.is_cancel` branch. self.compaction_circuit_breaker .lock() .unwrap() @@ -3195,6 +3163,7 @@ impl Tenant { .unwrap() .fail(&CIRCUIT_BREAKERS_BROKEN, err); } + CompactionError::AlreadyRunning(_) => {} } } @@ -3905,7 +3874,9 @@ where if !later.is_empty() { for (missing_id, orphan_ids) in later { for (orphan_id, _) in orphan_ids { - error!("could not load timeline {orphan_id} because its ancestor timeline {missing_id} could not be loaded"); + error!( + "could not load timeline {orphan_id} because its ancestor timeline {missing_id} could not be loaded" + ); } } bail!("could not load tenant because some timelines are missing ancestors"); @@ -4150,6 +4121,7 @@ impl Tenant { resources: TimelineResources, cause: CreateTimelineCause, create_idempotency: CreateTimelineIdempotency, + gc_compaction_state: Option, ) -> anyhow::Result> { let state = match cause { CreateTimelineCause::Load => { @@ -4181,6 +4153,7 @@ impl Tenant { state, self.attach_wal_lag_cooldown.clone(), create_idempotency, + gc_compaction_state, self.cancel.child_token(), ); @@ -4823,7 +4796,10 @@ impl Tenant { let gc_info = src_timeline.gc_info.read().unwrap(); let planned_cutoff = gc_info.min_cutoff(); if gc_info.lsn_covered_by_lease(start_lsn) { - tracing::info!("skipping comparison of {start_lsn} with gc cutoff {} and planned gc cutoff {planned_cutoff} due to lsn lease", *applied_gc_cutoff_lsn); + tracing::info!( + "skipping comparison of {start_lsn} with gc cutoff {} and planned gc cutoff {planned_cutoff} due to lsn lease", + *applied_gc_cutoff_lsn + ); } else { src_timeline .check_lsn_is_in_scope(start_lsn, &applied_gc_cutoff_lsn) @@ -4969,7 +4945,9 @@ impl Tenant { } // Idempotent <=> CreateTimelineIdempotency is identical (x, y) if x == y => { - info!("timeline already exists and idempotency matches, succeeding request"); + info!( + "timeline already exists and idempotency matches, succeeding request" + ); // fallthrough } (_, _) => { @@ -5051,7 +5029,7 @@ impl Tenant { { StartCreatingTimelineResult::CreateGuard(guard) => guard, StartCreatingTimelineResult::Idempotent(timeline) => { - return Ok(CreateTimelineResult::Idempotent(timeline)) + return Ok(CreateTimelineResult::Idempotent(timeline)); } }; @@ -5246,6 +5224,7 @@ impl Tenant { resources, CreateTimelineCause::Load, create_guard.idempotency.clone(), + None, ) .context("Failed to create timeline data structure")?; @@ -5255,7 +5234,9 @@ impl Tenant { .create_timeline_files(&create_guard.timeline_path) .await { - error!("Failed to create initial files for timeline {tenant_shard_id}/{new_timeline_id}, cleaning up: {e:?}"); + error!( + "Failed to create initial files for timeline {tenant_shard_id}/{new_timeline_id}, cleaning up: {e:?}" + ); cleanup_timeline_directory(create_guard); return Err(e); } @@ -5620,20 +5601,19 @@ pub async fn dump_layerfile_from_path( #[cfg(test)] pub(crate) mod harness { use bytes::{Bytes, BytesMut}; + use hex_literal::hex; use once_cell::sync::OnceCell; + use pageserver_api::key::Key; use pageserver_api::models::ShardParameters; + use pageserver_api::record::NeonWalRecord; use pageserver_api::shard::ShardIndex; + use utils::id::TenantId; use utils::logging; + use super::*; use crate::deletion_queue::mock::MockDeletionQueue; use crate::l0_flush::L0FlushConfig; use crate::walredo::apply_neon; - use pageserver_api::key::Key; - use pageserver_api::record::NeonWalRecord; - - use super::*; - use hex_literal::hex; - use utils::id::TenantId; pub const TIMELINE_ID: TimelineId = TimelineId::from_array(hex!("11223344556677881122334455667788")); @@ -5914,34 +5894,34 @@ pub(crate) mod harness { mod tests { use std::collections::{BTreeMap, BTreeSet}; - use super::*; - use crate::keyspace::KeySpaceAccum; - use crate::tenant::harness::*; - use crate::tenant::timeline::CompactFlags; - use crate::DEFAULT_PG_VERSION; use bytes::{Bytes, BytesMut}; use hex_literal::hex; use itertools::Itertools; - use pageserver_api::key::{Key, AUX_KEY_PREFIX, NON_INHERITED_RANGE, RELATION_SIZE_PREFIX}; + #[cfg(feature = "testing")] + use models::CompactLsnRange; + use pageserver_api::key::{AUX_KEY_PREFIX, Key, NON_INHERITED_RANGE, RELATION_SIZE_PREFIX}; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings}; + #[cfg(feature = "testing")] + use pageserver_api::record::NeonWalRecord; use pageserver_api::value::Value; use pageserver_compaction::helpers::overlaps_with; - use rand::{thread_rng, Rng}; + use rand::{Rng, thread_rng}; use storage_layer::{IoConcurrency, PersistentLayerKey}; use tests::storage_layer::ValuesReconstructState; use tests::timeline::{GetVectoredError, ShutdownMode}; + #[cfg(feature = "testing")] + use timeline::GcInfo; + #[cfg(feature = "testing")] + use timeline::compaction::{KeyHistoryRetention, KeyLogAtLsn}; use timeline::{CompactOptions, DeltaLayerTestDesc}; use utils::id::TenantId; - #[cfg(feature = "testing")] - use models::CompactLsnRange; - #[cfg(feature = "testing")] - use pageserver_api::record::NeonWalRecord; - #[cfg(feature = "testing")] - use timeline::compaction::{KeyHistoryRetention, KeyLogAtLsn}; - #[cfg(feature = "testing")] - use timeline::GcInfo; + use super::*; + use crate::DEFAULT_PG_VERSION; + use crate::keyspace::KeySpaceAccum; + use crate::tenant::harness::*; + use crate::tenant::timeline::CompactFlags; static TEST_KEY: Lazy = Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001"))); @@ -6191,11 +6171,12 @@ mod tests { panic!("wrong error type") }; assert!(err.to_string().contains("invalid branch start lsn")); - assert!(err - .source() - .unwrap() - .to_string() - .contains("we might've already garbage collected needed data")) + assert!( + err.source() + .unwrap() + .to_string() + .contains("we might've already garbage collected needed data") + ) } } @@ -6224,11 +6205,12 @@ mod tests { panic!("wrong error type"); }; assert!(&err.to_string().contains("invalid branch start lsn")); - assert!(&err - .source() - .unwrap() - .to_string() - .contains("is earlier than latest GC cutoff")); + assert!( + &err.source() + .unwrap() + .to_string() + .contains("is earlier than latest GC cutoff") + ); } } @@ -7537,10 +7519,12 @@ mod tests { } } - assert!(!harness - .conf - .timeline_path(&tenant.tenant_shard_id, &TIMELINE_ID) - .exists()); + assert!( + !harness + .conf + .timeline_path(&tenant.tenant_shard_id, &TIMELINE_ID) + .exists() + ); Ok(()) } @@ -7741,7 +7725,10 @@ mod tests { let after_num_l0_delta_files = tline.layers.read().await.layer_map()?.level0_deltas().len(); - assert!(after_num_l0_delta_files < before_num_l0_delta_files, "after_num_l0_delta_files={after_num_l0_delta_files}, before_num_l0_delta_files={before_num_l0_delta_files}"); + assert!( + after_num_l0_delta_files < before_num_l0_delta_files, + "after_num_l0_delta_files={after_num_l0_delta_files}, before_num_l0_delta_files={before_num_l0_delta_files}" + ); assert_eq!( tline.get(test_key, lsn, &ctx).await?, @@ -7908,7 +7895,10 @@ mod tests { let (_, after_delta_file_accessed) = scan_with_statistics(&tline, &keyspace, lsn, &ctx, io_concurrency.clone()) .await?; - assert!(after_delta_file_accessed < before_delta_file_accessed, "after_delta_file_accessed={after_delta_file_accessed}, before_delta_file_accessed={before_delta_file_accessed}"); + assert!( + after_delta_file_accessed < before_delta_file_accessed, + "after_delta_file_accessed={after_delta_file_accessed}, before_delta_file_accessed={before_delta_file_accessed}" + ); // Given that we already produced an image layer, there should be no delta layer needed for the scan, but still setting a low threshold there for unforeseen circumstances. assert!( after_delta_file_accessed <= 2, @@ -7962,10 +7952,12 @@ mod tests { get_vectored_impl_wrapper(&tline, base_key, lsn, &ctx).await?, Some(test_img("data key 1")) ); - assert!(get_vectored_impl_wrapper(&tline, base_key_child, lsn, &ctx) - .await - .unwrap_err() - .is_missing_key_error()); + assert!( + get_vectored_impl_wrapper(&tline, base_key_child, lsn, &ctx) + .await + .unwrap_err() + .is_missing_key_error() + ); assert!( get_vectored_impl_wrapper(&tline, base_key_nonexist, lsn, &ctx) .await diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index 7b55df52a5..b16a88eaa4 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -14,6 +14,9 @@ //! len < 128: 0XXXXXXX //! len >= 128: 1CCCXXXX XXXXXXXX XXXXXXXX XXXXXXXX //! +use std::cmp::min; +use std::io::{Error, ErrorKind}; + use async_compression::Level; use bytes::{BufMut, BytesMut}; use pageserver_api::models::ImageCompressionAlgorithm; @@ -24,10 +27,8 @@ use tracing::warn; use crate::context::RequestContext; use crate::page_cache::PAGE_SZ; use crate::tenant::block_io::BlockCursor; -use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt}; use crate::virtual_file::VirtualFile; -use std::cmp::min; -use std::io::{Error, ErrorKind}; +use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt}; #[derive(Copy, Clone, Debug)] pub struct CompressionInfo { @@ -414,12 +415,15 @@ impl BlobWriter { #[cfg(test)] pub(crate) mod tests { - use super::*; - use crate::{context::DownloadBehavior, task_mgr::TaskKind, tenant::block_io::BlockReaderRef}; use camino::Utf8PathBuf; use camino_tempfile::Utf8TempDir; use rand::{Rng, SeedableRng}; + use super::*; + use crate::context::DownloadBehavior; + use crate::task_mgr::TaskKind; + use crate::tenant::block_io::BlockReaderRef; + async fn round_trip_test(blobs: &[Vec]) -> Result<(), Error> { round_trip_test_compressed::(blobs, false).await } @@ -486,7 +490,7 @@ pub(crate) mod tests { pub(crate) fn random_array(len: usize) -> Vec { let mut rng = rand::thread_rng(); - (0..len).map(|_| rng.gen()).collect::<_>() + (0..len).map(|_| rng.r#gen()).collect::<_>() } #[tokio::test] @@ -544,9 +548,9 @@ pub(crate) mod tests { let mut rng = rand::rngs::StdRng::seed_from_u64(42); let blobs = (0..1024) .map(|_| { - let mut sz: u16 = rng.gen(); + let mut sz: u16 = rng.r#gen(); // Make 50% of the arrays small - if rng.gen() { + if rng.r#gen() { sz &= 63; } random_array(sz.into()) diff --git a/pageserver/src/tenant/block_io.rs b/pageserver/src/tenant/block_io.rs index 990211f80a..66c586daff 100644 --- a/pageserver/src/tenant/block_io.rs +++ b/pageserver/src/tenant/block_io.rs @@ -2,14 +2,16 @@ //! Low-level Block-oriented I/O functions //! +use std::ops::Deref; + +use bytes::Bytes; + use super::storage_layer::delta_layer::{Adapter, DeltaLayerInner}; use crate::context::RequestContext; -use crate::page_cache::{self, FileId, PageReadGuard, PageWriteGuard, ReadBufResult, PAGE_SZ}; +use crate::page_cache::{self, FileId, PAGE_SZ, PageReadGuard, PageWriteGuard, ReadBufResult}; #[cfg(test)] use crate::virtual_file::IoBufferMut; use crate::virtual_file::VirtualFile; -use bytes::Bytes; -use std::ops::Deref; /// This is implemented by anything that can read 8 kB (PAGE_SZ) /// blocks, using the page cache diff --git a/pageserver/src/tenant/checks.rs b/pageserver/src/tenant/checks.rs index f98356242e..d5b979ab2a 100644 --- a/pageserver/src/tenant/checks.rs +++ b/pageserver/src/tenant/checks.rs @@ -63,9 +63,9 @@ pub fn check_valid_layermap(metadata: &[LayerName]) -> Option { && overlaps_with(&layer.key_range, &other_layer.key_range) { let err = format!( - "layer violates the layer map LSN split assumption: layer {} intersects with layer {}", - layer, other_layer - ); + "layer violates the layer map LSN split assumption: layer {} intersects with layer {}", + layer, other_layer + ); return Some(err); } } diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index ab4c4c935d..334fb04604 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -8,16 +8,17 @@ //! We cannot use global or default config instead, because wrong settings //! may lead to a data loss. //! +use std::num::NonZeroU64; +use std::time::Duration; + pub(crate) use pageserver_api::config::TenantConfigToml as TenantConf; -use pageserver_api::models::CompactionAlgorithmSettings; -use pageserver_api::models::EvictionPolicy; -use pageserver_api::models::{self, TenantConfigPatch}; +use pageserver_api::models::{ + self, CompactionAlgorithmSettings, EvictionPolicy, TenantConfigPatch, +}; use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize}; use serde::de::IntoDeserializer; use serde::{Deserialize, Serialize}; use serde_json::Value; -use std::num::NonZeroU64; -use std::time::Duration; use utils::generation::Generation; use utils::postgres_client::PostgresClientProtocol; @@ -739,9 +740,10 @@ impl From for models::TenantConfig { #[cfg(test)] mod tests { - use super::*; use models::TenantConfig; + use super::*; + #[test] fn de_serializing_pageserver_config_omits_empty_values() { let small_conf = TenantConfOpt { diff --git a/pageserver/src/tenant/disk_btree.rs b/pageserver/src/tenant/disk_btree.rs index bb9df020b5..73c105b34e 100644 --- a/pageserver/src/tenant/disk_btree.rs +++ b/pageserver/src/tenant/disk_btree.rs @@ -18,27 +18,23 @@ //! - An Iterator interface would be more convenient for the callers than the //! 'visit' function //! +use std::cmp::Ordering; +use std::iter::Rev; +use std::ops::{Range, RangeInclusive}; +use std::{io, result}; + use async_stream::try_stream; -use byteorder::{ReadBytesExt, BE}; +use byteorder::{BE, ReadBytesExt}; use bytes::{BufMut, Bytes, BytesMut}; use either::Either; use futures::{Stream, StreamExt}; use hex; -use std::{ - cmp::Ordering, - io, - iter::Rev, - ops::{Range, RangeInclusive}, - result, -}; use thiserror::Error; use tracing::error; -use crate::{ - context::{DownloadBehavior, RequestContext}, - task_mgr::TaskKind, - tenant::block_io::{BlockReader, BlockWriter}, -}; +use crate::context::{DownloadBehavior, RequestContext}; +use crate::task_mgr::TaskKind; +use crate::tenant::block_io::{BlockReader, BlockWriter}; // The maximum size of a value stored in the B-tree. 5 bytes is enough currently. pub const VALUE_SZ: usize = 5; @@ -833,12 +829,14 @@ impl BuildNode { #[cfg(test)] pub(crate) mod tests { - use super::*; - use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReaderRef}; - use rand::Rng; use std::collections::BTreeMap; use std::sync::atomic::{AtomicUsize, Ordering}; + use rand::Rng; + + use super::*; + use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReaderRef}; + #[derive(Clone, Default)] pub(crate) struct TestDisk { blocks: Vec, @@ -1115,7 +1113,7 @@ pub(crate) mod tests { // Test get() operations on random keys, most of which will not exist for _ in 0..100000 { - let key_int = rand::thread_rng().gen::(); + let key_int = rand::thread_rng().r#gen::(); let search_key = u128::to_be_bytes(key_int); assert!(reader.get(&search_key, &ctx).await? == all_data.get(&key_int).cloned()); } diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index ba79672bc7..cb25fa6185 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -1,6 +1,17 @@ //! Implementation of append-only file data structure //! used to keep in-memory layers spilled on disk. +use std::io; +use std::sync::Arc; +use std::sync::atomic::AtomicU64; + +use camino::Utf8PathBuf; +use num_traits::Num; +use pageserver_api::shard::TenantShardId; +use tokio_epoll_uring::{BoundedBuf, Slice}; +use tracing::error; +use utils::id::TimelineId; + use crate::assert_u64_eq_usize::{U64IsUsize, UsizeIsU64}; use crate::config::PageServerConf; use crate::context::RequestContext; @@ -9,17 +20,7 @@ use crate::tenant::storage_layer::inmemory_layer::vectored_dio_read::File; use crate::virtual_file::owned_buffers_io::io_buf_aligned::IoBufAlignedMut; use crate::virtual_file::owned_buffers_io::slice::SliceMutExt; use crate::virtual_file::owned_buffers_io::write::Buffer; -use crate::virtual_file::{self, owned_buffers_io, IoBufferMut, VirtualFile}; -use camino::Utf8PathBuf; -use num_traits::Num; -use pageserver_api::shard::TenantShardId; -use tokio_epoll_uring::{BoundedBuf, Slice}; -use tracing::error; - -use std::io; -use std::sync::atomic::AtomicU64; -use std::sync::Arc; -use utils::id::TimelineId; +use crate::virtual_file::{self, IoBufferMut, VirtualFile, owned_buffers_io}; pub struct EphemeralFile { _tenant_shard_id: TenantShardId, @@ -319,13 +320,14 @@ pub fn is_ephemeral_file(filename: &str) -> bool { #[cfg(test)] mod tests { + use std::fs; + use std::str::FromStr; + use rand::Rng; use super::*; use crate::context::DownloadBehavior; use crate::task_mgr::TaskKind; - use std::fs; - use std::str::FromStr; fn harness( test_name: &str, diff --git a/pageserver/src/tenant/gc_block.rs b/pageserver/src/tenant/gc_block.rs index af73acb2be..7aa920c953 100644 --- a/pageserver/src/tenant/gc_block.rs +++ b/pageserver/src/tenant/gc_block.rs @@ -1,4 +1,5 @@ -use std::{collections::HashMap, sync::Arc}; +use std::collections::HashMap; +use std::sync::Arc; use utils::id::TimelineId; diff --git a/pageserver/src/tenant/gc_result.rs b/pageserver/src/tenant/gc_result.rs index c805aafeab..7a7d6d19cb 100644 --- a/pageserver/src/tenant/gc_result.rs +++ b/pageserver/src/tenant/gc_result.rs @@ -1,8 +1,9 @@ -use anyhow::Result; -use serde::Serialize; use std::ops::AddAssign; use std::time::Duration; +use anyhow::Result; +use serde::Serialize; + /// /// Result of performing GC /// diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index a69cce932e..59f5a6bd90 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -46,24 +46,24 @@ mod historic_layer_coverage; mod layer_coverage; -use crate::context::RequestContext; -use crate::keyspace::KeyPartitioning; -use crate::tenant::storage_layer::InMemoryLayer; -use anyhow::Result; -use pageserver_api::key::Key; -use pageserver_api::keyspace::{KeySpace, KeySpaceAccum}; -use range_set_blaze::{CheckSortedDisjoint, RangeSetBlaze}; use std::collections::{HashMap, VecDeque}; use std::iter::Peekable; use std::ops::Range; use std::sync::Arc; + +use anyhow::Result; +use historic_layer_coverage::BufferedHistoricLayerCoverage; +pub use historic_layer_coverage::LayerKey; +use pageserver_api::key::Key; +use pageserver_api::keyspace::{KeySpace, KeySpaceAccum}; +use range_set_blaze::{CheckSortedDisjoint, RangeSetBlaze}; use tokio::sync::watch; use utils::lsn::Lsn; -use historic_layer_coverage::BufferedHistoricLayerCoverage; -pub use historic_layer_coverage::LayerKey; - use super::storage_layer::{LayerVisibilityHint, PersistentLayerDesc}; +use crate::context::RequestContext; +use crate::keyspace::KeyPartitioning; +use crate::tenant::storage_layer::InMemoryLayer; /// /// LayerMap tracks what layers exist on a timeline. @@ -1066,18 +1066,17 @@ impl LayerMap { #[cfg(test)] mod tests { - use crate::tenant::{storage_layer::LayerName, IndexPart}; - use pageserver_api::{ - key::DBDIR_KEY, - keyspace::{KeySpace, KeySpaceRandomAccum}, - }; - use std::{collections::HashMap, path::PathBuf}; - use utils::{ - id::{TenantId, TimelineId}, - shard::TenantShardId, - }; + use std::collections::HashMap; + use std::path::PathBuf; + + use pageserver_api::key::DBDIR_KEY; + use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum}; + use utils::id::{TenantId, TimelineId}; + use utils::shard::TenantShardId; use super::*; + use crate::tenant::IndexPart; + use crate::tenant::storage_layer::LayerName; #[derive(Clone)] struct LayerDesc { @@ -1417,9 +1416,11 @@ mod tests { assert!(!shadow.ranges.is_empty()); // At least some layers should be marked covered - assert!(layer_visibilities - .iter() - .any(|i| matches!(i.1, LayerVisibilityHint::Covered))); + assert!( + layer_visibilities + .iter() + .any(|i| matches!(i.1, LayerVisibilityHint::Covered)) + ); let layer_visibilities = layer_visibilities.into_iter().collect::>(); diff --git a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs index 136f68bc36..f8bec48886 100644 --- a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs +++ b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs @@ -3,9 +3,8 @@ use std::ops::Range; use tracing::info; -use crate::tenant::storage_layer::PersistentLayerDesc; - use super::layer_coverage::LayerCoverageTuple; +use crate::tenant::storage_layer::PersistentLayerDesc; /// Layers in this module are identified and indexed by this data. /// diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index 15c6955260..77f9a3579d 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -19,8 +19,9 @@ use anyhow::ensure; use serde::{Deserialize, Serialize}; -use utils::bin_ser::SerializeError; -use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn}; +use utils::bin_ser::{BeSer, SerializeError}; +use utils::id::TimelineId; +use utils::lsn::Lsn; /// Use special format number to enable backward compatibility. const METADATA_FORMAT_VERSION: u16 = 4; @@ -345,9 +346,10 @@ impl TimelineMetadata { } pub(crate) mod modern_serde { - use super::{TimelineMetadata, TimelineMetadataBodyV2, TimelineMetadataHeader}; use serde::{Deserialize, Serialize}; + use super::{TimelineMetadata, TimelineMetadataBodyV2, TimelineMetadataHeader}; + pub(crate) fn deserialize<'de, D>(deserializer: D) -> Result where D: serde::de::Deserializer<'de>, diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 22ee560dbf..003f84e640 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -1,34 +1,42 @@ //! This module acts as a switchboard to access different repositories managed by this //! page server. -use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf}; -use futures::StreamExt; -use itertools::Itertools; -use pageserver_api::key::Key; -use pageserver_api::models::LocationConfigMode; -use pageserver_api::shard::{ - ShardCount, ShardIdentity, ShardIndex, ShardNumber, ShardStripeSize, TenantShardId, -}; -use pageserver_api::upcall_api::ReAttachResponseTenant; -use rand::{distributions::Alphanumeric, Rng}; -use remote_storage::TimeoutOrCancel; use std::borrow::Cow; use std::cmp::Ordering; use std::collections::{BTreeMap, HashMap, HashSet}; use std::ops::Deref; use std::sync::Arc; use std::time::Duration; -use sysinfo::SystemExt; -use tokio::fs; use anyhow::Context; +use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf}; +use futures::StreamExt; +use itertools::Itertools; use once_cell::sync::Lazy; +use pageserver_api::key::Key; +use pageserver_api::models::LocationConfigMode; +use pageserver_api::shard::{ + ShardCount, ShardIdentity, ShardIndex, ShardNumber, ShardStripeSize, TenantShardId, +}; +use pageserver_api::upcall_api::ReAttachResponseTenant; +use rand::Rng; +use rand::distributions::Alphanumeric; +use remote_storage::TimeoutOrCancel; +use sysinfo::SystemExt; +use tokio::fs; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use tracing::*; - +use utils::crashsafe::path_with_suffix_extension; +use utils::fs_ext::PathExt; +use utils::generation::Generation; +use utils::id::{TenantId, TimelineId}; use utils::{backoff, completion, crashsafe}; +use super::remote_timeline_client::remote_tenant_path; +use super::secondary::SecondaryTenant; +use super::timeline::detach_ancestor::{self, PreparedTimelineDetach}; +use super::{GlobalShutDown, TenantSharedResources}; use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; use crate::controller_upcall_client::{ @@ -37,7 +45,7 @@ use crate::controller_upcall_client::{ use crate::deletion_queue::DeletionQueueClient; use crate::http::routes::ACTIVE_TENANT_TIMEOUT; use crate::metrics::{TENANT, TENANT_MANAGER as METRICS}; -use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME}; +use crate::task_mgr::{BACKGROUND_RUNTIME, TaskKind}; use crate::tenant::config::{ AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, SecondaryLocationConfig, }; @@ -48,16 +56,6 @@ use crate::tenant::{AttachedTenantConf, GcError, LoadConfigError, SpawnMode, Ten use crate::virtual_file::MaybeFatalIo; use crate::{InitializationOrder, TEMP_FILE_SUFFIX}; -use utils::crashsafe::path_with_suffix_extension; -use utils::fs_ext::PathExt; -use utils::generation::Generation; -use utils::id::{TenantId, TimelineId}; - -use super::remote_timeline_client::remote_tenant_path; -use super::secondary::SecondaryTenant; -use super::timeline::detach_ancestor::{self, PreparedTimelineDetach}; -use super::{GlobalShutDown, TenantSharedResources}; - /// For a tenant that appears in TenantsMap, it may either be /// - `Attached`: has a full Tenant object, is elegible to service /// reads and ingest WAL. @@ -140,7 +138,7 @@ impl TenantStartupMode { /// If this returns None, the re-attach struct is in an invalid state and /// should be ignored in the response. fn from_reattach_tenant(rart: ReAttachResponseTenant) -> Option { - match (rart.mode, rart.gen) { + match (rart.mode, rart.r#gen) { (LocationConfigMode::Detached, _) => None, (LocationConfigMode::Secondary, _) => Some(Self::Secondary), (LocationConfigMode::AttachedMulti, Some(g)) => { @@ -376,7 +374,7 @@ async fn init_load_generations( TenantStartupMode::Attached((_mode, generation)) => Some(generation), TenantStartupMode::Secondary => None, } - .map(|gen| (*id, *gen)) + .map(|gen_| (*id, *gen_)) }) .collect(); resources.deletion_queue_client.recover(attached_tenants)?; @@ -502,7 +500,9 @@ pub async fn init_tenant_mgr( .total_memory(); let max_ephemeral_layer_bytes = conf.ephemeral_bytes_per_memory_kb as u64 * (system_memory / 1024); - tracing::info!("Initialized ephemeral layer size limit to {max_ephemeral_layer_bytes}, for {system_memory} bytes of memory"); + tracing::info!( + "Initialized ephemeral layer size limit to {max_ephemeral_layer_bytes}, for {system_memory} bytes of memory" + ); inmemory_layer::GLOBAL_RESOURCES.max_dirty_bytes.store( max_ephemeral_layer_bytes, std::sync::atomic::Ordering::Relaxed, @@ -700,10 +700,11 @@ fn tenant_spawn( // to avoid impacting prod runtime performance. assert!(!crate::is_temporary(tenant_path)); debug_assert!(tenant_path.is_dir()); - debug_assert!(conf - .tenant_location_config_path(&tenant_shard_id) - .try_exists() - .unwrap()); + debug_assert!( + conf.tenant_location_config_path(&tenant_shard_id) + .try_exists() + .unwrap() + ); Tenant::spawn( conf, @@ -791,7 +792,9 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock) { (total_in_progress, total_attached) } TenantsMap::ShuttingDown(_) => { - error!("already shutting down, this function isn't supposed to be called more than once"); + error!( + "already shutting down, this function isn't supposed to be called more than once" + ); return; } } @@ -1016,9 +1019,9 @@ impl TenantManager { Ok(Ok(_)) => return Ok(Some(tenant)), Err(_) => { tracing::warn!( - timeout_ms = flush_timeout.as_millis(), - "Timed out waiting for flush to remote storage, proceeding anyway." - ) + timeout_ms = flush_timeout.as_millis(), + "Timed out waiting for flush to remote storage, proceeding anyway." + ) } } } @@ -1194,7 +1197,9 @@ impl TenantManager { } TenantSlot::Attached(tenant) => { let (_guard, progress) = utils::completion::channel(); - info!("Shutting down just-spawned tenant, because tenant manager is shut down"); + info!( + "Shutting down just-spawned tenant, because tenant manager is shut down" + ); match tenant.shutdown(progress, ShutdownMode::Hard).await { Ok(()) => { info!("Finished shutting down just-spawned tenant"); @@ -1784,7 +1789,7 @@ impl TenantManager { _ => { return Err(anyhow::anyhow!(e).context(format!( "Hard linking {relative_layer} into {child_prefix}" - ))) + ))); } } } @@ -2025,8 +2030,8 @@ impl TenantManager { .wait_to_become_active(std::time::Duration::from_secs(9999)) .await .map_err(|e| { - use pageserver_api::models::TenantState; use GetActiveTenantError::{Cancelled, WillNotBecomeActive}; + use pageserver_api::models::TenantState; match e { Cancelled | WillNotBecomeActive(TenantState::Stopping { .. }) => { Error::ShuttingDown @@ -2089,7 +2094,7 @@ impl TenantManager { match selector { ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => { - return ShardResolveResult::Found(tenant.clone()) + return ShardResolveResult::Found(tenant.clone()); } ShardSelector::Page(key) => { // First slot we see for this tenant, calculate the expected shard number @@ -2486,7 +2491,7 @@ impl SlotGuard { TenantsMap::Initializing => { return Err(TenantSlotUpsertError::MapState( TenantMapError::StillInitializing, - )) + )); } TenantsMap::ShuttingDown(_) => { return Err(TenantSlotUpsertError::ShuttingDown(( @@ -2815,21 +2820,22 @@ where } } -use { - crate::tenant::gc_result::GcResult, http_utils::error::ApiError, - pageserver_api::models::TimelineGcRequest, -}; +use http_utils::error::ApiError; +use pageserver_api::models::TimelineGcRequest; + +use crate::tenant::gc_result::GcResult; #[cfg(test)] mod tests { use std::collections::BTreeMap; use std::sync::Arc; + use tracing::Instrument; + use super::super::harness::TenantHarness; + use super::TenantsMap; use crate::tenant::mgr::TenantSlot; - use super::{super::harness::TenantHarness, TenantsMap}; - #[tokio::test(start_paused = true)] async fn shutdown_awaits_in_progress_tenant() { // Test that if an InProgress tenant is in the map during shutdown, the shutdown will gracefully diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 713efbb9a4..4ba5844fea 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -179,77 +179,64 @@ pub mod index; pub mod manifest; pub(crate) mod upload; -use anyhow::Context; -use camino::Utf8Path; -use chrono::{NaiveDateTime, Utc}; - -pub(crate) use download::download_initdb_tar_zst; -use pageserver_api::models::TimelineArchivalState; -use pageserver_api::shard::{ShardIndex, TenantShardId}; -use regex::Regex; -use scopeguard::ScopeGuard; -use tokio_util::sync::CancellationToken; -use utils::backoff::{ - self, exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, -}; -use utils::pausable_failpoint; -use utils::shard::ShardNumber; - use std::collections::{HashMap, HashSet, VecDeque}; +use std::ops::DerefMut; use std::sync::atomic::{AtomicU32, Ordering}; use std::sync::{Arc, Mutex, OnceLock}; use std::time::Duration; +use anyhow::Context; +use camino::Utf8Path; +use chrono::{NaiveDateTime, Utc}; +pub(crate) use download::{ + download_index_part, download_initdb_tar_zst, download_tenant_manifest, is_temp_download_file, + list_remote_tenant_shards, list_remote_timelines, +}; +use index::GcCompactionState; +pub(crate) use index::LayerFileMetadata; +use pageserver_api::models::TimelineArchivalState; +use pageserver_api::shard::{ShardIndex, TenantShardId}; +use regex::Regex; use remote_storage::{ DownloadError, GenericRemoteStorage, ListingMode, RemotePath, TimeoutOrCancel, }; -use std::ops::DerefMut; -use tracing::{debug, error, info, instrument, warn}; -use tracing::{info_span, Instrument}; -use utils::lsn::Lsn; - -use crate::context::RequestContext; -use crate::deletion_queue::{DeletionQueueClient, DeletionQueueError}; -use crate::metrics::{ - MeasureRemoteOp, RemoteOpFileKind, RemoteOpKind, RemoteTimelineClientMetrics, - RemoteTimelineClientMetricsCallTrackSize, REMOTE_ONDEMAND_DOWNLOADED_BYTES, - REMOTE_ONDEMAND_DOWNLOADED_LAYERS, +use scopeguard::ScopeGuard; +use tokio_util::sync::CancellationToken; +use tracing::{Instrument, debug, error, info, info_span, instrument, warn}; +pub(crate) use upload::upload_initdb_dir; +use utils::backoff::{ + self, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, exponential_backoff, }; -use crate::task_mgr::shutdown_token; -use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id; -use crate::tenant::remote_timeline_client::download::download_retry; -use crate::tenant::storage_layer::AsLayerDesc; -use crate::tenant::upload_queue::{Delete, OpType, UploadQueueStoppedDeletable}; -use crate::tenant::TIMELINES_SEGMENT_NAME; -use crate::{ - config::PageServerConf, - task_mgr, - task_mgr::TaskKind, - task_mgr::BACKGROUND_RUNTIME, - tenant::metadata::TimelineMetadata, - tenant::upload_queue::{ - UploadOp, UploadQueue, UploadQueueInitialized, UploadQueueStopped, UploadTask, - }, - TENANT_HEATMAP_BASENAME, -}; - use utils::id::{TenantId, TimelineId}; +use utils::lsn::Lsn; +use utils::pausable_failpoint; +use utils::shard::ShardNumber; use self::index::IndexPart; - use super::config::AttachedLocationConfig; use super::metadata::MetadataUpdate; use super::storage_layer::{Layer, LayerName, ResidentLayer}; use super::timeline::import_pgdata; use super::upload_queue::{NotInitialized, SetDeletedFlagProgress}; use super::{DeleteTimelineError, Generation}; - -pub(crate) use download::{ - download_index_part, download_tenant_manifest, is_temp_download_file, - list_remote_tenant_shards, list_remote_timelines, +use crate::config::PageServerConf; +use crate::context::RequestContext; +use crate::deletion_queue::{DeletionQueueClient, DeletionQueueError}; +use crate::metrics::{ + MeasureRemoteOp, REMOTE_ONDEMAND_DOWNLOADED_BYTES, REMOTE_ONDEMAND_DOWNLOADED_LAYERS, + RemoteOpFileKind, RemoteOpKind, RemoteTimelineClientMetrics, + RemoteTimelineClientMetricsCallTrackSize, }; -pub(crate) use index::LayerFileMetadata; -pub(crate) use upload::upload_initdb_dir; +use crate::task_mgr::{BACKGROUND_RUNTIME, TaskKind, shutdown_token}; +use crate::tenant::metadata::TimelineMetadata; +use crate::tenant::remote_timeline_client::download::download_retry; +use crate::tenant::storage_layer::AsLayerDesc; +use crate::tenant::upload_queue::{ + Delete, OpType, UploadOp, UploadQueue, UploadQueueInitialized, UploadQueueStopped, + UploadQueueStoppedDeletable, UploadTask, +}; +use crate::tenant::{TIMELINES_SEGMENT_NAME, debug_assert_current_span_has_tenant_and_timeline_id}; +use crate::{TENANT_HEATMAP_BASENAME, task_mgr}; // Occasional network issues and such can cause remote operations to fail, and // that's expected. If a download fails, we log it at info-level, and retry. @@ -913,6 +900,18 @@ impl RemoteTimelineClient { Ok(()) } + /// Launch an index-file upload operation in the background, setting `import_pgdata` field. + pub(crate) fn schedule_index_upload_for_gc_compaction_state_update( + self: &Arc, + gc_compaction_state: GcCompactionState, + ) -> anyhow::Result<()> { + let mut guard = self.upload_queue.lock().unwrap(); + let upload_queue = guard.initialized_mut()?; + upload_queue.dirty.gc_compaction = Some(gc_compaction_state); + self.schedule_index_upload(upload_queue); + Ok(()) + } + /// /// Launch an index-file upload operation in the background, if necessary. /// @@ -1078,7 +1077,11 @@ impl RemoteTimelineClient { if !wanted(x) && wanted(y) { // this could be avoided by having external in-memory synchronization, like // timeline detach ancestor - warn!(?reason, op="insert", "unexpected: two racing processes to enable and disable a gc blocking reason"); + warn!( + ?reason, + op = "insert", + "unexpected: two racing processes to enable and disable a gc blocking reason" + ); } // at this point, the metadata must always show that there is a parent @@ -1132,7 +1135,11 @@ impl RemoteTimelineClient { (x, y) if wanted(x) && !wanted(y) => Some(self.schedule_barrier0(upload_queue)), (x, y) => { if !wanted(x) && wanted(y) { - warn!(?reason, op="remove", "unexpected: two racing processes to enable and disable a gc blocking reason (remove)"); + warn!( + ?reason, + op = "remove", + "unexpected: two racing processes to enable and disable a gc blocking reason (remove)" + ); } upload_queue.dirty.gc_blocking = @@ -1274,12 +1281,14 @@ impl RemoteTimelineClient { #[cfg(feature = "testing")] for (name, metadata) in &with_metadata { - let gen = metadata.generation; - if let Some(unexpected) = upload_queue.dangling_files.insert(name.to_owned(), gen) { - if unexpected == gen { + let gen_ = metadata.generation; + if let Some(unexpected) = upload_queue.dangling_files.insert(name.to_owned(), gen_) { + if unexpected == gen_ { tracing::error!("{name} was unlinked twice with same generation"); } else { - tracing::error!("{name} was unlinked twice with different generations {gen:?} and {unexpected:?}"); + tracing::error!( + "{name} was unlinked twice with different generations {gen_:?} and {unexpected:?}" + ); } } } @@ -1341,11 +1350,11 @@ impl RemoteTimelineClient { #[cfg(feature = "testing")] for (name, meta) in &with_metadata { - let gen = meta.generation; + let gen_ = meta.generation; match upload_queue.dangling_files.remove(name) { - Some(same) if same == gen => { /* expected */ } + Some(same) if same == gen_ => { /* expected */ } Some(other) => { - tracing::error!("{name} was unlinked with {other:?} but deleted with {gen:?}"); + tracing::error!("{name} was unlinked with {other:?} but deleted with {gen_:?}"); } None => { tracing::error!("{name} was unlinked but was not dangling"); @@ -1442,7 +1451,9 @@ impl RemoteTimelineClient { // proper stop is yet to be called. On cancel the original or some later task must call // `stop` or `shutdown`. let sg = scopeguard::guard((), |_| { - tracing::error!("RemoteTimelineClient::shutdown was cancelled; this should not happen, do not make this into an allowed_error") + tracing::error!( + "RemoteTimelineClient::shutdown was cancelled; this should not happen, do not make this into an allowed_error" + ) }); let fut = { @@ -1458,7 +1469,7 @@ impl RemoteTimelineClient { scopeguard::ScopeGuard::into_inner(sg); return; } - UploadQueue::Initialized(ref mut init) => init, + UploadQueue::Initialized(init) => init, }; // if the queue is already stuck due to a shutdown operation which was cancelled, then @@ -1818,7 +1829,9 @@ impl RemoteTimelineClient { .map(|n| n.starts_with(IndexPart::FILE_NAME)) .unwrap_or(false) }) - .filter_map(|o| parse_remote_index_path(o.key.clone()).map(|gen| (o.key.clone(), gen))) + .filter_map(|o| { + parse_remote_index_path(o.key.clone()).map(|gen_| (o.key.clone(), gen_)) + }) .max_by_key(|i| i.1) .map(|i| i.0.clone()) .unwrap_or( @@ -2010,7 +2023,7 @@ impl RemoteTimelineClient { } let upload_result: anyhow::Result<()> = match &task.op { - UploadOp::UploadLayer(ref layer, ref layer_metadata, mode) => { + UploadOp::UploadLayer(layer, layer_metadata, mode) => { // TODO: check if this mechanism can be removed now that can_bypass() performs // conflict checks during scheduling. if let Some(OpType::FlushDeletion) = mode { @@ -2100,7 +2113,7 @@ impl RemoteTimelineClient { ) .await } - UploadOp::UploadMetadata { ref uploaded } => { + UploadOp::UploadMetadata { uploaded } => { let res = upload::upload_index_part( &self.storage_impl, &self.tenant_shard_id, @@ -2216,11 +2229,11 @@ impl RemoteTimelineClient { let lsn_update = { let mut upload_queue_guard = self.upload_queue.lock().unwrap(); let upload_queue = match upload_queue_guard.deref_mut() { - UploadQueue::Uninitialized => panic!("callers are responsible for ensuring this is only called on an initialized queue"), - UploadQueue::Stopped(_stopped) => { - None - }, - UploadQueue::Initialized(qi) => { Some(qi) } + UploadQueue::Uninitialized => panic!( + "callers are responsible for ensuring this is only called on an initialized queue" + ), + UploadQueue::Stopped(_stopped) => None, + UploadQueue::Initialized(qi) => Some(qi), }; let upload_queue = match upload_queue { @@ -2242,7 +2255,11 @@ impl RemoteTimelineClient { let is_later = last_updater.is_some_and(|task_id| task_id < task.task_id); let monotone = is_later || last_updater.is_none(); - assert!(monotone, "no two index uploads should be completing at the same time, prev={last_updater:?}, task.task_id={}", task.task_id); + assert!( + monotone, + "no two index uploads should be completing at the same time, prev={last_updater:?}, task.task_id={}", + task.task_id + ); // not taking ownership is wasteful upload_queue.clean.0.clone_from(uploaded); @@ -2641,20 +2658,16 @@ pub fn parse_remote_tenant_manifest_path(path: RemotePath) -> Option #[cfg(test)] mod tests { - use super::*; - use crate::{ - context::RequestContext, - tenant::{ - config::AttachmentMode, - harness::{TenantHarness, TIMELINE_ID}, - storage_layer::layer::local_layer_path, - Tenant, Timeline, - }, - DEFAULT_PG_VERSION, - }; - use std::collections::HashSet; + use super::*; + use crate::DEFAULT_PG_VERSION; + use crate::context::RequestContext; + use crate::tenant::config::AttachmentMode; + use crate::tenant::harness::{TIMELINE_ID, TenantHarness}; + use crate::tenant::storage_layer::layer::local_layer_path; + use crate::tenant::{Tenant, Timeline}; + pub(super) fn dummy_contents(name: &str) -> Vec { format!("contents for {name}").into() } diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index b4d45dca75..92be2145ce 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -8,41 +8,39 @@ use std::future::Future; use std::str::FromStr; use std::time::SystemTime; -use anyhow::{anyhow, Context}; +use anyhow::{Context, anyhow}; use camino::{Utf8Path, Utf8PathBuf}; use pageserver_api::shard::TenantShardId; +use remote_storage::{ + DownloadError, DownloadKind, DownloadOpts, GenericRemoteStorage, ListingMode, RemotePath, +}; use tokio::fs::{self, File, OpenOptions}; use tokio::io::{AsyncSeekExt, AsyncWriteExt}; use tokio_util::io::StreamReader; use tokio_util::sync::CancellationToken; use tracing::warn; -use utils::backoff; +use utils::crashsafe::path_with_suffix_extension; +use utils::id::{TenantId, TimelineId}; +use utils::{backoff, pausable_failpoint}; +use super::index::{IndexPart, LayerFileMetadata}; +use super::manifest::TenantManifest; +use super::{ + FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, INITDB_PATH, parse_remote_index_path, + parse_remote_tenant_manifest_path, remote_index_path, remote_initdb_archive_path, + remote_initdb_preserved_archive_path, remote_tenant_manifest_path, + remote_tenant_manifest_prefix, remote_tenant_path, +}; +use crate::TEMP_FILE_SUFFIX; use crate::config::PageServerConf; use crate::context::RequestContext; use crate::span::{ debug_assert_current_span_has_tenant_and_timeline_id, debug_assert_current_span_has_tenant_id, }; +use crate::tenant::Generation; use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path}; use crate::tenant::storage_layer::LayerName; -use crate::tenant::Generation; -use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile}; -use crate::TEMP_FILE_SUFFIX; -use remote_storage::{ - DownloadError, DownloadKind, DownloadOpts, GenericRemoteStorage, ListingMode, RemotePath, -}; -use utils::crashsafe::path_with_suffix_extension; -use utils::id::{TenantId, TimelineId}; -use utils::pausable_failpoint; - -use super::index::{IndexPart, LayerFileMetadata}; -use super::manifest::TenantManifest; -use super::{ - parse_remote_index_path, parse_remote_tenant_manifest_path, remote_index_path, - remote_initdb_archive_path, remote_initdb_preserved_archive_path, remote_tenant_manifest_path, - remote_tenant_manifest_prefix, remote_tenant_path, FAILED_DOWNLOAD_WARN_THRESHOLD, - FAILED_REMOTE_OP_RETRIES, INITDB_PATH, -}; +use crate::virtual_file::{MaybeFatalIo, VirtualFile, on_fatal_io_error}; /// /// If 'metadata' is given, we will validate that the downloaded file's size matches that @@ -207,9 +205,9 @@ async fn download_object( } #[cfg(target_os = "linux")] crate::virtual_file::io_engine::IoEngine::TokioEpollUring => { - use crate::virtual_file::owned_buffers_io; - use crate::virtual_file::IoBufferMut; use std::sync::Arc; + + use crate::virtual_file::{IoBufferMut, owned_buffers_io}; async { let destination_file = Arc::new( VirtualFile::create(dst_path, ctx) diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index b8b18005fd..ceaed58bbd 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -7,16 +7,16 @@ use std::collections::HashMap; use chrono::NaiveDateTime; use pageserver_api::models::AuxFilePolicy; +use pageserver_api::shard::ShardIndex; use serde::{Deserialize, Serialize}; +use utils::id::TimelineId; +use utils::lsn::Lsn; use super::is_same_remote_layer_path; +use crate::tenant::Generation; use crate::tenant::metadata::TimelineMetadata; use crate::tenant::storage_layer::LayerName; use crate::tenant::timeline::import_pgdata; -use crate::tenant::Generation; -use pageserver_api::shard::ShardIndex; -use utils::id::TimelineId; -use utils::lsn::Lsn; /// In-memory representation of an `index_part.json` file /// @@ -85,9 +85,36 @@ pub struct IndexPart { #[serde(skip_serializing_if = "Option::is_none", default)] pub(crate) rel_size_migration: Option, - /// The LSN of gc-compaction horizon. Once gc-compaction is finished for all layer files below an LSN, this LSN will be updated. + /// Not used anymore -- kept here for backwards compatibility. Merged into the `gc_compaction` field. #[serde(skip_serializing_if = "Option::is_none", default)] - pub(crate) l2_lsn: Option, + l2_lsn: Option, + + /// State for the garbage-collecting compaction pass. + /// + /// Garbage-collecting compaction (gc-compaction) prunes `Value`s that are outside + /// the PITR window and not needed by child timelines. + /// + /// A commonly used synonym for this compaction pass is + /// "bottommost-compaction" because the affected LSN range + /// is the "bottom" of the (key,lsn) map. + /// + /// Gc-compaction is a quite expensive operation; that's why we use + /// trigger condition. + /// This field here holds the state pertaining to that trigger condition + /// and (in future) to the progress of the gc-compaction, so that it's + /// resumable across restarts & migrations. + /// + /// Note that the underlying algorithm is _also_ called `gc-compaction` + /// in most places & design docs; but in fact it is more flexible than + /// just the specific use case here; it needs a new name. + #[serde(skip_serializing_if = "Option::is_none", default)] + pub(crate) gc_compaction: Option, +} + +#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] +pub struct GcCompactionState { + /// The upper bound of the last completed garbage-collecting compaction, aka. L2 LSN. + pub(crate) last_completed_lsn: Lsn, } #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] @@ -123,10 +150,11 @@ impl IndexPart { /// - 10: +import_pgdata /// - 11: +rel_size_migration /// - 12: +l2_lsn - const LATEST_VERSION: usize = 12; + /// - 13: +gc_compaction + const LATEST_VERSION: usize = 13; // Versions we may see when reading from a bucket. - pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]; + pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]; pub const FILE_NAME: &'static str = "index_part.json"; @@ -144,6 +172,7 @@ impl IndexPart { import_pgdata: None, rel_size_migration: None, l2_lsn: None, + gc_compaction: None, } } @@ -406,10 +435,12 @@ impl GcBlocking { #[cfg(test)] mod tests { - use super::*; use std::str::FromStr; + use utils::id::TimelineId; + use super::*; + #[test] fn v1_indexpart_is_parsed() { let example = r#"{ @@ -450,6 +481,7 @@ mod tests { import_pgdata: None, rel_size_migration: None, l2_lsn: None, + gc_compaction: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -497,6 +529,7 @@ mod tests { import_pgdata: None, rel_size_migration: None, l2_lsn: None, + gc_compaction: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -545,6 +578,7 @@ mod tests { import_pgdata: None, rel_size_migration: None, l2_lsn: None, + gc_compaction: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -596,6 +630,7 @@ mod tests { import_pgdata: None, rel_size_migration: None, l2_lsn: None, + gc_compaction: None, }; let empty_layers_parsed = IndexPart::from_json_bytes(empty_layers_json.as_bytes()).unwrap(); @@ -642,6 +677,7 @@ mod tests { import_pgdata: None, rel_size_migration: None, l2_lsn: None, + gc_compaction: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -691,6 +727,7 @@ mod tests { import_pgdata: None, rel_size_migration: None, l2_lsn: None, + gc_compaction: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -745,6 +782,7 @@ mod tests { import_pgdata: None, rel_size_migration: None, l2_lsn: None, + gc_compaction: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -804,6 +842,7 @@ mod tests { import_pgdata: None, rel_size_migration: None, l2_lsn: None, + gc_compaction: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -864,6 +903,7 @@ mod tests { import_pgdata: None, rel_size_migration: None, l2_lsn: None, + gc_compaction: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -929,6 +969,7 @@ mod tests { import_pgdata: None, rel_size_migration: None, l2_lsn: None, + gc_compaction: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -1007,6 +1048,7 @@ mod tests { }))), rel_size_migration: None, l2_lsn: None, + gc_compaction: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -1086,6 +1128,7 @@ mod tests { }))), rel_size_migration: Some(RelSizeMigration::Legacy), l2_lsn: None, + gc_compaction: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -1093,7 +1136,7 @@ mod tests { } #[test] - fn v12_l2_lsn_is_parsed() { + fn v12_v13_l2_gc_ompaction_is_parsed() { let example = r#"{ "version": 12, "layer_metadata":{ @@ -1124,7 +1167,10 @@ mod tests { } }, "rel_size_migration": "legacy", - "l2_lsn": "0/16960E8" + "l2_lsn": "0/16960E8", + "gc_compaction": { + "last_completed_lsn": "0/16960E8" + } }"#; let expected = IndexPart { @@ -1166,6 +1212,9 @@ mod tests { }))), rel_size_migration: Some(RelSizeMigration::Legacy), l2_lsn: Some("0/16960E8".parse::().unwrap()), + gc_compaction: Some(GcCompactionState { + last_completed_lsn: "0/16960E8".parse::().unwrap(), + }), }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); diff --git a/pageserver/src/tenant/remote_timeline_client/manifest.rs b/pageserver/src/tenant/remote_timeline_client/manifest.rs index 2029847a12..543ccc219d 100644 --- a/pageserver/src/tenant/remote_timeline_client/manifest.rs +++ b/pageserver/src/tenant/remote_timeline_client/manifest.rs @@ -1,6 +1,7 @@ use chrono::NaiveDateTime; use serde::{Deserialize, Serialize}; -use utils::{id::TimelineId, lsn::Lsn}; +use utils::id::TimelineId; +use utils::lsn::Lsn; /// Tenant-shard scoped manifest #[derive(Clone, Serialize, Deserialize, PartialEq, Eq)] diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs index af4dbbbfb6..7d9f47665a 100644 --- a/pageserver/src/tenant/remote_timeline_client/upload.rs +++ b/pageserver/src/tenant/remote_timeline_client/upload.rs @@ -1,28 +1,28 @@ //! Helper functions to upload files to remote storage with a RemoteStorage -use anyhow::{bail, Context}; +use std::io::{ErrorKind, SeekFrom}; +use std::time::SystemTime; + +use anyhow::{Context, bail}; use bytes::Bytes; use camino::Utf8Path; use fail::fail_point; use pageserver_api::shard::TenantShardId; -use std::io::{ErrorKind, SeekFrom}; -use std::time::SystemTime; +use remote_storage::{GenericRemoteStorage, RemotePath, TimeTravelError}; use tokio::fs::{self, File}; use tokio::io::AsyncSeekExt; use tokio_util::sync::CancellationToken; +use tracing::info; +use utils::id::{TenantId, TimelineId}; use utils::{backoff, pausable_failpoint}; +use super::Generation; use super::index::IndexPart; use super::manifest::TenantManifest; -use super::Generation; use crate::tenant::remote_timeline_client::{ remote_index_path, remote_initdb_archive_path, remote_initdb_preserved_archive_path, remote_tenant_manifest_path, }; -use remote_storage::{GenericRemoteStorage, RemotePath, TimeTravelError}; -use utils::id::{TenantId, TimelineId}; - -use tracing::info; /// Serializes and uploads the given index part data to the remote storage. pub(crate) async fn upload_index_part( @@ -134,7 +134,9 @@ pub(super) async fn upload_timeline_layer<'a>( .len(); if metadata_size != fs_size { - bail!("File {local_path:?} has its current FS size {fs_size} diferent from initially determined {metadata_size}"); + bail!( + "File {local_path:?} has its current FS size {fs_size} diferent from initially determined {metadata_size}" + ); } let fs_size = usize::try_from(fs_size) diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs index 4bc208331b..8f8622c796 100644 --- a/pageserver/src/tenant/secondary.rs +++ b/pageserver/src/tenant/secondary.rs @@ -3,40 +3,31 @@ pub mod heatmap; mod heatmap_uploader; mod scheduler; -use std::{sync::Arc, time::SystemTime}; +use std::sync::Arc; +use std::time::SystemTime; -use crate::{ - context::RequestContext, - disk_usage_eviction_task::DiskUsageEvictionInfo, - metrics::SECONDARY_HEATMAP_TOTAL_SIZE, - task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}, -}; - -use self::{ - downloader::{downloader_task, SecondaryDetail}, - heatmap_uploader::heatmap_uploader_task, -}; - -use super::{ - config::{SecondaryLocationConfig, TenantConfOpt}, - mgr::TenantManager, - span::debug_assert_current_span_has_tenant_id, - storage_layer::LayerName, - GetTenantError, -}; - -use crate::metrics::SECONDARY_RESIDENT_PHYSICAL_SIZE; use metrics::UIntGauge; -use pageserver_api::{ - models, - shard::{ShardIdentity, TenantShardId}, -}; +use pageserver_api::models; +use pageserver_api::shard::{ShardIdentity, TenantShardId}; use remote_storage::GenericRemoteStorage; - use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; use tracing::instrument; -use utils::{completion::Barrier, id::TimelineId, sync::gate::Gate}; +use utils::completion::Barrier; +use utils::id::TimelineId; +use utils::sync::gate::Gate; + +use self::downloader::{SecondaryDetail, downloader_task}; +use self::heatmap_uploader::heatmap_uploader_task; +use super::GetTenantError; +use super::config::{SecondaryLocationConfig, TenantConfOpt}; +use super::mgr::TenantManager; +use super::span::debug_assert_current_span_has_tenant_id; +use super::storage_layer::LayerName; +use crate::context::RequestContext; +use crate::disk_usage_eviction_task::DiskUsageEvictionInfo; +use crate::metrics::{SECONDARY_HEATMAP_TOTAL_SIZE, SECONDARY_RESIDENT_PHYSICAL_SIZE}; +use crate::task_mgr::{self, BACKGROUND_RUNTIME, TaskKind}; enum DownloadCommand { Download(TenantShardId), diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 2e8c3946bd..a13b9323ac 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -1,47 +1,8 @@ -use std::{ - collections::{HashMap, HashSet}, - pin::Pin, - str::FromStr, - sync::Arc, - time::{Duration, Instant, SystemTime}, -}; - -use crate::{ - config::PageServerConf, - context::RequestContext, - disk_usage_eviction_task::{ - finite_f32, DiskUsageEvictionInfo, EvictionCandidate, EvictionLayer, EvictionSecondaryLayer, - }, - metrics::SECONDARY_MODE, - tenant::{ - config::SecondaryLocationConfig, - debug_assert_current_span_has_tenant_and_timeline_id, - ephemeral_file::is_ephemeral_file, - remote_timeline_client::{ - index::LayerFileMetadata, is_temp_download_file, FAILED_DOWNLOAD_WARN_THRESHOLD, - FAILED_REMOTE_OP_RETRIES, - }, - span::debug_assert_current_span_has_tenant_id, - storage_layer::{layer::local_layer_path, LayerName, LayerVisibilityHint}, - tasks::{warn_when_period_overrun, BackgroundLoopKind}, - }, - virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile}, - TEMP_FILE_SUFFIX, -}; - -use super::{ - heatmap::HeatMapLayer, - scheduler::{ - self, period_jitter, period_warmup, Completion, JobGenerator, SchedulingResult, - TenantBackgroundJobs, - }, - GetTenantError, SecondaryTenant, SecondaryTenantError, -}; - -use crate::tenant::{ - mgr::TenantManager, - remote_timeline_client::{download::download_layer_file, remote_heatmap_path}, -}; +use std::collections::{HashMap, HashSet}; +use std::pin::Pin; +use std::str::FromStr; +use std::sync::Arc; +use std::time::{Duration, Instant, SystemTime}; use camino::Utf8PathBuf; use chrono::format::{DelayedFormat, StrftimeItems}; @@ -50,18 +11,43 @@ use metrics::UIntGauge; use pageserver_api::models::SecondaryProgress; use pageserver_api::shard::TenantShardId; use remote_storage::{DownloadError, DownloadKind, DownloadOpts, Etag, GenericRemoteStorage}; - use tokio_util::sync::CancellationToken; -use tracing::{info_span, instrument, warn, Instrument}; -use utils::{ - backoff, completion::Barrier, crashsafe::path_with_suffix_extension, failpoint_support, fs_ext, - id::TimelineId, pausable_failpoint, serde_system_time, -}; +use tracing::{Instrument, info_span, instrument, warn}; +use utils::completion::Barrier; +use utils::crashsafe::path_with_suffix_extension; +use utils::id::TimelineId; +use utils::{backoff, failpoint_support, fs_ext, pausable_failpoint, serde_system_time}; -use super::{ - heatmap::{HeatMapTenant, HeatMapTimeline}, - CommandRequest, DownloadCommand, +use super::heatmap::{HeatMapLayer, HeatMapTenant, HeatMapTimeline}; +use super::scheduler::{ + self, Completion, JobGenerator, SchedulingResult, TenantBackgroundJobs, period_jitter, + period_warmup, }; +use super::{ + CommandRequest, DownloadCommand, GetTenantError, SecondaryTenant, SecondaryTenantError, +}; +use crate::TEMP_FILE_SUFFIX; +use crate::config::PageServerConf; +use crate::context::RequestContext; +use crate::disk_usage_eviction_task::{ + DiskUsageEvictionInfo, EvictionCandidate, EvictionLayer, EvictionSecondaryLayer, finite_f32, +}; +use crate::metrics::SECONDARY_MODE; +use crate::tenant::config::SecondaryLocationConfig; +use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id; +use crate::tenant::ephemeral_file::is_ephemeral_file; +use crate::tenant::mgr::TenantManager; +use crate::tenant::remote_timeline_client::download::download_layer_file; +use crate::tenant::remote_timeline_client::index::LayerFileMetadata; +use crate::tenant::remote_timeline_client::{ + FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, is_temp_download_file, + remote_heatmap_path, +}; +use crate::tenant::span::debug_assert_current_span_has_tenant_id; +use crate::tenant::storage_layer::layer::local_layer_path; +use crate::tenant::storage_layer::{LayerName, LayerVisibilityHint}; +use crate::tenant::tasks::{BackgroundLoopKind, warn_when_period_overrun}; +use crate::virtual_file::{MaybeFatalIo, VirtualFile, on_fatal_io_error}; /// For each tenant, default period for how long must have passed since the last download_tenant call before /// calling it again. This default is replaced with the value of [`HeatMapTenant::upload_period_ms`] after first diff --git a/pageserver/src/tenant/secondary/heatmap.rs b/pageserver/src/tenant/secondary/heatmap.rs index 0fa10ca294..4a938e9095 100644 --- a/pageserver/src/tenant/secondary/heatmap.rs +++ b/pageserver/src/tenant/secondary/heatmap.rs @@ -1,11 +1,13 @@ -use std::{collections::HashMap, time::SystemTime}; - -use crate::tenant::{remote_timeline_client::index::LayerFileMetadata, storage_layer::LayerName}; +use std::collections::HashMap; +use std::time::SystemTime; use serde::{Deserialize, Serialize}; -use serde_with::{serde_as, DisplayFromStr, TimestampSeconds}; +use serde_with::{DisplayFromStr, TimestampSeconds, serde_as}; +use utils::generation::Generation; +use utils::id::TimelineId; -use utils::{generation::Generation, id::TimelineId}; +use crate::tenant::remote_timeline_client::index::LayerFileMetadata; +use crate::tenant::storage_layer::LayerName; #[derive(Serialize, Deserialize)] pub(crate) struct HeatMapTenant { diff --git a/pageserver/src/tenant/secondary/heatmap_uploader.rs b/pageserver/src/tenant/secondary/heatmap_uploader.rs index d72c337369..3375714a66 100644 --- a/pageserver/src/tenant/secondary/heatmap_uploader.rs +++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs @@ -1,42 +1,33 @@ -use std::{ - collections::HashMap, - pin::Pin, - sync::{Arc, Weak}, - time::{Duration, Instant}, -}; - -use crate::{ - metrics::SECONDARY_MODE, - tenant::{ - config::AttachmentMode, - mgr::{GetTenantError, TenantManager}, - remote_timeline_client::remote_heatmap_path, - span::debug_assert_current_span_has_tenant_id, - tasks::{warn_when_period_overrun, BackgroundLoopKind}, - Tenant, - }, - virtual_file::VirtualFile, - TEMP_FILE_SUFFIX, -}; +use std::collections::HashMap; +use std::pin::Pin; +use std::sync::{Arc, Weak}; +use std::time::{Duration, Instant}; use futures::Future; use pageserver_api::shard::TenantShardId; use remote_storage::{GenericRemoteStorage, TimeoutOrCancel}; - -use super::{ - heatmap::HeatMapTenant, - scheduler::{ - self, period_jitter, period_warmup, JobGenerator, RunningJob, SchedulingResult, - TenantBackgroundJobs, - }, - CommandRequest, SecondaryTenantError, UploadCommand, -}; use tokio_util::sync::CancellationToken; -use tracing::{info_span, instrument, Instrument}; -use utils::{ - backoff, completion::Barrier, crashsafe::path_with_suffix_extension, - yielding_loop::yielding_loop, +use tracing::{Instrument, info_span, instrument}; +use utils::backoff; +use utils::completion::Barrier; +use utils::crashsafe::path_with_suffix_extension; +use utils::yielding_loop::yielding_loop; + +use super::heatmap::HeatMapTenant; +use super::scheduler::{ + self, JobGenerator, RunningJob, SchedulingResult, TenantBackgroundJobs, period_jitter, + period_warmup, }; +use super::{CommandRequest, SecondaryTenantError, UploadCommand}; +use crate::TEMP_FILE_SUFFIX; +use crate::metrics::SECONDARY_MODE; +use crate::tenant::Tenant; +use crate::tenant::config::AttachmentMode; +use crate::tenant::mgr::{GetTenantError, TenantManager}; +use crate::tenant::remote_timeline_client::remote_heatmap_path; +use crate::tenant::span::debug_assert_current_span_has_tenant_id; +use crate::tenant::tasks::{BackgroundLoopKind, warn_when_period_overrun}; +use crate::virtual_file::VirtualFile; pub(super) async fn heatmap_uploader_task( tenant_manager: Arc, diff --git a/pageserver/src/tenant/secondary/scheduler.rs b/pageserver/src/tenant/secondary/scheduler.rs index e963c722b9..f948f9114f 100644 --- a/pageserver/src/tenant/secondary/scheduler.rs +++ b/pageserver/src/tenant/secondary/scheduler.rs @@ -1,16 +1,15 @@ -use futures::Future; -use rand::Rng; -use std::{ - collections::HashMap, - marker::PhantomData, - pin::Pin, - time::{Duration, Instant}, -}; +use std::collections::HashMap; +use std::marker::PhantomData; +use std::pin::Pin; +use std::time::{Duration, Instant}; +use futures::Future; use pageserver_api::shard::TenantShardId; +use rand::Rng; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; -use utils::{completion::Barrier, yielding_loop::yielding_loop}; +use utils::completion::Barrier; +use utils::yielding_loop::yielding_loop; use super::{CommandRequest, CommandResponse, SecondaryTenantError}; diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index 1e84a9d9dc..ed6b351c75 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -4,21 +4,18 @@ use std::collections::{HashMap, HashSet}; use std::sync::Arc; use tenant_size_model::svg::SvgBranchKind; -use tokio::sync::oneshot::error::RecvError; +use tenant_size_model::{Segment, StorageModel}; use tokio::sync::Semaphore; +use tokio::sync::oneshot::error::RecvError; use tokio_util::sync::CancellationToken; - -use crate::context::RequestContext; -use crate::pgdatadir_mapping::CalculateLogicalSizeError; - -use super::{GcError, LogicalSizeCalculationCause, Tenant}; -use crate::tenant::{MaybeOffloaded, Timeline}; +use tracing::*; use utils::id::TimelineId; use utils::lsn::Lsn; -use tracing::*; - -use tenant_size_model::{Segment, StorageModel}; +use super::{GcError, LogicalSizeCalculationCause, Tenant}; +use crate::context::RequestContext; +use crate::pgdatadir_mapping::CalculateLogicalSizeError; +use crate::tenant::{MaybeOffloaded, Timeline}; /// Inputs to the actual tenant sizing model /// @@ -498,7 +495,9 @@ async fn fill_logical_sizes( } Err(join_error) => { // cannot really do anything, as this panic is likely a bug - error!("task that calls spawn_ondemand_logical_size_calculation panicked: {join_error:#}"); + error!( + "task that calls spawn_ondemand_logical_size_calculation panicked: {join_error:#}" + ); have_any_error = Some(CalculateSyntheticSizeError::Fatal( anyhow::anyhow!(join_error) diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index f9f843ef6b..7f313f46a2 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -10,42 +10,39 @@ mod layer_desc; mod layer_name; pub mod merge_iterator; -use crate::config::PageServerConf; -use crate::context::{AccessStatsBehavior, RequestContext}; -use bytes::Bytes; -use futures::stream::FuturesUnordered; -use futures::StreamExt; -use pageserver_api::key::Key; -use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum}; -use pageserver_api::record::NeonWalRecord; -use pageserver_api::value::Value; use std::cmp::Ordering; use std::collections::hash_map::Entry; use std::collections::{BinaryHeap, HashMap}; use std::future::Future; use std::ops::Range; use std::pin::Pin; -use std::sync::atomic::AtomicUsize; use std::sync::Arc; +use std::sync::atomic::AtomicUsize; use std::time::{Duration, SystemTime, UNIX_EPOCH}; -use tracing::{trace, Instrument}; -use utils::sync::gate::GateGuard; - -use utils::lsn::Lsn; pub use batch_split_writer::{BatchLayerWriter, SplitDeltaLayerWriter, SplitImageLayerWriter}; +use bytes::Bytes; pub use delta_layer::{DeltaLayer, DeltaLayerWriter, ValueRef}; +use futures::StreamExt; +use futures::stream::FuturesUnordered; pub use image_layer::{ImageLayer, ImageLayerWriter}; pub use inmemory_layer::InMemoryLayer; +pub(crate) use layer::{EvictionError, Layer, ResidentLayer}; pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey}; pub use layer_name::{DeltaLayerName, ImageLayerName, LayerName}; - -pub(crate) use layer::{EvictionError, Layer, ResidentLayer}; +use pageserver_api::key::Key; +use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum}; +use pageserver_api::record::NeonWalRecord; +use pageserver_api::value::Value; +use tracing::{Instrument, trace}; +use utils::lsn::Lsn; +use utils::sync::gate::GateGuard; use self::inmemory_layer::InMemoryLayerFileId; - -use super::timeline::{GetVectoredError, ReadPath}; use super::PageReconstructError; +use super::timeline::{GetVectoredError, ReadPath}; +use crate::config::PageServerConf; +use crate::context::{AccessStatsBehavior, RequestContext}; pub fn range_overlaps(a: &Range, b: &Range) -> bool where @@ -510,6 +507,7 @@ impl IoConcurrency { #[cfg(test)] pub(crate) fn spawn_for_test() -> impl std::ops::DerefMut { use std::ops::{Deref, DerefMut}; + use tracing::info; use utils::sync::gate::Gate; diff --git a/pageserver/src/tenant/storage_layer/batch_split_writer.rs b/pageserver/src/tenant/storage_layer/batch_split_writer.rs index 7da51c27df..fd50e4805d 100644 --- a/pageserver/src/tenant/storage_layer/batch_split_writer.rs +++ b/pageserver/src/tenant/storage_layer/batch_split_writer.rs @@ -1,17 +1,22 @@ -use std::{future::Future, ops::Range, sync::Arc}; +use std::future::Future; +use std::ops::Range; +use std::sync::Arc; use bytes::Bytes; -use pageserver_api::key::{Key, KEY_SIZE}; -use utils::{id::TimelineId, lsn::Lsn, shard::TenantShardId}; - -use crate::tenant::storage_layer::Layer; -use crate::{config::PageServerConf, context::RequestContext, tenant::Timeline}; +use pageserver_api::key::{KEY_SIZE, Key}; use pageserver_api::value::Value; +use utils::id::TimelineId; +use utils::lsn::Lsn; +use utils::shard::TenantShardId; use super::layer::S3_UPLOAD_LIMIT; use super::{ DeltaLayerWriter, ImageLayerWriter, PersistentLayerDesc, PersistentLayerKey, ResidentLayer, }; +use crate::config::PageServerConf; +use crate::context::RequestContext; +use crate::tenant::Timeline; +use crate::tenant::storage_layer::Layer; pub(crate) enum BatchWriterResult { Produced(ResidentLayer), @@ -423,15 +428,10 @@ mod tests { use itertools::Itertools; use rand::{RngCore, SeedableRng}; - use crate::{ - tenant::{ - harness::{TenantHarness, TIMELINE_ID}, - storage_layer::AsLayerDesc, - }, - DEFAULT_PG_VERSION, - }; - use super::*; + use crate::DEFAULT_PG_VERSION; + use crate::tenant::harness::{TIMELINE_ID, TenantHarness}; + use crate::tenant::storage_layer::AsLayerDesc; fn get_key(id: u32) -> Key { let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap(); diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 7ba0e3679f..83ac6aab51 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -27,6 +27,38 @@ //! "values" part. The actual page images and WAL records are stored in the //! "values" part. //! +use std::collections::{HashMap, VecDeque}; +use std::fs::File; +use std::io::SeekFrom; +use std::ops::Range; +use std::os::unix::fs::FileExt; +use std::str::FromStr; +use std::sync::Arc; + +use anyhow::{Context, Result, bail, ensure}; +use camino::{Utf8Path, Utf8PathBuf}; +use futures::StreamExt; +use itertools::Itertools; +use pageserver_api::config::MaxVectoredReadBytes; +use pageserver_api::key::{DBDIR_KEY, KEY_SIZE, Key}; +use pageserver_api::keyspace::KeySpace; +use pageserver_api::models::ImageCompressionAlgorithm; +use pageserver_api::shard::TenantShardId; +use pageserver_api::value::Value; +use rand::Rng; +use rand::distributions::Alphanumeric; +use serde::{Deserialize, Serialize}; +use tokio::sync::OnceCell; +use tokio_epoll_uring::IoBuf; +use tracing::*; +use utils::bin_ser::BeSer; +use utils::id::{TenantId, TimelineId}; +use utils::lsn::Lsn; + +use super::{ + AsLayerDesc, LayerName, OnDiskValue, OnDiskValueIo, PersistentLayerDesc, ResidentLayer, + ValuesReconstructState, +}; use crate::config::PageServerConf; use crate::context::{PageContentKind, RequestContext, RequestContextBuilder}; use crate::page_cache::{self, FileId, PAGE_SZ}; @@ -42,43 +74,8 @@ use crate::tenant::vectored_blob_io::{ VectoredReadPlanner, }; use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt}; -use crate::virtual_file::IoBufferMut; -use crate::virtual_file::{self, MaybeFatalIo, VirtualFile}; -use crate::TEMP_FILE_SUFFIX; -use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION}; -use anyhow::{bail, ensure, Context, Result}; -use camino::{Utf8Path, Utf8PathBuf}; -use futures::StreamExt; -use itertools::Itertools; -use pageserver_api::config::MaxVectoredReadBytes; -use pageserver_api::key::{Key, DBDIR_KEY, KEY_SIZE}; -use pageserver_api::keyspace::KeySpace; -use pageserver_api::models::ImageCompressionAlgorithm; -use pageserver_api::shard::TenantShardId; -use pageserver_api::value::Value; -use rand::{distributions::Alphanumeric, Rng}; -use serde::{Deserialize, Serialize}; -use std::collections::{HashMap, VecDeque}; -use std::fs::File; -use std::io::SeekFrom; -use std::ops::Range; -use std::os::unix::fs::FileExt; -use std::str::FromStr; -use std::sync::Arc; -use tokio::sync::OnceCell; -use tokio_epoll_uring::IoBuf; -use tracing::*; - -use utils::{ - bin_ser::BeSer, - id::{TenantId, TimelineId}, - lsn::Lsn, -}; - -use super::{ - AsLayerDesc, LayerName, OnDiskValue, OnDiskValueIo, PersistentLayerDesc, ResidentLayer, - ValuesReconstructState, -}; +use crate::virtual_file::{self, IoBufferMut, MaybeFatalIo, VirtualFile}; +use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX}; /// /// Header stored in the beginning of the file @@ -1130,10 +1127,11 @@ impl DeltaLayerInner { until: Lsn, ctx: &RequestContext, ) -> anyhow::Result { + use futures::stream::TryStreamExt; + use crate::tenant::vectored_blob_io::{ BlobMeta, ChunkedVectoredReadBuilder, VectoredReadExtended, }; - use futures::stream::TryStreamExt; #[derive(Debug)] enum Item { @@ -1599,23 +1597,21 @@ impl DeltaLayerIterator<'_> { pub(crate) mod test { use std::collections::BTreeMap; + use bytes::Bytes; use itertools::MinMaxResult; - use rand::prelude::{SeedableRng, SliceRandom, StdRng}; + use pageserver_api::value::Value; use rand::RngCore; + use rand::prelude::{SeedableRng, SliceRandom, StdRng}; use super::*; - use crate::tenant::harness::TIMELINE_ID; + use crate::DEFAULT_PG_VERSION; + use crate::context::DownloadBehavior; + use crate::task_mgr::TaskKind; + use crate::tenant::disk_btree::tests::TestDisk; + use crate::tenant::harness::{TIMELINE_ID, TenantHarness}; use crate::tenant::storage_layer::{Layer, ResidentLayer}; use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner; use crate::tenant::{Tenant, Timeline}; - use crate::{ - context::DownloadBehavior, - task_mgr::TaskKind, - tenant::{disk_btree::tests::TestDisk, harness::TenantHarness}, - DEFAULT_PG_VERSION, - }; - use bytes::Bytes; - use pageserver_api::value::Value; /// Construct an index for a fictional delta layer and and then /// traverse in order to plan vectored reads for a query. Finally, @@ -2087,7 +2083,7 @@ pub(crate) mod test { .await .unwrap(); - let new_layer = new_layer.download_and_keep_resident().await.unwrap(); + let new_layer = new_layer.download_and_keep_resident(ctx).await.unwrap(); new_layer .copy_delta_prefix(&mut writer, truncate_at, ctx) diff --git a/pageserver/src/tenant/storage_layer/filter_iterator.rs b/pageserver/src/tenant/storage_layer/filter_iterator.rs index 8660be1fcc..8d172a1c19 100644 --- a/pageserver/src/tenant/storage_layer/filter_iterator.rs +++ b/pageserver/src/tenant/storage_layer/filter_iterator.rs @@ -1,18 +1,14 @@ -use std::{ops::Range, sync::Arc}; +use std::ops::Range; +use std::sync::Arc; use anyhow::bail; -use pageserver_api::{ - key::Key, - keyspace::{KeySpace, SparseKeySpace}, -}; +use pageserver_api::key::Key; +use pageserver_api::keyspace::{KeySpace, SparseKeySpace}; +use pageserver_api::value::Value; use utils::lsn::Lsn; -use pageserver_api::value::Value; - -use super::{ - merge_iterator::{MergeIterator, MergeIteratorItem}, - PersistentLayerKey, -}; +use super::PersistentLayerKey; +use super::merge_iterator::{MergeIterator, MergeIteratorItem}; /// A filter iterator over merge iterators (and can be easily extended to other types of iterators). /// @@ -98,19 +94,14 @@ impl<'a> FilterIterator<'a> { #[cfg(test)] mod tests { - use super::*; - use itertools::Itertools; use pageserver_api::key::Key; use utils::lsn::Lsn; - use crate::{ - tenant::{ - harness::{TenantHarness, TIMELINE_ID}, - storage_layer::delta_layer::test::produce_delta_layer, - }, - DEFAULT_PG_VERSION, - }; + use super::*; + use crate::DEFAULT_PG_VERSION; + use crate::tenant::harness::{TIMELINE_ID, TenantHarness}; + use crate::tenant::storage_layer::delta_layer::test::produce_delta_layer; async fn assert_filter_iter_equal( filter_iter: &mut FilterIterator<'_>, diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index dc611bd6e1..0db9e8c845 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -25,6 +25,39 @@ //! layer, and offsets to the other parts. The "index" is a B-tree, //! mapping from Key to an offset in the "values" part. The //! actual page images are stored in the "values" part. +use std::collections::{HashMap, VecDeque}; +use std::fs::File; +use std::io::SeekFrom; +use std::ops::Range; +use std::os::unix::prelude::FileExt; +use std::str::FromStr; +use std::sync::Arc; + +use anyhow::{Context, Result, bail, ensure}; +use bytes::Bytes; +use camino::{Utf8Path, Utf8PathBuf}; +use hex; +use itertools::Itertools; +use pageserver_api::config::MaxVectoredReadBytes; +use pageserver_api::key::{DBDIR_KEY, KEY_SIZE, Key}; +use pageserver_api::keyspace::KeySpace; +use pageserver_api::shard::{ShardIdentity, TenantShardId}; +use pageserver_api::value::Value; +use rand::Rng; +use rand::distributions::Alphanumeric; +use serde::{Deserialize, Serialize}; +use tokio::sync::OnceCell; +use tokio_stream::StreamExt; +use tracing::*; +use utils::bin_ser::BeSer; +use utils::id::{TenantId, TimelineId}; +use utils::lsn::Lsn; + +use super::layer_name::ImageLayerName; +use super::{ + AsLayerDesc, LayerName, OnDiskValue, OnDiskValueIo, PersistentLayerDesc, ResidentLayer, + ValuesReconstructState, +}; use crate::config::PageServerConf; use crate::context::{PageContentKind, RequestContext, RequestContextBuilder}; use crate::page_cache::{self, FileId, PAGE_SZ}; @@ -39,43 +72,8 @@ use crate::tenant::vectored_blob_io::{ VectoredReadPlanner, }; use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt; -use crate::virtual_file::IoBufferMut; -use crate::virtual_file::{self, MaybeFatalIo, VirtualFile}; +use crate::virtual_file::{self, IoBufferMut, MaybeFatalIo, VirtualFile}; use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX}; -use anyhow::{bail, ensure, Context, Result}; -use bytes::Bytes; -use camino::{Utf8Path, Utf8PathBuf}; -use hex; -use itertools::Itertools; -use pageserver_api::config::MaxVectoredReadBytes; -use pageserver_api::key::{Key, DBDIR_KEY, KEY_SIZE}; -use pageserver_api::keyspace::KeySpace; -use pageserver_api::shard::{ShardIdentity, TenantShardId}; -use pageserver_api::value::Value; -use rand::{distributions::Alphanumeric, Rng}; -use serde::{Deserialize, Serialize}; -use std::collections::{HashMap, VecDeque}; -use std::fs::File; -use std::io::SeekFrom; -use std::ops::Range; -use std::os::unix::prelude::FileExt; -use std::str::FromStr; -use std::sync::Arc; -use tokio::sync::OnceCell; -use tokio_stream::StreamExt; -use tracing::*; - -use utils::{ - bin_ser::BeSer, - id::{TenantId, TimelineId}, - lsn::Lsn, -}; - -use super::layer_name::ImageLayerName; -use super::{ - AsLayerDesc, LayerName, OnDiskValue, OnDiskValueIo, PersistentLayerDesc, ResidentLayer, - ValuesReconstructState, -}; /// /// Header stored in the beginning of the file @@ -1135,34 +1133,26 @@ impl ImageLayerIterator<'_> { #[cfg(test)] mod test { - use std::{sync::Arc, time::Duration}; + use std::sync::Arc; + use std::time::Duration; use bytes::Bytes; use itertools::Itertools; - use pageserver_api::{ - key::Key, - shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize}, - value::Value, - }; - use utils::{ - generation::Generation, - id::{TenantId, TimelineId}, - lsn::Lsn, - }; - - use crate::{ - context::RequestContext, - tenant::{ - config::TenantConf, - harness::{TenantHarness, TIMELINE_ID}, - storage_layer::{Layer, ResidentLayer}, - vectored_blob_io::StreamingVectoredReadPlanner, - Tenant, Timeline, - }, - DEFAULT_PG_VERSION, - }; + use pageserver_api::key::Key; + use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize}; + use pageserver_api::value::Value; + use utils::generation::Generation; + use utils::id::{TenantId, TimelineId}; + use utils::lsn::Lsn; use super::{ImageLayerIterator, ImageLayerWriter}; + use crate::DEFAULT_PG_VERSION; + use crate::context::RequestContext; + use crate::tenant::config::TenantConf; + use crate::tenant::harness::{TIMELINE_ID, TenantHarness}; + use crate::tenant::storage_layer::{Layer, ResidentLayer}; + use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner; + use crate::tenant::{Tenant, Timeline}; #[tokio::test] async fn image_layer_rewrite() { @@ -1172,10 +1162,10 @@ mod test { ..TenantConf::default() }; let tenant_id = TenantId::generate(); - let mut gen = Generation::new(0xdead0001); + let mut gen_ = Generation::new(0xdead0001); let mut get_next_gen = || { - let ret = gen; - gen = gen.next(); + let ret = gen_; + gen_ = gen_.next(); ret }; // The LSN at which we will create an image layer to filter diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 61a0fdea8c..ffdfe1dc27 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -4,38 +4,39 @@ //! held in an ephemeral file, not in memory. The metadata for each page version, i.e. //! its position in the file, is kept in memory, though. //! -use crate::assert_u64_eq_usize::{u64_to_usize, U64IsUsize, UsizeIsU64}; +use std::cmp::Ordering; +use std::collections::{BTreeMap, HashMap}; +use std::fmt::Write; +use std::ops::Range; +use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering as AtomicOrdering}; +use std::sync::{Arc, OnceLock}; +use std::time::Instant; + +use anyhow::Result; +use camino::Utf8PathBuf; +use pageserver_api::key::{CompactKey, Key}; +use pageserver_api::keyspace::KeySpace; +use pageserver_api::models::InMemoryLayerInfo; +use pageserver_api::shard::TenantShardId; +use tokio::sync::RwLock; +use tracing::*; +use utils::id::TimelineId; +use utils::lsn::Lsn; +use utils::vec_map::VecMap; +use wal_decoder::serialized_batch::{SerializedValueBatch, SerializedValueMeta, ValueMeta}; + +use super::{DeltaLayerWriter, PersistentLayerDesc, ValuesReconstructState}; +use crate::assert_u64_eq_usize::{U64IsUsize, UsizeIsU64, u64_to_usize}; use crate::config::PageServerConf; use crate::context::{PageContentKind, RequestContext, RequestContextBuilder}; +// avoid binding to Write (conflicts with std::io::Write) +// while being able to use std::fmt::Write's methods +use crate::metrics::TIMELINE_EPHEMERAL_BYTES; use crate::tenant::ephemeral_file::EphemeralFile; use crate::tenant::storage_layer::{OnDiskValue, OnDiskValueIo}; use crate::tenant::timeline::GetVectoredError; use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt; use crate::{l0_flush, page_cache}; -use anyhow::Result; -use camino::Utf8PathBuf; -use pageserver_api::key::CompactKey; -use pageserver_api::key::Key; -use pageserver_api::keyspace::KeySpace; -use pageserver_api::models::InMemoryLayerInfo; -use pageserver_api::shard::TenantShardId; -use std::collections::{BTreeMap, HashMap}; -use std::sync::{Arc, OnceLock}; -use std::time::Instant; -use tracing::*; -use utils::{id::TimelineId, lsn::Lsn, vec_map::VecMap}; -use wal_decoder::serialized_batch::{SerializedValueBatch, SerializedValueMeta, ValueMeta}; -// avoid binding to Write (conflicts with std::io::Write) -// while being able to use std::fmt::Write's methods -use crate::metrics::TIMELINE_EPHEMERAL_BYTES; -use std::cmp::Ordering; -use std::fmt::Write; -use std::ops::Range; -use std::sync::atomic::Ordering as AtomicOrdering; -use std::sync::atomic::{AtomicU64, AtomicUsize}; -use tokio::sync::RwLock; - -use super::{DeltaLayerWriter, PersistentLayerDesc, ValuesReconstructState}; pub(crate) mod vectored_dio_read; @@ -555,7 +556,9 @@ impl InMemoryLayer { gate: &utils::sync::gate::Gate, ctx: &RequestContext, ) -> Result { - trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}"); + trace!( + "initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}" + ); let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id, gate, ctx).await?; let key = InMemoryLayerFileId(file.page_cache_file_id()); @@ -816,8 +819,7 @@ mod tests { #[test] fn test_index_entry() { const MAX_SUPPORTED_POS: usize = IndexEntry::MAX_SUPPORTED_POS; - use IndexEntryNewArgs as Args; - use IndexEntryUnpacked as Unpacked; + use {IndexEntryNewArgs as Args, IndexEntryUnpacked as Unpacked}; let roundtrip = |args, expect: Unpacked| { let res = IndexEntry::new(args).expect("this tests expects no errors"); diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs b/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs index 1d86015fab..90455fd0ca 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs @@ -1,16 +1,13 @@ -use std::{ - collections::BTreeMap, - sync::{Arc, RwLock}, -}; +use std::collections::BTreeMap; +use std::sync::{Arc, RwLock}; use itertools::Itertools; use tokio_epoll_uring::{BoundedBuf, IoBufMut, Slice}; -use crate::{ - assert_u64_eq_usize::{U64IsUsize, UsizeIsU64}, - context::RequestContext, - virtual_file::{owned_buffers_io::io_buf_aligned::IoBufAlignedMut, IoBufferMut}, -}; +use crate::assert_u64_eq_usize::{U64IsUsize, UsizeIsU64}; +use crate::context::RequestContext; +use crate::virtual_file::IoBufferMut; +use crate::virtual_file::owned_buffers_io::io_buf_aligned::IoBufAlignedMut; /// The file interface we require. At runtime, this is a [`crate::tenant::ephemeral_file::EphemeralFile`]. pub trait File: Send { @@ -132,7 +129,9 @@ where let req_len = match cur { LogicalReadState::NotStarted(buf) => { if buf.len() != 0 { - panic!("The `LogicalRead`s that are passed in must be freshly created using `LogicalRead::new`"); + panic!( + "The `LogicalRead`s that are passed in must be freshly created using `LogicalRead::new`" + ); } // buf.cap() == 0 is ok @@ -141,7 +140,9 @@ where *state = LogicalReadState::Ongoing(buf); req_len } - x => panic!("must only call with fresh LogicalReads, got another state, leaving Undefined state behind state={x:?}"), + x => panic!( + "must only call with fresh LogicalReads, got another state, leaving Undefined state behind state={x:?}" + ), }; // plan which chunks we need to read from @@ -422,15 +423,15 @@ impl Buffer for Vec { #[cfg(test)] #[allow(clippy::assertions_on_constants)] mod tests { + use std::cell::RefCell; + use std::collections::VecDeque; + use rand::Rng; - use crate::{ - context::DownloadBehavior, task_mgr::TaskKind, - virtual_file::owned_buffers_io::slice::SliceMutExt, - }; - use super::*; - use std::{cell::RefCell, collections::VecDeque}; + use crate::context::DownloadBehavior; + use crate::task_mgr::TaskKind; + use crate::virtual_file::owned_buffers_io::slice::SliceMutExt; struct InMemoryFile { content: Vec, diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 0bf606cf0a..bde7fbc1f9 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -1,32 +1,32 @@ +use std::ops::Range; +use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; +use std::sync::{Arc, Weak}; +use std::time::{Duration, SystemTime}; + use anyhow::Context; use camino::{Utf8Path, Utf8PathBuf}; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::HistoricLayerInfo; use pageserver_api::shard::{ShardIdentity, ShardIndex, TenantShardId}; -use std::ops::Range; -use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; -use std::sync::{Arc, Weak}; -use std::time::{Duration, SystemTime}; use tracing::Instrument; +use utils::generation::Generation; use utils::id::TimelineId; use utils::lsn::Lsn; use utils::sync::{gate, heavier_once_cell}; -use crate::config::PageServerConf; -use crate::context::{DownloadBehavior, RequestContext, RequestContextBuilder}; -use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; -use crate::task_mgr::TaskKind; -use crate::tenant::timeline::{CompactionError, GetVectoredError}; -use crate::tenant::{remote_timeline_client::LayerFileMetadata, Timeline}; - use super::delta_layer::{self}; use super::image_layer::{self}; use super::{ AsLayerDesc, ImageLayerWriter, LayerAccessStats, LayerAccessStatsReset, LayerName, LayerVisibilityHint, PersistentLayerDesc, ValuesReconstructState, }; - -use utils::generation::Generation; +use crate::config::PageServerConf; +use crate::context::{DownloadBehavior, RequestContext, RequestContextBuilder}; +use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; +use crate::task_mgr::TaskKind; +use crate::tenant::Timeline; +use crate::tenant::remote_timeline_client::LayerFileMetadata; +use crate::tenant::timeline::{CompactionError, GetVectoredError}; #[cfg(test)] mod tests; @@ -324,16 +324,16 @@ impl Layer { reconstruct_data: &mut ValuesReconstructState, ctx: &RequestContext, ) -> Result<(), GetVectoredError> { - let downloaded = self - .0 - .get_or_maybe_download(true, Some(ctx)) - .await - .map_err(|err| match err { - DownloadError::TimelineShutdown | DownloadError::DownloadCancelled => { - GetVectoredError::Cancelled - } - other => GetVectoredError::Other(anyhow::anyhow!(other)), - })?; + let downloaded = + self.0 + .get_or_maybe_download(true, ctx) + .await + .map_err(|err| match err { + DownloadError::TimelineShutdown | DownloadError::DownloadCancelled => { + GetVectoredError::Cancelled + } + other => GetVectoredError::Other(anyhow::anyhow!(other)), + })?; let this = ResidentLayer { downloaded: downloaded.clone(), owner: self.clone(), @@ -356,8 +356,8 @@ impl Layer { /// Download the layer if evicted. /// /// Will not error when the layer is already downloaded. - pub(crate) async fn download(&self) -> Result<(), DownloadError> { - self.0.get_or_maybe_download(true, None).await?; + pub(crate) async fn download(&self, ctx: &RequestContext) -> Result<(), DownloadError> { + self.0.get_or_maybe_download(true, ctx).await?; Ok(()) } @@ -392,8 +392,11 @@ impl Layer { } /// Downloads if necessary and creates a guard, which will keep this layer from being evicted. - pub(crate) async fn download_and_keep_resident(&self) -> Result { - let downloaded = self.0.get_or_maybe_download(true, None).await?; + pub(crate) async fn download_and_keep_resident( + &self, + ctx: &RequestContext, + ) -> Result { + let downloaded = self.0.get_or_maybe_download(true, ctx).await?; Ok(ResidentLayer { downloaded, @@ -446,7 +449,7 @@ impl Layer { if verbose { // for now, unconditionally download everything, even if that might not be wanted. - let l = self.0.get_or_maybe_download(true, Some(ctx)).await?; + let l = self.0.get_or_maybe_download(true, ctx).await?; l.dump(&self.0, ctx).await? } @@ -945,7 +948,7 @@ impl LayerInner { async fn get_or_maybe_download( self: &Arc, allow_download: bool, - ctx: Option<&RequestContext>, + ctx: &RequestContext, ) -> Result, DownloadError> { let (weak, permit) = { // get_or_init_detached can: @@ -1035,21 +1038,14 @@ impl LayerInner { return Err(DownloadError::NotFile(ft)); } - if let Some(ctx) = ctx { - self.check_expected_download(ctx)?; - } + self.check_expected_download(ctx)?; if !allow_download { // this is only used from tests, but it is hard to test without the boolean return Err(DownloadError::DownloadRequired); } - let download_ctx = ctx - .map(|ctx| ctx.detached_child(TaskKind::LayerDownload, DownloadBehavior::Download)) - .unwrap_or(RequestContext::new( - TaskKind::LayerDownload, - DownloadBehavior::Download, - )); + let download_ctx = ctx.detached_child(TaskKind::LayerDownload, DownloadBehavior::Download); async move { tracing::info!(%reason, "downloading on-demand"); @@ -1873,8 +1869,8 @@ impl ResidentLayer { self.owner.record_access(ctx); let res = match inner { - Delta(ref d) => delta_layer::DeltaLayerInner::load_keys(d, ctx).await, - Image(ref i) => image_layer::ImageLayerInner::load_keys(i, ctx).await, + Delta(d) => delta_layer::DeltaLayerInner::load_keys(d, ctx).await, + Image(i) => image_layer::ImageLayerInner::load_keys(i, ctx).await, }; res.with_context(|| format!("Layer index is corrupted for {self}")) } @@ -1920,7 +1916,7 @@ impl ResidentLayer { let owner = &self.owner.0; match self.downloaded.get(owner, ctx).await? { - Delta(ref d) => d + Delta(d) => d .copy_prefix(writer, until, ctx) .await .with_context(|| format!("copy_delta_prefix until {until} of {self}")), @@ -1943,7 +1939,7 @@ impl ResidentLayer { ) -> anyhow::Result<&delta_layer::DeltaLayerInner> { use LayerKind::*; match self.downloaded.get(&self.owner.0, ctx).await? { - Delta(ref d) => Ok(d), + Delta(d) => Ok(d), Image(_) => Err(anyhow::anyhow!("image layer")), } } @@ -1955,7 +1951,7 @@ impl ResidentLayer { ) -> anyhow::Result<&image_layer::ImageLayerInner> { use LayerKind::*; match self.downloaded.get(&self.owner.0, ctx).await? { - Image(ref d) => Ok(d), + Image(d) => Ok(d), Delta(_) => Err(anyhow::anyhow!("delta layer")), } } diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs index d93c378ffc..d43dfefdbc 100644 --- a/pageserver/src/tenant/storage_layer/layer/tests.rs +++ b/pageserver/src/tenant/storage_layer/layer/tests.rs @@ -1,22 +1,16 @@ use std::time::UNIX_EPOCH; -use pageserver_api::key::{Key, CONTROLFILE_KEY}; +use pageserver_api::key::{CONTROLFILE_KEY, Key}; use tokio::task::JoinSet; -use utils::{ - completion::{self, Completion}, - id::TimelineId, -}; +use utils::completion::{self, Completion}; +use utils::id::TimelineId; use super::failpoints::{Failpoint, FailpointKind}; use super::*; -use crate::{ - context::DownloadBehavior, - tenant::{ - harness::test_img, - storage_layer::{IoConcurrency, LayerVisibilityHint}, - }, -}; -use crate::{task_mgr::TaskKind, tenant::harness::TenantHarness}; +use crate::context::DownloadBehavior; +use crate::task_mgr::TaskKind; +use crate::tenant::harness::{TenantHarness, test_img}; +use crate::tenant::storage_layer::{IoConcurrency, LayerVisibilityHint}; /// Used in tests to advance a future to wanted await point, and not futher. const ADVANCE: std::time::Duration = std::time::Duration::from_secs(3600); @@ -184,7 +178,7 @@ async fn smoke_test() { // plain downloading is rarely needed layer - .download_and_keep_resident() + .download_and_keep_resident(&ctx) .instrument(download_span) .await .unwrap(); @@ -385,7 +379,7 @@ fn read_wins_pending_eviction() { // because no actual eviction happened, we get to just reinitialize the DownloadedLayer layer .0 - .get_or_maybe_download(false, None) + .get_or_maybe_download(false, &ctx) .instrument(download_span) .await .expect("should had reinitialized without downloading"); @@ -520,7 +514,7 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) { // because no actual eviction happened, we get to just reinitialize the DownloadedLayer layer .0 - .get_or_maybe_download(false, None) + .get_or_maybe_download(false, &ctx) .instrument(download_span) .await .expect("should had reinitialized without downloading"); @@ -648,6 +642,11 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() { .await .unwrap(); + // This test does downloads + let ctx = RequestContextBuilder::extend(&ctx) + .download_behavior(DownloadBehavior::Download) + .build(); + let layer = { let mut layers = { let layers = timeline.layers.read().await; @@ -680,7 +679,7 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() { // simulate a cancelled read which is cancelled before it gets to re-initialize let e = layer .0 - .get_or_maybe_download(false, None) + .get_or_maybe_download(false, &ctx) .await .unwrap_err(); assert!( @@ -704,7 +703,7 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() { // failpoint is still enabled, but it is not hit let e = layer .0 - .get_or_maybe_download(false, None) + .get_or_maybe_download(false, &ctx) .await .unwrap_err(); assert!(matches!(e, DownloadError::DownloadRequired), "{e:?}"); @@ -728,6 +727,11 @@ async fn evict_and_wait_does_not_wait_for_download() { .await .unwrap(); + // This test does downloads + let ctx = RequestContextBuilder::extend(&ctx) + .download_behavior(DownloadBehavior::Download) + .build(); + let layer = { let mut layers = { let layers = timeline.layers.read().await; @@ -771,10 +775,12 @@ async fn evict_and_wait_does_not_wait_for_download() { let (arrival, _download_arrived) = utils::completion::channel(); layer.enable_failpoint(Failpoint::WaitBeforeDownloading(Some(arrival), barrier)); - let mut download = std::pin::pin!(layer - .0 - .get_or_maybe_download(true, None) - .instrument(download_span)); + let mut download = std::pin::pin!( + layer + .0 + .get_or_maybe_download(true, &ctx) + .instrument(download_span) + ); assert!( !layer.is_likely_resident(), diff --git a/pageserver/src/tenant/storage_layer/layer_desc.rs b/pageserver/src/tenant/storage_layer/layer_desc.rs index 2097e90764..ed16dcaa0d 100644 --- a/pageserver/src/tenant/storage_layer/layer_desc.rs +++ b/pageserver/src/tenant/storage_layer/layer_desc.rs @@ -1,16 +1,15 @@ use core::fmt::Display; -use pageserver_api::shard::TenantShardId; use std::ops::Range; -use utils::{id::TimelineId, lsn::Lsn}; use pageserver_api::key::Key; - -use super::{DeltaLayerName, ImageLayerName, LayerName}; - +use pageserver_api::shard::TenantShardId; use serde::{Deserialize, Serialize}; - #[cfg(test)] use utils::id::TenantId; +use utils::id::TimelineId; +use utils::lsn::Lsn; + +use super::{DeltaLayerName, ImageLayerName, LayerName}; /// A unique identifier of a persistent layer. /// diff --git a/pageserver/src/tenant/storage_layer/layer_name.rs b/pageserver/src/tenant/storage_layer/layer_name.rs index addf3b85d9..0f7995f87b 100644 --- a/pageserver/src/tenant/storage_layer/layer_name.rs +++ b/pageserver/src/tenant/storage_layer/layer_name.rs @@ -1,12 +1,12 @@ //! //! Helper functions for dealing with filenames of the image and delta layer files. //! -use pageserver_api::key::Key; use std::cmp::Ordering; use std::fmt; use std::ops::Range; use std::str::FromStr; +use pageserver_api::key::Key; use utils::lsn::Lsn; use super::PersistentLayerDesc; @@ -305,7 +305,7 @@ impl FromStr for LayerName { (None, None) => { return Err(format!( "neither delta nor image layer file name: {value:?}" - )) + )); } (Some(delta), None) => Self::Delta(delta), (None, Some(image)) => Self::Image(image), diff --git a/pageserver/src/tenant/storage_layer/merge_iterator.rs b/pageserver/src/tenant/storage_layer/merge_iterator.rs index 19cfcb0867..76cdddd06a 100644 --- a/pageserver/src/tenant/storage_layer/merge_iterator.rs +++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs @@ -1,21 +1,16 @@ -use std::{ - cmp::Ordering, - collections::{binary_heap, BinaryHeap}, - sync::Arc, -}; +use std::cmp::Ordering; +use std::collections::{BinaryHeap, binary_heap}; +use std::sync::Arc; use anyhow::bail; use pageserver_api::key::Key; +use pageserver_api::value::Value; use utils::lsn::Lsn; +use super::delta_layer::{DeltaLayerInner, DeltaLayerIterator}; +use super::image_layer::{ImageLayerInner, ImageLayerIterator}; +use super::{PersistentLayerDesc, PersistentLayerKey}; use crate::context::RequestContext; -use pageserver_api::value::Value; - -use super::{ - delta_layer::{DeltaLayerInner, DeltaLayerIterator}, - image_layer::{ImageLayerInner, ImageLayerIterator}, - PersistentLayerDesc, PersistentLayerKey, -}; #[derive(Clone, Copy)] pub(crate) enum LayerRef<'a> { @@ -349,24 +344,18 @@ impl<'a> MergeIterator<'a> { #[cfg(test)] mod tests { - use super::*; - use itertools::Itertools; use pageserver_api::key::Key; - use utils::lsn::Lsn; - - use crate::{ - tenant::{ - harness::{TenantHarness, TIMELINE_ID}, - storage_layer::delta_layer::test::{produce_delta_layer, sort_delta}, - }, - DEFAULT_PG_VERSION, - }; - - #[cfg(feature = "testing")] - use crate::tenant::storage_layer::delta_layer::test::sort_delta_value; #[cfg(feature = "testing")] use pageserver_api::record::NeonWalRecord; + use utils::lsn::Lsn; + + use super::*; + use crate::DEFAULT_PG_VERSION; + use crate::tenant::harness::{TIMELINE_ID, TenantHarness}; + #[cfg(feature = "testing")] + use crate::tenant::storage_layer::delta_layer::test::sort_delta_value; + use crate::tenant::storage_layer::delta_layer::test::{produce_delta_layer, sort_delta}; async fn assert_merge_iter_equal( merge_iter: &mut MergeIterator<'_>, diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 5e63f59fd8..c90f81889b 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -8,24 +8,24 @@ use std::sync::Arc; use std::time::{Duration, Instant}; use once_cell::sync::Lazy; +use pageserver_api::config::tenant_conf_defaults::DEFAULT_COMPACTION_PERIOD; use rand::Rng; use scopeguard::defer; use tokio::sync::{Semaphore, SemaphorePermit}; use tokio_util::sync::CancellationToken; use tracing::*; - -use crate::context::{DownloadBehavior, RequestContext}; -use crate::metrics::{self, BackgroundLoopSemaphoreMetricsRecorder, TENANT_TASK_EVENTS}; -use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME, TOKIO_WORKER_THREADS}; -use crate::tenant::throttle::Stats; -use crate::tenant::timeline::compaction::CompactionOutcome; -use crate::tenant::timeline::CompactionError; -use crate::tenant::{Tenant, TenantState}; -use pageserver_api::config::tenant_conf_defaults::DEFAULT_COMPACTION_PERIOD; use utils::backoff::exponential_backoff_duration; use utils::completion::Barrier; use utils::pausable_failpoint; +use crate::context::{DownloadBehavior, RequestContext}; +use crate::metrics::{self, BackgroundLoopSemaphoreMetricsRecorder, TENANT_TASK_EVENTS}; +use crate::task_mgr::{self, BACKGROUND_RUNTIME, TOKIO_WORKER_THREADS, TaskKind}; +use crate::tenant::throttle::Stats; +use crate::tenant::timeline::CompactionError; +use crate::tenant::timeline::compaction::CompactionOutcome; +use crate::tenant::{Tenant, TenantState}; + /// Semaphore limiting concurrent background tasks (across all tenants). /// /// We use 3/4 Tokio threads, to avoid blocking all threads in case we do any CPU-heavy work. @@ -287,15 +287,16 @@ fn log_compaction_error( sleep_duration: Duration, task_cancelled: bool, ) { - use crate::pgdatadir_mapping::CollectKeySpaceError; - use crate::tenant::upload_queue::NotInitialized; - use crate::tenant::PageReconstructError; use CompactionError::*; + use crate::tenant::PageReconstructError; + use crate::tenant::upload_queue::NotInitialized; + let level = match err { + e if e.is_cancel() => return, ShuttingDown => return, Offload(_) => Level::ERROR, - CollectKeySpaceError(CollectKeySpaceError::Cancelled) => Level::INFO, + AlreadyRunning(_) => Level::ERROR, CollectKeySpaceError(_) => Level::ERROR, _ if task_cancelled => Level::INFO, Other(err) => { diff --git a/pageserver/src/tenant/throttle.rs b/pageserver/src/tenant/throttle.rs index 300d779125..6c37c3771b 100644 --- a/pageserver/src/tenant/throttle.rs +++ b/pageserver/src/tenant/throttle.rs @@ -1,10 +1,6 @@ -use std::{ - sync::{ - atomic::{AtomicU64, Ordering}, - Arc, - }, - time::Instant, -}; +use std::sync::Arc; +use std::sync::atomic::{AtomicU64, Ordering}; +use std::time::Instant; use arc_swap::ArcSwap; use utils::leaky_bucket::{LeakyBucketConfig, RateLimiter}; diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 319c5e3d87..851f84f603 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -14,55 +14,6 @@ pub mod span; pub mod uninit; mod walreceiver; -use anyhow::{anyhow, bail, ensure, Context, Result}; -use arc_swap::{ArcSwap, ArcSwapOption}; -use bytes::Bytes; -use camino::Utf8Path; -use chrono::{DateTime, Utc}; -use compaction::CompactionOutcome; -use enumset::EnumSet; -use fail::fail_point; -use futures::FutureExt; -use futures::{stream::FuturesUnordered, StreamExt}; -use handle::ShardTimelineId; -use layer_manager::Shutdown; -use offload::OffloadError; -use once_cell::sync::Lazy; -use pageserver_api::models::PageTraceEvent; -use pageserver_api::{ - key::{ - KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE, - SPARSE_RANGE, - }, - keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning}, - models::{ - CompactKeyRange, CompactLsnRange, CompactionAlgorithm, CompactionAlgorithmSettings, - DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, - InMemoryLayerInfo, LayerMapInfo, LsnLease, TimelineState, - }, - reltag::BlockNumber, - shard::{ShardIdentity, ShardNumber, TenantShardId}, -}; -use rand::Rng; -use remote_storage::DownloadError; -use serde_with::serde_as; -use storage_broker::BrokerClientChannel; -use tokio::runtime::Handle; -use tokio::sync::mpsc::Sender; -use tokio::sync::{oneshot, watch, Notify}; -use tokio_util::sync::CancellationToken; -use tracing::*; -use utils::critical; -use utils::rate_limit::RateLimit; -use utils::{ - fs_ext, - guard_arc_swap::GuardArcSwap, - pausable_failpoint, - postgres_client::PostgresClientProtocol, - sync::gate::{Gate, GateGuard}, -}; -use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta}; - use std::array; use std::cmp::{max, min}; use std::collections::btree_map::Entry; @@ -72,74 +23,58 @@ use std::sync::atomic::{AtomicBool, AtomicU64, Ordering as AtomicOrdering}; use std::sync::{Arc, Mutex, OnceLock, RwLock, Weak}; use std::time::{Duration, Instant, SystemTime}; -use crate::l0_flush::{self, L0FlushGlobalState}; -use crate::tenant::storage_layer::ImageLayerName; -use crate::{ - aux_file::AuxFileSizeEstimator, - page_service::TenantManagerTypes, - tenant::{ - config::AttachmentMode, - layer_map::{LayerMap, SearchResult}, - metadata::TimelineMetadata, - storage_layer::{ - inmemory_layer::IndexEntry, BatchLayerWriter, IoConcurrency, PersistentLayerDesc, - ValueReconstructSituation, - }, - }, - walingest::WalLagCooldown, - walredo, -}; -use crate::{ - context::{DownloadBehavior, RequestContext}, - disk_usage_eviction_task::DiskUsageEvictionInfo, - pgdatadir_mapping::CollectKeySpaceError, -}; -use crate::{ - disk_usage_eviction_task::finite_f32, - tenant::storage_layer::{ - AsLayerDesc, DeltaLayerWriter, EvictionError, ImageLayerWriter, InMemoryLayer, Layer, - LayerAccessStatsReset, LayerName, ResidentLayer, ValueReconstructState, - ValuesReconstructState, - }, -}; -use crate::{ - disk_usage_eviction_task::EvictionCandidate, tenant::storage_layer::delta_layer::DeltaEntry, -}; -use crate::{ - metrics::ScanLatencyOngoingRecording, tenant::timeline::logical_size::CurrentLogicalSize, -}; -use crate::{ - pgdatadir_mapping::DirectoryKind, - virtual_file::{MaybeFatalIo, VirtualFile}, -}; -use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind}; -use crate::{pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS, tenant::storage_layer::PersistentLayerKey}; +use anyhow::{Context, Result, anyhow, bail, ensure}; +use arc_swap::{ArcSwap, ArcSwapOption}; +use bytes::Bytes; +use camino::Utf8Path; +use chrono::{DateTime, Utc}; +use compaction::{CompactionOutcome, GcCompactionCombinedSettings}; +use enumset::EnumSet; +use fail::fail_point; +use futures::stream::FuturesUnordered; +use futures::{FutureExt, StreamExt}; +use handle::ShardTimelineId; +use layer_manager::Shutdown; +use offload::OffloadError; +use once_cell::sync::Lazy; use pageserver_api::config::tenant_conf_defaults::DEFAULT_PITR_INTERVAL; - -use crate::config::PageServerConf; -use crate::keyspace::{KeyPartitioning, KeySpace}; -use crate::metrics::{TimelineMetrics, DELTAS_PER_READ_GLOBAL, LAYERS_PER_READ_GLOBAL}; -use crate::pgdatadir_mapping::{CalculateLogicalSizeError, MetricsUpdate}; -use crate::tenant::config::TenantConfOpt; -use pageserver_api::reltag::RelTag; -use pageserver_api::shard::ShardIndex; - -use postgres_connection::PgConnectionConfig; -use postgres_ffi::{to_pg_timestamp, v14::xlog_utils, WAL_SEGMENT_SIZE}; -use utils::{ - completion, - generation::Generation, - id::TimelineId, - lsn::{AtomicLsn, Lsn, RecordLsn}, - seqwait::SeqWait, - simple_rcu::{Rcu, RcuReadGuard}, +use pageserver_api::key::{ + KEY_SIZE, Key, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE, + SPARSE_RANGE, }; - -use crate::task_mgr; -use crate::task_mgr::TaskKind; -use crate::tenant::gc_result::GcResult; -use crate::ZERO_PAGE; -use pageserver_api::key::Key; +use pageserver_api::keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning}; +use pageserver_api::models::{ + CompactKeyRange, CompactLsnRange, CompactionAlgorithm, CompactionAlgorithmSettings, + DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, + InMemoryLayerInfo, LayerMapInfo, LsnLease, PageTraceEvent, TimelineState, +}; +use pageserver_api::reltag::{BlockNumber, RelTag}; +use pageserver_api::shard::{ShardIdentity, ShardIndex, ShardNumber, TenantShardId}; +#[cfg(test)] +use pageserver_api::value::Value; +use postgres_connection::PgConnectionConfig; +use postgres_ffi::v14::xlog_utils; +use postgres_ffi::{WAL_SEGMENT_SIZE, to_pg_timestamp}; +use rand::Rng; +use remote_storage::DownloadError; +use serde_with::serde_as; +use storage_broker::BrokerClientChannel; +use tokio::runtime::Handle; +use tokio::sync::mpsc::Sender; +use tokio::sync::{Notify, oneshot, watch}; +use tokio_util::sync::CancellationToken; +use tracing::*; +use utils::generation::Generation; +use utils::guard_arc_swap::GuardArcSwap; +use utils::id::TimelineId; +use utils::lsn::{AtomicLsn, Lsn, RecordLsn}; +use utils::postgres_client::PostgresClientProtocol; +use utils::rate_limit::RateLimit; +use utils::seqwait::SeqWait; +use utils::simple_rcu::{Rcu, RcuReadGuard}; +use utils::sync::gate::{Gate, GateGuard}; +use utils::{completion, critical, fs_ext, pausable_failpoint}; +use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta}; use self::delete::DeleteTimelineFlow; pub(super) use self::eviction_task::EvictionTaskTenantState; @@ -147,23 +82,48 @@ use self::eviction_task::EvictionTaskTimelineState; use self::layer_manager::LayerManager; use self::logical_size::LogicalSize; use self::walreceiver::{WalReceiver, WalReceiverConf}; - +use super::config::TenantConf; +use super::remote_timeline_client::index::{GcCompactionState, IndexPart}; +use super::remote_timeline_client::{RemoteTimelineClient, WaitCompletionError}; +use super::secondary::heatmap::HeatMapLayer; +use super::storage_layer::{LayerFringe, LayerVisibilityHint, ReadableLayer}; +use super::upload_queue::NotInitialized; use super::{ - config::TenantConf, storage_layer::LayerVisibilityHint, upload_queue::NotInitialized, - MaybeOffloaded, + AttachedTenantConf, GcError, HeatMapTimeline, MaybeOffloaded, + debug_assert_current_span_has_tenant_and_timeline_id, }; -use super::{ - debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf, HeatMapTimeline, +use crate::aux_file::AuxFileSizeEstimator; +use crate::config::PageServerConf; +use crate::context::{DownloadBehavior, RequestContext}; +use crate::disk_usage_eviction_task::{DiskUsageEvictionInfo, EvictionCandidate, finite_f32}; +use crate::keyspace::{KeyPartitioning, KeySpace}; +use crate::l0_flush::{self, L0FlushGlobalState}; +use crate::metrics::{ + DELTAS_PER_READ_GLOBAL, LAYERS_PER_READ_GLOBAL, ScanLatencyOngoingRecording, TimelineMetrics, }; -use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe}; -use super::{ - remote_timeline_client::RemoteTimelineClient, remote_timeline_client::WaitCompletionError, - storage_layer::ReadableLayer, +use crate::page_service::TenantManagerTypes; +use crate::pgdatadir_mapping::{ + CalculateLogicalSizeError, CollectKeySpaceError, DirectoryKind, LsnForTimestamp, + MAX_AUX_FILE_V2_DELTAS, MetricsUpdate, }; -use super::{secondary::heatmap::HeatMapLayer, GcError}; - -#[cfg(test)] -use pageserver_api::value::Value; +use crate::task_mgr::TaskKind; +use crate::tenant::config::{AttachmentMode, TenantConfOpt}; +use crate::tenant::gc_result::GcResult; +use crate::tenant::layer_map::{LayerMap, SearchResult}; +use crate::tenant::metadata::TimelineMetadata; +use crate::tenant::storage_layer::delta_layer::DeltaEntry; +use crate::tenant::storage_layer::inmemory_layer::IndexEntry; +use crate::tenant::storage_layer::{ + AsLayerDesc, BatchLayerWriter, DeltaLayerWriter, EvictionError, ImageLayerName, + ImageLayerWriter, InMemoryLayer, IoConcurrency, Layer, LayerAccessStatsReset, LayerName, + PersistentLayerDesc, PersistentLayerKey, ResidentLayer, ValueReconstructSituation, + ValueReconstructState, ValuesReconstructState, +}; +use crate::tenant::tasks::BackgroundLoopKind; +use crate::tenant::timeline::logical_size::CurrentLogicalSize; +use crate::virtual_file::{MaybeFatalIo, VirtualFile}; +use crate::walingest::WalLagCooldown; +use crate::{ZERO_PAGE, task_mgr, walredo}; #[derive(Debug, PartialEq, Eq, Clone, Copy)] pub(crate) enum FlushLoopState { @@ -323,6 +283,9 @@ pub struct Timeline { ancestor_timeline: Option>, ancestor_lsn: Lsn, + // The LSN of gc-compaction that was last applied to this timeline. + gc_compaction_state: ArcSwap>, + pub(super) metrics: TimelineMetrics, // `Timeline` doesn't write these metrics itself, but it manages the lifetime. Code @@ -479,6 +442,8 @@ pub(crate) enum PreviousHeatmap { Active { heatmap: HeatMapTimeline, read_at: std::time::Instant, + // End LSN covered by the heatmap if known + end_lsn: Option, }, Obsolete, } @@ -1470,13 +1435,22 @@ impl Timeline { | TaskKind::WalReceiverConnectionHandler | TaskKind::WalReceiverConnectionPoller => { let is_myself = match who_is_waiting { - WaitLsnWaiter::Timeline(waiter) => Weak::ptr_eq(&waiter.myself, &self.myself), - WaitLsnWaiter::Tenant | WaitLsnWaiter::PageService | WaitLsnWaiter::HttpEndpoint => unreachable!("tenant or page_service context are not expected to have task kind {:?}", ctx.task_kind()), + WaitLsnWaiter::Timeline(waiter) => { + Weak::ptr_eq(&waiter.myself, &self.myself) + } + WaitLsnWaiter::Tenant + | WaitLsnWaiter::PageService + | WaitLsnWaiter::HttpEndpoint => unreachable!( + "tenant or page_service context are not expected to have task kind {:?}", + ctx.task_kind() + ), }; if is_myself { if let Err(current) = self.last_record_lsn.would_wait_for(lsn) { // walingest is the only one that can advance last_record_lsn; it should make sure to never reach here - panic!("this timeline's walingest task is calling wait_lsn({lsn}) but we only have last_record_lsn={current}; would deadlock"); + panic!( + "this timeline's walingest task is calling wait_lsn({lsn}) but we only have last_record_lsn={current}; would deadlock" + ); } } else { // if another timeline's is waiting for us, there's no deadlock risk because @@ -1505,12 +1479,12 @@ impl Timeline { drop(_timer); let walreceiver_status = self.walreceiver_status(); Err(WaitLsnError::Timeout(format!( - "Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}, WalReceiver status: {}", - lsn, - self.get_last_record_lsn(), - self.get_disk_consistent_lsn(), - walreceiver_status, - ))) + "Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}, WalReceiver status: {}", + lsn, + self.get_last_record_lsn(), + self.get_disk_consistent_lsn(), + walreceiver_status, + ))) } } } @@ -1614,10 +1588,18 @@ impl Timeline { if init || validate { let latest_gc_cutoff_lsn = self.get_applied_gc_cutoff_lsn(); if lsn < *latest_gc_cutoff_lsn { - bail!("tried to request an lsn lease for an lsn below the latest gc cutoff. requested at {} gc cutoff {}", lsn, *latest_gc_cutoff_lsn); + bail!( + "tried to request an lsn lease for an lsn below the latest gc cutoff. requested at {} gc cutoff {}", + lsn, + *latest_gc_cutoff_lsn + ); } if lsn < planned_cutoff { - bail!("tried to request an lsn lease for an lsn below the planned gc cutoff. requested at {} planned gc cutoff {}", lsn, planned_cutoff); + bail!( + "tried to request an lsn lease for an lsn below the planned gc cutoff. requested at {} planned gc cutoff {}", + lsn, + planned_cutoff + ); } } @@ -1741,7 +1723,9 @@ impl Timeline { // This is not harmful, but it only happens in relatively rare cases where // time-based checkpoints are not happening fast enough to keep the amount of // ephemeral data within configured limits. It's a sign of stress on the system. - tracing::info!("Early-rolling open layer at size {current_size} (limit {size_override}) due to dirty data pressure"); + tracing::info!( + "Early-rolling open layer at size {current_size} (limit {size_override}) due to dirty data pressure" + ); } } @@ -1867,7 +1851,9 @@ impl Timeline { // Last record Lsn could be zero in case the timeline was just created if !last_record_lsn.is_valid() { - warn!("Skipping compaction for potentially just initialized timeline, it has invalid last record lsn: {last_record_lsn}"); + warn!( + "Skipping compaction for potentially just initialized timeline, it has invalid last record lsn: {last_record_lsn}" + ); return Ok(CompactionOutcome::Skipped); } @@ -1880,15 +1866,25 @@ impl Timeline { }; // Signal compaction failure to avoid L0 flush stalls when it's broken. - match result { + match &result { Ok(_) => self.compaction_failed.store(false, AtomicOrdering::Relaxed), - Err(CompactionError::Other(_)) | Err(CompactionError::CollectKeySpaceError(_)) => { + Err(e) if e.is_cancel() => {} + Err(CompactionError::ShuttingDown) => { + // Covered by the `Err(e) if e.is_cancel()` branch. + } + Err(CompactionError::AlreadyRunning(_)) => { + // Covered by the `Err(e) if e.is_cancel()` branch. + } + Err(CompactionError::Other(_)) => { + self.compaction_failed.store(true, AtomicOrdering::Relaxed) + } + Err(CompactionError::CollectKeySpaceError(_)) => { + // Cancelled errors are covered by the `Err(e) if e.is_cancel()` branch. self.compaction_failed.store(true, AtomicOrdering::Relaxed) } // Don't change the current value on offload failure or shutdown. We don't want to // abruptly stall nor resume L0 flushes in these cases. Err(CompactionError::Offload(_)) => {} - Err(CompactionError::ShuttingDown) => {} }; result @@ -2028,7 +2024,9 @@ impl Timeline { // `self.remote_client.shutdown().await` above should have already flushed everything from the queue, but // we also do a final check here to ensure that the queue is empty. if !self.remote_client.no_pending_work() { - warn!("still have pending work in remote upload queue, but continuing shutting down anyways"); + warn!( + "still have pending work in remote upload queue, but continuing shutting down anyways" + ); } } } @@ -2037,7 +2035,9 @@ impl Timeline { // drain the upload queue self.remote_client.shutdown().await; if !self.remote_client.no_pending_work() { - warn!("still have pending work in remote upload queue, but continuing shutting down anyways"); + warn!( + "still have pending work in remote upload queue, but continuing shutting down anyways" + ); } } @@ -2199,6 +2199,7 @@ impl Timeline { pub(crate) async fn download_layer( &self, layer_file_name: &LayerName, + ctx: &RequestContext, ) -> Result, super::storage_layer::layer::DownloadError> { let Some(layer) = self .find_layer(layer_file_name) @@ -2212,7 +2213,7 @@ impl Timeline { return Ok(None); }; - layer.download().await?; + layer.download(ctx).await?; Ok(Some(true)) } @@ -2531,6 +2532,31 @@ impl Timeline { ) } + fn get_gc_compaction_settings(&self) -> GcCompactionCombinedSettings { + let tenant_conf = &self.tenant_conf.load(); + let gc_compaction_enabled = tenant_conf + .tenant_conf + .gc_compaction_enabled + .unwrap_or(self.conf.default_tenant_conf.gc_compaction_enabled); + let gc_compaction_initial_threshold_kb = tenant_conf + .tenant_conf + .gc_compaction_initial_threshold_kb + .unwrap_or( + self.conf + .default_tenant_conf + .gc_compaction_initial_threshold_kb, + ); + let gc_compaction_ratio_percent = tenant_conf + .tenant_conf + .gc_compaction_ratio_percent + .unwrap_or(self.conf.default_tenant_conf.gc_compaction_ratio_percent); + GcCompactionCombinedSettings { + gc_compaction_enabled, + gc_compaction_initial_threshold_kb, + gc_compaction_ratio_percent, + } + } + fn get_image_creation_preempt_threshold(&self) -> usize { let tenant_conf = self.tenant_conf.load(); tenant_conf @@ -2609,6 +2635,7 @@ impl Timeline { state: TimelineState, attach_wal_lag_cooldown: Arc>, create_idempotency: crate::tenant::CreateTimelineIdempotency, + gc_compaction_state: Option, cancel: CancellationToken, ) -> Arc { let disk_consistent_lsn = metadata.disk_consistent_lsn(); @@ -2667,6 +2694,8 @@ impl Timeline { }), disk_consistent_lsn: AtomicLsn::new(disk_consistent_lsn.0), + gc_compaction_state: ArcSwap::new(Arc::new(gc_compaction_state)), + last_freeze_at: AtomicLsn::new(disk_consistent_lsn.0), last_freeze_ts: RwLock::new(Instant::now()), @@ -2831,6 +2860,20 @@ impl Timeline { ); } + pub(crate) fn update_gc_compaction_state( + &self, + gc_compaction_state: GcCompactionState, + ) -> anyhow::Result<()> { + self.gc_compaction_state + .store(Arc::new(Some(gc_compaction_state.clone()))); + self.remote_client + .schedule_index_upload_for_gc_compaction_state_update(gc_compaction_state) + } + + pub(crate) fn get_gc_compaction_state(&self) -> Option { + self.gc_compaction_state.load_full().as_ref().clone() + } + /// Creates and starts the wal receiver. /// /// This function is expected to be called at most once per Timeline's lifecycle @@ -2899,8 +2942,9 @@ impl Timeline { disk_consistent_lsn: Lsn, index_part: IndexPart, ) -> anyhow::Result<()> { - use init::{Decision::*, Discovered, DismissedLayer}; use LayerName::*; + use init::Decision::*; + use init::{Discovered, DismissedLayer}; let mut guard = self.layers.write().await; @@ -3115,11 +3159,15 @@ impl Timeline { } TimelineState::Loading => { // Import does not return an activated timeline. - info!("discarding priority boost for logical size calculation because timeline is not yet active"); + info!( + "discarding priority boost for logical size calculation because timeline is not yet active" + ); } TimelineState::Active => { // activation should be setting the once cell - warn!("unexpected: cancel_wait_for_background_loop_concurrency_limit_semaphore not set, priority-boosting of logical size calculation will not work"); + warn!( + "unexpected: cancel_wait_for_background_loop_concurrency_limit_semaphore not set, priority-boosting of logical size calculation will not work" + ); debug_assert!(false); } } @@ -3524,12 +3572,16 @@ impl Timeline { Ok(layer) } - pub(super) fn is_previous_heatmap_active(&self) -> bool { - self.previous_heatmap - .load() - .as_ref() - .map(|prev| matches!(**prev, PreviousHeatmap::Active { .. })) - .unwrap_or(false) + pub(super) fn should_keep_previous_heatmap(&self, new_heatmap_end_lsn: Lsn) -> bool { + let crnt = self.previous_heatmap.load(); + match crnt.as_deref() { + Some(PreviousHeatmap::Active { end_lsn, .. }) => match end_lsn { + Some(crnt_end_lsn) => *crnt_end_lsn > new_heatmap_end_lsn, + None => true, + }, + Some(PreviousHeatmap::Obsolete) => false, + None => false, + } } /// The timeline heatmap is a hint to secondary locations from the primary location, @@ -3557,26 +3609,26 @@ impl Timeline { // heatamp. let previous_heatmap = self.previous_heatmap.load(); let visible_non_resident = match previous_heatmap.as_deref() { - Some(PreviousHeatmap::Active { heatmap, read_at }) => { - Some(heatmap.layers.iter().filter_map(|hl| { - let desc: PersistentLayerDesc = hl.name.clone().into(); - let layer = guard.try_get_from_key(&desc.key())?; + Some(PreviousHeatmap::Active { + heatmap, read_at, .. + }) => Some(heatmap.layers.iter().filter_map(|hl| { + let desc: PersistentLayerDesc = hl.name.clone().into(); + let layer = guard.try_get_from_key(&desc.key())?; - if layer.visibility() == LayerVisibilityHint::Covered { - return None; - } + if layer.visibility() == LayerVisibilityHint::Covered { + return None; + } - if layer.is_likely_resident() { - return None; - } + if layer.is_likely_resident() { + return None; + } - if layer.last_evicted_at().happened_after(*read_at) { - return None; - } + if layer.last_evicted_at().happened_after(*read_at) { + return None; + } - Some((desc, hl.metadata.clone(), hl.access_time)) - })) - } + Some((desc, hl.metadata.clone(), hl.access_time)) + })), Some(PreviousHeatmap::Obsolete) => None, None => None, }; @@ -3663,6 +3715,7 @@ impl Timeline { PreviousHeatmap::Active { heatmap, read_at: Instant::now(), + end_lsn: Some(end_lsn), } } @@ -4166,10 +4219,6 @@ impl Timeline { // Stall flushes to backpressure if compaction can't keep up. This is propagated up // to WAL ingestion by having ephemeral layer rolls wait for flushes. - // - // NB: the compaction loop only checks `compaction_threshold` every 20 seconds, so - // we can end up stalling before compaction even starts. Consider making it more - // responsive (e.g. via `watch_level0_deltas`). if let Some(stall_threshold) = self.get_l0_flush_stall_threshold() { if l0_count >= stall_threshold { warn!( @@ -4259,10 +4308,14 @@ impl Timeline { // This path is only taken for tenants with multiple shards: single sharded tenants should // never encounter a gap in the wal. let old_disk_consistent_lsn = self.disk_consistent_lsn.load(); - tracing::debug!("Advancing disk_consistent_lsn across layer gap {old_disk_consistent_lsn}->{frozen_to_lsn}"); + tracing::debug!( + "Advancing disk_consistent_lsn across layer gap {old_disk_consistent_lsn}->{frozen_to_lsn}" + ); if self.set_disk_consistent_lsn(frozen_to_lsn) { if let Err(e) = self.schedule_uploads(frozen_to_lsn, vec![]) { - tracing::warn!("Failed to schedule metadata upload after updating disk_consistent_lsn: {e}"); + tracing::warn!( + "Failed to schedule metadata upload after updating disk_consistent_lsn: {e}" + ); } } } @@ -4487,7 +4540,10 @@ impl Timeline { /// This function must only be used from the layer flush task. fn set_disk_consistent_lsn(&self, new_value: Lsn) -> bool { let old_value = self.disk_consistent_lsn.fetch_max(new_value); - assert!(new_value >= old_value, "disk_consistent_lsn must be growing monotonously at runtime; current {old_value}, offered {new_value}"); + assert!( + new_value >= old_value, + "disk_consistent_lsn must be growing monotonously at runtime; current {old_value}, offered {new_value}" + ); self.metrics .disk_consistent_lsn_gauge @@ -4645,10 +4701,7 @@ impl Timeline { )); } - let (dense_ks, sparse_ks) = self - .collect_keyspace(lsn, ctx) - .await - .map_err(CompactionError::CollectKeySpaceError)?; + let (dense_ks, sparse_ks) = self.collect_keyspace(lsn, ctx).await?; let dense_partitioning = dense_ks.partition(&self.shard_identity, partition_size); let sparse_partitioning = SparseKeyPartitioning { parts: vec![sparse_ks], @@ -4782,7 +4835,9 @@ impl Timeline { // any metadata keys, keys, as that would lead to actual data // loss. if img_key.is_rel_fsm_block_key() || img_key.is_rel_vm_block_key() { - warn!("could not reconstruct FSM or VM key {img_key}, filling with zeros: {err:?}"); + warn!( + "could not reconstruct FSM or VM key {img_key}, filling with zeros: {err:?}" + ); ZERO_PAGE.clone() } else { return Err(CreateImageLayersError::from(err)); @@ -4861,7 +4916,8 @@ impl Timeline { let trigger_generation = delta_files_accessed as usize >= MAX_AUX_FILE_V2_DELTAS; info!( - "metadata key compaction: trigger_generation={trigger_generation}, delta_files_accessed={delta_files_accessed}, total_kb_retrieved={total_kb_retrieved}, total_keys_retrieved={total_keys_retrieved}, read_time={}s", elapsed.as_secs_f64() + "metadata key compaction: trigger_generation={trigger_generation}, delta_files_accessed={delta_files_accessed}, total_kb_retrieved={total_kb_retrieved}, total_keys_retrieved={total_keys_retrieved}, read_time={}s", + elapsed.as_secs_f64() ); if !trigger_generation && mode == ImageLayerCreationMode::Try { @@ -5183,7 +5239,8 @@ impl Timeline { if should_yield { tracing::info!( "preempt image layer generation at {lsn} when processing partition {}..{}: too many L0 layers", - partition.start().unwrap(), partition.end().unwrap() + partition.start().unwrap(), + partition.end().unwrap() ); last_partition_processed = Some(partition.clone()); all_generated = false; @@ -5370,9 +5427,40 @@ pub(crate) enum CompactionError { Offload(OffloadError), /// Compaction cannot be done right now; page reconstruction and so on. #[error("Failed to collect keyspace: {0}")] - CollectKeySpaceError(CollectKeySpaceError), + CollectKeySpaceError(#[from] CollectKeySpaceError), #[error(transparent)] Other(anyhow::Error), + #[error("Compaction already running: {0}")] + AlreadyRunning(&'static str), +} + +impl CompactionError { + /// Errors that can be ignored, i.e., cancel and shutdown. + pub fn is_cancel(&self) -> bool { + matches!( + self, + Self::ShuttingDown + | Self::AlreadyRunning(_) + | Self::CollectKeySpaceError(CollectKeySpaceError::Cancelled) + | Self::CollectKeySpaceError(CollectKeySpaceError::PageRead( + PageReconstructError::Cancelled + )) + | Self::Offload(OffloadError::Cancelled) + ) + } + + /// Critical errors that indicate data corruption. + pub fn is_critical(&self) -> bool { + matches!( + self, + Self::CollectKeySpaceError( + CollectKeySpaceError::Decode(_) + | CollectKeySpaceError::PageRead( + PageReconstructError::MissingKey(_) | PageReconstructError::WalRedo(_), + ) + ) + ) + } } impl From for CompactionError { @@ -5384,18 +5472,6 @@ impl From for CompactionError { } } -impl From for CompactionError { - fn from(err: CollectKeySpaceError) -> Self { - match err { - CollectKeySpaceError::Cancelled - | CollectKeySpaceError::PageRead(PageReconstructError::Cancelled) => { - CompactionError::ShuttingDown - } - e => CompactionError::Other(e.into()), - } - } -} - impl From for CompactionError { fn from(value: super::upload_queue::NotInitialized) -> Self { match value { @@ -5539,7 +5615,9 @@ impl Timeline { // because we have not implemented L0 => L0 compaction. duplicated_layers.insert(l.layer_desc().key()); } else if LayerMap::is_l0(&l.layer_desc().key_range, l.layer_desc().is_delta) { - return Err(CompactionError::Other(anyhow::anyhow!("compaction generates a L0 layer file as output, which will cause infinite compaction."))); + return Err(CompactionError::Other(anyhow::anyhow!( + "compaction generates a L0 layer file as output, which will cause infinite compaction." + ))); } else { insert_layers.push(l.clone()); } @@ -5663,8 +5741,10 @@ impl Timeline { .await { Ok((index_part, index_generation, _index_mtime)) => { - tracing::info!("GC loaded shard zero metadata (gen {index_generation:?}): latest_gc_cutoff_lsn: {}", - index_part.metadata.latest_gc_cutoff_lsn()); + tracing::info!( + "GC loaded shard zero metadata (gen {index_generation:?}): latest_gc_cutoff_lsn: {}", + index_part.metadata.latest_gc_cutoff_lsn() + ); Ok(Some(index_part.metadata.latest_gc_cutoff_lsn())) } Err(DownloadError::NotFound) => { @@ -6073,9 +6153,7 @@ impl Timeline { if let Some((img_lsn, img)) = &data.img { trace!( "found page image for key {} at {}, no WAL redo required, req LSN {}", - key, - img_lsn, - request_lsn, + key, img_lsn, request_lsn, ); Ok(img.clone()) } else { @@ -6104,7 +6182,12 @@ impl Timeline { request_lsn ); } else { - trace!("found {} WAL records that will init the page for {} at {}, performing WAL redo", data.records.len(), key, request_lsn); + trace!( + "found {} WAL records that will init the page for {} at {}, performing WAL redo", + data.records.len(), + key, + request_lsn + ); }; let res = self .walredo_mgr @@ -6131,6 +6214,7 @@ impl Timeline { pub(crate) async fn spawn_download_all_remote_layers( self: Arc, request: DownloadRemoteLayersTaskSpawnRequest, + ctx: &RequestContext, ) -> Result { use pageserver_api::models::DownloadRemoteLayersTaskState; @@ -6151,6 +6235,10 @@ impl Timeline { } let self_clone = Arc::clone(&self); + let task_ctx = ctx.detached_child( + TaskKind::DownloadAllRemoteLayers, + DownloadBehavior::Download, + ); let task_id = task_mgr::spawn( task_mgr::BACKGROUND_RUNTIME.handle(), task_mgr::TaskKind::DownloadAllRemoteLayers, @@ -6158,7 +6246,7 @@ impl Timeline { Some(self.timeline_id), "download all remote layers task", async move { - self_clone.download_all_remote_layers(request).await; + self_clone.download_all_remote_layers(request, &task_ctx).await; let mut status_guard = self_clone.download_all_remote_layers_task_info.write().unwrap(); match &mut *status_guard { None => { @@ -6193,6 +6281,7 @@ impl Timeline { async fn download_all_remote_layers( self: &Arc, request: DownloadRemoteLayersTaskSpawnRequest, + ctx: &RequestContext, ) { use pageserver_api::models::DownloadRemoteLayersTaskState; @@ -6249,9 +6338,10 @@ impl Timeline { let span = tracing::info_span!("download", layer = %next); + let ctx = ctx.attached_child(); js.spawn( async move { - let res = next.download().await; + let res = next.download(&ctx).await; (next, res) } .instrument(span), @@ -6648,7 +6738,9 @@ impl TimelineWriter<'_> { if let Some(wait_threshold) = wait_threshold { if l0_count >= wait_threshold { - debug!("layer roll waiting for flush due to compaction backpressure at {l0_count} L0 layers"); + debug!( + "layer roll waiting for flush due to compaction backpressure at {l0_count} L0 layers" + ); self.tl.wait_flush_completion(flush_id).await?; } } @@ -6835,17 +6927,16 @@ mod tests { use pageserver_api::key::Key; use pageserver_api::value::Value; use tracing::Instrument; - use utils::{id::TimelineId, lsn::Lsn}; - - use crate::tenant::{ - harness::{test_img, TenantHarness}, - layer_map::LayerMap, - storage_layer::{Layer, LayerName, LayerVisibilityHint}, - timeline::{DeltaLayerTestDesc, EvictionError}, - PreviousHeatmap, Timeline, - }; + use utils::id::TimelineId; + use utils::lsn::Lsn; use super::HeatMapTimeline; + use crate::context::RequestContextBuilder; + use crate::tenant::harness::{TenantHarness, test_img}; + use crate::tenant::layer_map::LayerMap; + use crate::tenant::storage_layer::{Layer, LayerName, LayerVisibilityHint}; + use crate::tenant::timeline::{DeltaLayerTestDesc, EvictionError}; + use crate::tenant::{PreviousHeatmap, Timeline}; fn assert_heatmaps_have_same_layers(lhs: &HeatMapTimeline, rhs: &HeatMapTimeline) { assert_eq!(lhs.layers.len(), rhs.layers.len()); @@ -6962,6 +7053,7 @@ mod tests { .store(Some(Arc::new(PreviousHeatmap::Active { heatmap: heatmap.clone(), read_at: std::time::Instant::now(), + end_lsn: None, }))); // Generate a new heatmap and assert that it contains the same layers as the old one. @@ -6977,8 +7069,12 @@ mod tests { eprintln!("Downloading {layer} and re-generating heatmap"); + let ctx = &RequestContextBuilder::extend(&ctx) + .download_behavior(crate::context::DownloadBehavior::Download) + .build(); + let _resident = layer - .download_and_keep_resident() + .download_and_keep_resident(ctx) .instrument(tracing::info_span!( parent: None, "download_layer", @@ -7060,6 +7156,7 @@ mod tests { .store(Some(Arc::new(PreviousHeatmap::Active { heatmap: heatmap.clone(), read_at: std::time::Instant::now(), + end_lsn: None, }))); // Evict all the layers in the previous heatmap diff --git a/pageserver/src/tenant/timeline/analysis.rs b/pageserver/src/tenant/timeline/analysis.rs index 6009b0b79a..96864ec44b 100644 --- a/pageserver/src/tenant/timeline/analysis.rs +++ b/pageserver/src/tenant/timeline/analysis.rs @@ -1,4 +1,5 @@ -use std::{collections::BTreeSet, ops::Range}; +use std::collections::BTreeSet; +use std::ops::Range; use utils::lsn::Lsn; diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index d75591bd74..c835980a7d 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -10,34 +10,42 @@ use std::sync::Arc; use super::layer_manager::LayerManager; use super::{ - CompactFlags, CompactOptions, CreateImageLayersError, DurationRecorder, GetVectoredError, - ImageLayerCreationMode, LastImageLayerCreationStatus, PageReconstructError, RecordedDuration, + CompactFlags, CompactOptions, CompactionError, CreateImageLayersError, DurationRecorder, + GetVectoredError, ImageLayerCreationMode, LastImageLayerCreationStatus, RecordedDuration, Timeline, }; -use anyhow::{anyhow, bail, Context}; +use anyhow::{Context, anyhow, bail}; use bytes::Bytes; use enumset::EnumSet; use fail::fail_point; use itertools::Itertools; -use pageserver_api::key::KEY_SIZE; -use pageserver_api::keyspace::ShardedRange; +use once_cell::sync::Lazy; +use pageserver_api::config::tenant_conf_defaults::DEFAULT_CHECKPOINT_DISTANCE; +use pageserver_api::key::{KEY_SIZE, Key}; +use pageserver_api::keyspace::{KeySpace, ShardedRange}; use pageserver_api::models::CompactInfoResponse; +use pageserver_api::record::NeonWalRecord; use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId}; +use pageserver_api::value::Value; +use pageserver_compaction::helpers::{fully_contains, overlaps_with}; +use pageserver_compaction::interface::*; use serde::Serialize; +use tokio::sync::{OwnedSemaphorePermit, Semaphore}; use tokio_util::sync::CancellationToken; -use tracing::{debug, error, info, info_span, trace, warn, Instrument}; +use tracing::{Instrument, debug, error, info, info_span, trace, warn}; use utils::critical; use utils::id::TimelineId; +use utils::lsn::Lsn; use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder}; use crate::page_cache; -use crate::pgdatadir_mapping::CollectKeySpaceError; use crate::statvfs::Statvfs; use crate::tenant::checks::check_valid_layermap; use crate::tenant::gc_block::GcBlock; use crate::tenant::layer_map::LayerMap; use crate::tenant::remote_timeline_client::WaitCompletionError; +use crate::tenant::remote_timeline_client::index::GcCompactionState; use crate::tenant::storage_layer::batch_split_writer::{ BatchWriterResult, SplitDeltaLayerWriter, SplitImageLayerWriter, }; @@ -46,24 +54,12 @@ use crate::tenant::storage_layer::merge_iterator::MergeIterator; use crate::tenant::storage_layer::{ AsLayerDesc, PersistentLayerDesc, PersistentLayerKey, ValueReconstructState, }; -use crate::tenant::timeline::{drop_rlock, DeltaLayerWriter, ImageLayerWriter}; -use crate::tenant::timeline::{ImageLayerCreationOutcome, IoConcurrency}; -use crate::tenant::timeline::{Layer, ResidentLayer}; -use crate::tenant::{gc_block, DeltaLayer, MaybeOffloaded}; +use crate::tenant::timeline::{ + DeltaLayerWriter, ImageLayerCreationOutcome, ImageLayerWriter, IoConcurrency, Layer, + ResidentLayer, drop_rlock, +}; +use crate::tenant::{DeltaLayer, MaybeOffloaded, gc_block}; use crate::virtual_file::{MaybeFatalIo, VirtualFile}; -use pageserver_api::config::tenant_conf_defaults::DEFAULT_CHECKPOINT_DISTANCE; - -use pageserver_api::key::Key; -use pageserver_api::keyspace::KeySpace; -use pageserver_api::record::NeonWalRecord; -use pageserver_api::value::Value; - -use utils::lsn::Lsn; - -use pageserver_compaction::helpers::{fully_contains, overlaps_with}; -use pageserver_compaction::interface::*; - -use super::CompactionError; /// Maximum number of deltas before generating an image layer in bottom-most compaction. const COMPACTION_DELTA_THRESHOLD: usize = 5; @@ -77,13 +73,22 @@ impl std::fmt::Display for GcCompactionJobId { } } +pub struct GcCompactionCombinedSettings { + pub gc_compaction_enabled: bool, + pub gc_compaction_initial_threshold_kb: u64, + pub gc_compaction_ratio_percent: u64, +} + #[derive(Debug, Clone)] pub enum GcCompactionQueueItem { - Manual(CompactOptions), + MetaJob { + /// Compaction options + options: CompactOptions, + /// Whether the compaction is triggered automatically (determines whether we need to update L2 LSN) + auto: bool, + }, SubCompactionJob(CompactOptions), - #[allow(dead_code)] - UpdateL2Lsn(Lsn), - Notify(GcCompactionJobId), + Notify(GcCompactionJobId, Option), } impl GcCompactionQueueItem { @@ -93,7 +98,7 @@ impl GcCompactionQueueItem { running: bool, ) -> Option { match self { - GcCompactionQueueItem::Manual(options) => Some(CompactInfoResponse { + GcCompactionQueueItem::MetaJob { options, .. } => Some(CompactInfoResponse { compact_key_range: options.compact_key_range, compact_lsn_range: options.compact_lsn_range, sub_compaction: options.sub_compaction, @@ -107,17 +112,22 @@ impl GcCompactionQueueItem { running, job_id: id.0, }), - GcCompactionQueueItem::UpdateL2Lsn(_) => None, - GcCompactionQueueItem::Notify(_) => None, + GcCompactionQueueItem::Notify(_, _) => None, } } } +#[derive(Default)] +struct GcCompactionGuardItems { + notify: Option>, + gc_guard: Option, + permit: Option, +} + struct GcCompactionQueueInner { running: Option<(GcCompactionJobId, GcCompactionQueueItem)>, queued: VecDeque<(GcCompactionJobId, GcCompactionQueueItem)>, - notify: HashMap>, - gc_guards: HashMap, + guards: HashMap, last_id: GcCompactionJobId, } @@ -137,14 +147,18 @@ pub struct GcCompactionQueue { consumer_lock: tokio::sync::Mutex<()>, } +static CONCURRENT_GC_COMPACTION_TASKS: Lazy> = Lazy::new(|| { + // Only allow two timelines on one pageserver to run gc compaction at a time. + Arc::new(Semaphore::new(2)) +}); + impl GcCompactionQueue { pub fn new() -> Self { GcCompactionQueue { inner: std::sync::Mutex::new(GcCompactionQueueInner { running: None, queued: VecDeque::new(), - notify: HashMap::new(), - gc_guards: HashMap::new(), + guards: HashMap::new(), last_id: GcCompactionJobId(0), }), consumer_lock: tokio::sync::Mutex::new(()), @@ -154,8 +168,9 @@ impl GcCompactionQueue { pub fn cancel_scheduled(&self) { let mut guard = self.inner.lock().unwrap(); guard.queued.clear(); - guard.notify.clear(); - guard.gc_guards.clear(); + // TODO: if there is a running job, we should keep the gc guard. However, currently, the cancel + // API is only used for testing purposes, so we can drop everything here. + guard.guards.clear(); } /// Schedule a manual compaction job. @@ -166,29 +181,163 @@ impl GcCompactionQueue { ) -> GcCompactionJobId { let mut guard = self.inner.lock().unwrap(); let id = guard.next_id(); - guard - .queued - .push_back((id, GcCompactionQueueItem::Manual(options))); - if let Some(notify) = notify { - guard.notify.insert(id, notify); - } + guard.queued.push_back(( + id, + GcCompactionQueueItem::MetaJob { + options, + auto: false, + }, + )); + guard.guards.entry(id).or_default().notify = notify; info!("scheduled compaction job id={}", id); id } + /// Schedule an auto compaction job. + fn schedule_auto_compaction( + &self, + options: CompactOptions, + permit: OwnedSemaphorePermit, + ) -> GcCompactionJobId { + let mut guard = self.inner.lock().unwrap(); + let id = guard.next_id(); + guard.queued.push_back(( + id, + GcCompactionQueueItem::MetaJob { + options, + auto: true, + }, + )); + guard.guards.entry(id).or_default().permit = Some(permit); + id + } + /// Trigger an auto compaction. - #[allow(dead_code)] - pub fn trigger_auto_compaction(&self, _: &Arc) {} + pub async fn trigger_auto_compaction( + &self, + timeline: &Arc, + ) -> Result<(), CompactionError> { + let GcCompactionCombinedSettings { + gc_compaction_enabled, + gc_compaction_initial_threshold_kb, + gc_compaction_ratio_percent, + } = timeline.get_gc_compaction_settings(); + if !gc_compaction_enabled { + return Ok(()); + } + if self.remaining_jobs_num() > 0 { + // Only schedule auto compaction when the queue is empty + return Ok(()); + } + if timeline.ancestor_timeline().is_some() { + // Do not trigger auto compaction for child timelines. We haven't tested + // it enough in staging yet. + return Ok(()); + } + + let Ok(permit) = CONCURRENT_GC_COMPACTION_TASKS.clone().try_acquire_owned() else { + // Only allow one compaction run at a time. TODO: As we do `try_acquire_owned`, we cannot ensure + // the fairness of the lock across timelines. We should listen for both `acquire` and `l0_compaction_trigger` + // to ensure the fairness while avoid starving other tasks. + return Ok(()); + }; + + let gc_compaction_state = timeline.get_gc_compaction_state(); + let l2_lsn = gc_compaction_state + .map(|x| x.last_completed_lsn) + .unwrap_or(Lsn::INVALID); + + let layers = { + let guard = timeline.layers.read().await; + let layer_map = guard.layer_map()?; + layer_map.iter_historic_layers().collect_vec() + }; + let mut l2_size: u64 = 0; + let mut l1_size = 0; + let gc_cutoff = *timeline.get_applied_gc_cutoff_lsn(); + for layer in layers { + if layer.lsn_range.start <= l2_lsn { + l2_size += layer.file_size(); + } else if layer.lsn_range.start <= gc_cutoff { + l1_size += layer.file_size(); + } + } + + fn trigger_compaction( + l1_size: u64, + l2_size: u64, + gc_compaction_initial_threshold_kb: u64, + gc_compaction_ratio_percent: u64, + ) -> bool { + const AUTO_TRIGGER_LIMIT: u64 = 150 * 1024 * 1024 * 1024; // 150GB + if l1_size >= AUTO_TRIGGER_LIMIT || l2_size >= AUTO_TRIGGER_LIMIT { + // Do not auto-trigger when physical size >= 150GB + return false; + } + // initial trigger + if l2_size == 0 && l1_size >= gc_compaction_initial_threshold_kb * 1024 { + info!( + "trigger auto-compaction because l1_size={} >= gc_compaction_initial_threshold_kb={}", + l1_size, gc_compaction_initial_threshold_kb + ); + return true; + } + // size ratio trigger + if l2_size == 0 { + return false; + } + if l1_size as f64 / l2_size as f64 >= (gc_compaction_ratio_percent as f64 / 100.0) { + info!( + "trigger auto-compaction because l1_size={} / l2_size={} > gc_compaction_ratio_percent={}", + l1_size, l2_size, gc_compaction_ratio_percent + ); + return true; + } + false + } + + if trigger_compaction( + l1_size, + l2_size, + gc_compaction_initial_threshold_kb, + gc_compaction_ratio_percent, + ) { + self.schedule_auto_compaction( + CompactOptions { + flags: { + let mut flags = EnumSet::new(); + flags |= CompactFlags::EnhancedGcBottomMostCompaction; + flags + }, + sub_compaction: true, + compact_key_range: None, + compact_lsn_range: None, + sub_compaction_max_job_size_mb: None, + }, + permit, + ); + info!( + "scheduled auto gc-compaction: l1_size={}, l2_size={}, l2_lsn={}, gc_cutoff={}", + l1_size, l2_size, l2_lsn, gc_cutoff + ); + } else { + info!( + "did not trigger auto gc-compaction: l1_size={}, l2_size={}, l2_lsn={}, gc_cutoff={}", + l1_size, l2_size, l2_lsn, gc_cutoff + ); + } + Ok(()) + } /// Notify the caller the job has finished and unblock GC. fn notify_and_unblock(&self, id: GcCompactionJobId) { info!("compaction job id={} finished", id); let mut guard = self.inner.lock().unwrap(); - if let Some(blocking) = guard.gc_guards.remove(&id) { - drop(blocking) - } - if let Some(tx) = guard.notify.remove(&id) { - let _ = tx.send(()); + if let Some(items) = guard.guards.remove(&id) { + drop(items.gc_guard); + if let Some(tx) = items.notify { + let _ = tx.send(()); + } } } @@ -198,9 +347,12 @@ impl GcCompactionQueue { options: CompactOptions, timeline: &Arc, gc_block: &GcBlock, + auto: bool, ) -> Result<(), CompactionError> { - info!("running scheduled enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs"); - let jobs: Vec = timeline + info!( + "running scheduled enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs" + ); + let jobs = timeline .gc_compaction_split_jobs( GcCompactJob::from_compact_options(options.clone()), options.sub_compaction_max_job_size_mb, @@ -223,6 +375,9 @@ impl GcCompactionQueue { let jobs_len = jobs.len(); let mut pending_tasks = Vec::new(); + // gc-compaction might pick more layers or fewer layers to compact. The L2 LSN does not need to be accurate. + // And therefore, we simply assume the maximum LSN of all jobs is the expected L2 LSN. + let expected_l2_lsn = jobs.iter().map(|job| job.compact_lsn_range.end).max(); for job in jobs { // Unfortunately we need to convert the `GcCompactJob` back to `CompactionOptions` // until we do further refactors to allow directly call `compact_with_gc`. @@ -240,10 +395,16 @@ impl GcCompactionQueue { }; pending_tasks.push(GcCompactionQueueItem::SubCompactionJob(options)); } - pending_tasks.push(GcCompactionQueueItem::Notify(id)); + + if !auto { + pending_tasks.push(GcCompactionQueueItem::Notify(id, None)); + } else { + pending_tasks.push(GcCompactionQueueItem::Notify(id, expected_l2_lsn)); + } + { let mut guard = self.inner.lock().unwrap(); - guard.gc_guards.insert(id, gc_guard); + guard.guards.entry(id).or_default().gc_guard = Some(gc_guard); let mut tasks = Vec::new(); for task in pending_tasks { let id = guard.next_id(); @@ -254,7 +415,10 @@ impl GcCompactionQueue { guard.queued.push_front(item); } } - info!("scheduled enhanced gc bottom-most compaction with sub-compaction, split into {} jobs", jobs_len); + info!( + "scheduled enhanced gc bottom-most compaction with sub-compaction, split into {} jobs", + jobs_len + ); } Ok(()) } @@ -267,29 +431,48 @@ impl GcCompactionQueue { gc_block: &GcBlock, timeline: &Arc, ) -> Result { - let _one_op_at_a_time_guard = self.consumer_lock.lock().await; - let has_pending_tasks; - let (id, item) = { - let mut guard = self.inner.lock().unwrap(); - let Some((id, item)) = guard.queued.pop_front() else { - return Ok(CompactionOutcome::Done); - }; - guard.running = Some((id, item.clone())); - has_pending_tasks = !guard.queued.is_empty(); - (id, item) + let Ok(_one_op_at_a_time_guard) = self.consumer_lock.try_lock() else { + return Err(CompactionError::AlreadyRunning( + "cannot run gc-compaction because another gc-compaction is running. This should not happen because we only call this function from the gc-compaction queue.", + )); + }; + let has_pending_tasks; + let Some((id, item)) = ({ + let mut guard = self.inner.lock().unwrap(); + if let Some((id, item)) = guard.queued.pop_front() { + guard.running = Some((id, item.clone())); + has_pending_tasks = !guard.queued.is_empty(); + Some((id, item)) + } else { + has_pending_tasks = false; + None + } + }) else { + self.trigger_auto_compaction(timeline).await?; + // Always yield after triggering auto-compaction. Gc-compaction is a low-priority task and we + // have not implemented preemption mechanism yet. We always want to yield it to more important + // tasks if there is one. + return Ok(CompactionOutcome::Done); }; - match item { - GcCompactionQueueItem::Manual(options) => { + GcCompactionQueueItem::MetaJob { options, auto } => { if !options .flags .contains(CompactFlags::EnhancedGcBottomMostCompaction) { - warn!("ignoring scheduled compaction task: scheduled task must be gc compaction: {:?}", options); + warn!( + "ignoring scheduled compaction task: scheduled task must be gc compaction: {:?}", + options + ); } else if options.sub_compaction { - self.handle_sub_compaction(id, options, timeline, gc_block) + info!( + "running scheduled enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs" + ); + self.handle_sub_compaction(id, options, timeline, gc_block, auto) .await?; } else { + // Auto compaction always enables sub-compaction so we don't need to handle update_l2_lsn + // in this branch. let gc_guard = match gc_block.start().await { Ok(guard) => guard, Err(e) => { @@ -301,20 +484,37 @@ impl GcCompactionQueue { }; { let mut guard = self.inner.lock().unwrap(); - guard.gc_guards.insert(id, gc_guard); + guard.guards.entry(id).or_default().gc_guard = Some(gc_guard); } let _ = timeline.compact_with_options(cancel, options, ctx).await?; self.notify_and_unblock(id); } } GcCompactionQueueItem::SubCompactionJob(options) => { + // TODO: error handling, clear the queue if any task fails? let _ = timeline.compact_with_options(cancel, options, ctx).await?; } - GcCompactionQueueItem::Notify(id) => { + GcCompactionQueueItem::Notify(id, l2_lsn) => { self.notify_and_unblock(id); - } - GcCompactionQueueItem::UpdateL2Lsn(_) => { - unreachable!() + if let Some(l2_lsn) = l2_lsn { + let current_l2_lsn = timeline + .get_gc_compaction_state() + .map(|x| x.last_completed_lsn) + .unwrap_or(Lsn::INVALID); + if l2_lsn >= current_l2_lsn { + info!("l2_lsn updated to {}", l2_lsn); + timeline + .update_gc_compaction_state(GcCompactionState { + last_completed_lsn: l2_lsn, + }) + .map_err(CompactionError::Other)?; + } else { + warn!( + "l2_lsn updated to {} but it is less than the current l2_lsn {}", + l2_lsn, current_l2_lsn + ); + } + } } } { @@ -339,7 +539,6 @@ impl GcCompactionQueue { (guard.running.clone(), guard.queued.clone()) } - #[allow(dead_code)] pub fn remaining_jobs_num(&self) -> usize { let guard = self.inner.lock().unwrap(); guard.queued.len() + if guard.running.is_some() { 1 } else { 0 } @@ -771,24 +970,21 @@ impl Timeline { self.upload_new_image_layers(image_layers)?; if let LastImageLayerCreationStatus::Incomplete { .. } = outcome { // Yield and do not do any other kind of compaction. - info!("skipping shard ancestor compaction due to pending image layer generation tasks (preempted by L0 compaction)."); + info!( + "skipping shard ancestor compaction due to pending image layer generation tasks (preempted by L0 compaction)." + ); return Ok(CompactionOutcome::YieldForL0); } } // Suppress errors when cancelled. Err(_) if self.cancel.is_cancelled() => {} - Err(CompactionError::ShuttingDown) => {} + Err(err) if err.is_cancel() => {} // Alert on critical errors that indicate data corruption. - Err( - err @ CompactionError::CollectKeySpaceError( - CollectKeySpaceError::Decode(_) - | CollectKeySpaceError::PageRead( - PageReconstructError::MissingKey(_) | PageReconstructError::WalRedo(_), - ), - ), - ) => critical!("could not compact, repartitioning keyspace failed: {err:?}"), + Err(err) if err.is_critical() => { + critical!("could not compact, repartitioning keyspace failed: {err:?}"); + } // Log other errors. No partitioning? This is normal, if the timeline was just created // as an empty timeline. Also in unit tests, when we use the timeline as a simple @@ -796,7 +992,7 @@ impl Timeline { Err(err) => error!("could not compact, repartitioning keyspace failed: {err:?}"), }; - let partition_count = self.partitioning.read().0 .0.parts.len(); + let partition_count = self.partitioning.read().0.0.parts.len(); // 4. Shard ancestor compaction @@ -961,7 +1157,7 @@ impl Timeline { // - We do not run concurrently with other kinds of compaction, so the only layer map writes we race with are: // - GC, which at worst witnesses us "undelete" a layer that they just deleted. // - ingestion, which only inserts layers, therefore cannot collide with us. - let resident = layer.download_and_keep_resident().await?; + let resident = layer.download_and_keep_resident(ctx).await?; let keys_written = resident .filter(&self.shard_identity, &mut image_layer_writer, ctx) @@ -1005,7 +1201,7 @@ impl Timeline { Ok(()) => (), Err(WaitCompletionError::NotInitialized(ni)) => return Err(CompactionError::from(ni)), Err(WaitCompletionError::UploadQueueShutDownOrStopped) => { - return Err(CompactionError::ShuttingDown) + return Err(CompactionError::ShuttingDown); } } @@ -1189,14 +1385,14 @@ impl Timeline { let mut fully_compacted = true; - deltas_to_compact.push(first_level0_delta.download_and_keep_resident().await?); + deltas_to_compact.push(first_level0_delta.download_and_keep_resident(ctx).await?); for l in level0_deltas_iter { let lsn_range = &l.layer_desc().lsn_range; if lsn_range.start != prev_lsn_end { break; } - deltas_to_compact.push(l.download_and_keep_resident().await?); + deltas_to_compact.push(l.download_and_keep_resident(ctx).await?); deltas_to_compact_bytes += l.metadata().file_size; prev_lsn_end = lsn_range.end; @@ -1300,7 +1496,7 @@ impl Timeline { let last_record_lsn = self.get_last_record_lsn(); let min_hole_range = (target_file_size / page_cache::PAGE_SZ as u64) as i128; let min_hole_coverage_size = 3; // TODO: something more flexible? - // min-heap (reserve space for one more element added before eviction) + // min-heap (reserve space for one more element added before eviction) let mut heap: BinaryHeap = BinaryHeap::with_capacity(max_holes + 1); let mut prev: Option = None; @@ -2163,8 +2359,14 @@ impl Timeline { let allocated_space = (available_space as f64 * 0.8) as u64; /* reserve 20% space for other tasks */ if all_layer_size /* space needed for newly-generated file */ + remote_layer_size /* space for downloading layers */ > allocated_space { - return Err(anyhow!("not enough space for compaction: available_space={}, allocated_space={}, all_layer_size={}, remote_layer_size={}, required_space={}", - available_space, allocated_space, all_layer_size, remote_layer_size, all_layer_size + remote_layer_size)); + return Err(anyhow!( + "not enough space for compaction: available_space={}, allocated_space={}, all_layer_size={}, remote_layer_size={}, required_space={}", + available_space, + allocated_space, + all_layer_size, + remote_layer_size, + all_layer_size + remote_layer_size + )); } Ok(()) } @@ -2203,7 +2405,9 @@ impl Timeline { }; if compact_below_lsn == Lsn::INVALID { - tracing::warn!("no layers to compact with gc: gc_cutoff not generated yet, skipping gc bottom-most compaction"); + tracing::warn!( + "no layers to compact with gc: gc_cutoff not generated yet, skipping gc bottom-most compaction" + ); return Ok(vec![]); } @@ -2348,7 +2552,9 @@ impl Timeline { let sub_compaction = options.sub_compaction; let job = GcCompactJob::from_compact_options(options.clone()); if sub_compaction { - info!("running enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs"); + info!( + "running enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs" + ); let jobs = self .gc_compaction_split_jobs(job, options.sub_compaction_max_job_size_mb) .await?; @@ -2400,7 +2606,13 @@ impl Timeline { let debug_mode = cfg!(debug_assertions) || cfg!(feature = "testing"); - info!("running enhanced gc bottom-most compaction, dry_run={dry_run}, compact_key_range={}..{}, compact_lsn_range={}..{}", compact_key_range.start, compact_key_range.end, compact_lsn_range.start, compact_lsn_range.end); + info!( + "running enhanced gc bottom-most compaction, dry_run={dry_run}, compact_key_range={}..{}, compact_lsn_range={}..{}", + compact_key_range.start, + compact_key_range.end, + compact_lsn_range.start, + compact_lsn_range.end + ); scopeguard::defer! { info!("done enhanced gc bottom-most compaction"); @@ -2429,7 +2641,9 @@ impl Timeline { let mut gc_cutoff = if compact_lsn_range.end == Lsn::MAX { if real_gc_cutoff == Lsn::INVALID { // If the gc_cutoff is not generated yet, we should not compact anything. - tracing::warn!("no layers to compact with gc: gc_cutoff not generated yet, skipping gc bottom-most compaction"); + tracing::warn!( + "no layers to compact with gc: gc_cutoff not generated yet, skipping gc bottom-most compaction" + ); return Ok(()); } real_gc_cutoff @@ -2437,7 +2651,10 @@ impl Timeline { compact_lsn_range.end }; if gc_cutoff > real_gc_cutoff { - warn!("provided compact_lsn_range.end={} is larger than the real_gc_cutoff={}, using the real gc cutoff", gc_cutoff, real_gc_cutoff); + warn!( + "provided compact_lsn_range.end={} is larger than the real_gc_cutoff={}, using the real gc cutoff", + gc_cutoff, real_gc_cutoff + ); gc_cutoff = real_gc_cutoff; } gc_cutoff @@ -2461,7 +2678,10 @@ impl Timeline { .map(|desc| desc.get_lsn_range().end) .max() else { - info!("no layers to compact with gc: no historic layers below gc_cutoff, gc_cutoff={}", gc_cutoff); + info!( + "no layers to compact with gc: no historic layers below gc_cutoff, gc_cutoff={}", + gc_cutoff + ); return Ok(()); }; // Next, if the user specifies compact_lsn_range.start, we need to filter some layers out. All the layers (strictly) below @@ -2479,7 +2699,10 @@ impl Timeline { .map(|desc| desc.get_lsn_range().start) .min() else { - info!("no layers to compact with gc: no historic layers above compact_above_lsn, compact_above_lsn={}", compact_lsn_range.end); + info!( + "no layers to compact with gc: no historic layers above compact_above_lsn, compact_above_lsn={}", + compact_lsn_range.end + ); return Ok(()); }; // Then, pick all the layers that are below the max_layer_lsn. This is to ensure we can pick all single-key @@ -2502,7 +2725,10 @@ impl Timeline { } } if selected_layers.is_empty() { - info!("no layers to compact with gc: no layers within the key range, gc_cutoff={}, key_range={}..{}", gc_cutoff, compact_key_range.start, compact_key_range.end); + info!( + "no layers to compact with gc: no layers within the key range, gc_cutoff={}, key_range={}..{}", + gc_cutoff, compact_key_range.start, compact_key_range.end + ); return Ok(()); } retain_lsns_below_horizon.sort(); @@ -2584,7 +2810,10 @@ impl Timeline { .map(|layer| layer.layer_desc().layer_name()) .collect_vec(); if let Some(err) = check_valid_layermap(&layer_names) { - bail!("gc-compaction layer map check failed because {}, cannot proceed with compaction due to potential data loss", err); + bail!( + "gc-compaction layer map check failed because {}, cannot proceed with compaction due to potential data loss", + err + ); } // The maximum LSN we are processing in this compaction loop let end_lsn = job_desc @@ -2603,7 +2832,7 @@ impl Timeline { total_downloaded_size += layer.layer_desc().file_size; } total_layer_size += layer.layer_desc().file_size; - let resident_layer = layer.download_and_keep_resident().await?; + let resident_layer = layer.download_and_keep_resident(ctx).await?; downloaded_layers.push(resident_layer); } info!( @@ -2991,7 +3220,10 @@ impl Timeline { // the writer, so potentially, we will need a function like `ImageLayerBatchWriter::get_all_pending_layer_keys` to get all the keys that are // in the writer before finalizing the persistent layers. Now we would leave some dangling layers on the disk if the check fails. if let Some(err) = check_valid_layermap(&final_layers) { - bail!("gc-compaction layer map check failed after compaction because {}, compaction result not applied to the layer map due to potential data loss", err); + bail!( + "gc-compaction layer map check failed after compaction because {}, compaction result not applied to the layer map due to potential data loss", + err + ); } // Between the sanity check and this compaction update, there could be new layers being flushed, but it should be fine because we only @@ -3056,7 +3288,8 @@ impl Timeline { if let Some(to) = compact_to_set.get(&layer.layer_desc().key()) { tracing::info!( "skipping delete {} because found same layer key at different generation {}", - layer, to + layer, + to ); } else { compact_from.push(layer.clone()); @@ -3175,6 +3408,7 @@ impl CompactionJobExecutor for TimelineAdaptor { async fn downcast_delta_layer( &self, layer: &OwnArc, + ctx: &RequestContext, ) -> anyhow::Result> { // this is a lot more complex than a simple downcast... if layer.is_delta() { @@ -3182,7 +3416,7 @@ impl CompactionJobExecutor for TimelineAdaptor { let guard = self.timeline.layers.read().await; guard.get_from_desc(layer) }; - let result = l.download_and_keep_resident().await?; + let result = l.download_and_keep_resident(ctx).await?; Ok(Some(ResidentDeltaLayer(result))) } else { diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 841b2fa1c7..7cdc69e55f 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -1,26 +1,26 @@ -use std::{ - ops::{Deref, DerefMut}, - sync::Arc, -}; +use std::ops::{Deref, DerefMut}; +use std::sync::Arc; use anyhow::Context; -use pageserver_api::{models::TimelineState, shard::TenantShardId}; +use pageserver_api::models::TimelineState; +use pageserver_api::shard::TenantShardId; use remote_storage::DownloadError; use tokio::sync::OwnedMutexGuard; -use tracing::{error, info, info_span, instrument, Instrument}; -use utils::{crashsafe, fs_ext, id::TimelineId, pausable_failpoint}; +use tracing::{Instrument, error, info, info_span, instrument}; +use utils::id::TimelineId; +use utils::{crashsafe, fs_ext, pausable_failpoint}; -use crate::{ - config::PageServerConf, - task_mgr::{self, TaskKind}, - tenant::{ - metadata::TimelineMetadata, - remote_timeline_client::{PersistIndexPartWithDeletedFlagError, RemoteTimelineClient}, - CreateTimelineCause, DeleteTimelineError, MaybeDeletedIndexPart, Tenant, - TenantManifestError, Timeline, TimelineOrOffloaded, - }, - virtual_file::MaybeFatalIo, +use crate::config::PageServerConf; +use crate::task_mgr::{self, TaskKind}; +use crate::tenant::metadata::TimelineMetadata; +use crate::tenant::remote_timeline_client::{ + PersistIndexPartWithDeletedFlagError, RemoteTimelineClient, }; +use crate::tenant::{ + CreateTimelineCause, DeleteTimelineError, MaybeDeletedIndexPart, Tenant, TenantManifestError, + Timeline, TimelineOrOffloaded, +}; +use crate::virtual_file::MaybeFatalIo; /// Mark timeline as deleted in S3 so we won't pick it up next time /// during attach or pageserver restart. @@ -137,6 +137,11 @@ async fn remove_maybe_offloaded_timeline_from_tenant( timelines.remove(&timeline.timeline_id).expect( "timeline that we were deleting was concurrently removed from 'timelines' map", ); + tenant + .scheduled_compaction_tasks + .lock() + .unwrap() + .remove(&timeline.timeline_id); } TimelineOrOffloaded::Offloaded(timeline) => { let offloaded_timeline = timelines_offloaded @@ -300,6 +305,7 @@ impl DeleteTimelineFlow { // Thus we need to skip the validation here. CreateTimelineCause::Delete, crate::tenant::CreateTimelineIdempotency::FailWithConflict, // doesn't matter what we put here + None, // doesn't matter what we put here ) .context("create_timeline_struct")?; diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index e0084d3eef..b08003d04a 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -1,25 +1,28 @@ -use std::{collections::HashSet, sync::Arc}; +use std::collections::HashSet; +use std::sync::Arc; -use super::{layer_manager::LayerManager, FlushLayerError, Timeline}; -use crate::{ - context::{DownloadBehavior, RequestContext}, - task_mgr::TaskKind, - tenant::{ - remote_timeline_client::index::GcBlockingReason::DetachAncestor, - storage_layer::{ - layer::local_layer_path, AsLayerDesc as _, DeltaLayerWriter, Layer, ResidentLayer, - }, - Tenant, - }, - virtual_file::{MaybeFatalIo, VirtualFile}, -}; use anyhow::Context; use http_utils::error::ApiError; -use pageserver_api::{models::detach_ancestor::AncestorDetached, shard::ShardIdentity}; +use pageserver_api::models::detach_ancestor::AncestorDetached; +use pageserver_api::shard::ShardIdentity; use tokio::sync::Semaphore; use tokio_util::sync::CancellationToken; use tracing::Instrument; -use utils::{completion, generation::Generation, id::TimelineId, lsn::Lsn}; +use utils::completion; +use utils::generation::Generation; +use utils::id::TimelineId; +use utils::lsn::Lsn; +use utils::sync::gate::GateError; + +use super::layer_manager::LayerManager; +use super::{FlushLayerError, Timeline}; +use crate::context::{DownloadBehavior, RequestContext}; +use crate::task_mgr::TaskKind; +use crate::tenant::Tenant; +use crate::tenant::remote_timeline_client::index::GcBlockingReason::DetachAncestor; +use crate::tenant::storage_layer::layer::local_layer_path; +use crate::tenant::storage_layer::{AsLayerDesc as _, DeltaLayerWriter, Layer, ResidentLayer}; +use crate::virtual_file::{MaybeFatalIo, VirtualFile}; #[derive(Debug, thiserror::Error)] pub(crate) enum Error { @@ -64,9 +67,10 @@ impl Error { where F: Fn(anyhow::Error) -> Error, { + use remote_storage::TimeoutOrCancel; + use crate::tenant::remote_timeline_client::WaitCompletionError; use crate::tenant::upload_queue::NotInitialized; - use remote_storage::TimeoutOrCancel; if e.is::() || TimeoutOrCancel::caused_by_cancel(&e) @@ -360,14 +364,25 @@ pub(super) async fn prepare( let mut tasks = tokio::task::JoinSet::new(); let limiter = Arc::new(Semaphore::new(options.copy_concurrency.get())); + let cancel_eval = CancellationToken::new(); for adopted in rest_of_historic { let limiter = limiter.clone(); let timeline = detached.clone(); + let cancel_eval = cancel_eval.clone(); tasks.spawn( async move { - let _permit = limiter.acquire().await; + let _permit = tokio::select! { + permit = limiter.acquire() => { + permit + } + // Wait for the cancellation here instead of letting the entire task be cancelled. + // Cancellations are racy in that they might leave layers on disk. + _ = cancel_eval.cancelled() => { + Err(Error::ShuttingDown)? + } + }; let (owned, did_hardlink) = remote_copy( &adopted, &timeline, @@ -383,7 +398,22 @@ pub(super) async fn prepare( ); } + fn delete_layers(timeline: &Timeline, layers: Vec) -> Result<(), Error> { + // We are deleting layers, so we must hold the gate + let _gate = timeline.gate.enter().map_err(|e| match e { + GateError::GateClosed => Error::ShuttingDown, + })?; + { + layers.into_iter().for_each(|l: Layer| { + l.delete_on_drop(); + std::mem::drop(l); + }); + } + Ok(()) + } + let mut should_fsync = false; + let mut first_err = None; while let Some(res) = tasks.join_next().await { match res { Ok(Ok((owned, did_hardlink))) => { @@ -392,13 +422,24 @@ pub(super) async fn prepare( } new_layers.push(owned); } + + // Don't stop the evaluation on errors, so that we get the full set of hardlinked layers to delete. Ok(Err(failed)) => { - return Err(failed); + cancel_eval.cancel(); + first_err.get_or_insert(failed); + } + Err(je) => { + cancel_eval.cancel(); + first_err.get_or_insert(Error::Prepare(je.into())); } - Err(je) => return Err(Error::Prepare(je.into())), } } + if let Some(failed) = first_err { + delete_layers(detached, new_layers)?; + return Err(failed); + } + // fsync directory again if we hardlinked something if should_fsync { fsync_timeline_dir(detached, ctx).await; @@ -588,7 +629,7 @@ async fn copy_lsn_prefix( .with_context(|| format!("prepare to copy lsn prefix of ancestors {layer}")) .map_err(Error::Prepare)?; - let resident = layer.download_and_keep_resident().await.map_err(|e| { + let resident = layer.download_and_keep_resident(ctx).await.map_err(|e| { if e.is_cancelled() { Error::ShuttingDown } else { @@ -646,6 +687,11 @@ async fn remote_copy( let conf = adoptee.conf; let file_name = adopted.layer_desc().layer_name(); + // We don't want to shut the timeline down during this operation because we do `delete_on_drop` below + let _gate = adoptee.gate.enter().map_err(|e| match e { + GateError::GateClosed => Error::ShuttingDown, + })?; + // depending if Layer::keep_resident, do a hardlink let did_hardlink; let owned = if let Some(adopted_resident) = adopted.keep_resident().await { @@ -657,8 +703,32 @@ async fn remote_copy( &file_name, &metadata.generation, ); - std::fs::hard_link(adopted_path, &adoptee_path) - .map_err(|e| Error::launder(e.into(), Error::Prepare))?; + + match std::fs::hard_link(adopted_path, &adoptee_path) { + Ok(()) => {} + Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => { + // In theory we should not get into this situation as we are doing cleanups of the layer file after errors. + // However, we don't do cleanups for errors past `prepare`, so there is the slight chance to get to this branch. + + // Double check that the file is orphan (probably from an earlier attempt), then delete it + let key = file_name.clone().into(); + if adoptee.layers.read().await.contains_key(&key) { + // We are supposed to filter out such cases before coming to this function + return Err(Error::Prepare(anyhow::anyhow!( + "layer file {file_name} already present and inside layer map" + ))); + } + tracing::info!("Deleting orphan layer file to make way for hard linking"); + // Delete orphan layer file and try again, to ensure this layer has a well understood source + std::fs::remove_file(adopted_path) + .map_err(|e| Error::launder(e.into(), Error::Prepare))?; + std::fs::hard_link(adopted_path, &adoptee_path) + .map_err(|e| Error::launder(e.into(), Error::Prepare))?; + } + Err(e) => { + return Err(Error::launder(e.into(), Error::Prepare)); + } + }; did_hardlink = true; Layer::for_resident(conf, adoptee, adoptee_path, file_name, metadata).drop_eviction_guard() } else { @@ -666,12 +736,21 @@ async fn remote_copy( Layer::for_evicted(conf, adoptee, file_name, metadata) }; - let layer = adoptee + let layer = match adoptee .remote_client .copy_timeline_layer(adopted, &owned, cancel) .await - .map(move |()| owned) - .map_err(|e| Error::launder(e, Error::Prepare))?; + { + Ok(()) => owned, + Err(e) => { + { + // Clean up the layer so that on a retry we don't get errors that the file already exists + owned.delete_on_drop(); + std::mem::drop(owned); + } + return Err(Error::launder(e, Error::Prepare)); + } + }; Ok((layer, did_hardlink)) } @@ -780,7 +859,7 @@ pub(super) async fn detach_and_reparent( // TODO: make sure there are no `?` before tenant_reset from after a questionmark from // here. panic!( - "bug: detach_and_reparent called on a timeline which has not been detached or which has no live ancestor" + "bug: detach_and_reparent called on a timeline which has not been detached or which has no live ancestor" ); } }; diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 77c33349e0..187d9f248e 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -13,34 +13,27 @@ //! Items with parentheses are not (yet) touched by this task. //! //! See write-up on restart on-demand download spike: -use std::{ - collections::HashMap, - ops::ControlFlow, - sync::Arc, - time::{Duration, SystemTime}, -}; +use std::collections::HashMap; +use std::ops::ControlFlow; +use std::sync::Arc; +use std::time::{Duration, SystemTime}; use pageserver_api::models::{EvictionPolicy, EvictionPolicyLayerAccessThreshold}; use tokio::time::Instant; use tokio_util::sync::CancellationToken; -use tracing::{debug, info, info_span, instrument, warn, Instrument}; - -use crate::{ - context::{DownloadBehavior, RequestContext}, - pgdatadir_mapping::CollectKeySpaceError, - task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}, - tenant::{ - size::CalculateSyntheticSizeError, - storage_layer::LayerVisibilityHint, - tasks::{sleep_random, BackgroundLoopKind, BackgroundLoopSemaphorePermit}, - timeline::EvictionError, - LogicalSizeCalculationCause, Tenant, - }, -}; - -use utils::{completion, sync::gate::GateGuard}; +use tracing::{Instrument, debug, info, info_span, instrument, warn}; +use utils::completion; +use utils::sync::gate::GateGuard; use super::Timeline; +use crate::context::{DownloadBehavior, RequestContext}; +use crate::pgdatadir_mapping::CollectKeySpaceError; +use crate::task_mgr::{self, BACKGROUND_RUNTIME, TaskKind}; +use crate::tenant::size::CalculateSyntheticSizeError; +use crate::tenant::storage_layer::LayerVisibilityHint; +use crate::tenant::tasks::{BackgroundLoopKind, BackgroundLoopSemaphorePermit, sleep_random}; +use crate::tenant::timeline::EvictionError; +use crate::tenant::{LogicalSizeCalculationCause, Tenant}; #[derive(Default)] pub struct EvictionTaskTimelineState { diff --git a/pageserver/src/tenant/timeline/handle.rs b/pageserver/src/tenant/timeline/handle.rs index 5b39daaaf8..67fb89c433 100644 --- a/pageserver/src/tenant/timeline/handle.rs +++ b/pageserver/src/tenant/timeline/handle.rs @@ -202,18 +202,13 @@ //! to the parent shard during a shard split. Eventually, the shard split task will //! shut down the parent => case (1). -use std::collections::hash_map; -use std::collections::HashMap; -use std::sync::Arc; -use std::sync::Mutex; -use std::sync::Weak; +use std::collections::{HashMap, hash_map}; +use std::sync::{Arc, Mutex, Weak}; use pageserver_api::shard::ShardIdentity; -use tracing::instrument; -use tracing::trace; +use tracing::{instrument, trace}; use utils::id::TimelineId; -use utils::shard::ShardIndex; -use utils::shard::ShardNumber; +use utils::shard::{ShardIndex, ShardNumber}; use crate::tenant::mgr::ShardSelector; @@ -631,12 +626,10 @@ impl HandleInner { mod tests { use std::sync::Weak; - use pageserver_api::{ - key::{rel_block_to_key, Key, DBDIR_KEY}, - models::ShardParameters, - reltag::RelTag, - shard::ShardStripeSize, - }; + use pageserver_api::key::{DBDIR_KEY, Key, rel_block_to_key}; + use pageserver_api::models::ShardParameters; + use pageserver_api::reltag::RelTag; + use pageserver_api::shard::ShardStripeSize; use utils::shard::ShardCount; use super::*; diff --git a/pageserver/src/tenant/timeline/heatmap_layers_downloader.rs b/pageserver/src/tenant/timeline/heatmap_layers_downloader.rs index 0ba9753e85..6209b63de4 100644 --- a/pageserver/src/tenant/timeline/heatmap_layers_downloader.rs +++ b/pageserver/src/tenant/timeline/heatmap_layers_downloader.rs @@ -3,12 +3,15 @@ //! Provides utilities to spawn and abort a background task where the downloads happen. //! See /v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_heatmap_layers. +use std::sync::{Arc, Mutex}; + use futures::StreamExt; use http_utils::error::ApiError; -use std::sync::{Arc, Mutex}; use tokio_util::sync::CancellationToken; use utils::sync::gate::Gate; +use crate::context::RequestContext; + use super::Timeline; // This status is not strictly necessary now, but gives us a nice place @@ -29,6 +32,8 @@ impl HeatmapLayersDownloader { fn new( timeline: Arc, concurrency: usize, + recurse: bool, + ctx: RequestContext, ) -> Result { let tl_guard = timeline.gate.enter().map_err(|_| ApiError::Cancelled)?; @@ -62,6 +67,7 @@ impl HeatmapLayersDownloader { let stream = futures::stream::iter(heatmap.layers.into_iter().filter_map( |layer| { + let ctx = ctx.attached_child(); let tl = timeline.clone(); let dl_guard = match downloads_guard.enter() { Ok(g) => g, @@ -74,7 +80,7 @@ impl HeatmapLayersDownloader { Some(async move { let _dl_guard = dl_guard; - let res = tl.download_layer(&layer.name).await; + let res = tl.download_layer(&layer.name, &ctx).await; if let Err(err) = res { if !err.is_cancelled() { tracing::warn!(layer=%layer.name,"Failed to download heatmap layer: {err}") @@ -93,6 +99,20 @@ impl HeatmapLayersDownloader { }, _ = cancel.cancelled() => { tracing::info!("Heatmap layers download cancelled"); + return; + } + } + + if recurse { + if let Some(ancestor) = timeline.ancestor_timeline() { + let ctx = ctx.attached_child(); + let res = + ancestor.start_heatmap_layers_download(concurrency, recurse, &ctx); + if let Err(err) = res { + tracing::info!( + "Failed to start heatmap layers download for ancestor: {err}" + ); + } } } } @@ -135,13 +155,20 @@ impl HeatmapLayersDownloader { } impl Timeline { - pub(crate) async fn start_heatmap_layers_download( + pub(crate) fn start_heatmap_layers_download( self: &Arc, concurrency: usize, + recurse: bool, + ctx: &RequestContext, ) -> Result<(), ApiError> { let mut locked = self.heatmap_layers_downloader.lock().unwrap(); if locked.as_ref().map(|dl| dl.is_complete()).unwrap_or(true) { - let dl = HeatmapLayersDownloader::new(self.clone(), concurrency)?; + let dl = HeatmapLayersDownloader::new( + self.clone(), + concurrency, + recurse, + ctx.attached_child(), + )?; *locked = Some(dl); Ok(()) } else { diff --git a/pageserver/src/tenant/timeline/import_pgdata.rs b/pageserver/src/tenant/timeline/import_pgdata.rs index 6940179ae9..8b94a114d6 100644 --- a/pageserver/src/tenant/timeline/import_pgdata.rs +++ b/pageserver/src/tenant/timeline/import_pgdata.rs @@ -1,14 +1,14 @@ use std::sync::Arc; -use anyhow::{bail, Context}; +use anyhow::{Context, bail}; use remote_storage::RemotePath; use tokio_util::sync::CancellationToken; -use tracing::{info, info_span, Instrument}; +use tracing::{Instrument, info, info_span}; use utils::lsn::Lsn; -use crate::{context::RequestContext, tenant::metadata::TimelineMetadata}; - use super::Timeline; +use crate::context::RequestContext; +use crate::tenant::metadata::TimelineMetadata; mod flow; mod importbucket_client; diff --git a/pageserver/src/tenant/timeline/import_pgdata/flow.rs b/pageserver/src/tenant/timeline/import_pgdata/flow.rs index 4388072606..3ef82b3658 100644 --- a/pageserver/src/tenant/timeline/import_pgdata/flow.rs +++ b/pageserver/src/tenant/timeline/import_pgdata/flow.rs @@ -28,52 +28,38 @@ //! An incomplete set of TODOs from the Hackathon: //! - version-specific CheckPointData (=> pgv abstraction, already exists for regular walingest) +use std::collections::HashSet; +use std::ops::Range; use std::sync::Arc; use anyhow::{bail, ensure}; use bytes::Bytes; - use itertools::Itertools; -use pageserver_api::{ - key::{rel_block_to_key, rel_dir_to_key, rel_size_to_key, relmap_file_key, DBDIR_KEY}, - reltag::RelTag, - shard::ShardIdentity, -}; -use postgres_ffi::{pg_constants, relfile_utils::parse_relfilename, BLCKSZ}; -use tokio::task::JoinSet; -use tracing::{debug, info_span, instrument, Instrument}; - -use crate::{ - assert_u64_eq_usize::UsizeIsU64, - pgdatadir_mapping::{SlruSegmentDirectory, TwoPhaseDirectory}, -}; -use crate::{ - context::{DownloadBehavior, RequestContext}, - pgdatadir_mapping::{DbDirectory, RelDirectory}, - task_mgr::TaskKind, - tenant::storage_layer::{ImageLayerWriter, Layer}, -}; - -use pageserver_api::key::Key; use pageserver_api::key::{ - slru_block_to_key, slru_dir_to_key, slru_segment_size_to_key, CHECKPOINT_KEY, CONTROLFILE_KEY, - TWOPHASEDIR_KEY, + CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, Key, TWOPHASEDIR_KEY, rel_block_to_key, + rel_dir_to_key, rel_size_to_key, relmap_file_key, slru_block_to_key, slru_dir_to_key, + slru_segment_size_to_key, }; -use pageserver_api::keyspace::singleton_range; -use pageserver_api::keyspace::{contiguous_range_len, is_contiguous_range}; -use pageserver_api::reltag::SlruKind; +use pageserver_api::keyspace::{contiguous_range_len, is_contiguous_range, singleton_range}; +use pageserver_api::reltag::{RelTag, SlruKind}; +use pageserver_api::shard::ShardIdentity; +use postgres_ffi::relfile_utils::parse_relfilename; +use postgres_ffi::{BLCKSZ, pg_constants}; +use remote_storage::RemotePath; +use tokio::task::JoinSet; +use tracing::{Instrument, debug, info_span, instrument}; use utils::bin_ser::BeSer; use utils::lsn::Lsn; -use std::collections::HashSet; -use std::ops::Range; - -use super::{ - importbucket_client::{ControlFile, RemoteStorageWrapper}, - Timeline, +use super::Timeline; +use super::importbucket_client::{ControlFile, RemoteStorageWrapper}; +use crate::assert_u64_eq_usize::UsizeIsU64; +use crate::context::{DownloadBehavior, RequestContext}; +use crate::pgdatadir_mapping::{ + DbDirectory, RelDirectory, SlruSegmentDirectory, TwoPhaseDirectory, }; - -use remote_storage::RemotePath; +use crate::task_mgr::TaskKind; +use crate::tenant::storage_layer::{ImageLayerWriter, Layer}; pub async fn run( timeline: Arc, diff --git a/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs b/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs index 68937e535d..a17a10d56b 100644 --- a/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs +++ b/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs @@ -1,4 +1,5 @@ -use std::{ops::Bound, sync::Arc}; +use std::ops::Bound; +use std::sync::Arc; use anyhow::Context; use bytes::Bytes; @@ -12,9 +13,9 @@ use tokio_util::sync::CancellationToken; use tracing::{debug, info, instrument}; use utils::lsn::Lsn; -use crate::{assert_u64_eq_usize::U64IsUsize, config::PageServerConf}; - use super::{importbucket_format, index_part_format}; +use crate::assert_u64_eq_usize::U64IsUsize; +use crate::config::PageServerConf; pub async fn new( conf: &'static PageServerConf, diff --git a/pageserver/src/tenant/timeline/import_pgdata/index_part_format.rs b/pageserver/src/tenant/timeline/import_pgdata/index_part_format.rs index 310d97a6a9..ea7a41b25f 100644 --- a/pageserver/src/tenant/timeline/import_pgdata/index_part_format.rs +++ b/pageserver/src/tenant/timeline/import_pgdata/index_part_format.rs @@ -1,7 +1,6 @@ -use serde::{Deserialize, Serialize}; - #[cfg(feature = "testing")] use camino::Utf8PathBuf; +use serde::{Deserialize, Serialize}; #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] pub enum Root { diff --git a/pageserver/src/tenant/timeline/import_pgdata/upcall_api.rs b/pageserver/src/tenant/timeline/import_pgdata/upcall_api.rs index c5210f9a30..7c7a4de2fc 100644 --- a/pageserver/src/tenant/timeline/import_pgdata/upcall_api.rs +++ b/pageserver/src/tenant/timeline/import_pgdata/upcall_api.rs @@ -1,13 +1,12 @@ //! FIXME: most of this is copy-paste from mgmt_api.rs ; dedupe into a `reqwest_utils::Client` crate. use pageserver_client::mgmt_api::{Error, ResponseErrorMessageExt}; +use reqwest::Method; use serde::{Deserialize, Serialize}; use tokio_util::sync::CancellationToken; use tracing::error; -use crate::config::PageServerConf; -use reqwest::Method; - use super::importbucket_format::Spec; +use crate::config::PageServerConf; pub struct Client { base_url: String, diff --git a/pageserver/src/tenant/timeline/init.rs b/pageserver/src/tenant/timeline/init.rs index 6634d07a0d..e952df0845 100644 --- a/pageserver/src/tenant/timeline/init.rs +++ b/pageserver/src/tenant/timeline/init.rs @@ -1,22 +1,16 @@ -use crate::{ - is_temporary, - tenant::{ - ephemeral_file::is_ephemeral_file, - remote_timeline_client::{ - self, - index::{IndexPart, LayerFileMetadata}, - }, - storage_layer::LayerName, - }, -}; +use std::collections::{HashMap, hash_map}; +use std::str::FromStr; + use anyhow::Context; use camino::{Utf8Path, Utf8PathBuf}; -use std::{ - collections::{hash_map, HashMap}, - str::FromStr, -}; use utils::lsn::Lsn; +use crate::is_temporary; +use crate::tenant::ephemeral_file::is_ephemeral_file; +use crate::tenant::remote_timeline_client::index::{IndexPart, LayerFileMetadata}; +use crate::tenant::remote_timeline_client::{self}; +use crate::tenant::storage_layer::LayerName; + /// Identified files in the timeline directory. pub(super) enum Discovered { /// The only one we care about diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index 60e36a5d4d..e552ea83de 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -1,27 +1,22 @@ -use anyhow::{bail, ensure, Context}; +use std::collections::HashMap; +use std::sync::Arc; + +use anyhow::{Context, bail, ensure}; use itertools::Itertools; use pageserver_api::shard::TenantShardId; -use std::{collections::HashMap, sync::Arc}; use tracing::trace; -use utils::{ - id::TimelineId, - lsn::{AtomicLsn, Lsn}, -}; - -use crate::{ - config::PageServerConf, - context::RequestContext, - metrics::TimelineMetrics, - tenant::{ - layer_map::{BatchedUpdates, LayerMap}, - storage_layer::{ - AsLayerDesc, InMemoryLayer, Layer, LayerVisibilityHint, PersistentLayerDesc, - PersistentLayerKey, ResidentLayer, - }, - }, -}; +use utils::id::TimelineId; +use utils::lsn::{AtomicLsn, Lsn}; use super::TimelineWriterState; +use crate::config::PageServerConf; +use crate::context::RequestContext; +use crate::metrics::TimelineMetrics; +use crate::tenant::layer_map::{BatchedUpdates, LayerMap}; +use crate::tenant::storage_layer::{ + AsLayerDesc, InMemoryLayer, Layer, LayerVisibilityHint, PersistentLayerDesc, + PersistentLayerKey, ResidentLayer, +}; /// Provides semantic APIs to manipulate the layer map. pub(crate) enum LayerManager { @@ -214,9 +209,7 @@ impl OpenLayerManager { trace!( "creating in-memory layer at {}/{} for record at {}", - timeline_id, - start_lsn, - lsn + timeline_id, start_lsn, lsn ); let new_layer = diff --git a/pageserver/src/tenant/timeline/logical_size.rs b/pageserver/src/tenant/timeline/logical_size.rs index f4a4eea54a..397037ca9f 100644 --- a/pageserver/src/tenant/timeline/logical_size.rs +++ b/pageserver/src/tenant/timeline/logical_size.rs @@ -1,11 +1,10 @@ -use anyhow::Context; +use std::sync::atomic::{AtomicBool, AtomicI64, Ordering as AtomicOrdering}; +use anyhow::Context; use once_cell::sync::OnceCell; use tokio_util::sync::CancellationToken; use utils::lsn::Lsn; -use std::sync::atomic::{AtomicBool, AtomicI64, Ordering as AtomicOrdering}; - /// Internal structure to hold all data needed for logical size calculation. /// /// Calculation consists of two stages: diff --git a/pageserver/src/tenant/timeline/offload.rs b/pageserver/src/tenant/timeline/offload.rs index 93e5a1100d..43ffaa6aab 100644 --- a/pageserver/src/tenant/timeline/offload.rs +++ b/pageserver/src/tenant/timeline/offload.rs @@ -2,11 +2,11 @@ use std::sync::Arc; use pageserver_api::models::{TenantState, TimelineState}; -use super::delete::{delete_local_timeline_directory, DeletionGuard}; use super::Timeline; +use super::delete::{DeletionGuard, delete_local_timeline_directory}; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::remote_timeline_client::ShutdownIfArchivedError; -use crate::tenant::timeline::delete::{make_timeline_delete_guard, TimelineDeleteGuardKind}; +use crate::tenant::timeline::delete::{TimelineDeleteGuardKind, make_timeline_delete_guard}; use crate::tenant::{ DeleteTimelineError, OffloadedTimeline, Tenant, TenantManifestError, TimelineOrOffloaded, }; @@ -143,5 +143,12 @@ fn remove_timeline_from_tenant( .remove(&timeline.timeline_id) .expect("timeline that we were deleting was concurrently removed from 'timelines' map"); + // Clear the compaction queue for this timeline + tenant + .scheduled_compaction_tasks + .lock() + .unwrap() + .remove(&timeline.timeline_id); + Arc::strong_count(&timeline) } diff --git a/pageserver/src/tenant/timeline/uninit.rs b/pageserver/src/tenant/timeline/uninit.rs index 3074463384..f66c0ffa0f 100644 --- a/pageserver/src/tenant/timeline/uninit.rs +++ b/pageserver/src/tenant/timeline/uninit.rs @@ -1,18 +1,21 @@ -use std::{collections::hash_map::Entry, fs, future::Future, sync::Arc}; +use std::collections::hash_map::Entry; +use std::fs; +use std::future::Future; +use std::sync::Arc; use anyhow::Context; use camino::Utf8PathBuf; use tracing::{error, info, info_span}; -use utils::{fs_ext, id::TimelineId, lsn::Lsn, sync::gate::GateGuard}; - -use crate::{ - context::RequestContext, - import_datadir, - span::debug_assert_current_span_has_tenant_and_timeline_id, - tenant::{CreateTimelineError, CreateTimelineIdempotency, Tenant, TimelineOrOffloaded}, -}; +use utils::fs_ext; +use utils::id::TimelineId; +use utils::lsn::Lsn; +use utils::sync::gate::GateGuard; use super::Timeline; +use crate::context::RequestContext; +use crate::import_datadir; +use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; +use crate::tenant::{CreateTimelineError, CreateTimelineIdempotency, Tenant, TimelineOrOffloaded}; /// A timeline with some of its files on disk, being initialized. /// This struct ensures the atomicity of the timeline init: it's either properly created and inserted into pageserver's memory, or @@ -128,7 +131,7 @@ impl<'t> UninitializedTimeline<'t> { // We do not call Self::abort here. Because we don't cleanly shut down our Timeline, [`Self::drop`] should // skip trying to delete the timeline directory too. anyhow::bail!( - "Found freshly initialized timeline {tenant_shard_id}/{timeline_id} in the tenant map" + "Found freshly initialized timeline {tenant_shard_id}/{timeline_id} in the tenant map" ) } Entry::Vacant(v) => { diff --git a/pageserver/src/tenant/timeline/walreceiver.rs b/pageserver/src/tenant/timeline/walreceiver.rs index 67429bff98..4f80073cc3 100644 --- a/pageserver/src/tenant/timeline/walreceiver.rs +++ b/pageserver/src/tenant/timeline/walreceiver.rs @@ -23,17 +23,11 @@ mod connection_manager; mod walreceiver_connection; -use crate::context::{DownloadBehavior, RequestContext}; -use crate::task_mgr::{TaskKind, WALRECEIVER_RUNTIME}; -use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id; -use crate::tenant::timeline::walreceiver::connection_manager::{ - connection_manager_loop_step, ConnectionManagerState, -}; - use std::future::Future; use std::num::NonZeroU64; use std::sync::Arc; use std::time::Duration; + use storage_broker::BrokerClientChannel; use tokio::sync::watch; use tokio_util::sync::CancellationToken; @@ -41,8 +35,13 @@ use tracing::*; use utils::postgres_client::PostgresClientProtocol; use self::connection_manager::ConnectionManagerStatus; - use super::Timeline; +use crate::context::{DownloadBehavior, RequestContext}; +use crate::task_mgr::{TaskKind, WALRECEIVER_RUNTIME}; +use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id; +use crate::tenant::timeline::walreceiver::connection_manager::{ + ConnectionManagerState, connection_manager_loop_step, +}; #[derive(Clone)] pub struct WalReceiverConf { diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index 1955345315..df2663f6bb 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -9,45 +9,42 @@ //! then a (re)connection happens, if necessary. //! Only WAL streaming task expects to be finished, other loops (storage broker, connection management) never exit unless cancelled explicitly via the dedicated channel. -use std::{collections::HashMap, num::NonZeroU64, ops::ControlFlow, sync::Arc, time::Duration}; +use std::collections::HashMap; +use std::num::NonZeroU64; +use std::ops::ControlFlow; +use std::sync::Arc; +use std::time::Duration; -use super::{TaskStateUpdate, WalReceiverConf}; +use anyhow::Context; +use chrono::{NaiveDateTime, Utc}; +use pageserver_api::models::TimelineState; +use postgres_connection::PgConnectionConfig; +use storage_broker::proto::{ + FilterTenantTimelineId, MessageType, SafekeeperDiscoveryRequest, SafekeeperDiscoveryResponse, + SubscribeByFilterRequest, TenantTimelineId as ProtoTenantTimelineId, TypeSubscription, + TypedMessage, +}; +use storage_broker::{BrokerClientChannel, Code, Streaming}; +use tokio_util::sync::CancellationToken; +use tracing::*; +use utils::backoff::{ + DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, exponential_backoff, +}; +use utils::id::{NodeId, TenantTimelineId}; +use utils::lsn::Lsn; +use utils::postgres_client::{ + ConnectionConfigArgs, PostgresClientProtocol, wal_stream_connection_config, +}; + +use super::walreceiver_connection::{WalConnectionStatus, WalReceiverError}; +use super::{TaskEvent, TaskHandle, TaskStateUpdate, WalReceiverConf}; use crate::context::{DownloadBehavior, RequestContext}; use crate::metrics::{ WALRECEIVER_ACTIVE_MANAGERS, WALRECEIVER_BROKER_UPDATES, WALRECEIVER_CANDIDATES_ADDED, WALRECEIVER_CANDIDATES_REMOVED, WALRECEIVER_SWITCHES, }; use crate::task_mgr::TaskKind; -use crate::tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline}; -use anyhow::Context; -use chrono::{NaiveDateTime, Utc}; -use pageserver_api::models::TimelineState; - -use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; -use storage_broker::proto::{ - FilterTenantTimelineId, MessageType, SafekeeperDiscoveryRequest, SafekeeperDiscoveryResponse, - SubscribeByFilterRequest, TypeSubscription, TypedMessage, -}; -use storage_broker::{BrokerClientChannel, Code, Streaming}; -use tokio_util::sync::CancellationToken; -use tracing::*; - -use postgres_connection::PgConnectionConfig; -use utils::backoff::{ - exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, -}; -use utils::postgres_client::{ - wal_stream_connection_config, ConnectionConfigArgs, PostgresClientProtocol, -}; -use utils::{ - id::{NodeId, TenantTimelineId}, - lsn::Lsn, -}; - -use super::{ - walreceiver_connection::WalConnectionStatus, walreceiver_connection::WalReceiverError, - TaskEvent, TaskHandle, -}; +use crate::tenant::{Timeline, debug_assert_current_span_has_tenant_and_timeline_id}; pub(crate) struct Cancelled; @@ -349,7 +346,9 @@ async fn subscribe_for_timeline_updates( Err(e) => { // Safekeeper nodes can stop pushing timeline updates to the broker, when no new writes happen and // entire WAL is streamed. Keep this noticeable with logging, but do not warn/error. - info!("Attempt #{attempt}, failed to subscribe for timeline {id} updates in broker: {e:#}"); + info!( + "Attempt #{attempt}, failed to subscribe for timeline {id} updates in broker: {e:#}" + ); continue; } } @@ -512,11 +511,11 @@ impl ConnectionManagerState { fn spawn( &self, task: impl FnOnce( - tokio::sync::watch::Sender>, - CancellationToken, - ) -> Fut - + Send - + 'static, + tokio::sync::watch::Sender>, + CancellationToken, + ) -> Fut + + Send + + 'static, ) -> TaskHandle where Fut: std::future::Future> + Send, @@ -880,8 +879,7 @@ impl ConnectionManagerState { discovered_new_wal = if candidate_commit_lsn > current_commit_lsn { trace!( "New candidate has commit_lsn {}, higher than current_commit_lsn {}", - candidate_commit_lsn, - current_commit_lsn + candidate_commit_lsn, current_commit_lsn ); Some(NewCommittedWAL { lsn: candidate_commit_lsn, @@ -1048,7 +1046,9 @@ impl ConnectionManagerState { if !node_ids_to_remove.is_empty() { for node_id in node_ids_to_remove { - info!("Safekeeper node {node_id} did not send events for over {lagging_wal_timeout:?}, not retrying the connections"); + info!( + "Safekeeper node {node_id} did not send events for over {lagging_wal_timeout:?}, not retrying the connections" + ); self.wal_connection_retries.remove(&node_id); WALRECEIVER_CANDIDATES_REMOVED.inc(); } @@ -1119,11 +1119,12 @@ impl ReconnectReason { #[cfg(test)] mod tests { - use super::*; - use crate::tenant::harness::{TenantHarness, TIMELINE_ID}; use pageserver_api::config::defaults::DEFAULT_WAL_RECEIVER_PROTOCOL; use url::Host; + use super::*; + use crate::tenant::harness::{TIMELINE_ID, TenantHarness}; + fn dummy_broker_sk_timeline( commit_lsn: u64, safekeeper_connstr: &str, diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index bb34a181da..f41a9cfe82 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -1,46 +1,48 @@ //! Actual Postgres connection handler to stream WAL to the server. -use std::{ - error::Error, - pin::pin, - str::FromStr, - sync::Arc, - time::{Duration, SystemTime}, -}; +use std::error::Error; +use std::pin::pin; +use std::str::FromStr; +use std::sync::Arc; +use std::time::{Duration, SystemTime}; -use anyhow::{anyhow, Context}; +use anyhow::{Context, anyhow}; use bytes::BytesMut; use chrono::{NaiveDateTime, Utc}; use fail::fail_point; use futures::StreamExt; -use postgres_ffi::WAL_SEGMENT_SIZE; -use postgres_ffi::{v14::xlog_utils::normalize_lsn, waldecoder::WalDecodeError}; -use postgres_protocol::message::backend::ReplicationMessage; -use postgres_types::PgLsn; -use tokio::{select, sync::watch, time}; -use tokio_postgres::{error::SqlState, SimpleQueryMessage, SimpleQueryRow}; -use tokio_postgres::{replication::ReplicationStream, Client}; -use tokio_util::sync::CancellationToken; -use tracing::{debug, error, info, trace, warn, Instrument}; -use wal_decoder::{ - models::{FlushUncommittedRecords, InterpretedWalRecord, InterpretedWalRecords}, - wire_format::FromWireFormat, -}; - -use super::TaskStateUpdate; -use crate::{ - context::RequestContext, - metrics::{LIVE_CONNECTIONS, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST}, - pgdatadir_mapping::DatadirModification, - task_mgr::{TaskKind, WALRECEIVER_RUNTIME}, - tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo}, - walingest::WalIngest, -}; use postgres_backend::is_expected_io_error; use postgres_connection::PgConnectionConfig; -use postgres_ffi::waldecoder::WalStreamDecoder; -use utils::{critical, id::NodeId, lsn::Lsn, postgres_client::PostgresClientProtocol}; -use utils::{pageserver_feedback::PageserverFeedback, sync::gate::GateError}; +use postgres_ffi::WAL_SEGMENT_SIZE; +use postgres_ffi::v14::xlog_utils::normalize_lsn; +use postgres_ffi::waldecoder::{WalDecodeError, WalStreamDecoder}; +use postgres_protocol::message::backend::ReplicationMessage; +use postgres_types::PgLsn; +use tokio::sync::watch; +use tokio::{select, time}; +use tokio_postgres::error::SqlState; +use tokio_postgres::replication::ReplicationStream; +use tokio_postgres::{Client, SimpleQueryMessage, SimpleQueryRow}; +use tokio_util::sync::CancellationToken; +use tracing::{Instrument, debug, error, info, trace, warn}; +use utils::critical; +use utils::id::NodeId; +use utils::lsn::Lsn; +use utils::pageserver_feedback::PageserverFeedback; +use utils::postgres_client::PostgresClientProtocol; +use utils::sync::gate::GateError; +use wal_decoder::models::{FlushUncommittedRecords, InterpretedWalRecord, InterpretedWalRecords}; +use wal_decoder::wire_format::FromWireFormat; + +use super::TaskStateUpdate; +use crate::context::RequestContext; +use crate::metrics::{LIVE_CONNECTIONS, WAL_INGEST, WALRECEIVER_STARTED_CONNECTIONS}; +use crate::pgdatadir_mapping::DatadirModification; +use crate::task_mgr::{TaskKind, WALRECEIVER_RUNTIME}; +use crate::tenant::{ + Timeline, WalReceiverInfo, debug_assert_current_span_has_tenant_and_timeline_id, +}; +use crate::walingest::WalIngest; /// Status of the connection. #[derive(Debug, Clone, Copy)] @@ -149,7 +151,9 @@ pub(super) async fn handle_walreceiver_connection( // Timing out to connect to a safekeeper node could happen long time, due to // many reasons that pageserver cannot control. // Do not produce an error, but make it visible, that timeouts happen by logging the `event. - info!("Timed out while waiting {connect_timeout:?} for walreceiver connection to open"); + info!( + "Timed out while waiting {connect_timeout:?} for walreceiver connection to open" + ); return Ok(()); } } @@ -166,7 +170,9 @@ pub(super) async fn handle_walreceiver_connection( node: safekeeper_node, }; if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status)) { - warn!("Wal connection event listener dropped right after connection init, aborting the connection: {e}"); + warn!( + "Wal connection event listener dropped right after connection init, aborting the connection: {e}" + ); return Ok(()); } @@ -227,7 +233,9 @@ pub(super) async fn handle_walreceiver_connection( connection_status.latest_wal_update = Utc::now().naive_utc(); connection_status.commit_lsn = Some(end_of_wal); if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status)) { - warn!("Wal connection event listener dropped after IDENTIFY_SYSTEM, aborting the connection: {e}"); + warn!( + "Wal connection event listener dropped after IDENTIFY_SYSTEM, aborting the connection: {e}" + ); return Ok(()); } @@ -254,7 +262,9 @@ pub(super) async fn handle_walreceiver_connection( // to the safekeepers. startpoint = normalize_lsn(startpoint, WAL_SEGMENT_SIZE); - info!("last_record_lsn {last_rec_lsn} starting replication from {startpoint}, safekeeper is at {end_of_wal}..."); + info!( + "last_record_lsn {last_rec_lsn} starting replication from {startpoint}, safekeeper is at {end_of_wal}..." + ); let query = format!("START_REPLICATION PHYSICAL {startpoint}"); @@ -626,7 +636,9 @@ pub(super) async fn handle_walreceiver_connection( let timestamp = keepalive.timestamp(); let reply_requested = keepalive.reply() != 0; - trace!("received PrimaryKeepAlive(wal_end: {wal_end}, timestamp: {timestamp:?} reply: {reply_requested})"); + trace!( + "received PrimaryKeepAlive(wal_end: {wal_end}, timestamp: {timestamp:?} reply: {reply_requested})" + ); if reply_requested { Some(last_rec_lsn) diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs index d302205ffe..d5dc9666ce 100644 --- a/pageserver/src/tenant/upload_queue.rs +++ b/pageserver/src/tenant/upload_queue.rs @@ -1,21 +1,18 @@ use std::collections::{HashMap, HashSet, VecDeque}; use std::fmt::Debug; -use std::sync::atomic::AtomicU32; use std::sync::Arc; - -use super::remote_timeline_client::is_same_remote_layer_path; -use super::storage_layer::AsLayerDesc as _; -use super::storage_layer::LayerName; -use super::storage_layer::ResidentLayer; -use crate::tenant::metadata::TimelineMetadata; -use crate::tenant::remote_timeline_client::index::IndexPart; -use crate::tenant::remote_timeline_client::index::LayerFileMetadata; -use utils::generation::Generation; -use utils::lsn::{AtomicLsn, Lsn}; +use std::sync::atomic::AtomicU32; use chrono::NaiveDateTime; use once_cell::sync::Lazy; use tracing::info; +use utils::generation::Generation; +use utils::lsn::{AtomicLsn, Lsn}; + +use super::remote_timeline_client::is_same_remote_layer_path; +use super::storage_layer::{AsLayerDesc as _, LayerName, ResidentLayer}; +use crate::tenant::metadata::TimelineMetadata; +use crate::tenant::remote_timeline_client::index::{IndexPart, LayerFileMetadata}; /// Kill switch for upload queue reordering in case it causes problems. /// TODO: remove this once we have confidence in it. @@ -225,7 +222,7 @@ impl UploadQueueInitialized { // most one of them can be an index upload (enforced by can_bypass). .scan(&self.clean.0, |next_active_index, op| { let active_index = *next_active_index; - if let UploadOp::UploadMetadata { ref uploaded } = op { + if let UploadOp::UploadMetadata { uploaded } = op { *next_active_index = uploaded; // stash index for next operation after this } Some((op, active_index)) @@ -562,16 +559,18 @@ impl UploadOp { #[cfg(test)] mod tests { - use super::*; - use crate::tenant::harness::{TenantHarness, TIMELINE_ID}; - use crate::tenant::storage_layer::layer::local_layer_path; - use crate::tenant::storage_layer::Layer; - use crate::tenant::Timeline; - use crate::DEFAULT_PG_VERSION; - use itertools::Itertools as _; use std::str::FromStr as _; + + use itertools::Itertools as _; use utils::shard::{ShardCount, ShardIndex, ShardNumber}; + use super::*; + use crate::DEFAULT_PG_VERSION; + use crate::tenant::Timeline; + use crate::tenant::harness::{TIMELINE_ID, TenantHarness}; + use crate::tenant::storage_layer::Layer; + use crate::tenant::storage_layer::layer::local_layer_path; + /// Test helper which asserts that two operations are the same, in lieu of UploadOp PartialEq. #[track_caller] fn assert_same_op(a: &UploadOp, b: &UploadOp) { @@ -690,10 +689,22 @@ mod tests { let tli = make_timeline(); let index = Box::new(queue.clean.0.clone()); // empty, doesn't matter - let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer3 = make_layer(&tli, "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer0 = make_layer( + &tli, + "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer1 = make_layer( + &tli, + "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer2 = make_layer( + &tli, + "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer3 = make_layer( + &tli, + "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); let (barrier, _) = tokio::sync::watch::channel(()); // Enqueue non-conflicting upload, delete, and index before and after a barrier. @@ -757,10 +768,22 @@ mod tests { let tli = make_timeline(); // Enqueue a bunch of deletes, some with conflicting names. - let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer3 = make_layer(&tli, "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer0 = make_layer( + &tli, + "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer1 = make_layer( + &tli, + "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer2 = make_layer( + &tli, + "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer3 = make_layer( + &tli, + "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); let ops = [ UploadOp::Delete(Delete { @@ -802,9 +825,21 @@ mod tests { let tli = make_timeline(); // Enqueue three versions of the same layer, with different file sizes. - let layer0a = make_layer_with_size(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", 1); - let layer0b = make_layer_with_size(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", 2); - let layer0c = make_layer_with_size(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", 3); + let layer0a = make_layer_with_size( + &tli, + "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + 1, + ); + let layer0b = make_layer_with_size( + &tli, + "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + 2, + ); + let layer0c = make_layer_with_size( + &tli, + "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + 3, + ); let ops = [ UploadOp::UploadLayer(layer0a.clone(), layer0a.metadata(), None), @@ -836,8 +871,14 @@ mod tests { // Enqueue two layer uploads, with a delete of both layers in between them. These should be // scheduled one at a time, since deletes can't bypass uploads and vice versa. - let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer0 = make_layer( + &tli, + "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer1 = make_layer( + &tli, + "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); let ops = [ UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None), @@ -878,10 +919,22 @@ mod tests { // // Also enqueue non-conflicting uploads and deletes at the end. These can bypass the queue // and run immediately. - let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer3 = make_layer(&tli, "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer0 = make_layer( + &tli, + "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer1 = make_layer( + &tli, + "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer2 = make_layer( + &tli, + "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer3 = make_layer( + &tli, + "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); let ops = [ UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None), @@ -916,9 +969,18 @@ mod tests { let tli = make_timeline(); // Enqueue three different layer uploads. - let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer0 = make_layer( + &tli, + "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer1 = make_layer( + &tli, + "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer2 = make_layer( + &tli, + "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); let ops = [ UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None), @@ -981,11 +1043,20 @@ mod tests { // Enqueue three uploads of the current empty index. let index = Box::new(queue.clean.0.clone()); - let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer0 = make_layer( + &tli, + "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); let index0 = index_with(&index, &layer0); - let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer1 = make_layer( + &tli, + "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); let index1 = index_with(&index0, &layer1); - let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer2 = make_layer( + &tli, + "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); let index2 = index_with(&index1, &layer2); let ops = [ @@ -1045,7 +1116,10 @@ mod tests { let tli = make_timeline(); // Create a layer to upload. - let layer = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer = make_layer( + &tli, + "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); let index_upload = index_with(&queue.clean.0, &layer); // Remove the layer reference in a new index, then delete the layer. @@ -1090,7 +1164,10 @@ mod tests { let tli = make_timeline(); // Create a layer to upload. - let layer = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer = make_layer( + &tli, + "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); // Upload the layer. Then dereference the layer, and upload/reference it again. let index_upload = index_with(&queue.clean.0, &layer); @@ -1138,10 +1215,22 @@ mod tests { let tli = make_timeline(); let index = Box::new(queue.clean.0.clone()); // empty, doesn't matter - let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer3 = make_layer(&tli, "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer0 = make_layer( + &tli, + "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer1 = make_layer( + &tli, + "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer2 = make_layer( + &tli, + "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer3 = make_layer( + &tli, + "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); // Enqueue non-conflicting upload, delete, and index before and after a shutdown. let ops = [ @@ -1197,10 +1286,22 @@ mod tests { let tli = make_timeline(); // Enqueue a bunch of uploads. - let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer3 = make_layer(&tli, "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer0 = make_layer( + &tli, + "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer1 = make_layer( + &tli, + "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer2 = make_layer( + &tli, + "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer3 = make_layer( + &tli, + "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); let ops = [ UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None), diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs index 47fb4a276b..dcf17a376c 100644 --- a/pageserver/src/tenant/vectored_blob_io.rs +++ b/pageserver/src/tenant/vectored_blob_io.rs @@ -27,8 +27,7 @@ use utils::vec_map::VecMap; use crate::context::RequestContext; use crate::tenant::blob_io::{BYTE_UNCOMPRESSED, BYTE_ZSTD, LEN_COMPRESSION_BIT_MASK}; -use crate::virtual_file::IoBufferMut; -use crate::virtual_file::{self, VirtualFile}; +use crate::virtual_file::{self, IoBufferMut, VirtualFile}; /// Metadata bundled with the start and end offset of a blob. #[derive(Copy, Clone, Debug)] @@ -139,7 +138,10 @@ impl VectoredBlob { bits => { let error = std::io::Error::new( std::io::ErrorKind::InvalidData, - format!("Failed to decompress blob for {}@{}, {}..{}: invalid compression byte {bits:x}", self.meta.key, self.meta.lsn, self.start, self.end), + format!( + "Failed to decompress blob for {}@{}, {}..{}: invalid compression byte {bits:x}", + self.meta.key, self.meta.lsn, self.start, self.end + ), ); Err(error) } @@ -677,13 +679,12 @@ impl StreamingVectoredReadPlanner { mod tests { use anyhow::Error; + use super::super::blob_io::tests::{random_array, write_maybe_compressed}; + use super::*; use crate::context::DownloadBehavior; use crate::page_cache::PAGE_SZ; use crate::task_mgr::TaskKind; - use super::super::blob_io::tests::{random_array, write_maybe_compressed}; - use super::*; - fn validate_read(read: &VectoredRead, offset_range: &[(Key, Lsn, u64, BlobFlag)]) { const ALIGN: u64 = virtual_file::get_io_buffer_alignment() as u64; assert_eq!(read.start % ALIGN, 0); diff --git a/pageserver/src/utilization.rs b/pageserver/src/utilization.rs index 093a944777..29d1a31aaf 100644 --- a/pageserver/src/utilization.rs +++ b/pageserver/src/utilization.rs @@ -3,13 +3,15 @@ //! The metric is exposed via `GET /v1/utilization`. Refer and maintain it's openapi spec as the //! truth. -use anyhow::Context; use std::path::Path; + +use anyhow::Context; +use pageserver_api::models::PageserverUtilization; use utils::serde_percent::Percent; -use pageserver_api::models::PageserverUtilization; - -use crate::{config::PageServerConf, metrics::NODE_UTILIZATION_SCORE, tenant::mgr::TenantManager}; +use crate::config::PageServerConf; +use crate::metrics::NODE_UTILIZATION_SCORE; +use crate::tenant::mgr::TenantManager; pub(crate) fn regenerate( conf: &PageServerConf, diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index c966ad813f..b47aecf8a6 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -11,11 +11,13 @@ //! This is similar to PostgreSQL's virtual file descriptor facility in //! src/backend/storage/file/fd.c //! -use crate::context::RequestContext; -use crate::metrics::{StorageIoOperation, STORAGE_IO_SIZE, STORAGE_IO_TIME_METRIC}; +use std::fs::File; +use std::io::{Error, ErrorKind, Seek, SeekFrom}; +use std::os::fd::{AsRawFd, FromRawFd, IntoRawFd, OwnedFd, RawFd}; +#[cfg(target_os = "linux")] +use std::os::unix::fs::OpenOptionsExt; +use std::sync::atomic::{AtomicBool, AtomicU8, AtomicUsize, Ordering}; -use crate::page_cache::{PageWriteGuard, PAGE_SZ}; -use crate::tenant::TENANTS_SEGMENT_NAME; use camino::{Utf8Path, Utf8PathBuf}; use once_cell::sync::OnceCell; use owned_buffers_io::aligned_buffer::buffer::AlignedBuffer; @@ -23,31 +25,30 @@ use owned_buffers_io::aligned_buffer::{AlignedBufferMut, AlignedSlice, ConstAlig use owned_buffers_io::io_buf_aligned::{IoBufAligned, IoBufAlignedMut}; use owned_buffers_io::io_buf_ext::FullSlice; use pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT; +pub use pageserver_api::models::virtual_file as api; use pageserver_api::shard::TenantShardId; -use std::fs::File; -use std::io::{Error, ErrorKind, Seek, SeekFrom}; -#[cfg(target_os = "linux")] -use std::os::unix::fs::OpenOptionsExt; -use tokio_epoll_uring::{BoundedBuf, IoBuf, IoBufMut, Slice}; - -use std::os::fd::{AsRawFd, FromRawFd, IntoRawFd, OwnedFd, RawFd}; -use std::sync::atomic::{AtomicBool, AtomicU8, AtomicUsize, Ordering}; use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard}; use tokio::time::Instant; +use tokio_epoll_uring::{BoundedBuf, IoBuf, IoBufMut, Slice}; -pub use pageserver_api::models::virtual_file as api; +use crate::context::RequestContext; +use crate::metrics::{STORAGE_IO_SIZE, STORAGE_IO_TIME_METRIC, StorageIoOperation}; +use crate::page_cache::{PAGE_SZ, PageWriteGuard}; +use crate::tenant::TENANTS_SEGMENT_NAME; pub(crate) mod io_engine; -pub use io_engine::feature_test as io_engine_feature_test; -pub use io_engine::io_engine_for_bench; -pub use io_engine::FeatureTestResult as IoEngineFeatureTestResult; +pub use io_engine::{ + FeatureTestResult as IoEngineFeatureTestResult, feature_test as io_engine_feature_test, + io_engine_for_bench, +}; mod metadata; mod open_options; -use self::owned_buffers_io::write::OwnedAsyncWriter; pub(crate) use api::IoMode; pub(crate) use io_engine::IoEngineKind; pub(crate) use metadata::Metadata; pub(crate) use open_options::*; +use self::owned_buffers_io::write::OwnedAsyncWriter; + pub(crate) mod owned_buffers_io { //! Abstractions for IO with owned buffers. //! @@ -1078,7 +1079,8 @@ where #[cfg(test)] mod test_read_exact_at_impl { - use std::{collections::VecDeque, sync::Arc}; + use std::collections::VecDeque; + use std::sync::Arc; use tokio_epoll_uring::{BoundedBuf, BoundedBufMut}; @@ -1424,19 +1426,19 @@ static SYNC_MODE: AtomicU8 = AtomicU8::new(SyncMode::Sync as u8); #[cfg(test)] mod tests { - use crate::context::DownloadBehavior; - use crate::task_mgr::TaskKind; - - use super::*; - use owned_buffers_io::io_buf_ext::IoBufExt; - use owned_buffers_io::slice::SliceMutExt; - use rand::seq::SliceRandom; - use rand::thread_rng; - use rand::Rng; use std::io::Write; use std::os::unix::fs::FileExt; use std::sync::Arc; + use owned_buffers_io::io_buf_ext::IoBufExt; + use owned_buffers_io::slice::SliceMutExt; + use rand::seq::SliceRandom; + use rand::{Rng, thread_rng}; + + use super::*; + use crate::context::DownloadBehavior; + use crate::task_mgr::TaskKind; + enum MaybeVirtualFile { VirtualFile(VirtualFile), File(File), diff --git a/pageserver/src/virtual_file/io_engine.rs b/pageserver/src/virtual_file/io_engine.rs index ccde90ee1a..758dd6e377 100644 --- a/pageserver/src/virtual_file/io_engine.rs +++ b/pageserver/src/virtual_file/io_engine.rs @@ -80,7 +80,9 @@ pub(crate) fn get() -> IoEngine { Ok(v) => match v.parse::() { Ok(engine_kind) => engine_kind, Err(e) => { - panic!("invalid VirtualFile io engine for env var {env_var_name}: {e:#}: {v:?}") + panic!( + "invalid VirtualFile io engine for env var {env_var_name}: {e:#}: {v:?}" + ) } }, Err(std::env::VarError::NotPresent) => { @@ -107,15 +109,12 @@ pub(crate) fn get() -> IoEngine { } } -use std::{ - os::unix::prelude::FileExt, - sync::atomic::{AtomicU8, Ordering}, -}; +use std::os::unix::prelude::FileExt; +use std::sync::atomic::{AtomicU8, Ordering}; -use super::{ - owned_buffers_io::{io_buf_ext::FullSlice, slice::SliceMutExt}, - FileGuard, Metadata, -}; +use super::owned_buffers_io::io_buf_ext::FullSlice; +use super::owned_buffers_io::slice::SliceMutExt; +use super::{FileGuard, Metadata}; #[cfg(target_os = "linux")] fn epoll_uring_error_to_std(e: tokio_epoll_uring::Error) -> std::io::Error { diff --git a/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs b/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs index c67215492f..ad17405b64 100644 --- a/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs +++ b/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs @@ -5,18 +5,16 @@ //! on older kernels, such as some (but not all) older kernels in the Linux 5.10 series. //! See for more details. -use std::sync::atomic::{AtomicU32, AtomicU64, Ordering}; use std::sync::Arc; - -use tokio_util::sync::CancellationToken; -use tracing::{error, info, info_span, warn, Instrument}; -use utils::backoff::{DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS}; +use std::sync::atomic::{AtomicU32, AtomicU64, Ordering}; use tokio_epoll_uring::{System, SystemHandle}; - -use crate::virtual_file::on_fatal_io_error; +use tokio_util::sync::CancellationToken; +use tracing::{Instrument, error, info, info_span, warn}; +use utils::backoff::{DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS}; use crate::metrics::tokio_epoll_uring::{self as metrics, THREAD_LOCAL_METRICS_STORAGE}; +use crate::virtual_file::on_fatal_io_error; #[derive(Clone)] struct ThreadLocalState(Arc); @@ -194,7 +192,7 @@ impl std::ops::Deref for Handle { fn deref(&self) -> &Self::Target { self.0 - .0 + .0 .cell .get() .expect("must be already initialized when using this") diff --git a/pageserver/src/virtual_file/open_options.rs b/pageserver/src/virtual_file/open_options.rs index 7f951270d1..e188b8649b 100644 --- a/pageserver/src/virtual_file/open_options.rs +++ b/pageserver/src/virtual_file/open_options.rs @@ -1,7 +1,9 @@ //! Enum-dispatch to the `OpenOptions` type of the respective [`super::IoEngineKind`]; +use std::os::fd::OwnedFd; +use std::path::Path; + use super::io_engine::IoEngine; -use std::{os::fd::OwnedFd, path::Path}; #[derive(Debug, Clone)] pub enum OpenOptions { diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs index a5c26cd746..090d2ece85 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs @@ -1,9 +1,9 @@ -use std::{ - ops::{Deref, Range, RangeBounds}, - sync::Arc, -}; +use std::ops::{Deref, Range, RangeBounds}; +use std::sync::Arc; -use super::{alignment::Alignment, raw::RawAlignedBuffer, AlignedBufferMut, ConstAlign}; +use super::alignment::Alignment; +use super::raw::RawAlignedBuffer; +use super::{AlignedBufferMut, ConstAlign}; /// An shared, immutable aligned buffer type. #[derive(Clone, Debug)] diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs index d2f5e206bb..df5c911e50 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs @@ -1,13 +1,9 @@ -use std::{ - mem::MaybeUninit, - ops::{Deref, DerefMut}, -}; +use std::mem::MaybeUninit; +use std::ops::{Deref, DerefMut}; -use super::{ - alignment::{Alignment, ConstAlign}, - buffer::AlignedBuffer, - raw::RawAlignedBuffer, -}; +use super::alignment::{Alignment, ConstAlign}; +use super::buffer::AlignedBuffer; +use super::raw::RawAlignedBuffer; /// A mutable aligned buffer type. #[derive(Debug)] @@ -75,7 +71,8 @@ impl AlignedBufferMut { /// Force the length of the buffer to `new_len`. #[inline] unsafe fn set_len(&mut self, new_len: usize) { - self.raw.set_len(new_len) + // SAFETY: the caller is unsafe + unsafe { self.raw.set_len(new_len) } } #[inline] @@ -222,8 +219,10 @@ unsafe impl bytes::BufMut for AlignedBufferMut { panic_advance(cnt, remaining); } - // Addition will not overflow since the sum is at most the capacity. - self.set_len(len + cnt); + // SAFETY: Addition will not overflow since the sum is at most the capacity. + unsafe { + self.set_len(len + cnt); + } } #[inline] @@ -275,7 +274,10 @@ unsafe impl tokio_epoll_uring::IoBufMut for AlignedBufferMut { unsafe fn set_init(&mut self, init_len: usize) { if self.len() < init_len { - self.set_len(init_len); + // SAFETY: caller function is unsafe + unsafe { + self.set_len(init_len); + } } } } diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/raw.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/raw.rs index 6c26dec0db..97a6c4049a 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/raw.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/raw.rs @@ -1,9 +1,7 @@ use core::slice; -use std::{ - alloc::{self, Layout}, - cmp, - mem::ManuallyDrop, -}; +use std::alloc::{self, Layout}; +use std::cmp; +use std::mem::ManuallyDrop; use super::alignment::{Alignment, ConstAlign}; diff --git a/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs b/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs index 525f447b6d..4c671c2652 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs @@ -1,11 +1,12 @@ //! See [`FullSlice`]. -use crate::virtual_file::{IoBuffer, IoBufferMut}; -use bytes::{Bytes, BytesMut}; use std::ops::{Deref, Range}; + +use bytes::{Bytes, BytesMut}; use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice}; use super::write::CheapCloneForRead; +use crate::virtual_file::{IoBuffer, IoBufferMut}; /// The true owned equivalent for Rust [`slice`]. Use this for the write path. /// diff --git a/pageserver/src/virtual_file/owned_buffers_io/slice.rs b/pageserver/src/virtual_file/owned_buffers_io/slice.rs index 6100593663..9f4a05dd57 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/slice.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/slice.rs @@ -1,7 +1,4 @@ -use tokio_epoll_uring::BoundedBuf; -use tokio_epoll_uring::BoundedBufMut; -use tokio_epoll_uring::IoBufMut; -use tokio_epoll_uring::Slice; +use tokio_epoll_uring::{BoundedBuf, BoundedBufMut, IoBufMut, Slice}; pub(crate) trait SliceMutExt { /// Get a `&mut[0..self.bytes_total()`] slice, for when you need to do borrow-based IO. @@ -35,10 +32,11 @@ where mod tests { use std::io::Read; - use super::*; use bytes::Buf; use tokio_epoll_uring::Slice; + use super::*; + #[test] fn test_slice_full_zeroed() { let make_fake_file = || bytes::BytesMut::from(&b"12345"[..]).reader(); diff --git a/pageserver/src/virtual_file/owned_buffers_io/write.rs b/pageserver/src/virtual_file/owned_buffers_io/write.rs index 7299d83703..861ca3aa2a 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs @@ -1,20 +1,14 @@ mod flush; use std::sync::Arc; +pub(crate) use flush::FlushControl; use flush::FlushHandle; use tokio_epoll_uring::IoBuf; -use crate::{ - context::RequestContext, - virtual_file::{IoBuffer, IoBufferMut}, -}; - -use super::{ - io_buf_aligned::IoBufAligned, - io_buf_ext::{FullSlice, IoBufExt}, -}; - -pub(crate) use flush::FlushControl; +use super::io_buf_aligned::IoBufAligned; +use super::io_buf_ext::{FullSlice, IoBufExt}; +use crate::context::RequestContext; +use crate::virtual_file::{IoBuffer, IoBufferMut}; pub(crate) trait CheapCloneForRead { /// Returns a cheap clone of the buffer. diff --git a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs index 9ce8b311bb..46309d4011 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs @@ -2,12 +2,10 @@ use std::sync::Arc; use utils::sync::duplex; -use crate::{ - context::RequestContext, - virtual_file::owned_buffers_io::{io_buf_aligned::IoBufAligned, io_buf_ext::FullSlice}, -}; - use super::{Buffer, CheapCloneForRead, OwnedAsyncWriter}; +use crate::context::RequestContext; +use crate::virtual_file::owned_buffers_io::io_buf_aligned::IoBufAligned; +use crate::virtual_file::owned_buffers_io::io_buf_ext::FullSlice; /// A handle to the flush task. pub struct FlushHandle { diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 45c87353a7..18df065f76 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -22,39 +22,35 @@ //! bespoken Rust code. use std::collections::HashMap; -use std::sync::Arc; -use std::sync::OnceLock; -use std::time::Duration; -use std::time::Instant; -use std::time::SystemTime; +use std::sync::{Arc, OnceLock}; +use std::time::{Duration, Instant, SystemTime}; -use anyhow::{bail, Result}; +use anyhow::{Result, bail}; use bytes::{Buf, Bytes}; -use tracing::*; - -use crate::context::RequestContext; -use crate::metrics::WAL_INGEST; -use crate::pgdatadir_mapping::{DatadirModification, Version}; -use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; -use crate::tenant::PageReconstructError; -use crate::tenant::Timeline; -use crate::ZERO_PAGE; use pageserver_api::key::rel_block_to_key; use pageserver_api::record::NeonWalRecord; use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; use pageserver_api::shard::ShardIdentity; -use postgres_ffi::fsm_logical_to_physical; -use postgres_ffi::pg_constants; use postgres_ffi::relfile_utils::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::walrecord::*; -use postgres_ffi::TransactionId; -use postgres_ffi::{dispatch_pgversion, enum_pgversion, enum_pgversion_dispatch, TimestampTz}; +use postgres_ffi::{ + TimestampTz, TransactionId, dispatch_pgversion, enum_pgversion, enum_pgversion_dispatch, + fsm_logical_to_physical, pg_constants, +}; +use tracing::*; use utils::bin_ser::SerializeError; use utils::lsn::Lsn; use utils::rate_limit::RateLimit; use utils::{critical, failpoint_support}; use wal_decoder::models::*; +use crate::ZERO_PAGE; +use crate::context::RequestContext; +use crate::metrics::WAL_INGEST; +use crate::pgdatadir_mapping::{DatadirModification, Version}; +use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; +use crate::tenant::{PageReconstructError, Timeline}; + enum_pgversion! {CheckPoint, pgv::CheckPoint} impl CheckPoint { @@ -302,7 +298,9 @@ impl WalIngest { if xid > next_xid { // Wraparound occurred, must be from a prev epoch. if epoch == 0 { - bail!("apparent XID wraparound with prepared transaction XID {xid}, nextXid is {next_full_xid}"); + bail!( + "apparent XID wraparound with prepared transaction XID {xid}, nextXid is {next_full_xid}" + ); } epoch -= 1; } @@ -796,9 +794,7 @@ impl WalIngest { // Remove twophase file. see RemoveTwoPhaseFile() in postgres code trace!( "Drop twophaseFile for xid {} parsed_xact.xid {} here at {}", - xl_xid, - parsed.xid, - lsn, + xl_xid, parsed.xid, lsn, ); let xid: u64 = if modification.tline.pg_version >= 17 { @@ -1130,16 +1126,14 @@ impl WalIngest { let xlog_checkpoint = pgv::CheckPoint::decode(&checkpoint_bytes)?; trace!( "xlog_checkpoint.oldestXid={}, checkpoint.oldestXid={}", - xlog_checkpoint.oldestXid, - cp.oldestXid + xlog_checkpoint.oldestXid, cp.oldestXid ); if (cp.oldestXid.wrapping_sub(xlog_checkpoint.oldestXid) as i32) < 0 { cp.oldestXid = xlog_checkpoint.oldestXid; } trace!( "xlog_checkpoint.oldestActiveXid={}, checkpoint.oldestActiveXid={}", - xlog_checkpoint.oldestActiveXid, - cp.oldestActiveXid + xlog_checkpoint.oldestActiveXid, cp.oldestActiveXid ); // A shutdown checkpoint has `oldestActiveXid == InvalidTransactionid`, @@ -1368,8 +1362,9 @@ impl WalIngest { // with zero pages. Logging is rate limited per pg version to // avoid skewing. if gap_blocks_filled > 0 { - use once_cell::sync::Lazy; use std::sync::Mutex; + + use once_cell::sync::Lazy; use utils::rate_limit::RateLimit; struct RateLimitPerPgVersion { @@ -1475,10 +1470,7 @@ impl WalIngest { if new_nblocks > old_nblocks { trace!( "extending SLRU {:?} seg {} from {} to {} blocks", - kind, - segno, - old_nblocks, - new_nblocks + kind, segno, old_nblocks, new_nblocks ); modification.put_slru_extend(kind, segno, new_nblocks)?; @@ -1517,13 +1509,13 @@ async fn get_relsize( #[allow(clippy::bool_assert_comparison)] #[cfg(test)] mod tests { - use super::*; - use crate::tenant::harness::*; - use crate::tenant::remote_timeline_client::{remote_initdb_archive_path, INITDB_PATH}; - use crate::tenant::storage_layer::IoConcurrency; use postgres_ffi::RELSEG_SIZE; + use super::*; use crate::DEFAULT_PG_VERSION; + use crate::tenant::harness::*; + use crate::tenant::remote_timeline_client::{INITDB_PATH, remote_initdb_archive_path}; + use crate::tenant::storage_layer::IoConcurrency; /// Arbitrary relation tag, for testing. const TESTREL_A: RelTag = RelTag { @@ -1606,10 +1598,12 @@ mod tests { .await?, false ); - assert!(tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx) - .await - .is_err()); + assert!( + tline + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx) + .await + .is_err() + ); assert_eq!( tline .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx) @@ -1997,10 +1991,12 @@ mod tests { .await?, false ); - assert!(tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx) - .await - .is_err()); + assert!( + tline + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx) + .await + .is_err() + ); assert_eq!( tline @@ -2230,9 +2226,10 @@ mod tests { /// without waiting for unrelated steps. #[tokio::test] async fn test_ingest_real_wal() { - use crate::tenant::harness::*; - use postgres_ffi::waldecoder::WalStreamDecoder; use postgres_ffi::WAL_SEGMENT_SIZE; + use postgres_ffi::waldecoder::WalStreamDecoder; + + use crate::tenant::harness::*; // Define test data path and constants. // diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 027a6eb7d7..22d8d83811 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -24,26 +24,27 @@ mod process; /// Code to apply [`NeonWalRecord`]s. pub(crate) mod apply_neon; -use crate::config::PageServerConf; -use crate::metrics::{ - WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM, - WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_TIME, -}; +use std::future::Future; +use std::sync::Arc; +use std::time::{Duration, Instant}; + use anyhow::Context; use bytes::{Bytes, BytesMut}; use pageserver_api::key::Key; use pageserver_api::models::{WalRedoManagerProcessStatus, WalRedoManagerStatus}; use pageserver_api::record::NeonWalRecord; use pageserver_api::shard::TenantShardId; -use std::future::Future; -use std::sync::Arc; -use std::time::Duration; -use std::time::Instant; use tracing::*; use utils::lsn::Lsn; use utils::sync::gate::GateError; use utils::sync::heavier_once_cell; +use crate::config::PageServerConf; +use crate::metrics::{ + WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM, + WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_TIME, +}; + /// The real implementation that uses a Postgres process to /// perform WAL replay. /// @@ -547,15 +548,18 @@ impl PostgresRedoManager { #[cfg(test)] mod tests { - use super::PostgresRedoManager; - use crate::config::PageServerConf; + use std::str::FromStr; + use bytes::Bytes; use pageserver_api::key::Key; use pageserver_api::record::NeonWalRecord; use pageserver_api::shard::TenantShardId; - use std::str::FromStr; use tracing::Instrument; - use utils::{id::TenantId, lsn::Lsn}; + use utils::id::TenantId; + use utils::lsn::Lsn; + + use super::PostgresRedoManager; + use crate::config::PageServerConf; #[tokio::test] async fn test_ping() { diff --git a/pageserver/src/walredo/apply_neon.rs b/pageserver/src/walredo/apply_neon.rs index d62e325310..61ae1eb970 100644 --- a/pageserver/src/walredo/apply_neon.rs +++ b/pageserver/src/walredo/apply_neon.rs @@ -4,13 +4,12 @@ use bytes::BytesMut; use pageserver_api::key::Key; use pageserver_api::record::NeonWalRecord; use pageserver_api::reltag::SlruKind; -use postgres_ffi::pg_constants; use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM; use postgres_ffi::v14::nonrelfile_utils::{ mx_offset_to_flags_bitshift, mx_offset_to_flags_offset, mx_offset_to_member_offset, transaction_id_set_status, }; -use postgres_ffi::BLCKSZ; +use postgres_ffi::{BLCKSZ, pg_constants}; use tracing::*; use utils::lsn::Lsn; diff --git a/pageserver/src/walredo/process.rs b/pageserver/src/walredo/process.rs index bf30b92ea5..6d4a38d4ff 100644 --- a/pageserver/src/walredo/process.rs +++ b/pageserver/src/walredo/process.rs @@ -2,28 +2,28 @@ mod no_leak_child; /// The IPC protocol that pageserver and walredo process speak over their shared pipe. mod protocol; -use self::no_leak_child::NoLeakChild; -use crate::{ - config::PageServerConf, - metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER}, - page_cache::PAGE_SZ, - span::debug_assert_current_span_has_tenant_id, -}; +use std::collections::VecDeque; +use std::process::{Command, Stdio}; +#[cfg(feature = "testing")] +use std::sync::atomic::AtomicUsize; +use std::time::Duration; + use anyhow::Context; use bytes::Bytes; use pageserver_api::record::NeonWalRecord; -use pageserver_api::{reltag::RelTag, shard::TenantShardId}; +use pageserver_api::reltag::RelTag; +use pageserver_api::shard::TenantShardId; use postgres_ffi::BLCKSZ; -#[cfg(feature = "testing")] -use std::sync::atomic::AtomicUsize; -use std::{ - collections::VecDeque, - process::{Command, Stdio}, - time::Duration, -}; use tokio::io::{AsyncReadExt, AsyncWriteExt}; -use tracing::{debug, error, instrument, Instrument}; -use utils::{lsn::Lsn, poison::Poison}; +use tracing::{Instrument, debug, error, instrument}; +use utils::lsn::Lsn; +use utils::poison::Poison; + +use self::no_leak_child::NoLeakChild; +use crate::config::PageServerConf; +use crate::metrics::{WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER, WalRedoKillCause}; +use crate::page_cache::PAGE_SZ; +use crate::span::debug_assert_current_span_has_tenant_id; pub struct WalRedoProcess { #[allow(dead_code)] @@ -136,7 +136,9 @@ impl WalRedoProcess { Ok(0) => break Ok(()), // eof Ok(num_bytes) => { let output = String::from_utf8_lossy(&buf[..num_bytes]); - error!(%output, "received output"); + if !output.contains("LOG:") { + error!(%output, "received output"); + } } Err(e) => { break Err(e); diff --git a/pageserver/src/walredo/process/no_leak_child.rs b/pageserver/src/walredo/process/no_leak_child.rs index 1a0d7039df..9939fc4b36 100644 --- a/pageserver/src/walredo/process/no_leak_child.rs +++ b/pageserver/src/walredo/process/no_leak_child.rs @@ -1,19 +1,11 @@ -use tracing::instrument; -use tracing::{error, info}; - -use crate::metrics::WalRedoKillCause; -use crate::metrics::WAL_REDO_PROCESS_COUNTERS; - use std::io; -use std::process::Command; - -use std::ops::DerefMut; - -use std::ops::Deref; - -use std::process::Child; +use std::ops::{Deref, DerefMut}; +use std::process::{Child, Command}; use pageserver_api::shard::TenantShardId; +use tracing::{error, info, instrument}; + +use crate::metrics::{WAL_REDO_PROCESS_COUNTERS, WalRedoKillCause}; /// Wrapper type around `std::process::Child` which guarantees that the child /// will be killed and waited-for by this process before being dropped. diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index fc1aecd340..49f12bbb9e 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -14,6 +14,8 @@ */ #include "postgres.h" +#include + #include "access/xlog.h" #include "common/hashfn.h" #include "fmgr.h" @@ -61,6 +63,9 @@ int neon_protocol_version = 2; static int max_reconnect_attempts = 60; static int stripe_size; +static int pageserver_response_log_timeout = 10000; +static int pageserver_response_disconnect_timeout = 120000; /* 2 minutes */ + typedef struct { char connstring[MAX_SHARDS][MAX_PAGESERVER_CONNSTRING_SIZE]; @@ -129,6 +134,11 @@ typedef struct uint64 nrequests_sent; uint64 nresponses_received; + /* State for the receive timeout mechanism in call_PQgetCopyData() */ + instr_time receive_start_time; /* when we started waiting */ + instr_time receive_last_log_time; /* when we last printed a log message for the wait */ + bool receive_logged; /* has the wait been logged */ + /*--- * WaitEventSet containing: * - WL_SOCKET_READABLE on 'conn' @@ -661,6 +671,9 @@ pageserver_connect(shardno_t shard_no, int elevel) shard->state = PS_Connected; shard->nrequests_sent = 0; shard->nresponses_received = 0; + INSTR_TIME_SET_ZERO(shard->receive_start_time); + INSTR_TIME_SET_ZERO(shard->receive_last_log_time); + shard->receive_logged = false; } /* FALLTHROUGH */ case PS_Connected: @@ -680,6 +693,33 @@ pageserver_connect(shardno_t shard_no, int elevel) Assert(false); } +static void +get_socket_stats(int socketfd, int *sndbuf, int *recvbuf) +{ + *sndbuf = -1; + *recvbuf = -1; + +#ifdef __linux__ + /* + * get kernel's send and recv queue size via ioctl + * https://elixir.bootlin.com/linux/v6.1.128/source/include/uapi/linux/sockios.h#L25-L27 + */ + if (socketfd != -1) + { + int ioctl_err; + + ioctl_err = ioctl(socketfd, SIOCOUTQ, sndbuf); + if (ioctl_err!= 0) { + *sndbuf = -errno; + } + ioctl_err = ioctl(socketfd, FIONREAD, recvbuf); + if (ioctl_err != 0) { + *recvbuf = -errno; + } + } +#endif +} + /* * A wrapper around PQgetCopyData that checks for interrupts while sleeping. */ @@ -690,46 +730,54 @@ call_PQgetCopyData(shardno_t shard_no, char **buffer) PageServer *shard = &page_servers[shard_no]; PGconn *pageserver_conn = shard->conn; instr_time now, - start_ts, since_start, - last_log_ts, since_last_log; - bool logged = false; - - /* - * As a debugging aid, if we don't get a response for a long time, print a - * log message. - * - * 10 s is a very generous threshold, normally we expect a response in a - * few milliseconds. We have metrics to track latencies in normal ranges, - * but in the cases that take exceptionally long, it's useful to log the - * exact timestamps. - */ -#define LOG_INTERVAL_MS INT64CONST(10 * 1000) - - INSTR_TIME_SET_CURRENT(now); - start_ts = last_log_ts = now; - INSTR_TIME_SET_ZERO(since_last_log); retry: ret = PQgetCopyData(pageserver_conn, buffer, 1 /* async */ ); if (ret == 0) { - WaitEvent event; + WaitEvent occurred_event; + int noccurred; + double log_timeout, + disconnect_timeout; long timeout; - timeout = Max(0, LOG_INTERVAL_MS - INSTR_TIME_GET_MILLISEC(since_last_log)); + /* + * Calculate time elapsed since the start, and since the last progress + * log message. On first call, remember the start time. + */ + INSTR_TIME_SET_CURRENT(now); + if (INSTR_TIME_IS_ZERO(shard->receive_start_time)) + { + shard->receive_start_time = now; + INSTR_TIME_SET_ZERO(since_start); + shard->receive_last_log_time = now; + INSTR_TIME_SET_ZERO(since_last_log); + shard->receive_logged = false; + } + else + { + since_start = now; + INSTR_TIME_SUBTRACT(since_start, shard->receive_start_time); + since_last_log = now; + INSTR_TIME_SUBTRACT(since_last_log, shard->receive_last_log_time); + } - /* Sleep until there's something to do */ - (void) WaitEventSetWait(shard->wes_read, timeout, &event, 1, - WAIT_EVENT_NEON_PS_READ); + /* Sleep until the log or disconnect timeout is reached. */ + log_timeout = Max(0, (double) pageserver_response_log_timeout - INSTR_TIME_GET_MILLISEC(since_last_log)); + disconnect_timeout = Max(0, (double) pageserver_response_disconnect_timeout - INSTR_TIME_GET_MILLISEC(since_start)); + timeout = (long) ceil(Min(log_timeout, disconnect_timeout)); + + noccurred = WaitEventSetWait(shard->wes_read, timeout, &occurred_event, 1, + WAIT_EVENT_NEON_PS_READ); ResetLatch(MyLatch); CHECK_FOR_INTERRUPTS(); /* Data available in socket? */ - if (event.events & WL_SOCKET_READABLE) + if (noccurred > 0 && (occurred_event.events & WL_SOCKET_READABLE) != 0) { if (!PQconsumeInput(pageserver_conn)) { @@ -739,49 +787,61 @@ retry: pfree(msg); return -1; } + goto retry; + } + + /* Timeout was reached, or we were interrupted for some other reason */ + INSTR_TIME_SET_CURRENT(now); + since_last_log = now; + INSTR_TIME_SUBTRACT(since_last_log, shard->receive_last_log_time); + since_start = now; + INSTR_TIME_SUBTRACT(since_start, shard->receive_start_time); + + /* + * As a debugging aid, if we don't get a response to a pageserver request + * for a long time, print a log message. + * + * The default neon.pageserver_response_log_timeout value, 10 s, is + * very generous. Normally we expect a response in a few + * milliseconds. We have metrics to track latencies in normal ranges, + * but in the cases that take exceptionally long, it's useful to log + * the exact timestamps. + */ + if (INSTR_TIME_GET_MILLISEC(since_last_log) >= pageserver_response_log_timeout) + { + int sndbuf; + int recvbuf; + + get_socket_stats(PQsocket(pageserver_conn), &sndbuf, &recvbuf); + + neon_shard_log(shard_no, LOG, + "no response received from pageserver for %0.3f s, still waiting (sent " UINT64_FORMAT " requests, received " UINT64_FORMAT " responses) (socket sndbuf=%d recvbuf=%d)", + INSTR_TIME_GET_DOUBLE(since_start), + shard->nrequests_sent, shard->nresponses_received, sndbuf, recvbuf); + shard->receive_last_log_time = now; + shard->receive_logged = true; } /* - * Print a message to the log if a long time has passed with no - * response. + * If an even longer time has passed without receiving a response from + * the pageserver, disconnect. That triggers a reconnection attempt + * in the caller. + * + * If this happens, the pageserver is likely dead and isn't coming + * back, or there's some kind of a network glitch and the connection + * is permanently gone. Without this, if the pageserver or the network + * connection is dead, it could take a very long time (15 minutes or + * more) until the TCP keepalive timeout notices that. Even if we + * would in fact get a response if we just waited a little longer, + * there's a good chance that we'll get the response sooner by + * reconnecting. */ - INSTR_TIME_SET_CURRENT(now); - since_last_log = now; - INSTR_TIME_SUBTRACT(since_last_log, last_log_ts); - if (INSTR_TIME_GET_MILLISEC(since_last_log) >= LOG_INTERVAL_MS) + if (INSTR_TIME_GET_MILLISEC(since_start) >= pageserver_response_disconnect_timeout) { - int sndbuf = -1; - int recvbuf = -1; -#ifdef __linux__ - int socketfd; -#endif - - since_start = now; - INSTR_TIME_SUBTRACT(since_start, start_ts); - -#ifdef __linux__ - /* - * get kernel's send and recv queue size via ioctl - * https://elixir.bootlin.com/linux/v6.1.128/source/include/uapi/linux/sockios.h#L25-L27 - */ - socketfd = PQsocket(pageserver_conn); - if (socketfd != -1) { - int ioctl_err; - ioctl_err = ioctl(socketfd, SIOCOUTQ, &sndbuf); - if (ioctl_err!= 0) { - sndbuf = -errno; - } - ioctl_err = ioctl(socketfd, FIONREAD, &recvbuf); - if (ioctl_err != 0) { - recvbuf = -errno; - } - } -#endif - neon_shard_log(shard_no, LOG, "no response received from pageserver for %0.3f s, still waiting (sent " UINT64_FORMAT " requests, received " UINT64_FORMAT " responses) (socket sndbuf=%d recvbuf=%d)", - INSTR_TIME_GET_DOUBLE(since_start), - shard->nrequests_sent, shard->nresponses_received, sndbuf, recvbuf); - last_log_ts = now; - logged = true; + neon_shard_log(shard_no, LOG, "no response from pageserver for %0.3f s, disconnecting", + INSTR_TIME_GET_DOUBLE(since_start)); + pageserver_disconnect(shard_no); + return -1; } goto retry; @@ -791,14 +851,18 @@ retry: * If we logged earlier that the response is taking a long time, log * another message when the response is finally received. */ - if (logged) + if (shard->receive_logged) { INSTR_TIME_SET_CURRENT(now); since_start = now; - INSTR_TIME_SUBTRACT(since_start, start_ts); - neon_shard_log(shard_no, LOG, "received response from pageserver after %0.3f s", + INSTR_TIME_SUBTRACT(since_start, shard->receive_start_time); + neon_shard_log(shard_no, LOG, + "received response from pageserver after %0.3f s", INSTR_TIME_GET_DOUBLE(since_start)); } + INSTR_TIME_SET_ZERO(shard->receive_start_time); + INSTR_TIME_SET_ZERO(shard->receive_last_log_time); + shard->receive_logged = false; return ret; } @@ -972,9 +1036,17 @@ pageserver_receive(shardno_t shard_no) pfree(msg); } } + else if (rc == -1 && shard->state == PS_Disconnected) + { + /* If the state is 'Disconnected', the disconnection message was already logged */ + resp = NULL; + } else if (rc == -1) { - neon_shard_log(shard_no, LOG, "pageserver_receive disconnect: psql end of copy data: %s", pchomp(PQerrorMessage(pageserver_conn))); + char *msg = pchomp(PQerrorMessage(pageserver_conn)); + + neon_shard_log(shard_no, LOG, "pageserver_receive disconnect: psql end of copy data: %s", msg); + pfree(msg); pageserver_disconnect(shard_no); resp = NULL; } @@ -1027,6 +1099,10 @@ pageserver_try_receive(shardno_t shard_no) { neon_shard_log(shard_no, LOG, "pageserver_receive: disconnect due to failure while parsing response"); pageserver_disconnect(shard_no); + /* + * Malformed responses from PageServer are a reason to raise + * errors and cancel transactions. + */ PG_RE_THROW(); } PG_END_TRY(); @@ -1050,7 +1126,8 @@ pageserver_try_receive(shardno_t shard_no) char *msg = pchomp(PQerrorMessage(pageserver_conn)); pageserver_disconnect(shard_no); - neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect: could not read COPY data: %s", msg); + neon_shard_log(shard_no, LOG, "pageserver_receive disconnect: could not read COPY data: %s", msg); + resp = NULL; } else { @@ -1249,6 +1326,16 @@ pg_init_libpagestore(void) PGC_USERSET, 0, /* no flags required */ NULL, (GucIntAssignHook) &readahead_buffer_resize, NULL); + DefineCustomIntVariable("neon.readahead_getpage_pull_timeout", + "readahead response pull timeout", + "Time between active tries to pull data from the " + "PageStream connection when we have pages which " + "were read ahead but not yet received.", + &readahead_getpage_pull_timeout_ms, + 0, 0, 5 * 60 * 1000, + PGC_USERSET, + GUC_UNIT_MS, + NULL, NULL, NULL); DefineCustomIntVariable("neon.protocol_version", "Version of compute<->page server protocol", NULL, @@ -1260,6 +1347,26 @@ pg_init_libpagestore(void) 0, /* no flags required */ NULL, NULL, NULL); + DefineCustomIntVariable("neon.pageserver_response_log_timeout", + "pageserver response log timeout", + "If the pageserver doesn't respond to a request within this timeout, " + "a message is printed to the log.", + &pageserver_response_log_timeout, + 10000, 100, INT_MAX, + PGC_SUSET, + GUC_UNIT_MS, + NULL, NULL, NULL); + + DefineCustomIntVariable("neon.pageserver_response_disconnect_timeout", + "pageserver response diconnect timeout", + "If the pageserver doesn't respond to a request within this timeout, " + "disconnect and reconnect.", + &pageserver_response_disconnect_timeout, + 120000, 100, INT_MAX, + PGC_SUSET, + GUC_UNIT_MS, + NULL, NULL, NULL); + relsize_hash_init(); if (page_server != NULL) diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index 700a942284..4b448ba5f6 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -12,6 +12,7 @@ #include "fmgr.h" #include "miscadmin.h" +#include "pgstat.h" #include "access/subtrans.h" #include "access/twophase.h" #include "access/xlog.h" @@ -410,6 +411,16 @@ ReportSearchPath(void) } } +#if PG_VERSION_NUM < 150000 +/* + * PG14 uses separate backend for stats collector having no access to shared memory. + * As far as AUX mechanism requires access to shared memory, persisting pgstat.stat file + * is not supported in PG14. And so there is no definition of neon_pgstat_file_size_limit + * variable, so we have to declare it here. + */ +static int neon_pgstat_file_size_limit; +#endif + void _PG_init(void) { @@ -426,6 +437,7 @@ _PG_init(void) pg_init_libpagestore(); pg_init_walproposer(); + pagestore_smgr_init(); Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines; InitUnstableExtensionsSupport(); @@ -467,6 +479,15 @@ _PG_init(void) 0, NULL, NULL, NULL); + DefineCustomIntVariable("neon.pgstat_file_size_limit", + "Maximal size of pgstat.stat file saved in Neon storage", + "Zero value disables persisting pgstat.stat file", + &neon_pgstat_file_size_limit, + 0, 0, 1000000, /* disabled by default */ + PGC_SIGHUP, + GUC_UNIT_KB, + NULL, NULL, NULL); + /* * Important: This must happen after other parts of the extension are * loaded, otherwise any settings to GUCs that were set before the diff --git a/pgxn/neon/neon.h b/pgxn/neon/neon.h index 912e09c3d3..7686ce076b 100644 --- a/pgxn/neon/neon.h +++ b/pgxn/neon/neon.h @@ -22,6 +22,7 @@ extern char *neon_tenant; extern char *wal_acceptors_list; extern int wal_acceptor_reconnect_timeout; extern int wal_acceptor_connection_timeout; +extern int readahead_getpage_pull_timeout_ms; #if PG_MAJORVERSION_NUM >= 17 extern uint32 WAIT_EVENT_NEON_LFC_MAINTENANCE; @@ -49,6 +50,7 @@ extern uint32 WAIT_EVENT_NEON_WAL_DL; extern void pg_init_libpagestore(void); extern void pg_init_walproposer(void); +extern void pagestore_smgr_init(void); extern uint64 BackpressureThrottlingTime(void); extern void SetNeonCurrentClusterSize(uint64 size); diff --git a/pgxn/neon/neon_utils.c b/pgxn/neon/neon_utils.c index 1fb4ed9522..1fad44bd58 100644 --- a/pgxn/neon/neon_utils.c +++ b/pgxn/neon/neon_utils.c @@ -51,6 +51,26 @@ HexDecodeString(uint8 *result, char *input, int nbytes) return true; } +/* -------------------------------- + * pq_getmsgint16 - get a binary 2-byte int from a message buffer + * -------------------------------- + */ +uint16 +pq_getmsgint16(StringInfo msg) +{ + return pq_getmsgint(msg, 2); +} + +/* -------------------------------- + * pq_getmsgint32 - get a binary 4-byte int from a message buffer + * -------------------------------- + */ +uint32 +pq_getmsgint32(StringInfo msg) +{ + return pq_getmsgint(msg, 4); +} + /* -------------------------------- * pq_getmsgint32_le - get a binary 4-byte int from a message buffer in native (LE) order * -------------------------------- diff --git a/pgxn/neon/neon_utils.h b/pgxn/neon/neon_utils.h index 89683714f1..7480ac28cc 100644 --- a/pgxn/neon/neon_utils.h +++ b/pgxn/neon/neon_utils.h @@ -8,6 +8,8 @@ #endif bool HexDecodeString(uint8 *result, char *input, int nbytes); +uint16 pq_getmsgint16(StringInfo msg); +uint32 pq_getmsgint32(StringInfo msg); uint32 pq_getmsgint32_le(StringInfo msg); uint64 pq_getmsgint64_le(StringInfo msg); void pq_sendint32_le(StringInfo buf, uint32 i); diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index 9faab1e4f0..475697f9c0 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -209,7 +209,11 @@ typedef struct NeonResponse *(*receive) (shardno_t shard_no); /* * Try get the next response from the TCP buffers, if any. - * Returns NULL when the data is not yet available. + * Returns NULL when the data is not yet available. + * + * This will raise errors only for malformed responses (we can't put them + * back into connection). All other error conditions are soft errors and + * return NULL as "no response available". */ NeonResponse *(*try_receive) (shardno_t shard_no); /* diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 4a79acd777..fe463fd4a6 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -65,10 +65,12 @@ #include "storage/fsm_internals.h" #include "storage/md.h" #include "storage/smgr.h" +#include "utils/timeout.h" +#include "bitmap.h" +#include "neon.h" #include "neon_perf_counters.h" #include "pagestore_client.h" -#include "bitmap.h" #if PG_VERSION_NUM >= 150000 #include "access/xlogrecovery.h" @@ -123,6 +125,45 @@ static BlockNumber neon_nblocks(SMgrRelation reln, ForkNumber forknum); static uint32 local_request_counter; #define GENERATE_REQUEST_ID() (((NeonRequestId)MyProcPid << 32) | ++local_request_counter) +/* + * Various settings related to prompt (fast) handling of PageStream responses + * at any CHECK_FOR_INTERRUPTS point. + */ +int readahead_getpage_pull_timeout_ms = 0; +static int PS_TIMEOUT_ID = 0; +static bool timeout_set = false; +static bool timeout_signaled = false; + +/* + * We have a CHECK_FOR_INTERRUPTS in page_server->receive(), and we don't want + * that to handle any getpage responses if we're already working on the + * backlog of those, as we'd hit issues with determining which prefetch slot + * we just got a response for. + * + * To protect against that, we have this variable that's set whenever we start + * receiving data for prefetch slots, so that we don't get confused. + * + * Note that in certain error cases during readpage we may leak r_r_g=true, + * which results in a failure to pick up further responses until we first + * actively try to receive new getpage responses. + */ +static bool readpage_reentrant_guard = false; + +static void reconfigure_timeout_if_needed(void); +static void pagestore_timeout_handler(void); + +#define START_PREFETCH_RECEIVE_WORK() \ + do { \ + readpage_reentrant_guard = true; \ + } while (false) + +#define END_PREFETCH_RECEIVE_WORK() \ + do { \ + readpage_reentrant_guard = false; \ + if (unlikely(timeout_signaled && !InterruptPending)) \ + InterruptPending = true; \ + } while (false) + /* * Prefetch implementation: * @@ -221,7 +262,6 @@ typedef struct PrfHashEntry #define SH_DEFINE #define SH_DECLARE #include "lib/simplehash.h" -#include "neon.h" /* * PrefetchState maintains the state of (prefetch) getPage@LSN requests. @@ -407,17 +447,26 @@ compact_prefetch_buffers(void) } /* - * If there might be responses still in the TCP buffer, then - * we should try to use those, so as to reduce any TCP backpressure - * on the OS/PS side. + * If there might be responses still in the TCP buffer, then we should try to + * use those, to reduce any TCP backpressure on the OS/PS side. * * This procedure handles that. * - * Note that this is only valid as long as the only pipelined - * operations in the TCP buffer are getPage@Lsn requests. + * Note that this works because we don't pipeline non-getPage requests. + * + * NOTE: This procedure is not allowed to throw errors that should be handled + * by SMGR-related code, as this can be called from every CHECK_FOR_INTERRUPTS + * point inside and outside PostgreSQL. + * + * This still does throw errors when it receives malformed responses from PS. + * + * When we're not called from CHECK_FOR_INTERRUPTS (indicated by + * IsHandlingInterrupts) we also report we've ended prefetch receive work, + * just in case state tracking was lost due to an error in the sync getPage + * response code. */ static void -prefetch_pump_state(void) +prefetch_pump_state(bool IsHandlingInterrupts) { while (MyPState->ring_receive != MyPState->ring_flush) { @@ -466,6 +515,12 @@ prefetch_pump_state(void) } } } + + /* We never pump the prefetch state while handling other pages */ + if (!IsHandlingInterrupts) + END_PREFETCH_RECEIVE_WORK(); + + reconfigure_timeout_if_needed(); } void @@ -581,8 +636,8 @@ readahead_buffer_resize(int newsize, void *extra) /* * Make sure that there are no responses still in the buffer. * - * NOTE: this function may indirectly update MyPState->pfs_hash; which - * invalidates any active pointers into the hash table. + * This function may indirectly update MyPState->pfs_hash; which invalidates + * any active pointers into the hash table. */ static void consume_prefetch_responses(void) @@ -639,6 +694,7 @@ static bool prefetch_wait_for(uint64 ring_index) { PrefetchRequest *entry; + bool result = true; if (MyPState->ring_flush <= ring_index && MyPState->ring_unused > MyPState->ring_flush) @@ -652,13 +708,21 @@ prefetch_wait_for(uint64 ring_index) while (MyPState->ring_receive <= ring_index) { + START_PREFETCH_RECEIVE_WORK(); entry = GetPrfSlot(MyPState->ring_receive); Assert(entry->status == PRFS_REQUESTED); if (!prefetch_read(entry)) - return false; + { + result = false; + break; + } + + END_PREFETCH_RECEIVE_WORK(); + CHECK_FOR_INTERRUPTS(); } - return true; + + return result; } /* @@ -967,6 +1031,7 @@ prefetch_lookupv(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blocknum, n BITMAP_SET(mask, i); hits += 1; + inc_getpage_wait(0); } } pgBufferUsage.prefetch.hits += hits; @@ -1315,6 +1380,12 @@ page_server_request(void const *req) page_server->disconnect(shard_no); MyNeonCounters->pageserver_open_requests = 0; + /* + * We know for sure we're not working on any prefetch pages after + * this. + */ + END_PREFETCH_RECEIVE_WORK(); + PG_RE_THROW(); } PG_END_TRY(); @@ -2942,7 +3013,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, MyPState->ring_last <= ring_index); } - prefetch_pump_state(); + prefetch_pump_state(false); return false; } @@ -2985,7 +3056,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) Assert(ring_index < MyPState->ring_unused && MyPState->ring_last <= ring_index); - prefetch_pump_state(); + prefetch_pump_state(false); return false; } @@ -3029,7 +3100,7 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum, */ neon_log(SmgrTrace, "writeback noop"); - prefetch_pump_state(); + prefetch_pump_state(false); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) @@ -3277,7 +3348,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer } /* Try to read PS results if they are available */ - prefetch_pump_state(); + prefetch_pump_state(false); neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1); @@ -3299,7 +3370,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer /* * Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes. */ - prefetch_pump_state(); + prefetch_pump_state(false); #ifdef DEBUG_COMPARE_LOCAL if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) @@ -3410,7 +3481,7 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, nblocks, PG_IOV_MAX); /* Try to read PS results if they are available */ - prefetch_pump_state(); + prefetch_pump_state(false); neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns, nblocks); @@ -3455,7 +3526,7 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, /* * Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes. */ - prefetch_pump_state(); + prefetch_pump_state(false); #ifdef DEBUG_COMPARE_LOCAL if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) @@ -3625,7 +3696,7 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer); - prefetch_pump_state(); + prefetch_pump_state(false); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) @@ -3680,7 +3751,7 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, lfc_writev(InfoFromSMgrRel(reln), forknum, blkno, buffers, nblocks); - prefetch_pump_state(); + prefetch_pump_state(false); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) @@ -3971,7 +4042,7 @@ neon_immedsync(SMgrRelation reln, ForkNumber forknum) neon_log(SmgrTrace, "[NEON_SMGR] immedsync noop"); - prefetch_pump_state(); + prefetch_pump_state(false); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) @@ -4272,6 +4343,7 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf } pfree(resp); + reconfigure_timeout_if_needed(); return n_blocks; } @@ -4307,6 +4379,7 @@ AtEOXact_neon(XactEvent event, void *arg) } break; } + reconfigure_timeout_if_needed(); } static const struct f_smgr neon_smgr = @@ -4563,3 +4636,94 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id) } return no_redo_needed; } + +static void +reconfigure_timeout_if_needed(void) +{ + bool needs_set = MyPState->ring_receive != MyPState->ring_unused && + readahead_getpage_pull_timeout_ms > 0; + + if (needs_set != timeout_set) + { + /* The background writer doens't (shouldn't) read any pages */ + Assert(!AmBackgroundWriterProcess()); + /* The checkpointer doens't (shouldn't) read any pages */ + Assert(!AmCheckpointerProcess()); + + if (unlikely(PS_TIMEOUT_ID == 0)) + { + PS_TIMEOUT_ID = RegisterTimeout(USER_TIMEOUT, pagestore_timeout_handler); + } + + if (needs_set) + { +#if PG_MAJORVERSION_NUM <= 14 + enable_timeout_after(PS_TIMEOUT_ID, readahead_getpage_pull_timeout_ms); +#else + enable_timeout_every( + PS_TIMEOUT_ID, + TimestampTzPlusMilliseconds(GetCurrentTimestamp(), + readahead_getpage_pull_timeout_ms), + readahead_getpage_pull_timeout_ms + ); +#endif + timeout_set = true; + } + else + { + Assert(timeout_set); + disable_timeout(PS_TIMEOUT_ID, false); + timeout_set = false; + } + } +} + +static void +pagestore_timeout_handler(void) +{ +#if PG_MAJORVERSION_NUM <= 14 + /* + * PG14: Setting a repeating timeout is not possible, so we signal here + * that the timeout has already been reset, and by telling the system + * that system will re-schedule it later if we need to. + */ + timeout_set = false; +#endif + timeout_signaled = true; + InterruptPending = true; +} + +static process_interrupts_callback_t prev_interrupt_cb; + +/* + * Process new data received in our active PageStream sockets. + * + * This relies on the invariant that all pipelined yet-to-be-received requests + * are getPage requests managed by MyPState. This is currently true, any + * modification will probably require some stuff to make it work again. + */ +static bool +pagestore_smgr_processinterrupts(void) +{ + if (timeout_signaled) + { + if (!readpage_reentrant_guard && readahead_getpage_pull_timeout_ms > 0) + prefetch_pump_state(true); + + timeout_signaled = false; + reconfigure_timeout_if_needed(); + } + + if (!prev_interrupt_cb) + return false; + + return prev_interrupt_cb(); +} + + +void +pagestore_smgr_init(void) +{ + prev_interrupt_cb = ProcessInterruptsCallback; + ProcessInterruptsCallback = pagestore_smgr_processinterrupts; +} diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index 7472fd6afc..356895aa82 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -70,6 +70,7 @@ static bool SendAppendRequests(Safekeeper *sk); static bool RecvAppendResponses(Safekeeper *sk); static XLogRecPtr CalculateMinFlushLsn(WalProposer *wp); static XLogRecPtr GetAcknowledgedByQuorumWALPosition(WalProposer *wp); +static void PAMessageSerialize(WalProposer *wp, ProposerAcceptorMessage *msg, StringInfo buf, int proto_version); static void HandleSafekeeperResponse(WalProposer *wp, Safekeeper *sk); static bool AsyncRead(Safekeeper *sk, char **buf, int *buf_size); static bool AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg); @@ -81,6 +82,8 @@ static char *FormatSafekeeperState(Safekeeper *sk); static void AssertEventsOkForState(uint32 events, Safekeeper *sk); static char *FormatEvents(WalProposer *wp, uint32 events); static void UpdateDonorShmem(WalProposer *wp); +static char *MembershipConfigurationToString(MembershipConfiguration *mconf); +static void MembershipConfigurationFree(MembershipConfiguration *mconf); WalProposer * WalProposerCreate(WalProposerConfig *config, walproposer_api api) @@ -137,25 +140,21 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api) } wp->quorum = wp->n_safekeepers / 2 + 1; + if (wp->config->proto_version != 2 && wp->config->proto_version != 3) + wp_log(FATAL, "unsupported safekeeper protocol version %d", wp->config->proto_version); + wp_log(LOG, "using safekeeper protocol version %d", wp->config->proto_version); + /* Fill the greeting package */ - wp->greetRequest.tag = 'g'; - wp->greetRequest.protocolVersion = SK_PROTOCOL_VERSION; - wp->greetRequest.pgVersion = PG_VERSION_NUM; - wp->api.strong_random(wp, &wp->greetRequest.proposerId, sizeof(wp->greetRequest.proposerId)); - wp->greetRequest.systemId = wp->config->systemId; - if (!wp->config->neon_timeline) - wp_log(FATAL, "neon.timeline_id is not provided"); - if (*wp->config->neon_timeline != '\0' && - !HexDecodeString(wp->greetRequest.timeline_id, wp->config->neon_timeline, 16)) - wp_log(FATAL, "could not parse neon.timeline_id, %s", wp->config->neon_timeline); + wp->greetRequest.pam.tag = 'g'; if (!wp->config->neon_tenant) wp_log(FATAL, "neon.tenant_id is not provided"); - if (*wp->config->neon_tenant != '\0' && - !HexDecodeString(wp->greetRequest.tenant_id, wp->config->neon_tenant, 16)) - wp_log(FATAL, "could not parse neon.tenant_id, %s", wp->config->neon_tenant); - - wp->greetRequest.timeline = wp->config->pgTimeline; - wp->greetRequest.walSegSize = wp->config->wal_segment_size; + wp->greetRequest.tenant_id = wp->config->neon_tenant; + if (!wp->config->neon_timeline) + wp_log(FATAL, "neon.timeline_id is not provided"); + wp->greetRequest.timeline_id = wp->config->neon_timeline; + wp->greetRequest.pg_version = PG_VERSION_NUM; + wp->greetRequest.system_id = wp->config->systemId; + wp->greetRequest.wal_seg_size = wp->config->wal_segment_size; wp->api.init_event_set(wp); @@ -165,12 +164,14 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api) void WalProposerFree(WalProposer *wp) { + MembershipConfigurationFree(&wp->mconf); for (int i = 0; i < wp->n_safekeepers; i++) { Safekeeper *sk = &wp->safekeeper[i]; Assert(sk->outbuf.data != NULL); pfree(sk->outbuf.data); + MembershipConfigurationFree(&sk->greetResponse.mconf); if (sk->voteResponse.termHistory.entries) pfree(sk->voteResponse.termHistory.entries); sk->voteResponse.termHistory.entries = NULL; @@ -308,6 +309,7 @@ ShutdownConnection(Safekeeper *sk) sk->state = SS_OFFLINE; sk->streamingAt = InvalidXLogRecPtr; + MembershipConfigurationFree(&sk->greetResponse.mconf); if (sk->voteResponse.termHistory.entries) pfree(sk->voteResponse.termHistory.entries); sk->voteResponse.termHistory.entries = NULL; @@ -598,11 +600,14 @@ static void SendStartWALPush(Safekeeper *sk) { WalProposer *wp = sk->wp; +#define CMD_LEN 512 + char cmd[CMD_LEN]; - if (!wp->api.conn_send_query(sk, "START_WAL_PUSH")) + snprintf(cmd, CMD_LEN, "START_WAL_PUSH (proto_version '%d')", wp->config->proto_version); + if (!wp->api.conn_send_query(sk, cmd)) { - wp_log(WARNING, "failed to send 'START_WAL_PUSH' query to safekeeper %s:%s: %s", - sk->host, sk->port, wp->api.conn_error_message(sk)); + wp_log(WARNING, "failed to send '%s' query to safekeeper %s:%s: %s", + cmd, sk->host, sk->port, wp->api.conn_error_message(sk)); ShutdownConnection(sk); return; } @@ -658,23 +663,33 @@ RecvStartWALPushResult(Safekeeper *sk) /* * Start handshake: first of all send information about the - * safekeeper. After sending, we wait on SS_HANDSHAKE_RECV for + * walproposer. After sending, we wait on SS_HANDSHAKE_RECV for * a response to finish the handshake. */ static void SendProposerGreeting(Safekeeper *sk) { + WalProposer *wp = sk->wp; + char *mconf_toml = MembershipConfigurationToString(&wp->greetRequest.mconf); + + wp_log(LOG, "sending ProposerGreeting to safekeeper %s:%s with mconf = %s", sk->host, sk->port, mconf_toml); + pfree(mconf_toml); + + PAMessageSerialize(wp, (ProposerAcceptorMessage *) &wp->greetRequest, + &sk->outbuf, wp->config->proto_version); + /* * On failure, logging & resetting the connection is handled. We just need * to handle the control flow. */ - BlockingWrite(sk, &sk->wp->greetRequest, sizeof(sk->wp->greetRequest), SS_HANDSHAKE_RECV); + BlockingWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_HANDSHAKE_RECV); } static void RecvAcceptorGreeting(Safekeeper *sk) { WalProposer *wp = sk->wp; + char *mconf_toml; /* * If our reading doesn't immediately succeed, any necessary error @@ -685,7 +700,10 @@ RecvAcceptorGreeting(Safekeeper *sk) if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->greetResponse)) return; - wp_log(LOG, "received AcceptorGreeting from safekeeper %s:%s, term=" INT64_FORMAT, sk->host, sk->port, sk->greetResponse.term); + mconf_toml = MembershipConfigurationToString(&sk->greetResponse.mconf); + wp_log(LOG, "received AcceptorGreeting from safekeeper %s:%s, node_id = %lu, mconf = %s, term=" UINT64_FORMAT, + sk->host, sk->port, sk->greetResponse.nodeId, mconf_toml, sk->greetResponse.term); + pfree(mconf_toml); /* Protocol is all good, move to voting. */ sk->state = SS_VOTING; @@ -707,12 +725,9 @@ RecvAcceptorGreeting(Safekeeper *sk) wp->propTerm++; wp_log(LOG, "proposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT, wp->quorum, wp->propTerm); - wp->voteRequest = (VoteRequest) - { - .tag = 'v', - .term = wp->propTerm - }; - memcpy(wp->voteRequest.proposerId.data, wp->greetRequest.proposerId.data, UUID_LEN); + wp->voteRequest.pam.tag = 'v'; + wp->voteRequest.generation = wp->mconf.generation; + wp->voteRequest.term = wp->propTerm; } } else if (sk->greetResponse.term > wp->propTerm) @@ -759,12 +774,14 @@ SendVoteRequest(Safekeeper *sk) { WalProposer *wp = sk->wp; - /* We have quorum for voting, send our vote request */ - wp_log(LOG, "requesting vote from %s:%s for term " UINT64_FORMAT, sk->host, sk->port, wp->voteRequest.term); - /* On failure, logging & resetting is handled */ - if (!BlockingWrite(sk, &wp->voteRequest, sizeof(wp->voteRequest), SS_WAIT_VERDICT)) - return; + PAMessageSerialize(wp, (ProposerAcceptorMessage *) &wp->voteRequest, + &sk->outbuf, wp->config->proto_version); + /* We have quorum for voting, send our vote request */ + wp_log(LOG, "requesting vote from %s:%s for generation %u term " UINT64_FORMAT, sk->host, sk->port, + wp->voteRequest.generation, wp->voteRequest.term); + /* On failure, logging & resetting is handled */ + BlockingWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_WAIT_VERDICT); /* If successful, wait for read-ready with SS_WAIT_VERDICT */ } @@ -778,11 +795,12 @@ RecvVoteResponse(Safekeeper *sk) return; wp_log(LOG, - "got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X, timelineStartLsn=%X/%X", - sk->host, sk->port, sk->voteResponse.voteGiven, GetHighestTerm(&sk->voteResponse.termHistory), + "got VoteResponse from acceptor %s:%s, generation=%u, term=%lu, voteGiven=%u, last_log_term=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X", + sk->host, sk->port, sk->voteResponse.generation, sk->voteResponse.term, + sk->voteResponse.voteGiven, + GetHighestTerm(&sk->voteResponse.termHistory), LSN_FORMAT_ARGS(sk->voteResponse.flushLsn), - LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn), - LSN_FORMAT_ARGS(sk->voteResponse.timelineStartLsn)); + LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn)); /* * In case of acceptor rejecting our vote, bail out, but only if either it @@ -847,9 +865,9 @@ HandleElectedProposer(WalProposer *wp) * otherwise we must be sync-safekeepers and we have nothing to do then. * * Proceeding is not only pointless but harmful, because we'd give - * safekeepers term history starting with 0/0. These hacks will go away once - * we disable implicit timeline creation on safekeepers and create it with - * non zero LSN from the start. + * safekeepers term history starting with 0/0. These hacks will go away + * once we disable implicit timeline creation on safekeepers and create it + * with non zero LSN from the start. */ if (wp->propEpochStartLsn == InvalidXLogRecPtr) { @@ -942,7 +960,6 @@ DetermineEpochStartLsn(WalProposer *wp) wp->propEpochStartLsn = InvalidXLogRecPtr; wp->donorEpoch = 0; wp->truncateLsn = InvalidXLogRecPtr; - wp->timelineStartLsn = InvalidXLogRecPtr; for (int i = 0; i < wp->n_safekeepers; i++) { @@ -959,20 +976,6 @@ DetermineEpochStartLsn(WalProposer *wp) wp->donor = i; } wp->truncateLsn = Max(wp->safekeeper[i].voteResponse.truncateLsn, wp->truncateLsn); - - if (wp->safekeeper[i].voteResponse.timelineStartLsn != InvalidXLogRecPtr) - { - /* timelineStartLsn should be the same everywhere or unknown */ - if (wp->timelineStartLsn != InvalidXLogRecPtr && - wp->timelineStartLsn != wp->safekeeper[i].voteResponse.timelineStartLsn) - { - wp_log(WARNING, - "inconsistent timelineStartLsn: current %X/%X, received %X/%X", - LSN_FORMAT_ARGS(wp->timelineStartLsn), - LSN_FORMAT_ARGS(wp->safekeeper[i].voteResponse.timelineStartLsn)); - } - wp->timelineStartLsn = wp->safekeeper[i].voteResponse.timelineStartLsn; - } } } @@ -995,22 +998,11 @@ DetermineEpochStartLsn(WalProposer *wp) if (wp->propEpochStartLsn == InvalidXLogRecPtr && !wp->config->syncSafekeepers) { wp->propEpochStartLsn = wp->truncateLsn = wp->api.get_redo_start_lsn(wp); - if (wp->timelineStartLsn == InvalidXLogRecPtr) - { - wp->timelineStartLsn = wp->api.get_redo_start_lsn(wp); - } wp_log(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(wp->propEpochStartLsn)); } pg_atomic_write_u64(&wp->api.get_shmem_state(wp)->propEpochStartLsn, wp->propEpochStartLsn); - /* - * Safekeepers are setting truncateLsn after timelineStartLsn is known, so - * it should never be zero at this point, if we know timelineStartLsn. - * - * timelineStartLsn can be zero only on the first syncSafekeepers run. - */ - Assert((wp->truncateLsn != InvalidXLogRecPtr) || - (wp->config->syncSafekeepers && wp->truncateLsn == wp->timelineStartLsn)); + Assert(wp->truncateLsn != InvalidXLogRecPtr || wp->config->syncSafekeepers); /* * We will be generating WAL since propEpochStartLsn, so we should set @@ -1053,10 +1045,11 @@ DetermineEpochStartLsn(WalProposer *wp) if (SkipXLogPageHeader(wp, wp->propEpochStartLsn) != wp->api.get_redo_start_lsn(wp)) { /* - * However, allow to proceed if last_log_term on the node which gave - * the highest vote (i.e. point where we are going to start writing) - * actually had been won by me; plain restart of walproposer not - * intervened by concurrent compute which wrote WAL is ok. + * However, allow to proceed if last_log_term on the node which + * gave the highest vote (i.e. point where we are going to start + * writing) actually had been won by me; plain restart of + * walproposer not intervened by concurrent compute which wrote + * WAL is ok. * * This avoids compute crash after manual term_bump. */ @@ -1126,14 +1119,8 @@ SendProposerElected(Safekeeper *sk) { /* safekeeper is empty or no common point, start from the beginning */ sk->startStreamingAt = wp->propTermHistory.entries[0].lsn; - wp_log(LOG, "no common point with sk %s:%s, streaming since first term at %X/%X, timelineStartLsn=%X/%X, termHistory.n_entries=%u", - sk->host, sk->port, LSN_FORMAT_ARGS(sk->startStreamingAt), LSN_FORMAT_ARGS(wp->timelineStartLsn), wp->propTermHistory.n_entries); - - /* - * wp->timelineStartLsn == InvalidXLogRecPtr can be only when timeline - * is created manually (test_s3_wal_replay) - */ - Assert(sk->startStreamingAt == wp->timelineStartLsn || wp->timelineStartLsn == InvalidXLogRecPtr); + wp_log(LOG, "no common point with sk %s:%s, streaming since first term at %X/%X, termHistory.n_entries=%u", + sk->host, sk->port, LSN_FORMAT_ARGS(sk->startStreamingAt), wp->propTermHistory.n_entries); } else { @@ -1158,29 +1145,19 @@ SendProposerElected(Safekeeper *sk) Assert(sk->startStreamingAt <= wp->availableLsn); - msg.tag = 'e'; + msg.apm.tag = 'e'; + msg.generation = wp->mconf.generation; msg.term = wp->propTerm; msg.startStreamingAt = sk->startStreamingAt; msg.termHistory = &wp->propTermHistory; - msg.timelineStartLsn = wp->timelineStartLsn; lastCommonTerm = idx >= 0 ? wp->propTermHistory.entries[idx].term : 0; wp_log(LOG, - "sending elected msg to node " UINT64_FORMAT " term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s, timelineStartLsn=%X/%X", - sk->greetResponse.nodeId, msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port, LSN_FORMAT_ARGS(msg.timelineStartLsn)); - - resetStringInfo(&sk->outbuf); - pq_sendint64_le(&sk->outbuf, msg.tag); - pq_sendint64_le(&sk->outbuf, msg.term); - pq_sendint64_le(&sk->outbuf, msg.startStreamingAt); - pq_sendint32_le(&sk->outbuf, msg.termHistory->n_entries); - for (int i = 0; i < msg.termHistory->n_entries; i++) - { - pq_sendint64_le(&sk->outbuf, msg.termHistory->entries[i].term); - pq_sendint64_le(&sk->outbuf, msg.termHistory->entries[i].lsn); - } - pq_sendint64_le(&sk->outbuf, msg.timelineStartLsn); + "sending elected msg to node " UINT64_FORMAT " generation=%u term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s", + sk->greetResponse.nodeId, msg.generation, msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), + lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port); + PAMessageSerialize(wp, (ProposerAcceptorMessage *) &msg, &sk->outbuf, wp->config->proto_version); if (!AsyncWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_SEND_ELECTED_FLUSH)) return; @@ -1246,14 +1223,13 @@ static void PrepareAppendRequest(WalProposer *wp, AppendRequestHeader *req, XLogRecPtr beginLsn, XLogRecPtr endLsn) { Assert(endLsn >= beginLsn); - req->tag = 'a'; + req->apm.tag = 'a'; + req->generation = wp->mconf.generation; req->term = wp->propTerm; - req->epochStartLsn = wp->propEpochStartLsn; req->beginLsn = beginLsn; req->endLsn = endLsn; req->commitLsn = wp->commitLsn; req->truncateLsn = wp->truncateLsn; - req->proposerId = wp->greetRequest.proposerId; } /* @@ -1354,7 +1330,8 @@ SendAppendRequests(Safekeeper *sk) resetStringInfo(&sk->outbuf); /* write AppendRequest header */ - appendBinaryStringInfo(&sk->outbuf, (char *) req, sizeof(AppendRequestHeader)); + PAMessageSerialize(wp, (ProposerAcceptorMessage *) req, &sk->outbuf, wp->config->proto_version); + /* prepare for reading WAL into the outbuf */ enlargeStringInfo(&sk->outbuf, req->endLsn - req->beginLsn); sk->active_state = SS_ACTIVE_READ_WAL; } @@ -1367,14 +1344,17 @@ SendAppendRequests(Safekeeper *sk) req = &sk->appendRequest; req_len = req->endLsn - req->beginLsn; - /* We send zero sized AppenRequests as heartbeats; don't wal_read for these. */ + /* + * We send zero sized AppenRequests as heartbeats; don't wal_read + * for these. + */ if (req_len > 0) { switch (wp->api.wal_read(sk, - &sk->outbuf.data[sk->outbuf.len], - req->beginLsn, - req_len, - &errmsg)) + &sk->outbuf.data[sk->outbuf.len], + req->beginLsn, + req_len, + &errmsg)) { case NEON_WALREAD_SUCCESS: break; @@ -1382,7 +1362,7 @@ SendAppendRequests(Safekeeper *sk) return true; case NEON_WALREAD_ERROR: wp_log(WARNING, "WAL reading for node %s:%s failed: %s", - sk->host, sk->port, errmsg); + sk->host, sk->port, errmsg); ShutdownConnection(sk); return false; default: @@ -1470,11 +1450,11 @@ RecvAppendResponses(Safekeeper *sk) * Term has changed to higher one, probably another compute is * running. If this is the case we could PANIC as well because * likely it inserted some data and our basebackup is unsuitable - * anymore. However, we also bump term manually (term_bump endpoint) - * on safekeepers for migration purposes, in this case we do want - * compute to stay alive. So restart walproposer with FATAL instead - * of panicking; if basebackup is spoiled next election will notice - * this. + * anymore. However, we also bump term manually (term_bump + * endpoint) on safekeepers for migration purposes, in this case + * we do want compute to stay alive. So restart walproposer with + * FATAL instead of panicking; if basebackup is spoiled next + * election will notice this. */ wp_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT ", meaning another compute is running at the same time, and it conflicts with us", sk->host, sk->port, @@ -1509,7 +1489,7 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese for (i = 0; i < nkeys; i++) { - const char *key = pq_getmsgstring(reply_message); + const char *key = pq_getmsgrawstring(reply_message); unsigned int value_len = pq_getmsgint(reply_message, sizeof(int32)); if (strcmp(key, "current_timeline_size") == 0) @@ -1750,6 +1730,213 @@ HandleSafekeeperResponse(WalProposer *wp, Safekeeper *fromsk) } } +/* Serialize MembershipConfiguration into buf. */ +static void +MembershipConfigurationSerialize(MembershipConfiguration *mconf, StringInfo buf) +{ + uint32 i; + + pq_sendint32(buf, mconf->generation); + + pq_sendint32(buf, mconf->members.len); + for (i = 0; i < mconf->members.len; i++) + { + pq_sendint64(buf, mconf->members.m[i].node_id); + pq_send_ascii_string(buf, mconf->members.m[i].host); + pq_sendint16(buf, mconf->members.m[i].port); + } + + /* + * There is no special mark for absent new_members; zero members in + * invalid, so zero len means absent. + */ + pq_sendint32(buf, mconf->new_members.len); + for (i = 0; i < mconf->new_members.len; i++) + { + pq_sendint64(buf, mconf->new_members.m[i].node_id); + pq_send_ascii_string(buf, mconf->new_members.m[i].host); + pq_sendint16(buf, mconf->new_members.m[i].port); + } +} + +/* Serialize proposer -> acceptor message into buf using specified version */ +static void +PAMessageSerialize(WalProposer *wp, ProposerAcceptorMessage *msg, StringInfo buf, int proto_version) +{ + /* both version are supported currently until we fully migrate to 3 */ + Assert(proto_version == 3 || proto_version == 2); + + resetStringInfo(buf); + + if (proto_version == 3) + { + /* + * v2 sends structs for some messages as is, so commonly send tag only + * for v3 + */ + pq_sendint8(buf, msg->tag); + + switch (msg->tag) + { + case 'g': + { + ProposerGreeting *m = (ProposerGreeting *) msg; + + pq_send_ascii_string(buf, m->tenant_id); + pq_send_ascii_string(buf, m->timeline_id); + MembershipConfigurationSerialize(&m->mconf, buf); + pq_sendint32(buf, m->pg_version); + pq_sendint64(buf, m->system_id); + pq_sendint32(buf, m->wal_seg_size); + break; + } + case 'v': + { + VoteRequest *m = (VoteRequest *) msg; + + pq_sendint32(buf, m->generation); + pq_sendint64(buf, m->term); + break; + + } + case 'e': + { + ProposerElected *m = (ProposerElected *) msg; + + pq_sendint32(buf, m->generation); + pq_sendint64(buf, m->term); + pq_sendint64(buf, m->startStreamingAt); + pq_sendint32(buf, m->termHistory->n_entries); + for (uint32 i = 0; i < m->termHistory->n_entries; i++) + { + pq_sendint64(buf, m->termHistory->entries[i].term); + pq_sendint64(buf, m->termHistory->entries[i].lsn); + } + break; + } + case 'a': + { + /* + * Note: this serializes only AppendRequestHeader, caller + * is expected to append WAL data later. + */ + AppendRequestHeader *m = (AppendRequestHeader *) msg; + + pq_sendint32(buf, m->generation); + pq_sendint64(buf, m->term); + pq_sendint64(buf, m->beginLsn); + pq_sendint64(buf, m->endLsn); + pq_sendint64(buf, m->commitLsn); + pq_sendint64(buf, m->truncateLsn); + break; + } + default: + wp_log(FATAL, "unexpected message type %c to serialize", msg->tag); + } + return; + } + + if (proto_version == 2) + { + switch (msg->tag) + { + case 'g': + { + /* v2 sent struct as is */ + ProposerGreeting *m = (ProposerGreeting *) msg; + ProposerGreetingV2 greetRequestV2; + + /* Fill also v2 struct. */ + greetRequestV2.tag = 'g'; + greetRequestV2.protocolVersion = proto_version; + greetRequestV2.pgVersion = m->pg_version; + + /* + * v3 removed this field because it's easier to pass as + * libq or START_WAL_PUSH options + */ + memset(&greetRequestV2.proposerId, 0, sizeof(greetRequestV2.proposerId)); + greetRequestV2.systemId = wp->config->systemId; + if (*m->timeline_id != '\0' && + !HexDecodeString(greetRequestV2.timeline_id, m->timeline_id, 16)) + wp_log(FATAL, "could not parse neon.timeline_id, %s", m->timeline_id); + if (*m->tenant_id != '\0' && + !HexDecodeString(greetRequestV2.tenant_id, m->tenant_id, 16)) + wp_log(FATAL, "could not parse neon.tenant_id, %s", m->tenant_id); + + greetRequestV2.timeline = wp->config->pgTimeline; + greetRequestV2.walSegSize = wp->config->wal_segment_size; + + pq_sendbytes(buf, (char *) &greetRequestV2, sizeof(greetRequestV2)); + break; + } + case 'v': + { + /* v2 sent struct as is */ + VoteRequest *m = (VoteRequest *) msg; + VoteRequestV2 voteRequestV2; + + voteRequestV2.tag = m->pam.tag; + voteRequestV2.term = m->term; + /* removed field */ + memset(&voteRequestV2.proposerId, 0, sizeof(voteRequestV2.proposerId)); + pq_sendbytes(buf, (char *) &voteRequestV2, sizeof(voteRequestV2)); + break; + } + case 'e': + { + ProposerElected *m = (ProposerElected *) msg; + + pq_sendint64_le(buf, m->apm.tag); + pq_sendint64_le(buf, m->term); + pq_sendint64_le(buf, m->startStreamingAt); + pq_sendint32_le(buf, m->termHistory->n_entries); + for (int i = 0; i < m->termHistory->n_entries; i++) + { + pq_sendint64_le(buf, m->termHistory->entries[i].term); + pq_sendint64_le(buf, m->termHistory->entries[i].lsn); + } + /* + * Removed timeline_start_lsn. Still send it as a valid + * value until safekeepers taking it from term history are + * deployed. + */ + pq_sendint64_le(buf, m->termHistory->entries[0].lsn); + break; + } + case 'a': + + /* + * Note: this serializes only AppendRequestHeader, caller is + * expected to append WAL data later. + */ + { + /* v2 sent struct as is */ + AppendRequestHeader *m = (AppendRequestHeader *) msg; + AppendRequestHeaderV2 appendRequestHeaderV2; + + appendRequestHeaderV2.tag = m->apm.tag; + appendRequestHeaderV2.term = m->term; + appendRequestHeaderV2.epochStartLsn = 0; /* removed field */ + appendRequestHeaderV2.beginLsn = m->beginLsn; + appendRequestHeaderV2.endLsn = m->endLsn; + appendRequestHeaderV2.commitLsn = m->commitLsn; + appendRequestHeaderV2.truncateLsn = m->truncateLsn; + /* removed field */ + memset(&appendRequestHeaderV2.proposerId, 0, sizeof(appendRequestHeaderV2.proposerId)); + + pq_sendbytes(buf, (char *) &appendRequestHeaderV2, sizeof(appendRequestHeaderV2)); + break; + } + + default: + wp_log(FATAL, "unexpected message type %c to serialize", msg->tag); + } + return; + } + wp_log(FATAL, "unexpected proto_version %d", proto_version); +} + /* * Try to read CopyData message from i'th safekeeper, resetting connection on * failure. @@ -1779,6 +1966,37 @@ AsyncRead(Safekeeper *sk, char **buf, int *buf_size) return false; } +/* Deserialize membership configuration from buf to mconf. */ +static void +MembershipConfigurationDeserialize(MembershipConfiguration *mconf, StringInfo buf) +{ + uint32 i; + + mconf->generation = pq_getmsgint32(buf); + mconf->members.len = pq_getmsgint32(buf); + mconf->members.m = palloc0(sizeof(SafekeeperId) * mconf->members.len); + for (i = 0; i < mconf->members.len; i++) + { + const char *buf_host; + + mconf->members.m[i].node_id = pq_getmsgint64(buf); + buf_host = pq_getmsgrawstring(buf); + strlcpy(mconf->members.m[i].host, buf_host, sizeof(mconf->members.m[i].host)); + mconf->members.m[i].port = pq_getmsgint16(buf); + } + mconf->new_members.len = pq_getmsgint32(buf); + mconf->new_members.m = palloc0(sizeof(SafekeeperId) * mconf->new_members.len); + for (i = 0; i < mconf->new_members.len; i++) + { + const char *buf_host; + + mconf->new_members.m[i].node_id = pq_getmsgint64(buf); + buf_host = pq_getmsgrawstring(buf); + strlcpy(mconf->new_members.m[i].host, buf_host, sizeof(mconf->new_members.m[i].host)); + mconf->new_members.m[i].port = pq_getmsgint16(buf); + } +} + /* * Read next message with known type into provided struct, by reading a CopyData * block from the safekeeper's postgres connection, returning whether the read @@ -1787,6 +2005,8 @@ AsyncRead(Safekeeper *sk, char **buf, int *buf_size) * If the read needs more polling, we return 'false' and keep the state * unmodified, waiting until it becomes read-ready to try again. If it fully * failed, a warning is emitted and the connection is reset. + * + * Note: it pallocs if needed, i.e. for AcceptorGreeting and VoteResponse fields. */ static bool AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg) @@ -1795,82 +2015,154 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg) char *buf; int buf_size; - uint64 tag; + uint8 tag; StringInfoData s; if (!(AsyncRead(sk, &buf, &buf_size))) return false; + sk->latestMsgReceivedAt = wp->api.get_current_timestamp(wp); /* parse it */ s.data = buf; s.len = buf_size; + s.maxlen = buf_size; s.cursor = 0; - tag = pq_getmsgint64_le(&s); - if (tag != anymsg->tag) + if (wp->config->proto_version == 3) { - wp_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host, - sk->port, FormatSafekeeperState(sk)); - ResetConnection(sk); - return false; - } - sk->latestMsgReceivedAt = wp->api.get_current_timestamp(wp); - switch (tag) - { - case 'g': - { - AcceptorGreeting *msg = (AcceptorGreeting *) anymsg; - - msg->term = pq_getmsgint64_le(&s); - msg->nodeId = pq_getmsgint64_le(&s); - pq_getmsgend(&s); - return true; - } - - case 'v': - { - VoteResponse *msg = (VoteResponse *) anymsg; - - msg->term = pq_getmsgint64_le(&s); - msg->voteGiven = pq_getmsgint64_le(&s); - msg->flushLsn = pq_getmsgint64_le(&s); - msg->truncateLsn = pq_getmsgint64_le(&s); - msg->termHistory.n_entries = pq_getmsgint32_le(&s); - msg->termHistory.entries = palloc(sizeof(TermSwitchEntry) * msg->termHistory.n_entries); - for (int i = 0; i < msg->termHistory.n_entries; i++) + tag = pq_getmsgbyte(&s); + if (tag != anymsg->tag) + { + wp_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host, + sk->port, FormatSafekeeperState(sk)); + ResetConnection(sk); + return false; + } + switch (tag) + { + case 'g': { - msg->termHistory.entries[i].term = pq_getmsgint64_le(&s); - msg->termHistory.entries[i].lsn = pq_getmsgint64_le(&s); + AcceptorGreeting *msg = (AcceptorGreeting *) anymsg; + + msg->nodeId = pq_getmsgint64(&s); + MembershipConfigurationDeserialize(&msg->mconf, &s); + msg->term = pq_getmsgint64(&s); + pq_getmsgend(&s); + return true; } - msg->timelineStartLsn = pq_getmsgint64_le(&s); - pq_getmsgend(&s); - return true; - } + case 'v': + { + VoteResponse *msg = (VoteResponse *) anymsg; - case 'a': - { - AppendResponse *msg = (AppendResponse *) anymsg; + msg->generation = pq_getmsgint32(&s); + msg->term = pq_getmsgint64(&s); + msg->voteGiven = pq_getmsgbyte(&s); + msg->flushLsn = pq_getmsgint64(&s); + msg->truncateLsn = pq_getmsgint64(&s); + msg->termHistory.n_entries = pq_getmsgint32(&s); + msg->termHistory.entries = palloc(sizeof(TermSwitchEntry) * msg->termHistory.n_entries); + for (uint32 i = 0; i < msg->termHistory.n_entries; i++) + { + msg->termHistory.entries[i].term = pq_getmsgint64(&s); + msg->termHistory.entries[i].lsn = pq_getmsgint64(&s); + } + pq_getmsgend(&s); + return true; + } + case 'a': + { + AppendResponse *msg = (AppendResponse *) anymsg; - msg->term = pq_getmsgint64_le(&s); - msg->flushLsn = pq_getmsgint64_le(&s); - msg->commitLsn = pq_getmsgint64_le(&s); - msg->hs.ts = pq_getmsgint64_le(&s); - msg->hs.xmin.value = pq_getmsgint64_le(&s); - msg->hs.catalog_xmin.value = pq_getmsgint64_le(&s); - if (s.len > s.cursor) - ParsePageserverFeedbackMessage(wp, &s, &msg->ps_feedback); - else - msg->ps_feedback.present = false; - pq_getmsgend(&s); - return true; - } - - default: - { - Assert(false); - return false; - } + msg->generation = pq_getmsgint32(&s); + msg->term = pq_getmsgint64(&s); + msg->flushLsn = pq_getmsgint64(&s); + msg->commitLsn = pq_getmsgint64(&s); + msg->hs.ts = pq_getmsgint64(&s); + msg->hs.xmin.value = pq_getmsgint64(&s); + msg->hs.catalog_xmin.value = pq_getmsgint64(&s); + if (s.len > s.cursor) + ParsePageserverFeedbackMessage(wp, &s, &msg->ps_feedback); + else + msg->ps_feedback.present = false; + pq_getmsgend(&s); + return true; + } + default: + { + wp_log(FATAL, "unexpected message tag %c to read", (char) tag); + return false; + } + } } + else if (wp->config->proto_version == 2) + { + tag = pq_getmsgint64_le(&s); + if (tag != anymsg->tag) + { + wp_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host, + sk->port, FormatSafekeeperState(sk)); + ResetConnection(sk); + return false; + } + switch (tag) + { + case 'g': + { + AcceptorGreeting *msg = (AcceptorGreeting *) anymsg; + + msg->term = pq_getmsgint64_le(&s); + msg->nodeId = pq_getmsgint64_le(&s); + pq_getmsgend(&s); + return true; + } + + case 'v': + { + VoteResponse *msg = (VoteResponse *) anymsg; + + msg->term = pq_getmsgint64_le(&s); + msg->voteGiven = pq_getmsgint64_le(&s); + msg->flushLsn = pq_getmsgint64_le(&s); + msg->truncateLsn = pq_getmsgint64_le(&s); + msg->termHistory.n_entries = pq_getmsgint32_le(&s); + msg->termHistory.entries = palloc(sizeof(TermSwitchEntry) * msg->termHistory.n_entries); + for (int i = 0; i < msg->termHistory.n_entries; i++) + { + msg->termHistory.entries[i].term = pq_getmsgint64_le(&s); + msg->termHistory.entries[i].lsn = pq_getmsgint64_le(&s); + } + pq_getmsgint64_le(&s); /* timelineStartLsn */ + pq_getmsgend(&s); + return true; + } + + case 'a': + { + AppendResponse *msg = (AppendResponse *) anymsg; + + msg->term = pq_getmsgint64_le(&s); + msg->flushLsn = pq_getmsgint64_le(&s); + msg->commitLsn = pq_getmsgint64_le(&s); + msg->hs.ts = pq_getmsgint64_le(&s); + msg->hs.xmin.value = pq_getmsgint64_le(&s); + msg->hs.catalog_xmin.value = pq_getmsgint64_le(&s); + if (s.len > s.cursor) + ParsePageserverFeedbackMessage(wp, &s, &msg->ps_feedback); + else + msg->ps_feedback.present = false; + pq_getmsgend(&s); + return true; + } + + default: + { + wp_log(FATAL, "unexpected message tag %c to read", (char) tag); + return false; + } + } + } + wp_log(FATAL, "unsupported proto_version %d", wp->config->proto_version); + return false; /* keep the compiler quiet */ } /* @@ -2246,3 +2538,45 @@ FormatEvents(WalProposer *wp, uint32 events) return (char *) &return_str; } + +/* Dump mconf as toml for observability / debugging. Result is palloc'ed. */ +static char * +MembershipConfigurationToString(MembershipConfiguration *mconf) +{ + StringInfoData s; + uint32 i; + + initStringInfo(&s); + appendStringInfo(&s, "{gen = %u", mconf->generation); + appendStringInfoString(&s, ", members = ["); + for (i = 0; i < mconf->members.len; i++) + { + if (i > 0) + appendStringInfoString(&s, ", "); + appendStringInfo(&s, "{node_id = %lu", mconf->members.m[i].node_id); + appendStringInfo(&s, ", host = %s", mconf->members.m[i].host); + appendStringInfo(&s, ", port = %u }", mconf->members.m[i].port); + } + appendStringInfo(&s, "], new_members = ["); + for (i = 0; i < mconf->new_members.len; i++) + { + if (i > 0) + appendStringInfoString(&s, ", "); + appendStringInfo(&s, "{node_id = %lu", mconf->new_members.m[i].node_id); + appendStringInfo(&s, ", host = %s", mconf->new_members.m[i].host); + appendStringInfo(&s, ", port = %u }", mconf->new_members.m[i].port); + } + appendStringInfoString(&s, "]}"); + return s.data; +} + +static void +MembershipConfigurationFree(MembershipConfiguration *mconf) +{ + if (mconf->members.m) + pfree(mconf->members.m); + mconf->members.m = NULL; + if (mconf->new_members.m) + pfree(mconf->new_members.m); + mconf->new_members.m = NULL; +} diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index d8c44f8182..eee55f924f 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -12,9 +12,6 @@ #include "neon_walreader.h" #include "pagestore_client.h" -#define SK_MAGIC 0xCafeCeefu -#define SK_PROTOCOL_VERSION 2 - #define MAX_SAFEKEEPERS 32 #define MAX_SEND_SIZE (XLOG_BLCKSZ * 16) /* max size of a single* WAL * message */ @@ -143,12 +140,71 @@ typedef uint64 term_t; /* neon storage node id */ typedef uint64 NNodeId; +/* + * Number uniquely identifying safekeeper membership configuration. + * This and following structs pair ones in membership.rs. + */ +typedef uint32 Generation; + +typedef struct SafekeeperId +{ + NNodeId node_id; + char host[MAXCONNINFO]; + uint16 port; +} SafekeeperId; + +/* Set of safekeepers. */ +typedef struct MemberSet +{ + uint32 len; /* number of members */ + SafekeeperId *m; /* ids themselves */ +} MemberSet; + +/* Timeline safekeeper membership configuration. */ +typedef struct MembershipConfiguration +{ + Generation generation; + MemberSet members; + /* Has 0 n_members in non joint conf. */ + MemberSet new_members; +} MembershipConfiguration; + /* * Proposer <-> Acceptor messaging. */ +typedef struct ProposerAcceptorMessage +{ + uint8 tag; +} ProposerAcceptorMessage; + /* Initial Proposer -> Acceptor message */ typedef struct ProposerGreeting +{ + ProposerAcceptorMessage pam; /* message tag */ + + /* + * tenant/timeline ids as C strings with standard hex notation for ease of + * printing. In principle they are not strictly needed as ttid is also + * passed as libpq options. + */ + char *tenant_id; + char *timeline_id; + /* Full conf is carried to allow safekeeper switch */ + MembershipConfiguration mconf; + + /* + * pg_version and wal_seg_size are used for timeline creation until we + * fully migrate to doing externally. systemId is only used as a sanity + * cross check. + */ + uint32 pg_version; /* in PG_VERSION_NUM format */ + uint64 system_id; /* Postgres system identifier. */ + uint32 wal_seg_size; +} ProposerGreeting; + +/* protocol v2 variant, kept while wp supports it */ +typedef struct ProposerGreetingV2 { uint64 tag; /* message tag */ uint32 protocolVersion; /* proposer-safekeeper protocol version */ @@ -159,32 +215,42 @@ typedef struct ProposerGreeting uint8 tenant_id[16]; TimeLineID timeline; uint32 walSegSize; -} ProposerGreeting; +} ProposerGreetingV2; typedef struct AcceptorProposerMessage { - uint64 tag; + uint8 tag; } AcceptorProposerMessage; /* - * Acceptor -> Proposer initial response: the highest term acceptor voted for. + * Acceptor -> Proposer initial response: the highest term acceptor voted for, + * its node id and configuration. */ typedef struct AcceptorGreeting { AcceptorProposerMessage apm; - term_t term; NNodeId nodeId; + MembershipConfiguration mconf; + term_t term; } AcceptorGreeting; /* * Proposer -> Acceptor vote request. */ typedef struct VoteRequest +{ + ProposerAcceptorMessage pam; /* message tag */ + Generation generation; /* membership conf generation */ + term_t term; +} VoteRequest; + +/* protocol v2 variant, kept while wp supports it */ +typedef struct VoteRequestV2 { uint64 tag; term_t term; pg_uuid_t proposerId; /* for monitoring/debugging */ -} VoteRequest; +} VoteRequestV2; /* Element of term switching chain. */ typedef struct TermSwitchEntry @@ -203,8 +269,15 @@ typedef struct TermHistory typedef struct VoteResponse { AcceptorProposerMessage apm; + + /* + * Membership conf generation. It's not strictly required because on + * mismatch safekeeper is expected to ERROR the connection, but let's + * sanity check it. + */ + Generation generation; term_t term; - uint64 voteGiven; + uint8 voteGiven; /* * Safekeeper flush_lsn (end of WAL) + history of term switches allow @@ -214,7 +287,6 @@ typedef struct VoteResponse XLogRecPtr truncateLsn; /* minimal LSN which may be needed for* * recovery of some safekeeper */ TermHistory termHistory; - XLogRecPtr timelineStartLsn; /* timeline globally starts at this LSN */ } VoteResponse; /* @@ -223,20 +295,37 @@ typedef struct VoteResponse */ typedef struct ProposerElected { - uint64 tag; + AcceptorProposerMessage apm; + Generation generation; /* membership conf generation */ term_t term; /* proposer will send since this point */ XLogRecPtr startStreamingAt; /* history of term switches up to this proposer */ TermHistory *termHistory; - /* timeline globally starts at this LSN */ - XLogRecPtr timelineStartLsn; } ProposerElected; /* * Header of request with WAL message sent from proposer to safekeeper. */ typedef struct AppendRequestHeader +{ + AcceptorProposerMessage apm; + Generation generation; /* membership conf generation */ + term_t term; /* term of the proposer */ + XLogRecPtr beginLsn; /* start position of message in WAL */ + XLogRecPtr endLsn; /* end position of message in WAL */ + XLogRecPtr commitLsn; /* LSN committed by quorum of safekeepers */ + + /* + * minimal LSN which may be needed for recovery of some safekeeper (end + * lsn + 1 of last chunk streamed to everyone) + */ + XLogRecPtr truncateLsn; + /* in the AppendRequest message, WAL data follows */ +} AppendRequestHeader; + +/* protocol v2 variant, kept while wp supports it */ +typedef struct AppendRequestHeaderV2 { uint64 tag; term_t term; /* term of the proposer */ @@ -256,7 +345,8 @@ typedef struct AppendRequestHeader */ XLogRecPtr truncateLsn; pg_uuid_t proposerId; /* for monitoring/debugging */ -} AppendRequestHeader; + /* in the AppendRequest message, WAL data follows */ +} AppendRequestHeaderV2; /* * Hot standby feedback received from replica @@ -309,6 +399,13 @@ typedef struct AppendResponse { AcceptorProposerMessage apm; + /* + * Membership conf generation. It's not strictly required because on + * mismatch safekeeper is expected to ERROR the connection, but let's + * sanity check it. + */ + Generation generation; + /* * Current term of the safekeeper; if it is higher than proposer's, the * compute is out of date. @@ -644,6 +741,8 @@ typedef struct WalProposerConfig /* Will be passed to safekeepers in greet request. */ TimeLineID pgTimeline; + int proto_version; + #ifdef WALPROPOSER_LIB void *callback_data; #endif @@ -656,11 +755,14 @@ typedef struct WalProposerConfig typedef struct WalProposer { WalProposerConfig *config; - int n_safekeepers; + /* Current walproposer membership configuration */ + MembershipConfiguration mconf; /* (n_safekeepers / 2) + 1 */ int quorum; + /* Number of occupied slots in safekeepers[] */ + int n_safekeepers; Safekeeper safekeeper[MAX_SAFEKEEPERS]; /* WAL has been generated up to this point */ @@ -670,6 +772,7 @@ typedef struct WalProposer XLogRecPtr commitLsn; ProposerGreeting greetRequest; + ProposerGreetingV2 greetRequestV2; /* Vote request for safekeeper */ VoteRequest voteRequest; diff --git a/pgxn/neon/walproposer_compat.c b/pgxn/neon/walproposer_compat.c index 35d984c52e..a986160224 100644 --- a/pgxn/neon/walproposer_compat.c +++ b/pgxn/neon/walproposer_compat.c @@ -117,14 +117,13 @@ pq_getmsgbytes(StringInfo msg, int datalen) } /* -------------------------------- - * pq_getmsgstring - get a null-terminated text string (with conversion) + * pq_getmsgrawstring - get a null-terminated text string - NO conversion * - * May return a pointer directly into the message buffer, or a pointer - * to a palloc'd conversion result. + * Returns a pointer directly into the message buffer. * -------------------------------- */ const char * -pq_getmsgstring(StringInfo msg) +pq_getmsgrawstring(StringInfo msg) { char *str; int slen; @@ -155,6 +154,45 @@ pq_getmsgend(StringInfo msg) ExceptionalCondition("invalid msg format", __FILE__, __LINE__); } +/* -------------------------------- + * pq_sendbytes - append raw data to a StringInfo buffer + * -------------------------------- + */ +void +pq_sendbytes(StringInfo buf, const void *data, int datalen) +{ + /* use variant that maintains a trailing null-byte, out of caution */ + appendBinaryStringInfo(buf, data, datalen); +} + +/* -------------------------------- + * pq_send_ascii_string - append a null-terminated text string (without conversion) + * + * This function intentionally bypasses encoding conversion, instead just + * silently replacing any non-7-bit-ASCII characters with question marks. + * It is used only when we are having trouble sending an error message to + * the client with normal localization and encoding conversion. The caller + * should already have taken measures to ensure the string is just ASCII; + * the extra work here is just to make certain we don't send a badly encoded + * string to the client (which might or might not be robust about that). + * + * NB: passed text string must be null-terminated, and so is the data + * sent to the frontend. + * -------------------------------- + */ +void +pq_send_ascii_string(StringInfo buf, const char *str) +{ + while (*str) + { + char ch = *str++; + + if (IS_HIGHBIT_SET(ch)) + ch = '?'; + appendStringInfoCharMacro(buf, ch); + } + appendStringInfoChar(buf, '\0'); +} /* * Produce a C-string representation of a TimestampTz. diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index 86444084ff..b21184de57 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -59,9 +59,11 @@ #define WAL_PROPOSER_SLOT_NAME "wal_proposer_slot" +/* GUCs */ char *wal_acceptors_list = ""; int wal_acceptor_reconnect_timeout = 1000; int wal_acceptor_connection_timeout = 10000; +int safekeeper_proto_version = 2; /* Set to true in the walproposer bgw. */ static bool am_walproposer; @@ -126,6 +128,7 @@ init_walprop_config(bool syncSafekeepers) else walprop_config.systemId = 0; walprop_config.pgTimeline = walprop_pg_get_timeline_id(); + walprop_config.proto_version = safekeeper_proto_version; } /* @@ -219,25 +222,37 @@ nwp_register_gucs(void) PGC_SIGHUP, GUC_UNIT_MS, NULL, NULL, NULL); + + DefineCustomIntVariable( + "neon.safekeeper_proto_version", + "Version of compute <-> safekeeper protocol.", + "Used while migrating from 2 to 3.", + &safekeeper_proto_version, + 2, 0, INT_MAX, + PGC_POSTMASTER, + 0, + NULL, NULL, NULL); } static int split_safekeepers_list(char *safekeepers_list, char *safekeepers[]) { - int n_safekeepers = 0; - char *curr_sk = safekeepers_list; + int n_safekeepers = 0; + char *curr_sk = safekeepers_list; for (char *coma = safekeepers_list; coma != NULL && *coma != '\0'; curr_sk = coma) { - if (++n_safekeepers >= MAX_SAFEKEEPERS) { + if (++n_safekeepers >= MAX_SAFEKEEPERS) + { wpg_log(FATAL, "too many safekeepers"); } coma = strchr(coma, ','); - safekeepers[n_safekeepers-1] = curr_sk; + safekeepers[n_safekeepers - 1] = curr_sk; - if (coma != NULL) { + if (coma != NULL) + { *coma++ = '\0'; } } @@ -252,10 +267,10 @@ split_safekeepers_list(char *safekeepers_list, char *safekeepers[]) static bool safekeepers_cmp(char *old, char *new) { - char *safekeepers_old[MAX_SAFEKEEPERS]; - char *safekeepers_new[MAX_SAFEKEEPERS]; - int len_old = 0; - int len_new = 0; + char *safekeepers_old[MAX_SAFEKEEPERS]; + char *safekeepers_new[MAX_SAFEKEEPERS]; + int len_old = 0; + int len_new = 0; len_old = split_safekeepers_list(old, safekeepers_old); len_new = split_safekeepers_list(new, safekeepers_new); @@ -292,7 +307,8 @@ assign_neon_safekeepers(const char *newval, void *extra) if (!am_walproposer) return; - if (!newval) { + if (!newval) + { /* should never happen */ wpg_log(FATAL, "neon.safekeepers is empty"); } @@ -301,11 +317,11 @@ assign_neon_safekeepers(const char *newval, void *extra) newval_copy = pstrdup(newval); oldval = pstrdup(wal_acceptors_list); - /* + /* * TODO: restarting through FATAL is stupid and introduces 1s delay before - * next bgw start. We should refactor walproposer to allow graceful exit and - * thus remove this delay. - * XXX: If you change anything here, sync with test_safekeepers_reconfigure_reorder. + * next bgw start. We should refactor walproposer to allow graceful exit + * and thus remove this delay. XXX: If you change anything here, sync with + * test_safekeepers_reconfigure_reorder. */ if (!safekeepers_cmp(oldval, newval_copy)) { @@ -454,7 +470,8 @@ backpressure_throttling_impl(void) memcpy(new_status, old_status, len); snprintf(new_status + len, 64, "backpressure throttling: lag %lu", lag); set_ps_display(new_status); - new_status[len] = '\0'; /* truncate off " backpressure ..." to later reset the ps */ + new_status[len] = '\0'; /* truncate off " backpressure ..." to later + * reset the ps */ elog(DEBUG2, "backpressure throttling: lag %lu", lag); start = GetCurrentTimestamp(); @@ -621,7 +638,7 @@ walprop_pg_start_streaming(WalProposer *wp, XLogRecPtr startpos) wpg_log(LOG, "WAL proposer starts streaming at %X/%X", LSN_FORMAT_ARGS(startpos)); cmd.slotname = WAL_PROPOSER_SLOT_NAME; - cmd.timeline = wp->greetRequest.timeline; + cmd.timeline = wp->config->pgTimeline; cmd.startpoint = startpos; StartProposerReplication(wp, &cmd); } @@ -1963,10 +1980,11 @@ walprop_pg_process_safekeeper_feedback(WalProposer *wp, Safekeeper *sk) FullTransactionId xmin = hsFeedback.xmin; FullTransactionId catalog_xmin = hsFeedback.catalog_xmin; FullTransactionId next_xid = ReadNextFullTransactionId(); + /* - * Page server is updating nextXid in checkpoint each 1024 transactions, - * so feedback xmin can be actually larger then nextXid and - * function TransactionIdInRecentPast return false in this case, + * Page server is updating nextXid in checkpoint each 1024 + * transactions, so feedback xmin can be actually larger then nextXid + * and function TransactionIdInRecentPast return false in this case, * preventing update of slot's xmin. */ if (FullTransactionIdPrecedes(next_xid, xmin)) diff --git a/pgxn/neon_walredo/inmem_smgr.c b/pgxn/neon_walredo/inmem_smgr.c index 74cd5ac601..ff2846a9e7 100644 --- a/pgxn/neon_walredo/inmem_smgr.c +++ b/pgxn/neon_walredo/inmem_smgr.c @@ -32,8 +32,8 @@ #include "inmem_smgr.h" -/* Size of the in-memory smgr */ -#define MAX_PAGES 64 +/* Size of the in-memory smgr: XLR_MAX_BLOCK_ID is 32, but we can update up to 3 forks for each block */ +#define MAX_PAGES 100 /* If more than WARN_PAGES are used, print a warning in the log */ #define WARN_PAGES 32 @@ -285,12 +285,12 @@ inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, * WARN_PAGES, print a warning so that we get alerted and get to * investigate why we're accessing so many buffers. */ - elog(used_pages >= WARN_PAGES ? WARNING : DEBUG1, - "inmem_write() called for %u/%u/%u.%u blk %u: used_pages %u", - RelFileInfoFmt(InfoFromSMgrRel(reln)), - forknum, - blocknum, - used_pages); + if (used_pages >= WARN_PAGES) + ereport(WARNING, (errmsg("inmem_write() called for %u/%u/%u.%u blk %u: used_pages %u", + RelFileInfoFmt(InfoFromSMgrRel(reln)), + forknum, + blocknum, + used_pages), errbacktrace())); if (used_pages == MAX_PAGES) elog(ERROR, "Inmem storage overflow"); diff --git a/pgxn/neon_walredo/walredoproc.c b/pgxn/neon_walredo/walredoproc.c index 619b7255ae..4673de778c 100644 --- a/pgxn/neon_walredo/walredoproc.c +++ b/pgxn/neon_walredo/walredoproc.c @@ -142,7 +142,7 @@ static BufferTag target_redo_tag; static XLogReaderState *reader_state; -#define TRACE LOG +#define TRACE DEBUG1 #ifdef HAVE_LIBSECCOMP @@ -194,6 +194,7 @@ static PgSeccompRule allowed_syscalls[] = * is stored in MyProcPid anyway. */ PG_SCMP_ALLOW(getpid), + PG_SCMP_ALLOW(futex), /* needed for errbacktrace */ /* Enable those for a proper shutdown. */ #if 0 @@ -253,7 +254,7 @@ WalRedoMain(int argc, char *argv[]) * which is super strange but that's not something we can solve * for here. ¯\_(-_-)_/¯ */ - SetConfigOption("log_min_messages", "FATAL", PGC_SUSET, PGC_S_OVERRIDE); + SetConfigOption("log_min_messages", "WARNING", PGC_SUSET, PGC_S_OVERRIDE); SetConfigOption("client_min_messages", "ERROR", PGC_SUSET, PGC_S_OVERRIDE); diff --git a/pre-commit.py b/pre-commit.py index c9567e0c50..09139459d5 100755 --- a/pre-commit.py +++ b/pre-commit.py @@ -29,12 +29,12 @@ def colorify( return f"{color.value}{s}{NC}" -def rustfmt(fix_inplace: bool = False, no_color: bool = False) -> str: - cmd = "rustfmt --edition=2021" +def cargo_fmt(fix_inplace: bool = False, no_color: bool = False) -> str: + cmd = "cargo fmt" if not fix_inplace: cmd += " --check" if no_color: - cmd += " --color=never" + cmd += " -- --color=never" return cmd @@ -61,14 +61,23 @@ def get_commit_files() -> list[str]: return files.decode().splitlines() -def check(name: str, suffix: str, cmd: str, changed_files: list[str], no_color: bool = False): +def check( + name: str, + suffix: str, + cmd: str, + changed_files: list[str], + no_color: bool = False, + append_files_to_cmd: bool = True, +): print(f"Checking: {name} ", end="") applicable_files = list(filter(lambda fname: fname.strip().endswith(suffix), changed_files)) if not applicable_files: print(colorify("[NOT APPLICABLE]", Color.CYAN, no_color)) return - cmd = f'{cmd} {" ".join(applicable_files)}' + if append_files_to_cmd: + cmd = f"{cmd} {' '.join(applicable_files)}" + res = subprocess.run(cmd.split(), capture_output=True) if res.returncode != 0: print(colorify("[FAILED]", Color.RED, no_color)) @@ -100,15 +109,13 @@ if __name__ == "__main__": args = parser.parse_args() files = get_commit_files() - # we use rustfmt here because cargo fmt does not accept list of files - # it internally gathers project files and feeds them to rustfmt - # so because we want to check only files included in the commit we use rustfmt directly check( - name="rustfmt", + name="cargo fmt", suffix=".rs", - cmd=rustfmt(fix_inplace=args.fix_inplace, no_color=args.no_color), + cmd=cargo_fmt(fix_inplace=args.fix_inplace, no_color=args.no_color), changed_files=files, no_color=args.no_color, + append_files_to_cmd=False, ) check( name="ruff check", diff --git a/proxy/src/context/mod.rs b/proxy/src/context/mod.rs index 74b48a1bea..f87f4e9ef8 100644 --- a/proxy/src/context/mod.rs +++ b/proxy/src/context/mod.rs @@ -55,6 +55,7 @@ struct RequestContextInner { dbname: Option, user: Option, application: Option, + user_agent: Option, error_kind: Option, pub(crate) auth_method: Option, jwt_issuer: Option, @@ -100,6 +101,7 @@ impl Clone for RequestContext { dbname: inner.dbname.clone(), user: inner.user.clone(), application: inner.application.clone(), + user_agent: inner.user_agent.clone(), error_kind: inner.error_kind, auth_method: inner.auth_method.clone(), jwt_issuer: inner.jwt_issuer.clone(), @@ -149,6 +151,7 @@ impl RequestContext { dbname: None, user: None, application: None, + user_agent: None, error_kind: None, auth_method: None, jwt_issuer: None, @@ -245,6 +248,13 @@ impl RequestContext { .set_user(user); } + pub(crate) fn set_user_agent(&self, user_agent: Option) { + self.0 + .try_lock() + .expect("should not deadlock") + .set_user_agent(user_agent); + } + pub(crate) fn set_auth_method(&self, auth_method: AuthMethod) { let mut this = self.0.try_lock().expect("should not deadlock"); this.auth_method = Some(auth_method); @@ -384,6 +394,10 @@ impl RequestContextInner { } } + fn set_user_agent(&mut self, user_agent: Option) { + self.user_agent = user_agent; + } + fn set_dbname(&mut self, dbname: DbName) { self.dbname = Some(dbname); } diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index f029327266..bfab5f34f9 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -82,6 +82,7 @@ pub(crate) struct RequestData { peer_addr: String, username: Option, application_name: Option, + user_agent: Option, endpoint_id: Option, database: Option, project: Option, @@ -128,6 +129,7 @@ impl From<&RequestContextInner> for RequestData { timestamp: value.first_packet.naive_utc(), username: value.user.as_deref().map(String::from), application_name: value.application.as_deref().map(String::from), + user_agent: value.user_agent.as_deref().map(String::from), endpoint_id: value.endpoint_id.as_deref().map(String::from), database: value.dbname.as_deref().map(String::from), project: value.project.as_deref().map(String::from), @@ -522,6 +524,7 @@ mod tests { .unwrap() .naive_utc(), application_name: Some("test".to_owned()), + user_agent: Some("test-user-agent".to_owned()), username: Some(hex::encode(rng.r#gen::<[u8; 4]>())), endpoint_id: Some(hex::encode(rng.r#gen::<[u8; 16]>())), database: Some(hex::encode(rng.r#gen::<[u8; 16]>())), @@ -610,15 +613,15 @@ mod tests { assert_eq!( file_stats, [ - (1313105, 3, 6000), - (1313094, 3, 6000), - (1313153, 3, 6000), - (1313110, 3, 6000), - (1313246, 3, 6000), - (1313083, 3, 6000), - (1312877, 3, 6000), - (1313112, 3, 6000), - (438020, 1, 2000) + (1313953, 3, 6000), + (1313942, 3, 6000), + (1314001, 3, 6000), + (1313958, 3, 6000), + (1314094, 3, 6000), + (1313931, 3, 6000), + (1313725, 3, 6000), + (1313960, 3, 6000), + (438318, 1, 2000) ] ); @@ -650,11 +653,11 @@ mod tests { assert_eq!( file_stats, [ - (1204324, 5, 10000), - (1204048, 5, 10000), - (1204349, 5, 10000), - (1204334, 5, 10000), - (1204588, 5, 10000) + (1205810, 5, 10000), + (1205534, 5, 10000), + (1205835, 5, 10000), + (1205820, 5, 10000), + (1206074, 5, 10000) ] ); @@ -679,15 +682,15 @@ mod tests { assert_eq!( file_stats, [ - (1313105, 3, 6000), - (1313094, 3, 6000), - (1313153, 3, 6000), - (1313110, 3, 6000), - (1313246, 3, 6000), - (1313083, 3, 6000), - (1312877, 3, 6000), - (1313112, 3, 6000), - (438020, 1, 2000) + (1313953, 3, 6000), + (1313942, 3, 6000), + (1314001, 3, 6000), + (1313958, 3, 6000), + (1314094, 3, 6000), + (1313931, 3, 6000), + (1313725, 3, 6000), + (1313960, 3, 6000), + (438318, 1, 2000) ] ); @@ -724,7 +727,7 @@ mod tests { // files are smaller than the size threshold, but they took too long to fill so were flushed early assert_eq!( file_stats, - [(658014, 2, 3001), (657728, 2, 3000), (657524, 2, 2999)] + [(658584, 2, 3001), (658298, 2, 3000), (658094, 2, 2999)] ); tmpdir.close().unwrap(); diff --git a/proxy/src/serverless/mod.rs b/proxy/src/serverless/mod.rs index dd0fb9c5b4..acd6a05718 100644 --- a/proxy/src/serverless/mod.rs +++ b/proxy/src/serverless/mod.rs @@ -438,6 +438,14 @@ async fn request_handler( &config.region, ); + ctx.set_user_agent( + request + .headers() + .get(hyper::header::USER_AGENT) + .and_then(|h| h.to_str().ok()) + .map(Into::into), + ); + let span = ctx.span(); info!(parent: &span, "performing websocket upgrade"); diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 8babfb5cd2..93dd531f70 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -228,6 +228,13 @@ fn get_conn_info( } } + ctx.set_user_agent( + headers + .get(hyper::header::USER_AGENT) + .and_then(|h| h.to_str().ok()) + .map(Into::into), + ); + let user_info = ComputeUserInfo { endpoint, user: username, diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index c86ac576ad..bb937ad56a 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "safekeeper" version = "0.1.0" -edition.workspace = true +edition = "2024" license.workspace = true [features] diff --git a/safekeeper/benches/receive_wal.rs b/safekeeper/benches/receive_wal.rs index 19c6662e74..122630d953 100644 --- a/safekeeper/benches/receive_wal.rs +++ b/safekeeper/benches/receive_wal.rs @@ -4,7 +4,7 @@ use std::io::Write as _; use bytes::BytesMut; use camino_tempfile::tempfile; -use criterion::{criterion_group, criterion_main, BatchSize, Bencher, Criterion}; +use criterion::{BatchSize, Bencher, Criterion, criterion_group, criterion_main}; use itertools::Itertools as _; use postgres_ffi::v17::wal_generator::{LogicalMessageGenerator, WalGenerator}; use pprof::criterion::{Output, PProfProfiler}; @@ -13,6 +13,7 @@ use safekeeper::safekeeper::{ AcceptorProposerMessage, AppendRequest, AppendRequestHeader, ProposerAcceptorMessage, }; use safekeeper::test_utils::Env; +use safekeeper_api::membership::SafekeeperGeneration as Generation; use tokio::io::AsyncWriteExt as _; use utils::id::{NodeId, TenantTimelineId}; use utils::lsn::Lsn; @@ -26,7 +27,7 @@ const GB: usize = 1024 * MB; static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; #[allow(non_upper_case_globals)] -#[export_name = "malloc_conf"] +#[unsafe(export_name = "malloc_conf")] pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0"; // Register benchmarks with Criterion. @@ -88,13 +89,12 @@ fn bench_process_msg(c: &mut Criterion) { let (lsn, record) = walgen.next().expect("endless WAL"); ProposerAcceptorMessage::AppendRequest(AppendRequest { h: AppendRequestHeader { + generation: Generation::new(0), term: 1, - term_start_lsn: Lsn(0), begin_lsn: lsn, end_lsn: lsn + record.len() as u64, commit_lsn: if commit { lsn } else { Lsn(0) }, // commit previous record truncate_lsn: Lsn(0), - proposer_uuid: [0; 16], }, wal_data: record, }) @@ -160,13 +160,12 @@ fn bench_wal_acceptor(c: &mut Criterion) { .take(n) .map(|(lsn, record)| AppendRequest { h: AppendRequestHeader { + generation: Generation::new(0), term: 1, - term_start_lsn: Lsn(0), begin_lsn: lsn, end_lsn: lsn + record.len() as u64, commit_lsn: Lsn(0), truncate_lsn: Lsn(0), - proposer_uuid: [0; 16], }, wal_data: record, }) @@ -262,13 +261,12 @@ fn bench_wal_acceptor_throughput(c: &mut Criterion) { runtime.block_on(async { let reqgen = walgen.take(count).map(|(lsn, record)| AppendRequest { h: AppendRequestHeader { + generation: Generation::new(0), term: 1, - term_start_lsn: Lsn(0), begin_lsn: lsn, end_lsn: lsn + record.len() as u64, commit_lsn: if commit { lsn } else { Lsn(0) }, // commit previous record truncate_lsn: Lsn(0), - proposer_uuid: [0; 16], }, wal_data: record, }); diff --git a/safekeeper/client/src/mgmt_api.rs b/safekeeper/client/src/mgmt_api.rs index 5c305769dd..0e92e87103 100644 --- a/safekeeper/client/src/mgmt_api.rs +++ b/safekeeper/client/src/mgmt_api.rs @@ -3,17 +3,16 @@ //! Partially copied from pageserver client; some parts might be better to be //! united. +use std::error::Error as _; + use http_utils::error::HttpErrorBody; use reqwest::{IntoUrl, Method, StatusCode}; use safekeeper_api::models::{ PullTimelineRequest, PullTimelineResponse, SafekeeperUtilization, TimelineCreateRequest, TimelineStatus, }; -use std::error::Error as _; -use utils::{ - id::{NodeId, TenantId, TimelineId}, - logging::SecretString, -}; +use utils::id::{NodeId, TenantId, TimelineId}; +use utils::logging::SecretString; #[derive(Debug, Clone)] pub struct Client { diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 6cc53e0d23..10fc4a4b59 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -1,52 +1,41 @@ // // Main entry point for the safekeeper executable // -use anyhow::{bail, Context, Result}; -use camino::{Utf8Path, Utf8PathBuf}; -use clap::{ArgAction, Parser}; -use futures::future::BoxFuture; -use futures::stream::FuturesUnordered; -use futures::{FutureExt, StreamExt}; -use remote_storage::RemoteStorageConfig; -use sd_notify::NotifyState; -use tokio::runtime::Handle; -use tokio::signal::unix::{signal, SignalKind}; -use tokio::task::JoinError; -use utils::logging::SecretString; - -use std::env::{var, VarError}; +use std::env::{VarError, var}; use std::fs::{self, File}; use std::io::{ErrorKind, Write}; use std::str::FromStr; use std::sync::Arc; use std::time::{Duration, Instant}; -use storage_broker::Uri; - -use tracing::*; -use utils::pid_file; +use anyhow::{Context, Result, bail}; +use camino::{Utf8Path, Utf8PathBuf}; +use clap::{ArgAction, Parser}; +use futures::future::BoxFuture; +use futures::stream::FuturesUnordered; +use futures::{FutureExt, StreamExt}; use metrics::set_build_info_metric; +use remote_storage::RemoteStorageConfig; use safekeeper::defaults::{ DEFAULT_CONTROL_FILE_SAVE_INTERVAL, DEFAULT_EVICTION_MIN_RESIDENT, DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES, DEFAULT_PARTIAL_BACKUP_CONCURRENCY, DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR, }; -use safekeeper::http; -use safekeeper::wal_service; -use safekeeper::GlobalTimelines; -use safekeeper::SafeKeeperConf; -use safekeeper::{broker, WAL_SERVICE_RUNTIME}; -use safekeeper::{control_file, BROKER_RUNTIME}; -use safekeeper::{wal_backup, HTTP_RUNTIME}; -use storage_broker::DEFAULT_ENDPOINT; -use utils::auth::{JwtAuth, Scope, SwappableJwtAuth}; -use utils::{ - id::NodeId, - logging::{self, LogFormat}, - project_build_tag, project_git_version, - sentry_init::init_sentry, - tcp_listener, +use safekeeper::{ + BROKER_RUNTIME, GlobalTimelines, HTTP_RUNTIME, SafeKeeperConf, WAL_SERVICE_RUNTIME, broker, + control_file, http, wal_backup, wal_service, }; +use sd_notify::NotifyState; +use storage_broker::{DEFAULT_ENDPOINT, Uri}; +use tokio::runtime::Handle; +use tokio::signal::unix::{SignalKind, signal}; +use tokio::task::JoinError; +use tracing::*; +use utils::auth::{JwtAuth, Scope, SwappableJwtAuth}; +use utils::id::NodeId; +use utils::logging::{self, LogFormat, SecretString}; +use utils::sentry_init::init_sentry; +use utils::{pid_file, project_build_tag, project_git_version, tcp_listener}; #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; @@ -55,7 +44,7 @@ static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; /// This adds roughly 3% overhead for allocations on average, which is acceptable considering /// performance-sensitive code will avoid allocations as far as possible anyway. #[allow(non_upper_case_globals)] -#[export_name = "malloc_conf"] +#[unsafe(export_name = "malloc_conf")] pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0"; const PID_FILE_NAME: &str = "safekeeper.pid"; diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs index 4b091e2c29..de6e275124 100644 --- a/safekeeper/src/broker.rs +++ b/safekeeper/src/broker.rs @@ -1,39 +1,25 @@ //! Communication with the broker, providing safekeeper peers and pageserver coordination. -use anyhow::anyhow; -use anyhow::bail; -use anyhow::Context; - -use anyhow::Error; -use anyhow::Result; - -use storage_broker::parse_proto_ttid; - -use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey as ProtoSubscriptionKey; -use storage_broker::proto::FilterTenantTimelineId; -use storage_broker::proto::MessageType; -use storage_broker::proto::SafekeeperDiscoveryResponse; -use storage_broker::proto::SubscribeByFilterRequest; -use storage_broker::proto::SubscribeSafekeeperInfoRequest; -use storage_broker::proto::TypeSubscription; -use storage_broker::proto::TypedMessage; -use storage_broker::Request; - -use std::sync::atomic::AtomicU64; use std::sync::Arc; -use std::time::Duration; -use std::time::Instant; -use std::time::UNIX_EPOCH; +use std::sync::atomic::AtomicU64; +use std::time::{Duration, Instant, UNIX_EPOCH}; + +use anyhow::{Context, Error, Result, anyhow, bail}; +use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey as ProtoSubscriptionKey; +use storage_broker::proto::{ + FilterTenantTimelineId, MessageType, SafekeeperDiscoveryResponse, SubscribeByFilterRequest, + SubscribeSafekeeperInfoRequest, TypeSubscription, TypedMessage, +}; +use storage_broker::{Request, parse_proto_ttid}; use tokio::task::JoinHandle; use tokio::time::sleep; use tracing::*; -use crate::metrics::BROKER_ITERATION_TIMELINES; -use crate::metrics::BROKER_PULLED_UPDATES; -use crate::metrics::BROKER_PUSHED_UPDATES; -use crate::metrics::BROKER_PUSH_ALL_UPDATES_SECONDS; -use crate::GlobalTimelines; -use crate::SafeKeeperConf; +use crate::metrics::{ + BROKER_ITERATION_TIMELINES, BROKER_PULLED_UPDATES, BROKER_PUSH_ALL_UPDATES_SECONDS, + BROKER_PUSHED_UPDATES, +}; +use crate::{GlobalTimelines, SafeKeeperConf}; const RETRY_INTERVAL_MSEC: u64 = 1000; const PUSH_INTERVAL_MSEC: u64 = 1000; diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs index 35aebfd8ad..1bf3e4cac1 100644 --- a/safekeeper/src/control_file.rs +++ b/safekeeper/src/control_file.rs @@ -1,24 +1,23 @@ //! Control file serialization, deserialization and persistence. -use anyhow::{bail, ensure, Context, Result}; -use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; -use camino::{Utf8Path, Utf8PathBuf}; -use safekeeper_api::membership::INVALID_GENERATION; -use tokio::fs::File; -use tokio::io::AsyncWriteExt; -use utils::crashsafe::durable_rename; - use std::future::Future; use std::io::Read; use std::ops::Deref; use std::path::Path; use std::time::Instant; -use crate::control_file_upgrade::downgrade_v10_to_v9; -use crate::control_file_upgrade::upgrade_control_file; +use anyhow::{Context, Result, bail, ensure}; +use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; +use camino::{Utf8Path, Utf8PathBuf}; +use safekeeper_api::membership::INVALID_GENERATION; +use tokio::fs::File; +use tokio::io::AsyncWriteExt; +use utils::bin_ser::LeSer; +use utils::crashsafe::durable_rename; + +use crate::control_file_upgrade::{downgrade_v10_to_v9, upgrade_control_file}; use crate::metrics::PERSIST_CONTROL_FILE_SECONDS; use crate::state::{EvictionState, TimelinePersistentState}; -use utils::bin_ser::LeSer; pub const SK_MAGIC: u32 = 0xcafeceefu32; pub const SK_FORMAT_VERSION: u32 = 10; @@ -234,11 +233,12 @@ impl Storage for FileStorage { #[cfg(test)] mod test { - use super::*; use safekeeper_api::membership::{Configuration, MemberSet, SafekeeperGeneration}; use tokio::fs; use utils::lsn::Lsn; + use super::*; + const NO_SYNC: bool = true; #[tokio::test] diff --git a/safekeeper/src/control_file_upgrade.rs b/safekeeper/src/control_file_upgrade.rs index 904e79f976..1ad9e62f9b 100644 --- a/safekeeper/src/control_file_upgrade.rs +++ b/safekeeper/src/control_file_upgrade.rs @@ -1,24 +1,19 @@ //! Code to deal with safekeeper control file upgrades use std::vec; -use crate::{ - safekeeper::{AcceptorState, PgUuid, TermHistory, TermLsn}, - state::{EvictionState, TimelinePersistentState}, - wal_backup_partial, -}; -use anyhow::{bail, Result}; +use anyhow::{Result, bail}; use pq_proto::SystemId; -use safekeeper_api::{ - membership::{Configuration, INVALID_GENERATION}, - ServerInfo, Term, -}; +use safekeeper_api::membership::{Configuration, INVALID_GENERATION}; +use safekeeper_api::{ServerInfo, Term}; use serde::{Deserialize, Serialize}; use tracing::*; -use utils::{ - bin_ser::LeSer, - id::{NodeId, TenantId, TimelineId}, - lsn::Lsn, -}; +use utils::bin_ser::LeSer; +use utils::id::{NodeId, TenantId, TimelineId}; +use utils::lsn::Lsn; + +use crate::safekeeper::{AcceptorState, PgUuid, TermHistory, TermLsn}; +use crate::state::{EvictionState, TimelinePersistentState}; +use crate::wal_backup_partial; /// Persistent consensus state of the acceptor. #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] @@ -552,11 +547,11 @@ pub fn downgrade_v10_to_v9(state: &TimelinePersistentState) -> TimelinePersisten mod tests { use std::str::FromStr; - use utils::{id::NodeId, Hex}; - - use crate::control_file_upgrade::PersistedPeerInfo; + use utils::Hex; + use utils::id::NodeId; use super::*; + use crate::control_file_upgrade::PersistedPeerInfo; #[test] fn roundtrip_v1() { diff --git a/safekeeper/src/copy_timeline.rs b/safekeeper/src/copy_timeline.rs index 10a761e1f5..11daff22cb 100644 --- a/safekeeper/src/copy_timeline.rs +++ b/safekeeper/src/copy_timeline.rs @@ -1,24 +1,22 @@ -use anyhow::{bail, Result}; +use std::sync::Arc; + +use anyhow::{Result, bail}; use camino::Utf8PathBuf; use postgres_ffi::{MAX_SEND_SIZE, WAL_SEGMENT_SIZE}; use safekeeper_api::membership::Configuration; -use std::sync::Arc; -use tokio::{ - fs::OpenOptions, - io::{AsyncSeekExt, AsyncWriteExt}, -}; +use tokio::fs::OpenOptions; +use tokio::io::{AsyncSeekExt, AsyncWriteExt}; use tracing::{info, warn}; -use utils::{id::TenantTimelineId, lsn::Lsn}; +use utils::id::TenantTimelineId; +use utils::lsn::Lsn; -use crate::{ - control_file::FileStorage, - state::TimelinePersistentState, - timeline::{TimelineError, WalResidentTimeline}, - timelines_global_map::{create_temp_timeline_dir, validate_temp_timeline}, - wal_backup::copy_s3_segments, - wal_storage::{wal_file_paths, WalReader}, - GlobalTimelines, -}; +use crate::GlobalTimelines; +use crate::control_file::FileStorage; +use crate::state::TimelinePersistentState; +use crate::timeline::{TimelineError, WalResidentTimeline}; +use crate::timelines_global_map::{create_temp_timeline_dir, validate_temp_timeline}; +use crate::wal_backup::copy_s3_segments; +use crate::wal_storage::{WalReader, wal_file_paths}; // we don't want to have more than 10 segments on disk after copy, because they take space const MAX_BACKUP_LAG: u64 = 10 * WAL_SEGMENT_SIZE as u64; diff --git a/safekeeper/src/debug_dump.rs b/safekeeper/src/debug_dump.rs index 19362a0992..68a38e1498 100644 --- a/safekeeper/src/debug_dump.rs +++ b/safekeeper/src/debug_dump.rs @@ -2,37 +2,25 @@ use std::fs; use std::fs::DirEntry; -use std::io::BufReader; -use std::io::Read; +use std::io::{BufReader, Read}; use std::path::PathBuf; use std::sync::Arc; -use anyhow::bail; -use anyhow::Result; -use camino::Utf8Path; -use camino::Utf8PathBuf; +use anyhow::{Result, bail}; +use camino::{Utf8Path, Utf8PathBuf}; use chrono::{DateTime, Utc}; -use postgres_ffi::XLogSegNo; -use postgres_ffi::MAX_SEND_SIZE; -use safekeeper_api::models::WalSenderState; -use serde::Deserialize; -use serde::Serialize; - use postgres_ffi::v14::xlog_utils::{IsPartialXLogFileName, IsXLogFileName}; +use postgres_ffi::{MAX_SEND_SIZE, XLogSegNo}; +use safekeeper_api::models::WalSenderState; +use serde::{Deserialize, Serialize}; use sha2::{Digest, Sha256}; -use utils::id::NodeId; -use utils::id::TenantTimelineId; -use utils::id::{TenantId, TimelineId}; +use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId}; use utils::lsn::Lsn; use crate::safekeeper::TermHistory; -use crate::state::TimelineMemState; -use crate::state::TimelinePersistentState; -use crate::timeline::get_timeline_dir; -use crate::timeline::WalResidentTimeline; -use crate::timeline_manager; -use crate::GlobalTimelines; -use crate::SafeKeeperConf; +use crate::state::{TimelineMemState, TimelinePersistentState}; +use crate::timeline::{WalResidentTimeline, get_timeline_dir}; +use crate::{GlobalTimelines, SafeKeeperConf, timeline_manager}; /// Various filters that influence the resulting JSON output. #[derive(Debug, Serialize, Deserialize, Clone)] diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index e77eeb4130..5ca3d1b7c2 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -1,35 +1,31 @@ //! Part of Safekeeper pretending to be Postgres, i.e. handling Postgres //! protocol commands. +use std::future::Future; +use std::str::{self, FromStr}; +use std::sync::Arc; + use anyhow::Context; use pageserver_api::models::ShardParameters; use pageserver_api::shard::{ShardIdentity, ShardStripeSize}; -use safekeeper_api::models::ConnectionId; +use postgres_backend::{PostgresBackend, QueryError}; +use postgres_ffi::PG_TLI; +use pq_proto::{BeMessage, FeStartupPacket, INT4_OID, RowDescriptor, TEXT_OID}; +use regex::Regex; use safekeeper_api::Term; -use std::future::Future; -use std::str::{self, FromStr}; -use std::sync::Arc; +use safekeeper_api::models::ConnectionId; use tokio::io::{AsyncRead, AsyncWrite}; -use tracing::{debug, info, info_span, Instrument}; +use tracing::{Instrument, debug, info, info_span}; +use utils::auth::{Claims, JwtAuth, Scope}; +use utils::id::{TenantId, TenantTimelineId, TimelineId}; +use utils::lsn::Lsn; use utils::postgres_client::PostgresClientProtocol; use utils::shard::{ShardCount, ShardNumber}; use crate::auth::check_permission; -use crate::json_ctrl::{handle_json_ctrl, AppendLogicalMessage}; - -use crate::metrics::{TrafficMetrics, PG_QUERIES_GAUGE}; +use crate::metrics::{PG_QUERIES_GAUGE, TrafficMetrics}; use crate::timeline::TimelineError; use crate::{GlobalTimelines, SafeKeeperConf}; -use postgres_backend::PostgresBackend; -use postgres_backend::QueryError; -use postgres_ffi::PG_TLI; -use pq_proto::{BeMessage, FeStartupPacket, RowDescriptor, INT4_OID, TEXT_OID}; -use regex::Regex; -use utils::auth::{Claims, JwtAuth, Scope}; -use utils::{ - id::{TenantId, TenantTimelineId, TimelineId}, - lsn::Lsn, -}; /// Safekeeper handler of postgres commands pub struct SafekeeperPostgresHandler { @@ -65,9 +61,6 @@ enum SafekeeperPostgresCommand { }, IdentifySystem, TimelineStatus, - JSONCtrl { - cmd: AppendLogicalMessage, - }, } fn parse_cmd(cmd: &str) -> anyhow::Result { @@ -137,11 +130,6 @@ fn parse_cmd(cmd: &str) -> anyhow::Result { Ok(SafekeeperPostgresCommand::IdentifySystem) } else if cmd.starts_with("TIMELINE_STATUS") { Ok(SafekeeperPostgresCommand::TimelineStatus) - } else if cmd.starts_with("JSON_CTRL") { - let cmd = cmd.strip_prefix("JSON_CTRL").context("invalid prefix")?; - Ok(SafekeeperPostgresCommand::JSONCtrl { - cmd: serde_json::from_str(cmd)?, - }) } else { anyhow::bail!("unsupported command {cmd}"); } @@ -153,7 +141,6 @@ fn cmd_to_string(cmd: &SafekeeperPostgresCommand) -> &str { SafekeeperPostgresCommand::StartReplication { .. } => "START_REPLICATION", SafekeeperPostgresCommand::TimelineStatus => "TIMELINE_STATUS", SafekeeperPostgresCommand::IdentifySystem => "IDENTIFY_SYSTEM", - SafekeeperPostgresCommand::JSONCtrl { .. } => "JSON_CTRL", } } @@ -362,9 +349,6 @@ impl postgres_backend::Handler } SafekeeperPostgresCommand::IdentifySystem => self.handle_identify_system(pgb).await, SafekeeperPostgresCommand::TimelineStatus => self.handle_timeline_status(pgb).await, - SafekeeperPostgresCommand::JSONCtrl { ref cmd } => { - handle_json_ctrl(self, pgb, cmd).await - } } }) } diff --git a/safekeeper/src/http/mod.rs b/safekeeper/src/http/mod.rs index 6e160b7a5e..f162985ef7 100644 --- a/safekeeper/src/http/mod.rs +++ b/safekeeper/src/http/mod.rs @@ -1,9 +1,9 @@ pub mod routes; -pub use routes::make_router; - -pub use safekeeper_api::models; use std::sync::Arc; +pub use routes::make_router; +pub use safekeeper_api::models; + use crate::{GlobalTimelines, SafeKeeperConf}; pub async fn task_main( diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index cd2ac5f44c..4f47331c85 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -1,51 +1,41 @@ -use http_utils::failpoints::failpoints_handler; -use hyper::{Body, Request, Response, StatusCode}; -use safekeeper_api::models; -use safekeeper_api::models::AcceptorStateStatus; -use safekeeper_api::models::PullTimelineRequest; -use safekeeper_api::models::SafekeeperStatus; -use safekeeper_api::models::TermSwitchApiEntry; -use safekeeper_api::models::TimelineStatus; -use safekeeper_api::ServerInfo; use std::collections::HashMap; use std::fmt; use std::io::Write as _; use std::str::FromStr; use std::sync::Arc; -use storage_broker::proto::SafekeeperTimelineInfo; -use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; + +use http_utils::endpoint::{ + self, ChannelWriter, auth_middleware, check_permission_with, profile_cpu_handler, + profile_heap_handler, prometheus_metrics_handler, request_span, +}; +use http_utils::error::ApiError; +use http_utils::failpoints::failpoints_handler; +use http_utils::json::{json_request, json_response}; +use http_utils::request::{ensure_no_body, parse_query_param, parse_request_param}; +use http_utils::{RequestExt, RouterBuilder}; +use hyper::{Body, Request, Response, StatusCode}; +use postgres_ffi::WAL_SEGMENT_SIZE; +use safekeeper_api::models::{ + AcceptorStateStatus, PullTimelineRequest, SafekeeperStatus, SkTimelineInfo, TermSwitchApiEntry, + TimelineCopyRequest, TimelineCreateRequest, TimelineStatus, TimelineTermBumpRequest, +}; +use safekeeper_api::{ServerInfo, membership, models}; +use storage_broker::proto::{SafekeeperTimelineInfo, TenantTimelineId as ProtoTenantTimelineId}; use tokio::sync::mpsc; use tokio::task; use tokio_stream::wrappers::ReceiverStream; use tokio_util::sync::CancellationToken; -use tracing::{info_span, Instrument}; - -use http_utils::endpoint::{ - profile_cpu_handler, profile_heap_handler, prometheus_metrics_handler, request_span, -}; -use http_utils::{ - endpoint::{self, auth_middleware, check_permission_with, ChannelWriter}, - error::ApiError, - json::{json_request, json_response}, - request::{ensure_no_body, parse_query_param, parse_request_param}, - RequestExt, RouterBuilder, -}; - -use postgres_ffi::WAL_SEGMENT_SIZE; -use safekeeper_api::models::{SkTimelineInfo, TimelineCopyRequest}; -use safekeeper_api::models::{TimelineCreateRequest, TimelineTermBumpRequest}; -use utils::{ - auth::SwappableJwtAuth, - id::{TenantId, TenantTimelineId, TimelineId}, - lsn::Lsn, -}; +use tracing::{Instrument, info_span}; +use utils::auth::SwappableJwtAuth; +use utils::id::{TenantId, TenantTimelineId, TimelineId}; +use utils::lsn::Lsn; use crate::debug_dump::TimelineDigestRequest; use crate::safekeeper::TermLsn; -use crate::timelines_global_map::TimelineDeleteForceResult; -use crate::GlobalTimelines; -use crate::SafeKeeperConf; -use crate::{copy_timeline, debug_dump, patch_control_file, pull_timeline}; +use crate::timelines_global_map::{DeleteOrExclude, TimelineDeleteResult}; +use crate::{ + GlobalTimelines, SafeKeeperConf, copy_timeline, debug_dump, patch_control_file, pull_timeline, +}; /// Healthcheck handler. async fn status_handler(request: Request) -> Result, ApiError> { @@ -83,10 +73,13 @@ async fn tenant_delete_handler(mut request: Request) -> Result) -> Result>(), + .collect::>(), ) } @@ -218,12 +211,15 @@ async fn timeline_delete_handler(mut request: Request) -> Result) -> Result for ApiError { + fn from(de: DeleteOrExcludeError) -> ApiError { + match de { + DeleteOrExcludeError::Conflict { + requested: _, + current: _, + } => ApiError::Conflict(de.to_string()), + DeleteOrExcludeError::Other(e) => ApiError::InternalServerError(e), + } + } +} + +/// Remove timeline locally after this node has been excluded from the +/// membership configuration. The body is the same as in the membership endpoint +/// -- conf where node is excluded -- and in principle single ep could be used +/// for both actions, but since this is a data deletion op let's keep them +/// separate. +async fn timeline_exclude_handler(mut request: Request) -> Result, ApiError> { + let ttid = TenantTimelineId::new( + parse_request_param(&request, "tenant_id")?, + parse_request_param(&request, "timeline_id")?, + ); + check_permission(&request, Some(ttid.tenant_id))?; + + let global_timelines = get_global_timelines(&request); + let data: models::TimelineMembershipSwitchRequest = json_request(&mut request).await?; + let my_id = get_conf(&request).my_id; + // If request doesn't exclude us, membership switch endpoint should be used + // instead. + if data.mconf.contains(my_id) { + return Err(ApiError::Forbidden(format!( + "refused to switch into {}, node {} is member of it", + data.mconf, my_id + ))); + } + let action = DeleteOrExclude::Exclude(data.mconf); + + let resp = global_timelines + .delete_or_exclude(&ttid, action) + .await + .map_err(ApiError::from)?; + json_response(StatusCode::OK, resp) +} + /// Consider switching timeline membership configuration to the provided one. async fn timeline_membership_handler( mut request: Request, @@ -291,12 +345,29 @@ async fn timeline_membership_handler( let tli = global_timelines.get(ttid).map_err(ApiError::from)?; let data: models::TimelineMembershipSwitchRequest = json_request(&mut request).await?; + let my_id = get_conf(&request).my_id; + // If request excludes us, exclude endpoint should be used instead. + if !data.mconf.contains(my_id) { + return Err(ApiError::Forbidden(format!( + "refused to switch into {}, node {} is not a member of it", + data.mconf, my_id + ))); + } + let req_gen = data.mconf.generation; let response = tli .membership_switch(data.mconf) .await .map_err(ApiError::InternalServerError)?; - json_response(StatusCode::OK, response) + // Return 409 if request was ignored. + if req_gen == response.current_conf.generation { + json_response(StatusCode::OK, response) + } else { + Err(ApiError::Conflict(format!( + "request to switch into {} ignored, current generation {}", + req_gen, response.current_conf.generation + ))) + } } async fn timeline_copy_handler(mut request: Request) -> Result, ApiError> { @@ -647,11 +718,14 @@ pub fn make_router( .post("/v1/pull_timeline", |r| { request_span(r, timeline_pull_handler) }) + .put("/v1/tenant/:tenant_id/timeline/:timeline_id/exclude", |r| { + request_span(r, timeline_exclude_handler) + }) .get( "/v1/tenant/:tenant_id/timeline/:timeline_id/snapshot/:destination_id", |r| request_span(r, timeline_snapshot_handler), ) - .post( + .put( "/v1/tenant/:tenant_id/timeline/:timeline_id/membership", |r| request_span(r, timeline_membership_handler), ) diff --git a/safekeeper/src/json_ctrl.rs b/safekeeper/src/json_ctrl.rs deleted file mode 100644 index 19e17c4a75..0000000000 --- a/safekeeper/src/json_ctrl.rs +++ /dev/null @@ -1,196 +0,0 @@ -//! -//! This module implements JSON_CTRL protocol, which allows exchange -//! JSON messages over psql for testing purposes. -//! -//! Currently supports AppendLogicalMessage, which is used for WAL -//! modifications in tests. -//! - -use anyhow::Context; -use postgres_backend::QueryError; -use safekeeper_api::membership::Configuration; -use safekeeper_api::{ServerInfo, Term}; -use serde::{Deserialize, Serialize}; -use tokio::io::{AsyncRead, AsyncWrite}; -use tracing::*; - -use crate::handler::SafekeeperPostgresHandler; -use crate::safekeeper::{AcceptorProposerMessage, AppendResponse}; -use crate::safekeeper::{ - AppendRequest, AppendRequestHeader, ProposerAcceptorMessage, ProposerElected, -}; -use crate::safekeeper::{TermHistory, TermLsn}; -use crate::state::TimelinePersistentState; -use crate::timeline::WalResidentTimeline; -use postgres_backend::PostgresBackend; -use postgres_ffi::encode_logical_message; -use postgres_ffi::WAL_SEGMENT_SIZE; -use pq_proto::{BeMessage, RowDescriptor, TEXT_OID}; -use utils::lsn::Lsn; - -#[derive(Serialize, Deserialize, Debug)] -pub struct AppendLogicalMessage { - // prefix and message to build LogicalMessage - pub lm_prefix: String, - pub lm_message: String, - - // if true, commit_lsn will match flush_lsn after append - pub set_commit_lsn: bool, - - // if true, ProposerElected will be sent before append - pub send_proposer_elected: bool, - - // fields from AppendRequestHeader - pub term: Term, - #[serde(with = "utils::lsn::serde_as_u64")] - pub epoch_start_lsn: Lsn, - #[serde(with = "utils::lsn::serde_as_u64")] - pub begin_lsn: Lsn, - #[serde(with = "utils::lsn::serde_as_u64")] - pub truncate_lsn: Lsn, - pub pg_version: u32, -} - -#[derive(Debug, Serialize)] -struct AppendResult { - // safekeeper state after append - state: TimelinePersistentState, - // info about new record in the WAL - inserted_wal: InsertedWAL, -} - -/// Handles command to craft logical message WAL record with given -/// content, and then append it with specified term and lsn. This -/// function is used to test safekeepers in different scenarios. -pub async fn handle_json_ctrl( - spg: &SafekeeperPostgresHandler, - pgb: &mut PostgresBackend, - append_request: &AppendLogicalMessage, -) -> Result<(), QueryError> { - info!("JSON_CTRL request: {append_request:?}"); - - // need to init safekeeper state before AppendRequest - let tli = prepare_safekeeper(spg, append_request.pg_version).await?; - - // if send_proposer_elected is true, we need to update local history - if append_request.send_proposer_elected { - send_proposer_elected(&tli, append_request.term, append_request.epoch_start_lsn).await?; - } - - let inserted_wal = append_logical_message(&tli, append_request).await?; - let response = AppendResult { - state: tli.get_state().await.1, - inserted_wal, - }; - let response_data = serde_json::to_vec(&response) - .with_context(|| format!("Response {response:?} is not a json array"))?; - - pgb.write_message_noflush(&BeMessage::RowDescription(&[RowDescriptor { - name: b"json", - typoid: TEXT_OID, - typlen: -1, - ..Default::default() - }]))? - .write_message_noflush(&BeMessage::DataRow(&[Some(&response_data)]))? - .write_message_noflush(&BeMessage::CommandComplete(b"JSON_CTRL"))?; - Ok(()) -} - -/// Prepare safekeeper to process append requests without crashes, -/// by sending ProposerGreeting with default server.wal_seg_size. -async fn prepare_safekeeper( - spg: &SafekeeperPostgresHandler, - pg_version: u32, -) -> anyhow::Result { - let tli = spg - .global_timelines - .create( - spg.ttid, - Configuration::empty(), - ServerInfo { - pg_version, - wal_seg_size: WAL_SEGMENT_SIZE as u32, - system_id: 0, - }, - Lsn::INVALID, - Lsn::INVALID, - ) - .await?; - - tli.wal_residence_guard().await -} - -async fn send_proposer_elected( - tli: &WalResidentTimeline, - term: Term, - lsn: Lsn, -) -> anyhow::Result<()> { - // add new term to existing history - let history = tli.get_state().await.1.acceptor_state.term_history; - let history = history.up_to(lsn.checked_sub(1u64).unwrap()); - let mut history_entries = history.0; - history_entries.push(TermLsn { term, lsn }); - let history = TermHistory(history_entries); - - let proposer_elected_request = ProposerAcceptorMessage::Elected(ProposerElected { - term, - start_streaming_at: lsn, - term_history: history, - timeline_start_lsn: lsn, - }); - - tli.process_msg(&proposer_elected_request).await?; - Ok(()) -} - -#[derive(Debug, Serialize)] -pub struct InsertedWAL { - begin_lsn: Lsn, - pub end_lsn: Lsn, - append_response: AppendResponse, -} - -/// Extend local WAL with new LogicalMessage record. To do that, -/// create AppendRequest with new WAL and pass it to safekeeper. -pub async fn append_logical_message( - tli: &WalResidentTimeline, - msg: &AppendLogicalMessage, -) -> anyhow::Result { - let wal_data = encode_logical_message(&msg.lm_prefix, &msg.lm_message); - let sk_state = tli.get_state().await.1; - - let begin_lsn = msg.begin_lsn; - let end_lsn = begin_lsn + wal_data.len() as u64; - - let commit_lsn = if msg.set_commit_lsn { - end_lsn - } else { - sk_state.commit_lsn - }; - - let append_request = ProposerAcceptorMessage::AppendRequest(AppendRequest { - h: AppendRequestHeader { - term: msg.term, - term_start_lsn: begin_lsn, - begin_lsn, - end_lsn, - commit_lsn, - truncate_lsn: msg.truncate_lsn, - proposer_uuid: [0u8; 16], - }, - wal_data, - }); - - let response = tli.process_msg(&append_request).await?; - - let append_response = match response { - Some(AcceptorProposerMessage::AppendResponse(resp)) => resp, - _ => anyhow::bail!("not AppendResponse"), - }; - - Ok(InsertedWAL { - begin_lsn, - end_lsn, - append_response, - }) -} diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index e0090c638a..de3b783508 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -2,15 +2,16 @@ extern crate hyper0 as hyper; +use std::time::Duration; + use camino::Utf8PathBuf; use once_cell::sync::Lazy; use remote_storage::RemoteStorageConfig; -use tokio::runtime::Runtime; - -use std::time::Duration; use storage_broker::Uri; - -use utils::{auth::SwappableJwtAuth, id::NodeId, logging::SecretString}; +use tokio::runtime::Runtime; +use utils::auth::SwappableJwtAuth; +use utils::id::NodeId; +use utils::logging::SecretString; mod auth; pub mod broker; @@ -20,7 +21,6 @@ pub mod copy_timeline; pub mod debug_dump; pub mod handler; pub mod http; -pub mod json_ctrl; pub mod metrics; pub mod patch_control_file; pub mod pull_timeline; @@ -48,6 +48,7 @@ pub mod test_utils; mod timelines_global_map; use std::sync::Arc; + pub use timelines_global_map::GlobalTimelines; use utils::auth::JwtAuth; diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index 3ea9e3d674..cb21a5f6d2 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -1,30 +1,28 @@ //! Global safekeeper mertics and per-timeline safekeeper metrics. -use std::{ - sync::{Arc, RwLock}, - time::{Instant, SystemTime}, -}; +use std::sync::{Arc, RwLock}; +use std::time::{Instant, SystemTime}; use anyhow::Result; use futures::Future; +use metrics::core::{AtomicU64, Collector, Desc, GenericCounter, GenericGaugeVec, Opts}; +use metrics::proto::MetricFamily; use metrics::{ - core::{AtomicU64, Collector, Desc, GenericCounter, GenericGaugeVec, Opts}, - pow2_buckets, - proto::MetricFamily, + DISK_FSYNC_SECONDS_BUCKETS, Gauge, GaugeVec, Histogram, HistogramVec, IntCounter, + IntCounterPair, IntCounterPairVec, IntCounterVec, IntGauge, IntGaugeVec, pow2_buckets, register_histogram, register_histogram_vec, register_int_counter, register_int_counter_pair, register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge, - register_int_gauge_vec, Gauge, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair, - IntCounterPairVec, IntCounterVec, IntGauge, IntGaugeVec, DISK_FSYNC_SECONDS_BUCKETS, + register_int_gauge_vec, }; use once_cell::sync::Lazy; use postgres_ffi::XLogSegNo; -use utils::{id::TenantTimelineId, lsn::Lsn, pageserver_feedback::PageserverFeedback}; +use utils::id::TenantTimelineId; +use utils::lsn::Lsn; +use utils::pageserver_feedback::PageserverFeedback; -use crate::{ - receive_wal::MSG_QUEUE_SIZE, - state::{TimelineMemState, TimelinePersistentState}, - GlobalTimelines, -}; +use crate::GlobalTimelines; +use crate::receive_wal::MSG_QUEUE_SIZE; +use crate::state::{TimelineMemState, TimelinePersistentState}; // Global metrics across all timelines. pub static WRITE_WAL_BYTES: Lazy = Lazy::new(|| { diff --git a/safekeeper/src/patch_control_file.rs b/safekeeper/src/patch_control_file.rs index 2136d1b5f7..efdbd9b3d7 100644 --- a/safekeeper/src/patch_control_file.rs +++ b/safekeeper/src/patch_control_file.rs @@ -4,7 +4,8 @@ use serde::{Deserialize, Serialize}; use serde_json::Value; use tracing::info; -use crate::{state::TimelinePersistentState, timeline::Timeline}; +use crate::state::TimelinePersistentState; +use crate::timeline::Timeline; #[derive(Deserialize, Debug, Clone)] pub struct Request { diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs index 4827b73074..fc58b8509a 100644 --- a/safekeeper/src/pull_timeline.rs +++ b/safekeeper/src/pull_timeline.rs @@ -1,46 +1,38 @@ -use anyhow::{anyhow, bail, Context, Result}; +use std::cmp::min; +use std::io::{self, ErrorKind}; +use std::sync::Arc; + +use anyhow::{Context, Result, anyhow, bail}; use bytes::Bytes; use camino::Utf8PathBuf; use chrono::{DateTime, Utc}; use futures::{SinkExt, StreamExt, TryStreamExt}; -use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI}; -use safekeeper_api::{ - models::{PullTimelineRequest, PullTimelineResponse, TimelineStatus}, - Term, -}; +use postgres_ffi::{PG_TLI, XLogFileName, XLogSegNo}; +use safekeeper_api::Term; +use safekeeper_api::models::{PullTimelineRequest, PullTimelineResponse, TimelineStatus}; use safekeeper_client::mgmt_api; use safekeeper_client::mgmt_api::Client; use serde::Deserialize; -use std::{ - cmp::min, - io::{self, ErrorKind}, - sync::Arc, -}; -use tokio::{fs::OpenOptions, io::AsyncWrite, sync::mpsc, task}; +use tokio::fs::OpenOptions; +use tokio::io::AsyncWrite; +use tokio::sync::mpsc; +use tokio::task; use tokio_tar::{Archive, Builder, Header}; -use tokio_util::{ - io::{CopyToBytes, SinkWriter}, - sync::PollSender, -}; +use tokio_util::io::{CopyToBytes, SinkWriter}; +use tokio_util::sync::PollSender; use tracing::{error, info, instrument}; +use utils::crashsafe::fsync_async_opt; +use utils::id::{NodeId, TenantTimelineId}; +use utils::logging::SecretString; +use utils::lsn::Lsn; +use utils::pausable_failpoint; -use crate::{ - control_file::CONTROL_FILE_NAME, - debug_dump, - state::{EvictionState, TimelinePersistentState}, - timeline::{Timeline, WalResidentTimeline}, - timelines_global_map::{create_temp_timeline_dir, validate_temp_timeline}, - wal_backup, - wal_storage::open_wal_file, - GlobalTimelines, -}; -use utils::{ - crashsafe::fsync_async_opt, - id::{NodeId, TenantTimelineId}, - logging::SecretString, - lsn::Lsn, - pausable_failpoint, -}; +use crate::control_file::CONTROL_FILE_NAME; +use crate::state::{EvictionState, TimelinePersistentState}; +use crate::timeline::{Timeline, WalResidentTimeline}; +use crate::timelines_global_map::{create_temp_timeline_dir, validate_temp_timeline}; +use crate::wal_storage::open_wal_file; +use crate::{GlobalTimelines, debug_dump, wal_backup}; /// Stream tar archive of timeline to tx. #[instrument(name = "snapshot", skip_all, fields(ttid = %tli.ttid))] @@ -374,8 +366,13 @@ impl WalResidentTimeline { // change, but as long as older history is strictly part of new that's // fine), but there is no need to do it. if bctx.term != term || bctx.last_log_term != last_log_term { - bail!("term(s) changed during snapshot: were term={}, last_log_term={}, now term={}, last_log_term={}", - bctx.term, bctx.last_log_term, term, last_log_term); + bail!( + "term(s) changed during snapshot: were term={}, last_log_term={}, now term={}, last_log_term={}", + bctx.term, + bctx.last_log_term, + term, + last_log_term + ); } Ok(()) } diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index cb42f6f414..7967acde3f 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -2,35 +2,21 @@ //! Gets messages from the network, passes them down to consensus module and //! sends replies back. -use crate::handler::SafekeeperPostgresHandler; -use crate::metrics::{ - WAL_RECEIVERS, WAL_RECEIVER_QUEUE_DEPTH, WAL_RECEIVER_QUEUE_DEPTH_TOTAL, - WAL_RECEIVER_QUEUE_SIZE_TOTAL, -}; -use crate::safekeeper::AcceptorProposerMessage; -use crate::safekeeper::ProposerAcceptorMessage; -use crate::timeline::WalResidentTimeline; -use crate::GlobalTimelines; -use anyhow::{anyhow, Context}; -use bytes::BytesMut; -use parking_lot::MappedMutexGuard; -use parking_lot::Mutex; -use parking_lot::MutexGuard; -use postgres_backend::CopyStreamHandlerEnd; -use postgres_backend::PostgresBackend; -use postgres_backend::PostgresBackendReader; -use postgres_backend::QueryError; -use pq_proto::BeMessage; -use safekeeper_api::membership::Configuration; -use safekeeper_api::models::{ConnectionId, WalReceiverState, WalReceiverStatus}; -use safekeeper_api::ServerInfo; use std::future; use std::net::SocketAddr; use std::sync::Arc; -use tokio::io::AsyncRead; -use tokio::io::AsyncWrite; + +use anyhow::{Context, anyhow}; +use bytes::BytesMut; +use parking_lot::{MappedMutexGuard, Mutex, MutexGuard}; +use postgres_backend::{CopyStreamHandlerEnd, PostgresBackend, PostgresBackendReader, QueryError}; +use pq_proto::BeMessage; +use safekeeper_api::ServerInfo; +use safekeeper_api::membership::Configuration; +use safekeeper_api::models::{ConnectionId, WalReceiverState, WalReceiverStatus}; +use tokio::io::{AsyncRead, AsyncWrite}; use tokio::sync::mpsc::error::SendTimeoutError; -use tokio::sync::mpsc::{channel, Receiver, Sender}; +use tokio::sync::mpsc::{Receiver, Sender, channel}; use tokio::task; use tokio::task::JoinHandle; use tokio::time::{Duration, Instant, MissedTickBehavior}; @@ -39,6 +25,15 @@ use utils::id::TenantTimelineId; use utils::lsn::Lsn; use utils::pageserver_feedback::PageserverFeedback; +use crate::GlobalTimelines; +use crate::handler::SafekeeperPostgresHandler; +use crate::metrics::{ + WAL_RECEIVER_QUEUE_DEPTH, WAL_RECEIVER_QUEUE_DEPTH_TOTAL, WAL_RECEIVER_QUEUE_SIZE_TOTAL, + WAL_RECEIVERS, +}; +use crate::safekeeper::{AcceptorProposerMessage, ProposerAcceptorMessage}; +use crate::timeline::WalResidentTimeline; + const DEFAULT_FEEDBACK_CAPACITY: usize = 8; /// Registry of WalReceivers (compute connections). Timeline holds it (wrapped @@ -281,7 +276,7 @@ impl SafekeeperPostgresHandler { tokio::select! { // todo: add read|write .context to these errors r = network_reader.run(msg_tx, msg_rx, reply_tx, timeline, next_msg) => r, - r = network_write(pgb, reply_rx, pageserver_feedback_rx) => r, + r = network_write(pgb, reply_rx, pageserver_feedback_rx, proto_version) => r, _ = timeline_cancel.cancelled() => { return Err(CopyStreamHandlerEnd::Cancelled); } @@ -342,8 +337,8 @@ impl NetworkReader<'_, IO> { let tli = match next_msg { ProposerAcceptorMessage::Greeting(ref greeting) => { info!( - "start handshake with walproposer {} sysid {} timeline {}", - self.peer_addr, greeting.system_id, greeting.tli, + "start handshake with walproposer {} sysid {}", + self.peer_addr, greeting.system_id, ); let server_info = ServerInfo { pg_version: greeting.pg_version, @@ -371,7 +366,7 @@ impl NetworkReader<'_, IO> { _ => { return Err(CopyStreamHandlerEnd::Other(anyhow::anyhow!( "unexpected message {next_msg:?} instead of greeting" - ))) + ))); } }; Ok((tli, next_msg)) @@ -459,6 +454,7 @@ async fn network_write( pgb_writer: &mut PostgresBackend, mut reply_rx: Receiver, mut pageserver_feedback_rx: tokio::sync::broadcast::Receiver, + proto_version: u32, ) -> Result<(), CopyStreamHandlerEnd> { let mut buf = BytesMut::with_capacity(128); @@ -496,7 +492,7 @@ async fn network_write( }; buf.clear(); - msg.serialize(&mut buf)?; + msg.serialize(&mut buf, proto_version)?; pgb_writer.write_message(&BeMessage::CopyData(&buf)).await?; } } diff --git a/safekeeper/src/recovery.rs b/safekeeper/src/recovery.rs index 35394eb6ed..c2760792b8 100644 --- a/safekeeper/src/recovery.rs +++ b/safekeeper/src/recovery.rs @@ -1,39 +1,36 @@ //! This module implements pulling WAL from peer safekeepers if compute can't //! provide it, i.e. safekeeper lags too much. +use std::fmt; +use std::pin::pin; use std::time::SystemTime; -use std::{fmt, pin::pin}; -use anyhow::{bail, Context}; +use anyhow::{Context, bail}; use futures::StreamExt; use postgres_protocol::message::backend::ReplicationMessage; -use safekeeper_api::models::{PeerInfo, TimelineStatus}; use safekeeper_api::Term; -use tokio::sync::mpsc::{channel, Receiver, Sender}; -use tokio::time::timeout; -use tokio::{ - select, - time::sleep, - time::{self, Duration}, -}; +use safekeeper_api::membership::INVALID_GENERATION; +use safekeeper_api::models::{PeerInfo, TimelineStatus}; +use tokio::select; +use tokio::sync::mpsc::{Receiver, Sender, channel}; +use tokio::time::{self, Duration, sleep, timeout}; use tokio_postgres::replication::ReplicationStream; use tokio_postgres::types::PgLsn; use tracing::*; -use utils::postgres_client::{ConnectionConfigArgs, PostgresClientProtocol}; -use utils::{id::NodeId, lsn::Lsn, postgres_client::wal_stream_connection_config}; - -use crate::receive_wal::{WalAcceptor, REPLY_QUEUE_SIZE}; -use crate::safekeeper::{AppendRequest, AppendRequestHeader}; -use crate::timeline::WalResidentTimeline; -use crate::{ - receive_wal::MSG_QUEUE_SIZE, - safekeeper::{ - AcceptorProposerMessage, ProposerAcceptorMessage, ProposerElected, TermHistory, TermLsn, - VoteRequest, - }, - SafeKeeperConf, +use utils::id::NodeId; +use utils::lsn::Lsn; +use utils::postgres_client::{ + ConnectionConfigArgs, PostgresClientProtocol, wal_stream_connection_config, }; +use crate::SafeKeeperConf; +use crate::receive_wal::{MSG_QUEUE_SIZE, REPLY_QUEUE_SIZE, WalAcceptor}; +use crate::safekeeper::{ + AcceptorProposerMessage, AppendRequest, AppendRequestHeader, ProposerAcceptorMessage, + ProposerElected, TermHistory, TermLsn, VoteRequest, +}; +use crate::timeline::WalResidentTimeline; + /// Entrypoint for per timeline task which always runs, checking whether /// recovery for this safekeeper is needed and starting it if so. #[instrument(name = "recovery", skip_all, fields(ttid = %tli.ttid))] @@ -267,7 +264,10 @@ async fn recover( ); // Now understand our term history. - let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { term: donor.term }); + let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { + generation: INVALID_GENERATION, + term: donor.term, + }); let vote_response = match tli .process_msg(&vote_request) .await @@ -302,10 +302,10 @@ async fn recover( // truncate WAL locally let pe = ProposerAcceptorMessage::Elected(ProposerElected { + generation: INVALID_GENERATION, term: donor.term, start_streaming_at: last_common_point.lsn, term_history: donor_th, - timeline_start_lsn: Lsn::INVALID, }); // Successful ProposerElected handling always returns None. If term changed, // we'll find out that during the streaming. Note: it is expected to get @@ -351,7 +351,9 @@ async fn recovery_stream( { Ok(client_and_conn) => client_and_conn?, Err(_elapsed) => { - bail!("timed out while waiting {connect_timeout:?} for connection to peer safekeeper to open"); + bail!( + "timed out while waiting {connect_timeout:?} for connection to peer safekeeper to open" + ); } }; trace!("connected to {:?}", donor); @@ -437,13 +439,12 @@ async fn network_io( match msg { ReplicationMessage::XLogData(xlog_data) => { let ar_hdr = AppendRequestHeader { + generation: INVALID_GENERATION, term: donor.term, - term_start_lsn: Lsn::INVALID, // unused begin_lsn: Lsn(xlog_data.wal_start()), end_lsn: Lsn(xlog_data.wal_start()) + xlog_data.data().len() as u64, commit_lsn: Lsn::INVALID, // do not attempt to advance, peer communication anyway does it truncate_lsn: Lsn::INVALID, // do not attempt to advance - proposer_uuid: [0; 16], }; let ar = AppendRequest { h: ar_hdr, diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index f816f8459a..886cac869d 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -1,35 +1,34 @@ //! Acceptor part of proposer-acceptor consensus algorithm. -use anyhow::{bail, Context, Result}; -use byteorder::{LittleEndian, ReadBytesExt}; -use bytes::{Buf, BufMut, Bytes, BytesMut}; - -use postgres_ffi::{TimeLineID, MAX_SEND_SIZE}; -use safekeeper_api::models::HotStandbyFeedback; -use safekeeper_api::Term; -use serde::{Deserialize, Serialize}; -use std::cmp::max; -use std::cmp::min; +use std::cmp::{max, min}; use std::fmt; use std::io::Read; -use storage_broker::proto::SafekeeperTimelineInfo; +use std::str::FromStr; -use tracing::*; - -use crate::control_file; -use crate::metrics::MISC_OPERATION_SECONDS; - -use crate::state::TimelineState; -use crate::wal_storage; +use anyhow::{Context, Result, bail}; +use byteorder::{LittleEndian, ReadBytesExt}; +use bytes::{Buf, BufMut, Bytes, BytesMut}; +use postgres_ffi::{MAX_SEND_SIZE, TimeLineID}; use pq_proto::SystemId; -use utils::pageserver_feedback::PageserverFeedback; -use utils::{ - bin_ser::LeSer, - id::{NodeId, TenantId, TimelineId}, - lsn::Lsn, +use safekeeper_api::membership::{ + INVALID_GENERATION, MemberSet, SafekeeperGeneration as Generation, SafekeeperId, }; +use safekeeper_api::models::HotStandbyFeedback; +use safekeeper_api::{Term, membership}; +use serde::{Deserialize, Serialize}; +use storage_broker::proto::SafekeeperTimelineInfo; +use tracing::*; +use utils::bin_ser::LeSer; +use utils::id::{NodeId, TenantId, TimelineId}; +use utils::lsn::Lsn; +use utils::pageserver_feedback::PageserverFeedback; -pub const SK_PROTOCOL_VERSION: u32 = 2; +use crate::metrics::MISC_OPERATION_SECONDS; +use crate::state::TimelineState; +use crate::{control_file, wal_storage}; + +pub const SK_PROTO_VERSION_2: u32 = 2; +pub const SK_PROTO_VERSION_3: u32 = 3; pub const UNKNOWN_SERVER_VERSION: u32 = 0; #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)] @@ -56,8 +55,28 @@ impl TermHistory { TermHistory(Vec::new()) } - // Parse TermHistory as n_entries followed by TermLsn pairs + // Parse TermHistory as n_entries followed by TermLsn pairs in network order. pub fn from_bytes(bytes: &mut Bytes) -> Result { + let n_entries = bytes + .get_u32_f() + .with_context(|| "TermHistory misses len")?; + let mut res = Vec::with_capacity(n_entries as usize); + for i in 0..n_entries { + let term = bytes + .get_u64_f() + .with_context(|| format!("TermHistory pos {} misses term", i))?; + let lsn = bytes + .get_u64_f() + .with_context(|| format!("TermHistory pos {} misses lsn", i))? + .into(); + res.push(TermLsn { term, lsn }) + } + Ok(TermHistory(res)) + } + + // Parse TermHistory as n_entries followed by TermLsn pairs in LE order. + // TODO remove once v2 protocol is fully dropped. + pub fn from_bytes_le(bytes: &mut Bytes) -> Result { if bytes.remaining() < 4 { bail!("TermHistory misses len"); } @@ -197,6 +216,18 @@ impl AcceptorState { /// Initial Proposer -> Acceptor message #[derive(Debug, Deserialize)] pub struct ProposerGreeting { + pub tenant_id: TenantId, + pub timeline_id: TimelineId, + pub mconf: membership::Configuration, + /// Postgres server version + pub pg_version: u32, + pub system_id: SystemId, + pub wal_seg_size: u32, +} + +/// V2 of the message; exists as a struct because we (de)serialized it as is. +#[derive(Debug, Deserialize)] +pub struct ProposerGreetingV2 { /// proposer-acceptor protocol version pub protocol_version: u32, /// Postgres server version @@ -213,39 +244,47 @@ pub struct ProposerGreeting { /// (acceptor voted for). #[derive(Debug, Serialize)] pub struct AcceptorGreeting { - term: u64, node_id: NodeId, + mconf: membership::Configuration, + term: u64, } /// Vote request sent from proposer to safekeepers -#[derive(Debug, Deserialize)] +#[derive(Debug)] pub struct VoteRequest { + pub generation: Generation, + pub term: Term, +} + +/// V2 of the message; exists as a struct because we (de)serialized it as is. +#[derive(Debug, Deserialize)] +pub struct VoteRequestV2 { pub term: Term, } /// Vote itself, sent from safekeeper to proposer #[derive(Debug, Serialize)] pub struct VoteResponse { + generation: Generation, // membership conf generation pub term: Term, // safekeeper's current term; if it is higher than proposer's, the compute is out of date. - vote_given: u64, // fixme u64 due to padding + vote_given: bool, // Safekeeper flush_lsn (end of WAL) + history of term switches allow // proposer to choose the most advanced one. pub flush_lsn: Lsn, truncate_lsn: Lsn, pub term_history: TermHistory, - timeline_start_lsn: Lsn, } /* * Proposer -> Acceptor message announcing proposer is elected and communicating * term history to it. */ -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct ProposerElected { + pub generation: Generation, // membership conf generation pub term: Term, pub start_streaming_at: Lsn, pub term_history: TermHistory, - pub timeline_start_lsn: Lsn, } /// Request with WAL message sent from proposer to safekeeper. Along the way it @@ -257,6 +296,22 @@ pub struct AppendRequest { } #[derive(Debug, Clone, Deserialize)] pub struct AppendRequestHeader { + pub generation: Generation, // membership conf generation + // safekeeper's current term; if it is higher than proposer's, the compute is out of date. + pub term: Term, + /// start position of message in WAL + pub begin_lsn: Lsn, + /// end position of message in WAL + pub end_lsn: Lsn, + /// LSN committed by quorum of safekeepers + pub commit_lsn: Lsn, + /// minimal LSN which may be needed by proposer to perform recovery of some safekeeper + pub truncate_lsn: Lsn, +} + +/// V2 of the message; exists as a struct because we (de)serialized it as is. +#[derive(Debug, Clone, Deserialize)] +pub struct AppendRequestHeaderV2 { // safekeeper's current term; if it is higher than proposer's, the compute is out of date. pub term: Term, // TODO: remove this field from the protocol, it in unused -- LSN of term @@ -277,6 +332,9 @@ pub struct AppendRequestHeader { /// Report safekeeper state to proposer #[derive(Debug, Serialize, Clone)] pub struct AppendResponse { + // Membership conf generation. Not strictly required because on mismatch + // connection is reset, but let's sanity check it. + generation: Generation, // Current term of the safekeeper; if it is higher than proposer's, the // compute is out of date. pub term: Term, @@ -293,8 +351,9 @@ pub struct AppendResponse { } impl AppendResponse { - fn term_only(term: Term) -> AppendResponse { + fn term_only(generation: Generation, term: Term) -> AppendResponse { AppendResponse { + generation, term, flush_lsn: Lsn(0), commit_lsn: Lsn(0), @@ -315,72 +374,322 @@ pub enum ProposerAcceptorMessage { FlushWAL, } -impl ProposerAcceptorMessage { - /// Parse proposer message. - pub fn parse(msg_bytes: Bytes, proto_version: u32) -> Result { - if proto_version != SK_PROTOCOL_VERSION { - bail!( - "incompatible protocol version {}, expected {}", - proto_version, - SK_PROTOCOL_VERSION - ); +/// Augment Bytes with fallible get_uN where N is number of bytes methods. +/// All reads are in network (big endian) order. +trait BytesF { + fn get_u8_f(&mut self) -> Result; + fn get_u16_f(&mut self) -> Result; + fn get_u32_f(&mut self) -> Result; + fn get_u64_f(&mut self) -> Result; +} + +impl BytesF for Bytes { + fn get_u8_f(&mut self) -> Result { + if self.is_empty() { + bail!("no bytes left, expected 1"); } - // xxx using Reader is inefficient but easy to work with bincode - let mut stream = msg_bytes.reader(); - // u64 is here to avoid padding; it will be removed once we stop packing C structs into the wire as is - let tag = stream.read_u64::()? as u8 as char; - match tag { - 'g' => { - let msg = ProposerGreeting::des_from(&mut stream)?; - Ok(ProposerAcceptorMessage::Greeting(msg)) - } - 'v' => { - let msg = VoteRequest::des_from(&mut stream)?; - Ok(ProposerAcceptorMessage::VoteRequest(msg)) - } - 'e' => { - let mut msg_bytes = stream.into_inner(); - if msg_bytes.remaining() < 16 { - bail!("ProposerElected message is not complete"); - } - let term = msg_bytes.get_u64_le(); - let start_streaming_at = msg_bytes.get_u64_le().into(); - let term_history = TermHistory::from_bytes(&mut msg_bytes)?; - if msg_bytes.remaining() < 8 { - bail!("ProposerElected message is not complete"); - } - let timeline_start_lsn = msg_bytes.get_u64_le().into(); - let msg = ProposerElected { - term, - start_streaming_at, - timeline_start_lsn, - term_history, + Ok(self.get_u8()) + } + fn get_u16_f(&mut self) -> Result { + if self.remaining() < 2 { + bail!("no bytes left, expected 2"); + } + Ok(self.get_u16()) + } + fn get_u32_f(&mut self) -> Result { + if self.remaining() < 4 { + bail!("only {} bytes left, expected 4", self.remaining()); + } + Ok(self.get_u32()) + } + fn get_u64_f(&mut self) -> Result { + if self.remaining() < 8 { + bail!("only {} bytes left, expected 8", self.remaining()); + } + Ok(self.get_u64()) + } +} + +impl ProposerAcceptorMessage { + /// Read cstring from Bytes. + fn get_cstr(buf: &mut Bytes) -> Result { + let pos = buf + .iter() + .position(|x| *x == 0) + .ok_or_else(|| anyhow::anyhow!("missing cstring terminator"))?; + let result = buf.split_to(pos); + buf.advance(1); // drop the null terminator + match std::str::from_utf8(&result) { + Ok(s) => Ok(s.to_string()), + Err(e) => bail!("invalid utf8 in cstring: {}", e), + } + } + + /// Read membership::Configuration from Bytes. + fn get_mconf(buf: &mut Bytes) -> Result { + let generation = Generation::new(buf.get_u32_f().with_context(|| "reading generation")?); + let members_len = buf.get_u32_f().with_context(|| "reading members_len")?; + // Main member set must have at least someone in valid configuration. + // Empty conf is allowed until we fully migrate. + if generation != INVALID_GENERATION && members_len == 0 { + bail!("empty members_len"); + } + let mut members = MemberSet::empty(); + for i in 0..members_len { + let id = buf + .get_u64_f() + .with_context(|| format!("reading member {} node_id", i))?; + let host = Self::get_cstr(buf).with_context(|| format!("reading member {} host", i))?; + let pg_port = buf + .get_u16_f() + .with_context(|| format!("reading member {} port", i))?; + let sk = SafekeeperId { + id: NodeId(id), + host, + pg_port, + }; + members.add(sk)?; + } + let new_members_len = buf.get_u32_f().with_context(|| "reading new_members_len")?; + // Non joint conf. + if new_members_len == 0 { + Ok(membership::Configuration { + generation, + members, + new_members: None, + }) + } else { + let mut new_members = MemberSet::empty(); + for i in 0..new_members_len { + let id = buf + .get_u64_f() + .with_context(|| format!("reading new member {} node_id", i))?; + let host = Self::get_cstr(buf) + .with_context(|| format!("reading new member {} host", i))?; + let pg_port = buf + .get_u16_f() + .with_context(|| format!("reading new member {} port", i))?; + let sk = SafekeeperId { + id: NodeId(id), + host, + pg_port, }; - Ok(ProposerAcceptorMessage::Elected(msg)) + new_members.add(sk)?; } - 'a' => { - // read header followed by wal data - let hdr = AppendRequestHeader::des_from(&mut stream)?; - let rec_size = hdr - .end_lsn - .checked_sub(hdr.begin_lsn) - .context("begin_lsn > end_lsn in AppendRequest")? - .0 as usize; - if rec_size > MAX_SEND_SIZE { - bail!( - "AppendRequest is longer than MAX_SEND_SIZE ({})", - MAX_SEND_SIZE - ); + Ok(membership::Configuration { + generation, + members, + new_members: Some(new_members), + }) + } + } + + /// Parse proposer message. + pub fn parse(mut msg_bytes: Bytes, proto_version: u32) -> Result { + if proto_version == SK_PROTO_VERSION_3 { + if msg_bytes.is_empty() { + bail!("ProposerAcceptorMessage is not complete: missing tag"); + } + let tag = msg_bytes.get_u8_f().with_context(|| { + "ProposerAcceptorMessage is not complete: missing tag".to_string() + })? as char; + match tag { + 'g' => { + let tenant_id_str = + Self::get_cstr(&mut msg_bytes).with_context(|| "reading tenant_id")?; + let tenant_id = TenantId::from_str(&tenant_id_str)?; + let timeline_id_str = + Self::get_cstr(&mut msg_bytes).with_context(|| "reading timeline_id")?; + let timeline_id = TimelineId::from_str(&timeline_id_str)?; + let mconf = Self::get_mconf(&mut msg_bytes)?; + let pg_version = msg_bytes + .get_u32_f() + .with_context(|| "reading pg_version")?; + let system_id = msg_bytes.get_u64_f().with_context(|| "reading system_id")?; + let wal_seg_size = msg_bytes + .get_u32_f() + .with_context(|| "reading wal_seg_size")?; + let g = ProposerGreeting { + tenant_id, + timeline_id, + mconf, + pg_version, + system_id, + wal_seg_size, + }; + Ok(ProposerAcceptorMessage::Greeting(g)) } + 'v' => { + let generation = Generation::new( + msg_bytes + .get_u32_f() + .with_context(|| "reading generation")?, + ); + let term = msg_bytes.get_u64_f().with_context(|| "reading term")?; + let v = VoteRequest { generation, term }; + Ok(ProposerAcceptorMessage::VoteRequest(v)) + } + 'e' => { + let generation = Generation::new( + msg_bytes + .get_u32_f() + .with_context(|| "reading generation")?, + ); + let term = msg_bytes.get_u64_f().with_context(|| "reading term")?; + let start_streaming_at: Lsn = msg_bytes + .get_u64_f() + .with_context(|| "reading start_streaming_at")? + .into(); + let term_history = TermHistory::from_bytes(&mut msg_bytes)?; + let msg = ProposerElected { + generation, + term, + start_streaming_at, + term_history, + }; + Ok(ProposerAcceptorMessage::Elected(msg)) + } + 'a' => { + let generation = Generation::new( + msg_bytes + .get_u32_f() + .with_context(|| "reading generation")?, + ); + let term = msg_bytes.get_u64_f().with_context(|| "reading term")?; + let begin_lsn: Lsn = msg_bytes + .get_u64_f() + .with_context(|| "reading begin_lsn")? + .into(); + let end_lsn: Lsn = msg_bytes + .get_u64_f() + .with_context(|| "reading end_lsn")? + .into(); + let commit_lsn: Lsn = msg_bytes + .get_u64_f() + .with_context(|| "reading commit_lsn")? + .into(); + let truncate_lsn: Lsn = msg_bytes + .get_u64_f() + .with_context(|| "reading truncate_lsn")? + .into(); + let hdr = AppendRequestHeader { + generation, + term, + begin_lsn, + end_lsn, + commit_lsn, + truncate_lsn, + }; + let rec_size = hdr + .end_lsn + .checked_sub(hdr.begin_lsn) + .context("begin_lsn > end_lsn in AppendRequest")? + .0 as usize; + if rec_size > MAX_SEND_SIZE { + bail!( + "AppendRequest is longer than MAX_SEND_SIZE ({})", + MAX_SEND_SIZE + ); + } + if msg_bytes.remaining() < rec_size { + bail!( + "reading WAL: only {} bytes left, wanted {}", + msg_bytes.remaining(), + rec_size + ); + } + let wal_data = msg_bytes.copy_to_bytes(rec_size); + let msg = AppendRequest { h: hdr, wal_data }; - let mut wal_data_vec: Vec = vec![0; rec_size]; - stream.read_exact(&mut wal_data_vec)?; - let wal_data = Bytes::from(wal_data_vec); - let msg = AppendRequest { h: hdr, wal_data }; - - Ok(ProposerAcceptorMessage::AppendRequest(msg)) + Ok(ProposerAcceptorMessage::AppendRequest(msg)) + } + _ => bail!("unknown proposer-acceptor message tag: {}", tag), } - _ => bail!("unknown proposer-acceptor message tag: {}", tag), + } else if proto_version == SK_PROTO_VERSION_2 { + // xxx using Reader is inefficient but easy to work with bincode + let mut stream = msg_bytes.reader(); + // u64 is here to avoid padding; it will be removed once we stop packing C structs into the wire as is + let tag = stream.read_u64::()? as u8 as char; + match tag { + 'g' => { + let msgv2 = ProposerGreetingV2::des_from(&mut stream)?; + let g = ProposerGreeting { + tenant_id: msgv2.tenant_id, + timeline_id: msgv2.timeline_id, + mconf: membership::Configuration { + generation: INVALID_GENERATION, + members: MemberSet::empty(), + new_members: None, + }, + pg_version: msgv2.pg_version, + system_id: msgv2.system_id, + wal_seg_size: msgv2.wal_seg_size, + }; + Ok(ProposerAcceptorMessage::Greeting(g)) + } + 'v' => { + let msg = VoteRequestV2::des_from(&mut stream)?; + let v = VoteRequest { + generation: INVALID_GENERATION, + term: msg.term, + }; + Ok(ProposerAcceptorMessage::VoteRequest(v)) + } + 'e' => { + let mut msg_bytes = stream.into_inner(); + if msg_bytes.remaining() < 16 { + bail!("ProposerElected message is not complete"); + } + let term = msg_bytes.get_u64_le(); + let start_streaming_at = msg_bytes.get_u64_le().into(); + let term_history = TermHistory::from_bytes_le(&mut msg_bytes)?; + if msg_bytes.remaining() < 8 { + bail!("ProposerElected message is not complete"); + } + let _timeline_start_lsn = msg_bytes.get_u64_le(); + let msg = ProposerElected { + generation: INVALID_GENERATION, + term, + start_streaming_at, + term_history, + }; + Ok(ProposerAcceptorMessage::Elected(msg)) + } + 'a' => { + // read header followed by wal data + let hdrv2 = AppendRequestHeaderV2::des_from(&mut stream)?; + let hdr = AppendRequestHeader { + generation: INVALID_GENERATION, + term: hdrv2.term, + begin_lsn: hdrv2.begin_lsn, + end_lsn: hdrv2.end_lsn, + commit_lsn: hdrv2.commit_lsn, + truncate_lsn: hdrv2.truncate_lsn, + }; + let rec_size = hdr + .end_lsn + .checked_sub(hdr.begin_lsn) + .context("begin_lsn > end_lsn in AppendRequest")? + .0 as usize; + if rec_size > MAX_SEND_SIZE { + bail!( + "AppendRequest is longer than MAX_SEND_SIZE ({})", + MAX_SEND_SIZE + ); + } + + let mut wal_data_vec: Vec = vec![0; rec_size]; + stream.read_exact(&mut wal_data_vec)?; + let wal_data = Bytes::from(wal_data_vec); + + let msg = AppendRequest { h: hdr, wal_data }; + + Ok(ProposerAcceptorMessage::AppendRequest(msg)) + } + _ => bail!("unknown proposer-acceptor message tag: {}", tag), + } + } else { + bail!("unsupported protocol version {}", proto_version); } } @@ -394,36 +703,21 @@ impl ProposerAcceptorMessage { // We explicitly list all fields, to draw attention here when new fields are added. let mut size = BASE_SIZE; size += match self { - Self::Greeting(ProposerGreeting { - protocol_version: _, - pg_version: _, - proposer_id: _, - system_id: _, - timeline_id: _, - tenant_id: _, - tli: _, - wal_seg_size: _, - }) => 0, + Self::Greeting(_) => 0, - Self::VoteRequest(VoteRequest { term: _ }) => 0, + Self::VoteRequest(_) => 0, - Self::Elected(ProposerElected { - term: _, - start_streaming_at: _, - term_history: _, - timeline_start_lsn: _, - }) => 0, + Self::Elected(_) => 0, Self::AppendRequest(AppendRequest { h: AppendRequestHeader { + generation: _, term: _, - term_start_lsn: _, begin_lsn: _, end_lsn: _, commit_lsn: _, truncate_lsn: _, - proposer_uuid: _, }, wal_data, }) => wal_data.len(), @@ -431,13 +725,12 @@ impl ProposerAcceptorMessage { Self::NoFlushAppendRequest(AppendRequest { h: AppendRequestHeader { + generation: _, term: _, - term_start_lsn: _, begin_lsn: _, end_lsn: _, commit_lsn: _, truncate_lsn: _, - proposer_uuid: _, }, wal_data, }) => wal_data.len(), @@ -458,45 +751,118 @@ pub enum AcceptorProposerMessage { } impl AcceptorProposerMessage { - /// Serialize acceptor -> proposer message. - pub fn serialize(&self, buf: &mut BytesMut) -> Result<()> { - match self { - AcceptorProposerMessage::Greeting(msg) => { - buf.put_u64_le('g' as u64); - buf.put_u64_le(msg.term); - buf.put_u64_le(msg.node_id.0); - } - AcceptorProposerMessage::VoteResponse(msg) => { - buf.put_u64_le('v' as u64); - buf.put_u64_le(msg.term); - buf.put_u64_le(msg.vote_given); - buf.put_u64_le(msg.flush_lsn.into()); - buf.put_u64_le(msg.truncate_lsn.into()); - buf.put_u32_le(msg.term_history.0.len() as u32); - for e in &msg.term_history.0 { - buf.put_u64_le(e.term); - buf.put_u64_le(e.lsn.into()); - } - buf.put_u64_le(msg.timeline_start_lsn.into()); - } - AcceptorProposerMessage::AppendResponse(msg) => { - buf.put_u64_le('a' as u64); - buf.put_u64_le(msg.term); - buf.put_u64_le(msg.flush_lsn.into()); - buf.put_u64_le(msg.commit_lsn.into()); - buf.put_i64_le(msg.hs_feedback.ts); - buf.put_u64_le(msg.hs_feedback.xmin); - buf.put_u64_le(msg.hs_feedback.catalog_xmin); + fn put_cstr(buf: &mut BytesMut, s: &str) { + buf.put_slice(s.as_bytes()); + buf.put_u8(0); // null terminator + } - // AsyncReadMessage in walproposer.c will not try to decode pageserver_feedback - // if it is not present. - if let Some(ref msg) = msg.pageserver_feedback { - msg.serialize(buf); - } - } + /// Serialize membership::Configuration into buf. + fn serialize_mconf(buf: &mut BytesMut, mconf: &membership::Configuration) { + buf.put_u32(mconf.generation.into_inner()); + buf.put_u32(mconf.members.m.len() as u32); + for sk in &mconf.members.m { + buf.put_u64(sk.id.0); + Self::put_cstr(buf, &sk.host); + buf.put_u16(sk.pg_port); } + if let Some(ref new_members) = mconf.new_members { + buf.put_u32(new_members.m.len() as u32); + for sk in &new_members.m { + buf.put_u64(sk.id.0); + Self::put_cstr(buf, &sk.host); + buf.put_u16(sk.pg_port); + } + } else { + buf.put_u32(0); + } + } - Ok(()) + /// Serialize acceptor -> proposer message. + pub fn serialize(&self, buf: &mut BytesMut, proto_version: u32) -> Result<()> { + if proto_version == SK_PROTO_VERSION_3 { + match self { + AcceptorProposerMessage::Greeting(msg) => { + buf.put_u8(b'g'); + buf.put_u64(msg.node_id.0); + Self::serialize_mconf(buf, &msg.mconf); + buf.put_u64(msg.term) + } + AcceptorProposerMessage::VoteResponse(msg) => { + buf.put_u8(b'v'); + buf.put_u32(msg.generation.into_inner()); + buf.put_u64(msg.term); + buf.put_u8(msg.vote_given as u8); + buf.put_u64(msg.flush_lsn.into()); + buf.put_u64(msg.truncate_lsn.into()); + buf.put_u32(msg.term_history.0.len() as u32); + for e in &msg.term_history.0 { + buf.put_u64(e.term); + buf.put_u64(e.lsn.into()); + } + } + AcceptorProposerMessage::AppendResponse(msg) => { + buf.put_u8(b'a'); + buf.put_u32(msg.generation.into_inner()); + buf.put_u64(msg.term); + buf.put_u64(msg.flush_lsn.into()); + buf.put_u64(msg.commit_lsn.into()); + buf.put_i64(msg.hs_feedback.ts); + buf.put_u64(msg.hs_feedback.xmin); + buf.put_u64(msg.hs_feedback.catalog_xmin); + + // AsyncReadMessage in walproposer.c will not try to decode pageserver_feedback + // if it is not present. + if let Some(ref msg) = msg.pageserver_feedback { + msg.serialize(buf); + } + } + } + Ok(()) + // TODO remove 3 after converting all msgs + } else if proto_version == SK_PROTO_VERSION_2 { + match self { + AcceptorProposerMessage::Greeting(msg) => { + buf.put_u64_le('g' as u64); + // v2 didn't have mconf and fields were reordered + buf.put_u64_le(msg.term); + buf.put_u64_le(msg.node_id.0); + } + AcceptorProposerMessage::VoteResponse(msg) => { + // v2 didn't have generation, had u64 vote_given and timeline_start_lsn + buf.put_u64_le('v' as u64); + buf.put_u64_le(msg.term); + buf.put_u64_le(msg.vote_given as u64); + buf.put_u64_le(msg.flush_lsn.into()); + buf.put_u64_le(msg.truncate_lsn.into()); + buf.put_u32_le(msg.term_history.0.len() as u32); + for e in &msg.term_history.0 { + buf.put_u64_le(e.term); + buf.put_u64_le(e.lsn.into()); + } + // removed timeline_start_lsn + buf.put_u64_le(0); + } + AcceptorProposerMessage::AppendResponse(msg) => { + // v2 didn't have generation + buf.put_u64_le('a' as u64); + buf.put_u64_le(msg.term); + buf.put_u64_le(msg.flush_lsn.into()); + buf.put_u64_le(msg.commit_lsn.into()); + buf.put_i64_le(msg.hs_feedback.ts); + buf.put_u64_le(msg.hs_feedback.xmin); + buf.put_u64_le(msg.hs_feedback.catalog_xmin); + + // AsyncReadMessage in walproposer.c will not try to decode pageserver_feedback + // if it is not present. + if let Some(ref msg) = msg.pageserver_feedback { + msg.serialize(buf); + } + } + } + Ok(()) + } else { + bail!("unsupported protocol version {}", proto_version); + } } } @@ -593,14 +959,6 @@ where &mut self, msg: &ProposerGreeting, ) -> Result> { - // Check protocol compatibility - if msg.protocol_version != SK_PROTOCOL_VERSION { - bail!( - "incompatible protocol version {}, expected {}", - msg.protocol_version, - SK_PROTOCOL_VERSION - ); - } /* Postgres major version mismatch is treated as fatal error * because safekeepers parse WAL headers and the format * may change between versions. @@ -655,15 +1013,19 @@ where self.state.finish_change(&state).await?; } - info!( - "processed greeting from walproposer {}, sending term {:?}", - msg.proposer_id.map(|b| format!("{:X}", b)).join(""), - self.state.acceptor_state.term - ); - Ok(Some(AcceptorProposerMessage::Greeting(AcceptorGreeting { - term: self.state.acceptor_state.term, + // Switch into conf given by proposer conf if it is higher. + self.state.membership_switch(msg.mconf.clone()).await?; + + let apg = AcceptorGreeting { node_id: self.node_id, - }))) + mconf: self.state.mconf.clone(), + term: self.state.acceptor_state.term, + }; + info!( + "processed greeting {:?} from walproposer, sending {:?}", + msg, apg + ); + Ok(Some(AcceptorProposerMessage::Greeting(apg))) } /// Give vote for the given term, if we haven't done that previously. @@ -671,25 +1033,27 @@ where &mut self, msg: &VoteRequest, ) -> Result> { + if self.state.mconf.generation != msg.generation { + bail!( + "refusing {:?} due to generation mismatch: sk generation {}", + msg, + self.state.mconf.generation + ); + } // Once voted, we won't accept data from older proposers; flush // everything we've already received so that new proposer starts - // streaming at end of our WAL, without overlap. Currently we truncate - // WAL at streaming point, so this avoids truncating already committed - // WAL. - // - // TODO: it would be smoother to not truncate committed piece at - // handle_elected instead. Currently not a big deal, as proposer is the - // only source of WAL; with peer2peer recovery it would be more - // important. + // streaming at end of our WAL, without overlap. WAL is truncated at + // streaming point and commit_lsn may be advanced from peers, so this + // also avoids possible spurious attempt to truncate committed WAL. self.wal_store.flush_wal().await?; // initialize with refusal let mut resp = VoteResponse { + generation: self.state.mconf.generation, term: self.state.acceptor_state.term, - vote_given: false as u64, + vote_given: false, flush_lsn: self.flush_lsn(), truncate_lsn: self.state.inmem.peer_horizon_lsn, term_history: self.get_term_history(), - timeline_start_lsn: self.state.timeline_start_lsn, }; if self.state.acceptor_state.term < msg.term { let mut state = self.state.start_change(); @@ -698,15 +1062,16 @@ where self.state.finish_change(&state).await?; resp.term = self.state.acceptor_state.term; - resp.vote_given = true as u64; + resp.vote_given = true; } - info!("processed VoteRequest for term {}: {:?}", msg.term, &resp); + info!("processed {:?}: sending {:?}", msg, &resp); Ok(Some(AcceptorProposerMessage::VoteResponse(resp))) } /// Form AppendResponse from current state. fn append_response(&self) -> AppendResponse { let ar = AppendResponse { + generation: self.state.mconf.generation, term: self.state.acceptor_state.term, flush_lsn: self.flush_lsn(), commit_lsn: self.state.commit_lsn, @@ -733,6 +1098,13 @@ where self.get_last_log_term(), self.flush_lsn() ); + if self.state.mconf.generation != msg.generation { + bail!( + "refusing {:?} due to generation mismatch: sk generation {}", + msg, + self.state.mconf.generation + ); + } if self.state.acceptor_state.term < msg.term { let mut state = self.state.start_change(); state.acceptor_state.term = msg.term; @@ -769,9 +1141,14 @@ where // and walproposer recalculates the streaming point. OTOH repeating // error indicates a serious bug. if last_common_point.lsn != msg.start_streaming_at { - bail!("refusing ProposerElected with unexpected truncation point: lcp={:?} start_streaming_at={}, term={}, sk_th={:?} flush_lsn={}, wp_th={:?}", - last_common_point, msg.start_streaming_at, - self.state.acceptor_state.term, sk_th, self.flush_lsn(), msg.term_history, + bail!( + "refusing ProposerElected with unexpected truncation point: lcp={:?} start_streaming_at={}, term={}, sk_th={:?} flush_lsn={}, wp_th={:?}", + last_common_point, + msg.start_streaming_at, + self.state.acceptor_state.term, + sk_th, + self.flush_lsn(), + msg.term_history, ); } @@ -779,8 +1156,12 @@ where assert!( msg.start_streaming_at >= self.state.inmem.commit_lsn, "attempt to truncate committed data: start_streaming_at={}, commit_lsn={}, term={}, sk_th={:?} flush_lsn={}, wp_th={:?}", - msg.start_streaming_at, self.state.inmem.commit_lsn, - self.state.acceptor_state.term, sk_th, self.flush_lsn(), msg.term_history, + msg.start_streaming_at, + self.state.inmem.commit_lsn, + self.state.acceptor_state.term, + sk_th, + self.flush_lsn(), + msg.term_history, ); // Before first WAL write initialize its segment. It makes first segment @@ -805,18 +1186,22 @@ where // Here we learn initial LSN for the first time, set fields // interested in that. - if state.timeline_start_lsn == Lsn(0) { - // Remember point where WAL begins globally. - state.timeline_start_lsn = msg.timeline_start_lsn; - info!( - "setting timeline_start_lsn to {:?}", - state.timeline_start_lsn - ); + if let Some(start_lsn) = msg.term_history.0.first() { + if state.timeline_start_lsn == Lsn(0) { + // Remember point where WAL begins globally. In the future it + // will be intialized immediately on timeline creation. + state.timeline_start_lsn = start_lsn.lsn; + info!( + "setting timeline_start_lsn to {:?}", + state.timeline_start_lsn + ); + } } + if state.peer_horizon_lsn == Lsn(0) { // Update peer_horizon_lsn as soon as we know where timeline starts. // It means that peer_horizon_lsn cannot be zero after we know timeline_start_lsn. - state.peer_horizon_lsn = msg.timeline_start_lsn; + state.peer_horizon_lsn = state.timeline_start_lsn; } if state.local_start_lsn == Lsn(0) { state.local_start_lsn = msg.start_streaming_at; @@ -890,13 +1275,29 @@ where msg: &AppendRequest, require_flush: bool, ) -> Result> { + // Refuse message on generation mismatch. On reconnect wp will get full + // configuration from greeting. + if self.state.mconf.generation != msg.h.generation { + bail!( + "refusing append request due to generation mismatch: request {}, sk {}", + msg.h.generation, + self.state.mconf.generation + ); + } + if self.state.acceptor_state.term < msg.h.term { bail!("got AppendRequest before ProposerElected"); } - // If our term is higher, immediately refuse the message. + // If our term is higher, immediately refuse the message. Send term only + // response; elected walproposer can never advance the term, so it will + // figure out the refusal from it -- which is important as term change + // should cause not just reconnection but whole walproposer re-election. if self.state.acceptor_state.term > msg.h.term { - let resp = AppendResponse::term_only(self.state.acceptor_state.term); + let resp = AppendResponse::term_only( + self.state.mconf.generation, + self.state.acceptor_state.term, + ); return Ok(Some(AcceptorProposerMessage::AppendResponse(resp))); } @@ -924,10 +1325,8 @@ where ); } - // Now we know that we are in the same term as the proposer, - // processing the message. - - self.state.inmem.proposer_uuid = msg.h.proposer_uuid; + // Now we know that we are in the same term as the proposer, process the + // message. // do the job if !msg.wal_data.is_empty() { @@ -1000,21 +1399,19 @@ where #[cfg(test)] mod tests { - use futures::future::BoxFuture; + use std::ops::Deref; + use std::str::FromStr; + use std::time::{Instant, UNIX_EPOCH}; - use postgres_ffi::{XLogSegNo, WAL_SEGMENT_SIZE}; - use safekeeper_api::{ - membership::{Configuration, MemberSet, SafekeeperGeneration, SafekeeperId}, - ServerInfo, + use futures::future::BoxFuture; + use postgres_ffi::{WAL_SEGMENT_SIZE, XLogSegNo}; + use safekeeper_api::ServerInfo; + use safekeeper_api::membership::{ + Configuration, MemberSet, SafekeeperGeneration, SafekeeperId, }; use super::*; use crate::state::{EvictionState, TimelinePersistentState}; - use std::{ - ops::Deref, - str::FromStr, - time::{Instant, UNIX_EPOCH}, - }; // fake storage for tests struct InMemoryState { @@ -1096,11 +1493,21 @@ mod tests { let wal_store = DummyWalStore { lsn: Lsn(0) }; let mut sk = SafeKeeper::new(TimelineState::new(storage), wal_store, NodeId(0)).unwrap(); + // Vote with generation mismatch should be rejected. + let gen_mismatch_vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { + generation: SafekeeperGeneration::new(42), + term: 1, + }); + assert!(sk.process_msg(&gen_mismatch_vote_request).await.is_err()); + // check voting for 1 is ok - let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { term: 1 }); + let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { + generation: Generation::new(0), + term: 1, + }); let mut vote_resp = sk.process_msg(&vote_request).await; match vote_resp.unwrap() { - Some(AcceptorProposerMessage::VoteResponse(resp)) => assert!(resp.vote_given != 0), + Some(AcceptorProposerMessage::VoteResponse(resp)) => assert!(resp.vote_given), r => panic!("unexpected response: {:?}", r), } @@ -1115,7 +1522,7 @@ mod tests { // and ensure voting second time for 1 is not ok vote_resp = sk.process_msg(&vote_request).await; match vote_resp.unwrap() { - Some(AcceptorProposerMessage::VoteResponse(resp)) => assert!(resp.vote_given == 0), + Some(AcceptorProposerMessage::VoteResponse(resp)) => assert!(!resp.vote_given), r => panic!("unexpected response: {:?}", r), } } @@ -1130,13 +1537,12 @@ mod tests { let mut sk = SafeKeeper::new(TimelineState::new(storage), wal_store, NodeId(0)).unwrap(); let mut ar_hdr = AppendRequestHeader { + generation: Generation::new(0), term: 2, - term_start_lsn: Lsn(3), begin_lsn: Lsn(1), end_lsn: Lsn(2), commit_lsn: Lsn(0), truncate_lsn: Lsn(0), - proposer_uuid: [0; 16], }; let mut append_request = AppendRequest { h: ar_hdr.clone(), @@ -1144,6 +1550,7 @@ mod tests { }; let pem = ProposerElected { + generation: Generation::new(0), term: 2, start_streaming_at: Lsn(1), term_history: TermHistory(vec![ @@ -1156,8 +1563,17 @@ mod tests { lsn: Lsn(3), }, ]), - timeline_start_lsn: Lsn(1), }; + + // check that elected msg with generation mismatch is rejected + let mut pem_gen_mismatch = pem.clone(); + pem_gen_mismatch.generation = SafekeeperGeneration::new(42); + assert!( + sk.process_msg(&ProposerAcceptorMessage::Elected(pem_gen_mismatch)) + .await + .is_err() + ); + sk.process_msg(&ProposerAcceptorMessage::Elected(pem)) .await .unwrap(); @@ -1191,32 +1607,46 @@ mod tests { let mut sk = SafeKeeper::new(TimelineState::new(storage), wal_store, NodeId(0)).unwrap(); let pem = ProposerElected { + generation: Generation::new(0), term: 1, start_streaming_at: Lsn(1), term_history: TermHistory(vec![TermLsn { term: 1, lsn: Lsn(1), }]), - timeline_start_lsn: Lsn(1), }; sk.process_msg(&ProposerAcceptorMessage::Elected(pem)) .await .unwrap(); let ar_hdr = AppendRequestHeader { + generation: Generation::new(0), term: 1, - term_start_lsn: Lsn(3), begin_lsn: Lsn(1), end_lsn: Lsn(2), commit_lsn: Lsn(0), truncate_lsn: Lsn(0), - proposer_uuid: [0; 16], }; let append_request = AppendRequest { h: ar_hdr.clone(), wal_data: Bytes::from_static(b"b"), }; + // check that append request with generation mismatch is rejected + let mut ar_hdr_gen_mismatch = ar_hdr.clone(); + ar_hdr_gen_mismatch.generation = SafekeeperGeneration::new(42); + let append_request_gen_mismatch = AppendRequest { + h: ar_hdr_gen_mismatch, + wal_data: Bytes::from_static(b"b"), + }; + assert!( + sk.process_msg(&ProposerAcceptorMessage::AppendRequest( + append_request_gen_mismatch + )) + .await + .is_err() + ); + // do write ending at 2, it should be ok sk.process_msg(&ProposerAcceptorMessage::AppendRequest(append_request)) .await diff --git a/safekeeper/src/send_interpreted_wal.rs b/safekeeper/src/send_interpreted_wal.rs index fb06339604..e196f91d3c 100644 --- a/safekeeper/src/send_interpreted_wal.rs +++ b/safekeeper/src/send_interpreted_wal.rs @@ -3,23 +3,22 @@ use std::fmt::Display; use std::sync::Arc; use std::time::Duration; -use anyhow::{anyhow, Context}; -use futures::future::Either; +use anyhow::{Context, anyhow}; use futures::StreamExt; +use futures::future::Either; use pageserver_api::shard::ShardIdentity; use postgres_backend::{CopyStreamHandlerEnd, PostgresBackend}; -use postgres_ffi::waldecoder::WalDecodeError; -use postgres_ffi::{get_current_timestamp, waldecoder::WalStreamDecoder}; +use postgres_ffi::get_current_timestamp; +use postgres_ffi::waldecoder::{WalDecodeError, WalStreamDecoder}; use pq_proto::{BeMessage, InterpretedWalRecordsBody, WalSndKeepAlive}; use tokio::io::{AsyncRead, AsyncWrite}; use tokio::sync::mpsc::error::SendError; use tokio::task::JoinHandle; use tokio::time::MissedTickBehavior; -use tracing::{error, info, info_span, Instrument}; +use tracing::{Instrument, error, info, info_span}; use utils::critical; use utils::lsn::Lsn; -use utils::postgres_client::Compression; -use utils::postgres_client::InterpretedFormat; +use utils::postgres_client::{Compression, InterpretedFormat}; use wal_decoder::models::{InterpretedWalRecord, InterpretedWalRecords}; use wal_decoder::wire_format::ToWireFormat; @@ -100,7 +99,12 @@ struct ShardSenderState { /// State of [`InterpretedWalReader`] visible outside of the task running it. #[derive(Debug)] pub(crate) enum InterpretedWalReaderState { - Running { current_position: Lsn }, + Running { + current_position: Lsn, + /// Tracks the start of the PG WAL LSN from which the current batch of + /// interpreted records originated. + current_batch_wal_start: Option, + }, Done, } @@ -122,14 +126,21 @@ pub enum InterpretedWalReaderError { } enum CurrentPositionUpdate { - Reset(Lsn), + Reset { from: Lsn, to: Lsn }, NotReset(Lsn), } impl CurrentPositionUpdate { fn current_position(&self) -> Lsn { match self { - CurrentPositionUpdate::Reset(lsn) => *lsn, + CurrentPositionUpdate::Reset { from: _, to } => *to, + CurrentPositionUpdate::NotReset(lsn) => *lsn, + } + } + + fn previous_position(&self) -> Lsn { + match self { + CurrentPositionUpdate::Reset { from, to: _ } => *from, CurrentPositionUpdate::NotReset(lsn) => *lsn, } } @@ -145,16 +156,33 @@ impl InterpretedWalReaderState { } } + #[cfg(test)] + fn current_batch_wal_start(&self) -> Option { + match self { + InterpretedWalReaderState::Running { + current_batch_wal_start, + .. + } => *current_batch_wal_start, + InterpretedWalReaderState::Done => None, + } + } + // Reset the current position of the WAL reader if the requested starting position // of the new shard is smaller than the current value. fn maybe_reset(&mut self, new_shard_start_pos: Lsn) -> CurrentPositionUpdate { match self { InterpretedWalReaderState::Running { - current_position, .. + current_position, + current_batch_wal_start, } => { if new_shard_start_pos < *current_position { + let from = *current_position; *current_position = new_shard_start_pos; - CurrentPositionUpdate::Reset(*current_position) + *current_batch_wal_start = None; + CurrentPositionUpdate::Reset { + from, + to: *current_position, + } } else { CurrentPositionUpdate::NotReset(*current_position) } @@ -164,6 +192,47 @@ impl InterpretedWalReaderState { } } } + + fn update_current_batch_wal_start(&mut self, lsn: Lsn) { + match self { + InterpretedWalReaderState::Running { + current_batch_wal_start, + .. + } => { + if current_batch_wal_start.is_none() { + *current_batch_wal_start = Some(lsn); + } + } + InterpretedWalReaderState::Done => { + panic!("update_current_batch_wal_start called on finished reader") + } + } + } + + fn take_current_batch_wal_start(&mut self) -> Lsn { + match self { + InterpretedWalReaderState::Running { + current_batch_wal_start, + .. + } => current_batch_wal_start.take().unwrap(), + InterpretedWalReaderState::Done => { + panic!("take_current_batch_wal_start called on finished reader") + } + } + } + + fn update_current_position(&mut self, lsn: Lsn) { + match self { + InterpretedWalReaderState::Running { + current_position, .. + } => { + *current_position = lsn; + } + InterpretedWalReaderState::Done => { + panic!("update_current_position called on finished reader") + } + } + } } pub(crate) struct AttachShardNotification { @@ -184,6 +253,7 @@ impl InterpretedWalReader { ) -> InterpretedWalReaderHandle { let state = Arc::new(std::sync::RwLock::new(InterpretedWalReaderState::Running { current_position: start_pos, + current_batch_wal_start: None, })); let (shard_notification_tx, shard_notification_rx) = tokio::sync::mpsc::unbounded_channel(); @@ -237,9 +307,13 @@ impl InterpretedWalReader { tx: tokio::sync::mpsc::Sender, shard: ShardIdentity, pg_version: u32, + shard_notification_rx: Option< + tokio::sync::mpsc::UnboundedReceiver, + >, ) -> InterpretedWalReader { let state = Arc::new(std::sync::RwLock::new(InterpretedWalReaderState::Running { current_position: start_pos, + current_batch_wal_start: None, })); InterpretedWalReader { @@ -252,7 +326,7 @@ impl InterpretedWalReader { next_record_lsn: start_pos, }], )]), - shard_notification_rx: None, + shard_notification_rx, state: state.clone(), pg_version, } @@ -295,10 +369,6 @@ impl InterpretedWalReader { let mut wal_decoder = WalStreamDecoder::new(start_pos, self.pg_version); - // Tracks the start of the PG WAL LSN from which the current batch of - // interpreted records originated. - let mut current_batch_wal_start_lsn: Option = None; - loop { tokio::select! { // Main branch for reading WAL and forwarding it @@ -319,11 +389,7 @@ impl InterpretedWalReader { } }; - // We will already have a value if the previous chunks of WAL - // did not decode into anything useful. - if current_batch_wal_start_lsn.is_none() { - current_batch_wal_start_lsn = Some(wal_start_lsn); - } + self.state.write().unwrap().update_current_batch_wal_start(wal_start_lsn); wal_decoder.feed_bytes(&wal); @@ -380,16 +446,11 @@ impl InterpretedWalReader { // Update the current position such that new receivers can decide // whether to attach to us or spawn a new WAL reader. - match &mut *self.state.write().unwrap() { - InterpretedWalReaderState::Running { current_position, .. } => { - *current_position = max_next_record_lsn; - }, - InterpretedWalReaderState::Done => { - unreachable!() - } - } - - let batch_wal_start_lsn = current_batch_wal_start_lsn.take().unwrap(); + let batch_wal_start_lsn = { + let mut guard = self.state.write().unwrap(); + guard.update_current_position(max_next_record_lsn); + guard.take_current_batch_wal_start() + }; // Send interpreted records downstream. Anything that has already been seen // by a shard is filtered out. @@ -480,7 +541,7 @@ impl InterpretedWalReader { // anything outside the select statement. let position_reset = self.state.write().unwrap().maybe_reset(start_pos); match position_reset { - CurrentPositionUpdate::Reset(to) => { + CurrentPositionUpdate::Reset { from: _, to } => { self.wal_stream.reset(to).await; wal_decoder = WalStreamDecoder::new(to, self.pg_version); }, @@ -488,14 +549,22 @@ impl InterpretedWalReader { }; tracing::info!( - "Added shard sender {} with start_pos={} current_pos={}", - ShardSenderId::new(shard_id, new_sender_id), start_pos, position_reset.current_position() + "Added shard sender {} with start_pos={} previous_pos={} current_pos={}", + ShardSenderId::new(shard_id, new_sender_id), + start_pos, + position_reset.previous_position(), + position_reset.current_position(), ); } } } } } + + #[cfg(test)] + fn state(&self) -> Arc> { + self.state.clone() + } } impl InterpretedWalReaderHandle { @@ -621,22 +690,20 @@ impl InterpretedWalSender<'_, IO> { } #[cfg(test)] mod tests { - use std::{collections::HashMap, str::FromStr, time::Duration}; + use std::collections::HashMap; + use std::str::FromStr; + use std::time::Duration; use pageserver_api::shard::{ShardIdentity, ShardStripeSize}; use postgres_ffi::MAX_SEND_SIZE; use tokio::sync::mpsc::error::TryRecvError; - use utils::{ - id::{NodeId, TenantTimelineId}, - lsn::Lsn, - shard::{ShardCount, ShardNumber}, - }; + use utils::id::{NodeId, TenantTimelineId}; + use utils::lsn::Lsn; + use utils::shard::{ShardCount, ShardNumber}; - use crate::{ - send_interpreted_wal::{Batch, InterpretedWalReader}, - test_utils::Env, - wal_reader_stream::StreamingWalReader, - }; + use crate::send_interpreted_wal::{AttachShardNotification, Batch, InterpretedWalReader}; + use crate::test_utils::Env; + use crate::wal_reader_stream::StreamingWalReader; #[tokio::test] async fn test_interpreted_wal_reader_fanout() { @@ -738,9 +805,11 @@ mod tests { // This test uses logical messages. Those only go to shard 0. Check that the // filtering worked and shard 1 did not get any. - assert!(shard_1_interpreted_records - .iter() - .all(|recs| recs.records.is_empty())); + assert!( + shard_1_interpreted_records + .iter() + .all(|recs| recs.records.is_empty()) + ); // Shard 0 should not receive anything more since the reader is // going through wal that it has already processed. @@ -913,4 +982,128 @@ mod tests { assert_eq!(sender.received_next_record_lsns, expected); } } + + #[tokio::test] + async fn test_batch_start_tracking_on_reset() { + // When the WAL stream is reset to an older LSN, + // the current batch start LSN should be invalidated. + // This test constructs such a scenario: + // 1. Shard 0 is reading somewhere ahead + // 2. Reader reads some WAL, but does not decode a full record (partial read) + // 3. Shard 1 attaches to the reader and resets it to an older LSN + // 4. Shard 1 should get the correct batch WAL start LSN + let _ = env_logger::builder().is_test(true).try_init(); + + const SIZE: usize = 64 * 1024; + const MSG_COUNT: usize = 10; + const PG_VERSION: u32 = 17; + const SHARD_COUNT: u8 = 2; + const WAL_READER_BATCH_SIZE: usize = 8192; + + let start_lsn = Lsn::from_str("0/149FD18").unwrap(); + let env = Env::new(true).unwrap(); + let mut next_record_lsns = Vec::default(); + let tli = env + .make_timeline(NodeId(1), TenantTimelineId::generate(), start_lsn) + .await + .unwrap(); + + let resident_tli = tli.wal_residence_guard().await.unwrap(); + let end_watch = + Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT, Some(&mut next_record_lsns)) + .await + .unwrap(); + + assert!(next_record_lsns.len() > 3); + let shard_0_start_lsn = next_record_lsns[3]; + + let end_pos = end_watch.get(); + + let streaming_wal_reader = StreamingWalReader::new( + resident_tli, + None, + shard_0_start_lsn, + end_pos, + end_watch, + WAL_READER_BATCH_SIZE, + ); + + let shard_0 = ShardIdentity::new( + ShardNumber(0), + ShardCount(SHARD_COUNT), + ShardStripeSize::default(), + ) + .unwrap(); + + let shard_1 = ShardIdentity::new( + ShardNumber(1), + ShardCount(SHARD_COUNT), + ShardStripeSize::default(), + ) + .unwrap(); + + let mut shards = HashMap::new(); + + for shard_number in 0..SHARD_COUNT { + let shard_id = ShardIdentity::new( + ShardNumber(shard_number), + ShardCount(SHARD_COUNT), + ShardStripeSize::default(), + ) + .unwrap(); + let (tx, rx) = tokio::sync::mpsc::channel::(MSG_COUNT * 2); + shards.insert(shard_id, (Some(tx), Some(rx))); + } + + let shard_0_tx = shards.get_mut(&shard_0).unwrap().0.take().unwrap(); + + let (shard_notification_tx, shard_notification_rx) = tokio::sync::mpsc::unbounded_channel(); + + let reader = InterpretedWalReader::new( + streaming_wal_reader, + shard_0_start_lsn, + shard_0_tx, + shard_0, + PG_VERSION, + Some(shard_notification_rx), + ); + + let reader_state = reader.state(); + let mut reader_fut = std::pin::pin!(reader.run(shard_0_start_lsn, &None)); + loop { + let poll = futures::poll!(reader_fut.as_mut()); + assert!(poll.is_pending()); + + let guard = reader_state.read().unwrap(); + if guard.current_batch_wal_start().is_some() { + break; + } + } + + shard_notification_tx + .send(AttachShardNotification { + shard_id: shard_1, + sender: shards.get_mut(&shard_1).unwrap().0.take().unwrap(), + start_pos: start_lsn, + }) + .unwrap(); + + let mut shard_1_rx = shards.get_mut(&shard_1).unwrap().1.take().unwrap(); + loop { + let poll = futures::poll!(reader_fut.as_mut()); + assert!(poll.is_pending()); + + let try_recv_res = shard_1_rx.try_recv(); + match try_recv_res { + Ok(batch) => { + assert_eq!(batch.records.raw_wal_start_lsn.unwrap(), start_lsn); + break; + } + Err(tokio::sync::mpsc::error::TryRecvError::Empty) => {} + Err(tokio::sync::mpsc::error::TryRecvError::Disconnected) => { + unreachable!(); + } + } + } + } } diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 4a4a74a0fd..33e3d0485c 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -1,6 +1,34 @@ //! This module implements the streaming side of replication protocol, starting //! with the "START_REPLICATION" message, and registry of walsenders. +use std::cmp::{max, min}; +use std::net::SocketAddr; +use std::sync::Arc; +use std::time::Duration; + +use anyhow::{Context as AnyhowContext, bail}; +use bytes::Bytes; +use futures::FutureExt; +use itertools::Itertools; +use parking_lot::Mutex; +use postgres_backend::{CopyStreamHandlerEnd, PostgresBackend, PostgresBackendReader, QueryError}; +use postgres_ffi::{MAX_SEND_SIZE, TimestampTz, get_current_timestamp}; +use pq_proto::{BeMessage, WalSndKeepAlive, XLogDataBody}; +use safekeeper_api::Term; +use safekeeper_api::models::{ + HotStandbyFeedback, INVALID_FULL_TRANSACTION_ID, ReplicationFeedback, StandbyFeedback, + StandbyReply, +}; +use tokio::io::{AsyncRead, AsyncWrite}; +use tokio::sync::watch::Receiver; +use tokio::time::timeout; +use tracing::*; +use utils::bin_ser::BeSer; +use utils::failpoint_support; +use utils::lsn::Lsn; +use utils::pageserver_feedback::PageserverFeedback; +use utils::postgres_client::PostgresClientProtocol; + use crate::handler::SafekeeperPostgresHandler; use crate::metrics::{RECEIVED_PS_FEEDBACKS, WAL_READERS}; use crate::receive_wal::WalReceivers; @@ -11,34 +39,6 @@ use crate::send_interpreted_wal::{ use crate::timeline::WalResidentTimeline; use crate::wal_reader_stream::StreamingWalReader; use crate::wal_storage::WalReader; -use anyhow::{bail, Context as AnyhowContext}; -use bytes::Bytes; -use futures::FutureExt; -use parking_lot::Mutex; -use postgres_backend::PostgresBackend; -use postgres_backend::{CopyStreamHandlerEnd, PostgresBackendReader, QueryError}; -use postgres_ffi::get_current_timestamp; -use postgres_ffi::{TimestampTz, MAX_SEND_SIZE}; -use pq_proto::{BeMessage, WalSndKeepAlive, XLogDataBody}; -use safekeeper_api::models::{ - HotStandbyFeedback, ReplicationFeedback, StandbyFeedback, StandbyReply, - INVALID_FULL_TRANSACTION_ID, -}; -use safekeeper_api::Term; -use tokio::io::{AsyncRead, AsyncWrite}; -use utils::failpoint_support; -use utils::pageserver_feedback::PageserverFeedback; -use utils::postgres_client::PostgresClientProtocol; - -use itertools::Itertools; -use std::cmp::{max, min}; -use std::net::SocketAddr; -use std::sync::Arc; -use std::time::Duration; -use tokio::sync::watch::Receiver; -use tokio::time::timeout; -use tracing::*; -use utils::{bin_ser::BeSer, lsn::Lsn}; // See: https://www.postgresql.org/docs/13/protocol-replication.html const HOT_STANDBY_FEEDBACK_TAG_BYTE: u8 = b'h'; @@ -624,8 +624,9 @@ impl SafekeeperPostgresHandler { MAX_SEND_SIZE, ); - let reader = - InterpretedWalReader::new(wal_reader, start_pos, tx, shard, pg_version); + let reader = InterpretedWalReader::new( + wal_reader, start_pos, tx, shard, pg_version, None, + ); let sender = InterpretedWalSender { format, @@ -905,9 +906,9 @@ impl WalSender<'_, IO> { // pageserver to identify WalReceiverError::SuccessfulCompletion, // do not change this string without updating pageserver. return Err(CopyStreamHandlerEnd::ServerInitiated(format!( - "ending streaming to {:?} at {}, receiver is caughtup and there is no computes", - self.appname, self.start_pos, - ))); + "ending streaming to {:?} at {}, receiver is caughtup and there is no computes", + self.appname, self.start_pos, + ))); } } } diff --git a/safekeeper/src/state.rs b/safekeeper/src/state.rs index 4d566b12a0..7533005c35 100644 --- a/safekeeper/src/state.rs +++ b/safekeeper/src/state.rs @@ -1,28 +1,24 @@ //! Defines per timeline data stored persistently (SafeKeeperPersistentState) //! and its wrapper with in memory layer (SafekeeperState). -use std::{cmp::max, ops::Deref, time::SystemTime}; +use std::cmp::max; +use std::ops::Deref; +use std::time::SystemTime; -use anyhow::{bail, Result}; +use anyhow::{Result, bail}; use postgres_ffi::WAL_SEGMENT_SIZE; -use safekeeper_api::{ - membership::Configuration, - models::{TimelineMembershipSwitchResponse, TimelineTermBumpResponse}, - ServerInfo, Term, INITIAL_TERM, -}; +use safekeeper_api::membership::Configuration; +use safekeeper_api::models::{TimelineMembershipSwitchResponse, TimelineTermBumpResponse}; +use safekeeper_api::{INITIAL_TERM, ServerInfo, Term}; use serde::{Deserialize, Serialize}; use tracing::info; -use utils::{ - id::{TenantId, TenantTimelineId, TimelineId}, - lsn::Lsn, -}; +use utils::id::{TenantId, TenantTimelineId, TimelineId}; +use utils::lsn::Lsn; -use crate::{ - control_file, - safekeeper::{AcceptorState, PgUuid, TermHistory, TermLsn, UNKNOWN_SERVER_VERSION}, - timeline::TimelineError, - wal_backup_partial::{self}, -}; +use crate::control_file; +use crate::safekeeper::{AcceptorState, PgUuid, TermHistory, TermLsn, UNKNOWN_SERVER_VERSION}; +use crate::timeline::TimelineError; +use crate::wal_backup_partial::{self}; /// Persistent information stored on safekeeper node about timeline. /// On disk data is prefixed by magic and format version and followed by checksum. @@ -272,7 +268,7 @@ where // Is switch allowed? if to.generation <= self.mconf.generation { info!( - "ignoring request to switch membership conf to lower {}, current conf {}", + "ignoring request to switch membership conf to {}, current conf {}", to, self.mconf ); } else { diff --git a/safekeeper/src/test_utils.rs b/safekeeper/src/test_utils.rs index 79ceddd366..e6f74185c1 100644 --- a/safekeeper/src/test_utils.rs +++ b/safekeeper/src/test_utils.rs @@ -1,5 +1,12 @@ use std::sync::Arc; +use camino_tempfile::Utf8TempDir; +use postgres_ffi::v17::wal_generator::{LogicalMessageGenerator, WalGenerator}; +use safekeeper_api::membership::SafekeeperGeneration as Generation; +use tokio::fs::create_dir_all; +use utils::id::{NodeId, TenantTimelineId}; +use utils::lsn::Lsn; + use crate::rate_limit::RateLimiter; use crate::receive_wal::WalAcceptor; use crate::safekeeper::{ @@ -8,15 +15,10 @@ use crate::safekeeper::{ }; use crate::send_wal::EndWatch; use crate::state::{TimelinePersistentState, TimelineState}; -use crate::timeline::{get_timeline_dir, SharedState, StateSK, Timeline}; +use crate::timeline::{SharedState, StateSK, Timeline, get_timeline_dir}; use crate::timelines_set::TimelinesSet; use crate::wal_backup::remote_timeline_path; -use crate::{control_file, receive_wal, wal_storage, SafeKeeperConf}; -use camino_tempfile::Utf8TempDir; -use postgres_ffi::v17::wal_generator::{LogicalMessageGenerator, WalGenerator}; -use tokio::fs::create_dir_all; -use utils::id::{NodeId, TenantTimelineId}; -use utils::lsn::Lsn; +use crate::{SafeKeeperConf, control_file, receive_wal, wal_storage}; /// A Safekeeper testing or benchmarking environment. Uses a tempdir for storage, removed on drop. pub struct Env { @@ -73,10 +75,10 @@ impl Env { // Emulate an initial election. safekeeper .process_msg(&ProposerAcceptorMessage::Elected(ProposerElected { + generation: Generation::new(0), term: 1, start_streaming_at: start_lsn, term_history: TermHistory(vec![(1, start_lsn).into()]), - timeline_start_lsn: start_lsn, })) .await?; @@ -146,13 +148,12 @@ impl Env { let req = AppendRequest { h: AppendRequestHeader { + generation: Generation::new(0), term: 1, - term_start_lsn: start_lsn, begin_lsn: lsn, end_lsn: lsn + record.len() as u64, commit_lsn: lsn, truncate_lsn: Lsn(0), - proposer_uuid: [0; 16], }, wal_data: record, }; diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 4341f13824..930f66a207 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -1,37 +1,32 @@ //! This module implements Timeline lifecycle management and has all necessary code //! to glue together SafeKeeper and all other background services. -use anyhow::{anyhow, bail, Result}; +use std::cmp::max; +use std::ops::{Deref, DerefMut}; +use std::sync::Arc; +use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; +use std::time::Duration; + +use anyhow::{Result, anyhow, bail}; use camino::{Utf8Path, Utf8PathBuf}; +use http_utils::error::ApiError; use remote_storage::RemotePath; +use safekeeper_api::Term; use safekeeper_api::membership::Configuration; use safekeeper_api::models::{ PeerInfo, TimelineMembershipSwitchResponse, TimelineTermBumpResponse, }; -use safekeeper_api::Term; +use storage_broker::proto::{SafekeeperTimelineInfo, TenantTimelineId as ProtoTenantTimelineId}; use tokio::fs::{self}; +use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard, watch}; +use tokio::time::Instant; use tokio_util::sync::CancellationToken; -use utils::id::TenantId; +use tracing::*; +use utils::id::{NodeId, TenantId, TenantTimelineId}; +use utils::lsn::Lsn; use utils::sync::gate::Gate; -use http_utils::error::ApiError; -use std::cmp::max; -use std::ops::{Deref, DerefMut}; -use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; -use std::sync::Arc; -use std::time::Duration; -use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard}; -use tokio::{sync::watch, time::Instant}; -use tracing::*; -use utils::{ - id::{NodeId, TenantTimelineId}, - lsn::Lsn, -}; - -use storage_broker::proto::SafekeeperTimelineInfo; -use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; - -use crate::control_file; +use crate::metrics::{FullTimelineInfo, MISC_OPERATION_SECONDS, WalStorageMetrics}; use crate::rate_limit::RateLimiter; use crate::receive_wal::WalReceivers; use crate::safekeeper::{AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, TermLsn}; @@ -42,11 +37,8 @@ use crate::timeline_manager::{AtomicStatus, ManagerCtl}; use crate::timelines_set::TimelinesSet; use crate::wal_backup::{self, remote_timeline_path}; use crate::wal_backup_partial::PartialRemoteSegment; - -use crate::metrics::{FullTimelineInfo, WalStorageMetrics, MISC_OPERATION_SECONDS}; use crate::wal_storage::{Storage as wal_storage_iface, WalReader}; -use crate::SafeKeeperConf; -use crate::{debug_dump, timeline_manager, wal_storage}; +use crate::{SafeKeeperConf, control_file, debug_dump, timeline_manager, wal_storage}; fn peer_info_from_sk_info(sk_info: &SafekeeperTimelineInfo, ts: Instant) -> PeerInfo { PeerInfo { @@ -168,7 +160,7 @@ impl StateSK { pub fn state(&self) -> &TimelineState { match self { StateSK::Loaded(sk) => &sk.state, - StateSK::Offloaded(ref s) => s, + StateSK::Offloaded(s) => s, StateSK::Empty => unreachable!(), } } @@ -176,7 +168,7 @@ impl StateSK { pub fn state_mut(&mut self) -> &mut TimelineState { match self { StateSK::Loaded(sk) => &mut sk.state, - StateSK::Offloaded(ref mut s) => s, + StateSK::Offloaded(s) => s, StateSK::Empty => unreachable!(), } } @@ -566,11 +558,18 @@ impl Timeline { }); } - /// Background timeline activities (which hold Timeline::gate) will no - /// longer run once this function completes. - pub async fn shutdown(&self) { + /// Cancel the timeline, requesting background activity to stop. Closing + /// the `self.gate` waits for that. + pub async fn cancel(&self) { info!("timeline {} shutting down", self.ttid); self.cancel.cancel(); + } + + /// Background timeline activities (which hold Timeline::gate) will no + /// longer run once this function completes. `Self::cancel` must have been + /// already called. + pub async fn close(&self) { + assert!(self.cancel.is_cancelled()); // Wait for any concurrent tasks to stop using this timeline, to avoid e.g. attempts // to read deleted files. @@ -582,13 +581,13 @@ impl Timeline { /// Also deletes WAL in s3. Might fail if e.g. s3 is unavailable, but /// deletion API endpoint is retriable. /// - /// Timeline must be in shut-down state (i.e. call [`Self::shutdown`] first) + /// Timeline must be in shut-down state (i.e. call [`Self::close`] first) pub async fn delete( &self, shared_state: &mut WriteGuardSharedState<'_>, only_local: bool, ) -> Result { - // Assert that [`Self::shutdown`] was already called + // Assert that [`Self::close`] was already called assert!(self.cancel.is_cancelled()); assert!(self.gate.close_complete()); @@ -1114,7 +1113,7 @@ impl ManagerTimeline { } /// Deletes directory and it's contents. Returns false if directory does not exist. -async fn delete_dir(path: &Utf8PathBuf) -> Result { +pub async fn delete_dir(path: &Utf8PathBuf) -> Result { match fs::remove_dir_all(path).await { Ok(_) => Ok(true), Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(false), diff --git a/safekeeper/src/timeline_eviction.rs b/safekeeper/src/timeline_eviction.rs index 303421c837..06ccb32d03 100644 --- a/safekeeper/src/timeline_eviction.rs +++ b/safekeeper/src/timeline_eviction.rs @@ -7,23 +7,19 @@ use anyhow::Context; use camino::Utf8PathBuf; use remote_storage::RemotePath; -use tokio::{ - fs::File, - io::{AsyncRead, AsyncWriteExt}, -}; +use tokio::fs::File; +use tokio::io::{AsyncRead, AsyncWriteExt}; use tracing::{debug, info, instrument, warn}; use utils::crashsafe::durable_rename; -use crate::{ - metrics::{ - EvictionEvent, EVICTION_EVENTS_COMPLETED, EVICTION_EVENTS_STARTED, NUM_EVICTED_TIMELINES, - }, - rate_limit::rand_duration, - timeline_manager::{Manager, StateSnapshot}, - wal_backup, - wal_backup_partial::{self, PartialRemoteSegment}, - wal_storage::wal_file_paths, +use crate::metrics::{ + EVICTION_EVENTS_COMPLETED, EVICTION_EVENTS_STARTED, EvictionEvent, NUM_EVICTED_TIMELINES, }; +use crate::rate_limit::rand_duration; +use crate::timeline_manager::{Manager, StateSnapshot}; +use crate::wal_backup; +use crate::wal_backup_partial::{self, PartialRemoteSegment}; +use crate::wal_storage::wal_file_paths; impl Manager { /// Returns true if the timeline is ready for eviction. diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs index a33994dcab..71e99a4de7 100644 --- a/safekeeper/src/timeline_manager.rs +++ b/safekeeper/src/timeline_manager.rs @@ -7,41 +7,36 @@ //! Be aware that you need to be extra careful with manager code, because it is not respawned on panic. //! Also, if it will stuck in some branch, it will prevent any further progress in the timeline. -use std::{ - sync::{atomic::AtomicUsize, Arc}, - time::Duration, -}; +use std::sync::Arc; +use std::sync::atomic::AtomicUsize; +use std::time::Duration; use futures::channel::oneshot; use postgres_ffi::XLogSegNo; -use safekeeper_api::{models::PeerInfo, Term}; +use safekeeper_api::Term; +use safekeeper_api::models::PeerInfo; use serde::{Deserialize, Serialize}; -use tokio::{ - task::{JoinError, JoinHandle}, - time::Instant, -}; +use tokio::task::{JoinError, JoinHandle}; +use tokio::time::Instant; use tokio_util::sync::CancellationToken; -use tracing::{debug, info, info_span, instrument, warn, Instrument}; +use tracing::{Instrument, debug, info, info_span, instrument, warn}; use utils::lsn::Lsn; -use crate::{ - control_file::{FileStorage, Storage}, - metrics::{ - MANAGER_ACTIVE_CHANGES, MANAGER_ITERATIONS_TOTAL, MISC_OPERATION_SECONDS, - NUM_EVICTED_TIMELINES, - }, - rate_limit::{rand_duration, RateLimiter}, - recovery::recovery_main, - remove_wal::calc_horizon_lsn, - send_wal::WalSenders, - state::TimelineState, - timeline::{ManagerTimeline, ReadGuardSharedState, StateSK, WalResidentTimeline}, - timeline_guard::{AccessService, GuardId, ResidenceGuard}, - timelines_set::{TimelineSetGuard, TimelinesSet}, - wal_backup::{self, WalBackupTaskHandle}, - wal_backup_partial::{self, PartialBackup, PartialRemoteSegment}, - SafeKeeperConf, +use crate::SafeKeeperConf; +use crate::control_file::{FileStorage, Storage}; +use crate::metrics::{ + MANAGER_ACTIVE_CHANGES, MANAGER_ITERATIONS_TOTAL, MISC_OPERATION_SECONDS, NUM_EVICTED_TIMELINES, }; +use crate::rate_limit::{RateLimiter, rand_duration}; +use crate::recovery::recovery_main; +use crate::remove_wal::calc_horizon_lsn; +use crate::send_wal::WalSenders; +use crate::state::TimelineState; +use crate::timeline::{ManagerTimeline, ReadGuardSharedState, StateSK, WalResidentTimeline}; +use crate::timeline_guard::{AccessService, GuardId, ResidenceGuard}; +use crate::timelines_set::{TimelineSetGuard, TimelinesSet}; +use crate::wal_backup::{self, WalBackupTaskHandle}; +use crate::wal_backup_partial::{self, PartialBackup, PartialRemoteSegment}; pub(crate) struct StateSnapshot { // inmem values diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs index 1ff6a72bce..858dfce807 100644 --- a/safekeeper/src/timelines_global_map.rs +++ b/safekeeper/src/timelines_global_map.rs @@ -2,31 +2,33 @@ //! All timelines should always be present in this map, this is done by loading them //! all from the disk on startup and keeping them in memory. -use crate::defaults::DEFAULT_EVICTION_CONCURRENCY; -use crate::rate_limit::RateLimiter; -use crate::state::TimelinePersistentState; -use crate::timeline::{get_tenant_dir, get_timeline_dir, Timeline, TimelineError}; -use crate::timelines_set::TimelinesSet; -use crate::wal_storage::Storage; -use crate::{control_file, wal_storage, SafeKeeperConf}; -use anyhow::{bail, Context, Result}; +use std::collections::HashMap; +use std::str::FromStr; +use std::sync::{Arc, Mutex}; +use std::time::{Duration, Instant}; + +use anyhow::{Context, Result, bail}; use camino::Utf8PathBuf; use camino_tempfile::Utf8TempDir; use safekeeper_api::membership::Configuration; use safekeeper_api::models::SafekeeperUtilization; -use safekeeper_api::ServerInfo; +use safekeeper_api::{ServerInfo, membership}; use serde::Serialize; -use std::collections::HashMap; -use std::str::FromStr; -use std::sync::atomic::Ordering; -use std::sync::{Arc, Mutex}; -use std::time::{Duration, Instant}; use tokio::fs; use tracing::*; use utils::crashsafe::{durable_rename, fsync_async_opt}; use utils::id::{TenantId, TenantTimelineId, TimelineId}; use utils::lsn::Lsn; +use crate::defaults::DEFAULT_EVICTION_CONCURRENCY; +use crate::http::routes::DeleteOrExcludeError; +use crate::rate_limit::RateLimiter; +use crate::state::TimelinePersistentState; +use crate::timeline::{Timeline, TimelineError, delete_dir, get_tenant_dir, get_timeline_dir}; +use crate::timelines_set::TimelinesSet; +use crate::wal_storage::Storage; +use crate::{SafeKeeperConf, control_file, wal_storage}; + // Timeline entry in the global map: either a ready timeline, or mark that it is // being created. #[derive(Clone)] @@ -446,23 +448,20 @@ impl GlobalTimelines { .collect() } - /// Cancels timeline, then deletes the corresponding data directory. - /// If only_local, doesn't remove WAL segments in remote storage. - pub(crate) async fn delete( + /// Delete timeline, only locally on this node or globally (also cleaning + /// remote storage WAL), depending on `action` value. + pub(crate) async fn delete_or_exclude( &self, ttid: &TenantTimelineId, - only_local: bool, - ) -> Result { + action: DeleteOrExclude, + ) -> Result { let tli_res = { let state = self.state.lock().unwrap(); if state.tombstones.contains_key(ttid) { // Presence of a tombstone guarantees that a previous deletion has completed and there is no work to do. info!("Timeline {ttid} was already deleted"); - return Ok(TimelineDeleteForceResult { - dir_existed: false, - was_active: false, - }); + return Ok(TimelineDeleteResult { dir_existed: false }); } state.get(ttid) @@ -470,32 +469,47 @@ impl GlobalTimelines { let result = match tli_res { Ok(timeline) => { - let was_active = timeline.broker_active.load(Ordering::Relaxed); + info!("deleting timeline {}, action={:?}", ttid, action); - info!("deleting timeline {}, only_local={}", ttid, only_local); - timeline.shutdown().await; + // If node is getting excluded, check the generation first. + // Then, while holding the lock cancel the timeline; it will be + // unusable after this point, and if node is added back first + // deletion must be completed and node seeded anew. + // + // We would like to avoid holding the lock while waiting for the + // gate to finish as this is deadlock prone, so for actual + // deletion will take it second time. + if let DeleteOrExclude::Exclude(ref mconf) = action { + let shared_state = timeline.read_shared_state().await; + if shared_state.sk.state().mconf.generation > mconf.generation { + return Err(DeleteOrExcludeError::Conflict { + requested: mconf.clone(), + current: shared_state.sk.state().mconf.clone(), + }); + } + timeline.cancel().await; + } else { + timeline.cancel().await; + } + + timeline.close().await; info!("timeline {ttid} shut down for deletion"); // Take a lock and finish the deletion holding this mutex. let mut shared_state = timeline.write_shared_state().await; + let only_local = !matches!(action, DeleteOrExclude::Delete); let dir_existed = timeline.delete(&mut shared_state, only_local).await?; - Ok(TimelineDeleteForceResult { - dir_existed, - was_active, // TODO: we probably should remove this field - }) + Ok(TimelineDeleteResult { dir_existed }) } Err(_) => { // Timeline is not memory, but it may still exist on disk in broken state. let dir_path = get_timeline_dir(self.state.lock().unwrap().conf.as_ref(), ttid); - let dir_existed = delete_dir(dir_path)?; + let dir_existed = delete_dir(&dir_path).await?; - Ok(TimelineDeleteForceResult { - dir_existed, - was_active: false, - }) + Ok(TimelineDeleteResult { dir_existed }) } }; @@ -513,11 +527,11 @@ impl GlobalTimelines { /// retry tenant deletion again later. /// /// If only_local, doesn't remove WAL segments in remote storage. - pub async fn delete_force_all_for_tenant( + pub async fn delete_all_for_tenant( &self, tenant_id: &TenantId, - only_local: bool, - ) -> Result> { + action: DeleteOrExclude, + ) -> Result> { info!("deleting all timelines for tenant {}", tenant_id); let to_delete = self.get_all_for_tenant(*tenant_id); @@ -525,7 +539,7 @@ impl GlobalTimelines { let mut deleted = HashMap::new(); for tli in &to_delete { - match self.delete(&tli.ttid, only_local).await { + match self.delete_or_exclude(&tli.ttid, action.clone()).await { Ok(result) => { deleted.insert(tli.ttid, result); } @@ -539,17 +553,15 @@ impl GlobalTimelines { // If there was an error, return it. if let Some(e) = err { - return Err(e); + return Err(anyhow::Error::from(e)); } // There may be broken timelines on disk, so delete the whole tenant dir as well. // Note that we could concurrently create new timelines while we were deleting them, // so the directory may be not empty. In this case timelines will have bad state // and timeline background jobs can panic. - delete_dir(get_tenant_dir( - self.state.lock().unwrap().conf.as_ref(), - tenant_id, - ))?; + let tenant_dir = get_tenant_dir(self.state.lock().unwrap().conf.as_ref(), tenant_id); + delete_dir(&tenant_dir).await?; Ok(deleted) } @@ -568,18 +580,20 @@ impl GlobalTimelines { } #[derive(Clone, Copy, Serialize)] -pub struct TimelineDeleteForceResult { +pub struct TimelineDeleteResult { pub dir_existed: bool, - pub was_active: bool, } -/// Deletes directory and it's contents. Returns false if directory does not exist. -fn delete_dir(path: Utf8PathBuf) -> Result { - match std::fs::remove_dir_all(path) { - Ok(_) => Ok(true), - Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(false), - Err(e) => Err(e.into()), - } +/// Action for delete_or_exclude. +#[derive(Clone, Debug)] +pub enum DeleteOrExclude { + /// Delete timeline globally. + Delete, + /// Legacy mode until we fully migrate to generations: like exclude deletes + /// timeline only locally, but ignores generation number. + DeleteLocal, + /// This node is getting excluded, delete timeline locally. + Exclude(membership::Configuration), } /// Create temp directory for a new timeline. It needs to be located on the same diff --git a/safekeeper/src/timelines_set.rs b/safekeeper/src/timelines_set.rs index 096e348295..1d1abc530f 100644 --- a/safekeeper/src/timelines_set.rs +++ b/safekeeper/src/timelines_set.rs @@ -1,4 +1,5 @@ -use std::{collections::HashMap, sync::Arc}; +use std::collections::HashMap; +use std::sync::Arc; use utils::id::TenantTimelineId; diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 2f6b91cf47..6176e64698 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -1,34 +1,29 @@ -use anyhow::{Context, Result}; - -use camino::{Utf8Path, Utf8PathBuf}; -use futures::stream::FuturesOrdered; -use futures::StreamExt; -use safekeeper_api::models::PeerInfo; -use tokio::task::JoinHandle; -use tokio_util::sync::CancellationToken; -use utils::backoff; -use utils::id::NodeId; - use std::cmp::min; use std::collections::HashSet; use std::num::NonZeroU32; use std::pin::Pin; use std::time::Duration; +use anyhow::{Context, Result}; +use camino::{Utf8Path, Utf8PathBuf}; +use futures::StreamExt; +use futures::stream::FuturesOrdered; use postgres_ffi::v14::xlog_utils::XLogSegNoOffsetToRecPtr; -use postgres_ffi::XLogFileName; -use postgres_ffi::{XLogSegNo, PG_TLI}; +use postgres_ffi::{PG_TLI, XLogFileName, XLogSegNo}; use remote_storage::{ DownloadOpts, GenericRemoteStorage, ListingMode, RemotePath, StorageMetadata, }; +use safekeeper_api::models::PeerInfo; use tokio::fs::File; - use tokio::select; use tokio::sync::mpsc::{self, Receiver, Sender}; -use tokio::sync::{watch, OnceCell}; +use tokio::sync::{OnceCell, watch}; +use tokio::task::JoinHandle; +use tokio_util::sync::CancellationToken; use tracing::*; - -use utils::{id::TenantTimelineId, lsn::Lsn}; +use utils::backoff; +use utils::id::{NodeId, TenantTimelineId}; +use utils::lsn::Lsn; use crate::metrics::{BACKED_UP_SEGMENTS, BACKUP_ERRORS, WAL_BACKUP_TASKS}; use crate::timeline::WalResidentTimeline; diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs index 5ecb23e8e0..049852a048 100644 --- a/safekeeper/src/wal_backup_partial.rs +++ b/safekeeper/src/wal_backup_partial.rs @@ -20,23 +20,23 @@ //! This way control file stores information about all potentially existing //! remote partial segments and can clean them up after uploading a newer version. use camino::Utf8PathBuf; -use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI}; +use postgres_ffi::{PG_TLI, XLogFileName, XLogSegNo}; use remote_storage::RemotePath; use safekeeper_api::Term; use serde::{Deserialize, Serialize}; - use tokio_util::sync::CancellationToken; use tracing::{debug, error, info, instrument, warn}; -use utils::{id::NodeId, lsn::Lsn}; +use utils::id::NodeId; +use utils::lsn::Lsn; -use crate::{ - metrics::{MISC_OPERATION_SECONDS, PARTIAL_BACKUP_UPLOADED_BYTES, PARTIAL_BACKUP_UPLOADS}, - rate_limit::{rand_duration, RateLimiter}, - timeline::WalResidentTimeline, - timeline_manager::StateSnapshot, - wal_backup::{self}, - SafeKeeperConf, +use crate::SafeKeeperConf; +use crate::metrics::{ + MISC_OPERATION_SECONDS, PARTIAL_BACKUP_UPLOADED_BYTES, PARTIAL_BACKUP_UPLOADS, }; +use crate::rate_limit::{RateLimiter, rand_duration}; +use crate::timeline::WalResidentTimeline; +use crate::timeline_manager::StateSnapshot; +use crate::wal_backup::{self}; #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] pub enum UploadStatus { diff --git a/safekeeper/src/wal_reader_stream.rs b/safekeeper/src/wal_reader_stream.rs index a0dd571a34..cc9d4e6e3b 100644 --- a/safekeeper/src/wal_reader_stream.rs +++ b/safekeeper/src/wal_reader_stream.rs @@ -1,14 +1,15 @@ -use std::{ - pin::Pin, - task::{Context, Poll}, -}; +use std::pin::Pin; +use std::task::{Context, Poll}; use bytes::Bytes; -use futures::{stream::BoxStream, Stream, StreamExt}; +use futures::stream::BoxStream; +use futures::{Stream, StreamExt}; +use safekeeper_api::Term; use utils::lsn::Lsn; -use crate::{send_wal::EndWatch, timeline::WalResidentTimeline, wal_storage::WalReader}; -use safekeeper_api::Term; +use crate::send_wal::EndWatch; +use crate::timeline::WalResidentTimeline; +use crate::wal_storage::WalReader; #[derive(PartialEq, Eq, Debug)] pub(crate) struct WalBytes { @@ -224,12 +225,11 @@ mod tests { use futures::StreamExt; use postgres_ffi::MAX_SEND_SIZE; - use utils::{ - id::{NodeId, TenantTimelineId}, - lsn::Lsn, - }; + use utils::id::{NodeId, TenantTimelineId}; + use utils::lsn::Lsn; - use crate::{test_utils::Env, wal_reader_stream::StreamingWalReader}; + use crate::test_utils::Env; + use crate::wal_reader_stream::StreamingWalReader; #[tokio::test] async fn test_streaming_wal_reader_reset() { diff --git a/safekeeper/src/wal_service.rs b/safekeeper/src/wal_service.rs index e5ccbb3230..045fa88cb0 100644 --- a/safekeeper/src/wal_service.rs +++ b/safekeeper/src/wal_service.rs @@ -2,23 +2,23 @@ //! WAL service listens for client connections and //! receive WAL from wal_proposer and send it to WAL receivers //! -use anyhow::{Context, Result}; -use postgres_backend::QueryError; -use safekeeper_api::models::ConnectionId; +use std::os::fd::AsRawFd; use std::sync::Arc; use std::time::Duration; + +use anyhow::{Context, Result}; +use postgres_backend::{AuthType, PostgresBackend, QueryError}; +use safekeeper_api::models::ConnectionId; use tokio::net::TcpStream; use tokio_io_timeout::TimeoutReader; use tokio_util::sync::CancellationToken; use tracing::*; -use utils::{auth::Scope, measured_stream::MeasuredStream}; - -use std::os::fd::AsRawFd; +use utils::auth::Scope; +use utils::measured_stream::MeasuredStream; +use crate::handler::SafekeeperPostgresHandler; use crate::metrics::TrafficMetrics; -use crate::SafeKeeperConf; -use crate::{handler::SafekeeperPostgresHandler, GlobalTimelines}; -use postgres_backend::{AuthType, PostgresBackend}; +use crate::{GlobalTimelines, SafeKeeperConf}; /// Accept incoming TCP connections and spawn them into a background thread. /// diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index e338d70731..f0bac4b40a 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -7,32 +7,32 @@ //! //! Note that last file has `.partial` suffix, that's different from postgres. -use anyhow::{bail, Context, Result}; -use bytes::Bytes; -use camino::{Utf8Path, Utf8PathBuf}; -use futures::future::BoxFuture; -use postgres_ffi::v14::xlog_utils::{IsPartialXLogFileName, IsXLogFileName, XLogFromFileName}; -use postgres_ffi::{dispatch_pgversion, XLogSegNo, PG_TLI}; -use remote_storage::RemotePath; use std::cmp::{max, min}; use std::future::Future; use std::io::{self, SeekFrom}; use std::pin::Pin; -use tokio::fs::{self, remove_file, File, OpenOptions}; -use tokio::io::{AsyncRead, AsyncWriteExt}; -use tokio::io::{AsyncReadExt, AsyncSeekExt}; + +use anyhow::{Context, Result, bail}; +use bytes::Bytes; +use camino::{Utf8Path, Utf8PathBuf}; +use futures::future::BoxFuture; +use postgres_ffi::v14::xlog_utils::{IsPartialXLogFileName, IsXLogFileName, XLogFromFileName}; +use postgres_ffi::waldecoder::WalStreamDecoder; +use postgres_ffi::{PG_TLI, XLogFileName, XLogSegNo, dispatch_pgversion}; +use pq_proto::SystemId; +use remote_storage::RemotePath; +use tokio::fs::{self, File, OpenOptions, remove_file}; +use tokio::io::{AsyncRead, AsyncReadExt, AsyncSeekExt, AsyncWriteExt}; use tracing::*; use utils::crashsafe::durable_rename; +use utils::id::TenantTimelineId; +use utils::lsn::Lsn; use crate::metrics::{ - time_io_closure, WalStorageMetrics, REMOVED_WAL_SEGMENTS, WAL_STORAGE_OPERATION_SECONDS, + REMOVED_WAL_SEGMENTS, WAL_STORAGE_OPERATION_SECONDS, WalStorageMetrics, time_io_closure, }; use crate::state::TimelinePersistentState; use crate::wal_backup::{read_object, remote_timeline_path}; -use postgres_ffi::waldecoder::WalStreamDecoder; -use postgres_ffi::XLogFileName; -use pq_proto::SystemId; -use utils::{id::TenantTimelineId, lsn::Lsn}; pub trait Storage { // Last written LSN. @@ -200,7 +200,12 @@ impl PhysicalStorage { ttid.timeline_id, flush_lsn, state.commit_lsn, state.peer_horizon_lsn, ); if flush_lsn < state.commit_lsn { - bail!("timeline {} potential data loss: flush_lsn {} by find_end_of_wal is less than commit_lsn {} from control file", ttid.timeline_id, flush_lsn, state.commit_lsn); + bail!( + "timeline {} potential data loss: flush_lsn {} by find_end_of_wal is less than commit_lsn {} from control file", + ttid.timeline_id, + flush_lsn, + state.commit_lsn + ); } if flush_lsn < state.peer_horizon_lsn { warn!( @@ -569,6 +574,7 @@ impl Storage for PhysicalStorage { } self.pending_wal_truncation = false; + info!("truncated WAL to {}", end_pos); Ok(()) } diff --git a/safekeeper/tests/misc_test.rs b/safekeeper/tests/misc_test.rs index 8e5b17a143..8e54d2bb86 100644 --- a/safekeeper/tests/misc_test.rs +++ b/safekeeper/tests/misc_test.rs @@ -3,9 +3,9 @@ use std::sync::Arc; use tracing::{info, warn}; use utils::lsn::Lsn; -use crate::walproposer_sim::{ - log::{init_logger, init_tracing_logger}, - simulation::{generate_network_opts, generate_schedule, Schedule, TestAction, TestConfig}, +use crate::walproposer_sim::log::{init_logger, init_tracing_logger}; +use crate::walproposer_sim::simulation::{ + Schedule, TestAction, TestConfig, generate_network_opts, generate_schedule, }; pub mod walproposer_sim; diff --git a/safekeeper/tests/random_test.rs b/safekeeper/tests/random_test.rs index 1a932ef699..e29b58836a 100644 --- a/safekeeper/tests/random_test.rs +++ b/safekeeper/tests/random_test.rs @@ -1,11 +1,9 @@ use rand::Rng; use tracing::{info, warn}; -use crate::walproposer_sim::{ - log::{init_logger, init_tracing_logger}, - simulation::{generate_network_opts, generate_schedule, TestConfig}, - simulation_logs::validate_events, -}; +use crate::walproposer_sim::log::{init_logger, init_tracing_logger}; +use crate::walproposer_sim::simulation::{TestConfig, generate_network_opts, generate_schedule}; +use crate::walproposer_sim::simulation_logs::validate_events; pub mod walproposer_sim; @@ -18,7 +16,7 @@ fn test_random_schedules() -> anyhow::Result<()> { let mut config = TestConfig::new(Some(clock)); for _ in 0..500 { - let seed: u64 = rand::thread_rng().gen(); + let seed: u64 = rand::thread_rng().r#gen(); config.network = generate_network_opts(seed); let test = config.start(seed); diff --git a/safekeeper/tests/simple_test.rs b/safekeeper/tests/simple_test.rs index 0be9d0deef..f7b266e39c 100644 --- a/safekeeper/tests/simple_test.rs +++ b/safekeeper/tests/simple_test.rs @@ -1,7 +1,8 @@ use tracing::info; use utils::lsn::Lsn; -use crate::walproposer_sim::{log::init_logger, simulation::TestConfig}; +use crate::walproposer_sim::log::init_logger; +use crate::walproposer_sim::simulation::TestConfig; pub mod walproposer_sim; diff --git a/safekeeper/tests/walproposer_sim/log.rs b/safekeeper/tests/walproposer_sim/log.rs index 870f30de4f..e2ba3282ca 100644 --- a/safekeeper/tests/walproposer_sim/log.rs +++ b/safekeeper/tests/walproposer_sim/log.rs @@ -1,9 +1,11 @@ -use std::{fmt, sync::Arc}; +use std::fmt; +use std::sync::Arc; use desim::time::Timing; use once_cell::sync::OnceCell; use parking_lot::Mutex; -use tracing_subscriber::fmt::{format::Writer, time::FormatTime}; +use tracing_subscriber::fmt::format::Writer; +use tracing_subscriber::fmt::time::FormatTime; /// SimClock can be plugged into tracing logger to print simulation time. #[derive(Clone)] diff --git a/safekeeper/tests/walproposer_sim/safekeeper.rs b/safekeeper/tests/walproposer_sim/safekeeper.rs index 0023a4d22a..6ce1a9940e 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper.rs @@ -2,33 +2,30 @@ //! Gets messages from the network, passes them down to consensus module and //! sends replies back. -use std::{collections::HashMap, sync::Arc, time::Duration}; +use std::collections::HashMap; +use std::sync::Arc; +use std::time::Duration; -use anyhow::{bail, Result}; +use anyhow::{Result, bail}; use bytes::{Bytes, BytesMut}; use camino::Utf8PathBuf; -use desim::{ - executor::{self, PollSome}, - network::TCP, - node_os::NodeOs, - proto::{AnyMessage, NetEvent, NodeEvent}, -}; +use desim::executor::{self, PollSome}; +use desim::network::TCP; +use desim::node_os::NodeOs; +use desim::proto::{AnyMessage, NetEvent, NodeEvent}; use http::Uri; -use safekeeper::{ - safekeeper::{ - ProposerAcceptorMessage, SafeKeeper, SK_PROTOCOL_VERSION, UNKNOWN_SERVER_VERSION, - }, - state::{TimelinePersistentState, TimelineState}, - timeline::TimelineError, - wal_storage::Storage, - SafeKeeperConf, +use safekeeper::SafeKeeperConf; +use safekeeper::safekeeper::{ + ProposerAcceptorMessage, SK_PROTO_VERSION_3, SafeKeeper, UNKNOWN_SERVER_VERSION, }; -use safekeeper_api::{membership::Configuration, ServerInfo}; +use safekeeper::state::{TimelinePersistentState, TimelineState}; +use safekeeper::timeline::TimelineError; +use safekeeper::wal_storage::Storage; +use safekeeper_api::ServerInfo; +use safekeeper_api::membership::Configuration; use tracing::{debug, info_span, warn}; -use utils::{ - id::{NodeId, TenantId, TenantTimelineId, TimelineId}, - lsn::Lsn, -}; +use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId}; +use utils::lsn::Lsn; use super::safekeeper_disk::{DiskStateStorage, DiskWALStorage, SafekeeperDisk, TimelineDisk}; @@ -287,7 +284,7 @@ impl ConnState { bail!("finished processing START_REPLICATION") } - let msg = ProposerAcceptorMessage::parse(copy_data, SK_PROTOCOL_VERSION)?; + let msg = ProposerAcceptorMessage::parse(copy_data, SK_PROTO_VERSION_3)?; debug!("got msg: {:?}", msg); self.process(msg, global) } else { @@ -403,7 +400,7 @@ impl ConnState { // TODO: if this is AppendResponse, fill in proper hot standby feedback and disk consistent lsn let mut buf = BytesMut::with_capacity(128); - reply.serialize(&mut buf)?; + reply.serialize(&mut buf, SK_PROTO_VERSION_3)?; self.tcp.send(AnyMessage::Bytes(buf.into())); } diff --git a/safekeeper/tests/walproposer_sim/safekeeper_disk.rs b/safekeeper/tests/walproposer_sim/safekeeper_disk.rs index b854754ecf..94a849b5f0 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper_disk.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper_disk.rs @@ -1,22 +1,23 @@ use std::collections::HashMap; +use std::ops::Deref; use std::sync::Arc; - -use parking_lot::Mutex; -use safekeeper::state::TimelinePersistentState; -use utils::id::TenantTimelineId; - -use super::block_storage::BlockStorage; - -use std::{ops::Deref, time::Instant}; +use std::time::Instant; use anyhow::Result; use bytes::{Buf, BytesMut}; use futures::future::BoxFuture; -use postgres_ffi::{waldecoder::WalStreamDecoder, XLogSegNo}; -use safekeeper::{control_file, metrics::WalStorageMetrics, wal_storage}; +use parking_lot::Mutex; +use postgres_ffi::XLogSegNo; +use postgres_ffi::waldecoder::WalStreamDecoder; +use safekeeper::metrics::WalStorageMetrics; +use safekeeper::state::TimelinePersistentState; +use safekeeper::{control_file, wal_storage}; use tracing::{debug, info}; +use utils::id::TenantTimelineId; use utils::lsn::Lsn; +use super::block_storage::BlockStorage; + /// All safekeeper state that is usually saved to disk. pub struct SafekeeperDisk { pub timelines: Mutex>>, diff --git a/safekeeper/tests/walproposer_sim/simulation.rs b/safekeeper/tests/walproposer_sim/simulation.rs index fabf450eef..f314143952 100644 --- a/safekeeper/tests/walproposer_sim/simulation.rs +++ b/safekeeper/tests/walproposer_sim/simulation.rs @@ -1,23 +1,24 @@ -use std::{cell::Cell, str::FromStr, sync::Arc}; +use std::cell::Cell; +use std::str::FromStr; +use std::sync::Arc; -use crate::walproposer_sim::{safekeeper::run_server, walproposer_api::SimulationApi}; -use desim::{ - executor::{self, ExternalHandle}, - node_os::NodeOs, - options::{Delay, NetworkOptions}, - proto::{AnyMessage, NodeEvent}, - world::Node, - world::World, -}; +use desim::executor::{self, ExternalHandle}; +use desim::node_os::NodeOs; +use desim::options::{Delay, NetworkOptions}; +use desim::proto::{AnyMessage, NodeEvent}; +use desim::world::{Node, World}; use rand::{Rng, SeedableRng}; use tracing::{debug, info_span, warn}; -use utils::{id::TenantTimelineId, lsn::Lsn}; +use utils::id::TenantTimelineId; +use utils::lsn::Lsn; use walproposer::walproposer::{Config, Wrapper}; -use super::{ - log::SimClock, safekeeper_disk::SafekeeperDisk, walproposer_api, - walproposer_disk::DiskWalProposer, -}; +use super::log::SimClock; +use super::safekeeper_disk::SafekeeperDisk; +use super::walproposer_api; +use super::walproposer_disk::DiskWalProposer; +use crate::walproposer_sim::safekeeper::run_server; +use crate::walproposer_sim::walproposer_api::SimulationApi; /// Simulated safekeeper node. pub struct SafekeeperNode { diff --git a/safekeeper/tests/walproposer_sim/walproposer_api.rs b/safekeeper/tests/walproposer_sim/walproposer_api.rs index 5578c94cf6..6451589e80 100644 --- a/safekeeper/tests/walproposer_sim/walproposer_api.rs +++ b/safekeeper/tests/walproposer_sim/walproposer_api.rs @@ -1,26 +1,20 @@ -use std::{ - cell::{RefCell, RefMut, UnsafeCell}, - ffi::CStr, - sync::Arc, -}; +use std::cell::{RefCell, RefMut, UnsafeCell}; +use std::ffi::CStr; +use std::sync::Arc; use bytes::Bytes; -use desim::{ - executor::{self, PollSome}, - network::TCP, - node_os::NodeOs, - proto::{AnyMessage, NetEvent, NodeEvent}, - world::NodeId, -}; +use desim::executor::{self, PollSome}; +use desim::network::TCP; +use desim::node_os::NodeOs; +use desim::proto::{AnyMessage, NetEvent, NodeEvent}; +use desim::world::NodeId; use tracing::debug; use utils::lsn::Lsn; -use walproposer::{ - api_bindings::Level, - bindings::{ - NeonWALReadResult, SafekeeperStateDesiredEvents, WL_SOCKET_READABLE, WL_SOCKET_WRITEABLE, - }, - walproposer::{ApiImpl, Config}, +use walproposer::api_bindings::Level; +use walproposer::bindings::{ + NeonWALReadResult, SafekeeperStateDesiredEvents, WL_SOCKET_READABLE, WL_SOCKET_WRITEABLE, }; +use walproposer::walproposer::{ApiImpl, Config}; use super::walproposer_disk::DiskWalProposer; @@ -578,7 +572,9 @@ impl ApiImpl for SimulationApi { let disk_lsn = disk.lock().flush_rec_ptr().0; debug!("start_streaming at {} (disk_lsn={})", startpos, disk_lsn); if startpos < disk_lsn { - debug!("startpos < disk_lsn, it means we wrote some transaction even before streaming started"); + debug!( + "startpos < disk_lsn, it means we wrote some transaction even before streaming started" + ); } assert!(startpos <= disk_lsn); let mut broadcasted = Lsn(startpos); diff --git a/safekeeper/tests/walproposer_sim/walproposer_disk.rs b/safekeeper/tests/walproposer_sim/walproposer_disk.rs index 7dc7f48548..fe3eee8a5a 100644 --- a/safekeeper/tests/walproposer_sim/walproposer_disk.rs +++ b/safekeeper/tests/walproposer_sim/walproposer_disk.rs @@ -1,4 +1,5 @@ -use std::{ffi::CStr, sync::Arc}; +use std::ffi::CStr; +use std::sync::Arc; use parking_lot::{Mutex, MutexGuard}; use postgres_ffi::v16::wal_generator::{LogicalMessageGenerator, WalGenerator}; diff --git a/storage_broker/Cargo.toml b/storage_broker/Cargo.toml index 17d4aed63b..e4db9a317d 100644 --- a/storage_broker/Cargo.toml +++ b/storage_broker/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "storage_broker" version = "0.1.0" -edition.workspace = true +edition = "2024" license.workspace = true [features] diff --git a/storage_broker/benches/rps.rs b/storage_broker/benches/rps.rs index 1a6fb7fedf..86f2dd9a6c 100644 --- a/storage_broker/benches/rps.rs +++ b/storage_broker/benches/rps.rs @@ -1,18 +1,14 @@ -use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::Arc; +use std::sync::atomic::{AtomicU64, Ordering}; use std::time::{Duration, Instant}; use clap::Parser; - -use storage_broker::proto::SafekeeperTimelineInfo; use storage_broker::proto::{ - FilterTenantTimelineId, MessageType, SubscribeByFilterRequest, + FilterTenantTimelineId, MessageType, SafekeeperTimelineInfo, SubscribeByFilterRequest, TenantTimelineId as ProtoTenantTimelineId, TypeSubscription, TypedMessage, }; - use storage_broker::{BrokerClientChannel, DEFAULT_ENDPOINT}; use tokio::time; - use tonic::Request; const ABOUT: &str = r#" diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs index 9d4c22484c..cc33ec20ff 100644 --- a/storage_broker/src/bin/storage_broker.rs +++ b/storage_broker/src/bin/storage_broker.rs @@ -10,7 +10,14 @@ //! //! Only safekeeper message is supported, but it is not hard to add something //! else with generics. -use clap::{command, Parser}; +use std::collections::HashMap; +use std::convert::Infallible; +use std::net::SocketAddr; +use std::pin::Pin; +use std::sync::Arc; +use std::time::Duration; + +use clap::{Parser, command}; use futures_core::Stream; use futures_util::StreamExt; use http_body_util::Full; @@ -19,27 +26,10 @@ use hyper::header::CONTENT_TYPE; use hyper::service::service_fn; use hyper::{Method, StatusCode}; use hyper_util::rt::{TokioExecutor, TokioIo, TokioTimer}; -use parking_lot::RwLock; -use std::collections::HashMap; -use std::convert::Infallible; -use std::net::SocketAddr; -use std::pin::Pin; -use std::sync::Arc; -use std::time::Duration; -use tokio::net::TcpListener; -use tokio::sync::broadcast; -use tokio::sync::broadcast::error::RecvError; -use tokio::time; -use tonic::body::{self, empty_body, BoxBody}; -use tonic::codegen::Service; -use tonic::Code; -use tonic::{Request, Response, Status}; -use tracing::*; -use utils::signals::ShutdownSignals; - use metrics::{Encoder, TextEncoder}; +use parking_lot::RwLock; use storage_broker::metrics::{ - BROADCASTED_MESSAGES_TOTAL, BROADCAST_DROPPED_MESSAGES_TOTAL, NUM_PUBS, NUM_SUBS_ALL, + BROADCAST_DROPPED_MESSAGES_TOTAL, BROADCASTED_MESSAGES_TOTAL, NUM_PUBS, NUM_SUBS_ALL, NUM_SUBS_TIMELINE, PROCESSED_MESSAGES_TOTAL, PUBLISHED_ONEOFF_MESSAGES_TOTAL, }; use storage_broker::proto::broker_service_server::{BrokerService, BrokerServiceServer}; @@ -48,10 +38,19 @@ use storage_broker::proto::{ FilterTenantTimelineId, MessageType, SafekeeperDiscoveryRequest, SafekeeperDiscoveryResponse, SafekeeperTimelineInfo, SubscribeByFilterRequest, SubscribeSafekeeperInfoRequest, TypedMessage, }; -use storage_broker::{parse_proto_ttid, DEFAULT_KEEPALIVE_INTERVAL, DEFAULT_LISTEN_ADDR}; +use storage_broker::{DEFAULT_KEEPALIVE_INTERVAL, DEFAULT_LISTEN_ADDR, parse_proto_ttid}; +use tokio::net::TcpListener; +use tokio::sync::broadcast; +use tokio::sync::broadcast::error::RecvError; +use tokio::time; +use tonic::body::{self, BoxBody, empty_body}; +use tonic::codegen::Service; +use tonic::{Code, Request, Response, Status}; +use tracing::*; use utils::id::TenantTimelineId; use utils::logging::{self, LogFormat}; use utils::sentry_init::init_sentry; +use utils::signals::ShutdownSignals; use utils::{project_build_tag, project_git_version}; project_git_version!(GIT_VERSION); @@ -743,11 +742,12 @@ async fn main() -> Result<(), Box> { #[cfg(test)] mod tests { - use super::*; use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; use tokio::sync::broadcast::error::TryRecvError; use utils::id::{TenantId, TimelineId}; + use super::*; + fn msg(timeline_id: Vec) -> Message { Message::SafekeeperTimelineInfo(SafekeeperTimelineInfo { safekeeper_id: 1, diff --git a/storage_broker/src/lib.rs b/storage_broker/src/lib.rs index 3ac40f6e14..55d411f607 100644 --- a/storage_broker/src/lib.rs +++ b/storage_broker/src/lib.rs @@ -1,12 +1,11 @@ use std::time::Duration; -use tonic::codegen::StdError; -use tonic::transport::{ClientTlsConfig, Endpoint}; -use tonic::{transport::Channel, Status}; -use utils::id::{TenantId, TenantTimelineId, TimelineId}; -use proto::{ - broker_service_client::BrokerServiceClient, TenantTimelineId as ProtoTenantTimelineId, -}; +use proto::TenantTimelineId as ProtoTenantTimelineId; +use proto::broker_service_client::BrokerServiceClient; +use tonic::Status; +use tonic::codegen::StdError; +use tonic::transport::{Channel, ClientTlsConfig, Endpoint}; +use utils::id::{TenantId, TenantTimelineId, TimelineId}; // Code generated by protobuf. pub mod proto { @@ -20,11 +19,8 @@ pub mod proto { pub mod metrics; // Re-exports to avoid direct tonic dependency in user crates. -pub use tonic::Code; -pub use tonic::Request; -pub use tonic::Streaming; - pub use hyper::Uri; +pub use tonic::{Code, Request, Streaming}; pub const DEFAULT_LISTEN_ADDR: &str = "127.0.0.1:50051"; pub const DEFAULT_ENDPOINT: &str = const_format::formatcp!("http://{DEFAULT_LISTEN_ADDR}"); diff --git a/storage_broker/src/metrics.rs b/storage_broker/src/metrics.rs index 1fd3dd5ad6..ecfb594eba 100644 --- a/storage_broker/src/metrics.rs +++ b/storage_broker/src/metrics.rs @@ -1,6 +1,6 @@ //! Broker metrics. -use metrics::{register_int_counter, register_int_gauge, IntCounter, IntGauge}; +use metrics::{IntCounter, IntGauge, register_int_counter, register_int_gauge}; use once_cell::sync::Lazy; pub static NUM_PUBS: Lazy = Lazy::new(|| { diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml index 08c80bc141..b63ba154da 100644 --- a/storage_controller/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "storage_controller" version = "0.1.0" -edition.workspace = true +edition = "2024" license.workspace = true [[bin]] @@ -18,6 +18,7 @@ anyhow.workspace = true bytes.workspace = true chrono.workspace = true clap.workspace = true +cron.workspace = true fail.workspace = true futures.workspace = true hex.workspace = true diff --git a/storage_controller/client/src/control_api.rs b/storage_controller/client/src/control_api.rs index f8a2790769..7888b18aa7 100644 --- a/storage_controller/client/src/control_api.rs +++ b/storage_controller/client/src/control_api.rs @@ -1,6 +1,7 @@ use pageserver_client::mgmt_api::{self, ResponseErrorMessageExt}; use reqwest::{Method, Url}; -use serde::{de::DeserializeOwned, Serialize}; +use serde::Serialize; +use serde::de::DeserializeOwned; pub struct Client { base_url: Url, diff --git a/storage_controller/src/background_node_operations.rs b/storage_controller/src/background_node_operations.rs index 226d4942e7..a630316f46 100644 --- a/storage_controller/src/background_node_operations.rs +++ b/storage_controller/src/background_node_operations.rs @@ -1,4 +1,5 @@ -use std::{borrow::Cow, fmt::Debug, fmt::Display}; +use std::borrow::Cow; +use std::fmt::{Debug, Display}; use tokio_util::sync::CancellationToken; use utils::id::NodeId; diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs index 5bc3c81f02..b602af362d 100644 --- a/storage_controller/src/compute_hook.rs +++ b/storage_controller/src/compute_hook.rs @@ -1,7 +1,8 @@ use std::borrow::Cow; +use std::collections::HashMap; use std::error::Error as _; use std::sync::Arc; -use std::{collections::HashMap, time::Duration}; +use std::time::Duration; use control_plane::endpoint::{ComputeControlPlane, EndpointStatus}; use control_plane::local_env::LocalEnv; @@ -12,11 +13,9 @@ use pageserver_api::shard::{ShardCount, ShardNumber, ShardStripeSize, TenantShar use postgres_connection::parse_host_port; use serde::{Deserialize, Serialize}; use tokio_util::sync::CancellationToken; -use tracing::{info_span, Instrument}; -use utils::{ - backoff::{self}, - id::{NodeId, TenantId}, -}; +use tracing::{Instrument, info_span}; +use utils::backoff::{self}; +use utils::id::{NodeId, TenantId}; use crate::service::Config; diff --git a/storage_controller/src/drain_utils.rs b/storage_controller/src/drain_utils.rs index 8b7be88078..bd4b8ba38f 100644 --- a/storage_controller/src/drain_utils.rs +++ b/storage_controller/src/drain_utils.rs @@ -1,15 +1,14 @@ -use std::{ - collections::{BTreeMap, HashMap}, - sync::Arc, -}; +use std::collections::{BTreeMap, HashMap}; +use std::sync::Arc; use pageserver_api::controller_api::{NodeSchedulingPolicy, ShardSchedulingPolicy}; -use utils::{id::NodeId, shard::TenantShardId}; +use utils::id::NodeId; +use utils::shard::TenantShardId; -use crate::{ - background_node_operations::OperationError, node::Node, scheduler::Scheduler, - tenant_shard::TenantShard, -}; +use crate::background_node_operations::OperationError; +use crate::node::Node; +use crate::scheduler::Scheduler; +use crate::tenant_shard::TenantShard; pub(crate) struct TenantShardIterator { tenants_accessor: F, @@ -188,10 +187,8 @@ impl TenantShardDrain { mod tests { use std::sync::Arc; - use utils::{ - id::TenantId, - shard::{ShardCount, ShardNumber, TenantShardId}, - }; + use utils::id::TenantId; + use utils::shard::{ShardCount, ShardNumber, TenantShardId}; use super::TenantShardIterator; diff --git a/storage_controller/src/heartbeater.rs b/storage_controller/src/heartbeater.rs index 52b6110667..56a331becd 100644 --- a/storage_controller/src/heartbeater.rs +++ b/storage_controller/src/heartbeater.rs @@ -1,24 +1,22 @@ -use futures::{stream::FuturesUnordered, StreamExt}; +use std::collections::HashMap; +use std::fmt::Debug; +use std::future::Future; +use std::sync::Arc; +use std::time::{Duration, Instant}; + +use futures::StreamExt; +use futures::stream::FuturesUnordered; +use pageserver_api::controller_api::{NodeAvailability, SkSchedulingPolicy}; +use pageserver_api::models::PageserverUtilization; use safekeeper_api::models::SafekeeperUtilization; use safekeeper_client::mgmt_api; -use std::{ - collections::HashMap, - fmt::Debug, - future::Future, - sync::Arc, - time::{Duration, Instant}, -}; -use tokio_util::sync::CancellationToken; - -use pageserver_api::{ - controller_api::{NodeAvailability, SkSchedulingPolicy}, - models::PageserverUtilization, -}; - use thiserror::Error; -use utils::{id::NodeId, logging::SecretString}; +use tokio_util::sync::CancellationToken; +use utils::id::NodeId; +use utils::logging::SecretString; -use crate::{node::Node, safekeeper::Safekeeper}; +use crate::node::Node; +use crate::safekeeper::Safekeeper; struct HeartbeaterTask { receiver: tokio::sync::mpsc::UnboundedReceiver>, @@ -223,21 +221,21 @@ impl HeartBeat for HeartbeaterTask Some((*node_id, status)) } }); + } - loop { - let maybe_status = tokio::select! { - next = heartbeat_futs.next() => { - match next { - Some(result) => result, - None => { break; } - } - }, - _ = self.cancel.cancelled() => { return Err(HeartbeaterError::Cancel); } - }; + loop { + let maybe_status = tokio::select! { + next = heartbeat_futs.next() => { + match next { + Some(result) => result, + None => { break; } + } + }, + _ = self.cancel.cancelled() => { return Err(HeartbeaterError::Cancel); } + }; - if let Some((node_id, status)) = maybe_status { - new_state.insert(node_id, status); - } + if let Some((node_id, status)) = maybe_status { + new_state.insert(node_id, status); } } @@ -363,21 +361,21 @@ impl HeartBeat for HeartbeaterTask { - match next { - Some(result) => result, - None => { break; } - } - }, - _ = self.cancel.cancelled() => { return Err(HeartbeaterError::Cancel); } - }; + loop { + let maybe_status = tokio::select! { + next = heartbeat_futs.next() => { + match next { + Some(result) => result, + None => { break; } + } + }, + _ = self.cancel.cancelled() => { return Err(HeartbeaterError::Cancel); } + }; - if let Some((node_id, status)) = maybe_status { - new_state.insert(node_id, status); - } + if let Some((node_id, status)) = maybe_status { + new_state.insert(node_id, status); } } diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 33b3d88c25..64f0be3c23 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -1,32 +1,27 @@ -use crate::http; -use crate::metrics::{ - HttpRequestLatencyLabelGroup, HttpRequestStatusLabelGroup, PageserverRequestLabelGroup, - METRICS_REGISTRY, -}; -use crate::persistence::SafekeeperUpsert; -use crate::reconciler::ReconcileError; -use crate::service::{LeadershipStatus, Service, RECONCILE_TIMEOUT, STARTUP_RECONCILE_TIMEOUT}; +use std::str::FromStr; +use std::sync::Arc; +use std::time::{Duration, Instant}; + use anyhow::Context; +use control_plane::storage_controller::{AttachHookRequest, InspectRequest}; use futures::Future; -use http_utils::{ - endpoint::{ - self, auth_middleware, check_permission_with, profile_cpu_handler, profile_heap_handler, - request_span, - }, - error::ApiError, - failpoints::failpoints_handler, - json::{json_request, json_response}, - request::{must_get_query_param, parse_query_param, parse_request_param}, - RequestExt, RouterBuilder, +use http_utils::endpoint::{ + self, auth_middleware, check_permission_with, profile_cpu_handler, profile_heap_handler, + request_span, }; +use http_utils::error::ApiError; +use http_utils::failpoints::failpoints_handler; +use http_utils::json::{json_request, json_response}; +use http_utils::request::{must_get_query_param, parse_query_param, parse_request_param}; +use http_utils::{RequestExt, RouterBuilder}; use hyper::header::CONTENT_TYPE; -use hyper::{Body, Request, Response}; -use hyper::{StatusCode, Uri}; +use hyper::{Body, Request, Response, StatusCode, Uri}; use metrics::{BuildInfo, NeonMetrics}; use pageserver_api::controller_api::{ MetadataHealthListOutdatedRequest, MetadataHealthListOutdatedResponse, MetadataHealthListUnhealthyResponse, MetadataHealthUpdateRequest, MetadataHealthUpdateResponse, - SafekeeperSchedulingPolicyRequest, ShardsPreferredAzsRequest, TenantCreateRequest, + NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, SafekeeperSchedulingPolicyRequest, + ShardsPreferredAzsRequest, TenantCreateRequest, TenantPolicyRequest, TenantShardMigrateRequest, }; use pageserver_api::models::{ TenantConfigPatchRequest, TenantConfigRequest, TenantLocationConfigRequest, @@ -34,23 +29,21 @@ use pageserver_api::models::{ TimelineCreateRequest, }; use pageserver_api::shard::TenantShardId; -use pageserver_client::{mgmt_api, BlockUnblock}; -use std::str::FromStr; -use std::sync::Arc; -use std::time::{Duration, Instant}; +use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest}; +use pageserver_client::{BlockUnblock, mgmt_api}; +use routerify::Middleware; use tokio_util::sync::CancellationToken; use utils::auth::{Scope, SwappableJwtAuth}; use utils::id::{NodeId, TenantId, TimelineId}; -use pageserver_api::controller_api::{ - NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, TenantPolicyRequest, - TenantShardMigrateRequest, +use crate::http; +use crate::metrics::{ + HttpRequestLatencyLabelGroup, HttpRequestStatusLabelGroup, METRICS_REGISTRY, + PageserverRequestLabelGroup, }; -use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest}; - -use control_plane::storage_controller::{AttachHookRequest, InspectRequest}; - -use routerify::Middleware; +use crate::persistence::SafekeeperUpsert; +use crate::reconciler::ReconcileError; +use crate::service::{LeadershipStatus, RECONCILE_TIMEOUT, STARTUP_RECONCILE_TIMEOUT, Service}; /// State available to HTTP request handlers pub struct HttpState { @@ -531,9 +524,10 @@ async fn handle_tenant_timeline_download_heatmap_layers( let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; let concurrency: Option = parse_query_param(&req, "concurrency")?; + let recurse = parse_query_param(&req, "recurse")?.unwrap_or(false); service - .tenant_timeline_download_heatmap_layers(tenant_shard_id, timeline_id, concurrency) + .tenant_timeline_download_heatmap_layers(tenant_shard_id, timeline_id, concurrency, recurse) .await?; json_response(StatusCode::OK, ()) @@ -554,7 +548,7 @@ async fn handle_tenant_timeline_passthrough( service: Arc, req: Request, ) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + let tenant_or_shard_id: TenantShardId = parse_request_param(&req, "tenant_id")?; check_permissions(&req, Scope::PageServerApi)?; let req = match maybe_forward(req).await { @@ -569,15 +563,28 @@ async fn handle_tenant_timeline_passthrough( return Err(ApiError::BadRequest(anyhow::anyhow!("Missing path"))); }; - tracing::info!("Proxying request for tenant {} ({})", tenant_id, path); + tracing::info!( + "Proxying request for tenant {} ({})", + tenant_or_shard_id.tenant_id, + path + ); // Find the node that holds shard zero - let (node, tenant_shard_id) = service.tenant_shard0_node(tenant_id).await?; + let (node, tenant_shard_id) = if tenant_or_shard_id.is_unsharded() { + service + .tenant_shard0_node(tenant_or_shard_id.tenant_id) + .await? + } else { + ( + service.tenant_shard_node(tenant_or_shard_id).await?, + tenant_or_shard_id, + ) + }; // Callers will always pass an unsharded tenant ID. Before proxying, we must // rewrite this to a shard-aware shard zero ID. let path = format!("{}", path); - let tenant_str = tenant_id.to_string(); + let tenant_str = tenant_or_shard_id.tenant_id.to_string(); let tenant_shard_str = format!("{}", tenant_shard_id); let path = path.replace(&tenant_str, &tenant_shard_str); @@ -617,7 +624,7 @@ async fn handle_tenant_timeline_passthrough( // Transform 404 into 503 if we raced with a migration if resp.status() == reqwest::StatusCode::NOT_FOUND { // Look up node again: if we migrated it will be different - let (new_node, _tenant_shard_id) = service.tenant_shard0_node(tenant_id).await?; + let new_node = service.tenant_shard_node(tenant_shard_id).await?; if new_node.get_id() != node.get_id() { // Rather than retry here, send the client a 503 to prompt a retry: this matches // the pageserver's use of 503, and all clients calling this API should retry on 503. @@ -1455,8 +1462,8 @@ pub fn prologue_leadership_status_check_middleware< }) } -fn prologue_metrics_middleware( -) -> Middleware { +fn prologue_metrics_middleware() +-> Middleware { Middleware::pre(move |req| async move { let meta = RequestMeta { method: req.method().clone(), @@ -1469,8 +1476,8 @@ fn prologue_metrics_middleware }) } -fn epilogue_metrics_middleware( -) -> Middleware { +fn epilogue_metrics_middleware() +-> Middleware { Middleware::post_with_info(move |resp, req_info| async move { let request_name = match req_info.context::() { Some(name) => name, @@ -1621,8 +1628,8 @@ async fn maybe_forward(req: Request) -> ForwardOutcome { Err(err) => { return ForwardOutcome::Forwarded(Err(ApiError::InternalServerError( anyhow::anyhow!( - "Failed to parse leader uri for forwarding while in stepped down state: {err}" - ), + "Failed to parse leader uri for forwarding while in stepped down state: {err}" + ), ))); } }; @@ -2155,8 +2162,23 @@ mod test { #[test] fn test_path_without_ids() { - assert_eq!(path_without_ids("/v1/tenant/1a2b3344556677881122334455667788/timeline/AA223344556677881122334455667788"), "/v1/tenant//timeline/"); - assert_eq!(path_without_ids("/v1/tenant/1a2b3344556677881122334455667788-0108/timeline/AA223344556677881122334455667788"), "/v1/tenant//timeline/"); - assert_eq!(path_without_ids("/v1/tenant/1a2b3344556677881122334455667788-0108/timeline/AA223344556677881122334455667788?parameter=foo"), "/v1/tenant//timeline/"); + assert_eq!( + path_without_ids( + "/v1/tenant/1a2b3344556677881122334455667788/timeline/AA223344556677881122334455667788" + ), + "/v1/tenant//timeline/" + ); + assert_eq!( + path_without_ids( + "/v1/tenant/1a2b3344556677881122334455667788-0108/timeline/AA223344556677881122334455667788" + ), + "/v1/tenant//timeline/" + ); + assert_eq!( + path_without_ids( + "/v1/tenant/1a2b3344556677881122334455667788-0108/timeline/AA223344556677881122334455667788?parameter=foo" + ), + "/v1/tenant//timeline/" + ); } } diff --git a/storage_controller/src/id_lock_map.rs b/storage_controller/src/id_lock_map.rs index 2d8b674f86..6b0c16f0be 100644 --- a/storage_controller/src/id_lock_map.rs +++ b/storage_controller/src/id_lock_map.rs @@ -1,8 +1,7 @@ +use std::collections::HashMap; use std::fmt::Display; -use std::time::Instant; -use std::{collections::HashMap, sync::Arc}; - -use std::time::Duration; +use std::sync::Arc; +use std::time::{Duration, Instant}; use crate::service::RECONCILE_TIMEOUT; diff --git a/storage_controller/src/leadership.rs b/storage_controller/src/leadership.rs index 5fae8991ec..5e1d6f3ec9 100644 --- a/storage_controller/src/leadership.rs +++ b/storage_controller/src/leadership.rs @@ -3,11 +3,9 @@ use std::sync::Arc; use hyper::Uri; use tokio_util::sync::CancellationToken; -use crate::{ - peer_client::{GlobalObservedState, PeerClient}, - persistence::{ControllerPersistence, DatabaseError, DatabaseResult, Persistence}, - service::Config, -}; +use crate::peer_client::{GlobalObservedState, PeerClient}; +use crate::persistence::{ControllerPersistence, DatabaseError, DatabaseResult, Persistence}; +use crate::service::Config; /// Helper for storage controller leadership acquisition pub(crate) struct Leadership { @@ -91,7 +89,9 @@ impl Leadership { // Special case: if this is a brand new storage controller, migrations will not // have run at this point yet, and, hence, the controllers table does not exist. // Detect this case via the error string (diesel doesn't type it) and allow it. - tracing::info!("Detected first storage controller start-up. Allowing missing controllers table ..."); + tracing::info!( + "Detected first storage controller start-up. Allowing missing controllers table ..." + ); return Ok(None); } } diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 18922b9e05..04dd3bb3f6 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -1,26 +1,26 @@ -use anyhow::{anyhow, Context}; -use clap::Parser; -use hyper0::Uri; -use metrics::launch_timestamp::LaunchTimestamp; -use metrics::BuildInfo; use std::path::PathBuf; use std::sync::Arc; use std::time::Duration; + +use anyhow::{Context, anyhow}; +use clap::Parser; +use hyper0::Uri; +use metrics::BuildInfo; +use metrics::launch_timestamp::LaunchTimestamp; use storage_controller::http::make_router; use storage_controller::metrics::preinitialize_metrics; use storage_controller::persistence::Persistence; use storage_controller::service::chaos_injector::ChaosInjector; use storage_controller::service::{ - Config, Service, HEARTBEAT_INTERVAL_DEFAULT, LONG_RECONCILE_THRESHOLD_DEFAULT, + Config, HEARTBEAT_INTERVAL_DEFAULT, LONG_RECONCILE_THRESHOLD_DEFAULT, MAX_OFFLINE_INTERVAL_DEFAULT, MAX_WARMING_UP_INTERVAL_DEFAULT, - PRIORITY_RECONCILER_CONCURRENCY_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT, + PRIORITY_RECONCILER_CONCURRENCY_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT, Service, }; use tokio::signal::unix::SignalKind; use tokio_util::sync::CancellationToken; use tracing::Instrument; use utils::auth::{JwtAuth, SwappableJwtAuth}; use utils::logging::{self, LogFormat}; - use utils::sentry_init::init_sentry; use utils::{project_build_tag, project_git_version, tcp_listener}; @@ -34,7 +34,7 @@ static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; /// This adds roughly 3% overhead for allocations on average, which is acceptable considering /// performance-sensitive code will avoid allocations as far as possible anyway. #[allow(non_upper_case_globals)] -#[export_name = "malloc_conf"] +#[unsafe(export_name = "malloc_conf")] pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0"; #[derive(Parser)] @@ -115,10 +115,14 @@ struct Cli { #[arg(long)] neon_local_repo_dir: Option, - /// Chaos testing + /// Chaos testing: exercise tenant migrations #[arg(long)] chaos_interval: Option, + /// Chaos testing: exercise an immediate exit + #[arg(long)] + chaos_exit_crontab: Option, + // Maximum acceptable lag for the secondary location while draining // a pageserver #[arg(long)] @@ -293,8 +297,8 @@ async fn async_main() -> anyhow::Result<()> { // Production systems should always have secrets configured: if public_key was not set // then we would implicitly disable auth. anyhow::bail!( - "Insecure config! One or more secrets is not set. This is only permitted in `--dev` mode" - ); + "Insecure config! One or more secrets is not set. This is only permitted in `--dev` mode" + ); } StrictMode::Strict if args.compute_hook_url.is_none() => { // Production systems should always have a compute hook set, to prevent falling @@ -382,10 +386,12 @@ async fn async_main() -> anyhow::Result<()> { let service = service.clone(); let cancel = CancellationToken::new(); let cancel_bg = cancel.clone(); + let chaos_exit_crontab = args.chaos_exit_crontab; ( tokio::task::spawn( async move { - let mut chaos_injector = ChaosInjector::new(service, interval.into()); + let mut chaos_injector = + ChaosInjector::new(service, interval.into(), chaos_exit_crontab); chaos_injector.run(cancel_bg).await } .instrument(tracing::info_span!("chaos_injector")), diff --git a/storage_controller/src/metrics.rs b/storage_controller/src/metrics.rs index 6d67e0d130..f490edb68f 100644 --- a/storage_controller/src/metrics.rs +++ b/storage_controller/src/metrics.rs @@ -7,17 +7,18 @@ //! //! The rest of the code defines label group types and deals with converting outer types to labels. //! +use std::sync::Mutex; + use bytes::Bytes; -use measured::{label::LabelValue, metric::histogram, FixedCardinalityLabel, MetricGroup}; +use measured::label::LabelValue; +use measured::metric::histogram; +use measured::{FixedCardinalityLabel, MetricGroup}; use metrics::NeonMetrics; use once_cell::sync::Lazy; -use std::sync::Mutex; use strum::IntoEnumIterator; -use crate::{ - persistence::{DatabaseError, DatabaseOperation}, - service::LeadershipStatus, -}; +use crate::persistence::{DatabaseError, DatabaseOperation}; +use crate::service::LeadershipStatus; pub(crate) static METRICS_REGISTRY: Lazy = Lazy::new(StorageControllerMetrics::default); diff --git a/storage_controller/src/node.rs b/storage_controller/src/node.rs index 3762d13c10..bc7fe8802a 100644 --- a/storage_controller/src/node.rs +++ b/storage_controller/src/node.rs @@ -1,22 +1,22 @@ -use std::{str::FromStr, time::Duration}; +use std::str::FromStr; +use std::time::Duration; use anyhow::anyhow; -use pageserver_api::{ - controller_api::{ - AvailabilityZone, NodeAvailability, NodeDescribeResponse, NodeRegisterRequest, - NodeSchedulingPolicy, TenantLocateResponseShard, - }, - shard::TenantShardId, +use pageserver_api::controller_api::{ + AvailabilityZone, NodeAvailability, NodeDescribeResponse, NodeRegisterRequest, + NodeSchedulingPolicy, TenantLocateResponseShard, }; +use pageserver_api::shard::TenantShardId; use pageserver_client::mgmt_api; use reqwest::StatusCode; use serde::Serialize; use tokio_util::sync::CancellationToken; -use utils::{backoff, id::NodeId}; +use utils::backoff; +use utils::id::NodeId; -use crate::{ - pageserver_client::PageserverClient, persistence::NodePersistence, scheduler::MaySchedule, -}; +use crate::pageserver_client::PageserverClient; +use crate::persistence::NodePersistence; +use crate::scheduler::MaySchedule; /// Represents the in-memory description of a Node. /// diff --git a/storage_controller/src/pageserver_client.rs b/storage_controller/src/pageserver_client.rs index 645cbdfce1..d6127c355a 100644 --- a/storage_controller/src/pageserver_client.rs +++ b/storage_controller/src/pageserver_client.rs @@ -1,17 +1,13 @@ -use pageserver_api::{ - models::{ - detach_ancestor::AncestorDetached, LocationConfig, LocationConfigListResponse, - PageserverUtilization, SecondaryProgress, TenantScanRemoteStorageResponse, - TenantShardSplitRequest, TenantShardSplitResponse, TenantWaitLsnRequest, - TimelineArchivalConfigRequest, TimelineCreateRequest, TimelineInfo, TopTenantShardsRequest, - TopTenantShardsResponse, - }, - shard::TenantShardId, -}; -use pageserver_client::{ - mgmt_api::{Client, Result}, - BlockUnblock, +use pageserver_api::models::detach_ancestor::AncestorDetached; +use pageserver_api::models::{ + LocationConfig, LocationConfigListResponse, PageserverUtilization, SecondaryProgress, + TenantScanRemoteStorageResponse, TenantShardSplitRequest, TenantShardSplitResponse, + TenantWaitLsnRequest, TimelineArchivalConfigRequest, TimelineCreateRequest, TimelineInfo, + TopTenantShardsRequest, TopTenantShardsResponse, }; +use pageserver_api::shard::TenantShardId; +use pageserver_client::BlockUnblock; +use pageserver_client::mgmt_api::{Client, Result}; use reqwest::StatusCode; use utils::id::{NodeId, TenantId, TimelineId}; @@ -285,13 +281,19 @@ impl PageserverClient { tenant_shard_id: TenantShardId, timeline_id: TimelineId, concurrency: Option, + recurse: bool, ) -> Result<()> { measured_request!( "download_heatmap_layers", crate::metrics::Method::Post, &self.node_id_label, self.inner - .timeline_download_heatmap_layers(tenant_shard_id, timeline_id, concurrency) + .timeline_download_heatmap_layers( + tenant_shard_id, + timeline_id, + concurrency, + recurse + ) .await ) } diff --git a/storage_controller/src/peer_client.rs b/storage_controller/src/peer_client.rs index 1a15bae365..f3f275dee0 100644 --- a/storage_controller/src/peer_client.rs +++ b/storage_controller/src/peer_client.rs @@ -1,16 +1,17 @@ -use crate::tenant_shard::ObservedState; -use pageserver_api::shard::TenantShardId; -use serde::{Deserialize, Serialize}; use std::collections::HashMap; use std::error::Error as _; use std::time::Duration; -use tokio_util::sync::CancellationToken; use http_utils::error::HttpErrorBody; use hyper::Uri; +use pageserver_api::shard::TenantShardId; use reqwest::{StatusCode, Url}; +use serde::{Deserialize, Serialize}; +use tokio_util::sync::CancellationToken; use utils::backoff; +use crate::tenant_shard::ObservedState; + #[derive(Debug, Clone)] pub(crate) struct PeerClient { uri: Uri, diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 459c11add9..2e80b48859 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -2,45 +2,40 @@ pub(crate) mod split_state; use std::collections::HashMap; use std::str::FromStr; use std::sync::Arc; -use std::time::Duration; -use std::time::Instant; +use std::time::{Duration, Instant}; -use self::split_state::SplitState; +use diesel::deserialize::{FromSql, FromSqlRow}; +use diesel::pg::Pg; use diesel::prelude::*; use diesel_async::async_connection_wrapper::AsyncConnectionWrapper; use diesel_async::pooled_connection::bb8::Pool; -use diesel_async::pooled_connection::AsyncDieselConnectionManager; -use diesel_async::pooled_connection::ManagerConfig; -use diesel_async::AsyncPgConnection; -use diesel_async::RunQueryDsl; -use futures::future::BoxFuture; +use diesel_async::pooled_connection::{AsyncDieselConnectionManager, ManagerConfig}; +use diesel_async::{AsyncPgConnection, RunQueryDsl}; +use diesel_migrations::{EmbeddedMigrations, embed_migrations}; use futures::FutureExt; +use futures::future::BoxFuture; use itertools::Itertools; -use pageserver_api::controller_api::AvailabilityZone; -use pageserver_api::controller_api::MetadataHealthRecord; -use pageserver_api::controller_api::SafekeeperDescribeResponse; -use pageserver_api::controller_api::ShardSchedulingPolicy; -use pageserver_api::controller_api::SkSchedulingPolicy; -use pageserver_api::controller_api::{NodeSchedulingPolicy, PlacementPolicy}; +use pageserver_api::controller_api::{ + AvailabilityZone, MetadataHealthRecord, NodeSchedulingPolicy, PlacementPolicy, + SafekeeperDescribeResponse, ShardSchedulingPolicy, SkSchedulingPolicy, +}; use pageserver_api::models::TenantConfig; -use pageserver_api::shard::ShardConfigError; -use pageserver_api::shard::ShardIdentity; -use pageserver_api::shard::ShardStripeSize; -use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId}; -use rustls::client::danger::{ServerCertVerified, ServerCertVerifier}; +use pageserver_api::shard::{ + ShardConfigError, ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId, +}; use rustls::client::WebPkiServerVerifier; +use rustls::client::danger::{ServerCertVerified, ServerCertVerifier}; use rustls::crypto::ring; use scoped_futures::ScopedBoxFuture; use serde::{Deserialize, Serialize}; use utils::generation::Generation; use utils::id::{NodeId, TenantId}; +use self::split_state::SplitState; use crate::metrics::{ DatabaseQueryErrorLabelGroup, DatabaseQueryLatencyLabelGroup, METRICS_REGISTRY, }; use crate::node::Node; - -use diesel_migrations::{embed_migrations, EmbeddedMigrations}; const MIGRATIONS: EmbeddedMigrations = embed_migrations!("./migrations"); /// ## What do we store? @@ -479,8 +474,7 @@ impl Persistence { &self, shards: Vec, ) -> DatabaseResult<()> { - use crate::schema::metadata_health; - use crate::schema::tenant_shards; + use crate::schema::{metadata_health, tenant_shards}; let now = chrono::Utc::now(); @@ -554,8 +548,7 @@ impl Persistence { &self, input_node_id: NodeId, ) -> DatabaseResult> { - use crate::schema::nodes::dsl::scheduling_policy; - use crate::schema::nodes::dsl::*; + use crate::schema::nodes::dsl::{scheduling_policy, *}; use crate::schema::tenant_shards::dsl::*; let updated = self .with_measured_conn(DatabaseOperation::ReAttach, move |conn| { @@ -1565,7 +1558,33 @@ pub(crate) struct SafekeeperPersistence { pub(crate) port: i32, pub(crate) http_port: i32, pub(crate) availability_zone_id: String, - pub(crate) scheduling_policy: String, + pub(crate) scheduling_policy: SkSchedulingPolicyFromSql, +} + +/// Wrapper struct around [`SkSchedulingPolicy`] because both it and [`FromSql`] are from foreign crates, +/// and we don't want to make [`safekeeper_api`] depend on [`diesel`]. +#[derive(Serialize, Deserialize, FromSqlRow, Eq, PartialEq, Debug, Copy, Clone)] +pub(crate) struct SkSchedulingPolicyFromSql(pub(crate) SkSchedulingPolicy); + +impl From for SkSchedulingPolicyFromSql { + fn from(value: SkSchedulingPolicy) -> Self { + SkSchedulingPolicyFromSql(value) + } +} + +impl FromSql for SkSchedulingPolicyFromSql { + fn from_sql( + bytes: ::RawValue<'_>, + ) -> diesel::deserialize::Result { + let bytes = bytes.as_bytes(); + match core::str::from_utf8(bytes) { + Ok(s) => match SkSchedulingPolicy::from_str(s) { + Ok(policy) => Ok(SkSchedulingPolicyFromSql(policy)), + Err(e) => Err(format!("can't parse: {e}").into()), + }, + Err(e) => Err(format!("invalid UTF-8 for scheduling policy: {e}").into()), + } + } } impl SafekeeperPersistence { @@ -1581,14 +1600,10 @@ impl SafekeeperPersistence { port: upsert.port, http_port: upsert.http_port, availability_zone_id: upsert.availability_zone_id, - scheduling_policy: String::from(scheduling_policy), + scheduling_policy: SkSchedulingPolicyFromSql(scheduling_policy), } } pub(crate) fn as_describe_response(&self) -> Result { - let scheduling_policy = - SkSchedulingPolicy::from_str(&self.scheduling_policy).map_err(|e| { - DatabaseError::Logical(format!("can't construct SkSchedulingPolicy: {e:?}")) - })?; Ok(SafekeeperDescribeResponse { id: NodeId(self.id as u64), region_id: self.region_id.clone(), @@ -1597,7 +1612,7 @@ impl SafekeeperPersistence { port: self.port, http_port: self.http_port, availability_zone_id: self.availability_zone_id.clone(), - scheduling_policy, + scheduling_policy: self.scheduling_policy.0, }) } } diff --git a/storage_controller/src/persistence/split_state.rs b/storage_controller/src/persistence/split_state.rs index bce1a75843..f83191038a 100644 --- a/storage_controller/src/persistence/split_state.rs +++ b/storage_controller/src/persistence/split_state.rs @@ -1,8 +1,8 @@ +use diesel::deserialize::{FromSql, FromSqlRow}; +use diesel::expression::AsExpression; use diesel::pg::{Pg, PgValue}; -use diesel::{ - deserialize::FromSql, deserialize::FromSqlRow, expression::AsExpression, serialize::ToSql, - sql_types::Int2, -}; +use diesel::serialize::ToSql; +use diesel::sql_types::Int2; use serde::{Deserialize, Serialize}; #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord, FromSqlRow, AsExpression)] diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index 4f0f170284..a327f6f50f 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -1,6 +1,8 @@ -use crate::pageserver_client::PageserverClient; -use crate::persistence::Persistence; -use crate::{compute_hook, service}; +use std::borrow::Cow; +use std::collections::HashMap; +use std::sync::Arc; +use std::time::{Duration, Instant}; + use json_structural_diff::JsonDiff; use pageserver_api::controller_api::{AvailabilityZone, MigrationConfig, PlacementPolicy}; use pageserver_api::models::{ @@ -9,10 +11,6 @@ use pageserver_api::models::{ use pageserver_api::shard::{ShardIdentity, TenantShardId}; use pageserver_client::mgmt_api; use reqwest::StatusCode; -use std::borrow::Cow; -use std::collections::HashMap; -use std::sync::Arc; -use std::time::{Duration, Instant}; use tokio_util::sync::CancellationToken; use utils::backoff::exponential_backoff; use utils::generation::Generation; @@ -23,7 +21,10 @@ use utils::sync::gate::GateGuard; use crate::compute_hook::{ComputeHook, NotifyError}; use crate::node::Node; +use crate::pageserver_client::PageserverClient; +use crate::persistence::Persistence; use crate::tenant_shard::{IntentState, ObservedState, ObservedStateDelta, ObservedStateLocation}; +use crate::{compute_hook, service}; const DEFAULT_HEATMAP_PERIOD: Duration = Duration::from_secs(60); @@ -511,7 +512,8 @@ impl Reconciler { } else if status == StatusCode::ACCEPTED { let total_runtime = started_at.elapsed(); if total_runtime > total_download_timeout { - tracing::warn!("Timed out after {}ms downloading layers to {node}. Progress so far: {}/{} layers, {}/{} bytes", + tracing::warn!( + "Timed out after {}ms downloading layers to {node}. Progress so far: {}/{} layers, {}/{} bytes", total_runtime.as_millis(), progress.layers_downloaded, progress.layers_total, diff --git a/storage_controller/src/safekeeper.rs b/storage_controller/src/safekeeper.rs index 53cd8a908b..9c7e6e0894 100644 --- a/storage_controller/src/safekeeper.rs +++ b/storage_controller/src/safekeeper.rs @@ -1,16 +1,16 @@ -use std::{str::FromStr, time::Duration}; +use std::time::Duration; use pageserver_api::controller_api::{SafekeeperDescribeResponse, SkSchedulingPolicy}; use reqwest::StatusCode; use safekeeper_client::mgmt_api; use tokio_util::sync::CancellationToken; -use utils::{backoff, id::NodeId, logging::SecretString}; +use utils::backoff; +use utils::id::NodeId; +use utils::logging::SecretString; -use crate::{ - heartbeater::SafekeeperState, - persistence::{DatabaseError, SafekeeperPersistence}, - safekeeper_client::SafekeeperClient, -}; +use crate::heartbeater::SafekeeperState; +use crate::persistence::{DatabaseError, SafekeeperPersistence}; +use crate::safekeeper_client::SafekeeperClient; #[derive(Clone)] pub struct Safekeeper { @@ -25,7 +25,7 @@ pub struct Safekeeper { impl Safekeeper { pub(crate) fn from_persistence(skp: SafekeeperPersistence, cancel: CancellationToken) -> Self { - let scheduling_policy = SkSchedulingPolicy::from_str(&skp.scheduling_policy).unwrap(); + let scheduling_policy = skp.scheduling_policy.0; Self { cancel, listen_http_addr: skp.host.clone(), @@ -54,7 +54,7 @@ impl Safekeeper { } pub(crate) fn set_scheduling_policy(&mut self, scheduling_policy: SkSchedulingPolicy) { self.scheduling_policy = scheduling_policy; - self.skp.scheduling_policy = String::from(scheduling_policy); + self.skp.scheduling_policy = scheduling_policy.into(); } /// Perform an operation (which is given a [`SafekeeperClient`]) with retries pub(crate) async fn with_client_retries( diff --git a/storage_controller/src/safekeeper_client.rs b/storage_controller/src/safekeeper_client.rs index f234ab3429..fb5be092a0 100644 --- a/storage_controller/src/safekeeper_client.rs +++ b/storage_controller/src/safekeeper_client.rs @@ -1,13 +1,12 @@ -use crate::metrics::PageserverRequestLabelGroup; use safekeeper_api::models::{ PullTimelineRequest, PullTimelineResponse, SafekeeperUtilization, TimelineCreateRequest, TimelineStatus, }; use safekeeper_client::mgmt_api::{Client, Result}; -use utils::{ - id::{NodeId, TenantId, TimelineId}, - logging::SecretString, -}; +use utils::id::{NodeId, TenantId, TimelineId}; +use utils::logging::SecretString; + +use crate::metrics::PageserverRequestLabelGroup; /// Thin wrapper around [`safekeeper_client::mgmt_api::Client`]. It allows the storage /// controller to collect metrics in a non-intrusive manner. diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs index 44936d018a..817cf04fe1 100644 --- a/storage_controller/src/scheduler.rs +++ b/storage_controller/src/scheduler.rs @@ -1,11 +1,17 @@ -use crate::{metrics::NodeLabelGroup, node::Node, tenant_shard::TenantShard}; +use std::collections::HashMap; +use std::fmt::Debug; + use http_utils::error::ApiError; use itertools::Itertools; -use pageserver_api::{controller_api::AvailabilityZone, models::PageserverUtilization}; +use pageserver_api::controller_api::AvailabilityZone; +use pageserver_api::models::PageserverUtilization; use serde::Serialize; -use std::{collections::HashMap, fmt::Debug}; use utils::id::NodeId; +use crate::metrics::NodeLabelGroup; +use crate::node::Node; +use crate::tenant_shard::TenantShard; + /// Scenarios in which we cannot find a suitable location for a tenant shard #[derive(thiserror::Error, Debug)] pub enum ScheduleError { @@ -775,10 +781,10 @@ impl Scheduler { if !matches!(context.mode, ScheduleMode::Speculative) { tracing::info!( - "scheduler selected node {node_id} (elegible nodes {:?}, hard exclude: {hard_exclude:?}, soft exclude: {context:?}, preferred_az: {:?})", - scores.iter().map(|i| i.node_id().0).collect::>(), - preferred_az, - ); + "scheduler selected node {node_id} (elegible nodes {:?}, hard exclude: {hard_exclude:?}, soft exclude: {context:?}, preferred_az: {:?})", + scores.iter().map(|i| i.node_id().0).collect::>(), + preferred_az, + ); } // Note that we do not update shard count here to reflect the scheduling: that @@ -906,14 +912,14 @@ impl Scheduler { #[cfg(test)] pub(crate) mod test_utils { - use crate::node::Node; - use pageserver_api::{ - controller_api::{AvailabilityZone, NodeAvailability}, - models::utilization::test_utilization, - }; use std::collections::HashMap; + + use pageserver_api::controller_api::{AvailabilityZone, NodeAvailability}; + use pageserver_api::models::utilization::test_utilization; use utils::id::NodeId; + use crate::node::Node; + /// Test helper: synthesize the requested number of nodes, all in active state. /// /// Node IDs start at one. @@ -951,17 +957,13 @@ pub(crate) mod test_utils { #[cfg(test)] mod tests { - use pageserver_api::{ - controller_api::NodeAvailability, models::utilization::test_utilization, - shard::ShardIdentity, - }; - use utils::{ - id::TenantId, - shard::{ShardCount, ShardNumber, TenantShardId}, - }; + use pageserver_api::controller_api::NodeAvailability; + use pageserver_api::models::utilization::test_utilization; + use pageserver_api::shard::ShardIdentity; + use utils::id::TenantId; + use utils::shard::{ShardCount, ShardNumber, TenantShardId}; use super::*; - use crate::tenant_shard::IntentState; #[test] fn scheduler_basic() -> anyhow::Result<()> { diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index b9c2711192..9ba9504718 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -1,112 +1,95 @@ pub mod chaos_injector; mod context_iterator; -use hyper::Uri; -use safekeeper_api::models::SafekeeperUtilization; -use std::{ - borrow::Cow, - cmp::Ordering, - collections::{BTreeMap, HashMap, HashSet}, - error::Error, - ops::Deref, - path::PathBuf, - str::FromStr, - sync::Arc, - time::{Duration, Instant}, -}; +use std::borrow::Cow; +use std::cmp::Ordering; +use std::collections::{BTreeMap, HashMap, HashSet}; +use std::error::Error; +use std::ops::Deref; +use std::path::PathBuf; +use std::str::FromStr; +use std::sync::Arc; +use std::time::{Duration, Instant}; -use crate::{ - background_node_operations::{ - Drain, Fill, Operation, OperationError, OperationHandler, MAX_RECONCILES_PER_OPERATION, - }, - compute_hook::{self, NotifyError}, - drain_utils::{self, TenantShardDrain, TenantShardIterator}, - heartbeater::SafekeeperState, - id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, TracingExclusiveGuard}, - leadership::Leadership, - metrics, - peer_client::GlobalObservedState, - persistence::{ - AbortShardSplitStatus, ControllerPersistence, DatabaseResult, MetadataHealthPersistence, - ShardGenerationState, TenantFilter, - }, - reconciler::{ - ReconcileError, ReconcileUnits, ReconcilerConfig, ReconcilerConfigBuilder, - ReconcilerPriority, - }, - safekeeper::Safekeeper, - scheduler::{MaySchedule, ScheduleContext, ScheduleError, ScheduleMode}, - tenant_shard::{ - MigrateAttachment, ObservedStateDelta, ReconcileNeeded, ReconcilerStatus, - ScheduleOptimization, ScheduleOptimizationAction, - }, -}; use anyhow::Context; +use context_iterator::TenantShardContextIterator; use control_plane::storage_controller::{ AttachHookRequest, AttachHookResponse, InspectRequest, InspectResponse, }; use diesel::result::DatabaseErrorKind; -use futures::{stream::FuturesUnordered, StreamExt}; -use itertools::Itertools; -use pageserver_api::{ - controller_api::{ - AvailabilityZone, MetadataHealthRecord, MetadataHealthUpdateRequest, NodeAvailability, - NodeRegisterRequest, NodeSchedulingPolicy, NodeShard, NodeShardResponse, PlacementPolicy, - SafekeeperDescribeResponse, ShardSchedulingPolicy, ShardsPreferredAzsRequest, - ShardsPreferredAzsResponse, SkSchedulingPolicy, TenantCreateRequest, TenantCreateResponse, - TenantCreateResponseShard, TenantDescribeResponse, TenantDescribeResponseShard, - TenantLocateResponse, TenantPolicyRequest, TenantShardMigrateRequest, - TenantShardMigrateResponse, - }, - models::{ - SecondaryProgress, TenantConfigPatchRequest, TenantConfigRequest, - TimelineArchivalConfigRequest, TopTenantShardsRequest, - }, -}; -use reqwest::StatusCode; -use tracing::{instrument, Instrument}; - -use crate::pageserver_client::PageserverClient; +use futures::StreamExt; +use futures::stream::FuturesUnordered; use http_utils::error::ApiError; -use pageserver_api::{ - models::{ - self, LocationConfig, LocationConfigListResponse, LocationConfigMode, - PageserverUtilization, ShardParameters, TenantConfig, TenantLocationConfigRequest, - TenantLocationConfigResponse, TenantShardLocation, TenantShardSplitRequest, - TenantShardSplitResponse, TenantTimeTravelRequest, TimelineCreateRequest, TimelineInfo, - }, - shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId}, - upcall_api::{ - ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest, - ValidateResponse, ValidateResponseTenant, - }, +use hyper::Uri; +use itertools::Itertools; +use pageserver_api::controller_api::{ + AvailabilityZone, MetadataHealthRecord, MetadataHealthUpdateRequest, NodeAvailability, + NodeRegisterRequest, NodeSchedulingPolicy, NodeShard, NodeShardResponse, PlacementPolicy, + SafekeeperDescribeResponse, ShardSchedulingPolicy, ShardsPreferredAzsRequest, + ShardsPreferredAzsResponse, SkSchedulingPolicy, TenantCreateRequest, TenantCreateResponse, + TenantCreateResponseShard, TenantDescribeResponse, TenantDescribeResponseShard, + TenantLocateResponse, TenantPolicyRequest, TenantShardMigrateRequest, + TenantShardMigrateResponse, }; -use pageserver_client::{mgmt_api, BlockUnblock}; -use tokio::sync::{mpsc::error::TrySendError, TryAcquireError}; +use pageserver_api::models::{ + self, LocationConfig, LocationConfigListResponse, LocationConfigMode, PageserverUtilization, + SecondaryProgress, ShardParameters, TenantConfig, TenantConfigPatchRequest, + TenantConfigRequest, TenantLocationConfigRequest, TenantLocationConfigResponse, + TenantShardLocation, TenantShardSplitRequest, TenantShardSplitResponse, + TenantTimeTravelRequest, TimelineArchivalConfigRequest, TimelineCreateRequest, TimelineInfo, + TopTenantShardsRequest, +}; +use pageserver_api::shard::{ + ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId, +}; +use pageserver_api::upcall_api::{ + ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest, ValidateResponse, + ValidateResponseTenant, +}; +use pageserver_client::{BlockUnblock, mgmt_api}; +use reqwest::StatusCode; +use safekeeper_api::models::SafekeeperUtilization; +use tokio::sync::TryAcquireError; +use tokio::sync::mpsc::error::TrySendError; use tokio_util::sync::CancellationToken; -use utils::{ - completion::Barrier, - failpoint_support, - generation::Generation, - id::{NodeId, TenantId, TimelineId}, - pausable_failpoint, - sync::gate::Gate, -}; +use tracing::{Instrument, instrument}; +use utils::completion::Barrier; +use utils::generation::Generation; +use utils::id::{NodeId, TenantId, TimelineId}; +use utils::sync::gate::Gate; +use utils::{failpoint_support, pausable_failpoint}; -use crate::{ - compute_hook::ComputeHook, - heartbeater::{Heartbeater, PageserverState}, - node::{AvailabilityTransition, Node}, - persistence::{split_state::SplitState, DatabaseError, Persistence, TenantShardPersistence}, - reconciler::attached_location_conf, - scheduler::Scheduler, - tenant_shard::{ - IntentState, ObservedState, ObservedStateLocation, ReconcileResult, ReconcileWaitError, - ReconcilerWaiter, TenantShard, - }, +use crate::background_node_operations::{ + Drain, Fill, MAX_RECONCILES_PER_OPERATION, Operation, OperationError, OperationHandler, +}; +use crate::compute_hook::{self, ComputeHook, NotifyError}; +use crate::drain_utils::{self, TenantShardDrain, TenantShardIterator}; +use crate::heartbeater::{Heartbeater, PageserverState, SafekeeperState}; +use crate::id_lock_map::{ + IdLockMap, TracingExclusiveGuard, trace_exclusive_lock, trace_shared_lock, +}; +use crate::leadership::Leadership; +use crate::metrics; +use crate::node::{AvailabilityTransition, Node}; +use crate::pageserver_client::PageserverClient; +use crate::peer_client::GlobalObservedState; +use crate::persistence::split_state::SplitState; +use crate::persistence::{ + AbortShardSplitStatus, ControllerPersistence, DatabaseError, DatabaseResult, + MetadataHealthPersistence, Persistence, ShardGenerationState, TenantFilter, + TenantShardPersistence, +}; +use crate::reconciler::{ + ReconcileError, ReconcileUnits, ReconcilerConfig, ReconcilerConfigBuilder, ReconcilerPriority, + attached_location_conf, +}; +use crate::safekeeper::Safekeeper; +use crate::scheduler::{MaySchedule, ScheduleContext, ScheduleError, ScheduleMode, Scheduler}; +use crate::tenant_shard::{ + IntentState, MigrateAttachment, ObservedState, ObservedStateDelta, ObservedStateLocation, + ReconcileNeeded, ReconcileResult, ReconcileWaitError, ReconcilerStatus, ReconcilerWaiter, + ScheduleOptimization, ScheduleOptimizationAction, TenantShard, }; - -use context_iterator::TenantShardContextIterator; const WAITER_FILL_DRAIN_POLL_TIMEOUT: Duration = Duration::from_millis(500); @@ -787,7 +770,9 @@ impl Service { }); } - tracing::info!("Startup complete, spawned {reconcile_tasks} reconciliation tasks ({shard_count} shards total)"); + tracing::info!( + "Startup complete, spawned {reconcile_tasks} reconciliation tasks ({shard_count} shards total)" + ); } async fn initial_heartbeat_round<'a>( @@ -1182,7 +1167,9 @@ impl Service { let mut safekeepers = (*locked.safekeepers).clone(); for (id, state) in deltas.0 { let Some(sk) = safekeepers.get_mut(&id) else { - tracing::info!("Couldn't update safekeeper safekeeper state for id {id} from heartbeat={state:?}"); + tracing::info!( + "Couldn't update safekeeper safekeeper state for id {id} from heartbeat={state:?}" + ); continue; }; sk.set_availability(state); @@ -1537,7 +1524,9 @@ impl Service { // If a node was removed before being completely drained, it is legal for it to leave behind a `generation_pageserver` referring // to a non-existent node, because node deletion doesn't block on completing the reconciliations that will issue new generations // on different pageservers. - tracing::warn!("Tenant shard {tenant_shard_id} references non-existent node {generation_pageserver} in database, will be rescheduled"); + tracing::warn!( + "Tenant shard {tenant_shard_id} references non-existent node {generation_pageserver} in database, will be rescheduled" + ); } } let new_tenant = TenantShard::from_persistent(tsp, intent)?; @@ -1867,7 +1856,7 @@ impl Service { } Ok(AttachHookResponse { - gen: attach_req + generation: attach_req .node_id .map(|_| tenant_shard.generation.expect("Test hook, not used on tenants that are mid-onboarding with a NULL generation").into().unwrap()), }) @@ -2039,7 +2028,7 @@ impl Service { let new_gen = *new_gen; response.tenants.push(ReAttachResponseTenant { id: *tenant_shard_id, - gen: Some(new_gen.into().unwrap()), + r#gen: Some(new_gen.into().unwrap()), // A tenant is only put into multi or stale modes in the middle of a [`Reconciler::live_migrate`] // execution. If a pageserver is restarted during that process, then the reconcile pass will // fail, and start from scratch, so it doesn't make sense for us to try and preserve @@ -2076,7 +2065,7 @@ impl Service { response.tenants.push(ReAttachResponseTenant { id: *tenant_shard_id, - gen: None, + r#gen: None, mode: LocationConfigMode::Secondary, }); @@ -2138,15 +2127,19 @@ impl Service { let locked = self.inner.read().unwrap(); for req_tenant in validate_req.tenants { if let Some(tenant_shard) = locked.tenants.get(&req_tenant.id) { - let valid = tenant_shard.generation == Some(Generation::new(req_tenant.gen)); + let valid = tenant_shard.generation == Some(Generation::new(req_tenant.r#gen)); tracing::info!( "handle_validate: {}(gen {}): valid={valid} (latest {:?})", req_tenant.id, - req_tenant.gen, + req_tenant.r#gen, tenant_shard.generation ); - in_memory_result.push((req_tenant.id, Generation::new(req_tenant.gen), valid)); + in_memory_result.push(( + req_tenant.id, + Generation::new(req_tenant.r#gen), + valid, + )); } else { // This is legal: for example during a shard split the pageserver may still // have deletions in its queue from the old pre-split shard, or after deletion @@ -2165,13 +2158,11 @@ impl Service { // in case of controller split-brain, where some other controller process might have incremented the generation. let db_generations = self .persistence - .shard_generations(in_memory_result.iter().filter_map(|i| { - if i.2 { - Some(&i.0) - } else { - None - } - })) + .shard_generations( + in_memory_result + .iter() + .filter_map(|i| if i.2 { Some(&i.0) } else { None }), + ) .await?; let db_generations = db_generations.into_iter().collect::>(); @@ -2323,7 +2314,9 @@ impl Service { // Unique key violation: this is probably a retry. Because the shard count is part of the unique key, // if we see a unique key violation it means that the creation request's shard count matches the previous // creation's shard count. - tracing::info!("Tenant shards already present in database, proceeding with idempotent creation..."); + tracing::info!( + "Tenant shards already present in database, proceeding with idempotent creation..." + ); } // Any other database error is unexpected and a bug. Err(e) => return Err(ApiError::InternalServerError(anyhow::anyhow!(e))), @@ -3004,7 +2997,7 @@ impl Service { None => { return Err(ApiError::NotFound( anyhow::anyhow!("Tenant not found").into(), - )) + )); } } }; @@ -3071,7 +3064,9 @@ impl Service { }) .find(|(_, _, mode)| *mode != LocationConfigMode::Detached); if let Some((node_id, _observed_location, mode)) = maybe_attached { - return Err(ApiError::InternalServerError(anyhow::anyhow!("We observed attached={mode:?} tenant in node_id={node_id} shard with tenant_shard_id={shard_id}"))); + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "We observed attached={mode:?} tenant in node_id={node_id} shard with tenant_shard_id={shard_id}" + ))); } } let scheduler = &mut locked.scheduler; @@ -3779,6 +3774,7 @@ impl Service { tenant_shard_id: TenantShardId, timeline_id: TimelineId, concurrency: Option, + recurse: bool, ) -> Result<(), ApiError> { let _tenant_lock = trace_shared_lock( &self.tenant_op_locks, @@ -3816,7 +3812,12 @@ impl Service { targets, |tenant_shard_id, client| async move { client - .timeline_download_heatmap_layers(tenant_shard_id, timeline_id, concurrency) + .timeline_download_heatmap_layers( + tenant_shard_id, + timeline_id, + concurrency, + recurse, + ) .await }, 1, @@ -3944,7 +3945,9 @@ impl Service { // This can only happen if there is a split brain controller modifying the database. This should // never happen when testing, and if it happens in production we can only log the issue. debug_assert!(false); - tracing::error!("Shard {shard_id} not found in generation state! Is another rogue controller running?"); + tracing::error!( + "Shard {shard_id} not found in generation state! Is another rogue controller running?" + ); continue; }; let (generation, generation_pageserver) = generation; @@ -3953,13 +3956,17 @@ impl Service { // This is legitimate only in a very narrow window where the shard was only just configured into // Attached mode after being created in Secondary or Detached mode, and it has had its generation // set but not yet had a Reconciler run (reconciler is the only thing that sets generation_pageserver). - tracing::warn!("Shard {shard_id} generation is set ({generation:?}) but generation_pageserver is None, reconciler not run yet?"); + tracing::warn!( + "Shard {shard_id} generation is set ({generation:?}) but generation_pageserver is None, reconciler not run yet?" + ); } } else { // This should never happen: a shard with no generation is only permitted when it was created in some state // other than PlacementPolicy::Attached (and generation is always written to DB before setting Attached in memory) debug_assert!(false); - tracing::error!("Shard {shard_id} generation is None, but it is in PlacementPolicy::Attached mode!"); + tracing::error!( + "Shard {shard_id} generation is None, but it is in PlacementPolicy::Attached mode!" + ); continue; } } @@ -4157,16 +4164,14 @@ impl Service { }).await? } - /// When you need to send an HTTP request to the pageserver that holds shard0 of a tenant, this - /// function looks up and returns node. If the tenant isn't found, returns Err(ApiError::NotFound) + /// When you know the TenantId but not a specific shard, and would like to get the node holding shard 0. pub(crate) async fn tenant_shard0_node( &self, tenant_id: TenantId, ) -> Result<(Node, TenantShardId), ApiError> { - // Look up in-memory state and maybe use the node from there. - { + let tenant_shard_id = { let locked = self.inner.read().unwrap(); - let Some((tenant_shard_id, shard)) = locked + let Some((tenant_shard_id, _shard)) = locked .tenants .range(TenantShardId::tenant_range(tenant_id)) .next() @@ -4176,6 +4181,29 @@ impl Service { )); }; + *tenant_shard_id + }; + + self.tenant_shard_node(tenant_shard_id) + .await + .map(|node| (node, tenant_shard_id)) + } + + /// When you need to send an HTTP request to the pageserver that holds a shard of a tenant, this + /// function looks up and returns node. If the shard isn't found, returns Err(ApiError::NotFound) + pub(crate) async fn tenant_shard_node( + &self, + tenant_shard_id: TenantShardId, + ) -> Result { + // Look up in-memory state and maybe use the node from there. + { + let locked = self.inner.read().unwrap(); + let Some(shard) = locked.tenants.get(&tenant_shard_id) else { + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant shard {tenant_shard_id} not found").into(), + )); + }; + let Some(intent_node_id) = shard.intent.get_attached() else { tracing::warn!( tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), @@ -4196,7 +4224,7 @@ impl Service { "Shard refers to nonexistent node" ))); }; - return Ok((node.clone(), *tenant_shard_id)); + return Ok(node.clone()); } }; @@ -4204,29 +4232,34 @@ impl Service { // generation state: this will reflect the progress of any ongoing migration. // Note that it is not guaranteed to _stay_ here, our caller must still handle // the case where they call through to the pageserver and get a 404. - let db_result = self.persistence.tenant_generations(tenant_id).await?; + let db_result = self + .persistence + .tenant_generations(tenant_shard_id.tenant_id) + .await?; let Some(ShardGenerationState { - tenant_shard_id, + tenant_shard_id: _, generation: _, generation_pageserver: Some(node_id), - }) = db_result.first() + }) = db_result + .into_iter() + .find(|s| s.tenant_shard_id == tenant_shard_id) else { // This can happen if we raced with a tenant deletion or a shard split. On a retry // the caller will either succeed (shard split case), get a proper 404 (deletion case), // or a conflict response (case where tenant was detached in background) return Err(ApiError::ResourceUnavailable( - "Shard {} not found in database, or is not attached".into(), + format!("Shard {tenant_shard_id} not found in database, or is not attached").into(), )); }; let locked = self.inner.read().unwrap(); - let Some(node) = locked.nodes.get(node_id) else { + let Some(node) = locked.nodes.get(&node_id) else { // This should never happen return Err(ApiError::InternalServerError(anyhow::anyhow!( "Shard refers to nonexistent node" ))); }; - Ok((node.clone(), *tenant_shard_id)) + Ok(node.clone()) } pub(crate) fn tenant_locate( @@ -4492,13 +4525,17 @@ impl Service { // if the original attachment location is offline. if let Some(node_id) = shard.intent.get_attached() { if !nodes.get(node_id).unwrap().is_available() { - tracing::info!("Demoting attached intent for {tenant_shard_id} on unavailable node {node_id}"); + tracing::info!( + "Demoting attached intent for {tenant_shard_id} on unavailable node {node_id}" + ); shard.intent.demote_attached(scheduler, *node_id); } } for node_id in shard.intent.get_secondary().clone() { if !nodes.get(&node_id).unwrap().is_available() { - tracing::info!("Dropping secondary intent for {tenant_shard_id} on unavailable node {node_id}"); + tracing::info!( + "Dropping secondary intent for {tenant_shard_id} on unavailable node {node_id}" + ); shard.intent.remove_secondary(scheduler, node_id); } } @@ -4526,7 +4563,9 @@ impl Service { // rely on the reconciliation that happens when a node transitions to Active to clean up. Since we have // removed child shards from our in-memory state and database, the reconciliation will implicitly remove // them from the node. - tracing::warn!("Node {node} unavailable, can't clean up during split abort. It will be cleaned up when it is reactivated."); + tracing::warn!( + "Node {node} unavailable, can't clean up during split abort. It will be cleaned up when it is reactivated." + ); continue; } @@ -4971,7 +5010,10 @@ impl Service { // applies the new stripe size to the children. let mut shard_ident = shard_ident.unwrap(); if shard_ident.count.count() > 1 && shard_ident.stripe_size != new_stripe_size { - return Err(ApiError::BadRequest(anyhow::anyhow!("Attempted to change stripe size ({:?}->{new_stripe_size:?}) on a tenant with multiple shards", shard_ident.stripe_size))); + return Err(ApiError::BadRequest(anyhow::anyhow!( + "Attempted to change stripe size ({:?}->{new_stripe_size:?}) on a tenant with multiple shards", + shard_ident.stripe_size + ))); } shard_ident.stripe_size = new_stripe_size; @@ -5226,8 +5268,11 @@ impl Service { ) .await { - tracing::warn!("Failed to update compute of {}->{} during split, proceeding anyway to complete split ({e})", - child_id, child_ps); + tracing::warn!( + "Failed to update compute of {}->{} during split, proceeding anyway to complete split ({e})", + child_id, + child_ps + ); failed_notifications.push(child_id); } } @@ -5283,9 +5328,13 @@ impl Service { match shard.policy { PlacementPolicy::Attached(n) => { // If our new attached node was a secondary, it no longer should be. - shard.intent.remove_secondary(scheduler, migrate_req.node_id); + shard + .intent + .remove_secondary(scheduler, migrate_req.node_id); - shard.intent.set_attached(scheduler, Some(migrate_req.node_id)); + shard + .intent + .set_attached(scheduler, Some(migrate_req.node_id)); // If we were already attached to something, demote that to a secondary if let Some(old_attached) = old_attached { @@ -5306,7 +5355,7 @@ impl Service { PlacementPolicy::Detached => { return Err(ApiError::BadRequest(anyhow::anyhow!( "Cannot migrate a tenant that is PlacementPolicy::Detached: configure it to an attached policy first" - ))) + ))); } } @@ -5367,7 +5416,9 @@ impl Service { shard.intent ); } else if shard.intent.get_attached() == &Some(migrate_req.node_id) { - tracing::info!("Migrating secondary to {node}: already attached where we were asked to create a secondary"); + tracing::info!( + "Migrating secondary to {node}: already attached where we were asked to create a secondary" + ); } else { let old_secondaries = shard.intent.get_secondary().clone(); for secondary in old_secondaries { @@ -5880,7 +5931,7 @@ impl Service { return Err(ApiError::InternalServerError(anyhow::anyhow!( "{} attached as primary+secondary on the same node", tid - ))) + ))); } (true, false) => Some(false), (false, true) => Some(true), @@ -6923,12 +6974,16 @@ impl Service { // Check that maybe_optimizable doesn't disagree with the actual optimization functions. // Only do this in testing builds because it is not a correctness-critical check, so we shouldn't // panic in prod if we hit this, or spend cycles on it in prod. - assert!(shard - .optimize_attachment(scheduler, &schedule_context) - .is_none()); - assert!(shard - .optimize_secondary(scheduler, &schedule_context) - .is_none()); + assert!( + shard + .optimize_attachment(scheduler, &schedule_context) + .is_none() + ); + assert!( + shard + .optimize_secondary(scheduler, &schedule_context) + .is_none() + ); } continue; } @@ -6984,7 +7039,9 @@ impl Service { } Some(node) => { if !node.is_available() { - tracing::info!("Skipping optimization migration of {tenant_shard_id} to {new_attached_node_id} because node unavailable"); + tracing::info!( + "Skipping optimization migration of {tenant_shard_id} to {new_attached_node_id} because node unavailable" + ); } else { // Accumulate optimizations that require fetching secondary status, so that we can execute these // remote API requests concurrently. @@ -7030,7 +7087,9 @@ impl Service { { match secondary_status { Err(e) => { - tracing::info!("Skipping migration of {tenant_shard_id} to {node}, error querying secondary: {e}"); + tracing::info!( + "Skipping migration of {tenant_shard_id} to {node}, error querying secondary: {e}" + ); } Ok(progress) => { // We require secondary locations to have less than 10GiB of downloads pending before we will use @@ -7043,7 +7102,9 @@ impl Service { || progress.bytes_total - progress.bytes_downloaded > DOWNLOAD_FRESHNESS_THRESHOLD { - tracing::info!("Skipping migration of {tenant_shard_id} to {node} because secondary isn't ready: {progress:?}"); + tracing::info!( + "Skipping migration of {tenant_shard_id} to {node} because secondary isn't ready: {progress:?}" + ); #[cfg(feature = "testing")] if progress.heatmap_mtime.is_none() { @@ -7149,14 +7210,18 @@ impl Service { { Some(Err(e)) => { tracing::info!( - "Failed to download heatmap from {secondary_node} for {tenant_shard_id}: {e}" - ); + "Failed to download heatmap from {secondary_node} for {tenant_shard_id}: {e}" + ); } None => { - tracing::info!("Cancelled while downloading heatmap from {secondary_node} for {tenant_shard_id}"); + tracing::info!( + "Cancelled while downloading heatmap from {secondary_node} for {tenant_shard_id}" + ); } Some(Ok(progress)) => { - tracing::info!("Successfully downloaded heatmap from {secondary_node} for {tenant_shard_id}: {progress:?}"); + tracing::info!( + "Successfully downloaded heatmap from {secondary_node} for {tenant_shard_id}: {progress:?}" + ); } } } @@ -7241,7 +7306,9 @@ impl Service { // We spawn a task to run this, so it's exactly like some external API client requesting it. We don't // want to block the background reconcile loop on this. - tracing::info!("Auto-splitting tenant for size threshold {split_threshold}: current size {split_candidate:?}"); + tracing::info!( + "Auto-splitting tenant for size threshold {split_threshold}: current size {split_candidate:?}" + ); let this = self.clone(); tokio::spawn( diff --git a/storage_controller/src/service/chaos_injector.rs b/storage_controller/src/service/chaos_injector.rs index aa0ee0df5a..2ff68d7037 100644 --- a/storage_controller/src/service/chaos_injector.rs +++ b/storage_controller/src/service/chaos_injector.rs @@ -1,8 +1,6 @@ -use std::{ - collections::{BTreeMap, HashMap}, - sync::Arc, - time::Duration, -}; +use std::collections::{BTreeMap, HashMap}; +use std::sync::Arc; +use std::time::Duration; use pageserver_api::controller_api::ShardSchedulingPolicy; use rand::seq::SliceRandom; @@ -16,29 +14,80 @@ use super::{Node, Scheduler, Service, TenantShard}; pub struct ChaosInjector { service: Arc, interval: Duration, + chaos_exit_crontab: Option, +} + +fn cron_to_next_duration(cron: &cron::Schedule) -> anyhow::Result { + use chrono::Utc; + let next = cron.upcoming(Utc).next().unwrap(); + let duration = (next - Utc::now()).to_std()?; + Ok(tokio::time::sleep(duration)) +} + +async fn maybe_sleep(sleep: Option) -> Option<()> { + if let Some(sleep) = sleep { + sleep.await; + Some(()) + } else { + None + } } impl ChaosInjector { - pub fn new(service: Arc, interval: Duration) -> Self { - Self { service, interval } + pub fn new( + service: Arc, + interval: Duration, + chaos_exit_crontab: Option, + ) -> Self { + Self { + service, + interval, + chaos_exit_crontab, + } } pub async fn run(&mut self, cancel: CancellationToken) { let mut interval = tokio::time::interval(self.interval); - - loop { - tokio::select! { - _ = interval.tick() => {} - _ = cancel.cancelled() => { - tracing::info!("Shutting down"); - return; + let cron_interval = { + if let Some(ref chaos_exit_crontab) = self.chaos_exit_crontab { + match cron_to_next_duration(chaos_exit_crontab) { + Ok(interval_exit) => Some(interval_exit), + Err(e) => { + tracing::error!("Error processing the cron schedule: {e}"); + None + } } + } else { + None } - - self.inject_chaos().await; - - tracing::info!("Chaos iteration..."); + }; + enum ChaosEvent { + ShuffleTenant, + ForceKill, } + let chaos_type = tokio::select! { + _ = interval.tick() => { + ChaosEvent::ShuffleTenant + } + Some(_) = maybe_sleep(cron_interval) => { + ChaosEvent::ForceKill + } + _ = cancel.cancelled() => { + tracing::info!("Shutting down"); + return; + } + }; + + match chaos_type { + ChaosEvent::ShuffleTenant => { + self.inject_chaos().await; + } + ChaosEvent::ForceKill => { + self.force_kill().await; + } + } + + tracing::info!("Chaos iteration..."); } /// If a shard has a secondary and attached location, then re-assign the secondary to be @@ -95,6 +144,11 @@ impl ChaosInjector { ); } + async fn force_kill(&mut self) { + tracing::warn!("Injecting chaos: force kill"); + std::process::exit(1); + } + async fn inject_chaos(&mut self) { // Pick some shards to interfere with let batch_size = 128; @@ -120,12 +174,19 @@ impl ChaosInjector { let mut victims = Vec::with_capacity(batch_size); if out_of_home_az.len() >= batch_size { - tracing::info!("Injecting chaos: found {batch_size} shards to migrate back to home AZ (total {} out of home AZ)", out_of_home_az.len()); + tracing::info!( + "Injecting chaos: found {batch_size} shards to migrate back to home AZ (total {} out of home AZ)", + out_of_home_az.len() + ); out_of_home_az.shuffle(&mut thread_rng()); victims.extend(out_of_home_az.into_iter().take(batch_size)); } else { - tracing::info!("Injecting chaos: found {} shards to migrate back to home AZ, picking {} random shards to migrate", out_of_home_az.len(), std::cmp::min(batch_size - out_of_home_az.len(), in_home_az.len())); + tracing::info!( + "Injecting chaos: found {} shards to migrate back to home AZ, picking {} random shards to migrate", + out_of_home_az.len(), + std::cmp::min(batch_size - out_of_home_az.len(), in_home_az.len()) + ); victims.extend(out_of_home_az); in_home_az.shuffle(&mut thread_rng()); diff --git a/storage_controller/src/service/context_iterator.rs b/storage_controller/src/service/context_iterator.rs index dd6913e988..c4784e5e36 100644 --- a/storage_controller/src/service/context_iterator.rs +++ b/storage_controller/src/service/context_iterator.rs @@ -54,17 +54,16 @@ impl<'a> Iterator for TenantShardContextIterator<'a> { #[cfg(test)] mod tests { - use std::{collections::BTreeMap, str::FromStr}; + use std::collections::BTreeMap; + use std::str::FromStr; use pageserver_api::controller_api::PlacementPolicy; use utils::shard::{ShardCount, ShardNumber}; - use crate::{ - scheduler::test_utils::make_test_nodes, service::Scheduler, - tenant_shard::tests::make_test_tenant_with_id, - }; - use super::*; + use crate::scheduler::test_utils::make_test_nodes; + use crate::service::Scheduler; + use crate::tenant_shard::tests::make_test_tenant_with_id; #[test] fn test_context_iterator() { diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index 56a36dc2df..34fd244023 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -1,50 +1,39 @@ -use std::{ - collections::{HashMap, HashSet}, - sync::Arc, - time::Duration, -}; +use std::collections::{HashMap, HashSet}; +use std::sync::Arc; +use std::time::Duration; -use crate::{ - metrics::{ - self, ReconcileCompleteLabelGroup, ReconcileLongRunningLabelGroup, ReconcileOutcome, - }, - persistence::TenantShardPersistence, - reconciler::{ReconcileUnits, ReconcilerConfig}, - scheduler::{ - AffinityScore, AttachedShardTag, NodeSchedulingScore, NodeSecondarySchedulingScore, - RefCountUpdate, ScheduleContext, SecondaryShardTag, ShardTag, - }, - service::ReconcileResultRequest, -}; use futures::future::{self, Either}; use itertools::Itertools; use pageserver_api::controller_api::{AvailabilityZone, PlacementPolicy, ShardSchedulingPolicy}; -use pageserver_api::{ - models::{LocationConfig, LocationConfigMode, TenantConfig}, - shard::{ShardIdentity, TenantShardId}, -}; +use pageserver_api::models::{LocationConfig, LocationConfigMode, TenantConfig}; +use pageserver_api::shard::{ShardIdentity, TenantShardId}; use serde::{Deserialize, Serialize}; use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; -use tracing::{instrument, Instrument}; -use utils::{ - generation::Generation, - id::NodeId, - seqwait::{SeqWait, SeqWaitError}, - shard::ShardCount, - sync::gate::GateGuard, -}; +use tracing::{Instrument, instrument}; +use utils::generation::Generation; +use utils::id::NodeId; +use utils::seqwait::{SeqWait, SeqWaitError}; +use utils::shard::ShardCount; +use utils::sync::gate::GateGuard; -use crate::{ - compute_hook::ComputeHook, - node::Node, - persistence::{split_state::SplitState, Persistence}, - reconciler::{ - attached_location_conf, secondary_location_conf, ReconcileError, Reconciler, TargetState, - }, - scheduler::{ScheduleError, Scheduler}, - service, Sequence, +use crate::compute_hook::ComputeHook; +use crate::metrics::{ + self, ReconcileCompleteLabelGroup, ReconcileLongRunningLabelGroup, ReconcileOutcome, }; +use crate::node::Node; +use crate::persistence::split_state::SplitState; +use crate::persistence::{Persistence, TenantShardPersistence}; +use crate::reconciler::{ + ReconcileError, ReconcileUnits, Reconciler, ReconcilerConfig, TargetState, + attached_location_conf, secondary_location_conf, +}; +use crate::scheduler::{ + AffinityScore, AttachedShardTag, NodeSchedulingScore, NodeSecondarySchedulingScore, + RefCountUpdate, ScheduleContext, ScheduleError, Scheduler, SecondaryShardTag, ShardTag, +}; +use crate::service::ReconcileResultRequest; +use crate::{Sequence, service}; /// Serialization helper fn read_last_error(v: &std::sync::Mutex>, serializer: S) -> Result @@ -835,7 +824,9 @@ impl TenantShard { let current_score = current_score.for_optimization(); if candidate_score < current_score { - tracing::info!("Found a lower scoring location! {candidate} is better than {current} ({candidate_score:?} is better than {current_score:?})"); + tracing::info!( + "Found a lower scoring location! {candidate} is better than {current} ({candidate_score:?} is better than {current_score:?})" + ); Some(true) } else { // The candidate node is no better than our current location, so don't migrate @@ -1005,7 +996,7 @@ impl TenantShard { // most cases, even if some nodes are offline or have scheduling=pause set. debug_assert!(self.intent.attached.is_some()); // We should not make it here unless attached -- this - // logic presumes we are in a mode where we want secondaries to be in non-home AZ + // logic presumes we are in a mode where we want secondaries to be in non-home AZ if let Some(retain_secondary) = self.intent.get_secondary().iter().find(|n| { let in_home_az = scheduler.get_node_az(n) == self.intent.preferred_az_id; let is_available = secondary_scores @@ -1029,7 +1020,8 @@ impl TenantShard { } // Fall through: we didn't identify one to remove. This ought to be rare. - tracing::warn!("Keeping extra secondaries: can't determine which of {:?} to remove (some nodes offline?)", + tracing::warn!( + "Keeping extra secondaries: can't determine which of {:?} to remove (some nodes offline?)", self.intent.get_secondary() ); } else { @@ -1798,8 +1790,8 @@ impl TenantShard { let conf = observed.conf.as_ref()?; match (conf.generation, conf.mode) { - (Some(gen), AttachedMulti | AttachedSingle | AttachedStale) => { - Some((*node_id, gen)) + (Some(gen_), AttachedMulti | AttachedSingle | AttachedStale) => { + Some((*node_id, gen_)) } _ => None, } @@ -1807,7 +1799,7 @@ impl TenantShard { .sorted_by(|(_lhs_node_id, lhs_gen), (_rhs_node_id, rhs_gen)| { lhs_gen.cmp(rhs_gen).reverse() }) - .map(|(node_id, gen)| (node_id, Generation::new(gen))) + .map(|(node_id, gen_)| (node_id, Generation::new(gen_))) .collect() } @@ -1839,7 +1831,10 @@ impl TenantShard { (Some(crnt), Some(new)) if crnt_gen > new_gen => { tracing::warn!( "Skipping observed state update {}: {:?} and using None due to stale generation ({} > {})", - node_id, loc, crnt, new + node_id, + loc, + crnt, + new ); self.observed @@ -1896,18 +1891,17 @@ impl Drop for TenantShard { #[cfg(test)] pub(crate) mod tests { - use std::{cell::RefCell, rc::Rc}; + use std::cell::RefCell; + use std::rc::Rc; - use pageserver_api::{ - controller_api::NodeAvailability, - shard::{ShardCount, ShardNumber}, - }; - use rand::{rngs::StdRng, SeedableRng}; + use pageserver_api::controller_api::NodeAvailability; + use pageserver_api::shard::{ShardCount, ShardNumber}; + use rand::SeedableRng; + use rand::rngs::StdRng; use utils::id::TenantId; - use crate::scheduler::test_utils::make_test_nodes; - use super::*; + use crate::scheduler::test_utils::make_test_nodes; fn make_test_tenant_shard(policy: PlacementPolicy) -> TenantShard { let tenant_id = TenantId::generate(); @@ -2085,16 +2079,20 @@ pub(crate) mod tests { // In pause mode, schedule() shouldn't do anything tenant_shard.scheduling_policy = ShardSchedulingPolicy::Pause; - assert!(tenant_shard - .schedule(&mut scheduler, &mut ScheduleContext::default()) - .is_ok()); + assert!( + tenant_shard + .schedule(&mut scheduler, &mut ScheduleContext::default()) + .is_ok() + ); assert!(tenant_shard.intent.all_pageservers().is_empty()); // In active mode, schedule() works tenant_shard.scheduling_policy = ShardSchedulingPolicy::Active; - assert!(tenant_shard - .schedule(&mut scheduler, &mut ScheduleContext::default()) - .is_ok()); + assert!( + tenant_shard + .schedule(&mut scheduler, &mut ScheduleContext::default()) + .is_ok() + ); assert!(!tenant_shard.intent.all_pageservers().is_empty()); tenant_shard.intent.clear(&mut scheduler); @@ -2621,9 +2619,11 @@ pub(crate) mod tests { ); let mut schedule_context = ScheduleContext::default(); for shard in &mut shards { - assert!(shard - .schedule(&mut scheduler, &mut schedule_context) - .is_ok()); + assert!( + shard + .schedule(&mut scheduler, &mut schedule_context) + .is_ok() + ); } // Initial: attached locations land in the tenant's home AZ. diff --git a/storage_scrubber/Cargo.toml b/storage_scrubber/Cargo.toml index 609f3bf009..7f6544b894 100644 --- a/storage_scrubber/Cargo.toml +++ b/storage_scrubber/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "storage_scrubber" version = "0.1.0" -edition.workspace = true +edition = "2024" license.workspace = true [dependencies] diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs index b42709868b..f0ba632fd4 100644 --- a/storage_scrubber/src/checks.rs +++ b/storage_scrubber/src/checks.rs @@ -1,12 +1,19 @@ use std::collections::{HashMap, HashSet}; use std::time::SystemTime; +use futures_util::StreamExt; use itertools::Itertools; +use pageserver::tenant::IndexPart; use pageserver::tenant::checks::check_valid_layermap; use pageserver::tenant::layer_map::LayerMap; use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; use pageserver::tenant::remote_timeline_client::manifest::TenantManifest; +use pageserver::tenant::remote_timeline_client::{ + parse_remote_index_path, parse_remote_tenant_manifest_path, remote_layer_path, +}; +use pageserver::tenant::storage_layer::LayerName; use pageserver_api::shard::ShardIndex; +use remote_storage::{GenericRemoteStorage, ListingObject, RemotePath}; use tokio_util::sync::CancellationToken; use tracing::{info, warn}; use utils::generation::Generation; @@ -15,14 +22,7 @@ use utils::shard::TenantShardId; use crate::cloud_admin_api::BranchData; use crate::metadata_stream::stream_listing; -use crate::{download_object_with_retries, RootTarget, TenantShardTimelineId}; -use futures_util::StreamExt; -use pageserver::tenant::remote_timeline_client::{ - parse_remote_index_path, parse_remote_tenant_manifest_path, remote_layer_path, -}; -use pageserver::tenant::storage_layer::LayerName; -use pageserver::tenant::IndexPart; -use remote_storage::{GenericRemoteStorage, ListingObject, RemotePath}; +use crate::{RootTarget, TenantShardTimelineId, download_object_with_retries}; pub(crate) struct TimelineAnalysis { /// Anomalies detected @@ -329,11 +329,11 @@ pub(crate) enum BlobDataParseResult { pub(crate) fn parse_layer_object_name(name: &str) -> Result<(LayerName, Generation), String> { match name.rsplit_once('-') { // FIXME: this is gross, just use a regex? - Some((layer_filename, gen)) if gen.len() == 8 => { + Some((layer_filename, gen_)) if gen_.len() == 8 => { let layer = layer_filename.parse::()?; - let gen = - Generation::parse_suffix(gen).ok_or("Malformed generation suffix".to_string())?; - Ok((layer, gen)) + let gen_ = + Generation::parse_suffix(gen_).ok_or("Malformed generation suffix".to_string())?; + Ok((layer, gen_)) } _ => Ok((name.parse::()?, Generation::none())), } @@ -423,9 +423,9 @@ async fn list_timeline_blobs_impl( tracing::info!("initdb archive preserved {key}"); } Some(maybe_layer_name) => match parse_layer_object_name(maybe_layer_name) { - Ok((new_layer, gen)) => { - tracing::debug!("Parsed layer key: {new_layer} {gen:?}"); - s3_layers.insert((new_layer, gen)); + Ok((new_layer, gen_)) => { + tracing::debug!("Parsed layer key: {new_layer} {gen_:?}"); + s3_layers.insert((new_layer, gen_)); } Err(e) => { tracing::info!("Error parsing {maybe_layer_name} as layer name: {e}"); @@ -465,7 +465,7 @@ async fn list_timeline_blobs_impl( .max_by_key(|i| i.1) .map(|(k, g)| (k.clone(), g)) { - Some((key, gen)) => (Some::(key.to_owned()), gen), + Some((key, gen_)) => (Some::(key.to_owned()), gen_), None => { // Legacy/missing case: one or zero index parts, which did not have a generation (index_part_keys.pop(), Generation::none()) @@ -521,7 +521,7 @@ async fn list_timeline_blobs_impl( }, unused_index_keys: index_part_keys, unknown_keys, - })) + })); } Err(index_parse_error) => errors.push(format!( "index_part.json body parsing error: {index_parse_error}" @@ -631,7 +631,7 @@ pub(crate) async fn list_tenant_manifests( .map(|(g, obj)| (*g, obj.clone())) .unwrap(); - manifests.retain(|(gen, _obj)| gen != &latest_generation); + manifests.retain(|(gen_, _obj)| gen_ != &latest_generation); let manifest_bytes = match download_object_with_retries(remote_client, &latest_listing_object.key).await { diff --git a/storage_scrubber/src/cloud_admin_api.rs b/storage_scrubber/src/cloud_admin_api.rs index b1dfe3a53f..5cf286c662 100644 --- a/storage_scrubber/src/cloud_admin_api.rs +++ b/storage_scrubber/src/cloud_admin_api.rs @@ -3,11 +3,9 @@ use std::error::Error as _; use chrono::{DateTime, Utc}; use futures::Future; use hex::FromHex; - -use reqwest::{header, Client, StatusCode, Url}; +use reqwest::{Client, StatusCode, Url, header}; use serde::Deserialize; use tokio::sync::Semaphore; - use tokio_util::sync::CancellationToken; use utils::backoff; use utils::id::{TenantId, TimelineId}; diff --git a/storage_scrubber/src/find_large_objects.rs b/storage_scrubber/src/find_large_objects.rs index 95d3af1453..efb05fb55e 100644 --- a/storage_scrubber/src/find_large_objects.rs +++ b/storage_scrubber/src/find_large_objects.rs @@ -5,10 +5,9 @@ use pageserver::tenant::storage_layer::LayerName; use remote_storage::ListingMode; use serde::{Deserialize, Serialize}; -use crate::{ - checks::parse_layer_object_name, init_remote, metadata_stream::stream_tenants, - stream_objects_with_retries, BucketConfig, NodeKind, -}; +use crate::checks::parse_layer_object_name; +use crate::metadata_stream::stream_tenants; +use crate::{BucketConfig, NodeKind, init_remote, stream_objects_with_retries}; #[derive(Serialize, Deserialize, Clone, Copy, PartialEq, Eq)] enum LargeObjectKind { diff --git a/storage_scrubber/src/garbage.rs b/storage_scrubber/src/garbage.rs index a4e5107e3d..e4f69a1669 100644 --- a/storage_scrubber/src/garbage.rs +++ b/storage_scrubber/src/garbage.rs @@ -3,11 +3,9 @@ //! Garbage means S3 objects which are either not referenced by any metadata, //! or are referenced by a control plane tenant/timeline in a deleted state. -use std::{ - collections::{HashMap, HashSet}, - sync::Arc, - time::Duration, -}; +use std::collections::{HashMap, HashSet}; +use std::sync::Arc; +use std::time::Duration; use anyhow::Context; use futures_util::TryStreamExt; @@ -16,13 +14,14 @@ use remote_storage::{GenericRemoteStorage, ListingMode, ListingObject, RemotePat use serde::{Deserialize, Serialize}; use tokio_stream::StreamExt; use tokio_util::sync::CancellationToken; -use utils::{backoff, id::TenantId}; +use utils::backoff; +use utils::id::TenantId; +use crate::cloud_admin_api::{CloudAdminApiClient, MaybeDeleted, ProjectData}; +use crate::metadata_stream::{stream_tenant_timelines, stream_tenants_maybe_prefix}; use crate::{ - cloud_admin_api::{CloudAdminApiClient, MaybeDeleted, ProjectData}, + BucketConfig, ConsoleConfig, MAX_RETRIES, NodeKind, TenantShardTimelineId, TraversingDepth, init_remote, list_objects_with_retries, - metadata_stream::{stream_tenant_timelines, stream_tenants_maybe_prefix}, - BucketConfig, ConsoleConfig, NodeKind, TenantShardTimelineId, TraversingDepth, MAX_RETRIES, }; #[derive(Serialize, Deserialize, Debug)] @@ -259,14 +258,21 @@ async fn find_garbage_inner( .await?; if let Some(object) = tenant_objects.keys.first() { if object.key.get_path().as_str().ends_with("heatmap-v1.json") { - tracing::info!("Tenant {tenant_shard_id}: is missing in console and is only a heatmap (known historic deletion bug)"); + tracing::info!( + "Tenant {tenant_shard_id}: is missing in console and is only a heatmap (known historic deletion bug)" + ); garbage.append_buggy(GarbageEntity::Tenant(tenant_shard_id)); continue; } else { - tracing::info!("Tenant {tenant_shard_id} is missing in console and contains one object: {}", object.key); + tracing::info!( + "Tenant {tenant_shard_id} is missing in console and contains one object: {}", + object.key + ); } } else { - tracing::info!("Tenant {tenant_shard_id} is missing in console appears to have been deleted while we ran"); + tracing::info!( + "Tenant {tenant_shard_id} is missing in console appears to have been deleted while we ran" + ); } } else { // A console-unknown tenant with timelines: check if these timelines only contain initdb.tar.zst, from the initial @@ -295,9 +301,13 @@ async fn find_garbage_inner( } if any_non_initdb { - tracing::info!("Tenant {tenant_shard_id}: is missing in console and contains timelines, one or more of which are more than just initdb"); + tracing::info!( + "Tenant {tenant_shard_id}: is missing in console and contains timelines, one or more of which are more than just initdb" + ); } else { - tracing::info!("Tenant {tenant_shard_id}: is missing in console and contains only timelines that only contain initdb"); + tracing::info!( + "Tenant {tenant_shard_id}: is missing in console and contains only timelines that only contain initdb" + ); garbage.append_buggy(GarbageEntity::Tenant(tenant_shard_id)); continue; } @@ -546,7 +556,9 @@ pub async fn purge_garbage( .any(|g| matches!(g.entity, GarbageEntity::Timeline(_))) && garbage_list.active_timeline_count == 0 { - anyhow::bail!("Refusing to purge a garbage list containing garbage timelines that reports 0 active timelines"); + anyhow::bail!( + "Refusing to purge a garbage list containing garbage timelines that reports 0 active timelines" + ); } let filtered_items = garbage_list diff --git a/storage_scrubber/src/lib.rs b/storage_scrubber/src/lib.rs index 224235098c..34e43fcc0b 100644 --- a/storage_scrubber/src/lib.rs +++ b/storage_scrubber/src/lib.rs @@ -17,15 +17,14 @@ use std::time::{Duration, SystemTime}; use anyhow::Context; use aws_config::retry::{RetryConfigBuilder, RetryMode}; +use aws_sdk_s3::Client; use aws_sdk_s3::config::Region; use aws_sdk_s3::error::DisplayErrorContext; -use aws_sdk_s3::Client; - use camino::{Utf8Path, Utf8PathBuf}; use clap::ValueEnum; use futures::{Stream, StreamExt}; -use pageserver::tenant::remote_timeline_client::{remote_tenant_path, remote_timeline_path}; use pageserver::tenant::TENANTS_SEGMENT_NAME; +use pageserver::tenant::remote_timeline_client::{remote_tenant_path, remote_timeline_path}; use pageserver_api::shard::TenantShardId; use remote_storage::{ DownloadOpts, GenericRemoteStorage, Listing, ListingMode, RemotePath, RemoteStorageConfig, @@ -38,7 +37,8 @@ use tokio::io::AsyncReadExt; use tokio_util::sync::CancellationToken; use tracing::{error, warn}; use tracing_appender::non_blocking::WorkerGuard; -use tracing_subscriber::{fmt, prelude::*, EnvFilter}; +use tracing_subscriber::prelude::*; +use tracing_subscriber::{EnvFilter, fmt}; use utils::fs_ext; use utils::id::{TenantId, TenantTimelineId, TimelineId}; @@ -411,10 +411,10 @@ async fn init_remote( let default_prefix = default_prefix_in_bucket(node_kind).to_string(); match &mut storage_config.0.storage { - RemoteStorageKind::AwsS3(ref mut config) => { + RemoteStorageKind::AwsS3(config) => { config.prefix_in_bucket.get_or_insert(default_prefix); } - RemoteStorageKind::AzureContainer(ref mut config) => { + RemoteStorageKind::AzureContainer(config) => { config.prefix_in_container.get_or_insert(default_prefix); } RemoteStorageKind::LocalFs { .. } => (), diff --git a/storage_scrubber/src/main.rs b/storage_scrubber/src/main.rs index fa6ee90b66..fb2ab02565 100644 --- a/storage_scrubber/src/main.rs +++ b/storage_scrubber/src/main.rs @@ -1,24 +1,20 @@ -use anyhow::{anyhow, bail, Context}; +use anyhow::{Context, anyhow, bail}; use camino::Utf8PathBuf; +use clap::{Parser, Subcommand}; use pageserver_api::controller_api::{MetadataHealthUpdateRequest, MetadataHealthUpdateResponse}; use pageserver_api::shard::TenantShardId; use reqwest::{Method, Url}; use storage_controller_client::control_api; -use storage_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode}; -use storage_scrubber::pageserver_physical_gc::GcMode; +use storage_scrubber::garbage::{PurgeMode, find_garbage, purge_garbage}; +use storage_scrubber::pageserver_physical_gc::{GcMode, pageserver_physical_gc}; use storage_scrubber::scan_pageserver_metadata::scan_pageserver_metadata; -use storage_scrubber::scan_safekeeper_metadata::DatabaseOrList; +use storage_scrubber::scan_safekeeper_metadata::{DatabaseOrList, scan_safekeeper_metadata}; use storage_scrubber::tenant_snapshot::SnapshotDownloader; -use storage_scrubber::{find_large_objects, ControllerClientConfig}; use storage_scrubber::{ - init_logging, pageserver_physical_gc::pageserver_physical_gc, - scan_safekeeper_metadata::scan_safekeeper_metadata, BucketConfig, ConsoleConfig, NodeKind, - TraversingDepth, + BucketConfig, ConsoleConfig, ControllerClientConfig, NodeKind, TraversingDepth, + find_large_objects, init_logging, }; - -use clap::{Parser, Subcommand}; use utils::id::TenantId; - use utils::{project_build_tag, project_git_version}; project_git_version!(GIT_VERSION); @@ -173,15 +169,23 @@ async fn main() -> anyhow::Result<()> { if let NodeKind::Safekeeper = node_kind { let db_or_list = match (timeline_lsns, dump_db_connstr) { (Some(timeline_lsns), _) => { - let timeline_lsns = serde_json::from_str(&timeline_lsns).context("parsing timeline_lsns")?; + let timeline_lsns = serde_json::from_str(&timeline_lsns) + .context("parsing timeline_lsns")?; DatabaseOrList::List(timeline_lsns) } (None, Some(dump_db_connstr)) => { - let dump_db_table = dump_db_table.ok_or_else(|| anyhow::anyhow!("dump_db_table not specified"))?; + let dump_db_table = dump_db_table + .ok_or_else(|| anyhow::anyhow!("dump_db_table not specified"))?; let tenant_ids = tenant_ids.iter().map(|tshid| tshid.tenant_id).collect(); - DatabaseOrList::Database { tenant_ids, connstr: dump_db_connstr, table: dump_db_table } + DatabaseOrList::Database { + tenant_ids, + connstr: dump_db_connstr, + table: dump_db_table, + } } - (None, None) => anyhow::bail!("neither `timeline_lsns` specified, nor `dump_db_connstr` and `dump_db_table`"), + (None, None) => anyhow::bail!( + "neither `timeline_lsns` specified, nor `dump_db_connstr` and `dump_db_table`" + ), }; let summary = scan_safekeeper_metadata(bucket_config.clone(), db_or_list).await?; if json { @@ -371,7 +375,9 @@ pub async fn scan_pageserver_metadata_cmd( exit_code: bool, ) -> anyhow::Result<()> { if controller_client.is_none() && post_to_storcon { - return Err(anyhow!("Posting pageserver scan health status to storage controller requires `--controller-api` and `--controller-jwt` to run")); + return Err(anyhow!( + "Posting pageserver scan health status to storage controller requires `--controller-api` and `--controller-jwt` to run" + )); } match scan_pageserver_metadata(bucket_config.clone(), tenant_shard_ids, verbose).await { Err(e) => { diff --git a/storage_scrubber/src/metadata_stream.rs b/storage_scrubber/src/metadata_stream.rs index 47447d681c..af2407856d 100644 --- a/storage_scrubber/src/metadata_stream.rs +++ b/storage_scrubber/src/metadata_stream.rs @@ -1,17 +1,17 @@ use std::str::FromStr; -use anyhow::{anyhow, Context}; +use anyhow::{Context, anyhow}; use async_stream::{stream, try_stream}; use futures::StreamExt; +use pageserver_api::shard::TenantShardId; use remote_storage::{GenericRemoteStorage, ListingMode, ListingObject, RemotePath}; use tokio_stream::Stream; +use utils::id::{TenantId, TimelineId}; use crate::{ - list_objects_with_retries, stream_objects_with_retries, RootTarget, S3Target, - TenantShardTimelineId, + RootTarget, S3Target, TenantShardTimelineId, list_objects_with_retries, + stream_objects_with_retries, }; -use pageserver_api::shard::TenantShardId; -use utils::id::{TenantId, TimelineId}; /// Given a remote storage and a target, output a stream of TenantIds discovered via listing prefixes pub fn stream_tenants<'a>( diff --git a/storage_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs index 063c6bcfb9..c956b1abbc 100644 --- a/storage_scrubber/src/pageserver_physical_gc.rs +++ b/storage_scrubber/src/pageserver_physical_gc.rs @@ -2,22 +2,16 @@ use std::collections::{BTreeMap, BTreeSet, HashMap}; use std::sync::Arc; use std::time::Duration; -use crate::checks::{ - list_tenant_manifests, list_timeline_blobs, BlobDataParseResult, ListTenantManifestResult, - RemoteTenantManifestInfo, -}; -use crate::metadata_stream::{stream_tenant_timelines, stream_tenants}; -use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId, MAX_RETRIES}; use async_stream::try_stream; use futures::future::Either; use futures_util::{StreamExt, TryStreamExt}; +use pageserver::tenant::IndexPart; use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; use pageserver::tenant::remote_timeline_client::manifest::OffloadedTimelineManifest; use pageserver::tenant::remote_timeline_client::{ parse_remote_index_path, parse_remote_tenant_manifest_path, remote_layer_path, }; use pageserver::tenant::storage_layer::LayerName; -use pageserver::tenant::IndexPart; use pageserver_api::controller_api::TenantDescribeResponse; use pageserver_api::shard::{ShardIndex, TenantShardId}; use remote_storage::{GenericRemoteStorage, ListingObject, RemotePath}; @@ -25,11 +19,18 @@ use reqwest::Method; use serde::Serialize; use storage_controller_client::control_api; use tokio_util::sync::CancellationToken; -use tracing::{info_span, Instrument}; +use tracing::{Instrument, info_span}; use utils::backoff; use utils::generation::Generation; use utils::id::{TenantId, TenantTimelineId}; +use crate::checks::{ + BlobDataParseResult, ListTenantManifestResult, RemoteTenantManifestInfo, list_tenant_manifests, + list_timeline_blobs, +}; +use crate::metadata_stream::{stream_tenant_timelines, stream_tenants}; +use crate::{BucketConfig, MAX_RETRIES, NodeKind, RootTarget, TenantShardTimelineId, init_remote}; + #[derive(Serialize, Default)] pub struct GcSummary { indices_deleted: usize, diff --git a/storage_scrubber/src/scan_pageserver_metadata.rs b/storage_scrubber/src/scan_pageserver_metadata.rs index a31fb5b242..ba75f25984 100644 --- a/storage_scrubber/src/scan_pageserver_metadata.rs +++ b/storage_scrubber/src/scan_pageserver_metadata.rs @@ -1,21 +1,22 @@ use std::collections::{HashMap, HashSet}; -use crate::checks::{ - branch_cleanup_and_check_errors, list_timeline_blobs, BlobDataParseResult, - RemoteTimelineBlobData, TenantObjectListing, TimelineAnalysis, -}; -use crate::metadata_stream::{stream_tenant_timelines, stream_tenants}; -use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId}; use futures_util::{StreamExt, TryStreamExt}; use pageserver::tenant::remote_timeline_client::remote_layer_path; use pageserver_api::controller_api::MetadataHealthUpdateRequest; use pageserver_api::shard::TenantShardId; use remote_storage::GenericRemoteStorage; use serde::Serialize; -use tracing::{info_span, Instrument}; +use tracing::{Instrument, info_span}; use utils::id::TenantId; use utils::shard::ShardCount; +use crate::checks::{ + BlobDataParseResult, RemoteTimelineBlobData, TenantObjectListing, TimelineAnalysis, + branch_cleanup_and_check_errors, list_timeline_blobs, +}; +use crate::metadata_stream::{stream_tenant_timelines, stream_tenants}; +use crate::{BucketConfig, NodeKind, RootTarget, TenantShardTimelineId, init_remote}; + #[derive(Serialize, Default)] pub struct MetadataSummary { tenant_count: usize, diff --git a/storage_scrubber/src/scan_safekeeper_metadata.rs b/storage_scrubber/src/scan_safekeeper_metadata.rs index 0a4d4266a0..f10d758097 100644 --- a/storage_scrubber/src/scan_safekeeper_metadata.rs +++ b/storage_scrubber/src/scan_safekeeper_metadata.rs @@ -1,23 +1,24 @@ -use std::{collections::HashSet, str::FromStr, sync::Arc}; +use std::collections::HashSet; +use std::str::FromStr; +use std::sync::Arc; -use anyhow::{bail, Context}; +use anyhow::{Context, bail}; use futures::stream::{StreamExt, TryStreamExt}; use once_cell::sync::OnceCell; use pageserver_api::shard::TenantShardId; -use postgres_ffi::{XLogFileName, PG_TLI}; +use postgres_ffi::{PG_TLI, XLogFileName}; use remote_storage::GenericRemoteStorage; use rustls::crypto::ring; use serde::Serialize; use tokio_postgres::types::PgLsn; use tracing::{debug, error, info}; -use utils::{ - id::{TenantId, TenantTimelineId, TimelineId}, - lsn::Lsn, -}; +use utils::id::{TenantId, TenantTimelineId, TimelineId}; +use utils::lsn::Lsn; +use crate::cloud_admin_api::CloudAdminApiClient; +use crate::metadata_stream::stream_listing; use crate::{ - cloud_admin_api::CloudAdminApiClient, init_remote, metadata_stream::stream_listing, - BucketConfig, ConsoleConfig, NodeKind, RootTarget, TenantShardTimelineId, + BucketConfig, ConsoleConfig, NodeKind, RootTarget, TenantShardTimelineId, init_remote, }; /// Generally we should ask safekeepers, but so far we use everywhere default 16MB. diff --git a/storage_scrubber/src/tenant_snapshot.rs b/storage_scrubber/src/tenant_snapshot.rs index 60e79fb859..e17409c20e 100644 --- a/storage_scrubber/src/tenant_snapshot.rs +++ b/storage_scrubber/src/tenant_snapshot.rs @@ -1,25 +1,26 @@ use std::collections::HashMap; use std::sync::Arc; -use crate::checks::{list_timeline_blobs, BlobDataParseResult, RemoteTimelineBlobData}; -use crate::metadata_stream::{stream_tenant_shards, stream_tenant_timelines}; -use crate::{ - download_object_to_file_s3, init_remote, init_remote_s3, BucketConfig, NodeKind, RootTarget, - TenantShardTimelineId, -}; use anyhow::Context; use async_stream::stream; use aws_sdk_s3::Client; use camino::Utf8PathBuf; use futures::{StreamExt, TryStreamExt}; +use pageserver::tenant::IndexPart; use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; use pageserver::tenant::storage_layer::LayerName; -use pageserver::tenant::IndexPart; use pageserver_api::shard::TenantShardId; use remote_storage::{GenericRemoteStorage, S3Config}; use utils::generation::Generation; use utils::id::TenantId; +use crate::checks::{BlobDataParseResult, RemoteTimelineBlobData, list_timeline_blobs}; +use crate::metadata_stream::{stream_tenant_shards, stream_tenant_timelines}; +use crate::{ + BucketConfig, NodeKind, RootTarget, TenantShardTimelineId, download_object_to_file_s3, + init_remote, init_remote_s3, +}; + pub struct SnapshotDownloader { s3_client: Arc, s3_root: RootTarget, diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 1d282971b1..73c8406237 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2469,12 +2469,21 @@ class NeonStorageController(MetricsGetter, LogUtils): response.raise_for_status() return [TenantShardId.parse(tid) for tid in response.json()["updated"]] - def download_heatmap_layers(self, tenant_shard_id: TenantShardId, timeline_id: TimelineId): + def download_heatmap_layers( + self, tenant_shard_id: TenantShardId, timeline_id: TimelineId, recurse: bool | None = None + ): + url = ( + f"{self.api}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/download_heatmap_layers" + ) + if recurse is not None: + url = url + f"?recurse={str(recurse).lower()}" + response = self.request( "POST", - f"{self.api}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/download_heatmap_layers", + url, headers=self.headers(TokenScope.ADMIN), ) + response.raise_for_status() def __enter__(self) -> Self: @@ -4523,33 +4532,6 @@ class Safekeeper(LogUtils): for na in not_allowed: assert not self.log_contains(na) - def append_logical_message( - self, tenant_id: TenantId, timeline_id: TimelineId, request: dict[str, Any] - ) -> dict[str, Any]: - """ - Send JSON_CTRL query to append LogicalMessage to WAL and modify - safekeeper state. It will construct LogicalMessage from provided - prefix and message, and then will write it to WAL. - """ - - # "replication=0" hacks psycopg not to send additional queries - # on startup, see https://github.com/psycopg/psycopg2/pull/482 - token = self.env.auth_keys.generate_tenant_token(tenant_id) - connstr = f"host=localhost port={self.port.pg} password={token} replication=0 options='-c timeline_id={timeline_id} tenant_id={tenant_id}'" - - with closing(psycopg2.connect(connstr)) as conn: - # server doesn't support transactions - conn.autocommit = True - with conn.cursor() as cur: - request_json = json.dumps(request) - log.info(f"JSON_CTRL request on port {self.port.pg}: {request_json}") - cur.execute("JSON_CTRL " + request_json) - all = cur.fetchall() - log.info(f"JSON_CTRL response: {all[0][0]}") - res = json.loads(all[0][0]) - assert isinstance(res, dict) - return res - def http_client( self, auth_token: str | None = None, gen_sk_wide_token: bool = True ) -> SafekeeperHttpClient: diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index 748ac0d569..4fce558840 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -94,7 +94,9 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = ( ".*Flushed oversized open layer with size.*", # During teardown, we stop the storage controller before the pageservers, so pageservers # can experience connection errors doing background deletion queue work. - ".*WARN deletion backend: calling control plane generation validation API failed.*error sending request.*", + ".*WARN deletion backend:.* storage controller upcall failed, will retry.*error sending request.*", + # Can happen when the pageserver starts faster than the storage controller + ".*WARN init_tenant_mgr:.* storage controller upcall failed, will retry.*error sending request.*", # Can happen when the test shuts down the storage controller while it is calling the utilization API ".*WARN.*path=/v1/utilization .*request was dropped before completing", # Can happen during shutdown diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py index 493ce7334e..7038d87aba 100644 --- a/test_runner/fixtures/safekeeper/http.py +++ b/test_runner/fixtures/safekeeper/http.py @@ -273,10 +273,22 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter): assert isinstance(res_json, dict) return res_json + def timeline_exclude( + self, tenant_id: TenantId, timeline_id: TimelineId, to: Configuration + ) -> dict[str, Any]: + res = self.put( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/exclude", + data=to.to_json(), + ) + res.raise_for_status() + res_json = res.json() + assert isinstance(res_json, dict) + return res_json + def membership_switch( self, tenant_id: TenantId, timeline_id: TimelineId, to: Configuration ) -> TimelineMembershipSwitchResponse: - res = self.post( + res = self.put( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/membership", data=to.to_json(), ) diff --git a/test_runner/performance/test_cumulative_statistics_persistence.py b/test_runner/performance/test_cumulative_statistics_persistence.py new file mode 100644 index 0000000000..061467bbad --- /dev/null +++ b/test_runner/performance/test_cumulative_statistics_persistence.py @@ -0,0 +1,221 @@ +import math # Add this import +import time +import traceback +from pathlib import Path + +import psycopg2 +import psycopg2.extras +import pytest +from fixtures.benchmark_fixture import NeonBenchmarker +from fixtures.log_helper import log +from fixtures.neon_api import NeonAPI, connection_parameters_to_env +from fixtures.neon_fixtures import PgBin +from fixtures.pg_version import PgVersion + +vacuum_times_sql = """ +SELECT + relname AS table_name, + last_autovacuum, + last_autoanalyze +FROM + pg_stat_user_tables where relname = 'pgbench_accounts' +ORDER BY + last_autovacuum DESC, last_autoanalyze DESC +""" + + +def insert_first_chunk_and_verify_autovacuum_is_not_running( + cur, rows_to_insert, autovacuum_naptime +): + cur.execute(f""" + INSERT INTO pgbench_accounts (aid, bid, abalance, filler) + SELECT + aid, + (random() * 10)::int + 1 AS bid, + (random() * 10000)::int AS abalance, + 'filler text' AS filler + FROM generate_series(6800001, {6800001 + rows_to_insert - 1}) AS aid; + """) + assert cur.rowcount == rows_to_insert + for _ in range(5): + time.sleep(0.5 * autovacuum_naptime) + cur.execute(vacuum_times_sql) + row = cur.fetchall()[0] + log.info(f"last_autovacuum: {row[1]}, last_autoanalyze: {row[2]}") + assert row[1] is None + + +def insert_second_chunk_and_verify_autovacuum_is_now_running( + cur, rows_to_insert, autovacuum_naptime +): + cur.execute(f""" + INSERT INTO pgbench_accounts (aid, bid, abalance, filler) + SELECT + aid, + (random() * 10)::int + 1 AS bid, + (random() * 10000)::int AS abalance, + 'filler text' AS filler + FROM generate_series({6800001 + rows_to_insert}, {6800001 + rows_to_insert * 2 - 1}) AS aid; + """) + assert cur.rowcount == rows_to_insert + for _ in range(5): + time.sleep(0.5 * autovacuum_naptime) + cur.execute(vacuum_times_sql) + row = cur.fetchall()[0] + log.info(f"last_autovacuum: {row[1]}, last_autoanalyze: {row[2]}") + assert row[1] is not None + + +@pytest.mark.remote_cluster +@pytest.mark.timeout(60 * 60) +def test_cumulative_statistics_persistence( + pg_bin: PgBin, + test_output_dir: Path, + neon_api: NeonAPI, + pg_version: PgVersion, + zenbenchmark: NeonBenchmarker, +): + """ + Verifies that the cumulative statistics are correctly persisted across restarts. + Cumulative statistics are important to persist across restarts because they are used + when auto-vacuum an auto-analyze trigger conditions are met. + The test performs the following steps: + - Seed a new project using pgbench + - insert tuples that by itself are not enough to trigger auto-vacuum + - suspend the endpoint + - resume the endpoint + - insert additional tuples that by itself are not enough to trigger auto-vacuum but in combination with the previous tuples are + - verify that autovacuum is triggered by the combination of tuples inserted before and after endpoint suspension + """ + project = neon_api.create_project(pg_version) + project_id = project["project"]["id"] + neon_api.wait_for_operation_to_finish(project_id) + endpoint_id = project["endpoints"][0]["id"] + region_id = project["project"]["region_id"] + log.info(f"Created project {project_id} with endpoint {endpoint_id} in region {region_id}") + error_occurred = False + try: + connstr = project["connection_uris"][0]["connection_uri"] + env = connection_parameters_to_env(project["connection_uris"][0]["connection_parameters"]) + # seed about 1 GiB of data into pgbench_accounts + pg_bin.run_capture(["pgbench", "-i", "-s68"], env=env) + + # assert rows in pgbench_accounts is 6800000 rows + conn = psycopg2.connect(connstr) + conn.autocommit = True + with conn.cursor() as cur: + # assert rows in pgbench_accounts is 6800000 rows + cur.execute("select count(*) from pgbench_accounts") + row_count = cur.fetchall()[0][0] + assert row_count == 6800000 + + # verify n_tup_ins, n_live_tup, vacuum_count, analyze_count (manual vacuum and analyze) + cur.execute( + "select n_tup_ins, vacuum_count,analyze_count from pg_stat_user_tables where relname = 'pgbench_accounts'" + ) + row = cur.fetchall()[0] + assert row[0] == 6800000 # n_tup_ins + assert row[1] == 1 # vacuum_count + assert row[2] == 1 # analyze_count + + # retrieve some GUCs (postgres settings) relevant to autovacuum + cur.execute( + "SELECT setting::int AS autovacuum_naptime FROM pg_settings WHERE name = 'autovacuum_naptime'" + ) + autovacuum_naptime = cur.fetchall()[0][0] + assert autovacuum_naptime < 300 and autovacuum_naptime > 0 + cur.execute( + "SELECT setting::float AS autovacuum_vacuum_insert_scale_factor FROM pg_settings WHERE name = 'autovacuum_vacuum_insert_scale_factor'" + ) + autovacuum_vacuum_insert_scale_factor = cur.fetchall()[0][0] + assert ( + autovacuum_vacuum_insert_scale_factor > 0.05 + and autovacuum_vacuum_insert_scale_factor < 1.0 + ) + cur.execute( + "SELECT setting::int AS autovacuum_vacuum_insert_threshold FROM pg_settings WHERE name = 'autovacuum_vacuum_insert_threshold'" + ) + autovacuum_vacuum_insert_threshold = cur.fetchall()[0][0] + cur.execute( + "SELECT setting::int AS pgstat_file_size_limit FROM pg_settings WHERE name = 'neon.pgstat_file_size_limit'" + ) + pgstat_file_size_limit = cur.fetchall()[0][0] + assert pgstat_file_size_limit > 10 * 1024 # at least 10 MB + + # insert rows that by itself are not enough to trigger auto-vacuum + # vacuum insert threshold = vacuum base insert threshold + vacuum insert scale factor * number of tuples + # https://www.postgresql.org/docs/17/routine-vacuuming.html + rows_to_insert = int( + math.ceil( + autovacuum_vacuum_insert_threshold / 2 + + row_count * autovacuum_vacuum_insert_scale_factor * 0.6 + ) + ) + + log.info( + f"autovacuum_vacuum_insert_scale_factor: {autovacuum_vacuum_insert_scale_factor}, autovacuum_vacuum_insert_threshold: {autovacuum_vacuum_insert_threshold}, row_count: {row_count}" + ) + log.info( + f"Inserting {rows_to_insert} rows, which is below the 'vacuum insert threshold'" + ) + + insert_first_chunk_and_verify_autovacuum_is_not_running( + cur, rows_to_insert, autovacuum_naptime + ) + + conn.close() + + # suspend the endpoint + log.info(f"Suspending endpoint {endpoint_id}") + neon_api.suspend_endpoint(project_id, endpoint_id) + neon_api.wait_for_operation_to_finish(project_id) + time.sleep(60) # give some time in between suspend and resume + + # resume the endpoint + log.info(f"Starting endpoint {endpoint_id}") + neon_api.start_endpoint(project_id, endpoint_id) + neon_api.wait_for_operation_to_finish(project_id) + + conn = psycopg2.connect(connstr) + conn.autocommit = True + with conn.cursor() as cur: + # insert additional rows that by itself are not enough to trigger auto-vacuum, but in combination + # with the previous rows inserted before the suspension are + log.info( + f"Inserting another {rows_to_insert} rows, which is below the 'vacuum insert threshold'" + ) + insert_second_chunk_and_verify_autovacuum_is_now_running( + cur, rows_to_insert, autovacuum_naptime + ) + + # verify estimatednumber of tuples in pgbench_accounts is within 6800000 + inserted rows +- 2 % + cur.execute( + "select reltuples::bigint from pg_class where relkind = 'r' and relname = 'pgbench_accounts'" + ) + reltuples = cur.fetchall()[0][0] + assert reltuples > 6800000 + rows_to_insert * 2 * 0.98 + assert reltuples < 6800000 + rows_to_insert * 2 * 1.02 + + # verify exact number of pgbench_accounts rows (computed row_count) + cur.execute("select count(*) from pgbench_accounts") + row_count = cur.fetchall()[0][0] + assert row_count == 6800000 + rows_to_insert * 2 + + # verify n_tup_ins, n_live_tup, vacuum_count, analyze_count (manual vacuum and analyze) + cur.execute( + "select n_tup_ins, vacuum_count,analyze_count from pg_stat_user_tables where relname = 'pgbench_accounts'" + ) + row = cur.fetchall()[0] + assert row[0] == 6800000 + rows_to_insert * 2 + assert row[1] == 1 + assert row[2] == 1 + + conn.close() + + except Exception as e: + error_occurred = True + log.error(f"Caught exception: {e}") + log.error(traceback.format_exc()) + finally: + assert not error_occurred # Fail the test if an error occurred + neon_api.delete_project(project_id) diff --git a/test_runner/performance/test_perf_many_relations.py b/test_runner/performance/test_perf_many_relations.py index 0ee0efe8b9..2570c55f6c 100644 --- a/test_runner/performance/test_perf_many_relations.py +++ b/test_runner/performance/test_perf_many_relations.py @@ -2,8 +2,10 @@ import os from pathlib import Path import pytest +from fixtures.benchmark_fixture import NeonBenchmarker from fixtures.compare_fixtures import RemoteCompare from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder def get_num_relations(default: int = 1000) -> list[int]: @@ -64,3 +66,52 @@ def test_perf_many_relations(remote_compare: RemoteCompare, num_relations: int): env.pg_bin.run_capture( ["psql", env.pg.connstr(options="-cstatement_timeout=1000s "), "-c", sql] ) + + +def test_perf_simple_many_relations_reldir_v2( + neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker +): + """ + Test creating many relations in a single database. + """ + env = neon_env_builder.init_start(initial_tenant_conf={"rel_size_v2_enabled": "true"}) + ep = env.endpoints.create_start( + "main", + config_lines=[ + "shared_buffers=1000MB", + "max_locks_per_transaction=16384", + ], + ) + + n = 100000 + step = 5000 + # Create many relations + log.info(f"Creating {n} relations...") + begin = 0 + with zenbenchmark.record_duration("create_first_relation"): + ep.safe_psql("CREATE TABLE IF NOT EXISTS table_begin (id SERIAL PRIMARY KEY, data TEXT)") + with zenbenchmark.record_duration("create_many_relations"): + while True: + end = begin + step + ep.safe_psql_many( + [ + "BEGIN", + f"""DO $$ + DECLARE + i INT; + table_name TEXT; + BEGIN + FOR i IN {begin}..{end} LOOP + table_name := 'table_' || i; + EXECUTE 'CREATE TABLE IF NOT EXISTS ' || table_name || ' (id SERIAL PRIMARY KEY, data TEXT)'; + END LOOP; + END $$; + """, + "COMMIT", + ] + ) + begin = end + if begin >= n: + break + with zenbenchmark.record_duration("create_last_relation"): + ep.safe_psql(f"CREATE TABLE IF NOT EXISTS table_{begin} (id SERIAL PRIMARY KEY, data TEXT)") diff --git a/test_runner/regress/test_bad_connection.py b/test_runner/regress/test_bad_connection.py index c0c9537421..bfc5cb174e 100644 --- a/test_runner/regress/test_bad_connection.py +++ b/test_runner/regress/test_bad_connection.py @@ -7,6 +7,7 @@ import psycopg2.errors import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.utils import USE_LFC @pytest.mark.timeout(600) @@ -80,3 +81,193 @@ def test_compute_pageserver_connection_stress(neon_env_builder: NeonEnvBuilder): # do a graceful shutdown which would had caught the allowed_errors before # https://github.com/neondatabase/neon/pull/8632 env.pageserver.stop() + + +def test_compute_pageserver_hung_connections(neon_env_builder: NeonEnvBuilder): + """ + Test timeouts in waiting for response to pageserver request + """ + env = neon_env_builder.init_start() + env.pageserver.allowed_errors.append(".*slow GetPage.*") + pageserver_http = env.pageserver.http_client() + endpoint = env.endpoints.create_start( + "main", + tenant_id=env.initial_tenant, + config_lines=["autovacuum = off"], + ) + pg_conn = endpoint.connect() + cur = pg_conn.cursor() + + # Create table, and insert some rows. Make it big enough that it doesn't fit in + # shared_buffers, otherwise the SELECT after restart will just return answer + # from shared_buffers without hitting the page server, which defeats the point + # of this test. + cur.execute("CREATE TABLE foo (t text)") + cur.execute( + """ + INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 100000) g + """ + ) + + # Verify that the table is larger than shared_buffers + cur.execute( + """ + select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_size + from pg_settings where name = 'shared_buffers' + """ + ) + row = cur.fetchone() + assert row is not None + log.debug(f"shared_buffers is {row[0]}, table size {row[1]}") + assert int(row[0]) < int(row[1]) + + # Print the backend PID so that it can be compared with the logs easily + cur.execute("SELECT pg_backend_pid()") + row = cur.fetchone() + assert row is not None + log.info(f"running test workload in backend PID {row[0]}") + + def run_workload(duration: float): + end_time = time.time() + duration + times_executed = 0 + while time.time() < end_time: + if random.random() < 0.5: + cur.execute("INSERT INTO foo VALUES ('stas'), ('heikki')") + else: + cur.execute("SELECT t FROM foo ORDER BY RANDOM() LIMIT 10") + cur.fetchall() + times_executed += 1 + log.info(f"Workload executed {times_executed} times") + assert times_executed > 0 + + ## Test short connection hiccups + ## + ## This is to exercise the logging timeout. + log.info("running workload with log timeout") + cur.execute("SET neon.pageserver_response_log_timeout = '500ms'") + pageserver_http.configure_failpoints(("before-pagestream-msg-flush", "10%3*return(3000)")) + run_workload(20) + + # check that the message was logged + assert endpoint.log_contains("no response received from pageserver for .* s, still waiting") + assert endpoint.log_contains("received response from pageserver after .* s") + + ## Test connections that are hung for longer + ## + ## This exercises the disconnect timeout. We'll disconnect and + ## reconnect after 500 ms. + log.info("running workload with disconnect timeout") + cur.execute("SET neon.pageserver_response_log_timeout = '250ms'") + cur.execute("SET neon.pageserver_response_disconnect_timeout = '500ms'") + pageserver_http.configure_failpoints(("before-pagestream-msg-flush", "10%3*return(3000)")) + run_workload(15) + + assert endpoint.log_contains("no response from pageserver for .* s, disconnecting") + + # do a graceful shutdown which would had caught the allowed_errors before + # https://github.com/neondatabase/neon/pull/8632 + env.pageserver.stop() + + +def test_compute_pageserver_statement_timeout(neon_env_builder: NeonEnvBuilder): + """ + Test statement_timeout while waiting for response to pageserver request + """ + env = neon_env_builder.init_start() + env.pageserver.allowed_errors.append(".*slow GetPage.*") + pageserver_http = env.pageserver.http_client() + + # Make sure the shared_buffers and LFC are tiny, to ensure the queries + # hit the storage. Disable autovacuum to make the test more deterministic. + config_lines = [ + "shared_buffers='512kB'", + "autovacuum = off", + ] + if USE_LFC: + config_lines = ["neon.max_file_cache_size = 1MB", "neon.file_cache_size_limit = 1MB"] + endpoint = env.endpoints.create_start( + "main", + tenant_id=env.initial_tenant, + config_lines=config_lines, + ) + pg_conn = endpoint.connect() + cur = pg_conn.cursor() + + # Disable parallel query. Parallel workers open their own pageserver connections, + # which messes up the test logic. + cur.execute("SET max_parallel_workers_per_gather=0") + cur.execute("SET effective_io_concurrency=0") + + # Create table, and insert some rows. Make it big enough that it doesn't fit in + # shared_buffers, otherwise the SELECT after restart will just return answer + # from shared_buffers without hitting the page server, which defeats the point + # of this test. + cur.execute("CREATE TABLE foo (t text)") + cur.execute( + """ + INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 100000) g + """ + ) + + # Verify that the table is larger than shared_buffers + cur.execute( + """ + select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_size + from pg_settings where name = 'shared_buffers' + """ + ) + row = cur.fetchone() + assert row is not None + log.debug(f"shared_buffers is {row[0]}, table size {row[1]}") + assert int(row[0]) < int(row[1]) + + ## Run a query until the compute->pageserver connection hits the failpoint and + ## get stuck. This tests that the statement_timeout is obeyed while waiting on a + ## GetPage request. + log.info("running workload with statement_timeout") + cur.execute("SET neon.pageserver_response_log_timeout = '2000ms'") + cur.execute("SET neon.pageserver_response_disconnect_timeout = '30000ms'") + cur.execute("SET statement_timeout='10s'") + pageserver_http.configure_failpoints(("before-pagestream-msg-flush", "10%return(60000)")) + + start_time = time.time() + with pytest.raises(psycopg2.errors.QueryCanceled): + cur.execute("SELECT count(*) FROM foo") + cur.fetchall() + log.info("Statement timeout reached") + end_time = time.time() + # Verify that the statement_timeout canceled the query before + # neon.pageserver_response_disconnect_timeout expired + assert end_time - start_time < 40 + times_canceled = 1 + + # Should not have disconnected yet + assert not endpoint.log_contains("no response from pageserver for .* s, disconnecting") + + # Clear the failpoint. This doesn't affect the connection that already hit it. It + # will keep waiting. But subsequent connections will work normally. + pageserver_http.configure_failpoints(("before-pagestream-msg-flush", "off")) + + # If we keep retrying, we should eventually succeed. (This tests that the + # neon.pageserver_response_disconnect_timeout is not reset on query + # cancellation.) + while times_canceled < 10: + try: + cur.execute("SELECT count(*) FROM foo") + cur.fetchall() + log.info("Statement succeeded") + break + except psycopg2.errors.QueryCanceled: + log.info("Statement timed out, retrying") + times_canceled += 1 + assert times_canceled > 1 and times_canceled < 10 + + assert endpoint.log_contains("no response from pageserver for .* s, disconnecting") + + # do a graceful shutdown which would had caught the allowed_errors before + # https://github.com/neondatabase/neon/pull/8632 + env.pageserver.stop() diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index c091cd0869..0df88e14c2 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -319,8 +319,12 @@ def test_pageserver_gc_compaction_idempotent( }, ) wait_until(compaction_finished, timeout=60) + workload.validate(env.pageserver.id) + # Ensure all data are uploaded so that the duplicated layer gets into index_part.json + ps_http.timeline_checkpoint(tenant_id, timeline_id, wait_until_flushed=True) if compaction_mode == "after_restart": env.pageserver.restart(True) + workload.validate(env.pageserver.id) ps_http.timeline_gc( tenant_id, timeline_id, None ) # Force refresh gc info to have gc_cutoff generated @@ -335,6 +339,7 @@ def test_pageserver_gc_compaction_idempotent( "sub_compaction_max_job_size_mb": 16, }, ) + workload.validate(env.pageserver.id) wait_until(compaction_finished, timeout=60) # ensure gc_compaction is scheduled and it's actually running (instead of skipping due to no layers picked) @@ -466,6 +471,59 @@ def test_pageserver_gc_compaction_interrupt(neon_env_builder: NeonEnvBuilder): ps_http.timeline_gc(tenant_id, timeline_id, None) +@skip_in_debug_build("only run with release build") +def test_pageserver_gc_compaction_trigger(neon_env_builder: NeonEnvBuilder): + SMOKE_CONF = { + # Run both gc and gc-compaction. + "gc_period": "5s", + "compaction_period": "5s", + # No PiTR interval and small GC horizon + "pitr_interval": "0s", + "gc_horizon": f"{1024 * 16}", + "lsn_lease_length": "0s", + "gc_compaction_enabled": "true", + "gc_compaction_initial_threshold_kb": "16", + "gc_compaction_ratio_percent": "50", + # Do not generate image layers with create_image_layers + "image_layer_creation_check_threshold": "100", + } + + env = neon_env_builder.init_start(initial_tenant_conf=SMOKE_CONF) + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + row_count = 10000 + churn_rounds = 20 + + ps_http = env.pageserver.http_client() + + workload = Workload(env, tenant_id, timeline_id) + workload.init(env.pageserver.id) + + log.info("Writing initial data ...") + workload.write_rows(row_count, env.pageserver.id) + + ps_http.timeline_gc( + tenant_id, timeline_id, None + ) # Force refresh gc info to have gc_cutoff generated + + def compaction_finished(): + queue_depth = len(ps_http.timeline_compact_info(tenant_id, timeline_id)) + assert queue_depth == 0 + + for i in range(1, churn_rounds + 1): + log.info(f"Running churn round {i}/{churn_rounds} ...") + workload.churn_rows(row_count, env.pageserver.id, upload=True) + wait_until(compaction_finished, timeout=60) + workload.validate(env.pageserver.id) + + # ensure gc_compaction is scheduled and it's actually running (instead of skipping due to no layers picked) + env.pageserver.assert_log_contains("gc_compact_timeline.*picked .* layers for compaction") + + log.info("Validating at workload end ...") + workload.validate(env.pageserver.id) + + # Stripe sizes in number of pages. TINY_STRIPES = 16 LARGE_STRIPES = 32768 diff --git a/test_runner/regress/test_normal_work.py b/test_runner/regress/test_normal_work.py index ae2d171058..c8458b963e 100644 --- a/test_runner/regress/test_normal_work.py +++ b/test_runner/regress/test_normal_work.py @@ -6,9 +6,14 @@ from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder from fixtures.pageserver.http import PageserverHttpClient -def check_tenant(env: NeonEnv, pageserver_http: PageserverHttpClient): +def check_tenant( + env: NeonEnv, pageserver_http: PageserverHttpClient, safekeeper_proto_version: int +): tenant_id, timeline_id = env.create_tenant() - endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) + config_lines = [ + f"neon.safekeeper_proto_version = {safekeeper_proto_version}", + ] + endpoint = env.endpoints.create_start("main", tenant_id=tenant_id, config_lines=config_lines) # we rely upon autocommit after each statement res_1 = endpoint.safe_psql_many( queries=[ @@ -33,7 +38,14 @@ def check_tenant(env: NeonEnv, pageserver_http: PageserverHttpClient): @pytest.mark.parametrize("num_timelines,num_safekeepers", [(3, 1)]) -def test_normal_work(neon_env_builder: NeonEnvBuilder, num_timelines: int, num_safekeepers: int): +# Test both proto versions until we fully migrate. +@pytest.mark.parametrize("safekeeper_proto_version", [2, 3]) +def test_normal_work( + neon_env_builder: NeonEnvBuilder, + num_timelines: int, + num_safekeepers: int, + safekeeper_proto_version: int, +): """ Basic test: * create new tenant with a timeline @@ -52,4 +64,4 @@ def test_normal_work(neon_env_builder: NeonEnvBuilder, num_timelines: int, num_s pageserver_http = env.pageserver.http_client() for _ in range(num_timelines): - check_tenant(env, pageserver_http) + check_tenant(env, pageserver_http, safekeeper_proto_version) diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index a9b897b741..ab0f00db1c 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -938,9 +938,12 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder): # Expect lots of layers assert len(ps_attached.list_layers(tenant_id, timeline_id)) > 10 - # Simulate large data by making layer downloads artifically slow for ps in env.pageservers: + # Simulate large data by making layer downloads artifically slow ps.http_client().configure_failpoints([("secondary-layer-download-sleep", "return(1000)")]) + # Make the initial logical size calculation lie. Otherwise it on demand downloads + # layers and makes accounting difficult. + ps.http_client().configure_failpoints(("skip-logical-size-calculation", "return")) def timeline_heatmap(tlid): assert env.pageserver_remote_storage is not None @@ -952,21 +955,6 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder): raise RuntimeError(f"No heatmap for timeline: {tlid}") - # Upload a heatmap, so that secondaries have something to download - ps_attached.http_client().tenant_heatmap_upload(tenant_id) - heatmap_before_migration = timeline_heatmap(timeline_id) - - # This has no chance to succeed: we have lots of layers and each one takes at least 1000ms. - # However, it pulls the heatmap, which will be important later. - http_client = env.storage_controller.pageserver_api() - (status, progress) = http_client.tenant_secondary_download(tenant_id, wait_ms=4000) - assert status == 202 - assert progress["heatmap_mtime"] is not None - assert progress["layers_downloaded"] > 0 - assert progress["bytes_downloaded"] > 0 - assert progress["layers_total"] > progress["layers_downloaded"] - assert progress["bytes_total"] > progress["bytes_downloaded"] - env.storage_controller.allowed_errors.extend( [ ".*Timed out.*downloading layers.*", @@ -975,6 +963,7 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder): # Use a custom configuration that gives up earlier than usual. # We can't hydrate everything anyway because of the failpoints. + # Implicitly, this also uploads a heatmap from the current attached location. config = StorageControllerMigrationConfig( secondary_warmup_timeout="5s", secondary_download_request_timeout="2s" ) @@ -988,22 +977,17 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder): ps_secondary.http_client().tenant_heatmap_upload(tenant_id) heatmap_after_migration = timeline_heatmap(timeline_id) - assert len(heatmap_before_migration["layers"]) > 0 + local_layers = ps_secondary.list_layers(tenant_id, timeline_id) + # We download 1 layer per second and give up within 5 seconds. + assert len(local_layers) < 10 after_migration_heatmap_layers_count = len(heatmap_after_migration["layers"]) - assert len(heatmap_before_migration["layers"]) <= after_migration_heatmap_layers_count - log.info(f"Heatmap size after cold migration is {after_migration_heatmap_layers_count}") env.storage_controller.download_heatmap_layers( TenantShardId(tenant_id, shard_number=0, shard_count=0), timeline_id ) - # Now simulate the case where a child timeline is archived, parent layers - # are evicted and the child is unarchived. When the child is unarchived, - # itself and the parent update their heatmaps to contain layers needed by the - # child. One can warm up the timeline hierarchy since the heatmaps are ready. - def all_layers_downloaded(expected_layer_count: int): local_layers_count = len(ps_secondary.list_layers(tenant_id, timeline_id)) @@ -1011,8 +995,9 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder): assert local_layers_count >= expected_layer_count wait_until(lambda: all_layers_downloaded(after_migration_heatmap_layers_count)) - ps_secondary.http_client().tenant_heatmap_upload(tenant_id) + # Read everything and make sure that we're not downloading anything extra. + # All hot layers should be available locally now. before = ( ps_secondary.http_client() .get_metrics() @@ -1030,6 +1015,11 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder): workload.stop() assert before == after + # Now simulate the case where a child timeline is archived, parent layers + # are evicted and the child is unarchived. When the child is unarchived, + # itself and the parent update their heatmaps to contain layers needed by the + # child. One can warm up the timeline hierarchy since the heatmaps are ready. + def check_archival_state(state: TimelineArchivalState, tline): timelines = ( timeline["timeline_id"] @@ -1064,6 +1054,6 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder): assert expected_locally > 0 env.storage_controller.download_heatmap_layers( - TenantShardId(tenant_id, shard_number=0, shard_count=0), timeline_id + TenantShardId(tenant_id, shard_number=0, shard_count=0), child_timeline_id, recurse=True ) wait_until(lambda: all_layers_downloaded(expected_locally)) diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index 411888efbc..6a76ad5ca8 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -332,8 +332,10 @@ def test_sql_regress( @skip_in_debug_build("only run with release build") +@pytest.mark.parametrize("reldir_type", ["v1", "v2"]) def test_tx_abort_with_many_relations( neon_env_builder: NeonEnvBuilder, + reldir_type: str, ): """ This is not a pg_regress test as such, but perhaps it should be -- this test exercises postgres @@ -342,7 +344,11 @@ def test_tx_abort_with_many_relations( Reproducer for https://github.com/neondatabase/neon/issues/9505 """ - env = neon_env_builder.init_start() + env = neon_env_builder.init_start( + initial_tenant_conf={ + "rel_size_v2_enabled": "true" if reldir_type == "v2" else "false", + } + ) ep = env.endpoints.create_start( "main", tenant_id=env.initial_tenant, @@ -354,48 +360,65 @@ def test_tx_abort_with_many_relations( # How many relations: this number is tuned to be long enough to take tens of seconds # if the rollback code path is buggy, tripping the test's timeout. - n = 4000 + if reldir_type == "v1": + n = 4000 + step = 4000 + else: + n = 20000 + step = 5000 def create(): # Create many relations log.info(f"Creating {n} relations...") - ep.safe_psql_many( - [ - "BEGIN", - f"""DO $$ - DECLARE - i INT; - table_name TEXT; - BEGIN - FOR i IN 1..{n} LOOP - table_name := 'table_' || i; - EXECUTE 'CREATE TABLE IF NOT EXISTS ' || table_name || ' (id SERIAL PRIMARY KEY, data TEXT)'; - END LOOP; - END $$; - """, - "COMMIT", - ] - ) + begin = 0 + while True: + end = begin + step + ep.safe_psql_many( + [ + "BEGIN", + f"""DO $$ + DECLARE + i INT; + table_name TEXT; + BEGIN + FOR i IN {begin}..{end} LOOP + table_name := 'table_' || i; + EXECUTE 'CREATE TABLE IF NOT EXISTS ' || table_name || ' (id SERIAL PRIMARY KEY, data TEXT)'; + END LOOP; + END $$; + """, + "COMMIT", + ] + ) + begin = end + if begin >= n: + break def truncate(): # Truncate relations, then roll back the transaction containing the truncations log.info(f"Truncating {n} relations...") - ep.safe_psql_many( - [ - "BEGIN", - f"""DO $$ - DECLARE - i INT; - table_name TEXT; - BEGIN - FOR i IN 1..{n} LOOP - table_name := 'table_' || i; - EXECUTE 'TRUNCATE ' || table_name ; - END LOOP; - END $$; - """, - ] - ) + begin = 0 + while True: + end = begin + step + ep.safe_psql_many( + [ + "BEGIN", + f"""DO $$ + DECLARE + i INT; + table_name TEXT; + BEGIN + FOR i IN {begin}..{end} LOOP + table_name := 'table_' || i; + EXECUTE 'TRUNCATE ' || table_name ; + END LOOP; + END $$; + """, + ] + ) + begin = end + if begin >= n: + break def rollback_and_wait(): log.info(f"Rolling back after truncating {n} relations...") diff --git a/test_runner/regress/test_pgstat.py b/test_runner/regress/test_pgstat.py index c31e5ef7f8..bf9b982e14 100644 --- a/test_runner/regress/test_pgstat.py +++ b/test_runner/regress/test_pgstat.py @@ -13,7 +13,7 @@ def test_pgstat(neon_simple_env: NeonEnv): n = 10000 endpoint = env.endpoints.create_start( - "main", config_lines=["neon_pgstat_file_size_limit=100kB", "autovacuum=off"] + "main", config_lines=["neon.pgstat_file_size_limit=100kB", "autovacuum=off"] ) con = endpoint.connect() diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index c5045fe4a4..0a05189bfb 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -811,60 +811,6 @@ class ProposerPostgres(PgProtocol): self.pg_bin.run(args) -# insert wal in all safekeepers and run sync on proposer -def test_sync_safekeepers( - neon_env_builder: NeonEnvBuilder, - pg_bin: PgBin, - port_distributor: PortDistributor, -): - # We don't really need the full environment for this test, just the - # safekeepers would be enough. - neon_env_builder.num_safekeepers = 3 - env = neon_env_builder.init_start() - - tenant_id = TenantId.generate() - timeline_id = TimelineId.generate() - - # write config for proposer - pgdata_dir = os.path.join(env.repo_dir, "proposer_pgdata") - pg = ProposerPostgres( - pgdata_dir, pg_bin, tenant_id, timeline_id, "127.0.0.1", port_distributor.get_port() - ) - pg.create_dir_config(env.get_safekeeper_connstrs()) - - # valid lsn, which is not in the segment start, nor in zero segment - epoch_start_lsn = Lsn("0/16B9188") - begin_lsn = epoch_start_lsn - - # append and commit WAL - lsn_after_append = [] - for i in range(3): - res = env.safekeepers[i].append_logical_message( - tenant_id, - timeline_id, - { - "lm_prefix": "prefix", - "lm_message": "message", - "set_commit_lsn": True, - "send_proposer_elected": True, - "term": 2, - "begin_lsn": int(begin_lsn), - "epoch_start_lsn": int(epoch_start_lsn), - "truncate_lsn": int(epoch_start_lsn), - "pg_version": int(env.pg_version) * 10000, - }, - ) - lsn = Lsn(res["inserted_wal"]["end_lsn"]) - lsn_after_append.append(lsn) - log.info(f"safekeeper[{i}] lsn after append: {lsn}") - - # run sync safekeepers - lsn_after_sync = pg.sync_safekeepers() - log.info(f"lsn after sync = {lsn_after_sync}") - - assert all(lsn_after_sync == lsn for lsn in lsn_after_append) - - @pytest.mark.parametrize("auth_enabled", [False, True]) def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): neon_env_builder.auth_enabled = auth_enabled @@ -1740,7 +1686,7 @@ def test_replace_safekeeper(neon_env_builder: NeonEnvBuilder): @pytest.mark.parametrize("auth_enabled", [False, True]) -def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): +def test_delete(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): neon_env_builder.auth_enabled = auth_enabled env = neon_env_builder.init_start() @@ -2269,13 +2215,21 @@ def test_membership_api(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 1 env = neon_env_builder.init_start() + # These are expected after timeline deletion on safekeepers. + env.pageserver.allowed_errors.extend( + [ + ".*Timeline .* was not found in global map.*", + ".*Timeline .* was cancelled and cannot be used anymore.*", + ] + ) + tenant_id = env.initial_tenant timeline_id = env.initial_timeline sk = env.safekeepers[0] http_cli = sk.http_client() - sk_id_1 = SafekeeperId(env.safekeepers[0].id, "localhost", sk.port.pg_tenant_only) + sk_id_1 = SafekeeperId(sk.id, "localhost", sk.port.pg_tenant_only) sk_id_2 = SafekeeperId(11, "localhost", 5434) # just a mock # Request to switch before timeline creation should fail. @@ -2303,19 +2257,28 @@ def test_membership_api(neon_env_builder: NeonEnvBuilder): log.info(f"conf after restart: {after_restart}") assert after_restart.generation == 4 - # Switch into disjoint conf. - non_joint = Configuration(generation=5, members=[sk_id_2], new_members=None) + # Switch into non joint conf of which sk is not a member, must fail. + non_joint_not_member = Configuration(generation=5, members=[sk_id_2], new_members=None) + with pytest.raises(requests.exceptions.HTTPError): + resp = http_cli.membership_switch(tenant_id, timeline_id, non_joint_not_member) + + # Switch into good non joint conf. + non_joint = Configuration(generation=6, members=[sk_id_1], new_members=None) resp = http_cli.membership_switch(tenant_id, timeline_id, non_joint) log.info(f"non joint switch resp: {resp}") assert resp.previous_conf.generation == 4 - assert resp.current_conf.generation == 5 + assert resp.current_conf.generation == 6 - # Switch request to lower conf should be ignored. - lower_conf = Configuration(generation=3, members=[], new_members=None) - resp = http_cli.membership_switch(tenant_id, timeline_id, lower_conf) - log.info(f"lower switch resp: {resp}") - assert resp.previous_conf.generation == 5 - assert resp.current_conf.generation == 5 + # Switch request to lower conf should be rejected. + lower_conf = Configuration(generation=3, members=[sk_id_1], new_members=None) + with pytest.raises(requests.exceptions.HTTPError): + http_cli.membership_switch(tenant_id, timeline_id, lower_conf) + + # Now, exclude sk from the membership, timeline should be deleted. + excluded_conf = Configuration(generation=7, members=[sk_id_2], new_members=None) + http_cli.timeline_exclude(tenant_id, timeline_id, excluded_conf) + with pytest.raises(requests.exceptions.HTTPError): + http_cli.timeline_status(tenant_id, timeline_id) # In this test we check for excessive START_REPLICATION and START_WAL_PUSH queries diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py index 936c774657..56539a0a08 100644 --- a/test_runner/regress/test_wal_acceptor_async.py +++ b/test_runner/regress/test_wal_acceptor_async.py @@ -539,13 +539,16 @@ def test_recovery_uncommitted(neon_env_builder: NeonEnvBuilder): asyncio.run(run_recovery_uncommitted(env)) -async def run_wal_truncation(env: NeonEnv): +async def run_wal_truncation(env: NeonEnv, safekeeper_proto_version: int): tenant_id = env.initial_tenant timeline_id = env.initial_timeline (sk1, sk2, sk3) = env.safekeepers - ep = env.endpoints.create_start("main") + config_lines = [ + f"neon.safekeeper_proto_version = {safekeeper_proto_version}", + ] + ep = env.endpoints.create_start("main", config_lines=config_lines) ep.safe_psql("create table t (key int, value text)") ep.safe_psql("insert into t select generate_series(1, 100), 'payload'") @@ -572,6 +575,7 @@ async def run_wal_truncation(env: NeonEnv): sk2.start() ep = env.endpoints.create_start( "main", + config_lines=config_lines, ) ep.safe_psql("insert into t select generate_series(1, 200), 'payload'") @@ -590,11 +594,13 @@ async def run_wal_truncation(env: NeonEnv): # Simple deterministic test creating tail of WAL on safekeeper which is # truncated when majority without this sk elects walproposer starting earlier. -def test_wal_truncation(neon_env_builder: NeonEnvBuilder): +# Test both proto versions until we fully migrate. +@pytest.mark.parametrize("safekeeper_proto_version", [2, 3]) +def test_wal_truncation(neon_env_builder: NeonEnvBuilder, safekeeper_proto_version: int): neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() - asyncio.run(run_wal_truncation(env)) + asyncio.run(run_wal_truncation(env, safekeeper_proto_version)) async def run_segment_init_failure(env: NeonEnv): diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 6ff5044377..9b118b1cff 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 6ff50443773b69749e16da6db9d4f4b19064b4b7 +Subproject commit 9b118b1cffa6e4ca0d63389b57b54d11e207e9a8 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 261ed10e9b..799e7a08dd 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 261ed10e9b8c8dda01ad7aefb18e944e30aa161d +Subproject commit 799e7a08dd171aa06a7395dd326f4243aaeb9f93 diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index 59b2fe851f..517b8dc244 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit 59b2fe851f8e0595f6c830b90ee766f4f1c17a0f +Subproject commit 517b8dc244abf3e56f0089849e464af76f70b94e diff --git a/vendor/revisions.json b/vendor/revisions.json index f85cec3a0b..8dde46a01e 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,15 +1,15 @@ { "v17": [ "17.4", - "59b2fe851f8e0595f6c830b90ee766f4f1c17a0f" + "517b8dc244abf3e56f0089849e464af76f70b94e" ], "v16": [ "16.8", - "261ed10e9b8c8dda01ad7aefb18e944e30aa161d" + "799e7a08dd171aa06a7395dd326f4243aaeb9f93" ], "v15": [ "15.12", - "6ff50443773b69749e16da6db9d4f4b19064b4b7" + "9b118b1cffa6e4ca0d63389b57b54d11e207e9a8" ], "v14": [ "14.17",