diff --git a/.github/actionlint.yml b/.github/actionlint.yml index 1e6c2d0aa2..39a30d9a39 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -32,3 +32,6 @@ config-variables: - NEON_DEV_AWS_ACCOUNT_ID - NEON_PROD_AWS_ACCOUNT_ID - AWS_ECR_REGION + - BENCHMARK_LARGE_OLTP_PROJECTID + - SLACK_ON_CALL_DEVPROD_STREAM + - SLACK_RUST_CHANNEL_ID diff --git a/.github/actions/neon-branch-create/action.yml b/.github/actions/neon-branch-create/action.yml index 9f752d5a89..71dd6f3af2 100644 --- a/.github/actions/neon-branch-create/action.yml +++ b/.github/actions/neon-branch-create/action.yml @@ -84,7 +84,13 @@ runs: --header "Authorization: Bearer ${API_KEY}" ) - role_name=$(echo $roles | jq --raw-output '.roles[] | select(.protected == false) | .name') + role_name=$(echo "$roles" | jq --raw-output ' + (.roles | map(select(.protected == false))) as $roles | + if any($roles[]; .name == "neondb_owner") + then "neondb_owner" + else $roles[0].name + end + ') echo "role_name=${role_name}" >> $GITHUB_OUTPUT env: API_HOST: ${{ inputs.api_host }} @@ -107,13 +113,13 @@ runs: ) if [ -z "${reset_password}" ]; then - sleep 1 + sleep $i continue fi password=$(echo $reset_password | jq --raw-output '.role.password') if [ "${password}" == "null" ]; then - sleep 1 + sleep $i # increasing backoff continue fi diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 122fe48b68..fa6f882161 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -44,6 +44,11 @@ inputs: description: 'Postgres version to use for tests' required: false default: 'v16' + sanitizers: + description: 'enabled or disabled' + required: false + default: 'disabled' + type: string benchmark_durations: description: 'benchmark durations JSON' required: false @@ -59,7 +64,7 @@ runs: if: inputs.build_type != 'remote' uses: ./.github/actions/download with: - name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build_type }}-artifact + name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build_type }}${{ inputs.sanitizers == 'enabled' && '-sanitized' || '' }}-artifact path: /tmp/neon aws-oicd-role-arn: ${{ inputs.aws-oicd-role-arn }} @@ -112,6 +117,7 @@ runs: ALLOW_FORWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'forward compatibility breakage') RERUN_FAILED: ${{ inputs.rerun_failed }} PG_VERSION: ${{ inputs.pg_version }} + SANITIZERS: ${{ inputs.sanitizers }} shell: bash -euxo pipefail {0} run: | # PLATFORM will be embedded in the perf test report diff --git a/scripts/generate_image_maps.py b/.github/scripts/generate_image_maps.py similarity index 100% rename from scripts/generate_image_maps.py rename to .github/scripts/generate_image_maps.py diff --git a/.github/scripts/previous-releases.jq b/.github/scripts/previous-releases.jq new file mode 100644 index 0000000000..b0b00bce18 --- /dev/null +++ b/.github/scripts/previous-releases.jq @@ -0,0 +1,25 @@ +# Expects response from https://docs.github.com/en/rest/releases/releases?apiVersion=2022-11-28#list-releases as input, +# with tag names `release` for storage, `release-compute` for compute and `release-proxy` for proxy releases. 
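+#
+# For example (tag numbers below are illustrative only, not taken from this PR), releases tagged
+# `release-8000`, `release-proxy-7000` and `release-compute-9000` would make this script print:
+#
+#   storage=release-8000
+#   proxy=release-proxy-7000
+#   compute=release-compute-9000
+#
+# The calling workflow pipes these lines into "$GITHUB_OUTPUT", so each key becomes a step output.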
+# Extract only the `tag_name` field from each release object
+[ .[].tag_name ]
+
+# Transform each tag name into a structured object using regex capture
+| reduce map(
+    capture("^(?<full>release(-(?<component>proxy|compute))?-(?<version>\\d+))$")
+    | {
+        component: (.component // "storage"), # Default to "storage" if no component is specified
+        version: (.version | tonumber), # Convert the version number to an integer
+        full: .full # Store the full tag name for final output
+      }
+  )[] as $entry # Loop over the transformed list
+
+# Accumulate the latest (highest-numbered) version for each component
+({};
+ .[$entry.component] |= (if . == null or $entry.version > .version then $entry else . end))
+
+# Convert the resulting object into an array of formatted strings
+| to_entries
+| map("\(.key)=\(.value.full)")
+
+# Output each string separately
+| .[]
diff --git a/scripts/push_with_image_map.py b/.github/scripts/push_with_image_map.py
similarity index 100%
rename from scripts/push_with_image_map.py
rename to .github/scripts/push_with_image_map.py
diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml
index 30fde127b0..6a2070424a 100644
--- a/.github/workflows/_build-and-test-locally.yml
+++ b/.github/workflows/_build-and-test-locally.yml
@@ -280,7 +280,7 @@ jobs:
       - name: Upload Neon artifact
         uses: ./.github/actions/upload
         with:
-          name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-artifact
+          name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}${{ inputs.sanitizers == 'enabled' && '-sanitized' || '' }}-artifact
           path: /tmp/neon
           aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}

@@ -347,6 +347,7 @@ jobs:
          real_s3_region: eu-central-1
          rerun_failed: true
          pg_version: ${{ matrix.pg_version }}
+         sanitizers: ${{ inputs.sanitizers }}
          aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
          # `--session-timeout` is equal to (timeout-minutes - 10 minutes) * 60 seconds.
          # Attempt to stop tests gracefully to generate test reports
@@ -359,7 +360,6 @@ jobs:
          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
          PAGESERVER_GET_VECTORED_CONCURRENT_IO: sidecar-task
          USE_LFC: ${{ matrix.lfc_state == 'with-lfc' && 'true' || 'false' }}
-         SANITIZERS: ${{ inputs.sanitizers }}

      # Temporary disable this step until we figure out why it's so flaky
      # Ref https://github.com/neondatabase/neon/issues/4540
diff --git a/.github/workflows/_meta.yml b/.github/workflows/_meta.yml
new file mode 100644
index 0000000000..a3fc125648
--- /dev/null
+++ b/.github/workflows/_meta.yml
@@ -0,0 +1,107 @@
+name: Generate run metadata
+on:
+  workflow_call:
+    inputs:
+      github-event-name:
+        type: string
+        required: true
+    outputs:
+      build-tag:
+        description: "Tag for the current workflow run"
+        value: ${{ jobs.tags.outputs.build-tag }}
+      previous-storage-release:
+        description: "Tag of the last storage release"
+        value: ${{ jobs.tags.outputs.storage }}
+      previous-proxy-release:
+        description: "Tag of the last proxy release"
+        value: ${{ jobs.tags.outputs.proxy }}
+      previous-compute-release:
+        description: "Tag of the last compute release"
+        value: ${{ jobs.tags.outputs.compute }}
+      run-kind:
+        description: "The kind of run we're currently in.
Will be one of `push-main`, `storage-release`, `compute-release`, `proxy-release`, `storage-rc-pr`, `compute-rc-pr`, `proxy-rc-pr`, `pr`, or `workflow-dispatch`" + value: ${{ jobs.tags.outputs.run-kind }} + +permissions: {} + +jobs: + tags: + runs-on: ubuntu-22.04 + outputs: + build-tag: ${{ steps.build-tag.outputs.tag }} + compute: ${{ steps.previous-releases.outputs.compute }} + proxy: ${{ steps.previous-releases.outputs.proxy }} + storage: ${{ steps.previous-releases.outputs.storage }} + run-kind: ${{ steps.run-kind.outputs.run-kind }} + permissions: + contents: read + steps: + # Need `fetch-depth: 0` to count the number of commits in the branch + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Get run kind + id: run-kind + env: + RUN_KIND: >- + ${{ + false + || (inputs.github-event-name == 'push' && github.ref_name == 'main') && 'push-main' + || (inputs.github-event-name == 'push' && github.ref_name == 'release') && 'storage-release' + || (inputs.github-event-name == 'push' && github.ref_name == 'release-compute') && 'compute-release' + || (inputs.github-event-name == 'push' && github.ref_name == 'release-proxy') && 'proxy-release' + || (inputs.github-event-name == 'pull_request' && github.base_ref == 'release') && 'storage-rc-pr' + || (inputs.github-event-name == 'pull_request' && github.base_ref == 'release-compute') && 'compute-rc-pr' + || (inputs.github-event-name == 'pull_request' && github.base_ref == 'release-proxy') && 'proxy-rc-pr' + || (inputs.github-event-name == 'pull_request') && 'pr' + || (inputs.github-event-name == 'workflow_dispatch') && 'workflow-dispatch' + || 'unknown' + }} + run: | + echo "run-kind=$RUN_KIND" | tee -a $GITHUB_OUTPUT + + - name: Get build tag + id: build-tag + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + CURRENT_BRANCH: ${{ github.head_ref || github.ref_name }} + CURRENT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} + RUN_KIND: ${{ steps.run-kind.outputs.run-kind }} + run: | + case $RUN_KIND in + push-main) + echo "tag=$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT + ;; + storage-release) + echo "tag=release-$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT + ;; + proxy-release) + echo "tag=release-proxy-$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT + ;; + compute-release) + echo "tag=release-compute-$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT + ;; + pr|storage-rc-pr|compute-rc-pr|proxy-rc-pr) + BUILD_AND_TEST_RUN_ID=$(gh run list -b $CURRENT_BRANCH -c $CURRENT_SHA -w 'Build and Test' -L 1 --json databaseId --jq '.[].databaseId') + echo "tag=$BUILD_AND_TEST_RUN_ID" | tee -a $GITHUB_OUTPUT + ;; + workflow-dispatch) + echo "tag=$GITHUB_RUN_ID" | tee -a $GITHUB_OUTPUT + ;; + *) + echo "Unexpected RUN_KIND ('${RUN_KIND}'), failing to assign build-tag!" 
+ exit 1 + esac + + - name: Get the previous release-tags + id: previous-releases + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + gh api --paginate \ + -H "Accept: application/vnd.github+json" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + "/repos/${GITHUB_REPOSITORY}/releases" \ + | jq -f .github/scripts/previous-releases.jq -r \ + | tee -a "${GITHUB_OUTPUT}" diff --git a/.github/workflows/_push-to-container-registry.yml b/.github/workflows/_push-to-container-registry.yml index 403d078988..2dab665f40 100644 --- a/.github/workflows/_push-to-container-registry.yml +++ b/.github/workflows/_push-to-container-registry.yml @@ -51,7 +51,7 @@ jobs: steps: - uses: actions/checkout@v4 with: - sparse-checkout: scripts/push_with_image_map.py + sparse-checkout: .github/scripts/push_with_image_map.py sparse-checkout-cone-mode: false - name: Print image-map @@ -99,6 +99,6 @@ jobs: password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - name: Copy docker images to target registries - run: python scripts/push_with_image_map.py + run: python3 .github/scripts/push_with_image_map.py env: IMAGE_MAP: ${{ inputs.image-map }} diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index b36ac46f35..ff7db02e42 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -140,6 +140,9 @@ jobs: --ignore test_runner/performance/test_logical_replication.py --ignore test_runner/performance/test_physical_replication.py --ignore test_runner/performance/test_perf_ingest_using_pgcopydb.py + --ignore test_runner/performance/test_cumulative_statistics_persistence.py + --ignore test_runner/performance/test_perf_many_relations.py + --ignore test_runner/performance/test_perf_oltp_large_tenant.py env: BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -171,6 +174,61 @@ jobs: env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} + cumstats-test: + if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} + permissions: + contents: write + statuses: write + id-token: write # aws-actions/configure-aws-credentials + env: + POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install + DEFAULT_PG_VERSION: 17 + TEST_OUTPUT: /tmp/test_output + BUILD_TYPE: remote + SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} + PLATFORM: "neon-staging" + + runs-on: [ self-hosted, us-east-2, x64 ] + container: + image: neondatabase/build-tools:pinned-bookworm + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + options: --init + + steps: + - uses: actions/checkout@v4 + + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: eu-central-1 + role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + role-duration-seconds: 18000 # 5 hours + + - name: Download Neon artifact + uses: ./.github/actions/download + with: + name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact + path: /tmp/neon/ + prefix: latest + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + + - name: Verify that cumulative statistics are preserved + uses: ./.github/actions/run-python-test-set + with: + build_type: ${{ env.BUILD_TYPE }} + test_selection: performance/test_cumulative_statistics_persistence.py + run_in_parallel: false + save_perf_report: ${{ env.SAVE_PERF_REPORT }} + extra_params: -m remote_cluster --timeout 3600 + pg_version: 
${{ env.DEFAULT_PG_VERSION }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + env: + VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" + PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" + NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }} + replication-tests: if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} permissions: @@ -398,6 +456,9 @@ jobs: runs-on: ${{ matrix.runner }} container: image: ${{ matrix.image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} options: --init # Increase timeout to 8h, default timeout is 6h diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 1b706b3f16..197b83fac4 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -65,38 +65,11 @@ jobs: token: ${{ secrets.GITHUB_TOKEN }} filters: .github/file-filters.yaml - tag: + meta: needs: [ check-permissions ] - runs-on: [ self-hosted, small ] - container: ${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/base:pinned - outputs: - build-tag: ${{steps.build-tag.outputs.tag}} - - steps: - # Need `fetch-depth: 0` to count the number of commits in the branch - - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Get build tag - run: | - echo run:$GITHUB_RUN_ID - echo ref:$GITHUB_REF_NAME - echo rev:$(git rev-list --count HEAD) - if [[ "$GITHUB_REF_NAME" == "main" ]]; then - echo "tag=$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT - elif [[ "$GITHUB_REF_NAME" == "release" ]]; then - echo "tag=release-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT - elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then - echo "tag=release-proxy-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT - elif [[ "$GITHUB_REF_NAME" == "release-compute" ]]; then - echo "tag=release-compute-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT - else - echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release', 'release-proxy', 'release-compute'" - echo "tag=$GITHUB_RUN_ID" >> $GITHUB_OUTPUT - fi - shell: bash - id: build-tag + uses: ./.github/workflows/_meta.yml + with: + github-event-name: ${{ github.event_name }} build-build-tools-image: needs: [ check-permissions ] @@ -199,7 +172,7 @@ jobs: secrets: inherit build-and-test-locally: - needs: [ tag, build-build-tools-image ] + needs: [ meta, build-build-tools-image ] strategy: fail-fast: false matrix: @@ -213,7 +186,7 @@ jobs: with: arch: ${{ matrix.arch }} build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm - build-tag: ${{ needs.tag.outputs.build-tag }} + build-tag: ${{ needs.meta.outputs.build-tag }} build-type: ${{ matrix.build-type }} # Run tests on all Postgres versions in release builds and only on the latest version in debug builds. # Run without LFC on v17 release and debug builds only. For all the other cases LFC is enabled. 
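For readers unfamiliar with reusable workflows, the sketch below is a minimal, hypothetical caller (not part of this PR) showing the pattern `build_and_test.yml` now uses: invoke `./.github/workflows/_meta.yml` once as a `meta` job, then read its outputs through `needs.meta.outputs.*`, optionally combined with the `cond && a || b` expression idiom that appears throughout this diff.

```yaml
# Minimal illustrative consumer of the new _meta.yml reusable workflow (hypothetical, not part of this PR).
name: meta-consumer-example
on: [push, pull_request]

jobs:
  meta:
    uses: ./.github/workflows/_meta.yml
    with:
      github-event-name: ${{ github.event_name }}

  use-meta:
    needs: [ meta ]
    runs-on: ubuntu-22.04
    steps:
      - name: Show run metadata
        env:
          BUILD_TAG: ${{ needs.meta.outputs.build-tag }}
          RUN_KIND: ${{ needs.meta.outputs.run-kind }}
          # `cond && a || b` behaves like if/else here because '-pr' is a truthy value.
          SUFFIX: ${{ needs.meta.outputs.run-kind == 'pr' && '-pr' || '' }}
        run: |
          echo "build tag: ${BUILD_TAG}"
          echo "run kind:  ${RUN_KIND}"
          echo "suffix:    '${SUFFIX}'"
```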
@@ -497,13 +470,24 @@ jobs: }) trigger-e2e-tests: - if: ${{ !github.event.pull_request.draft || contains( github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft') || github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' }} - needs: [ check-permissions, push-neon-image-dev, push-compute-image-dev, tag ] + # Depends on jobs that can get skipped + if: >- + ${{ + ( + !github.event.pull_request.draft + || contains( github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft') + || contains(fromJSON('["push-main", "storage-release", "proxy-release", "compute-release"]'), needs.meta.outputs.run-kind) + ) && !failure() && !cancelled() + }} + needs: [ check-permissions, push-neon-image-dev, push-compute-image-dev, meta ] uses: ./.github/workflows/trigger-e2e-tests.yml + with: + github-event-name: ${{ github.event_name }} secrets: inherit neon-image-arch: - needs: [ check-permissions, build-build-tools-image, tag ] + needs: [ check-permissions, build-build-tools-image, meta ] + if: ${{ contains(fromJSON('["push-main", "pr", "storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) }} strategy: matrix: arch: [ x64, arm64 ] @@ -539,7 +523,7 @@ jobs: build-args: | ADDITIONAL_RUSTFLAGS=${{ matrix.arch == 'arm64' && '-Ctarget-feature=+lse -Ctarget-cpu=neoverse-n1' || '' }} GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} - BUILD_TAG=${{ needs.tag.outputs.build-tag }} + BUILD_TAG=${{ needs.meta.outputs.build-tag }} TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-bookworm DEBIAN_VERSION=bookworm provenance: false @@ -549,10 +533,11 @@ jobs: cache-from: type=registry,ref=cache.neon.build/neon:cache-bookworm-${{ matrix.arch }} cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon:cache-{0}-{1},mode=max', 'bookworm', matrix.arch) || '' }} tags: | - neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm-${{ matrix.arch }} + neondatabase/neon:${{ needs.meta.outputs.build-tag }}-bookworm-${{ matrix.arch }} neon-image: - needs: [ neon-image-arch, tag ] + needs: [ neon-image-arch, meta ] + if: ${{ contains(fromJSON('["push-main", "pr", "storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) }} runs-on: ubuntu-22.04 permissions: id-token: write # aws-actions/configure-aws-credentials @@ -567,13 +552,14 @@ jobs: - name: Create multi-arch image run: | - docker buildx imagetools create -t neondatabase/neon:${{ needs.tag.outputs.build-tag }} \ - -t neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm \ - neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm-x64 \ - neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm-arm64 + docker buildx imagetools create -t neondatabase/neon:${{ needs.meta.outputs.build-tag }} \ + -t neondatabase/neon:${{ needs.meta.outputs.build-tag }}-bookworm \ + neondatabase/neon:${{ needs.meta.outputs.build-tag }}-bookworm-x64 \ + neondatabase/neon:${{ needs.meta.outputs.build-tag }}-bookworm-arm64 compute-node-image-arch: - needs: [ check-permissions, build-build-tools-image, tag ] + needs: [ check-permissions, build-build-tools-image, meta ] + if: ${{ contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} permissions: id-token: write # aws-actions/configure-aws-credentials statuses: write @@ -631,7 +617,7 @@ jobs: build-args: | GIT_VERSION=${{ 
github.event.pull_request.head.sha || github.sha }} PG_VERSION=${{ matrix.version.pg }} - BUILD_TAG=${{ needs.tag.outputs.build-tag }} + BUILD_TAG=${{ needs.meta.outputs.build-tag }} TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-${{ matrix.version.debian }} DEBIAN_VERSION=${{ matrix.version.debian }} provenance: false @@ -641,7 +627,7 @@ jobs: cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }} cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-node-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }} tags: | - neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-${{ matrix.arch }} + neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-${{ matrix.arch }} - name: Build neon extensions test image if: matrix.version.pg >= 'v16' @@ -651,7 +637,7 @@ jobs: build-args: | GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} PG_VERSION=${{ matrix.version.pg }} - BUILD_TAG=${{ needs.tag.outputs.build-tag }} + BUILD_TAG=${{ needs.meta.outputs.build-tag }} TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-${{ matrix.version.debian }} DEBIAN_VERSION=${{ matrix.version.debian }} provenance: false @@ -661,10 +647,11 @@ jobs: target: extension-tests cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }} tags: | - neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{needs.tag.outputs.build-tag}}-${{ matrix.version.debian }}-${{ matrix.arch }} + neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{needs.meta.outputs.build-tag}}-${{ matrix.version.debian }}-${{ matrix.arch }} compute-node-image: - needs: [ compute-node-image-arch, tag ] + needs: [ compute-node-image-arch, meta ] + if: ${{ contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} permissions: id-token: write # aws-actions/configure-aws-credentials statuses: write @@ -692,27 +679,28 @@ jobs: - name: Create multi-arch compute-node image run: | - docker buildx imagetools create -t neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \ - -t neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }} \ - neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \ - neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64 + docker buildx imagetools create -t neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} \ + -t neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }} \ + neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \ + neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-arm64 - name: Create multi-arch neon-test-extensions image if: matrix.version.pg >= 'v16' run: | - docker buildx imagetools create -t neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \ - -t 
neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }} \ - neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \ - neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64 + docker buildx imagetools create -t neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} \ + -t neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }} \ + neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \ + neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-arm64 - vm-compute-node-image: - needs: [ check-permissions, tag, compute-node-image ] - runs-on: [ self-hosted, large ] + vm-compute-node-image-arch: + needs: [ check-permissions, meta, compute-node-image ] + if: ${{ contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} + runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} strategy: fail-fast: false matrix: + arch: [ amd64, arm64 ] version: - # see the comment for `compute-node-image-arch` job - pg: v14 debian: bullseye - pg: v15 @@ -722,14 +710,14 @@ jobs: - pg: v17 debian: bookworm env: - VM_BUILDER_VERSION: v0.37.1 + VM_BUILDER_VERSION: v0.42.2 steps: - uses: actions/checkout@v4 - name: Downloading vm-builder run: | - curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder -o vm-builder + curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder-${{ matrix.arch }} -o vm-builder chmod +x vm-builder - uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193 @@ -742,22 +730,50 @@ jobs: # it won't have the proper authentication (written at v0.6.0) - name: Pulling compute-node image run: | - docker pull neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} + docker pull neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} - name: Build vm image run: | ./vm-builder \ -size=2G \ -spec=compute/vm-image-spec-${{ matrix.version.debian }}.yaml \ - -src=neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \ - -dst=neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} + -src=neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} \ + -dst=neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.arch }} \ + -target-arch=linux/${{ matrix.arch }} - name: Pushing vm-compute-node image run: | - docker push neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} + docker push neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.arch }} + + vm-compute-node-image: + needs: [ vm-compute-node-image-arch, meta ] + if: ${{ contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} + runs-on: ubuntu-22.04 + strategy: + matrix: + version: + # see the comment for `compute-node-image-arch` job + - 
pg: v14 + - pg: v15 + - pg: v16 + - pg: v17 + steps: + - uses: docker/login-action@v3 + with: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + + - name: Create multi-arch compute-node image + run: | + docker buildx imagetools create -t neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} \ + neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-amd64 \ + neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-arm64 + test-images: - needs: [ check-permissions, tag, neon-image, compute-node-image ] + needs: [ check-permissions, meta, neon-image, compute-node-image ] + # Depends on jobs that can get skipped + if: "!failure() && !cancelled()" strategy: fail-fast: false matrix: @@ -775,17 +791,6 @@ jobs: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - - name: Get the last compute release tag - id: get-last-compute-release-tag - env: - GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} - run: | - tag=$(gh api -q '[.[].tag_name | select(startswith("release-compute"))][0]'\ - -H "Accept: application/vnd.github+json" \ - -H "X-GitHub-Api-Version: 2022-11-28" \ - "/repos/${{ github.repository }}/releases") - echo tag=${tag} >> ${GITHUB_OUTPUT} - # `neondatabase/neon` contains multiple binaries, all of them use the same input for the version into the same version formatting library. # Pick pageserver as currently the only binary with extra "version" features printed in the string to verify. # Regular pageserver version string looks like @@ -795,8 +800,9 @@ jobs: # Ensure that we don't have bad versions. - name: Verify image versions shell: bash # ensure no set -e for better error messages + if: ${{ contains(fromJSON('["push-main", "pr", "storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) }} run: | - pageserver_version=$(docker run --rm neondatabase/neon:${{ needs.tag.outputs.build-tag }} "/bin/sh" "-c" "/usr/local/bin/pageserver --version") + pageserver_version=$(docker run --rm neondatabase/neon:${{ needs.meta.outputs.build-tag }} "/bin/sh" "-c" "/usr/local/bin/pageserver --version") echo "Pageserver version string: $pageserver_version" @@ -813,7 +819,24 @@ jobs: - name: Verify docker-compose example and test extensions timeout-minutes: 20 env: - TAG: ${{needs.tag.outputs.build-tag}} + TAG: >- + ${{ + contains(fromJSON('["compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) + && needs.meta.outputs.previous-storage-release + || needs.meta.outputs.build-tag + }} + COMPUTE_TAG: >- + ${{ + contains(fromJSON('["storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) + && needs.meta.outputs.previous-compute-release + || needs.meta.outputs.build-tag + }} + TEST_EXTENSIONS_TAG: >- + ${{ + contains(fromJSON('["storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) + && 'latest' + || needs.meta.outputs.build-tag + }} TEST_VERSION_ONLY: ${{ matrix.pg_version }} run: ./docker-compose/docker_compose_test.sh @@ -825,10 +848,17 @@ jobs: - name: Test extension upgrade timeout-minutes: 20 - if: ${{ needs.tag.outputs.build-tag == github.run_id }} + if: ${{ contains(fromJSON('["pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} env: - NEWTAG: ${{ needs.tag.outputs.build-tag }} - OLDTAG: ${{ steps.get-last-compute-release-tag.outputs.tag }} + TAG: >- + ${{ + false + 
|| needs.meta.outputs.run-kind == 'pr' && needs.meta.outputs.build-tag + || needs.meta.outputs.run-kind == 'compute-rc-pr' && needs.meta.outputs.previous-storage-release + }} + TEST_EXTENSIONS_TAG: ${{ needs.meta.outputs.previous-compute-release }} + NEW_COMPUTE_TAG: ${{ needs.meta.outputs.build-tag }} + OLD_COMPUTE_TAG: ${{ needs.meta.outputs.previous-compute-release }} run: ./docker-compose/test_extensions_upgrade.sh - name: Print logs and clean up @@ -838,7 +868,7 @@ jobs: docker compose --profile test-extensions -f ./docker-compose/docker-compose.yml down generate-image-maps: - needs: [ tag ] + needs: [ meta ] runs-on: ubuntu-22.04 outputs: neon-dev: ${{ steps.generate.outputs.neon-dev }} @@ -848,14 +878,14 @@ jobs: steps: - uses: actions/checkout@v4 with: - sparse-checkout: scripts/generate_image_maps.py + sparse-checkout: .github/scripts/generate_image_maps.py sparse-checkout-cone-mode: false - name: Generate Image Maps id: generate - run: python scripts/generate_image_maps.py + run: python3 .github/scripts/generate_image_maps.py env: - BUILD_TAG: "${{ needs.tag.outputs.build-tag }}" + BUILD_TAG: "${{ needs.meta.outputs.build-tag }}" BRANCH: "${{ github.ref_name }}" DEV_ACR: "${{ vars.AZURE_DEV_REGISTRY_NAME }}" PROD_ACR: "${{ vars.AZURE_PROD_REGISTRY_NAME }}" @@ -864,7 +894,8 @@ jobs: AWS_REGION: "${{ vars.AWS_ECR_REGION }}" push-neon-image-dev: - needs: [ generate-image-maps, neon-image ] + needs: [ meta, generate-image-maps, neon-image ] + if: ${{ contains(fromJSON('["push-main", "pr", "storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) }} uses: ./.github/workflows/_push-to-container-registry.yml permissions: id-token: write # Required for aws/azure login @@ -881,7 +912,8 @@ jobs: secrets: inherit push-compute-image-dev: - needs: [ generate-image-maps, vm-compute-node-image ] + needs: [ meta, generate-image-maps, vm-compute-node-image ] + if: ${{ contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} uses: ./.github/workflows/_push-to-container-registry.yml permissions: id-token: write # Required for aws/azure login @@ -898,8 +930,9 @@ jobs: secrets: inherit push-neon-image-prod: - if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' - needs: [ generate-image-maps, neon-image, test-images ] + needs: [ meta, generate-image-maps, neon-image, test-images ] + # Depends on jobs that can get skipped + if: ${{ !failure() && !cancelled() && contains(fromJSON('["storage-release", "proxy-release"]'), needs.meta.outputs.run-kind) }} uses: ./.github/workflows/_push-to-container-registry.yml permissions: id-token: write # Required for aws/azure login @@ -916,8 +949,9 @@ jobs: secrets: inherit push-compute-image-prod: - if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' - needs: [ generate-image-maps, vm-compute-node-image, test-images ] + needs: [ meta, generate-image-maps, vm-compute-node-image, test-images ] + # Depends on jobs that can get skipped + if: ${{ !failure() && !cancelled() && needs.meta.outputs.run-kind == 'compute-release' }} uses: ./.github/workflows/_push-to-container-registry.yml permissions: id-token: write # Required for aws/azure login @@ -936,18 +970,19 @@ jobs: # This is a bit of a special case so we're not using a generated image map. 
add-latest-tag-to-neon-extensions-test-image: if: github.ref_name == 'main' - needs: [ tag, compute-node-image ] + needs: [ meta, compute-node-image ] uses: ./.github/workflows/_push-to-container-registry.yml with: image-map: | { - "docker.io/neondatabase/neon-test-extensions-v16:${{ needs.tag.outputs.build-tag }}": ["docker.io/neondatabase/neon-test-extensions-v16:latest"], - "docker.io/neondatabase/neon-test-extensions-v17:${{ needs.tag.outputs.build-tag }}": ["docker.io/neondatabase/neon-test-extensions-v17:latest"] + "docker.io/neondatabase/neon-test-extensions-v16:${{ needs.meta.outputs.build-tag }}": ["docker.io/neondatabase/neon-test-extensions-v16:latest"], + "docker.io/neondatabase/neon-test-extensions-v17:${{ needs.meta.outputs.build-tag }}": ["docker.io/neondatabase/neon-test-extensions-v17:latest"] } secrets: inherit trigger-custom-extensions-build-and-wait: - needs: [ check-permissions, tag ] + needs: [ check-permissions, meta ] + if: ${{ contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} runs-on: ubuntu-22.04 permissions: id-token: write # aws-actions/configure-aws-credentials @@ -982,7 +1017,7 @@ jobs: \"ci_job_name\": \"build-and-upload-extensions\", \"commit_hash\": \"$COMMIT_SHA\", \"remote_repo\": \"${{ github.repository }}\", - \"compute_image_tag\": \"${{ needs.tag.outputs.build-tag }}\", + \"compute_image_tag\": \"${{ needs.meta.outputs.build-tag }}\", \"remote_branch_name\": \"${{ github.ref_name }}\" } }" @@ -1026,9 +1061,9 @@ jobs: exit 1 deploy: - needs: [ check-permissions, push-neon-image-prod, push-compute-image-prod, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait ] - # `!failure() && !cancelled()` is required because the workflow depends on the job that can be skipped: `push-to-acr-dev` and `push-to-acr-prod` - if: (github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute') && !failure() && !cancelled() + needs: [ check-permissions, push-neon-image-dev, push-compute-image-dev, push-neon-image-prod, push-compute-image-prod, meta, build-and-test-locally, trigger-custom-extensions-build-and-wait ] + # `!failure() && !cancelled()` is required because the workflow depends on the job that can be skipped: `push-neon-image-prod` and `push-compute-image-prod` + if: ${{ contains(fromJSON('["push-main", "storage-release", "proxy-release", "compute-release"]'), needs.meta.outputs.run-kind) && !failure() && !cancelled() }} permissions: id-token: write # aws-actions/configure-aws-credentials statuses: write @@ -1039,108 +1074,103 @@ jobs: - uses: actions/checkout@v4 - name: Create git tag and GitHub release - if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' + if: ${{ contains(fromJSON('["storage-release", "proxy-release", "compute-release"]'), needs.meta.outputs.run-kind) }} uses: actions/github-script@v7 + env: + TAG: "${{ needs.meta.outputs.build-tag }}" + BRANCH: "${{ github.ref_name }}" + PREVIOUS_RELEASE: >- + ${{ + false + || needs.meta.outputs.run-kind == 'storage-release' && needs.meta.outputs.previous-storage-release + || needs.meta.outputs.run-kind == 'proxy-release' && needs.meta.outputs.previous-proxy-release + || needs.meta.outputs.run-kind == 'compute-release' && needs.meta.outputs.previous-compute-release + || 'unknown' + }} with: retries: 5 script: | - const tag = "${{ needs.tag.outputs.build-tag }}"; - const branch = "${{ github.ref_name 
}}"; + const { TAG, BRANCH, PREVIOUS_RELEASE } = process.env try { const existingRef = await github.rest.git.getRef({ owner: context.repo.owner, repo: context.repo.repo, - ref: `tags/${tag}`, + ref: `tags/${TAG}`, }); if (existingRef.data.object.sha !== context.sha) { - throw new Error(`Tag ${tag} already exists but points to a different commit (expected: ${context.sha}, actual: ${existingRef.data.object.sha}).`); + throw new Error(`Tag ${TAG} already exists but points to a different commit (expected: ${context.sha}, actual: ${existingRef.data.object.sha}).`); } - console.log(`Tag ${tag} already exists and points to ${context.sha} as expected.`); + console.log(`Tag ${TAG} already exists and points to ${context.sha} as expected.`); } catch (error) { if (error.status !== 404) { throw error; } - console.log(`Tag ${tag} does not exist. Creating it...`); + console.log(`Tag ${TAG} does not exist. Creating it...`); await github.rest.git.createRef({ owner: context.repo.owner, repo: context.repo.repo, - ref: `refs/tags/${tag}`, + ref: `refs/tags/${TAG}`, sha: context.sha, }); - console.log(`Tag ${tag} created successfully.`); + console.log(`Tag ${TAG} created successfully.`); } try { const existingRelease = await github.rest.repos.getReleaseByTag({ owner: context.repo.owner, repo: context.repo.repo, - tag: tag, + tag: TAG, }); - console.log(`Release for tag ${tag} already exists (ID: ${existingRelease.data.id}).`); + console.log(`Release for tag ${TAG} already exists (ID: ${existingRelease.data.id}).`); } catch (error) { if (error.status !== 404) { throw error; } - console.log(`Release for tag ${tag} does not exist. Creating it...`); + console.log(`Release for tag ${TAG} does not exist. Creating it...`); // Find the PR number using the commit SHA const pullRequests = await github.rest.pulls.list({ owner: context.repo.owner, repo: context.repo.repo, state: 'closed', - base: branch, + base: BRANCH, }); const pr = pullRequests.data.find(pr => pr.merge_commit_sha === context.sha); const prNumber = pr ? pr.number : null; - // Find the previous release on the branch - const releases = await github.rest.repos.listReleases({ - owner: context.repo.owner, - repo: context.repo.repo, - per_page: 100, - }); - - const branchReleases = releases.data - .filter((release) => { - const regex = new RegExp(`^${branch}-\\d+$`); - return regex.test(release.tag_name) && !release.draft && !release.prerelease; - }) - .sort((a, b) => new Date(b.created_at) - new Date(a.created_at)); - - const previousTag = branchReleases.length > 0 ? branchReleases[0].tag_name : null; - const releaseNotes = [ prNumber ? `Release PR https://github.com/${context.repo.owner}/${context.repo.repo}/pull/${prNumber}.` : 'Release PR not found.', - previousTag - ? 
`Diff with the previous release https://github.com/${context.repo.owner}/${context.repo.repo}/compare/${previousTag}...${tag}.` - : `No previous release found on branch ${branch}.`, + `Diff with the previous release https://github.com/${context.repo.owner}/${context.repo.repo}/compare/${PREVIOUS_RELEASE}...${TAG}.` ].join('\n\n'); await github.rest.repos.createRelease({ owner: context.repo.owner, repo: context.repo.repo, - tag_name: tag, + tag_name: TAG, body: releaseNotes, }); - console.log(`Release for tag ${tag} created successfully.`); + console.log(`Release for tag ${TAG} created successfully.`); } - name: Trigger deploy workflow env: GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} + RUN_KIND: ${{ needs.meta.outputs.run-kind }} run: | - if [[ "$GITHUB_REF_NAME" == "main" ]]; then - gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false - elif [[ "$GITHUB_REF_NAME" == "release" ]]; then + case ${RUN_KIND} in + push-main) + gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.meta.outputs.build-tag}} -f deployPreprodRegion=false + ;; + storage-release) gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main \ -f deployPgSniRouter=false \ -f deployProxy=false \ @@ -1148,7 +1178,7 @@ jobs: -f deployStorageBroker=true \ -f deployStorageController=true \ -f branch=main \ - -f dockerTag=${{needs.tag.outputs.build-tag}} \ + -f dockerTag=${{needs.meta.outputs.build-tag}} \ -f deployPreprodRegion=true gh workflow --repo neondatabase/infra run deploy-prod.yml --ref main \ @@ -1156,8 +1186,9 @@ jobs: -f deployStorageBroker=true \ -f deployStorageController=true \ -f branch=main \ - -f dockerTag=${{needs.tag.outputs.build-tag}} - elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then + -f dockerTag=${{needs.meta.outputs.build-tag}} + ;; + proxy-release) gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main \ -f deployPgSniRouter=true \ -f deployProxy=true \ @@ -1165,7 +1196,7 @@ jobs: -f deployStorageBroker=false \ -f deployStorageController=false \ -f branch=main \ - -f dockerTag=${{needs.tag.outputs.build-tag}} \ + -f dockerTag=${{needs.meta.outputs.build-tag}} \ -f deployPreprodRegion=true gh workflow --repo neondatabase/infra run deploy-proxy-prod.yml --ref main \ @@ -1175,13 +1206,16 @@ jobs: -f deployProxyScram=true \ -f deployProxyAuthBroker=true \ -f branch=main \ - -f dockerTag=${{needs.tag.outputs.build-tag}} - elif [[ "$GITHUB_REF_NAME" == "release-compute" ]]; then - gh workflow --repo neondatabase/infra run deploy-compute-dev.yml --ref main -f dockerTag=${{needs.tag.outputs.build-tag}} - else - echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main', 'release', 'release-proxy' or 'release-compute'" + -f dockerTag=${{needs.meta.outputs.build-tag}} + ;; + compute-release) + gh workflow --repo neondatabase/infra run deploy-compute-dev.yml --ref main -f dockerTag=${{needs.meta.outputs.build-tag}} + ;; + *) + echo "RUN_KIND (value '${RUN_KIND}') is not set to either 'push-main', 'storage-release', 'proxy-release' or 'compute-release'" exit 1 - fi + ;; + esac notify-storage-release-deploy-failure: needs: [ deploy ] @@ -1197,7 +1231,7 @@ jobs: payload: | channel: ${{ vars.SLACK_STORAGE_CHANNEL_ID }} text: | - ๐Ÿ”ด @oncall-storage: deploy job on release branch had unexpected status "${{ needs.deploy.result }}" <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub 
Run>. + ๐Ÿ”ด : deploy job on release branch had unexpected status "${{ needs.deploy.result }}" <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>. # The job runs on `release` branch and copies compatibility data and Neon artifact from the last *release PR* to the latest directory promote-compatibility-data: @@ -1206,7 +1240,7 @@ jobs: id-token: write # aws-actions/configure-aws-credentials statuses: write contents: read - # `!failure() && !cancelled()` is required because the workflow transitively depends on the job that can be skipped: `push-to-acr-dev` and `push-to-acr-prod` + # `!failure() && !cancelled()` is required because the workflow transitively depends on the job that can be skipped: `push-neon-image-prod` and `push-compute-image-prod` if: github.ref_name == 'release' && !failure() && !cancelled() runs-on: ubuntu-22.04 @@ -1296,7 +1330,8 @@ jobs: pin-build-tools-image: needs: [ build-build-tools-image, test-images, build-and-test-locally ] - if: github.ref_name == 'main' + # `!failure() && !cancelled()` is required because the job (transitively) depends on jobs that can be skipped + if: github.ref_name == 'main' && !failure() && !cancelled() uses: ./.github/workflows/pin-build-tools-image.yml with: from-tag: ${{ needs.build-build-tools-image.outputs.image-tag }} @@ -1315,6 +1350,7 @@ jobs: # Format `needs` differently to make the list more readable. # Usually we do `needs: [...]` needs: + - meta - build-and-test-locally - check-codestyle-python - check-codestyle-rust @@ -1338,7 +1374,7 @@ jobs: || needs.check-codestyle-python.result == 'skipped' || needs.check-codestyle-rust.result == 'skipped' || needs.files-changed.result == 'skipped' - || needs.push-compute-image-dev.result == 'skipped' - || needs.push-neon-image-dev.result == 'skipped' + || (needs.push-compute-image-dev.result == 'skipped' && contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind)) + || (needs.push-neon-image-dev.result == 'skipped' && contains(fromJSON('["push-main", "pr", "storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind)) || needs.test-images.result == 'skipped' - || needs.trigger-custom-extensions-build-and-wait.result == 'skipped' + || (needs.trigger-custom-extensions-build-and-wait.result == 'skipped' && contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind)) diff --git a/.github/workflows/cargo-deny.yml b/.github/workflows/cargo-deny.yml index 433b377c32..222f7e9787 100644 --- a/.github/workflows/cargo-deny.yml +++ b/.github/workflows/cargo-deny.yml @@ -7,7 +7,7 @@ on: required: false type: string schedule: - - cron: '0 0 * * *' + - cron: '0 10 * * *' jobs: cargo-deny: @@ -50,8 +50,9 @@ jobs: method: chat.postMessage token: ${{ secrets.SLACK_BOT_TOKEN }} payload: | - channel: ${{ vars.SLACK_CICD_CHANNEL_ID }} + channel: ${{ vars.SLACK_ON_CALL_DEVPROD_STREAM }} text: | Periodic cargo-deny on ${{ matrix.ref }}: ${{ job.status }} <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> - Pinging @oncall-devprod. + Fixing the problem should be fairly straight forward from the logs. If not, <#${{ vars.SLACK_RUST_CHANNEL_ID }}> is there to help. + Pinging . 
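A note on the Slack payloads above: the on-call group mentions use Slack's angle-bracket mention markup, which is easily stripped by renderers, which is why the text reads "Pinging ." here. The elided group IDs are left as-is; purely for reference, below is a minimal, hypothetical sketch of a chat.postMessage step that pings a user group. The channel variable matches the one this PR adds to actionlint.yml, while the action version and the subteam ID are assumptions/placeholders, not values taken from this PR.

```yaml
# Hypothetical sketch of a Slack notification step with a user-group mention.
# Assumptions: slackapi/slack-github-action v2-style inputs (method/token/payload)
# and a placeholder subteam ID -- substitute the real on-call group ID.
- name: Notify on-call
  if: failure()
  uses: slackapi/slack-github-action@v2
  with:
    method: chat.postMessage
    token: ${{ secrets.SLACK_BOT_TOKEN }}
    payload: |
      channel: ${{ vars.SLACK_ON_CALL_DEVPROD_STREAM }}
      text: |
        Periodic job failed: <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
        Pinging <!subteam^S0123ABCD|@oncall-devprod>.
```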
diff --git a/.github/workflows/cloud-regress.yml b/.github/workflows/cloud-regress.yml index 09d6acd325..606e1c0862 100644 --- a/.github/workflows/cloud-regress.yml +++ b/.github/workflows/cloud-regress.yml @@ -38,6 +38,9 @@ jobs: runs-on: us-east-2 container: image: neondatabase/build-tools:pinned-bookworm + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} options: --init steps: diff --git a/.github/workflows/force-test-extensions-upgrade.yml b/.github/workflows/force-test-extensions-upgrade.yml index 71c5158ef6..f2376306dc 100644 --- a/.github/workflows/force-test-extensions-upgrade.yml +++ b/.github/workflows/force-test-extensions-upgrade.yml @@ -52,8 +52,9 @@ jobs: - name: Test extension upgrade timeout-minutes: 20 env: - NEWTAG: latest - OLDTAG: ${{ steps.get-last-compute-release-tag.outputs.tag }} + NEW_COMPUTE_TAG: latest + OLD_COMPUTE_TAG: ${{ steps.get-last-compute-release-tag.outputs.tag }} + TEST_EXTENSIONS_TAG: ${{ steps.get-last-compute-release-tag.outputs.tag }} PG_VERSION: ${{ matrix.pg-version }} FORCE_ALL_UPGRADE_TESTS: true run: ./docker-compose/test_extensions_upgrade.sh diff --git a/.github/workflows/large_oltp_benchmark.yml b/.github/workflows/large_oltp_benchmark.yml new file mode 100644 index 0000000000..f33e11cd08 --- /dev/null +++ b/.github/workflows/large_oltp_benchmark.yml @@ -0,0 +1,147 @@ +name: large oltp benchmark + +on: + # uncomment to run on push for debugging your PR + push: + branches: [ bodobolero/synthetic_oltp_workload ] + + schedule: + # * is a special character in YAML so you have to quote this string + # โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ minute (0 - 59) + # โ”‚ โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ hour (0 - 23) + # โ”‚ โ”‚ โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ day of the month (1 - 31) + # โ”‚ โ”‚ โ”‚ โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ month (1 - 12 or JAN-DEC) + # โ”‚ โ”‚ โ”‚ โ”‚ โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ day of the week (0 - 6 or SUN-SAT) + - cron: '0 15 * * *' # run once a day, timezone is utc, avoid conflict with other benchmarks + workflow_dispatch: # adds ability to run this manually + +defaults: + run: + shell: bash -euxo pipefail {0} + +concurrency: + # Allow only one workflow globally because we need dedicated resources which only exist once + group: large-oltp-bench-workflow + cancel-in-progress: true + +jobs: + oltp: + strategy: + fail-fast: false # allow other variants to continue even if one fails + matrix: + include: + - target: new_branch + custom_scripts: insert_webhooks.sql@2 select_any_webhook_with_skew.sql@4 select_recent_webhook.sql@4 + - target: reuse_branch + custom_scripts: insert_webhooks.sql@2 select_any_webhook_with_skew.sql@4 select_recent_webhook.sql@4 + max-parallel: 1 # we want to run each stripe size sequentially to be able to compare the results + permissions: + contents: write + statuses: write + id-token: write # aws-actions/configure-aws-credentials + env: + TEST_PG_BENCH_DURATIONS_MATRIX: "1h" # todo update to > 1 h + TEST_PGBENCH_CUSTOM_SCRIPTS: ${{ matrix.custom_scripts }} + POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install + PG_VERSION: 16 # pre-determined by pre-determined project + TEST_OUTPUT: /tmp/test_output + BUILD_TYPE: remote + SAVE_PERF_REPORT: ${{ github.ref_name == 'main' }} + PLATFORM: ${{ matrix.target }} + + runs-on: [ self-hosted, us-east-2, x64 ] + container: + image: neondatabase/build-tools:pinned-bookworm + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ 
secrets.NEON_DOCKERHUB_PASSWORD }} + options: --init + + # Increase timeout to 8h, default timeout is 6h + timeout-minutes: 480 + + steps: + - uses: actions/checkout@v4 + + - name: Configure AWS credentials # necessary to download artefacts + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: eu-central-1 + role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + role-duration-seconds: 18000 # 5 hours is currently max associated with IAM role + + - name: Download Neon artifact + uses: ./.github/actions/download + with: + name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact + path: /tmp/neon/ + prefix: latest + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + + - name: Create Neon Branch for large tenant + if: ${{ matrix.target == 'new_branch' }} + id: create-neon-branch-oltp-target + uses: ./.github/actions/neon-branch-create + with: + project_id: ${{ vars.BENCHMARK_LARGE_OLTP_PROJECTID }} + api_key: ${{ secrets.NEON_STAGING_API_KEY }} + + - name: Set up Connection String + id: set-up-connstr + run: | + case "${{ matrix.target }}" in + new_branch) + CONNSTR=${{ steps.create-neon-branch-oltp-target.outputs.dsn }} + ;; + reuse_branch) + CONNSTR=${{ secrets.BENCHMARK_LARGE_OLTP_REUSE_CONNSTR }} + ;; + *) + echo >&2 "Unknown target=${{ matrix.target }}" + exit 1 + ;; + esac + + echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT + + - name: Benchmark pgbench with custom-scripts + uses: ./.github/actions/run-python-test-set + with: + build_type: ${{ env.BUILD_TYPE }} + test_selection: performance + run_in_parallel: false + save_perf_report: ${{ env.SAVE_PERF_REPORT }} + extra_params: -m remote_cluster --timeout 21600 -k test_perf_oltp_large_tenant + pg_version: ${{ env.PG_VERSION }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + env: + BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} + VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" + PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" + + - name: Delete Neon Branch for large tenant + if: ${{ always() && matrix.target == 'new_branch' }} + uses: ./.github/actions/neon-branch-delete + with: + project_id: ${{ vars.BENCHMARK_LARGE_OLTP_PROJECTID }} + branch_id: ${{ steps.create-neon-branch-oltp-target.outputs.branch_id }} + api_key: ${{ secrets.NEON_STAGING_API_KEY }} + + - name: Create Allure report + id: create-allure-report + if: ${{ !cancelled() }} + uses: ./.github/actions/allure-report-generate + with: + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + + - name: Post to a Slack channel + if: ${{ github.event.schedule && failure() }} + uses: slackapi/slack-github-action@v1 + with: + channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream + slack-message: | + Periodic large oltp perf testing: ${{ job.status }} + <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> + <${{ steps.create-allure-report.outputs.report-url }}|Allure report> + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index f077e04d1c..90318747b3 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -71,7 +71,7 @@ jobs: uses: ./.github/workflows/build-macos.yml with: pg_versions: ${{ needs.files-changed.outputs.postgres_changes }} - rebuild_rust_code: ${{ needs.files-changed.outputs.rebuild_rust_code }} + rebuild_rust_code: ${{ fromJson(needs.files-changed.outputs.rebuild_rust_code) }} rebuild_everything: ${{ 
fromJson(needs.files-changed.outputs.rebuild_everything) }} gather-rust-build-stats: diff --git a/.github/workflows/periodic_pagebench.yml b/.github/workflows/periodic_pagebench.yml index af877029e4..f854bf3212 100644 --- a/.github/workflows/periodic_pagebench.yml +++ b/.github/workflows/periodic_pagebench.yml @@ -3,12 +3,12 @@ name: Periodic pagebench performance test on dedicated EC2 machine in eu-central on: schedule: # * is a special character in YAML so you have to quote this string - # โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ minute (0 - 59) - # โ”‚ โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ hour (0 - 23) - # โ”‚ โ”‚ โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ day of the month (1 - 31) - # โ”‚ โ”‚ โ”‚ โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ month (1 - 12 or JAN-DEC) - # โ”‚ โ”‚ โ”‚ โ”‚ โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ day of the week (0 - 6 or SUN-SAT) - - cron: '0 18 * * *' # Runs at 6 PM UTC every day + # โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ minute (0 - 59) + # โ”‚ โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ hour (0 - 23) + # โ”‚ โ”‚ โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ day of the month (1 - 31) + # โ”‚ โ”‚ โ”‚ โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ month (1 - 12 or JAN-DEC) + # โ”‚ โ”‚ โ”‚ โ”‚ โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ day of the week (0 - 6 or SUN-SAT) + - cron: '0 */3 * * *' # Runs every 3 hours workflow_dispatch: # Allows manual triggering of the workflow inputs: commit_hash: @@ -78,8 +78,10 @@ jobs: run: | if [ -z "$INPUT_COMMIT_HASH" ]; then echo "COMMIT_HASH=$(curl -s https://api.github.com/repos/neondatabase/neon/commits/main | jq -r '.sha')" >> $GITHUB_ENV + echo "COMMIT_HASH_TYPE=latest" >> $GITHUB_ENV else echo "COMMIT_HASH=$INPUT_COMMIT_HASH" >> $GITHUB_ENV + echo "COMMIT_HASH_TYPE=manual" >> $GITHUB_ENV fi - name: Start Bench with run_id @@ -89,7 +91,7 @@ jobs: -H 'accept: application/json' \ -H 'Content-Type: application/json' \ -H "Authorization: Bearer $API_KEY" \ - -d "{\"neonRepoCommitHash\": \"${COMMIT_HASH}\"}" + -d "{\"neonRepoCommitHash\": \"${COMMIT_HASH}\", \"neonRepoCommitHashType\": \"${COMMIT_HASH_TYPE}\"}" - name: Poll Test Status id: poll_step diff --git a/.github/workflows/trigger-e2e-tests.yml b/.github/workflows/trigger-e2e-tests.yml index be6a7a7901..a30da35502 100644 --- a/.github/workflows/trigger-e2e-tests.yml +++ b/.github/workflows/trigger-e2e-tests.yml @@ -5,6 +5,10 @@ on: types: - ready_for_review workflow_call: + inputs: + github-event-name: + type: string + required: true defaults: run: @@ -19,7 +23,7 @@ jobs: if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }} uses: ./.github/workflows/check-permissions.yml with: - github-event-name: ${{ github.event_name }} + github-event-name: ${{ inputs.github-event-name || github.event_name }} cancel-previous-e2e-tests: needs: [ check-permissions ] @@ -35,46 +39,29 @@ jobs: run cancel-previous-in-concurrency-group.yml \ --field concurrency_group="${{ env.E2E_CONCURRENCY_GROUP }}" - tag: - needs: [ check-permissions ] - runs-on: ubuntu-22.04 - outputs: - build-tag: ${{ steps.build-tag.outputs.tag }} - - steps: - # Need `fetch-depth: 0` to count the number of commits in the branch - - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Get build tag - env: - GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} - CURRENT_BRANCH: ${{ github.head_ref || github.ref_name }} - CURRENT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} - run: | - if [[ "$GITHUB_REF_NAME" == "main" ]]; then - echo "tag=$(git rev-list --count HEAD)" | 
tee -a $GITHUB_OUTPUT - elif [[ "$GITHUB_REF_NAME" == "release" ]]; then - echo "tag=release-$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT - elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then - echo "tag=release-proxy-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT - elif [[ "$GITHUB_REF_NAME" == "release-compute" ]]; then - echo "tag=release-compute-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT - else - echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" - BUILD_AND_TEST_RUN_ID=$(gh run list -b $CURRENT_BRANCH -c $CURRENT_SHA -w 'Build and Test' -L 1 --json databaseId --jq '.[].databaseId') - echo "tag=$BUILD_AND_TEST_RUN_ID" | tee -a $GITHUB_OUTPUT - fi - id: build-tag + meta: + uses: ./.github/workflows/_meta.yml + with: + github-event-name: ${{ inputs.github-event-name || github.event_name }} trigger-e2e-tests: - needs: [ tag ] + needs: [ meta ] runs-on: ubuntu-22.04 env: EVENT_ACTION: ${{ github.event.action }} GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} - TAG: ${{ needs.tag.outputs.build-tag }} + TAG: >- + ${{ + contains(fromJSON('["compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) + && needs.meta.outputs.previous-storage-release + || needs.meta.outputs.build-tag + }} + COMPUTE_TAG: >- + ${{ + contains(fromJSON('["storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) + && needs.meta.outputs.previous-compute-release + || needs.meta.outputs.build-tag + }} steps: - name: Wait for `push-{neon,compute}-image-dev` job to finish # It's important to have a timeout here, the script in the step can run infinitely @@ -157,6 +144,6 @@ jobs: --raw-field "commit_hash=$COMMIT_SHA" \ --raw-field "remote_repo=${GITHUB_REPOSITORY}" \ --raw-field "storage_image_tag=${TAG}" \ - --raw-field "compute_image_tag=${TAG}" \ + --raw-field "compute_image_tag=${COMPUTE_TAG}" \ --raw-field "concurrency_group=${E2E_CONCURRENCY_GROUP}" \ --raw-field "e2e-platforms=${E2E_PLATFORMS}" diff --git a/CODEOWNERS b/CODEOWNERS index 71b5e65f94..2a112d9728 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1,8 +1,8 @@ # Autoscaling /libs/vm_monitor/ @neondatabase/autoscaling -# DevProd -/.github/ @neondatabase/developer-productivity +# DevProd & PerfCorr +/.github/ @neondatabase/developer-productivity @neondatabase/performance-correctness # Compute /pgxn/ @neondatabase/compute diff --git a/Cargo.lock b/Cargo.lock index 47552174d2..778ff19fec 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -783,6 +783,28 @@ dependencies = [ "tracing", ] +[[package]] +name = "axum-extra" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "460fc6f625a1f7705c6cf62d0d070794e94668988b1c38111baeec177c715f7b" +dependencies = [ + "axum", + "axum-core", + "bytes", + "futures-util", + "headers", + "http 1.1.0", + "http-body 1.0.0", + "http-body-util", + "mime", + "pin-project-lite", + "serde", + "tower 0.5.2", + "tower-layer", + "tower-service", +] + [[package]] name = "azure_core" version = "0.21.0" @@ -925,9 +947,9 @@ checksum = "0ea22880d78093b0cbe17c89f64a7d457941e65759157ec6cb31a31d652b05e5" [[package]] name = "base64" -version = "0.21.1" +version = "0.21.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f1e31e207a6b8fb791a38ea3105e6cb541f55e4d029902d3039a4ad07cc4105" +checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" [[package]] name = "base64" @@ -984,9 +1006,9 @@ dependencies = [ [[package]] name = "bindgen" -version = "0.70.1" +version = 
"0.71.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f49d8fed880d473ea71efb9bf597651e77201bdd4893efe54c9e5d65ae04ce6f" +checksum = "5f58bf3d7db68cfbac37cfc485a8d711e87e064c3d0fe0435b92f7a407f9d6b3" dependencies = [ "bitflags 2.8.0", "cexpr", @@ -997,7 +1019,7 @@ dependencies = [ "proc-macro2", "quote", "regex", - "rustc-hash", + "rustc-hash 2.1.1", "shlex", "syn 2.0.90", ] @@ -1105,9 +1127,9 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "cc" -version = "1.1.30" +version = "1.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b16803a61b81d9eabb7eae2588776c4c1e584b738ede45fdbb4c972cec1e9945" +checksum = "be714c154be609ec7f5dad223a33bf1482fff90472de28f7362806e6d4832b8c" dependencies = [ "jobserver", "libc", @@ -1305,6 +1327,7 @@ dependencies = [ "aws-sdk-s3", "aws-smithy-types", "axum", + "axum-extra", "base64 0.13.1", "bytes", "camino", @@ -1316,6 +1339,7 @@ dependencies = [ "flate2", "futures", "http 1.1.0", + "jsonwebtoken", "metrics", "nix 0.27.1", "notify", @@ -1342,7 +1366,9 @@ dependencies = [ "tokio-util", "tower 0.5.2", "tower-http", + "tower-otel", "tracing", + "tracing-opentelemetry", "tracing-subscriber", "tracing-utils", "url", @@ -2295,7 +2321,7 @@ name = "framed-websockets" version = "0.1.0" source = "git+https://github.com/neondatabase/framed-websockets#34eff3d6f8cfccbc5f35e4f65314ff7328621127" dependencies = [ - "base64 0.21.1", + "base64 0.21.7", "bytemuck", "bytes", "futures-core", @@ -2408,9 +2434,9 @@ checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988" [[package]] name = "futures-timer" -version = "3.0.2" +version = "3.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e64b03909df88034c26dc1547e8970b91f98bdb65165d6a4e9110d94263dbb2c" +checksum = "f288b0a4f20f9a56b5d1da57e2227c661b7b16168e2f72365f57b63326e29b24" [[package]] name = "futures-util" @@ -2513,6 +2539,27 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" +[[package]] +name = "governor" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "842dc78579ce01e6a1576ad896edc92fca002dd60c9c3746b7fc2bec6fb429d0" +dependencies = [ + "cfg-if", + "dashmap 6.1.0", + "futures-sink", + "futures-timer", + "futures-util", + "no-std-compat", + "nonzero_ext", + "parking_lot 0.12.1", + "portable-atomic", + "quanta", + "rand 0.8.5", + "smallvec", + "spinning_top", +] + [[package]] name = "group" version = "0.12.1" @@ -2630,7 +2677,7 @@ version = "7.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "765c9198f173dd59ce26ff9f95ef0aafd0a0fe01fb9d72841bc5066a4c06511d" dependencies = [ - "base64 0.21.1", + "base64 0.21.7", "byteorder", "crossbeam-channel", "flate2", @@ -2638,6 +2685,30 @@ dependencies = [ "num-traits", ] +[[package]] +name = "headers" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "322106e6bd0cba2d5ead589ddb8150a13d7c4217cf80d7c4f682ca994ccc6aa9" +dependencies = [ + "base64 0.21.7", + "bytes", + "headers-core", + "http 1.1.0", + "httpdate", + "mime", + "sha1", +] + +[[package]] +name = "headers-core" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "54b4a22553d4242c49fddb9ba998a99962b5cc6f22cb5a3482bec22522403ce4" +dependencies = [ + "http 1.1.0", +] + [[package]] 
name = "heck" version = "0.5.0" @@ -2775,12 +2846,10 @@ name = "http-utils" version = "0.1.0" dependencies = [ "anyhow", - "backtrace", "bytes", "fail", - "flate2", + "futures", "hyper 0.14.30", - "inferno 0.12.0", "itertools 0.10.5", "jemalloc_pprof", "metrics", @@ -2793,6 +2862,7 @@ dependencies = [ "serde_path_to_error", "thiserror 1.0.69", "tokio", + "tokio-rustls 0.26.0", "tokio-stream", "tokio-util", "tracing", @@ -3279,9 +3349,9 @@ checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" [[package]] name = "jemalloc_pprof" -version = "0.6.0" +version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a883828bd6a4b957cd9f618886ff19e5f3ebd34e06ba0e855849e049fef32fb" +checksum = "5622af6d21ff86ed7797ef98e11b8f302da25ec69a7db9f6cde8e2e1c8df9992" dependencies = [ "anyhow", "libc", @@ -3365,7 +3435,7 @@ version = "9.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5c7ea04a7c5c055c175f189b6dc6ba036fd62306b58c66c9f6389036c503a3f4" dependencies = [ - "base64 0.21.1", + "base64 0.21.7", "js-sys", "pem", "ring", @@ -3480,9 +3550,9 @@ dependencies = [ [[package]] name = "mappings" -version = "0.6.0" +version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce9229c438fbf1c333926e2053c4c091feabbd40a1b590ec62710fea2384af9e" +checksum = "e434981a332777c2b3062652d16a55f8e74fa78e6b1882633f0d77399c84fc2a" dependencies = [ "anyhow", "libc", @@ -3535,7 +3605,7 @@ dependencies = [ "measured-derive", "memchr", "parking_lot 0.12.1", - "rustc-hash", + "rustc-hash 1.1.0", "ryu", ] @@ -3723,6 +3793,12 @@ dependencies = [ "memoffset 0.9.0", ] +[[package]] +name = "no-std-compat" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b93853da6d84c2e3c7d730d6473e8817692dd89be387eb01b94d7f108ecb5b8c" + [[package]] name = "nom" version = "7.1.3" @@ -3733,6 +3809,12 @@ dependencies = [ "minimal-lexical", ] +[[package]] +name = "nonzero_ext" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38bf9645c8b145698bb0b18a4637dcacbc421ea49bef2317e4fd8065a387cf21" + [[package]] name = "notify" version = "8.0.0" @@ -3982,7 +4064,7 @@ dependencies = [ "opentelemetry-http", "opentelemetry-proto", "opentelemetry_sdk", - "prost", + "prost 0.13.3", "reqwest", "thiserror 1.0.69", ] @@ -3995,7 +4077,7 @@ checksum = "a6e05acbfada5ec79023c85368af14abd0b307c015e9064d249b2a950ef459a6" dependencies = [ "opentelemetry", "opentelemetry_sdk", - "prost", + "prost 0.13.3", "tonic", ] @@ -4109,6 +4191,7 @@ dependencies = [ "pageserver_api", "pageserver_client", "rand 0.8.5", + "reqwest", "serde", "serde_json", "tokio", @@ -4198,6 +4281,9 @@ dependencies = [ "remote_storage", "reqwest", "rpds", + "rustls 0.23.18", + "rustls-pemfile 2.1.1", + "rustls-pki-types", "scopeguard", "send-future", "serde", @@ -4216,6 +4302,7 @@ dependencies = [ "tokio-epoll-uring", "tokio-io-timeout", "tokio-postgres", + "tokio-rustls 0.26.0", "tokio-stream", "tokio-tar", "tokio-util", @@ -4223,6 +4310,7 @@ dependencies = [ "tracing", "url", "utils", + "uuid", "wal_decoder", "walkdir", "workspace_hack", @@ -4305,9 +4393,9 @@ dependencies = [ [[package]] name = "papaya" -version = "0.1.8" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc7c76487f7eaa00a0fc1d7f88dc6b295aec478d11b0fc79f857b62c2874124c" +checksum = "aab21828b6b5952fdadd6c377728ffae53ec3a21b2febc47319ab65741f7e2fd" dependencies = [ 
"equivalent", "seize", @@ -4435,7 +4523,7 @@ version = "3.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1b8fcc794035347fb64beda2d3b462595dd2753e3f268d89c5aae77e8cf2c310" dependencies = [ - "base64 0.21.1", + "base64 0.21.7", "serde", ] @@ -4484,18 +4572,18 @@ dependencies = [ [[package]] name = "pin-project" -version = "1.1.0" +version = "1.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c95a7476719eab1e366eaf73d0260af3021184f18177925b07f54b30089ceead" +checksum = "dfe2e71e1471fe07709406bf725f710b02927c9c54b2b5b2ec0e8087d97c327d" dependencies = [ "pin-project-internal", ] [[package]] name = "pin-project-internal" -version = "1.1.0" +version = "1.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "39407670928234ebc5e6e580247dd567ad73a3578460c5990f9503df207e8f07" +checksum = "f6e859e6e5bd50440ab63c47e3ebabc90f26251f7c73c3d3e837b74a1cc3fa67" dependencies = [ "proc-macro2", "quote", @@ -4589,6 +4677,12 @@ dependencies = [ "never-say-never", ] +[[package]] +name = "portable-atomic" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "280dc24453071f1b63954171985a0b0d30058d287960968b9b2aca264c8d4ee6" + [[package]] name = "postgres" version = "0.19.7" @@ -4743,8 +4837,10 @@ dependencies = [ "nix 0.26.4", "once_cell", "parking_lot 0.12.1", - "protobuf", - "protobuf-codegen-pure", + "prost 0.12.6", + "prost-build 0.12.6", + "prost-derive 0.12.6", + "sha2", "smallvec", "symbolic-demangle", "tempfile", @@ -4753,15 +4849,17 @@ dependencies = [ [[package]] name = "pprof_util" -version = "0.6.0" +version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "65c568b3f8c1c37886ae07459b1946249e725c315306b03be5632f84c239f781" +checksum = "9fa015c78eed2130951e22c58d2095849391e73817ab2e74f71b0b9f63dd8416" dependencies = [ "anyhow", + "backtrace", "flate2", + "inferno 0.12.0", "num", "paste", - "prost", + "prost 0.13.3", ] [[package]] @@ -4854,6 +4952,16 @@ dependencies = [ "thiserror 1.0.69", ] +[[package]] +name = "prost" +version = "0.12.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "deb1435c188b76130da55f17a466d252ff7b1418b2ad3e037d127b94e3411f29" +dependencies = [ + "bytes", + "prost-derive 0.12.6", +] + [[package]] name = "prost" version = "0.13.3" @@ -4861,7 +4969,28 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7b0487d90e047de87f984913713b85c601c05609aad5b0df4b4573fbf69aa13f" dependencies = [ "bytes", - "prost-derive", + "prost-derive 0.13.3", +] + +[[package]] +name = "prost-build" +version = "0.12.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22505a5c94da8e3b7c2996394d1c933236c4d743e81a410bcca4e6989fc066a4" +dependencies = [ + "bytes", + "heck", + "itertools 0.10.5", + "log", + "multimap", + "once_cell", + "petgraph", + "prettyplease", + "prost 0.12.6", + "prost-types 0.12.6", + "regex", + "syn 2.0.90", + "tempfile", ] [[package]] @@ -4878,13 +5007,26 @@ dependencies = [ "once_cell", "petgraph", "prettyplease", - "prost", - "prost-types", + "prost 0.13.3", + "prost-types 0.13.3", "regex", "syn 2.0.90", "tempfile", ] +[[package]] +name = "prost-derive" +version = "0.12.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81bddcdb20abf9501610992b6759a4c888aef7d1a7247ef75e2404275ac24af1" +dependencies = [ + "anyhow", + "itertools 0.10.5", + "proc-macro2", + "quote", + "syn 2.0.90", +] + 
[[package]] name = "prost-derive" version = "0.13.3" @@ -4898,38 +5040,22 @@ dependencies = [ "syn 2.0.90", ] +[[package]] +name = "prost-types" +version = "0.12.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9091c90b0a32608e984ff2fa4091273cbdd755d54935c51d520887f4a1dbd5b0" +dependencies = [ + "prost 0.12.6", +] + [[package]] name = "prost-types" version = "0.13.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4759aa0d3a6232fb8dbdb97b61de2c20047c68aca932c7ed76da9d788508d670" dependencies = [ - "prost", -] - -[[package]] -name = "protobuf" -version = "2.28.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "106dd99e98437432fed6519dedecfade6a06a73bb7b2a1e019fdd2bee5778d94" - -[[package]] -name = "protobuf-codegen" -version = "2.28.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "033460afb75cf755fcfc16dfaed20b86468082a2ea24e05ac35ab4a099a017d6" -dependencies = [ - "protobuf", -] - -[[package]] -name = "protobuf-codegen-pure" -version = "2.28.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95a29399fc94bcd3eeaa951c715f7bea69409b2445356b00519740bcd6ddd865" -dependencies = [ - "protobuf", - "protobuf-codegen", + "prost 0.13.3", ] [[package]] @@ -5010,7 +5136,7 @@ dependencies = [ "reqwest-tracing", "rsa", "rstest", - "rustc-hash", + "rustc-hash 1.1.0", "rustls 0.23.18", "rustls-native-certs 0.8.0", "rustls-pemfile 2.1.1", @@ -5050,6 +5176,21 @@ dependencies = [ "zerocopy", ] +[[package]] +name = "quanta" +version = "0.12.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3bd1fe6824cea6538803de3ff1bc0cf3949024db3d43c9643024bfb33a807c0e" +dependencies = [ + "crossbeam-utils", + "libc", + "once_cell", + "raw-cpuid", + "wasi 0.11.0+wasi-snapshot-preview1", + "web-sys", + "winapi", +] + [[package]] name = "quick-xml" version = "0.26.0" @@ -5180,6 +5321,15 @@ dependencies = [ "num-traits", ] +[[package]] +name = "raw-cpuid" +version = "11.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c6928fa44c097620b706542d428957635951bade7143269085389d42c8a4927e" +dependencies = [ + "bitflags 2.8.0", +] + [[package]] name = "rayon" version = "1.7.0" @@ -5514,16 +5664,16 @@ dependencies = [ [[package]] name = "ring" -version = "0.17.6" +version = "0.17.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "684d5e6e18f669ccebf64a92236bb7db9a34f07be010e3627368182027180866" +checksum = "70ac5d832aa16abd7d1def883a8545280c20a60f523a370aa3a9617c2b8550ee" dependencies = [ "cc", + "cfg-if", "getrandom 0.2.11", "libc", - "spin", "untrusted", - "windows-sys 0.48.0", + "windows-sys 0.52.0", ] [[package]] @@ -5628,6 +5778,12 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" +[[package]] +name = "rustc-hash" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" + [[package]] name = "rustc_version" version = "0.4.0" @@ -5744,7 +5900,7 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d194b56d58803a43635bdc398cd17e383d6f71f9182b9a192c127ca42494a59b" dependencies = [ - "base64 0.21.1", + "base64 0.21.7", ] [[package]] @@ -5753,15 +5909,15 @@ version = "2.1.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "f48172685e6ff52a556baa527774f61fcaa884f59daf3375c62a3f1cd2549dab" dependencies = [ - "base64 0.21.1", + "base64 0.21.7", "rustls-pki-types", ] [[package]] name = "rustls-pki-types" -version = "1.10.0" +version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16f1201b3c9a7ee8039bcadc17b7e605e2945b27eee7631788c1bd2b0643674b" +checksum = "917ce264624a4b4db1c364dcc35bfca9ded014d0a958cd47ad3e960e988ea51c" [[package]] name = "rustls-webpki" @@ -5992,9 +6148,9 @@ dependencies = [ [[package]] name = "seize" -version = "0.4.9" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d84b0c858bdd30cb56f5597f8b3bf702ec23829e652cc636a1e5a7b9de46ae93" +checksum = "e4b8d813387d566f627f3ea1b914c068aac94c40ae27ec43f5f33bde65abefe7" dependencies = [ "libc", "windows-sys 0.52.0", @@ -6387,6 +6543,15 @@ version = "0.9.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" +[[package]] +name = "spinning_top" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d96d2d1d716fb500937168cc09353ffdc7a012be8475ac7308e1bdf0e3923300" +dependencies = [ + "lock_api", +] + [[package]] name = "spki" version = "0.6.0" @@ -6438,7 +6603,7 @@ dependencies = [ "metrics", "once_cell", "parking_lot 0.12.1", - "prost", + "prost 0.13.3", "rustls 0.23.18", "tokio", "tonic", @@ -6456,6 +6621,7 @@ dependencies = [ "bytes", "chrono", "clap", + "clashmap", "control_plane", "cron", "diesel", @@ -6463,6 +6629,7 @@ dependencies = [ "diesel_migrations", "fail", "futures", + "governor", "hex", "http-utils", "humantime", @@ -7209,7 +7376,7 @@ dependencies = [ "hyper-util", "percent-encoding", "pin-project", - "prost", + "prost 0.13.3", "rustls-native-certs 0.8.0", "rustls-pemfile 2.1.1", "tokio", @@ -7229,8 +7396,8 @@ checksum = "9557ce109ea773b399c9b9e5dca39294110b74f1f342cb347a80d1fce8c26a11" dependencies = [ "prettyplease", "proc-macro2", - "prost-build", - "prost-types", + "prost-build 0.13.3", + "prost-types 0.13.3", "quote", "syn 2.0.90", ] @@ -7277,10 +7444,12 @@ version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "403fa3b783d4b626a8ad51d766ab03cb6d2dbfc46b1c5d4448395e6628dc9697" dependencies = [ + "base64 0.22.1", "bitflags 2.8.0", "bytes", "http 1.1.0", "http-body 1.0.0", + "mime", "pin-project-lite", "tower-layer", "tower-service", @@ -7294,6 +7463,20 @@ version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e" +[[package]] +name = "tower-otel" +version = "0.2.0" +source = "git+https://github.com/mattiapenati/tower-otel?rev=56a7321053bcb72443888257b622ba0d43a11fcd#56a7321053bcb72443888257b622ba0d43a11fcd" +dependencies = [ + "http 1.1.0", + "opentelemetry", + "pin-project", + "tower-layer", + "tower-service", + "tracing", + "tracing-opentelemetry", +] + [[package]] name = "tower-service" version = "0.3.3" @@ -7620,7 +7803,6 @@ dependencies = [ "anyhow", "arc-swap", "async-compression", - "backtrace", "bincode", "byteorder", "bytes", @@ -7748,7 +7930,7 @@ dependencies = [ "pageserver_api", "postgres_ffi", "pprof", - "prost", + "prost 0.13.3", "remote_storage", "serde", "serde_json", @@ -8174,7 +8356,7 @@ dependencies = [ "ahash", "anyhow", "base64 0.13.1", - "base64 0.21.1", + "base64 0.21.7", "base64ct", 
"bytes", "camino", @@ -8205,6 +8387,7 @@ dependencies = [ "hyper-util", "indexmap 1.9.3", "indexmap 2.0.1", + "itertools 0.10.5", "itertools 0.12.1", "lazy_static", "libc", @@ -8223,7 +8406,7 @@ dependencies = [ "parquet", "prettyplease", "proc-macro2", - "prost", + "prost 0.13.3", "quote", "rand 0.8.5", "regex", diff --git a/Cargo.toml b/Cargo.toml index e6ca3c982c..c59c4c5435 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -43,7 +43,7 @@ members = [ ] [workspace.package] -edition = "2021" +edition = "2024" license = "Apache-2.0" ## All dependency versions, used in the project @@ -53,7 +53,6 @@ anyhow = { version = "1.0", features = ["backtrace"] } arc-swap = "1.6" async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] } atomic-take = "1.1.0" -backtrace = "0.3.74" flate2 = "1.0.26" assert-json-diff = "2" async-stream = "0.3" @@ -68,9 +67,10 @@ aws-credential-types = "1.2.0" aws-sigv4 = { version = "1.2", features = ["sign-http"] } aws-types = "1.3" axum = { version = "0.8.1", features = ["ws"] } +axum-extra = { version = "0.10.0", features = ["typed-header"] } base64 = "0.13.0" bincode = "1.3" -bindgen = "0.70" +bindgen = "0.71" bit_field = "0.10.2" bstr = "1.0" byteorder = "1.4" @@ -95,6 +95,7 @@ futures = "0.3" futures-core = "0.3" futures-util = "0.3" git-version = "0.3" +governor = "0.8" hashbrown = "0.14" hashlink = "0.9.1" hdrhistogram = "7.5.2" @@ -113,11 +114,10 @@ hyper-util = "0.1" tokio-tungstenite = "0.21.0" indexmap = "2" indoc = "2" -inferno = "0.12.0" ipnet = "2.10.0" itertools = "0.10" itoa = "1.0.11" -jemalloc_pprof = "0.6" +jemalloc_pprof = { version = "0.7", features = ["symbolize", "flamegraph"] } jsonwebtoken = "9" lasso = "0.7" libc = "0.2" @@ -139,7 +139,7 @@ parquet = { version = "53", default-features = false, features = ["zstd"] } parquet_derive = "53" pbkdf2 = { version = "0.12.1", features = ["simple", "std"] } pin-project-lite = "0.2" -pprof = { version = "0.14", features = ["criterion", "flamegraph", "frame-pointer", "protobuf", "protobuf-codec"] } +pprof = { version = "0.14", features = ["criterion", "flamegraph", "frame-pointer", "prost-codec"] } procfs = "0.16" prometheus = {version = "0.13", default-features=false, features = ["process"]} # removes protobuf dependency prost = "0.13" @@ -155,6 +155,7 @@ rpds = "0.13" rustc-hash = "1.1.0" rustls = { version = "0.23.16", default-features = false } rustls-pemfile = "2" +rustls-pki-types = "1.11" scopeguard = "1.1" sysinfo = "0.29.2" sd-notify = "0.4.1" @@ -192,7 +193,11 @@ toml = "0.8" toml_edit = "0.22" tonic = {version = "0.12.3", default-features = false, features = ["channel", "tls", "tls-roots"]} tower = { version = "0.5.2", default-features = false } -tower-http = { version = "0.6.2", features = ["request-id", "trace"] } +tower-http = { version = "0.6.2", features = ["auth", "request-id", "trace"] } + +# This revision uses opentelemetry 0.27. There's no tag for it. 
+tower-otel = { git = "https://github.com/mattiapenati/tower-otel", rev = "56a7321053bcb72443888257b622ba0d43a11fcd" } + tower-service = "0.3.3" tracing = "0.1" tracing-error = "0.2" diff --git a/Makefile b/Makefile index 42ee643bb5..0911465fb8 100644 --- a/Makefile +++ b/Makefile @@ -11,15 +11,16 @@ ICU_PREFIX_DIR := /usr/local/icu # BUILD_TYPE ?= debug WITH_SANITIZERS ?= no +PG_CFLAGS = -fsigned-char ifeq ($(BUILD_TYPE),release) PG_CONFIGURE_OPTS = --enable-debug --with-openssl - PG_CFLAGS = -O2 -g3 $(CFLAGS) + PG_CFLAGS += -O2 -g3 $(CFLAGS) PG_LDFLAGS = $(LDFLAGS) # Unfortunately, `--profile=...` is a nightly feature CARGO_BUILD_FLAGS += --release else ifeq ($(BUILD_TYPE),debug) PG_CONFIGURE_OPTS = --enable-debug --with-openssl --enable-cassert --enable-depend - PG_CFLAGS = -O0 -g3 $(CFLAGS) + PG_CFLAGS += -O0 -g3 $(CFLAGS) PG_LDFLAGS = $(LDFLAGS) else $(error Bad build type '$(BUILD_TYPE)', see Makefile for options) @@ -159,6 +160,8 @@ postgres-%: postgres-configure-% \ $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_visibility install +@echo "Compiling pageinspect $*" $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect install + +@echo "Compiling pg_trgm $*" + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_trgm install +@echo "Compiling amcheck $*" $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/amcheck install +@echo "Compiling test_decoding $*" diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 0cdb44853f..6e46185e36 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -162,7 +162,7 @@ FROM build-deps AS pg-build ARG PG_VERSION COPY vendor/postgres-${PG_VERSION:?} postgres RUN cd postgres && \ - export CONFIGURE_CMD="./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp \ + export CONFIGURE_CMD="./configure CFLAGS='-O2 -g3 -fsigned-char' --enable-debug --with-openssl --with-uuid=ossp \ --with-icu --with-libxml --with-libxslt --with-lz4" && \ if [ "${PG_VERSION:?}" != "v14" ]; then \ # zstd is available only from PG15 @@ -1484,7 +1484,7 @@ WORKDIR /ext-src COPY compute/patches/pg_duckdb_v031.patch . COPY compute/patches/duckdb_v120.patch . 
# pg_duckdb build requires source dir to be a git repo to get submodules -# allow neon_superuser to execute some functions that in pg_duckdb are available to superuser only: +# allow neon_superuser to execute some functions that in pg_duckdb are available to superuser only: # - extension management function duckdb.install_extension() # - access to duckdb.extensions table and its sequence RUN git clone --depth 1 --branch v0.3.1 https://github.com/duckdb/pg_duckdb.git pg_duckdb-src && \ @@ -1499,8 +1499,8 @@ ARG PG_VERSION COPY --from=pg_duckdb-src /ext-src/ /ext-src/ WORKDIR /ext-src/pg_duckdb-src RUN make install -j $(getconf _NPROCESSORS_ONLN) && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_duckdb.control - + echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_duckdb.control + ######################################################################################### # # Layer "pg_repack" @@ -1758,15 +1758,15 @@ ARG TARGETARCH # test_runner/regress/test_compute_metrics.py # See comment on the top of the file regading `echo`, `-e` and `\n` RUN if [ "$TARGETARCH" = "amd64" ]; then\ - postgres_exporter_sha256='027e75dda7af621237ff8f5ac66b78a40b0093595f06768612b92b1374bd3105';\ + postgres_exporter_sha256='59aa4a7bb0f7d361f5e05732f5ed8c03cc08f78449cef5856eadec33a627694b';\ pgbouncer_exporter_sha256='c9f7cf8dcff44f0472057e9bf52613d93f3ffbc381ad7547a959daa63c5e84ac';\ sql_exporter_sha256='38e439732bbf6e28ca4a94d7bc3686d3fa1abdb0050773d5617a9efdb9e64d08';\ else\ - postgres_exporter_sha256='131a376d25778ff9701a4c81f703f179e0b58db5c2c496e66fa43f8179484786';\ + postgres_exporter_sha256='d1dedea97f56c6d965837bfd1fbb3e35a3b4a4556f8cccee8bd513d8ee086124';\ pgbouncer_exporter_sha256='217c4afd7e6492ae904055bc14fe603552cf9bac458c063407e991d68c519da3';\ sql_exporter_sha256='11918b00be6e2c3a67564adfdb2414fdcbb15a5db76ea17d1d1a944237a893c6';\ fi\ - && curl -sL https://github.com/prometheus-community/postgres_exporter/releases/download/v0.16.0/postgres_exporter-0.16.0.linux-${TARGETARCH}.tar.gz\ + && curl -sL https://github.com/prometheus-community/postgres_exporter/releases/download/v0.17.1/postgres_exporter-0.17.1.linux-${TARGETARCH}.tar.gz\ | tar xzf - --strip-components=1 -C.\ && curl -sL https://github.com/prometheus-community/pgbouncer_exporter/releases/download/v0.10.2/pgbouncer_exporter-0.10.2.linux-${TARGETARCH}.tar.gz\ | tar xzf - --strip-components=1 -C.\ @@ -1933,6 +1933,7 @@ RUN apt update && \ locales \ procps \ ca-certificates \ + rsyslog \ $VERSION_INSTALLS && \ apt clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8 @@ -1978,6 +1979,13 @@ COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neo # Make the libraries we built available RUN echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig +# rsyslog config permissions +# directory for rsyslogd pid file +RUN mkdir /var/run/rsyslogd && \ + chown -R postgres:postgres /var/run/rsyslogd && \ + chown -R postgres:postgres /etc/rsyslog.d/ + + ENV LANG=en_US.utf8 USER postgres ENTRYPOINT ["/usr/local/bin/compute_ctl"] diff --git a/compute/etc/neon_collector.jsonnet b/compute/etc/neon_collector.jsonnet index f8f4cab63b..da2b86d542 100644 --- a/compute/etc/neon_collector.jsonnet +++ b/compute/etc/neon_collector.jsonnet @@ -29,6 +29,7 @@ import 'sql_exporter/lfc_approximate_working_set_size.libsonnet', import 'sql_exporter/lfc_approximate_working_set_size_windows.libsonnet', import 
'sql_exporter/lfc_cache_size_limit.libsonnet', + import 'sql_exporter/lfc_chunk_size.libsonnet', import 'sql_exporter/lfc_hits.libsonnet', import 'sql_exporter/lfc_misses.libsonnet', import 'sql_exporter/lfc_used.libsonnet', diff --git a/compute/etc/sql_exporter/db_total_size.sql b/compute/etc/sql_exporter/db_total_size.sql index 9cbbdfd8a3..fe0360ab5c 100644 --- a/compute/etc/sql_exporter/db_total_size.sql +++ b/compute/etc/sql_exporter/db_total_size.sql @@ -1 +1,5 @@ -SELECT sum(pg_database_size(datname)) AS total FROM pg_database; +SELECT sum(pg_database_size(datname)) AS total +FROM pg_database +-- Ignore invalid databases, as we will likely have problems with +-- getting their size from the Pageserver. +WHERE datconnlimit != -2; diff --git a/compute/etc/sql_exporter/lfc_chunk_size.libsonnet b/compute/etc/sql_exporter/lfc_chunk_size.libsonnet new file mode 100644 index 0000000000..bbe56f869f --- /dev/null +++ b/compute/etc/sql_exporter/lfc_chunk_size.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'lfc_chunk_size', + type: 'gauge', + help: 'LFC chunk size, measured in 8KiB pages', + key_labels: null, + values: [ + 'lfc_chunk_size_pages', + ], + query: importstr 'sql_exporter/lfc_chunk_size.sql', +} diff --git a/compute/etc/sql_exporter/lfc_chunk_size.sql b/compute/etc/sql_exporter/lfc_chunk_size.sql new file mode 100644 index 0000000000..0905870064 --- /dev/null +++ b/compute/etc/sql_exporter/lfc_chunk_size.sql @@ -0,0 +1 @@ +SELECT lfc_value AS lfc_chunk_size_pages FROM neon.neon_lfc_stats WHERE lfc_key = 'file_cache_chunk_size_pages'; diff --git a/compute/etc/sql_exporter/pg_stats_userdb.sql b/compute/etc/sql_exporter/pg_stats_userdb.sql index 00ada87370..12e6c4ae59 100644 --- a/compute/etc/sql_exporter/pg_stats_userdb.sql +++ b/compute/etc/sql_exporter/pg_stats_userdb.sql @@ -1,10 +1,20 @@ -- We export stats for 10 non-system databases. Without this limit it is too -- easy to abuse the system by creating lots of databases. -SELECT pg_database_size(datname) AS db_size, deadlocks, tup_inserted AS inserted, - tup_updated AS updated, tup_deleted AS deleted, datname +SELECT pg_database_size(datname) AS db_size, + deadlocks, + tup_inserted AS inserted, + tup_updated AS updated, + tup_deleted AS deleted, + datname FROM pg_stat_database WHERE datname IN ( SELECT datname FROM pg_database - WHERE datname <> 'postgres' AND NOT datistemplate ORDER BY oid LIMIT 10 + -- Ignore invalid databases, as we will likely have problems with + -- getting their size from the Pageserver. + WHERE datconnlimit != -2 + AND datname <> 'postgres' + AND NOT datistemplate + ORDER BY oid + LIMIT 10 ); diff --git a/compute/vm-image-spec-bookworm.yaml b/compute/vm-image-spec-bookworm.yaml index 568f0b0444..e6707381ac 100644 --- a/compute/vm-image-spec-bookworm.yaml +++ b/compute/vm-image-spec-bookworm.yaml @@ -39,17 +39,26 @@ commands: user: nobody sysvInitAction: respawn shell: '/bin/sql_exporter -config.file=/etc/sql_exporter_autoscaling.yml -web.listen-address=:9499' + - name: rsyslogd + user: postgres + sysvInitAction: respawn + shell: '/usr/sbin/rsyslogd -n -i /var/run/rsyslogd/rsyslogd.pid -f /etc/compute_rsyslog.conf' shutdownHook: | su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10' files: - filename: compute_ctl-sudoers content: | + # Reverse hostname lookup doesn't currently work, and isn't needed anyway when all + # the rules use ALL as the hostname. 
Avoid the pointless lookups and the "unable to + # resolve host" log messages that they generate. + Defaults !fqdn + # Allow postgres user (which is what compute_ctl runs as) to run /neonvm/bin/resize-swap # and /neonvm/bin/set-disk-quota as root without requiring entering a password (NOPASSWD), # regardless of hostname (ALL) # # Also allow it to shut down the VM. The fast_import job does that when it's finished. - postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota, /neonvm/bin/poweroff + postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota, /neonvm/bin/poweroff, /usr/sbin/rsyslogd - filename: cgconfig.conf content: | # Configuration for cgroups in VM compute nodes @@ -64,6 +73,12 @@ files: } memory {} } +# Create dummy rsyslog config, because it refuses to start without at least one action configured. +# compute_ctl will rewrite this file with the actual configuration, if needed. + - filename: compute_rsyslog.conf + content: | + *.* /dev/null + $IncludeConfig /etc/rsyslog.d/*.conf build: | # Build cgroup-tools # @@ -127,6 +142,12 @@ merge: | RUN set -e \ && chmod 0644 /etc/cgconfig.conf + + COPY compute_rsyslog.conf /etc/compute_rsyslog.conf + RUN chmod 0666 /etc/compute_rsyslog.conf + RUN chmod 0666 /var/log/ + + COPY --from=libcgroup-builder /libcgroup-install/bin/* /usr/bin/ COPY --from=libcgroup-builder /libcgroup-install/lib/* /usr/lib/ COPY --from=libcgroup-builder /libcgroup-install/sbin/* /usr/sbin/ diff --git a/compute/vm-image-spec-bullseye.yaml b/compute/vm-image-spec-bullseye.yaml index 6617c98599..c89ee112dc 100644 --- a/compute/vm-image-spec-bullseye.yaml +++ b/compute/vm-image-spec-bullseye.yaml @@ -39,17 +39,26 @@ commands: user: nobody sysvInitAction: respawn shell: '/bin/sql_exporter -config.file=/etc/sql_exporter_autoscaling.yml -web.listen-address=:9499' + - name: rsyslogd + user: postgres + sysvInitAction: respawn + shell: '/usr/sbin/rsyslogd -n -i /var/run/rsyslogd/rsyslogd.pid -f /etc/compute_rsyslog.conf' shutdownHook: | su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10' files: - filename: compute_ctl-sudoers content: | + # Reverse hostname lookup doesn't currently work, and isn't needed anyway when all + # the rules use ALL as the hostname. Avoid the pointless lookups and the "unable to + # resolve host" log messages that they generate. + Defaults !fqdn + # Allow postgres user (which is what compute_ctl runs as) to run /neonvm/bin/resize-swap # and /neonvm/bin/set-disk-quota as root without requiring entering a password (NOPASSWD), # regardless of hostname (ALL) # # Also allow it to shut down the VM. The fast_import job does that when it's finished. - postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota, /neonvm/bin/poweroff + postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota, /neonvm/bin/poweroff, /usr/sbin/rsyslogd - filename: cgconfig.conf content: | # Configuration for cgroups in VM compute nodes @@ -64,6 +73,12 @@ files: } memory {} } +# Create dummy rsyslog config, because it refuses to start without at least one action configured. +# compute_ctl will rewrite this file with the actual configuration, if needed. 
+ - filename: compute_rsyslog.conf + content: | + *.* /dev/null + $IncludeConfig /etc/rsyslog.d/*.conf build: | # Build cgroup-tools # @@ -123,6 +138,11 @@ merge: | RUN set -e \ && chmod 0644 /etc/cgconfig.conf + COPY compute_rsyslog.conf /etc/compute_rsyslog.conf + RUN chmod 0666 /etc/compute_rsyslog.conf + RUN chmod 0666 /var/log/ + + COPY --from=libcgroup-builder /libcgroup-install/bin/* /usr/bin/ COPY --from=libcgroup-builder /libcgroup-install/lib/* /usr/lib/ COPY --from=libcgroup-builder /libcgroup-install/sbin/* /usr/sbin/ diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index c276996df5..dd2896714d 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "compute_tools" version = "0.1.0" -edition.workspace = true +edition = "2024" license.workspace = true [features] @@ -17,6 +17,7 @@ aws-sdk-kms.workspace = true aws-smithy-types.workspace = true anyhow.workspace = true axum = { workspace = true, features = [] } +axum-extra.workspace = true camino.workspace = true chrono.workspace = true cfg-if.workspace = true @@ -25,6 +26,7 @@ fail.workspace = true flate2.workspace = true futures.workspace = true http.workspace = true +jsonwebtoken.workspace = true metrics.workspace = true nix.workspace = true notify.workspace = true @@ -46,7 +48,9 @@ tokio = { workspace = true, features = ["rt", "rt-multi-thread"] } tokio-postgres.workspace = true tokio-util.workspace = true tokio-stream.workspace = true +tower-otel.workspace = true tracing.workspace = true +tracing-opentelemetry.workspace = true tracing-subscriber.workspace = true tracing-utils.workspace = true thiserror.workspace = true diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 1cdae718fe..fc7a3e2827 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -33,41 +33,28 @@ //! -b /usr/local/bin/postgres \ //! -r http://pg-ext-s3-gateway \ //! 
``` -use std::collections::HashMap; use std::ffi::OsString; use std::fs::File; use std::path::Path; use std::process::exit; -use std::str::FromStr; -use std::sync::atomic::Ordering; -use std::sync::{mpsc, Arc, Condvar, Mutex, RwLock}; -use std::{thread, time::Duration}; +use std::sync::mpsc; +use std::thread; +use std::time::Duration; use anyhow::{Context, Result}; -use chrono::Utc; use clap::Parser; -use compute_tools::disk_quota::set_disk_quota; -use compute_tools::http::server::Server; -use compute_tools::lsn_lease::launch_lsn_lease_bg_task_for_static; -use signal_hook::consts::{SIGQUIT, SIGTERM}; -use signal_hook::{consts::SIGINT, iterator::Signals}; -use tracing::{error, info, warn}; -use url::Url; - -use compute_api::responses::{ComputeCtlConfig, ComputeStatus}; +use compute_api::responses::ComputeCtlConfig; use compute_api::spec::ComputeSpec; - -use compute_tools::compute::{ - forward_termination_signal, ComputeNode, ComputeState, ParsedSpec, PG_PID, -}; -use compute_tools::configurator::launch_configurator; +use compute_tools::compute::{ComputeNode, ComputeNodeParams, forward_termination_signal}; use compute_tools::extension_server::get_pg_version_string; use compute_tools::logger::*; -use compute_tools::monitor::launch_monitor; use compute_tools::params::*; use compute_tools::spec::*; -use compute_tools::swap::resize_swap; -use rlimit::{setrlimit, Resource}; +use rlimit::{Resource, setrlimit}; +use signal_hook::consts::{SIGINT, SIGQUIT, SIGTERM}; +use signal_hook::iterator::Signals; +use tracing::{error, info}; +use url::Url; use utils::failpoint_support; // this is an arbitrary build tag. Fine as a default / for testing purposes @@ -149,6 +136,8 @@ struct Cli { fn main() -> Result<()> { let cli = Cli::parse(); + let scenario = failpoint_support::init(); + // For historical reasons, the main thread that processes the spec and launches postgres // is synchronous, but we always have this tokio runtime available and we "enter" it so // that you can use tokio::spawn() and tokio::runtime::Handle::current().block_on(...) @@ -160,34 +149,44 @@ fn main() -> Result<()> { let build_tag = runtime.block_on(init())?; - let scenario = failpoint_support::init(); - // enable core dumping for all child processes setrlimit(Resource::CORE, rlimit::INFINITY, rlimit::INFINITY)?; - let (pg_handle, start_pg_result) = { - // Enter startup tracing context - let _startup_context_guard = startup_context_from_env(); + let connstr = Url::parse(&cli.connstr).context("cannot parse connstr as a URL")?; - let cli_spec = try_spec_from_cli(&cli)?; + let cli_spec = try_spec_from_cli(&cli)?; - let compute = wait_spec(build_tag, &cli, cli_spec)?; + let compute_node = ComputeNode::new( + ComputeNodeParams { + compute_id: cli.compute_id, + connstr, + pgdata: cli.pgdata.clone(), + pgbin: cli.pgbin.clone(), + pgversion: get_pg_version_string(&cli.pgbin), + external_http_port: cli.external_http_port, + internal_http_port: cli.internal_http_port, + ext_remote_storage: cli.remote_ext_config.clone(), + resize_swap_on_bind: cli.resize_swap_on_bind, + set_disk_quota_for_fs: cli.set_disk_quota_for_fs, + #[cfg(target_os = "linux")] + filecache_connstr: cli.filecache_connstr, + #[cfg(target_os = "linux")] + cgroup: cli.cgroup, + #[cfg(target_os = "linux")] + vm_monitor_addr: cli.vm_monitor_addr, + build_tag, - start_postgres(&cli, compute)? 
+ live_config_allowed: cli_spec.live_config_allowed, + }, + cli_spec.spec, + cli_spec.compute_ctl_config, + )?; - // Startup is finished, exit the startup tracing span - }; - - // PostgreSQL is now running, if startup was successful. Wait until it exits. - let wait_pg_result = wait_postgres(pg_handle)?; - - let delay_exit = cleanup_after_postgres_exit(start_pg_result)?; - - maybe_delay_exit(delay_exit); + let exit_code = compute_node.run()?; scenario.teardown(); - deinit_and_exit(wait_pg_result); + deinit_and_exit(exit_code); } async fn init() -> Result { @@ -208,56 +207,6 @@ async fn init() -> Result { Ok(build_tag) } -fn startup_context_from_env() -> Option { - // Extract OpenTelemetry context for the startup actions from the - // TRACEPARENT and TRACESTATE env variables, and attach it to the current - // tracing context. - // - // This is used to propagate the context for the 'start_compute' operation - // from the neon control plane. This allows linking together the wider - // 'start_compute' operation that creates the compute container, with the - // startup actions here within the container. - // - // There is no standard for passing context in env variables, but a lot of - // tools use TRACEPARENT/TRACESTATE, so we use that convention too. See - // https://github.com/open-telemetry/opentelemetry-specification/issues/740 - // - // Switch to the startup context here, and exit it once the startup has - // completed and Postgres is up and running. - // - // If this pod is pre-created without binding it to any particular endpoint - // yet, this isn't the right place to enter the startup context. In that - // case, the control plane should pass the tracing context as part of the - // /configure API call. - // - // NOTE: This is supposed to only cover the *startup* actions. Once - // postgres is configured and up-and-running, we exit this span. Any other - // actions that are performed on incoming HTTP requests, for example, are - // performed in separate spans. - // - // XXX: If the pod is restarted, we perform the startup actions in the same - // context as the original startup actions, which probably doesn't make - // sense. 
- let mut startup_tracing_carrier: HashMap = HashMap::new(); - if let Ok(val) = std::env::var("TRACEPARENT") { - startup_tracing_carrier.insert("traceparent".to_string(), val); - } - if let Ok(val) = std::env::var("TRACESTATE") { - startup_tracing_carrier.insert("tracestate".to_string(), val); - } - if !startup_tracing_carrier.is_empty() { - use opentelemetry::propagation::TextMapPropagator; - use opentelemetry_sdk::propagation::TraceContextPropagator; - let guard = TraceContextPropagator::new() - .extract(&startup_tracing_carrier) - .attach(); - info!("startup tracing context attached"); - Some(guard) - } else { - None - } -} - fn try_spec_from_cli(cli: &Cli) -> Result { // First, try to get cluster spec from the cli argument if let Some(ref spec_json) = cli.spec_json { @@ -308,342 +257,7 @@ struct CliSpecParams { live_config_allowed: bool, } -fn wait_spec( - build_tag: String, - cli: &Cli, - CliSpecParams { - spec, - live_config_allowed, - compute_ctl_config: _, - }: CliSpecParams, -) -> Result> { - let mut new_state = ComputeState::new(); - let spec_set; - - if let Some(spec) = spec { - let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?; - info!("new pspec.spec: {:?}", pspec.spec); - new_state.pspec = Some(pspec); - spec_set = true; - } else { - spec_set = false; - } - let connstr = Url::parse(&cli.connstr).context("cannot parse connstr as a URL")?; - let conn_conf = postgres::config::Config::from_str(connstr.as_str()) - .context("cannot build postgres config from connstr")?; - let tokio_conn_conf = tokio_postgres::config::Config::from_str(connstr.as_str()) - .context("cannot build tokio postgres config from connstr")?; - let compute_node = ComputeNode { - compute_id: cli.compute_id.clone(), - connstr, - conn_conf, - tokio_conn_conf, - pgdata: cli.pgdata.clone(), - pgbin: cli.pgbin.clone(), - pgversion: get_pg_version_string(&cli.pgbin), - external_http_port: cli.external_http_port, - internal_http_port: cli.internal_http_port, - live_config_allowed, - state: Mutex::new(new_state), - state_changed: Condvar::new(), - ext_remote_storage: cli.remote_ext_config.clone(), - ext_download_progress: RwLock::new(HashMap::new()), - build_tag, - }; - let compute = Arc::new(compute_node); - - // If this is a pooled VM, prewarm before starting HTTP server and becoming - // available for binding. Prewarming helps Postgres start quicker later, - // because QEMU will already have its memory allocated from the host, and - // the necessary binaries will already be cached. - if !spec_set { - compute.prewarm_postgres()?; - } - - // Launch the external HTTP server first, so that we can serve control plane - // requests while configuration is still in progress. - Server::External(cli.external_http_port).launch(&compute); - - // The internal HTTP server could be launched later, but there isn't much - // sense in waiting. - Server::Internal(cli.internal_http_port).launch(&compute); - - if !spec_set { - // No spec provided, hang waiting for it. - info!("no compute spec provided, waiting"); - - let mut state = compute.state.lock().unwrap(); - while state.status != ComputeStatus::ConfigurationPending { - state = compute.state_changed.wait(state).unwrap(); - - if state.status == ComputeStatus::ConfigurationPending { - info!("got spec, continue configuration"); - // Spec is already set by the http server handler. - break; - } - } - - // Record for how long we slept waiting for the spec. 
- let now = Utc::now(); - state.metrics.wait_for_spec_ms = now - .signed_duration_since(state.start_time) - .to_std() - .unwrap() - .as_millis() as u64; - - // Reset start time, so that the total startup time that is calculated later will - // not include the time that we waited for the spec. - state.start_time = now; - } - - launch_lsn_lease_bg_task_for_static(&compute); - - Ok(compute) -} - -fn start_postgres( - cli: &Cli, - compute: Arc, -) -> Result<(Option, StartPostgresResult)> { - // We got all we need, update the state. - let mut state = compute.state.lock().unwrap(); - state.set_status(ComputeStatus::Init, &compute.state_changed); - - info!( - "running compute with features: {:?}", - state.pspec.as_ref().unwrap().spec.features - ); - // before we release the mutex, fetch some parameters for later. - let &ComputeSpec { - swap_size_bytes, - disk_quota_bytes, - #[cfg(target_os = "linux")] - disable_lfc_resizing, - .. - } = &state.pspec.as_ref().unwrap().spec; - drop(state); - - // Launch remaining service threads - let _monitor_handle = launch_monitor(&compute); - let _configurator_handle = launch_configurator(&compute); - - let mut prestartup_failed = false; - let mut delay_exit = false; - - // Resize swap to the desired size if the compute spec says so - if let (Some(size_bytes), true) = (swap_size_bytes, cli.resize_swap_on_bind) { - // To avoid 'swapoff' hitting postgres startup, we need to run resize-swap to completion - // *before* starting postgres. - // - // In theory, we could do this asynchronously if SkipSwapon was enabled for VMs, but this - // carries a risk of introducing hard-to-debug issues - e.g. if postgres sometimes gets - // OOM-killed during startup because swap wasn't available yet. - match resize_swap(size_bytes) { - Ok(()) => { - let size_mib = size_bytes as f32 / (1 << 20) as f32; // just for more coherent display. - info!(%size_bytes, %size_mib, "resized swap"); - } - Err(err) => { - let err = err.context("failed to resize swap"); - error!("{err:#}"); - - // Mark compute startup as failed; don't try to start postgres, and report this - // error to the control plane when it next asks. - prestartup_failed = true; - compute.set_failed_status(err); - delay_exit = true; - } - } - } - - // Set disk quota if the compute spec says so - if let (Some(disk_quota_bytes), Some(disk_quota_fs_mountpoint)) = - (disk_quota_bytes, cli.set_disk_quota_for_fs.as_ref()) - { - match set_disk_quota(disk_quota_bytes, disk_quota_fs_mountpoint) { - Ok(()) => { - let size_mib = disk_quota_bytes as f32 / (1 << 20) as f32; // just for more coherent display. - info!(%disk_quota_bytes, %size_mib, "set disk quota"); - } - Err(err) => { - let err = err.context("failed to set disk quota"); - error!("{err:#}"); - - // Mark compute startup as failed; don't try to start postgres, and report this - // error to the control plane when it next asks. - prestartup_failed = true; - compute.set_failed_status(err); - delay_exit = true; - } - } - } - - // Start Postgres - let mut pg = None; - if !prestartup_failed { - pg = match compute.start_compute() { - Ok(pg) => { - info!(postmaster_pid = %pg.0.id(), "Postgres was started"); - Some(pg) - } - Err(err) => { - error!("could not start the compute node: {:#}", err); - compute.set_failed_status(err); - delay_exit = true; - None - } - }; - } else { - warn!("skipping postgres startup because pre-startup step failed"); - } - - // Start the vm-monitor if directed to. The vm-monitor only runs on linux - // because it requires cgroups. - cfg_if::cfg_if! 
{ - if #[cfg(target_os = "linux")] { - use std::env; - use tokio_util::sync::CancellationToken; - - // This token is used internally by the monitor to clean up all threads - let token = CancellationToken::new(); - - // don't pass postgres connection string to vm-monitor if we don't want it to resize LFC - let pgconnstr = if disable_lfc_resizing.unwrap_or(false) { - None - } else { - Some(cli.filecache_connstr.clone()) - }; - - let vm_monitor = if env::var_os("AUTOSCALING").is_some() { - let vm_monitor = tokio::spawn(vm_monitor::start( - Box::leak(Box::new(vm_monitor::Args { - cgroup: Some(cli.cgroup.clone()), - pgconnstr, - addr: cli.vm_monitor_addr.clone(), - })), - token.clone(), - )); - Some(vm_monitor) - } else { - None - }; - } - } - - Ok(( - pg, - StartPostgresResult { - delay_exit, - compute, - #[cfg(target_os = "linux")] - token, - #[cfg(target_os = "linux")] - vm_monitor, - }, - )) -} - -type PostgresHandle = (std::process::Child, tokio::task::JoinHandle>); - -struct StartPostgresResult { - delay_exit: bool, - // passed through from WaitSpecResult - compute: Arc, - - #[cfg(target_os = "linux")] - token: tokio_util::sync::CancellationToken, - #[cfg(target_os = "linux")] - vm_monitor: Option>>, -} - -fn wait_postgres(pg: Option) -> Result { - // Wait for the child Postgres process forever. In this state Ctrl+C will - // propagate to Postgres and it will be shut down as well. - let mut exit_code = None; - if let Some((mut pg, logs_handle)) = pg { - info!(postmaster_pid = %pg.id(), "Waiting for Postgres to exit"); - - let ecode = pg - .wait() - .expect("failed to start waiting on Postgres process"); - PG_PID.store(0, Ordering::SeqCst); - - // Process has exited. Wait for the log collecting task to finish. - let _ = tokio::runtime::Handle::current() - .block_on(logs_handle) - .map_err(|e| tracing::error!("log task panicked: {:?}", e)); - - info!("Postgres exited with code {}, shutting down", ecode); - exit_code = ecode.code() - } - - Ok(WaitPostgresResult { exit_code }) -} - -struct WaitPostgresResult { - exit_code: Option, -} - -fn cleanup_after_postgres_exit( - StartPostgresResult { - mut delay_exit, - compute, - #[cfg(target_os = "linux")] - vm_monitor, - #[cfg(target_os = "linux")] - token, - }: StartPostgresResult, -) -> Result { - // Terminate the vm_monitor so it releases the file watcher on - // /sys/fs/cgroup/neon-postgres. - // Note: the vm-monitor only runs on linux because it requires cgroups. - cfg_if::cfg_if! 
{ - if #[cfg(target_os = "linux")] { - if let Some(handle) = vm_monitor { - // Kills all threads spawned by the monitor - token.cancel(); - // Kills the actual task running the monitor - handle.abort(); - } - } - } - - // Maybe sync safekeepers again, to speed up next startup - let compute_state = compute.state.lock().unwrap().clone(); - let pspec = compute_state.pspec.as_ref().expect("spec must be set"); - if matches!(pspec.spec.mode, compute_api::spec::ComputeMode::Primary) { - info!("syncing safekeepers on shutdown"); - let storage_auth_token = pspec.storage_auth_token.clone(); - let lsn = compute.sync_safekeepers(storage_auth_token)?; - info!("synced safekeepers at lsn {lsn}"); - } - - let mut state = compute.state.lock().unwrap(); - if state.status == ComputeStatus::TerminationPending { - state.status = ComputeStatus::Terminated; - compute.state_changed.notify_all(); - // we were asked to terminate gracefully, don't exit to avoid restart - delay_exit = true - } - drop(state); - - if let Err(err) = compute.check_for_core_dumps() { - error!("error while checking for core dumps: {err:?}"); - } - - Ok(delay_exit) -} - -fn maybe_delay_exit(delay_exit: bool) { - // If launch failed, keep serving HTTP requests for a while, so the cloud - // control plane can get the actual error. - if delay_exit { - info!("giving control plane 30s to collect the error before shutdown"); - thread::sleep(Duration::from_secs(30)); - } -} - -fn deinit_and_exit(WaitPostgresResult { exit_code }: WaitPostgresResult) -> ! { +fn deinit_and_exit(exit_code: Option) -> ! { // Shutdown trace pipeline gracefully, so that it has a chance to send any // pending traces before we exit. Shutting down OTEL tracing provider may // hang for quite some time, see, for example: diff --git a/compute_tools/src/bin/fast_import.rs b/compute_tools/src/bin/fast_import.rs index 585f3e4e1d..47558be7a0 100644 --- a/compute_tools/src/bin/fast_import.rs +++ b/compute_tools/src/bin/fast_import.rs @@ -25,13 +25,13 @@ //! docker push localhost:3030/localregistry/compute-node-v14:latest //! ``` -use anyhow::{bail, Context}; +use anyhow::{Context, bail}; use aws_config::BehaviorVersion; use camino::{Utf8Path, Utf8PathBuf}; use clap::{Parser, Subcommand}; -use compute_tools::extension_server::{get_pg_version, PostgresMajorVersion}; +use compute_tools::extension_server::{PostgresMajorVersion, get_pg_version}; use nix::unistd::Pid; -use tracing::{error, info, info_span, warn, Instrument}; +use tracing::{Instrument, error, info, info_span, warn}; use utils::fs_ext::is_directory_empty; #[path = "fast_import/aws_s3_sync.rs"] @@ -558,7 +558,9 @@ async fn cmd_dumprestore( decode_connstring(kms_client.as_ref().unwrap(), &key_id, dest_ciphertext) .await? 
} else { - bail!("destination connection string must be provided in spec for dump_restore command"); + bail!( + "destination connection string must be provided in spec for dump_restore command" + ); }; (source, dest) diff --git a/compute_tools/src/bin/fast_import/aws_s3_sync.rs b/compute_tools/src/bin/fast_import/aws_s3_sync.rs index 1be10b36d6..d8d007da71 100644 --- a/compute_tools/src/bin/fast_import/aws_s3_sync.rs +++ b/compute_tools/src/bin/fast_import/aws_s3_sync.rs @@ -1,11 +1,10 @@ use camino::{Utf8Path, Utf8PathBuf}; use tokio::task::JoinSet; +use tracing::{info, warn}; use walkdir::WalkDir; use super::s3_uri::S3Uri; -use tracing::{info, warn}; - const MAX_PARALLEL_UPLOADS: usize = 10; /// Upload all files from 'local' to 'remote' diff --git a/compute_tools/src/bin/fast_import/s3_uri.rs b/compute_tools/src/bin/fast_import/s3_uri.rs index 52bbef420f..cf4dab7c02 100644 --- a/compute_tools/src/bin/fast_import/s3_uri.rs +++ b/compute_tools/src/bin/fast_import/s3_uri.rs @@ -1,6 +1,7 @@ -use anyhow::Result; use std::str::FromStr; +use anyhow::Result; + /// Struct to hold parsed S3 components #[derive(Debug, Clone, PartialEq, Eq)] pub struct S3Uri { diff --git a/compute_tools/src/catalog.rs b/compute_tools/src/catalog.rs index 28b10ce21c..db3e07e086 100644 --- a/compute_tools/src/catalog.rs +++ b/compute_tools/src/catalog.rs @@ -1,18 +1,20 @@ +use std::path::Path; +use std::process::Stdio; +use std::result::Result; +use std::sync::Arc; + +use compute_api::responses::CatalogObjects; use futures::Stream; use postgres::NoTls; -use std::{path::Path, process::Stdio, result::Result, sync::Arc}; -use tokio::{ - io::{AsyncBufReadExt, BufReader}, - process::Command, - spawn, -}; +use tokio::io::{AsyncBufReadExt, BufReader}; +use tokio::process::Command; +use tokio::spawn; use tokio_stream::{self as stream, StreamExt}; use tokio_util::codec::{BytesCodec, FramedRead}; use tracing::warn; use crate::compute::ComputeNode; use crate::pg_helpers::{get_existing_dbs_async, get_existing_roles_async, postgres_conf_for_db}; -use compute_api::responses::CatalogObjects; pub async fn get_dbs_and_roles(compute: &Arc) -> anyhow::Result { let conf = compute.get_tokio_conn_conf(Some("compute_ctl:get_dbs_and_roles")); @@ -55,15 +57,15 @@ pub enum SchemaDumpError { pub async fn get_database_schema( compute: &Arc, dbname: &str, -) -> Result>, SchemaDumpError> { - let pgbin = &compute.pgbin; +) -> Result> + use<>, SchemaDumpError> { + let pgbin = &compute.params.pgbin; let basepath = Path::new(pgbin).parent().unwrap(); let pgdump = basepath.join("pg_dump"); // Replace the DB in the connection string and disable it to parts. // This is the only option to handle DBs with special characters. 
- let conf = - postgres_conf_for_db(&compute.connstr, dbname).map_err(|_| SchemaDumpError::Unexpected)?; + let conf = postgres_conf_for_db(&compute.params.connstr, dbname) + .map_err(|_| SchemaDumpError::Unexpected)?; let host = conf .get_hosts() .first() diff --git a/compute_tools/src/checker.rs b/compute_tools/src/checker.rs index 62d61a8bc9..e4207876ac 100644 --- a/compute_tools/src/checker.rs +++ b/compute_tools/src/checker.rs @@ -1,4 +1,4 @@ -use anyhow::{anyhow, Ok, Result}; +use anyhow::{Ok, Result, anyhow}; use tokio_postgres::NoTls; use tracing::{error, instrument, warn}; diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index d323ea3dcd..354528e2cd 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -1,74 +1,83 @@ -use std::collections::{HashMap, HashSet}; -use std::env; -use std::fs; -use std::iter::once; -use std::os::unix::fs::{symlink, PermissionsExt}; +use std::collections::HashMap; +use std::os::unix::fs::{PermissionsExt, symlink}; use std::path::Path; use std::process::{Command, Stdio}; use std::str::FromStr; -use std::sync::atomic::AtomicU32; -use std::sync::atomic::Ordering; +use std::sync::atomic::{AtomicU32, Ordering}; use std::sync::{Arc, Condvar, Mutex, RwLock}; -use std::time::Duration; -use std::time::Instant; +use std::time::{Duration, Instant}; +use std::{env, fs}; use anyhow::{Context, Result}; use chrono::{DateTime, Utc}; -use compute_api::spec::{Database, PgIdent, Role}; +use compute_api::privilege::Privilege; +use compute_api::responses::{ComputeCtlConfig, ComputeMetrics, ComputeStatus}; +use compute_api::spec::{ + ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PgIdent, +}; +use futures::StreamExt; use futures::future::join_all; use futures::stream::FuturesUnordered; -use futures::StreamExt; +use nix::sys::signal::{Signal, kill}; use nix::unistd::Pid; use postgres; -use postgres::error::SqlState; use postgres::NoTls; -use tracing::{debug, error, info, instrument, warn}; -use utils::id::{TenantId, TimelineId}; -use utils::lsn::Lsn; - -use compute_api::privilege::Privilege; -use compute_api::responses::{ComputeMetrics, ComputeStatus}; -use compute_api::spec::{ComputeFeature, ComputeMode, ComputeSpec, ExtVersion}; -use utils::measured_stream::MeasuredReader; - -use nix::sys::signal::{kill, Signal}; +use postgres::error::SqlState; use remote_storage::{DownloadError, RemotePath}; use tokio::spawn; +use tracing::{Instrument, debug, error, info, instrument, warn}; +use utils::id::{TenantId, TimelineId}; +use utils::lsn::Lsn; +use utils::measured_stream::MeasuredReader; +use crate::configurator::launch_configurator; +use crate::disk_quota::set_disk_quota; use crate::installed_extensions::get_installed_extensions; -use crate::local_proxy; +use crate::logger::startup_context_from_env; +use crate::lsn_lease::launch_lsn_lease_bg_task_for_static; +use crate::monitor::launch_monitor; use crate::pg_helpers::*; +use crate::rsyslog::configure_audit_rsyslog; use crate::spec::*; -use crate::spec_apply::ApplySpecPhase::{ - CreateAndAlterDatabases, CreateAndAlterRoles, CreateAvailabilityCheck, CreateSchemaNeon, - CreateSuperUser, DropInvalidDatabases, DropRoles, FinalizeDropLogicalSubscriptions, - HandleNeonExtension, HandleOtherExtensions, RenameAndDeleteDatabases, RenameRoles, - RunInEachDatabase, -}; -use crate::spec_apply::PerDatabasePhase; -use crate::spec_apply::PerDatabasePhase::{ - ChangeSchemaPerms, DeleteDBRoleReferences, DropLogicalSubscriptions, HandleAnonExtension, -}; -use 
crate::spec_apply::{apply_operations, MutableApplyContext, DB}; +use crate::swap::resize_swap; use crate::sync_sk::{check_if_synced, ping_safekeeper}; -use crate::{config, extension_server}; +use crate::{config, extension_server, local_proxy}; pub static SYNC_SAFEKEEPERS_PID: AtomicU32 = AtomicU32::new(0); pub static PG_PID: AtomicU32 = AtomicU32::new(0); -/// Compute node info shared across several `compute_ctl` threads. -pub struct ComputeNode { +/// Static configuration params that don't change after startup. These mostly +/// come from the CLI args, or are derived from them. +pub struct ComputeNodeParams { /// The ID of the compute pub compute_id: String, // Url type maintains proper escaping pub connstr: url::Url, - // We connect to Postgres from many different places, so build configs once - // and reuse them where needed. - pub conn_conf: postgres::config::Config, - pub tokio_conn_conf: tokio_postgres::config::Config, + + pub resize_swap_on_bind: bool, + pub set_disk_quota_for_fs: Option, + + // VM monitor parameters + #[cfg(target_os = "linux")] + pub filecache_connstr: String, + #[cfg(target_os = "linux")] + pub cgroup: String, + #[cfg(target_os = "linux")] + pub vm_monitor_addr: String, + pub pgdata: String, pub pgbin: String, pub pgversion: String, + pub build_tag: String, + + /// The port that the compute's external HTTP server listens on + pub external_http_port: u16, + /// The port that the compute's internal HTTP server listens on + pub internal_http_port: u16, + + /// the address of extension storage proxy gateway + pub ext_remote_storage: Option, + /// We should only allow live re- / configuration of the compute node if /// it uses 'pull model', i.e. it can go to control-plane and fetch /// the latest configuration. Otherwise, there could be a case: @@ -82,10 +91,17 @@ pub struct ComputeNode { /// - we push spec and it does configuration /// - but then it is restarted without any spec again pub live_config_allowed: bool, - /// The port that the compute's external HTTP server listens on - pub external_http_port: u16, - /// The port that the compute's internal HTTP server listens on - pub internal_http_port: u16, +} + +/// Compute node info shared across several `compute_ctl` threads. +pub struct ComputeNode { + pub params: ComputeNodeParams, + + // We connect to Postgres from many different places, so build configs once + // and reuse them where needed. These are derived from 'params.connstr' + pub conn_conf: postgres::config::Config, + pub tokio_conn_conf: tokio_postgres::config::Config, + /// Volatile part of the `ComputeNode`, which should be used under `Mutex`. /// To allow HTTP API server to serving status requests, while configuration /// is in progress, lock should be held only for short periods of time to do @@ -93,11 +109,9 @@ pub struct ComputeNode { pub state: Mutex, /// `Condvar` to allow notifying waiters about state changes. pub state_changed: Condvar, - /// the address of extension storage proxy gateway - pub ext_remote_storage: Option, + // key: ext_archive_name, value: started download time, download_completed? pub ext_download_progress: RwLock, bool)>>, - pub build_tag: String, } // store some metrics about download size that might impact startup time @@ -116,7 +130,25 @@ pub struct ComputeState { /// compute wasn't used since start. pub last_active: Option>, pub error: Option, + + /// Compute spec. This can be received from the CLI or - more likely - + /// passed by the control plane with a /configure HTTP request. 
pub pspec: Option, + + pub compute_ctl_config: ComputeCtlConfig, + + /// If the spec is passed by a /configure request, 'startup_span' is the + /// /configure request's tracing span. The main thread enters it when it + /// processes the compute startup, so that the compute startup is considered + /// to be part of the /configure request for tracing purposes. + /// + /// If the request handling thread/task called startup_compute() directly, + /// it would automatically be a child of the request handling span, and we + /// wouldn't need this. But because we use the main thread to perform the + /// startup, and the /configure task just waits for it to finish, we need to + /// set up the span relationship ourselves. + pub startup_span: Option, + pub metrics: ComputeMetrics, } @@ -128,6 +160,8 @@ impl ComputeState { last_active: None, error: None, pspec: None, + compute_ctl_config: ComputeCtlConfig::default(), + startup_span: None, metrics: ComputeMetrics::default(), } } @@ -244,80 +278,518 @@ fn maybe_cgexec(cmd: &str) -> Command { } } -pub(crate) fn construct_superuser_query(spec: &ComputeSpec) -> String { - let roles = spec - .cluster - .roles - .iter() - .map(|r| escape_literal(&r.name)) - .collect::>(); +struct PostgresHandle { + postgres: std::process::Child, + log_collector: tokio::task::JoinHandle>, +} - let dbs = spec - .cluster - .databases - .iter() - .map(|db| escape_literal(&db.name)) - .collect::>(); +impl PostgresHandle { + /// Return PID of the postgres (postmaster) process + fn pid(&self) -> Pid { + Pid::from_raw(self.postgres.id() as i32) + } +} - let roles_decl = if roles.is_empty() { - String::from("roles text[] := NULL;") - } else { - format!( - r#" - roles text[] := ARRAY(SELECT rolname - FROM pg_catalog.pg_roles - WHERE rolname IN ({}));"#, - roles.join(", ") - ) - }; - - let database_decl = if dbs.is_empty() { - String::from("dbs text[] := NULL;") - } else { - format!( - r#" - dbs text[] := ARRAY(SELECT datname - FROM pg_catalog.pg_database - WHERE datname IN ({}));"#, - dbs.join(", ") - ) - }; - - // ALL PRIVILEGES grants CREATE, CONNECT, and TEMPORARY on all databases - // (see https://www.postgresql.org/docs/current/ddl-priv.html) - let query = format!( - r#" - DO $$ - DECLARE - r text; - {} - {} - BEGIN - IF NOT EXISTS ( - SELECT FROM pg_catalog.pg_roles WHERE rolname = 'neon_superuser') - THEN - CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN REPLICATION BYPASSRLS IN ROLE pg_read_all_data, pg_write_all_data; - IF array_length(roles, 1) IS NOT NULL THEN - EXECUTE format('GRANT neon_superuser TO %s', - array_to_string(ARRAY(SELECT quote_ident(x) FROM unnest(roles) as x), ', ')); - FOREACH r IN ARRAY roles LOOP - EXECUTE format('ALTER ROLE %s CREATEROLE CREATEDB', quote_ident(r)); - END LOOP; - END IF; - IF array_length(dbs, 1) IS NOT NULL THEN - EXECUTE format('GRANT ALL PRIVILEGES ON DATABASE %s TO neon_superuser', - array_to_string(ARRAY(SELECT quote_ident(x) FROM unnest(dbs) as x), ', ')); - END IF; - END IF; - END - $$;"#, - roles_decl, database_decl, - ); - - query +struct StartVmMonitorResult { + #[cfg(target_os = "linux")] + token: tokio_util::sync::CancellationToken, + #[cfg(target_os = "linux")] + vm_monitor: Option>>, } impl ComputeNode { + pub fn new( + params: ComputeNodeParams, + cli_spec: Option, + compute_ctl_config: ComputeCtlConfig, + ) -> Result { + let connstr = params.connstr.as_str(); + let conn_conf = postgres::config::Config::from_str(connstr) + .context("cannot build postgres config from connstr")?; + let tokio_conn_conf = 
tokio_postgres::config::Config::from_str(connstr) + .context("cannot build tokio postgres config from connstr")?; + + let mut new_state = ComputeState::new(); + if let Some(cli_spec) = cli_spec { + let pspec = ParsedSpec::try_from(cli_spec).map_err(|msg| anyhow::anyhow!(msg))?; + new_state.pspec = Some(pspec); + } + new_state.compute_ctl_config = compute_ctl_config; + + Ok(ComputeNode { + params, + conn_conf, + tokio_conn_conf, + state: Mutex::new(new_state), + state_changed: Condvar::new(), + ext_download_progress: RwLock::new(HashMap::new()), + }) + } + + /// Top-level control flow of compute_ctl. Returns a process exit code we should + /// exit with. + pub fn run(self) -> Result> { + let this = Arc::new(self); + + let cli_spec = this.state.lock().unwrap().pspec.clone(); + + // If this is a pooled VM, prewarm before starting HTTP server and becoming + // available for binding. Prewarming helps Postgres start quicker later, + // because QEMU will already have its memory allocated from the host, and + // the necessary binaries will already be cached. + if cli_spec.is_none() { + this.prewarm_postgres()?; + } + + // Launch the external HTTP server first, so that we can serve control plane + // requests while configuration is still in progress. + crate::http::server::Server::External { + port: this.params.external_http_port, + jwks: this.state.lock().unwrap().compute_ctl_config.jwks.clone(), + compute_id: this.params.compute_id.clone(), + } + .launch(&this); + + // The internal HTTP server could be launched later, but there isn't much + // sense in waiting. + crate::http::server::Server::Internal { + port: this.params.internal_http_port, + } + .launch(&this); + + // If we got a spec from the CLI already, use that. Otherwise wait for the + // control plane to pass it to us with a /configure HTTP request + let pspec = if let Some(cli_spec) = cli_spec { + cli_spec + } else { + this.wait_spec()? + }; + + launch_lsn_lease_bg_task_for_static(&this); + + // We have a spec, start the compute + let mut delay_exit = false; + let mut vm_monitor = None; + let mut pg_process: Option = None; + + match this.start_compute(&mut pg_process) { + Ok(()) => { + // Success! Launch remaining services (just vm-monitor currently) + vm_monitor = + Some(this.start_vm_monitor(pspec.spec.disable_lfc_resizing.unwrap_or(false))); + } + Err(err) => { + // Something went wrong with the startup. Log it and expose the error to + // HTTP status requests. + error!("could not start the compute node: {:#}", err); + this.set_failed_status(err); + delay_exit = true; + + // If the error happened after starting PostgreSQL, kill it + if let Some(ref pg_process) = pg_process { + kill(pg_process.pid(), Signal::SIGQUIT).ok(); + } + } + } + + // If startup was successful, or it failed in the late stages, + // PostgreSQL is now running. Wait until it exits. + let exit_code = if let Some(pg_handle) = pg_process { + let exit_status = this.wait_postgres(pg_handle); + info!("Postgres exited with code {}, shutting down", exit_status); + exit_status.code() + } else { + None + }; + + // Terminate the vm_monitor so it releases the file watcher on + // /sys/fs/cgroup/neon-postgres. + // Note: the vm-monitor only runs on linux because it requires cgroups. + if let Some(vm_monitor) = vm_monitor { + cfg_if::cfg_if! 
{ + if #[cfg(target_os = "linux")] { + // Kills all threads spawned by the monitor + vm_monitor.token.cancel(); + if let Some(handle) = vm_monitor.vm_monitor { + // Kills the actual task running the monitor + handle.abort(); + } + } else { + _ = vm_monitor; // appease unused lint on macOS + } + } + } + + // Reap the postgres process + delay_exit |= this.cleanup_after_postgres_exit()?; + + // If launch failed, keep serving HTTP requests for a while, so the cloud + // control plane can get the actual error. + if delay_exit { + info!("giving control plane 30s to collect the error before shutdown"); + std::thread::sleep(Duration::from_secs(30)); + } + Ok(exit_code) + } + + pub fn wait_spec(&self) -> Result { + info!("no compute spec provided, waiting"); + let mut state = self.state.lock().unwrap(); + while state.status != ComputeStatus::ConfigurationPending { + state = self.state_changed.wait(state).unwrap(); + } + + info!("got spec, continue configuration"); + let spec = state.pspec.as_ref().unwrap().clone(); + + // Record for how long we slept waiting for the spec. + let now = Utc::now(); + state.metrics.wait_for_spec_ms = now + .signed_duration_since(state.start_time) + .to_std() + .unwrap() + .as_millis() as u64; + + // Reset start time, so that the total startup time that is calculated later will + // not include the time that we waited for the spec. + state.start_time = now; + + Ok(spec) + } + + /// Start compute. + /// + /// Prerequisites: + /// - the compute spec has been placed in self.state.pspec + /// + /// On success: + /// - status is set to ComputeStatus::Running + /// - self.running_postgres is set + /// + /// On error: + /// - status is left in ComputeStatus::Init. The caller is responsible for setting it to Failed + /// - if Postgres was started before the fatal error happened, self.running_postgres is + /// set. The caller is responsible for killing it. + /// + /// Note that this is in the critical path of a compute cold start. Keep this fast. + /// Try to do things concurrently, to hide the latencies. + fn start_compute(self: &Arc, pg_handle: &mut Option) -> Result<()> { + let compute_state: ComputeState; + + let start_compute_span; + let _this_entered; + { + let mut state_guard = self.state.lock().unwrap(); + + // Create a tracing span for the startup operation. + // + // We could otherwise just annotate the function with #[instrument], but if + // we're being configured from a /configure HTTP request, we want the + // startup to be considered part of the /configure request. + // + // Similarly, if a trace ID was passed in env variables, attach it to the span. + start_compute_span = { + // Temporarily enter the parent span, so that the new span becomes its child. 
+ if let Some(p) = state_guard.startup_span.take() { + let _parent_entered = p.entered(); + tracing::info_span!("start_compute") + } else if let Some(otel_context) = startup_context_from_env() { + use tracing_opentelemetry::OpenTelemetrySpanExt; + let span = tracing::info_span!("start_compute"); + span.set_parent(otel_context); + span + } else { + tracing::info_span!("start_compute") + } + }; + _this_entered = start_compute_span.enter(); + + state_guard.set_status(ComputeStatus::Init, &self.state_changed); + compute_state = state_guard.clone() + } + + let pspec = compute_state.pspec.as_ref().expect("spec must be set"); + info!( + "starting compute for project {}, operation {}, tenant {}, timeline {}, features {:?}, spec.remote_extensions {:?}", + pspec.spec.cluster.cluster_id.as_deref().unwrap_or("None"), + pspec.spec.operation_uuid.as_deref().unwrap_or("None"), + pspec.tenant_id, + pspec.timeline_id, + pspec.spec.features, + pspec.spec.remote_extensions, + ); + + ////// PRE-STARTUP PHASE: things that need to be finished before we start the Postgres process + + // Collect all the tasks that must finish here + let mut pre_tasks = tokio::task::JoinSet::new(); + + // If there are any remote extensions in shared_preload_libraries, start downloading them + if pspec.spec.remote_extensions.is_some() { + let (this, spec) = (self.clone(), pspec.spec.clone()); + pre_tasks.spawn(async move { + this.download_preload_extensions(&spec) + .in_current_span() + .await + }); + } + + // Prepare pgdata directory. This downloads the basebackup, among other things. + { + let (this, cs) = (self.clone(), compute_state.clone()); + pre_tasks.spawn_blocking_child(move || this.prepare_pgdata(&cs)); + } + + // Resize swap to the desired size if the compute spec says so + if let (Some(size_bytes), true) = + (pspec.spec.swap_size_bytes, self.params.resize_swap_on_bind) + { + pre_tasks.spawn_blocking_child(move || { + // To avoid 'swapoff' hitting postgres startup, we need to run resize-swap to completion + // *before* starting postgres. + // + // In theory, we could do this asynchronously if SkipSwapon was enabled for VMs, but this + // carries a risk of introducing hard-to-debug issues - e.g. if postgres sometimes gets + // OOM-killed during startup because swap wasn't available yet. + resize_swap(size_bytes).context("failed to resize swap")?; + let size_mib = size_bytes as f32 / (1 << 20) as f32; // just for more coherent display. + info!(%size_bytes, %size_mib, "resized swap"); + + Ok::<(), anyhow::Error>(()) + }); + } + + // Set disk quota if the compute spec says so + if let (Some(disk_quota_bytes), Some(disk_quota_fs_mountpoint)) = ( + pspec.spec.disk_quota_bytes, + self.params.set_disk_quota_for_fs.as_ref(), + ) { + let disk_quota_fs_mountpoint = disk_quota_fs_mountpoint.clone(); + pre_tasks.spawn_blocking_child(move || { + set_disk_quota(disk_quota_bytes, &disk_quota_fs_mountpoint) + .context("failed to set disk quota")?; + let size_mib = disk_quota_bytes as f32 / (1 << 20) as f32; // just for more coherent display. + info!(%disk_quota_bytes, %size_mib, "set disk quota"); + + Ok::<(), anyhow::Error>(()) + }); + } + + // tune pgbouncer + if let Some(pgbouncer_settings) = &pspec.spec.pgbouncer_settings { + info!("tuning pgbouncer"); + + // Spawn a background task to do the tuning, + // so that we don't block the main thread that starts Postgres. 
+ let pgbouncer_settings = pgbouncer_settings.clone(); + let _handle = tokio::spawn(async move { + let res = tune_pgbouncer(pgbouncer_settings).await; + if let Err(err) = res { + error!("error while tuning pgbouncer: {err:?}"); + // Continue with the startup anyway + } + }); + } + + // configure local_proxy + if let Some(local_proxy) = &pspec.spec.local_proxy_config { + info!("configuring local_proxy"); + + // Spawn a background task to do the configuration, + // so that we don't block the main thread that starts Postgres. + let local_proxy = local_proxy.clone(); + let _handle = tokio::spawn(async move { + if let Err(err) = local_proxy::configure(&local_proxy) { + error!("error while configuring local_proxy: {err:?}"); + // Continue with the startup anyway + } + }); + } + + // Configure and start rsyslog if necessary + if let ComputeAudit::Hipaa = pspec.spec.audit_log_level { + let remote_endpoint = std::env::var("AUDIT_LOGGING_ENDPOINT").unwrap_or("".to_string()); + if remote_endpoint.is_empty() { + anyhow::bail!("AUDIT_LOGGING_ENDPOINT is empty"); + } + + let log_directory_path = Path::new(&self.params.pgdata).join("log"); + // TODO: make this more robust + // now rsyslog starts once and there is no monitoring or restart if it fails + configure_audit_rsyslog( + log_directory_path.to_str().unwrap(), + "hipaa", + &remote_endpoint, + )?; + } + + // Launch remaining service threads + let _monitor_handle = launch_monitor(self); + let _configurator_handle = launch_configurator(self); + + // Wait for all the pre-tasks to finish before starting postgres + let rt = tokio::runtime::Handle::current(); + while let Some(res) = rt.block_on(pre_tasks.join_next()) { + res??; + } + + ////// START POSTGRES + let start_time = Utc::now(); + let pg_process = self.start_postgres(pspec.storage_auth_token.clone())?; + let postmaster_pid = pg_process.pid(); + *pg_handle = Some(pg_process); + + // If this is a primary endpoint, perform some post-startup configuration before + // opening it up for the world. + let config_time = Utc::now(); + if pspec.spec.mode == ComputeMode::Primary { + self.configure_as_primary(&compute_state)?; + + let conf = self.get_conn_conf(None); + tokio::task::spawn_blocking(|| { + let res = get_installed_extensions(conf); + match res { + Ok(extensions) => { + info!( + "[NEON_EXT_STAT] {}", + serde_json::to_string(&extensions) + .expect("failed to serialize extensions list") + ); + } + Err(err) => error!("could not get installed extensions: {err:?}"), + } + }); + } + + // All done! 
+ let startup_end_time = Utc::now(); + let metrics = { + let mut state = self.state.lock().unwrap(); + state.metrics.start_postgres_ms = config_time + .signed_duration_since(start_time) + .to_std() + .unwrap() + .as_millis() as u64; + state.metrics.config_ms = startup_end_time + .signed_duration_since(config_time) + .to_std() + .unwrap() + .as_millis() as u64; + state.metrics.total_startup_ms = startup_end_time + .signed_duration_since(compute_state.start_time) + .to_std() + .unwrap() + .as_millis() as u64; + state.metrics.clone() + }; + self.set_status(ComputeStatus::Running); + + // Log metrics so that we can search for slow operations in logs + info!(?metrics, postmaster_pid = %postmaster_pid, "compute start finished"); + + Ok(()) + } + + #[instrument(skip_all)] + async fn download_preload_extensions(&self, spec: &ComputeSpec) -> Result<()> { + let remote_extensions = if let Some(remote_extensions) = &spec.remote_extensions { + remote_extensions + } else { + return Ok(()); + }; + + // First, create control files for all available extensions + extension_server::create_control_files(remote_extensions, &self.params.pgbin); + + let library_load_start_time = Utc::now(); + let remote_ext_metrics = self.prepare_preload_libraries(spec).await?; + + let library_load_time = Utc::now() + .signed_duration_since(library_load_start_time) + .to_std() + .unwrap() + .as_millis() as u64; + let mut state = self.state.lock().unwrap(); + state.metrics.load_ext_ms = library_load_time; + state.metrics.num_ext_downloaded = remote_ext_metrics.num_ext_downloaded; + state.metrics.largest_ext_size = remote_ext_metrics.largest_ext_size; + state.metrics.total_ext_download_size = remote_ext_metrics.total_ext_download_size; + info!( + "Loading shared_preload_libraries took {:?}ms", + library_load_time + ); + info!("{:?}", remote_ext_metrics); + + Ok(()) + } + + /// Start the vm-monitor if directed to. The vm-monitor only runs on linux + /// because it requires cgroups. + fn start_vm_monitor(&self, disable_lfc_resizing: bool) -> StartVmMonitorResult { + cfg_if::cfg_if! 
{ + if #[cfg(target_os = "linux")] { + use std::env; + use tokio_util::sync::CancellationToken; + + // This token is used internally by the monitor to clean up all threads + let token = CancellationToken::new(); + + // don't pass postgres connection string to vm-monitor if we don't want it to resize LFC + let pgconnstr = if disable_lfc_resizing { + None + } else { + Some(self.params.filecache_connstr.clone()) + }; + + let vm_monitor = if env::var_os("AUTOSCALING").is_some() { + let vm_monitor = tokio::spawn(vm_monitor::start( + Box::leak(Box::new(vm_monitor::Args { + cgroup: Some(self.params.cgroup.clone()), + pgconnstr, + addr: self.params.vm_monitor_addr.clone(), + })), + token.clone(), + )); + Some(vm_monitor) + } else { + None + }; + StartVmMonitorResult { token, vm_monitor } + } else { + _ = disable_lfc_resizing; // appease unused lint on macOS + StartVmMonitorResult { } + } + } + } + + fn cleanup_after_postgres_exit(&self) -> Result { + // Maybe sync safekeepers again, to speed up next startup + let compute_state = self.state.lock().unwrap().clone(); + let pspec = compute_state.pspec.as_ref().expect("spec must be set"); + if matches!(pspec.spec.mode, compute_api::spec::ComputeMode::Primary) { + info!("syncing safekeepers on shutdown"); + let storage_auth_token = pspec.storage_auth_token.clone(); + let lsn = self.sync_safekeepers(storage_auth_token)?; + info!("synced safekeepers at lsn {lsn}"); + } + + let mut delay_exit = false; + let mut state = self.state.lock().unwrap(); + if state.status == ComputeStatus::TerminationPending { + state.status = ComputeStatus::Terminated; + self.state_changed.notify_all(); + // we were asked to terminate gracefully, don't exit to avoid restart + delay_exit = true + } + drop(state); + + if let Err(err) = self.check_for_core_dumps() { + error!("error while checking for core dumps: {err:?}"); + } + + Ok(delay_exit) + } + /// Check that compute node has corresponding feature enabled. pub fn has_feature(&self, feature: ComputeFeature) -> bool { let state = self.state.lock().unwrap(); @@ -356,9 +828,10 @@ impl ComputeNode { fn create_pgdata(&self) -> Result<()> { // Ignore removal error, likely it is a 'No such file or directory (os error 2)'. // If it is something different then create_dir() will error out anyway. - let _ok = fs::remove_dir_all(&self.pgdata); - fs::create_dir(&self.pgdata)?; - fs::set_permissions(&self.pgdata, fs::Permissions::from_mode(0o700))?; + let pgdata = &self.params.pgdata; + let _ok = fs::remove_dir_all(pgdata); + fs::create_dir(pgdata)?; + fs::set_permissions(pgdata, fs::Permissions::from_mode(0o700))?; Ok(()) } @@ -423,7 +896,7 @@ impl ComputeNode { // sends an Error after finishing the tarball, we will not notice it. let mut ar = tar::Archive::new(flate2::read::GzDecoder::new(&mut bufreader)); ar.set_ignore_zeros(true); - ar.unpack(&self.pgdata)?; + ar.unpack(&self.params.pgdata)?; // Report metrics let mut state = self.state.lock().unwrap(); @@ -546,6 +1019,7 @@ impl ComputeNode { // Fast path for sync_safekeepers. If they're already synced we get the lsn // in one roundtrip. If not, we should do a full sync_safekeepers. 
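Back to the vm-monitor handling above: `start_vm_monitor` spawns the monitor task together with a `CancellationToken`, and shutdown later calls `token.cancel()` followed by `handle.abort()`. A self-contained sketch of that pattern, with made-up worker logic:

```rust
use std::time::Duration;

use tokio_util::sync::CancellationToken;

/// A worker that exits cooperatively once its token is cancelled.
async fn run_worker(token: CancellationToken) {
    loop {
        tokio::select! {
            _ = token.cancelled() => break, // cooperative shutdown path
            _ = tokio::time::sleep(Duration::from_secs(1)) => {
                // periodic work would go here
            }
        }
    }
}

#[tokio::main]
async fn main() {
    let token = CancellationToken::new();
    let handle = tokio::spawn(run_worker(token.clone()));

    // ... later, during shutdown:
    token.cancel(); // stop everything that polls the token (including child tasks)
    handle.abort(); // backstop: make sure the spawned task itself is gone
    let _ = handle.await;
}
```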
+ #[instrument(skip_all)] pub fn check_safekeepers_synced(&self, compute_state: &ComputeState) -> Result> { let start_time = Utc::now(); @@ -567,9 +1041,9 @@ impl ComputeNode { pub fn sync_safekeepers(&self, storage_auth_token: Option) -> Result { let start_time = Utc::now(); - let mut sync_handle = maybe_cgexec(&self.pgbin) + let mut sync_handle = maybe_cgexec(&self.params.pgbin) .args(["--sync-safekeepers"]) - .env("PGDATA", &self.pgdata) // we cannot use -D in this mode + .env("PGDATA", &self.params.pgdata) // we cannot use -D in this mode .envs(if let Some(storage_auth_token) = &storage_auth_token { vec![("NEON_AUTH_TOKEN", storage_auth_token)] } else { @@ -626,14 +1100,14 @@ impl ComputeNode { pub fn prepare_pgdata(&self, compute_state: &ComputeState) -> Result<()> { let pspec = compute_state.pspec.as_ref().expect("spec must be set"); let spec = &pspec.spec; - let pgdata_path = Path::new(&self.pgdata); + let pgdata_path = Path::new(&self.params.pgdata); // Remove/create an empty pgdata directory and put configuration there. self.create_pgdata()?; config::write_postgres_conf( &pgdata_path.join("postgresql.conf"), &pspec.spec, - self.internal_http_port, + self.params.internal_http_port, )?; // Syncing safekeepers is only safe with primary nodes: if a primary @@ -733,12 +1207,15 @@ impl ComputeNode { info!("prewarming"); // Create pgdata - let pgdata = &format!("{}.warmup", self.pgdata); + let pgdata = &format!("{}.warmup", self.params.pgdata); create_pgdata(pgdata)?; // Run initdb to completion info!("running initdb"); - let initdb_bin = Path::new(&self.pgbin).parent().unwrap().join("initdb"); + let initdb_bin = Path::new(&self.params.pgbin) + .parent() + .unwrap() + .join("initdb"); Command::new(initdb_bin) .args(["--pgdata", pgdata]) .output() @@ -754,7 +1231,7 @@ impl ComputeNode { // Start postgres info!("starting postgres"); - let mut pg = maybe_cgexec(&self.pgbin) + let mut pg = maybe_cgexec(&self.params.pgbin) .args(["-D", pgdata]) .spawn() .expect("cannot start postgres process"); @@ -776,19 +1253,17 @@ impl ComputeNode { Ok(()) } - /// Start Postgres as a child process and manage DBs/roles. - /// After that this will hang waiting on the postmaster process to exit. + /// Start Postgres as a child process and wait for it to start accepting + /// connections. + /// /// Returns a handle to the child process and a handle to the logs thread. #[instrument(skip_all)] - pub fn start_postgres( - &self, - storage_auth_token: Option, - ) -> Result<(std::process::Child, tokio::task::JoinHandle>)> { - let pgdata_path = Path::new(&self.pgdata); + pub fn start_postgres(&self, storage_auth_token: Option) -> Result { + let pgdata_path = Path::new(&self.params.pgdata); // Run postgres as a child process. - let mut pg = maybe_cgexec(&self.pgbin) - .args(["-D", &self.pgdata]) + let mut pg = maybe_cgexec(&self.params.pgbin) + .args(["-D", &self.params.pgdata]) .envs(if let Some(storage_auth_token) = &storage_auth_token { vec![("NEON_AUTH_TOKEN", storage_auth_token)] } else { @@ -805,7 +1280,29 @@ impl ComputeNode { wait_for_postgres(&mut pg, pgdata_path)?; - Ok((pg, logs_handle)) + Ok(PostgresHandle { + postgres: pg, + log_collector: logs_handle, + }) + } + + /// Wait for the child Postgres process forever. In this state Ctrl+C will + /// propagate to Postgres and it will be shut down as well. 
+ fn wait_postgres(&self, mut pg_handle: PostgresHandle) -> std::process::ExitStatus { + info!(postmaster_pid = %pg_handle.postgres.id(), "Waiting for Postgres to exit"); + + let ecode = pg_handle + .postgres + .wait() + .expect("failed to start waiting on Postgres process"); + PG_PID.store(0, Ordering::SeqCst); + + // Process has exited. Wait for the log collecting task to finish. + let _ = tokio::runtime::Handle::current() + .block_on(pg_handle.log_collector) + .map_err(|e| tracing::error!("log task panicked: {:?}", e)); + + ecode } /// Do post configuration of the already started Postgres. This function spawns a background task to @@ -915,388 +1412,6 @@ impl ComputeNode { Ok(client) } - /// Apply the spec to the running PostgreSQL instance. - /// The caller can decide to run with multiple clients in parallel, or - /// single mode. Either way, the commands executed will be the same, and - /// only commands run in different databases are parallelized. - #[instrument(skip_all)] - pub fn apply_spec_sql( - &self, - spec: Arc, - conf: Arc, - concurrency: usize, - ) -> Result<()> { - info!("Applying config with max {} concurrency", concurrency); - debug!("Config: {:?}", spec); - - let rt = tokio::runtime::Handle::current(); - rt.block_on(async { - // Proceed with post-startup configuration. Note, that order of operations is important. - let client = Self::get_maintenance_client(&conf).await?; - let spec = spec.clone(); - - let databases = get_existing_dbs_async(&client).await?; - let roles = get_existing_roles_async(&client) - .await? - .into_iter() - .map(|role| (role.name.clone(), role)) - .collect::>(); - - // Check if we need to drop subscriptions before starting the endpoint. - // - // It is important to do this operation exactly once when endpoint starts on a new branch. - // Otherwise, we may drop not inherited, but newly created subscriptions. - // - // We cannot rely only on spec.drop_subscriptions_before_start flag, - // because if for some reason compute restarts inside VM, - // it will start again with the same spec and flag value. - // - // To handle this, we save the fact of the operation in the database - // in the neon.drop_subscriptions_done table. - // If the table does not exist, we assume that the operation was never performed, so we must do it. - // If table exists, we check if the operation was performed on the current timelilne. 
- // - let mut drop_subscriptions_done = false; - - if spec.drop_subscriptions_before_start { - let timeline_id = self.get_timeline_id().context("timeline_id must be set")?; - let query = format!("select 1 from neon.drop_subscriptions_done where timeline_id = '{}'", timeline_id); - - info!("Checking if drop subscription operation was already performed for timeline_id: {}", timeline_id); - - drop_subscriptions_done = match - client.simple_query(&query).await { - Ok(result) => { - matches!(&result[0], postgres::SimpleQueryMessage::Row(_)) - }, - Err(e) => - { - match e.code() { - Some(&SqlState::UNDEFINED_TABLE) => false, - _ => { - // We don't expect any other error here, except for the schema/table not existing - error!("Error checking if drop subscription operation was already performed: {}", e); - return Err(e.into()); - } - } - } - } - }; - - - let jwks_roles = Arc::new( - spec.as_ref() - .local_proxy_config - .iter() - .flat_map(|it| &it.jwks) - .flatten() - .flat_map(|setting| &setting.role_names) - .cloned() - .collect::>(), - ); - - let ctx = Arc::new(tokio::sync::RwLock::new(MutableApplyContext { - roles, - dbs: databases, - })); - - // Apply special pre drop database phase. - // NOTE: we use the code of RunInEachDatabase phase for parallelism - // and connection management, but we don't really run it in *each* database, - // only in databases, we're about to drop. - info!("Applying PerDatabase (pre-dropdb) phase"); - let concurrency_token = Arc::new(tokio::sync::Semaphore::new(concurrency)); - - // Run the phase for each database that we're about to drop. - let db_processes = spec - .delta_operations - .iter() - .flatten() - .filter_map(move |op| { - if op.action.as_str() == "delete_db" { - Some(op.name.clone()) - } else { - None - } - }) - .map(|dbname| { - let spec = spec.clone(); - let ctx = ctx.clone(); - let jwks_roles = jwks_roles.clone(); - let mut conf = conf.as_ref().clone(); - let concurrency_token = concurrency_token.clone(); - // We only need dbname field for this phase, so set other fields to dummy values - let db = DB::UserDB(Database { - name: dbname.clone(), - owner: "cloud_admin".to_string(), - options: None, - restrict_conn: false, - invalid: false, - }); - - debug!("Applying per-database phases for Database {:?}", &db); - - match &db { - DB::SystemDB => {} - DB::UserDB(db) => { - conf.dbname(db.name.as_str()); - } - } - - let conf = Arc::new(conf); - let fut = Self::apply_spec_sql_db( - spec.clone(), - conf, - ctx.clone(), - jwks_roles.clone(), - concurrency_token.clone(), - db, - [DropLogicalSubscriptions].to_vec(), - ); - - Ok(spawn(fut)) - }) - .collect::>>(); - - for process in db_processes.into_iter() { - let handle = process?; - if let Err(e) = handle.await? { - // Handle the error case where the database does not exist - // We do not check whether the DB exists or not in the deletion phase, - // so we shouldn't be strict about it in pre-deletion cleanup as well. 
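The `drop_subscriptions_before_start` handling in the block above (removed from `compute.rs` in this patch) records the one-time operation in a marker table and treats a missing table as "not done yet". That check in isolation might look like the sketch below, using `tokio_postgres` directly; the table and column names come from the patch, the rest is illustrative:

```rust
use tokio_postgres::error::SqlState;
use tokio_postgres::{Client, SimpleQueryMessage};

/// Returns true if the marker row for this timeline exists. A missing
/// `neon.drop_subscriptions_done` table just means the operation never ran.
async fn drop_subscriptions_done(
    client: &Client,
    timeline_id: &str,
) -> Result<bool, tokio_postgres::Error> {
    let query = format!(
        "select 1 from neon.drop_subscriptions_done where timeline_id = '{}'",
        timeline_id
    );
    match client.simple_query(&query).await {
        Ok(messages) => Ok(messages
            .iter()
            .any(|m| matches!(m, SimpleQueryMessage::Row(_)))),
        // The schema/table not existing is expected on first use.
        Err(e) if e.code() == Some(&SqlState::UNDEFINED_TABLE) => Ok(false),
        Err(e) => Err(e),
    }
}
```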
- if e.to_string().contains("does not exist") { - warn!("Error dropping subscription: {}", e); - } else { - return Err(e); - } - }; - } - - for phase in [ - CreateSuperUser, - DropInvalidDatabases, - RenameRoles, - CreateAndAlterRoles, - RenameAndDeleteDatabases, - CreateAndAlterDatabases, - CreateSchemaNeon, - ] { - info!("Applying phase {:?}", &phase); - apply_operations( - spec.clone(), - ctx.clone(), - jwks_roles.clone(), - phase, - || async { Ok(&client) }, - ) - .await?; - } - - info!("Applying RunInEachDatabase2 phase"); - let concurrency_token = Arc::new(tokio::sync::Semaphore::new(concurrency)); - - let db_processes = spec - .cluster - .databases - .iter() - .map(|db| DB::new(db.clone())) - // include - .chain(once(DB::SystemDB)) - .map(|db| { - let spec = spec.clone(); - let ctx = ctx.clone(); - let jwks_roles = jwks_roles.clone(); - let mut conf = conf.as_ref().clone(); - let concurrency_token = concurrency_token.clone(); - let db = db.clone(); - - debug!("Applying per-database phases for Database {:?}", &db); - - match &db { - DB::SystemDB => {} - DB::UserDB(db) => { - conf.dbname(db.name.as_str()); - } - } - - let conf = Arc::new(conf); - let mut phases = vec![ - DeleteDBRoleReferences, - ChangeSchemaPerms, - HandleAnonExtension, - ]; - - if spec.drop_subscriptions_before_start && !drop_subscriptions_done { - info!("Adding DropLogicalSubscriptions phase because drop_subscriptions_before_start is set"); - phases.push(DropLogicalSubscriptions); - } - - let fut = Self::apply_spec_sql_db( - spec.clone(), - conf, - ctx.clone(), - jwks_roles.clone(), - concurrency_token.clone(), - db, - phases, - ); - - Ok(spawn(fut)) - }) - .collect::>>(); - - for process in db_processes.into_iter() { - let handle = process?; - handle.await??; - } - - let mut phases = vec![ - HandleOtherExtensions, - HandleNeonExtension, // This step depends on CreateSchemaNeon - CreateAvailabilityCheck, - DropRoles, - ]; - - // This step depends on CreateSchemaNeon - if spec.drop_subscriptions_before_start && !drop_subscriptions_done { - info!("Adding FinalizeDropLogicalSubscriptions phase because drop_subscriptions_before_start is set"); - phases.push(FinalizeDropLogicalSubscriptions); - } - - for phase in phases { - debug!("Applying phase {:?}", &phase); - apply_operations( - spec.clone(), - ctx.clone(), - jwks_roles.clone(), - phase, - || async { Ok(&client) }, - ) - .await?; - } - - Ok::<(), anyhow::Error>(()) - })?; - - Ok(()) - } - - /// Apply SQL migrations of the RunInEachDatabase phase. - /// - /// May opt to not connect to databases that don't have any scheduled - /// operations. The function is concurrency-controlled with the provided - /// semaphore. The caller has to make sure the semaphore isn't exhausted. - async fn apply_spec_sql_db( - spec: Arc, - conf: Arc, - ctx: Arc>, - jwks_roles: Arc>, - concurrency_token: Arc, - db: DB, - subphases: Vec, - ) -> Result<()> { - let _permit = concurrency_token.acquire().await?; - - let mut client_conn = None; - - for subphase in subphases { - apply_operations( - spec.clone(), - ctx.clone(), - jwks_roles.clone(), - RunInEachDatabase { - db: db.clone(), - subphase, - }, - // Only connect if apply_operation actually wants a connection. - // It's quite possible this database doesn't need any queries, - // so by not connecting we save time and effort connecting to - // that database. 
- || async { - if client_conn.is_none() { - let db_client = Self::get_maintenance_client(&conf).await?; - client_conn.replace(db_client); - } - let client = client_conn.as_ref().unwrap(); - Ok(client) - }, - ) - .await?; - } - - drop(client_conn); - - Ok::<(), anyhow::Error>(()) - } - - /// Choose how many concurrent connections to use for applying the spec changes. - pub fn max_service_connections( - &self, - compute_state: &ComputeState, - spec: &ComputeSpec, - ) -> usize { - // If the cluster is in Init state we don't have to deal with user connections, - // and can thus use all `max_connections` connection slots. However, that's generally not - // very efficient, so we generally still limit it to a smaller number. - if compute_state.status == ComputeStatus::Init { - // If the settings contain 'max_connections', use that as template - if let Some(config) = spec.cluster.settings.find("max_connections") { - config.parse::().ok() - } else { - // Otherwise, try to find the setting in the postgresql_conf string - spec.cluster - .postgresql_conf - .iter() - .flat_map(|conf| conf.split("\n")) - .filter_map(|line| { - if !line.contains("max_connections") { - return None; - } - - let (key, value) = line.split_once("=")?; - let key = key - .trim_start_matches(char::is_whitespace) - .trim_end_matches(char::is_whitespace); - - let value = value - .trim_start_matches(char::is_whitespace) - .trim_end_matches(char::is_whitespace); - - if key != "max_connections" { - return None; - } - - value.parse::().ok() - }) - .next() - } - // If max_connections is present, use at most 1/3rd of that. - // When max_connections is lower than 30, try to use at least 10 connections, but - // never more than max_connections. - .map(|limit| match limit { - 0..10 => limit, - 10..30 => 10, - 30.. => limit / 3, - }) - // If we didn't find max_connections, default to 10 concurrent connections. - .unwrap_or(10) - } else { - // state == Running - // Because the cluster is already in the Running state, we should assume users are - // already connected to the cluster, and high concurrency could negatively - // impact user connectivity. Therefore, we can limit concurrency to the number of - // reserved superuser connections, which users wouldn't be able to use anyway. - spec.cluster - .settings - .find("superuser_reserved_connections") - .iter() - .filter_map(|val| val.parse::().ok()) - .map(|val| if val > 1 { val - 1 } else { 1 }) - .last() - .unwrap_or(3) - } - } - /// Do initial configuration of the already started Postgres. #[instrument(skip_all)] pub fn apply_config(&self, compute_state: &ComputeState) -> Result<()> { @@ -1317,7 +1432,7 @@ impl ComputeNode { // Merge-apply spec & changes to PostgreSQL state. self.apply_spec_sql(spec.clone(), conf.clone(), max_concurrent_connections)?; - if let Some(ref local_proxy) = &spec.clone().local_proxy_config { + if let Some(local_proxy) = &spec.clone().local_proxy_config { info!("configuring local_proxy"); local_proxy::configure(local_proxy).context("apply_config local_proxy")?; } @@ -1354,9 +1469,12 @@ impl ComputeNode { // `pg_ctl` for start / stop. 
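One more note on the `apply_spec_sql` machinery above: per-database work is spawned eagerly but gated by a shared `tokio::sync::Semaphore`, sized via `max_service_connections` (roughly a third of `max_connections`, with floors for small values, or `superuser_reserved_connections` when the compute is already running). The shape of that pattern, reduced to a sketch with placeholder databases and limit:

```rust
use std::sync::Arc;

use tokio::sync::Semaphore;

/// Process one database, but only while holding a permit from the shared limiter.
async fn process_db(dbname: String, limit: Arc<Semaphore>) -> anyhow::Result<()> {
    let _permit = limit.acquire().await?; // released when dropped
    // ... connect to `dbname` and run its per-database phases here ...
    println!("processed {dbname}");
    Ok(())
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    // Placeholder limit; the real code derives this from max_connections.
    let limit = Arc::new(Semaphore::new(4));

    let tasks: Vec<_> = ["db_a", "db_b", "db_c"]
        .into_iter()
        .map(|db| tokio::spawn(process_db(db.to_string(), limit.clone())))
        .collect();

    for task in tasks {
        task.await??; // surface both panics/cancellations and per-db errors
    }
    Ok(())
}
```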
#[instrument(skip_all)] fn pg_reload_conf(&self) -> Result<()> { - let pgctl_bin = Path::new(&self.pgbin).parent().unwrap().join("pg_ctl"); + let pgctl_bin = Path::new(&self.params.pgbin) + .parent() + .unwrap() + .join("pg_ctl"); Command::new(pgctl_bin) - .args(["reload", "-D", &self.pgdata]) + .args(["reload", "-D", &self.params.pgdata]) .output() .expect("cannot run pg_ctl process"); Ok(()) @@ -1396,9 +1514,9 @@ impl ComputeNode { } // Write new config - let pgdata_path = Path::new(&self.pgdata); + let pgdata_path = Path::new(&self.params.pgdata); let postgresql_conf_path = pgdata_path.join("postgresql.conf"); - config::write_postgres_conf(&postgresql_conf_path, &spec, self.internal_http_port)?; + config::write_postgres_conf(&postgresql_conf_path, &spec, self.params.internal_http_port)?; if !spec.skip_pg_catalog_updates { let max_concurrent_connections = spec.reconfigure_concurrency; @@ -1409,7 +1527,8 @@ impl ComputeNode { self.pg_reload_conf()?; if spec.mode == ComputeMode::Primary { - let mut conf = tokio_postgres::Config::from_str(self.connstr.as_str()).unwrap(); + let mut conf = + tokio_postgres::Config::from_str(self.params.connstr.as_str()).unwrap(); conf.application_name("apply_config"); let conf = Arc::new(conf); @@ -1435,164 +1554,37 @@ impl ComputeNode { } #[instrument(skip_all)] - pub fn start_compute( - &self, - ) -> Result<(std::process::Child, tokio::task::JoinHandle>)> { - let compute_state = self.state.lock().unwrap().clone(); + pub fn configure_as_primary(&self, compute_state: &ComputeState) -> Result<()> { let pspec = compute_state.pspec.as_ref().expect("spec must be set"); - info!( - "starting compute for project {}, operation {}, tenant {}, timeline {}", - pspec.spec.cluster.cluster_id.as_deref().unwrap_or("None"), - pspec.spec.operation_uuid.as_deref().unwrap_or("None"), - pspec.tenant_id, - pspec.timeline_id, - ); - // tune pgbouncer - if let Some(pgbouncer_settings) = &pspec.spec.pgbouncer_settings { - info!("tuning pgbouncer"); - - // Spawn a background task to do the tuning, - // so that we don't block the main thread that starts Postgres. - let pgbouncer_settings = pgbouncer_settings.clone(); - let _handle = tokio::spawn(async move { - let res = tune_pgbouncer(pgbouncer_settings).await; - if let Err(err) = res { - error!("error while tuning pgbouncer: {err:?}"); - } - }); - } - - if let Some(local_proxy) = &pspec.spec.local_proxy_config { - info!("configuring local_proxy"); - - // Spawn a background task to do the configuration, - // so that we don't block the main thread that starts Postgres. 
- let local_proxy = local_proxy.clone(); - let _handle = tokio::spawn(async move { - if let Err(err) = local_proxy::configure(&local_proxy) { - error!("error while configuring local_proxy: {err:?}"); - } - }); - } - - info!( - "start_compute spec.remote_extensions {:?}", - pspec.spec.remote_extensions - ); - - // This part is sync, because we need to download - // remote shared_preload_libraries before postgres start (if any) - if let Some(remote_extensions) = &pspec.spec.remote_extensions { - // First, create control files for all availale extensions - extension_server::create_control_files(remote_extensions, &self.pgbin); - - let library_load_start_time = Utc::now(); - let rt = tokio::runtime::Handle::current(); - let remote_ext_metrics = rt.block_on(self.prepare_preload_libraries(&pspec.spec))?; - - let library_load_time = Utc::now() - .signed_duration_since(library_load_start_time) - .to_std() - .unwrap() - .as_millis() as u64; - let mut state = self.state.lock().unwrap(); - state.metrics.load_ext_ms = library_load_time; - state.metrics.num_ext_downloaded = remote_ext_metrics.num_ext_downloaded; - state.metrics.largest_ext_size = remote_ext_metrics.largest_ext_size; - state.metrics.total_ext_download_size = remote_ext_metrics.total_ext_download_size; - info!( - "Loading shared_preload_libraries took {:?}ms", - library_load_time - ); - info!("{:?}", remote_ext_metrics); - } - - self.prepare_pgdata(&compute_state)?; - - let start_time = Utc::now(); - let pg_process = self.start_postgres(pspec.storage_auth_token.clone())?; - - let config_time = Utc::now(); - if pspec.spec.mode == ComputeMode::Primary { - if !pspec.spec.skip_pg_catalog_updates { - let pgdata_path = Path::new(&self.pgdata); - // temporarily reset max_cluster_size in config - // to avoid the possibility of hitting the limit, while we are applying config: - // creating new extensions, roles, etc... - config::with_compute_ctl_tmp_override( - pgdata_path, - "neon.max_cluster_size=-1", - || { - self.pg_reload_conf()?; - - self.apply_config(&compute_state)?; - - Ok(()) - }, - )?; - - let postgresql_conf_path = pgdata_path.join("postgresql.conf"); - if config::line_in_file( - &postgresql_conf_path, - "neon.disable_logical_replication_subscribers=false", - )? { - info!("updated postgresql.conf to set neon.disable_logical_replication_subscribers=false"); - } + assert!(pspec.spec.mode == ComputeMode::Primary); + if !pspec.spec.skip_pg_catalog_updates { + let pgdata_path = Path::new(&self.params.pgdata); + // temporarily reset max_cluster_size in config + // to avoid the possibility of hitting the limit, while we are applying config: + // creating new extensions, roles, etc... + config::with_compute_ctl_tmp_override(pgdata_path, "neon.max_cluster_size=-1", || { self.pg_reload_conf()?; + + self.apply_config(compute_state)?; + + Ok(()) + })?; + + let postgresql_conf_path = pgdata_path.join("postgresql.conf"); + if config::line_in_file( + &postgresql_conf_path, + "neon.disable_logical_replication_subscribers=false", + )? 
{ + info!( + "updated postgresql.conf to set neon.disable_logical_replication_subscribers=false" + ); } - self.post_apply_config()?; - - let conf = self.get_conn_conf(None); - tokio::task::spawn_blocking(|| { - let res = get_installed_extensions(conf); - match res { - Ok(extensions) => { - info!( - "[NEON_EXT_STAT] {}", - serde_json::to_string(&extensions) - .expect("failed to serialize extensions list") - ); - } - Err(err) => error!("could not get installed extensions: {err:?}"), - } - }); + self.pg_reload_conf()?; } + self.post_apply_config()?; - let startup_end_time = Utc::now(); - { - let mut state = self.state.lock().unwrap(); - state.metrics.start_postgres_ms = config_time - .signed_duration_since(start_time) - .to_std() - .unwrap() - .as_millis() as u64; - state.metrics.config_ms = startup_end_time - .signed_duration_since(config_time) - .to_std() - .unwrap() - .as_millis() as u64; - state.metrics.total_startup_ms = startup_end_time - .signed_duration_since(compute_state.start_time) - .to_std() - .unwrap() - .as_millis() as u64; - } - self.set_status(ComputeStatus::Running); - - info!( - "finished configuration of compute for project {}", - pspec.spec.cluster.cluster_id.as_deref().unwrap_or("None") - ); - - // Log metrics so that we can search for slow operations in logs - let metrics = { - let state = self.state.lock().unwrap(); - state.metrics.clone() - }; - info!(?metrics, "compute start finished"); - - Ok(pg_process) + Ok(()) } /// Update the `last_active` in the shared state, but ensure that it's a more recent one. @@ -1621,7 +1613,7 @@ impl ComputeNode { pub fn check_for_core_dumps(&self) -> Result<()> { let core_dump_dir = match std::env::consts::OS { "macos" => Path::new("/cores/"), - _ => Path::new(&self.pgdata), + _ => Path::new(&self.params.pgdata), }; // Collect core dump paths if any @@ -1651,7 +1643,7 @@ impl ComputeNode { // Try first with gdb let backtrace = Command::new("gdb") - .args(["--batch", "-q", "-ex", "bt", &self.pgbin]) + .args(["--batch", "-q", "-ex", "bt", &self.params.pgbin]) .arg(&core_path) .output(); @@ -1728,7 +1720,8 @@ LIMIT 100", ext_path: RemotePath, ) -> Result { let ext_remote_storage = - self.ext_remote_storage + self.params + .ext_remote_storage .as_ref() .ok_or(DownloadError::BadInput(anyhow::anyhow!( "Remote extensions storage is not configured", @@ -1764,7 +1757,9 @@ LIMIT 100", info!("extension already downloaded, skipping re-download"); return Ok(0); } else if start_time_delta < HANG_TIMEOUT && !first_try { - info!("download {ext_archive_name} already started by another process, hanging untill completion or timeout"); + info!( + "download {ext_archive_name} already started by another process, hanging untill completion or timeout" + ); let mut interval = tokio::time::interval(tokio::time::Duration::from_millis(500)); loop { info!("waiting for download"); @@ -1789,7 +1784,7 @@ LIMIT 100", &real_ext_name, &ext_path, ext_remote_storage, - &self.pgbin, + &self.params.pgbin, ) .await .map_err(DownloadError::Other); @@ -1897,7 +1892,7 @@ LIMIT 100", &self, spec: &ComputeSpec, ) -> Result { - if self.ext_remote_storage.is_none() { + if self.params.ext_remote_storage.is_none() { return Ok(RemoteExtensionMetrics { num_ext_downloaded: 0, largest_ext_size: 0, @@ -1948,8 +1943,12 @@ LIMIT 100", let mut download_tasks = Vec::new(); for library in &libs_vec { - let (ext_name, ext_path) = - remote_extensions.get_ext(library, true, &self.build_tag, &self.pgversion)?; + let (ext_name, ext_path) = remote_extensions.get_ext( + library, + true, + 
&self.params.build_tag, + &self.params.pgversion, + )?; download_tasks.push(self.download_extension(ext_name, ext_path)); } let results = join_all(download_tasks).await; @@ -2026,3 +2025,26 @@ pub fn forward_termination_signal() { kill(pg_pid, Signal::SIGINT).ok(); } } + +// helper trait to call JoinSet::spawn_blocking(f), but propagates the current +// tracing span to the thread. +trait JoinSetExt { + fn spawn_blocking_child(&mut self, f: F) -> tokio::task::AbortHandle + where + F: FnOnce() -> T + Send + 'static, + T: Send; +} + +impl JoinSetExt for tokio::task::JoinSet { + fn spawn_blocking_child(&mut self, f: F) -> tokio::task::AbortHandle + where + F: FnOnce() -> T + Send + 'static, + T: Send, + { + let sp = tracing::Span::current(); + self.spawn_blocking(move || { + let _e = sp.enter(); + f() + }) + } +} diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index e1bdfffa54..0760568ff8 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -1,13 +1,16 @@ +use anyhow::Result; +use std::fmt::Write as FmtWrite; use std::fs::{File, OpenOptions}; use std::io; +use std::io::Write; use std::io::prelude::*; use std::path::Path; -use anyhow::Result; +use compute_api::spec::{ComputeAudit, ComputeMode, ComputeSpec, GenericOption}; -use crate::pg_helpers::escape_conf_value; -use crate::pg_helpers::{GenericOptionExt, PgOptionsSerialize}; -use compute_api::spec::{ComputeMode, ComputeSpec, GenericOption}; +use crate::pg_helpers::{ + GenericOptionExt, GenericOptionsSearch, PgOptionsSerialize, escape_conf_value, +}; /// Check that `line` is inside a text file and put it there if it is not. /// Create file if it doesn't exist. @@ -56,10 +59,20 @@ pub fn write_postgres_conf( writeln!(file, "neon.stripe_size={stripe_size}")?; } if !spec.safekeeper_connstrings.is_empty() { + let mut neon_safekeepers_value = String::new(); + tracing::info!( + "safekeepers_connstrings is not zero, gen: {:?}", + spec.safekeepers_generation + ); + // If generation is given, prepend sk list with g#number: + if let Some(generation) = spec.safekeepers_generation { + write!(neon_safekeepers_value, "g#{}:", generation)?; + } + neon_safekeepers_value.push_str(&spec.safekeeper_connstrings.join(",")); writeln!( file, "neon.safekeepers={}", - escape_conf_value(&spec.safekeeper_connstrings.join(",")) + escape_conf_value(&neon_safekeepers_value) )?; } if let Some(s) = &spec.tenant_id { @@ -127,6 +140,54 @@ pub fn write_postgres_conf( writeln!(file, "# Managed by compute_ctl: end")?; } + // If audit logging is enabled, configure pgaudit. + // + // Note, that this is called after the settings from spec are written. + // This way we always override the settings from the spec + // and don't allow the user or the control plane admin to change them. + if let ComputeAudit::Hipaa = spec.audit_log_level { + writeln!(file, "# Managed by compute_ctl audit settings: begin")?; + // This log level is very verbose + // but this is necessary for HIPAA compliance. + writeln!(file, "pgaudit.log='all'")?; + writeln!(file, "pgaudit.log_parameter=on")?; + // Disable logging of catalog queries + // The catalog doesn't contain sensitive data, so we don't need to audit it. + writeln!(file, "pgaudit.log_catalog=off")?; + // Set log rotation to 5 minutes + // TODO: tune this after performance testing + writeln!(file, "pgaudit.log_rotation_age=5")?; + + // Add audit shared_preload_libraries, if they are not present. 
+ // + // The caller who sets the flag is responsible for ensuring that the necessary + // shared_preload_libraries are present in the compute image, + // otherwise the compute start will fail. + if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") { + let mut extra_shared_preload_libraries = String::new(); + if !libs.contains("pgaudit") { + extra_shared_preload_libraries.push_str(",pgaudit"); + } + if !libs.contains("pgauditlogtofile") { + extra_shared_preload_libraries.push_str(",pgauditlogtofile"); + } + writeln!( + file, + "shared_preload_libraries='{}{}'", + libs, extra_shared_preload_libraries + )?; + } else { + // Typically, this should be unreachable, + // because we always set at least some shared_preload_libraries in the spec, + // but let's handle it explicitly anyway. + writeln!( + file, + "shared_preload_libraries='neon,pgaudit,pgauditlogtofile'" + )?; + } + writeln!(file, "# Managed by compute_ctl audit settings: end")?; + } + writeln!(file, "neon.extension_server_port={}", extension_server_port)?; if spec.drop_subscriptions_before_start { diff --git a/compute_tools/src/config_template/compute_audit_rsyslog_template.conf b/compute_tools/src/config_template/compute_audit_rsyslog_template.conf new file mode 100644 index 0000000000..bef3c36446 --- /dev/null +++ b/compute_tools/src/config_template/compute_audit_rsyslog_template.conf @@ -0,0 +1,10 @@ +# Load imfile module to read log files +module(load="imfile") + +# Input configuration for log files in the specified directory +# Replace {log_directory} with the directory containing the log files +input(type="imfile" File="{log_directory}/*.log" Tag="{tag}" Severity="info" Facility="local0") +global(workDirectory="/var/log") + +# Forward logs to remote syslog server +*.* @@{remote_endpoint} \ No newline at end of file diff --git a/compute_tools/src/configurator.rs b/compute_tools/src/configurator.rs index d88f26ca20..d97bd37285 100644 --- a/compute_tools/src/configurator.rs +++ b/compute_tools/src/configurator.rs @@ -1,9 +1,8 @@ use std::sync::Arc; use std::thread; -use tracing::{error, info, instrument}; - use compute_api::responses::ComputeStatus; +use tracing::{error, info, instrument}; use crate::compute::ComputeNode; diff --git a/compute_tools/src/disk_quota.rs b/compute_tools/src/disk_quota.rs index e838c5b9fd..1353ab938d 100644 --- a/compute_tools/src/disk_quota.rs +++ b/compute_tools/src/disk_quota.rs @@ -1,9 +1,11 @@ use anyhow::Context; +use tracing::instrument; pub const DISK_QUOTA_BIN: &str = "/neonvm/bin/set-disk-quota"; /// If size_bytes is 0, it disables the quota. Otherwise, it sets filesystem quota to size_bytes. /// `fs_mountpoint` should point to the mountpoint of the filesystem where the quota should be set.
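Stepping back to the audit branch of `write_postgres_conf` above: the patch keeps whatever `shared_preload_libraries` the spec already carries and appends the pgaudit libraries only when plain substring checks don't find them, falling back to a fixed list if the setting is absent. The same merge as a standalone sketch (function name and the assertion are illustrative):

```rust
/// Build the shared_preload_libraries line the way the audit branch does:
/// append ",pgaudit" / ",pgauditlogtofile" only if the substring checks miss.
/// (Note: `contains` is a substring test, so an existing "pgauditlogtofile"
/// entry also satisfies the "pgaudit" check.)
fn audit_preload_line(existing: Option<&str>) -> String {
    match existing {
        Some(libs) => {
            let mut extra = String::new();
            if !libs.contains("pgaudit") {
                extra.push_str(",pgaudit");
            }
            if !libs.contains("pgauditlogtofile") {
                extra.push_str(",pgauditlogtofile");
            }
            format!("shared_preload_libraries='{libs}{extra}'")
        }
        // Fallback for a spec that carries no setting at all.
        None => "shared_preload_libraries='neon,pgaudit,pgauditlogtofile'".to_string(),
    }
}

fn main() {
    assert_eq!(
        audit_preload_line(Some("neon,pg_stat_statements")),
        "shared_preload_libraries='neon,pg_stat_statements,pgaudit,pgauditlogtofile'"
    );
}
```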
+#[instrument] pub fn set_disk_quota(size_bytes: u64, fs_mountpoint: &str) -> anyhow::Result<()> { let size_kb = size_bytes / 1024; // run `/neonvm/bin/set-disk-quota {size_kb} {mountpoint}` diff --git a/compute_tools/src/extension_server.rs b/compute_tools/src/extension_server.rs index 00f46386e7..ee889e0c40 100644 --- a/compute_tools/src/extension_server.rs +++ b/compute_tools/src/extension_server.rs @@ -71,15 +71,15 @@ More specifically, here is an example ext_index.json } } */ -use anyhow::Result; -use anyhow::{bail, Context}; +use std::path::Path; +use std::str; + +use anyhow::{Context, Result, bail}; use bytes::Bytes; use compute_api::spec::RemoteExtSpec; use regex::Regex; use remote_storage::*; use reqwest::StatusCode; -use std::path::Path; -use std::str; use tar::Archive; use tracing::info; use tracing::log::warn; @@ -202,8 +202,24 @@ pub async fn download_extension( // move contents of the libdir / sharedir in unzipped archive to the correct local paths for paths in [sharedir_paths, libdir_paths] { let (zip_dir, real_dir) = paths; + + let dir = match std::fs::read_dir(&zip_dir) { + Ok(dir) => dir, + Err(e) => match e.kind() { + // In the event of a SQL-only extension, there would be nothing + // to move from the lib/ directory, so note that in the log and + // move on. + std::io::ErrorKind::NotFound => { + info!("nothing to move from {}", zip_dir); + continue; + } + _ => return Err(anyhow::anyhow!(e)), + }, + }; + info!("mv {zip_dir:?}/* {real_dir:?}"); - for file in std::fs::read_dir(zip_dir)? { + + for file in dir { let old_file = file?.path(); let new_file = Path::new(&real_dir).join(old_file.file_name().context("error parsing file")?); @@ -244,33 +260,40 @@ pub fn create_control_files(remote_extensions: &RemoteExtSpec, pgbin: &str) { info!("writing file {:?}{:?}", control_path, control_content); std::fs::write(control_path, control_content).unwrap(); } else { - warn!("control file {:?} exists both locally and remotely. ignoring the remote version.", control_path); + warn!( + "control file {:?} exists both locally and remotely. ignoring the remote version.", + control_path + ); } } } } -// Do request to extension storage proxy, i.e. +// Do request to extension storage proxy, e.g., // curl http://pg-ext-s3-gateway/latest/v15/extensions/anon.tar.zst -// using HHTP GET -// and return the response body as bytes -// +// using HTTP GET and return the response body as bytes. 
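Relating to the `JoinSetExt::spawn_blocking_child` helper added to compute.rs earlier in this patch: plain `JoinSet::spawn_blocking` runs the closure on a blocking thread outside the caller's tracing span, so log lines emitted there lose the request or startup context. A rough usage sketch follows; the trait is repeated here so the snippet stands alone, the generic bounds are my reconstruction, and the "work" is a stand-in:

use tokio::task::JoinSet;
use tracing::{info, info_span, Instrument};

// Same shape as the helper in compute.rs: propagate the caller's span into
// the blocking thread so tracing output stays attached to the right request.
trait JoinSetExt<T> {
    fn spawn_blocking_child<F>(&mut self, f: F) -> tokio::task::AbortHandle
    where
        F: FnOnce() -> T + Send + 'static,
        T: Send;
}

impl<T: 'static> JoinSetExt<T> for JoinSet<T> {
    fn spawn_blocking_child<F>(&mut self, f: F) -> tokio::task::AbortHandle
    where
        F: FnOnce() -> T + Send + 'static,
        T: Send,
    {
        let sp = tracing::Span::current();
        self.spawn_blocking(move || {
            let _e = sp.enter();
            f()
        })
    }
}

#[tokio::main]
async fn main() {
    tracing_subscriber::fmt::init();

    async {
        let mut tasks: JoinSet<u64> = JoinSet::new();
        // Both blocking closures log inside the caller's "sync_sql" span.
        for i in 0..2u64 {
            tasks.spawn_blocking_child(move || {
                info!("running blocking step {i}");
                i
            });
        }
        while let Some(res) = tasks.join_next().await {
            info!("step finished: {:?}", res);
        }
    }
    .instrument(info_span!("sync_sql"))
    .await;
}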
async fn download_extension_tar(ext_remote_storage: &str, ext_path: &str) -> Result { let uri = format!("{}/{}", ext_remote_storage, ext_path); + let filename = Path::new(ext_path) + .file_name() + .unwrap_or_else(|| std::ffi::OsStr::new("unknown")) + .to_str() + .unwrap_or("unknown") + .to_string(); - info!("Download extension {} from uri {}", ext_path, uri); + info!("Downloading extension file '{}' from uri {}", filename, uri); match do_extension_server_request(&uri).await { Ok(resp) => { info!("Successfully downloaded remote extension data {}", ext_path); REMOTE_EXT_REQUESTS_TOTAL - .with_label_values(&[&StatusCode::OK.to_string()]) + .with_label_values(&[&StatusCode::OK.to_string(), &filename]) .inc(); Ok(resp) } Err((msg, status)) => { REMOTE_EXT_REQUESTS_TOTAL - .with_label_values(&[&status]) + .with_label_values(&[&status, &filename]) .inc(); bail!(msg); } diff --git a/compute_tools/src/http/extract/json.rs b/compute_tools/src/http/extract/json.rs index 104cc25d5f..1d32e4ff37 100644 --- a/compute_tools/src/http/extract/json.rs +++ b/compute_tools/src/http/extract/json.rs @@ -1,6 +1,7 @@ use std::ops::{Deref, DerefMut}; -use axum::extract::{rejection::JsonRejection, FromRequest, Request}; +use axum::extract::rejection::JsonRejection; +use axum::extract::{FromRequest, Request}; use compute_api::responses::GenericAPIError; use http::StatusCode; diff --git a/compute_tools/src/http/extract/mod.rs b/compute_tools/src/http/extract/mod.rs index 1b690e444d..589681cfe2 100644 --- a/compute_tools/src/http/extract/mod.rs +++ b/compute_tools/src/http/extract/mod.rs @@ -1,7 +1,9 @@ pub(crate) mod json; pub(crate) mod path; pub(crate) mod query; +pub(crate) mod request_id; pub(crate) use json::Json; pub(crate) use path::Path; pub(crate) use query::Query; +pub(crate) use request_id::RequestId; diff --git a/compute_tools/src/http/extract/path.rs b/compute_tools/src/http/extract/path.rs index 09637a96a4..45970cff3d 100644 --- a/compute_tools/src/http/extract/path.rs +++ b/compute_tools/src/http/extract/path.rs @@ -1,8 +1,10 @@ use std::ops::{Deref, DerefMut}; -use axum::extract::{rejection::PathRejection, FromRequestParts}; +use axum::extract::FromRequestParts; +use axum::extract::rejection::PathRejection; use compute_api::responses::GenericAPIError; -use http::{request::Parts, StatusCode}; +use http::StatusCode; +use http::request::Parts; /// Custom `Path` extractor, so that we can format errors into /// `JsonResponse`. diff --git a/compute_tools/src/http/extract/query.rs b/compute_tools/src/http/extract/query.rs index 9dec3642cf..b8079ea770 100644 --- a/compute_tools/src/http/extract/query.rs +++ b/compute_tools/src/http/extract/query.rs @@ -1,8 +1,10 @@ use std::ops::{Deref, DerefMut}; -use axum::extract::{rejection::QueryRejection, FromRequestParts}; +use axum::extract::FromRequestParts; +use axum::extract::rejection::QueryRejection; use compute_api::responses::GenericAPIError; -use http::{request::Parts, StatusCode}; +use http::StatusCode; +use http::request::Parts; /// Custom `Query` extractor, so that we can format errors into /// `JsonResponse`. 
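The metric change visible above (and in metrics.rs later in this patch) gives `compute_ctl_remote_ext_requests_total` a second `filename` label, so every (http_status, extension file) pair becomes its own series. A hedged sketch of that label shape, written against the upstream `prometheus` crate directly rather than the workspace `metrics` wrapper so it stands alone; the filenames and statuses are made up:

use prometheus::register_int_counter_vec;

fn main() -> Result<(), prometheus::Error> {
    // Mirrors the label set used after this patch: http_status plus filename.
    let remote_ext_requests = register_int_counter_vec!(
        "compute_ctl_remote_ext_requests_total",
        "Requests made by compute_ctl to download extensions from the S3 proxy",
        &["http_status", "filename"]
    )?;

    // Hypothetical downloads, for illustration only: each distinct
    // (status, filename) pair becomes its own series.
    remote_ext_requests.with_label_values(&["200", "anon.tar.zst"]).inc();
    remote_ext_requests.with_label_values(&["200", "anon.tar.zst"]).inc();
    remote_ext_requests.with_label_values(&["404", "missing.tar.zst"]).inc();

    assert_eq!(remote_ext_requests.with_label_values(&["200", "anon.tar.zst"]).get(), 2);
    assert_eq!(remote_ext_requests.with_label_values(&["404", "missing.tar.zst"]).get(), 1);
    Ok(())
}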
diff --git a/compute_tools/src/http/extract/request_id.rs b/compute_tools/src/http/extract/request_id.rs new file mode 100644 index 0000000000..d911921a05 --- /dev/null +++ b/compute_tools/src/http/extract/request_id.rs @@ -0,0 +1,86 @@ +use std::{ + fmt::Display, + ops::{Deref, DerefMut}, +}; + +use axum::{extract::FromRequestParts, response::IntoResponse}; +use http::{StatusCode, request::Parts}; + +use crate::http::{JsonResponse, headers::X_REQUEST_ID}; + +/// Extract the request ID from the `X-Request-Id` header. +#[derive(Debug, Clone, Default)] +pub(crate) struct RequestId(pub String); + +#[derive(Debug)] +/// Rejection used for [`RequestId`]. +/// +/// Contains one variant for each way the [`RequestId`] extractor can +/// fail. +pub(crate) enum RequestIdRejection { + /// The request is missing the header. + MissingRequestId, + + /// The value of the header is invalid UTF-8. + InvalidUtf8, +} + +impl RequestIdRejection { + pub fn status(&self) -> StatusCode { + match self { + RequestIdRejection::MissingRequestId => StatusCode::INTERNAL_SERVER_ERROR, + RequestIdRejection::InvalidUtf8 => StatusCode::BAD_REQUEST, + } + } + + pub fn message(&self) -> String { + match self { + RequestIdRejection::MissingRequestId => "request ID is missing", + RequestIdRejection::InvalidUtf8 => "request ID is invalid UTF-8", + } + .to_string() + } +} + +impl IntoResponse for RequestIdRejection { + fn into_response(self) -> axum::response::Response { + JsonResponse::error(self.status(), self.message()) + } +} + +impl FromRequestParts for RequestId +where + S: Send + Sync, +{ + type Rejection = RequestIdRejection; + + async fn from_request_parts(parts: &mut Parts, _state: &S) -> Result { + match parts.headers.get(X_REQUEST_ID) { + Some(value) => match value.to_str() { + Ok(request_id) => Ok(Self(request_id.to_string())), + Err(_) => Err(RequestIdRejection::InvalidUtf8), + }, + None => Err(RequestIdRejection::MissingRequestId), + } + } +} + +impl Deref for RequestId { + type Target = String; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl DerefMut for RequestId { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + +impl Display for RequestId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_str(&self.0) + } +} diff --git a/compute_tools/src/http/headers.rs b/compute_tools/src/http/headers.rs new file mode 100644 index 0000000000..a11638e203 --- /dev/null +++ b/compute_tools/src/http/headers.rs @@ -0,0 +1,2 @@ +/// Constant for `X-Request-Id` header. 
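request_id.rs above defines a `RequestId` extractor that handlers can take as a plain argument; axum resolves it through `FromRequestParts` and turns a missing or non-UTF-8 `X-Request-Id` header into the JSON rejection. The following is a condensed, self-contained stand-in (simplified rejection type, illustrative route and port), not the crate-private extractor itself:

use axum::{extract::FromRequestParts, response::IntoResponse, routing::get, Router};
use http::{request::Parts, StatusCode};

// Condensed stand-in for the RequestId extractor above: pull the
// X-Request-Id header out of the request parts, or reject.
struct RequestId(String);

impl<S: Send + Sync> FromRequestParts<S> for RequestId {
    type Rejection = (StatusCode, &'static str);

    async fn from_request_parts(parts: &mut Parts, _state: &S) -> Result<Self, Self::Rejection> {
        match parts.headers.get("x-request-id").map(|v| v.to_str()) {
            Some(Ok(id)) => Ok(RequestId(id.to_string())),
            Some(Err(_)) => Err((StatusCode::BAD_REQUEST, "request ID is invalid UTF-8")),
            None => Err((StatusCode::INTERNAL_SERVER_ERROR, "request ID is missing")),
        }
    }
}

// A handler can then take the extractor as a plain argument.
async fn status(RequestId(request_id): RequestId) -> impl IntoResponse {
    format!("handled request {request_id}")
}

#[tokio::main]
async fn main() {
    let app: Router = Router::new().route("/status", get(status));
    let listener = tokio::net::TcpListener::bind("127.0.0.1:3000").await.unwrap();
    axum::serve(listener, app).await.unwrap();
}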
+pub const X_REQUEST_ID: &str = "x-request-id"; diff --git a/compute_tools/src/http/middleware/authorize.rs b/compute_tools/src/http/middleware/authorize.rs new file mode 100644 index 0000000000..798dd1179b --- /dev/null +++ b/compute_tools/src/http/middleware/authorize.rs @@ -0,0 +1,145 @@ +use std::{collections::HashSet, net::SocketAddr}; + +use anyhow::{Result, anyhow}; +use axum::{RequestExt, body::Body, extract::ConnectInfo}; +use axum_extra::{ + TypedHeader, + headers::{Authorization, authorization::Bearer}, +}; +use futures::future::BoxFuture; +use http::{Request, Response, StatusCode}; +use jsonwebtoken::{Algorithm, DecodingKey, TokenData, Validation, jwk::JwkSet}; +use serde::Deserialize; +use tower_http::auth::AsyncAuthorizeRequest; +use tracing::warn; + +use crate::http::{JsonResponse, extract::RequestId}; + +#[derive(Clone, Debug, Deserialize)] +pub(in crate::http) struct Claims { + compute_id: String, +} + +#[derive(Clone, Debug)] +pub(in crate::http) struct Authorize { + compute_id: String, + jwks: JwkSet, + validation: Validation, +} + +impl Authorize { + pub fn new(compute_id: String, jwks: JwkSet) -> Self { + let mut validation = Validation::new(Algorithm::EdDSA); + // Nothing is currently required + validation.required_spec_claims = HashSet::new(); + validation.validate_exp = true; + // Unused by the control plane + validation.validate_aud = false; + // Unused by the control plane + validation.validate_nbf = false; + + Self { + compute_id, + jwks, + validation, + } + } +} + +impl AsyncAuthorizeRequest for Authorize { + type RequestBody = Body; + type ResponseBody = Body; + type Future = BoxFuture<'static, Result, Response>>; + + fn authorize(&mut self, mut request: Request) -> Self::Future { + let compute_id = self.compute_id.clone(); + let jwks = self.jwks.clone(); + let validation = self.validation.clone(); + + Box::pin(async move { + let request_id = request.extract_parts::().await.unwrap(); + + // TODO: Remove this check after a successful rollout + if jwks.keys.is_empty() { + warn!(%request_id, "Authorization has not been configured"); + + return Ok(request); + } + + let connect_info = request + .extract_parts::>() + .await + .unwrap(); + + // In the event the request is coming from the loopback interface, + // allow all requests + if connect_info.ip().is_loopback() { + warn!(%request_id, "Bypassed authorization because request is coming from the loopback interface"); + + return Ok(request); + } + + let TypedHeader(Authorization(bearer)) = request + .extract_parts::>>() + .await + .map_err(|_| { + JsonResponse::error(StatusCode::BAD_REQUEST, "invalid authorization token") + })?; + + let data = match Self::verify(&jwks, bearer.token(), &validation) { + Ok(claims) => claims, + Err(e) => return Err(JsonResponse::error(StatusCode::UNAUTHORIZED, e)), + }; + + if data.claims.compute_id != compute_id { + return Err(JsonResponse::error( + StatusCode::UNAUTHORIZED, + "invalid claims in authorization token", + )); + } + + // Make claims available to any subsequent middleware or request + // handlers + request.extensions_mut().insert(data.claims); + + Ok(request) + }) + } +} + +impl Authorize { + /// Verify the token using the JSON Web Key set and return the token data. 
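For the `Authorize` layer above, callers of the external server are expected to present a Bearer token signed with an Ed25519 key from the published JWKS and carrying a `compute_id` claim equal to the compute's own ID. A hedged sketch of minting such a token with the `jsonwebtoken` crate; the key path, `kid`, compute ID, and expiry below are placeholders, not values taken from this patch:

use jsonwebtoken::{encode, Algorithm, EncodingKey, Header};
use serde::Serialize;

#[derive(Serialize)]
struct Claims {
    compute_id: String,
    // `validate_exp` stays on in Authorize::new, so include an expiry.
    exp: u64,
}

fn main() -> anyhow::Result<()> {
    // Placeholder Ed25519 private key in PEM (PKCS#8) form; in reality this
    // would be the signing key matching a JWK in the published JWKS.
    let pem = std::fs::read("ed25519-private.pem")?;
    let key = EncodingKey::from_ed_pem(&pem)?;

    let mut header = Header::new(Algorithm::EdDSA);
    // A kid lets the verifier pick the right JWK; purely illustrative here.
    header.kid = Some("compute-signing-key-1".to_string());

    let claims = Claims {
        compute_id: "compute-quiet-breeze-123456".to_string(),
        exp: 2_000_000_000, // some time far in the future, for the sketch
    };

    let token = encode(&header, &claims, &key)?;
    println!("Authorization: Bearer {token}");
    Ok(())
}

On the verifying side, the `authorize` hook above rejects tokens whose decoded `compute_id` differs from the configured one, and `verify` (whose body follows) simply tries each JWK in turn until one validates the signature.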
+ fn verify(jwks: &JwkSet, token: &str, validation: &Validation) -> Result> { + debug_assert!(!jwks.keys.is_empty()); + + for jwk in jwks.keys.iter() { + let decoding_key = match DecodingKey::from_jwk(jwk) { + Ok(key) => key, + Err(e) => { + warn!( + "Failed to construct decoding key from {}: {}", + jwk.common.key_id.as_ref().unwrap(), + e + ); + + continue; + } + }; + + match jsonwebtoken::decode::(token, &decoding_key, validation) { + Ok(data) => return Ok(data), + Err(e) => { + warn!( + "Failed to decode authorization token using {}: {}", + jwk.common.key_id.as_ref().unwrap(), + e + ); + + continue; + } + } + } + + Err(anyhow!("Failed to verify authorization token")) + } +} diff --git a/compute_tools/src/http/middleware/mod.rs b/compute_tools/src/http/middleware/mod.rs new file mode 100644 index 0000000000..caeeeedfe5 --- /dev/null +++ b/compute_tools/src/http/middleware/mod.rs @@ -0,0 +1 @@ +pub(in crate::http) mod authorize; diff --git a/compute_tools/src/http/mod.rs b/compute_tools/src/http/mod.rs index 93eb6ef5b7..9ecc1b0093 100644 --- a/compute_tools/src/http/mod.rs +++ b/compute_tools/src/http/mod.rs @@ -1,10 +1,14 @@ -use axum::{body::Body, response::Response}; +use axum::body::Body; +use axum::response::Response; use compute_api::responses::{ComputeStatus, GenericAPIError}; -use http::{header::CONTENT_TYPE, StatusCode}; +use http::StatusCode; +use http::header::CONTENT_TYPE; use serde::Serialize; use tracing::error; mod extract; +mod headers; +mod middleware; mod routes; pub mod server; diff --git a/compute_tools/src/http/routes/check_writability.rs b/compute_tools/src/http/routes/check_writability.rs index d7feb055e9..5a12686fa8 100644 --- a/compute_tools/src/http/routes/check_writability.rs +++ b/compute_tools/src/http/routes/check_writability.rs @@ -1,10 +1,13 @@ use std::sync::Arc; -use axum::{extract::State, response::Response}; +use axum::extract::State; +use axum::response::Response; use compute_api::responses::ComputeStatus; use http::StatusCode; -use crate::{checker::check_writability, compute::ComputeNode, http::JsonResponse}; +use crate::checker::check_writability; +use crate::compute::ComputeNode; +use crate::http::JsonResponse; /// Check that the compute is currently running. pub(in crate::http) async fn is_writable(State(compute): State>) -> Response { diff --git a/compute_tools/src/http/routes/configure.rs b/compute_tools/src/http/routes/configure.rs index 2546cbc344..3c5a6a6d41 100644 --- a/compute_tools/src/http/routes/configure.rs +++ b/compute_tools/src/http/routes/configure.rs @@ -1,18 +1,16 @@ use std::sync::Arc; -use axum::{extract::State, response::Response}; -use compute_api::{ - requests::ConfigurationRequest, - responses::{ComputeStatus, ComputeStatusResponse}, -}; +use axum::extract::State; +use axum::response::Response; +use compute_api::requests::ConfigurationRequest; +use compute_api::responses::{ComputeStatus, ComputeStatusResponse}; use http::StatusCode; use tokio::task; use tracing::info; -use crate::{ - compute::{ComputeNode, ParsedSpec}, - http::{extract::Json, JsonResponse}, -}; +use crate::compute::{ComputeNode, ParsedSpec}; +use crate::http::JsonResponse; +use crate::http::extract::Json; // Accept spec in JSON format and request compute configuration. 
If anything // goes wrong after we set the compute status to `ConfigurationPending` and @@ -24,7 +22,7 @@ pub(in crate::http) async fn configure( State(compute): State>, request: Json, ) -> Response { - if !compute.live_config_allowed { + if !compute.params.live_config_allowed { return JsonResponse::error( StatusCode::PRECONDITION_FAILED, "live configuration is not allowed for this compute node".to_string(), @@ -47,13 +45,18 @@ pub(in crate::http) async fn configure( return JsonResponse::invalid_status(state.status); } + // Pass the tracing span to the main thread that performs the startup, + // so that the start_compute operation is considered a child of this + // configure request for tracing purposes. + state.startup_span = Some(tracing::Span::current()); + state.pspec = Some(pspec); state.set_status(ComputeStatus::ConfigurationPending, &compute.state_changed); drop(state); } // Spawn a blocking thread to wait for compute to become Running. This is - // needed to do not block the main pool of workers and be able to serve + // needed to not block the main pool of workers and to be able to serve // other requests while some particular request is waiting for compute to // finish configuration. let c = compute.clone(); diff --git a/compute_tools/src/http/routes/database_schema.rs b/compute_tools/src/http/routes/database_schema.rs index fd716272dc..1f6ca4b79d 100644 --- a/compute_tools/src/http/routes/database_schema.rs +++ b/compute_tools/src/http/routes/database_schema.rs @@ -1,14 +1,16 @@ use std::sync::Arc; -use axum::{body::Body, extract::State, response::Response}; -use http::{header::CONTENT_TYPE, StatusCode}; +use axum::body::Body; +use axum::extract::State; +use axum::response::Response; +use http::StatusCode; +use http::header::CONTENT_TYPE; use serde::Deserialize; -use crate::{ - catalog::{get_database_schema, SchemaDumpError}, - compute::ComputeNode, - http::{extract::Query, JsonResponse}, -}; +use crate::catalog::{SchemaDumpError, get_database_schema}; +use crate::compute::ComputeNode; +use crate::http::JsonResponse; +use crate::http::extract::Query; #[derive(Debug, Clone, Deserialize)] pub(in crate::http) struct DatabaseSchemaParams { diff --git a/compute_tools/src/http/routes/dbs_and_roles.rs b/compute_tools/src/http/routes/dbs_and_roles.rs index 4843c3fab4..790fe0dfe3 100644 --- a/compute_tools/src/http/routes/dbs_and_roles.rs +++ b/compute_tools/src/http/routes/dbs_and_roles.rs @@ -1,9 +1,12 @@ use std::sync::Arc; -use axum::{extract::State, response::Response}; +use axum::extract::State; +use axum::response::Response; use http::StatusCode; -use crate::{catalog::get_dbs_and_roles, compute::ComputeNode, http::JsonResponse}; +use crate::catalog::get_dbs_and_roles; +use crate::compute::ComputeNode; +use crate::http::JsonResponse; /// Get the databases and roles from the compute. 
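The `startup_span` handoff added to the /configure handler above boils down to: the request handler stores its current `tracing::Span` in shared state, and the thread that later performs the actual start_compute work enters that span, so the whole configuration shows up as a child of the HTTP request. A minimal stand-alone sketch of the pattern; the `SharedState` type and field names are illustrative, not the real ComputeNode state:

use std::sync::{Arc, Mutex};
use std::thread;

use tracing::{info, info_span, Span};

#[derive(Default)]
struct SharedState {
    // Populated by the request handler, consumed by the worker thread.
    startup_span: Option<Span>,
}

fn main() {
    tracing_subscriber::fmt::init();

    let state = Arc::new(Mutex::new(SharedState::default()));

    // "Request handler": runs inside its own span and hands it over.
    {
        let _guard = info_span!("configure_request", request_id = "abc123").entered();
        info!("accepted configuration request");
        state.lock().unwrap().startup_span = Some(Span::current());
    }

    // "Worker thread": enters the stored span, so these events show up
    // under configure_request even though they run on another thread.
    let worker_state = Arc::clone(&state);
    let worker = thread::spawn(move || {
        let span = worker_state.lock().unwrap().startup_span.take();
        let _entered = span.as_ref().map(|s| s.enter());
        info!("starting compute (child of the configure request span)");
    });

    worker.join().unwrap();
}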
pub(in crate::http) async fn get_catalog_objects( diff --git a/compute_tools/src/http/routes/extension_server.rs b/compute_tools/src/http/routes/extension_server.rs index 5cc9b6d277..563b73ae65 100644 --- a/compute_tools/src/http/routes/extension_server.rs +++ b/compute_tools/src/http/routes/extension_server.rs @@ -1,19 +1,13 @@ use std::sync::Arc; -use axum::{ - extract::State, - response::{IntoResponse, Response}, -}; +use axum::extract::State; +use axum::response::{IntoResponse, Response}; use http::StatusCode; use serde::Deserialize; -use crate::{ - compute::ComputeNode, - http::{ - extract::{Path, Query}, - JsonResponse, - }, -}; +use crate::compute::ComputeNode; +use crate::http::JsonResponse; +use crate::http::extract::{Path, Query}; #[derive(Debug, Clone, Deserialize)] pub(in crate::http) struct ExtensionServerParams { @@ -24,11 +18,11 @@ pub(in crate::http) struct ExtensionServerParams { /// Download a remote extension. pub(in crate::http) async fn download_extension( Path(filename): Path, - params: Query, + ext_server_params: Query, State(compute): State>, ) -> Response { // Don't even try to download extensions if no remote storage is configured - if compute.ext_remote_storage.is_none() { + if compute.params.ext_remote_storage.is_none() { return JsonResponse::error( StatusCode::PRECONDITION_FAILED, "remote storage is not configured", @@ -52,9 +46,9 @@ pub(in crate::http) async fn download_extension( remote_extensions.get_ext( &filename, - params.is_library, - &compute.build_tag, - &compute.pgversion, + ext_server_params.is_library, + &compute.params.build_tag, + &compute.params.pgversion, ) }; diff --git a/compute_tools/src/http/routes/extensions.rs b/compute_tools/src/http/routes/extensions.rs index 1fc03b9109..910e1fa155 100644 --- a/compute_tools/src/http/routes/extensions.rs +++ b/compute_tools/src/http/routes/extensions.rs @@ -1,16 +1,14 @@ use std::sync::Arc; -use axum::{extract::State, response::Response}; -use compute_api::{ - requests::ExtensionInstallRequest, - responses::{ComputeStatus, ExtensionInstallResponse}, -}; +use axum::extract::State; +use axum::response::Response; +use compute_api::requests::ExtensionInstallRequest; +use compute_api::responses::{ComputeStatus, ExtensionInstallResponse}; use http::StatusCode; -use crate::{ - compute::ComputeNode, - http::{extract::Json, JsonResponse}, -}; +use crate::compute::ComputeNode; +use crate::http::JsonResponse; +use crate::http::extract::Json; /// Install a extension. pub(in crate::http) async fn install_extension( diff --git a/compute_tools/src/http/routes/failpoints.rs b/compute_tools/src/http/routes/failpoints.rs index 836417d784..8f5da99963 100644 --- a/compute_tools/src/http/routes/failpoints.rs +++ b/compute_tools/src/http/routes/failpoints.rs @@ -17,7 +17,8 @@ pub struct FailpointConfig { pub actions: String, } -use crate::http::{extract::Json, JsonResponse}; +use crate::http::JsonResponse; +use crate::http::extract::Json; /// Configure failpoints for testing purposes. 
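The `ext_server_params` rename above belongs to the internal `/extension_server/{*filename}` route (its router entry appears in server.rs further down). A hypothetical local caller might look like the sketch below; the port, the HTTP method, and the `is_library` query parameter name are assumptions inferred from the struct field, not confirmed by the excerpt:

use anyhow::Result;

#[tokio::main]
async fn main() -> Result<()> {
    // The internal compute_ctl server listens on a local port; 3081 is a
    // placeholder here, not a value taken from this patch.
    let url = "http://127.0.0.1:3081/extension_server/anon.tar.zst";

    let resp = reqwest::Client::new()
        .post(url) // assumed method; the route definition is cut off above
        .query(&[("is_library", "false")])
        .send()
        .await?;

    println!("download_extension returned {}", resp.status());
    Ok(())
}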
pub(in crate::http) async fn configure_failpoints( diff --git a/compute_tools/src/http/routes/grants.rs b/compute_tools/src/http/routes/grants.rs index 3f67f011e5..267dcbb27e 100644 --- a/compute_tools/src/http/routes/grants.rs +++ b/compute_tools/src/http/routes/grants.rs @@ -1,16 +1,14 @@ use std::sync::Arc; -use axum::{extract::State, response::Response}; -use compute_api::{ - requests::SetRoleGrantsRequest, - responses::{ComputeStatus, SetRoleGrantsResponse}, -}; +use axum::extract::State; +use axum::response::Response; +use compute_api::requests::SetRoleGrantsRequest; +use compute_api::responses::{ComputeStatus, SetRoleGrantsResponse}; use http::StatusCode; -use crate::{ - compute::ComputeNode, - http::{extract::Json, JsonResponse}, -}; +use crate::compute::ComputeNode; +use crate::http::JsonResponse; +use crate::http::extract::Json; /// Add grants for a role. pub(in crate::http) async fn add_grant( diff --git a/compute_tools/src/http/routes/insights.rs b/compute_tools/src/http/routes/insights.rs index 6b03a461c3..b1ba67161e 100644 --- a/compute_tools/src/http/routes/insights.rs +++ b/compute_tools/src/http/routes/insights.rs @@ -1,10 +1,12 @@ use std::sync::Arc; -use axum::{extract::State, response::Response}; +use axum::extract::State; +use axum::response::Response; use compute_api::responses::ComputeStatus; use http::StatusCode; -use crate::{compute::ComputeNode, http::JsonResponse}; +use crate::compute::ComputeNode; +use crate::http::JsonResponse; /// Collect current Postgres usage insights. pub(in crate::http) async fn get_insights(State(compute): State>) -> Response { diff --git a/compute_tools/src/http/routes/metrics.rs b/compute_tools/src/http/routes/metrics.rs index 13150a7588..da8d8b20a5 100644 --- a/compute_tools/src/http/routes/metrics.rs +++ b/compute_tools/src/http/routes/metrics.rs @@ -1,10 +1,12 @@ -use axum::{body::Body, response::Response}; -use http::header::CONTENT_TYPE; +use axum::body::Body; +use axum::response::Response; use http::StatusCode; +use http::header::CONTENT_TYPE; use metrics::proto::MetricFamily; use metrics::{Encoder, TextEncoder}; -use crate::{http::JsonResponse, metrics::collect}; +use crate::http::JsonResponse; +use crate::metrics::collect; /// Expose Prometheus metrics. pub(in crate::http) async fn get_metrics() -> Response { diff --git a/compute_tools/src/http/routes/metrics_json.rs b/compute_tools/src/http/routes/metrics_json.rs index 0709db5011..bc35ee2645 100644 --- a/compute_tools/src/http/routes/metrics_json.rs +++ b/compute_tools/src/http/routes/metrics_json.rs @@ -1,9 +1,11 @@ use std::sync::Arc; -use axum::{extract::State, response::Response}; +use axum::extract::State; +use axum::response::Response; use http::StatusCode; -use crate::{compute::ComputeNode, http::JsonResponse}; +use crate::compute::ComputeNode; +use crate::http::JsonResponse; /// Get startup metrics. 
pub(in crate::http) async fn get_metrics(State(compute): State>) -> Response { diff --git a/compute_tools/src/http/routes/status.rs b/compute_tools/src/http/routes/status.rs index d64d53a58f..8ed1299d6b 100644 --- a/compute_tools/src/http/routes/status.rs +++ b/compute_tools/src/http/routes/status.rs @@ -1,9 +1,13 @@ -use std::{ops::Deref, sync::Arc}; +use std::ops::Deref; +use std::sync::Arc; -use axum::{extract::State, http::StatusCode, response::Response}; +use axum::extract::State; +use axum::http::StatusCode; +use axum::response::Response; use compute_api::responses::ComputeStatusResponse; -use crate::{compute::ComputeNode, http::JsonResponse}; +use crate::compute::ComputeNode; +use crate::http::JsonResponse; /// Retrieve the state of the comute. pub(in crate::http) async fn get_status(State(compute): State>) -> Response { diff --git a/compute_tools/src/http/routes/terminate.rs b/compute_tools/src/http/routes/terminate.rs index 7acd84f236..2c24d4ad6b 100644 --- a/compute_tools/src/http/routes/terminate.rs +++ b/compute_tools/src/http/routes/terminate.rs @@ -1,18 +1,14 @@ use std::sync::Arc; -use axum::{ - extract::State, - response::{IntoResponse, Response}, -}; +use axum::extract::State; +use axum::response::{IntoResponse, Response}; use compute_api::responses::ComputeStatus; use http::StatusCode; use tokio::task; use tracing::info; -use crate::{ - compute::{forward_termination_signal, ComputeNode}, - http::JsonResponse, -}; +use crate::compute::{ComputeNode, forward_termination_signal}; +use crate::http::JsonResponse; /// Terminate the compute. pub(in crate::http) async fn terminate(State(compute): State>) -> Response { diff --git a/compute_tools/src/http/server.rs b/compute_tools/src/http/server.rs index a523ecd96f..126fa86d1c 100644 --- a/compute_tools/src/http/server.rs +++ b/compute_tools/src/http/server.rs @@ -1,60 +1,67 @@ -use std::{ - fmt::Display, - net::{IpAddr, Ipv6Addr, SocketAddr}, - sync::Arc, - time::Duration, -}; +use std::fmt::Display; +use std::net::{IpAddr, Ipv6Addr, SocketAddr}; +use std::sync::Arc; +use std::time::Duration; use anyhow::Result; -use axum::{ - extract::Request, - middleware::{self, Next}, - response::{IntoResponse, Response}, - routing::{get, post}, - Router, -}; +use axum::Router; +use axum::extract::Request; +use axum::middleware::{self, Next}; +use axum::response::{IntoResponse, Response}; +use axum::routing::{get, post}; use http::StatusCode; +use jsonwebtoken::jwk::JwkSet; use tokio::net::TcpListener; use tower::ServiceBuilder; -use tower_http::{request_id::PropagateRequestIdLayer, trace::TraceLayer}; -use tracing::{debug, error, info, Span}; +use tower_http::{ + auth::AsyncRequireAuthorizationLayer, request_id::PropagateRequestIdLayer, trace::TraceLayer, +}; +use tracing::{Span, error, info}; use uuid::Uuid; -use super::routes::{ - check_writability, configure, database_schema, dbs_and_roles, extension_server, extensions, - grants, insights, metrics, metrics_json, status, terminate, +use super::{ + headers::X_REQUEST_ID, + middleware::authorize::Authorize, + routes::{ + check_writability, configure, database_schema, dbs_and_roles, extension_server, extensions, + grants, insights, metrics, metrics_json, status, terminate, + }, }; use crate::compute::ComputeNode; -const X_REQUEST_ID: &str = "x-request-id"; - /// `compute_ctl` has two servers: internal and external. The internal server /// binds to the loopback interface and handles communication from clients on /// the compute. 
The external server is what receives communication from the /// control plane, the metrics scraper, etc. We make the distinction because /// certain routes in `compute_ctl` only need to be exposed to local processes /// like Postgres via the neon extension and local_proxy. -#[derive(Clone, Copy, Debug)] +#[derive(Clone, Debug)] pub enum Server { - Internal(u16), - External(u16), + Internal { + port: u16, + }, + External { + port: u16, + jwks: JwkSet, + compute_id: String, + }, } impl Display for Server { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { - Server::Internal(_) => f.write_str("internal"), - Server::External(_) => f.write_str("external"), + Server::Internal { .. } => f.write_str("internal"), + Server::External { .. } => f.write_str("external"), } } } -impl From for Router> { - fn from(server: Server) -> Self { +impl From<&Server> for Router> { + fn from(server: &Server) -> Self { let mut router = Router::>::new(); router = match server { - Server::Internal(_) => { + Server::Internal { .. } => { router = router .route( "/extension_server/{*filename}", @@ -72,58 +79,71 @@ impl From for Router> { router } - Server::External(_) => router - .route("/check_writability", post(check_writability::is_writable)) - .route("/configure", post(configure::configure)) - .route("/database_schema", get(database_schema::get_schema_dump)) - .route("/dbs_and_roles", get(dbs_and_roles::get_catalog_objects)) - .route("/insights", get(insights::get_insights)) - .route("/metrics", get(metrics::get_metrics)) - .route("/metrics.json", get(metrics_json::get_metrics)) - .route("/status", get(status::get_status)) - .route("/terminate", post(terminate::terminate)), + Server::External { + jwks, compute_id, .. + } => { + let unauthenticated_router = + Router::>::new().route("/metrics", get(metrics::get_metrics)); + + let authenticated_router = Router::>::new() + .route("/check_writability", post(check_writability::is_writable)) + .route("/configure", post(configure::configure)) + .route("/database_schema", get(database_schema::get_schema_dump)) + .route("/dbs_and_roles", get(dbs_and_roles::get_catalog_objects)) + .route("/insights", get(insights::get_insights)) + .route("/metrics.json", get(metrics_json::get_metrics)) + .route("/status", get(status::get_status)) + .route("/terminate", post(terminate::terminate)) + .layer(AsyncRequireAuthorizationLayer::new(Authorize::new( + compute_id.clone(), + jwks.clone(), + ))); + + router + .merge(unauthenticated_router) + .merge(authenticated_router) + } }; - router.fallback(Server::handle_404).method_not_allowed_fallback(Server::handle_405).layer( - ServiceBuilder::new() - // Add this middleware since we assume the request ID exists - .layer(middleware::from_fn(maybe_add_request_id_header)) - .layer( - TraceLayer::new_for_http() - .on_request(|request: &http::Request<_>, _span: &Span| { - let request_id = request - .headers() - .get(X_REQUEST_ID) - .unwrap() - .to_str() - .unwrap(); - - match request.uri().path() { - "/metrics" => { - debug!(%request_id, "{} {}", request.method(), request.uri()) - } - _ => info!(%request_id, "{} {}", request.method(), request.uri()), - }; - }) - .on_response( - |response: &http::Response<_>, latency: Duration, _span: &Span| { - let request_id = response + router + .fallback(Server::handle_404) + .method_not_allowed_fallback(Server::handle_405) + .layer( + ServiceBuilder::new() + .layer(tower_otel::trace::HttpLayer::server(tracing::Level::INFO)) + // Add this middleware since we assume the request ID 
exists + .layer(middleware::from_fn(maybe_add_request_id_header)) + .layer( + TraceLayer::new_for_http() + .on_request(|request: &http::Request<_>, _span: &Span| { + let request_id = request .headers() .get(X_REQUEST_ID) .unwrap() .to_str() .unwrap(); - info!( - %request_id, - code = response.status().as_u16(), - latency = latency.as_millis() - ) - }, - ), - ) - .layer(PropagateRequestIdLayer::x_request_id()), - ) + info!(%request_id, "{} {}", request.method(), request.uri()); + }) + .on_response( + |response: &http::Response<_>, latency: Duration, _span: &Span| { + let request_id = response + .headers() + .get(X_REQUEST_ID) + .unwrap() + .to_str() + .unwrap(); + + info!( + %request_id, + code = response.status().as_u16(), + latency = latency.as_millis() + ); + }, + ), + ) + .layer(PropagateRequestIdLayer::x_request_id()), + ) } } @@ -147,15 +167,15 @@ impl Server { match self { // TODO: Change this to Ipv6Addr::LOCALHOST when the GitHub runners // allow binding to localhost - Server::Internal(_) => IpAddr::from(Ipv6Addr::UNSPECIFIED), - Server::External(_) => IpAddr::from(Ipv6Addr::UNSPECIFIED), + Server::Internal { .. } => IpAddr::from(Ipv6Addr::UNSPECIFIED), + Server::External { .. } => IpAddr::from(Ipv6Addr::UNSPECIFIED), } } - fn port(self) -> u16 { + fn port(&self) -> u16 { match self { - Server::Internal(port) => port, - Server::External(port) => port, + Server::Internal { port, .. } => *port, + Server::External { port, .. } => *port, } } @@ -182,7 +202,9 @@ impl Server { ); } - let router = Router::from(self).with_state(compute); + let router = Router::from(&self) + .with_state(compute) + .into_make_service_with_connect_info::(); if let Err(e) = axum::serve(listener, router).await { error!("compute_ctl {} HTTP server error: {}", self, e); diff --git a/compute_tools/src/installed_extensions.rs b/compute_tools/src/installed_extensions.rs index 173dbf40b0..6921505466 100644 --- a/compute_tools/src/installed_extensions.rs +++ b/compute_tools/src/installed_extensions.rs @@ -1,7 +1,7 @@ -use compute_api::responses::{InstalledExtension, InstalledExtensions}; use std::collections::HashMap; use anyhow::Result; +use compute_api::responses::{InstalledExtension, InstalledExtensions}; use postgres::{Client, NoTls}; use crate::metrics::INSTALLED_EXTENSIONS; diff --git a/compute_tools/src/lib.rs b/compute_tools/src/lib.rs index b08df22134..5c78bbcd02 100644 --- a/compute_tools/src/lib.rs +++ b/compute_tools/src/lib.rs @@ -21,6 +21,7 @@ mod migration; pub mod monitor; pub mod params; pub mod pg_helpers; +pub mod rsyslog; pub mod spec; mod spec_apply; pub mod swap; diff --git a/compute_tools/src/logger.rs b/compute_tools/src/logger.rs index 3749dfc844..a65614e94e 100644 --- a/compute_tools/src/logger.rs +++ b/compute_tools/src/logger.rs @@ -1,3 +1,5 @@ +use std::collections::HashMap; +use tracing::info; use tracing_subscriber::layer::SubscriberExt; use tracing_subscriber::prelude::*; @@ -42,3 +44,50 @@ pub async fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result pub fn inlinify(s: &str) -> String { s.replace('\n', "\u{200B}") } + +pub fn startup_context_from_env() -> Option { + // Extract OpenTelemetry context for the startup actions from the + // TRACEPARENT and TRACESTATE env variables, and attach it to the current + // tracing context. + // + // This is used to propagate the context for the 'start_compute' operation + // from the neon control plane. 
This allows linking together the wider + // 'start_compute' operation that creates the compute container, with the + // startup actions here within the container. + // + // There is no standard for passing context in env variables, but a lot of + // tools use TRACEPARENT/TRACESTATE, so we use that convention too. See + // https://github.com/open-telemetry/opentelemetry-specification/issues/740 + // + // Switch to the startup context here, and exit it once the startup has + // completed and Postgres is up and running. + // + // If this pod is pre-created without binding it to any particular endpoint + // yet, this isn't the right place to enter the startup context. In that + // case, the control plane should pass the tracing context as part of the + // /configure API call. + // + // NOTE: This is supposed to only cover the *startup* actions. Once + // postgres is configured and up-and-running, we exit this span. Any other + // actions that are performed on incoming HTTP requests, for example, are + // performed in separate spans. + // + // XXX: If the pod is restarted, we perform the startup actions in the same + // context as the original startup actions, which probably doesn't make + // sense. + let mut startup_tracing_carrier: HashMap = HashMap::new(); + if let Ok(val) = std::env::var("TRACEPARENT") { + startup_tracing_carrier.insert("traceparent".to_string(), val); + } + if let Ok(val) = std::env::var("TRACESTATE") { + startup_tracing_carrier.insert("tracestate".to_string(), val); + } + if !startup_tracing_carrier.is_empty() { + use opentelemetry::propagation::TextMapPropagator; + use opentelemetry_sdk::propagation::TraceContextPropagator; + info!("got startup tracing context from env variables"); + Some(TraceContextPropagator::new().extract(&startup_tracing_carrier)) + } else { + None + } +} diff --git a/compute_tools/src/lsn_lease.rs b/compute_tools/src/lsn_lease.rs index 3061d387a5..b4ec675ff4 100644 --- a/compute_tools/src/lsn_lease.rs +++ b/compute_tools/src/lsn_lease.rs @@ -1,17 +1,15 @@ -use anyhow::bail; -use anyhow::Result; -use postgres::{NoTls, SimpleQueryMessage}; -use std::time::SystemTime; -use std::{str::FromStr, sync::Arc, thread, time::Duration}; -use utils::id::TenantId; -use utils::id::TimelineId; +use std::str::FromStr; +use std::sync::Arc; +use std::thread; +use std::time::{Duration, SystemTime}; +use anyhow::{Result, bail}; use compute_api::spec::ComputeMode; +use postgres::{NoTls, SimpleQueryMessage}; use tracing::{info, warn}; -use utils::{ - lsn::Lsn, - shard::{ShardCount, ShardNumber, TenantShardId}, -}; +use utils::id::{TenantId, TimelineId}; +use utils::lsn::Lsn; +use utils::shard::{ShardCount, ShardNumber, TenantShardId}; use crate::compute::ComputeNode; diff --git a/compute_tools/src/metrics.rs b/compute_tools/src/metrics.rs index 870b294d08..dab32d5dc1 100644 --- a/compute_tools/src/metrics.rs +++ b/compute_tools/src/metrics.rs @@ -1,6 +1,6 @@ use metrics::core::Collector; use metrics::proto::MetricFamily; -use metrics::{register_int_counter_vec, register_uint_gauge_vec, IntCounterVec, UIntGaugeVec}; +use metrics::{IntCounterVec, UIntGaugeVec, register_int_counter_vec, register_uint_gauge_vec}; use once_cell::sync::Lazy; pub(crate) static INSTALLED_EXTENSIONS: Lazy = Lazy::new(|| { @@ -54,9 +54,7 @@ pub(crate) static REMOTE_EXT_REQUESTS_TOTAL: Lazy = Lazy::new(|| register_int_counter_vec!( "compute_ctl_remote_ext_requests_total", "Total number of requests made by compute_ctl to download extensions from S3 proxy by status", - // Do not use any labels like 
extension name yet. - // We can add them later if needed. - &["http_status"] + &["http_status", "filename"] ) .expect("failed to define a metric") }); diff --git a/compute_tools/src/monitor.rs b/compute_tools/src/monitor.rs index 184f380a8d..83318538cd 100644 --- a/compute_tools/src/monitor.rs +++ b/compute_tools/src/monitor.rs @@ -1,13 +1,14 @@ use std::sync::Arc; -use std::{thread, time::Duration}; +use std::thread; +use std::time::Duration; use chrono::{DateTime, Utc}; +use compute_api::responses::ComputeStatus; +use compute_api::spec::ComputeFeature; use postgres::{Client, NoTls}; use tracing::{debug, error, info, warn}; use crate::compute::ComputeNode; -use compute_api::responses::ComputeStatus; -use compute_api::spec::ComputeFeature; const MONITOR_CHECK_INTERVAL: Duration = Duration::from_millis(500); @@ -17,7 +18,7 @@ const MONITOR_CHECK_INTERVAL: Duration = Duration::from_millis(500); // should be handled gracefully. fn watch_compute_activity(compute: &ComputeNode) { // Suppose that `connstr` doesn't change - let connstr = compute.connstr.clone(); + let connstr = compute.params.connstr.clone(); let conf = compute.get_conn_conf(Some("compute_ctl:activity_monitor")); // During startup and configuration we connect to every Postgres database, diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index 86fcf99085..dd8d8e9b8b 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -9,7 +9,8 @@ use std::process::Child; use std::str::FromStr; use std::time::{Duration, Instant}; -use anyhow::{bail, Result}; +use anyhow::{Result, bail}; +use compute_api::spec::{Database, GenericOption, GenericOptions, PgIdent, Role}; use futures::StreamExt; use ini::Ini; use notify::{RecursiveMode, Watcher}; @@ -21,8 +22,6 @@ use tokio_postgres; use tokio_postgres::NoTls; use tracing::{debug, error, info, instrument}; -use compute_api::spec::{Database, GenericOption, GenericOptions, PgIdent, Role}; - const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // milliseconds /// Escape a string for including it in a SQL literal. @@ -187,15 +186,40 @@ impl DatabaseExt for Database { /// Postgres SQL queries and DATABASE_URL. pub trait Escaping { fn pg_quote(&self) -> String; + fn pg_quote_dollar(&self) -> (String, String); } impl Escaping for PgIdent { /// This is intended to mimic Postgres quote_ident(), but for simplicity it /// always quotes provided string with `""` and escapes every `"`. /// **Not idempotent**, i.e. if string is already escaped it will be escaped again. + /// N.B. it's not useful for escaping identifiers that are used inside WHERE + /// clause, use `escape_literal()` instead. fn pg_quote(&self) -> String { - let result = format!("\"{}\"", self.replace('"', "\"\"")); - result + format!("\"{}\"", self.replace('"', "\"\"")) + } + + /// This helper is intended to be used for dollar-escaping strings for usage + /// inside PL/pgSQL procedures. In addition to dollar-escaping the string, + /// it also returns a tag that is intended to be used inside the outer + /// PL/pgSQL procedure. If you do not need an outer tag, just discard it. + /// Here we somewhat mimic the logic of Postgres' `pg_get_functiondef()`, + /// + fn pg_quote_dollar(&self) -> (String, String) { + let mut tag: String = "".to_string(); + let mut outer_tag = "x".to_string(); + + // Find the first suitable tag that is not present in the string. + // Postgres' max role/DB name length is 63 bytes, so even in the + // worst case it won't take long. 
+ while self.contains(&format!("${tag}$")) || self.contains(&format!("${outer_tag}$")) { + tag += "x"; + outer_tag = tag.clone() + "x"; + } + + let escaped = format!("${tag}${self}${tag}$"); + + (escaped, outer_tag) } } @@ -227,10 +251,13 @@ pub async fn get_existing_dbs_async( // invalid state. See: // https://github.com/postgres/postgres/commit/a4b4cc1d60f7e8ccfcc8ff8cb80c28ee411ad9a9 let rowstream = client + // We use a subquery instead of a fancy `datdba::regrole::text AS owner`, + // because the latter automatically wraps the result in double quotes, + // if the role name contains special characters. .query_raw::( "SELECT datname AS name, - datdba::regrole::text AS owner, + (SELECT rolname FROM pg_roles WHERE oid = datdba) AS owner, NOT datallowconn AS restrict_conn, datconnlimit = - 2 AS invalid FROM diff --git a/compute_tools/src/rsyslog.rs b/compute_tools/src/rsyslog.rs new file mode 100644 index 0000000000..c8fba4fdcd --- /dev/null +++ b/compute_tools/src/rsyslog.rs @@ -0,0 +1,77 @@ +use std::process::Command; +use std::{fs::OpenOptions, io::Write}; + +use anyhow::{Context, Result}; +use tracing::info; + +fn get_rsyslog_pid() -> Option { + let output = Command::new("pgrep") + .arg("rsyslogd") + .output() + .expect("Failed to execute pgrep"); + + if !output.stdout.is_empty() { + let pid = std::str::from_utf8(&output.stdout) + .expect("Invalid UTF-8 in process output") + .trim() + .to_string(); + Some(pid) + } else { + None + } +} + +// Restart rsyslogd to apply the new configuration. +// This is necessary, because there is no other way to reload the rsyslog configuration. +// +// Rsyslogd shouldn't lose any messages, because of the restart, +// because it tracks the last read position in the log files +// and will continue reading from that position. +// TODO: test it properly +// +fn restart_rsyslog() -> Result<()> { + let old_pid = get_rsyslog_pid().context("rsyslogd is not running")?; + info!("rsyslogd is running with pid: {}, restart it", old_pid); + + // kill it to restart + let _ = Command::new("pkill") + .arg("rsyslogd") + .output() + .context("Failed to stop rsyslogd")?; + + Ok(()) +} + +pub fn configure_audit_rsyslog( + log_directory: &str, + tag: &str, + remote_endpoint: &str, +) -> Result<()> { + let config_content: String = format!( + include_str!("config_template/compute_audit_rsyslog_template.conf"), + log_directory = log_directory, + tag = tag, + remote_endpoint = remote_endpoint + ); + + info!("rsyslog config_content: {}", config_content); + + let rsyslog_conf_path = "/etc/rsyslog.d/compute_audit_rsyslog.conf"; + let mut file = OpenOptions::new() + .create(true) + .write(true) + .truncate(true) + .open(rsyslog_conf_path)?; + + file.write_all(config_content.as_bytes())?; + + info!( + "rsyslog configuration file {} added successfully. 
Starting rsyslogd", + rsyslog_conf_path + ); + + // start the service, using the configuration + restart_rsyslog()?; + + Ok(()) +} diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 6f28bd9733..1d19f2738d 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -1,20 +1,20 @@ -use anyhow::{anyhow, bail, Result}; -use reqwest::StatusCode; use std::fs::File; use std::path::Path; -use tokio_postgres::Client; -use tracing::{error, info, instrument, warn}; - -use crate::config; -use crate::metrics::{CPlaneRequestRPC, CPLANE_REQUESTS_TOTAL, UNKNOWN_HTTP_STATUS}; -use crate::migration::MigrationRunner; -use crate::params::PG_HBA_ALL_MD5; -use crate::pg_helpers::*; +use anyhow::{Result, anyhow, bail}; use compute_api::responses::{ ComputeCtlConfig, ControlPlaneComputeStatus, ControlPlaneSpecResponse, }; use compute_api::spec::ComputeSpec; +use reqwest::StatusCode; +use tokio_postgres::Client; +use tracing::{error, info, instrument, warn}; + +use crate::config; +use crate::metrics::{CPLANE_REQUESTS_TOTAL, CPlaneRequestRPC, UNKNOWN_HTTP_STATUS}; +use crate::migration::MigrationRunner; +use crate::params::PG_HBA_ALL_MD5; +use crate::pg_helpers::*; // Do control plane request and return response if any. In case of error it // returns a bool flag indicating whether it makes sense to retry the request @@ -141,7 +141,6 @@ pub fn get_spec_from_control_plane( /// Check `pg_hba.conf` and update if needed to allow external connections. pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> { // XXX: consider making it a part of spec.json - info!("checking pg_hba.conf"); let pghba_path = pgdata_path.join("pg_hba.conf"); if config::line_in_file(&pghba_path, PG_HBA_ALL_MD5)? { @@ -156,12 +155,11 @@ pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> { /// Create a standby.signal file pub fn add_standby_signal(pgdata_path: &Path) -> Result<()> { // XXX: consider making it a part of spec.json - info!("adding standby.signal"); let signalfile = pgdata_path.join("standby.signal"); if !signalfile.exists() { - info!("created standby.signal"); File::create(signalfile)?; + info!("created standby.signal"); } else { info!("reused pre-existing standby.signal"); } @@ -170,7 +168,6 @@ pub fn add_standby_signal(pgdata_path: &Path) -> Result<()> { #[instrument(skip_all)] pub async fn handle_neon_extension_upgrade(client: &mut Client) -> Result<()> { - info!("handle neon extension upgrade"); let query = "ALTER EXTENSION neon UPDATE"; info!("update neon extension version with query: {}", query); client.simple_query(query).await?; diff --git a/compute_tools/src/spec_apply.rs b/compute_tools/src/spec_apply.rs index c4416480d8..e5f7aebbf8 100644 --- a/compute_tools/src/spec_apply.rs +++ b/compute_tools/src/spec_apply.rs @@ -1,18 +1,430 @@ use std::collections::{HashMap, HashSet}; use std::fmt::{Debug, Formatter}; use std::future::Future; -use std::iter::empty; -use std::iter::once; +use std::iter::{empty, once}; use std::sync::Arc; -use crate::compute::construct_superuser_query; -use crate::pg_helpers::{escape_literal, DatabaseExt, Escaping, GenericOptionsSearch, RoleExt}; -use anyhow::Result; -use compute_api::spec::{ComputeFeature, ComputeSpec, Database, PgIdent, Role}; +use anyhow::{Context, Result}; +use compute_api::responses::ComputeStatus; +use compute_api::spec::{ComputeAudit, ComputeFeature, ComputeSpec, Database, PgIdent, Role}; use futures::future::join_all; use tokio::sync::RwLock; use tokio_postgres::Client; -use tracing::{debug, info_span, warn, Instrument}; +use 
tokio_postgres::error::SqlState; +use tracing::{Instrument, debug, error, info, info_span, instrument, warn}; + +use crate::compute::{ComputeNode, ComputeState}; +use crate::pg_helpers::{ + DatabaseExt, Escaping, GenericOptionsSearch, RoleExt, get_existing_dbs_async, + get_existing_roles_async, +}; +use crate::spec_apply::ApplySpecPhase::{ + CreateAndAlterDatabases, CreateAndAlterRoles, CreateAvailabilityCheck, CreateNeonSuperuser, + CreatePgauditExtension, CreatePgauditlogtofileExtension, CreateSchemaNeon, + DisablePostgresDBPgAudit, DropInvalidDatabases, DropRoles, FinalizeDropLogicalSubscriptions, + HandleNeonExtension, HandleOtherExtensions, RenameAndDeleteDatabases, RenameRoles, + RunInEachDatabase, +}; +use crate::spec_apply::PerDatabasePhase::{ + ChangeSchemaPerms, DeleteDBRoleReferences, DropLogicalSubscriptions, HandleAnonExtension, +}; + +impl ComputeNode { + /// Apply the spec to the running PostgreSQL instance. + /// The caller can decide to run with multiple clients in parallel, or + /// single mode. Either way, the commands executed will be the same, and + /// only commands run in different databases are parallelized. + #[instrument(skip_all)] + pub fn apply_spec_sql( + &self, + spec: Arc, + conf: Arc, + concurrency: usize, + ) -> Result<()> { + info!("Applying config with max {} concurrency", concurrency); + debug!("Config: {:?}", spec); + + let rt = tokio::runtime::Handle::current(); + rt.block_on(async { + // Proceed with post-startup configuration. Note, that order of operations is important. + let client = Self::get_maintenance_client(&conf).await?; + let spec = spec.clone(); + + let databases = get_existing_dbs_async(&client).await?; + let roles = get_existing_roles_async(&client) + .await? + .into_iter() + .map(|role| (role.name.clone(), role)) + .collect::>(); + + // Check if we need to drop subscriptions before starting the endpoint. + // + // It is important to do this operation exactly once when endpoint starts on a new branch. + // Otherwise, we may drop not inherited, but newly created subscriptions. + // + // We cannot rely only on spec.drop_subscriptions_before_start flag, + // because if for some reason compute restarts inside VM, + // it will start again with the same spec and flag value. + // + // To handle this, we save the fact of the operation in the database + // in the neon.drop_subscriptions_done table. + // If the table does not exist, we assume that the operation was never performed, so we must do it. + // If table exists, we check if the operation was performed on the current timelilne. 
+ // + let mut drop_subscriptions_done = false; + + if spec.drop_subscriptions_before_start { + let timeline_id = self.get_timeline_id().context("timeline_id must be set")?; + let query = format!("select 1 from neon.drop_subscriptions_done where timeline_id = '{}'", timeline_id); + + info!("Checking if drop subscription operation was already performed for timeline_id: {}", timeline_id); + + drop_subscriptions_done = match + client.simple_query(&query).await { + Ok(result) => { + matches!(&result[0], postgres::SimpleQueryMessage::Row(_)) + }, + Err(e) => + { + match e.code() { + Some(&SqlState::UNDEFINED_TABLE) => false, + _ => { + // We don't expect any other error here, except for the schema/table not existing + error!("Error checking if drop subscription operation was already performed: {}", e); + return Err(e.into()); + } + } + } + } + }; + + + let jwks_roles = Arc::new( + spec.as_ref() + .local_proxy_config + .iter() + .flat_map(|it| &it.jwks) + .flatten() + .flat_map(|setting| &setting.role_names) + .cloned() + .collect::>(), + ); + + let ctx = Arc::new(tokio::sync::RwLock::new(MutableApplyContext { + roles, + dbs: databases, + })); + + // Apply special pre drop database phase. + // NOTE: we use the code of RunInEachDatabase phase for parallelism + // and connection management, but we don't really run it in *each* database, + // only in databases, we're about to drop. + info!("Applying PerDatabase (pre-dropdb) phase"); + let concurrency_token = Arc::new(tokio::sync::Semaphore::new(concurrency)); + + // Run the phase for each database that we're about to drop. + let db_processes = spec + .delta_operations + .iter() + .flatten() + .filter_map(move |op| { + if op.action.as_str() == "delete_db" { + Some(op.name.clone()) + } else { + None + } + }) + .map(|dbname| { + let spec = spec.clone(); + let ctx = ctx.clone(); + let jwks_roles = jwks_roles.clone(); + let mut conf = conf.as_ref().clone(); + let concurrency_token = concurrency_token.clone(); + // We only need dbname field for this phase, so set other fields to dummy values + let db = DB::UserDB(Database { + name: dbname.clone(), + owner: "cloud_admin".to_string(), + options: None, + restrict_conn: false, + invalid: false, + }); + + debug!("Applying per-database phases for Database {:?}", &db); + + match &db { + DB::SystemDB => {} + DB::UserDB(db) => { + conf.dbname(db.name.as_str()); + } + } + + let conf = Arc::new(conf); + let fut = Self::apply_spec_sql_db( + spec.clone(), + conf, + ctx.clone(), + jwks_roles.clone(), + concurrency_token.clone(), + db, + [DropLogicalSubscriptions].to_vec(), + ); + + Ok(tokio::spawn(fut)) + }) + .collect::>>(); + + for process in db_processes.into_iter() { + let handle = process?; + if let Err(e) = handle.await? { + // Handle the error case where the database does not exist + // We do not check whether the DB exists or not in the deletion phase, + // so we shouldn't be strict about it in pre-deletion cleanup as well. 
+ if e.to_string().contains("does not exist") { + warn!("Error dropping subscription: {}", e); + } else { + return Err(e); + } + }; + } + + for phase in [ + CreateNeonSuperuser, + DropInvalidDatabases, + RenameRoles, + CreateAndAlterRoles, + RenameAndDeleteDatabases, + CreateAndAlterDatabases, + CreateSchemaNeon, + ] { + info!("Applying phase {:?}", &phase); + apply_operations( + spec.clone(), + ctx.clone(), + jwks_roles.clone(), + phase, + || async { Ok(&client) }, + ) + .await?; + } + + info!("Applying RunInEachDatabase2 phase"); + let concurrency_token = Arc::new(tokio::sync::Semaphore::new(concurrency)); + + let db_processes = spec + .cluster + .databases + .iter() + .map(|db| DB::new(db.clone())) + // include + .chain(once(DB::SystemDB)) + .map(|db| { + let spec = spec.clone(); + let ctx = ctx.clone(); + let jwks_roles = jwks_roles.clone(); + let mut conf = conf.as_ref().clone(); + let concurrency_token = concurrency_token.clone(); + let db = db.clone(); + + debug!("Applying per-database phases for Database {:?}", &db); + + match &db { + DB::SystemDB => {} + DB::UserDB(db) => { + conf.dbname(db.name.as_str()); + } + } + + let conf = Arc::new(conf); + let mut phases = vec![ + DeleteDBRoleReferences, + ChangeSchemaPerms, + HandleAnonExtension, + ]; + + if spec.drop_subscriptions_before_start && !drop_subscriptions_done { + info!("Adding DropLogicalSubscriptions phase because drop_subscriptions_before_start is set"); + phases.push(DropLogicalSubscriptions); + } + + let fut = Self::apply_spec_sql_db( + spec.clone(), + conf, + ctx.clone(), + jwks_roles.clone(), + concurrency_token.clone(), + db, + phases, + ); + + Ok(tokio::spawn(fut)) + }) + .collect::>>(); + + for process in db_processes.into_iter() { + let handle = process?; + handle.await??; + } + + let mut phases = vec![ + HandleOtherExtensions, + HandleNeonExtension, // This step depends on CreateSchemaNeon + CreateAvailabilityCheck, + DropRoles, + ]; + + // This step depends on CreateSchemaNeon + if spec.drop_subscriptions_before_start && !drop_subscriptions_done { + info!("Adding FinalizeDropLogicalSubscriptions phase because drop_subscriptions_before_start is set"); + phases.push(FinalizeDropLogicalSubscriptions); + } + + // Keep DisablePostgresDBPgAudit phase at the end, + // so that all config operations are audit logged. + match spec.audit_log_level + { + ComputeAudit::Hipaa => { + phases.push(CreatePgauditExtension); + phases.push(CreatePgauditlogtofileExtension); + phases.push(DisablePostgresDBPgAudit); + } + ComputeAudit::Log => { /* not implemented yet */ } + ComputeAudit::Disabled => {} + } + + for phase in phases { + debug!("Applying phase {:?}", &phase); + apply_operations( + spec.clone(), + ctx.clone(), + jwks_roles.clone(), + phase, + || async { Ok(&client) }, + ) + .await?; + } + + Ok::<(), anyhow::Error>(()) + })?; + + Ok(()) + } + + /// Apply SQL migrations of the RunInEachDatabase phase. + /// + /// May opt to not connect to databases that don't have any scheduled + /// operations. The function is concurrency-controlled with the provided + /// semaphore. The caller has to make sure the semaphore isn't exhausted. 
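The `apply_spec_sql_db` helper documented just above (its body follows) leans on a shared `tokio::sync::Semaphore` so that at most `concurrency` databases are processed at once, while the Postgres connection is opened lazily only if a subphase needs it. A reduced sketch of the concurrency half, with a sleep standing in for the per-database SQL and made-up database names:

use std::sync::Arc;
use std::time::Duration;

use tokio::sync::Semaphore;
use tokio::task::JoinSet;

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    // Same role as `concurrency_token` in apply_spec_sql: at most N
    // databases are processed concurrently.
    let concurrency = 3;
    let semaphore = Arc::new(Semaphore::new(concurrency));

    let mut tasks = JoinSet::new();
    for db in ["db_a", "db_b", "db_c", "db_d", "db_e"] {
        let semaphore = Arc::clone(&semaphore);
        tasks.spawn(async move {
            // Held for the duration of this database's subphases.
            let _permit = semaphore.acquire().await?;
            // Stand-in for connecting and running the per-database phases.
            tokio::time::sleep(Duration::from_millis(50)).await;
            println!("finished {db}");
            Ok::<(), anyhow::Error>(())
        });
    }

    while let Some(res) = tasks.join_next().await {
        res??; // surface both join errors and per-database errors
    }
    Ok(())
}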
+ async fn apply_spec_sql_db( + spec: Arc, + conf: Arc, + ctx: Arc>, + jwks_roles: Arc>, + concurrency_token: Arc, + db: DB, + subphases: Vec, + ) -> Result<()> { + let _permit = concurrency_token.acquire().await?; + + let mut client_conn = None; + + for subphase in subphases { + apply_operations( + spec.clone(), + ctx.clone(), + jwks_roles.clone(), + RunInEachDatabase { + db: db.clone(), + subphase, + }, + // Only connect if apply_operation actually wants a connection. + // It's quite possible this database doesn't need any queries, + // so by not connecting we save time and effort connecting to + // that database. + || async { + if client_conn.is_none() { + let db_client = Self::get_maintenance_client(&conf).await?; + client_conn.replace(db_client); + } + let client = client_conn.as_ref().unwrap(); + Ok(client) + }, + ) + .await?; + } + + drop(client_conn); + + Ok::<(), anyhow::Error>(()) + } + + /// Choose how many concurrent connections to use for applying the spec changes. + pub fn max_service_connections( + &self, + compute_state: &ComputeState, + spec: &ComputeSpec, + ) -> usize { + // If the cluster is in Init state we don't have to deal with user connections, + // and can thus use all `max_connections` connection slots. However, that's generally not + // very efficient, so we generally still limit it to a smaller number. + if compute_state.status == ComputeStatus::Init { + // If the settings contain 'max_connections', use that as template + if let Some(config) = spec.cluster.settings.find("max_connections") { + config.parse::().ok() + } else { + // Otherwise, try to find the setting in the postgresql_conf string + spec.cluster + .postgresql_conf + .iter() + .flat_map(|conf| conf.split("\n")) + .filter_map(|line| { + if !line.contains("max_connections") { + return None; + } + + let (key, value) = line.split_once("=")?; + let key = key + .trim_start_matches(char::is_whitespace) + .trim_end_matches(char::is_whitespace); + + let value = value + .trim_start_matches(char::is_whitespace) + .trim_end_matches(char::is_whitespace); + + if key != "max_connections" { + return None; + } + + value.parse::().ok() + }) + .next() + } + // If max_connections is present, use at most 1/3rd of that. + // When max_connections is lower than 30, try to use at least 10 connections, but + // never more than max_connections. + .map(|limit| match limit { + 0..10 => limit, + 10..30 => 10, + 30.. => limit / 3, + }) + // If we didn't find max_connections, default to 10 concurrent connections. + .unwrap_or(10) + } else { + // state == Running + // Because the cluster is already in the Running state, we should assume users are + // already connected to the cluster, and high concurrency could negatively + // impact user connectivity. Therefore, we can limit concurrency to the number of + // reserved superuser connections, which users wouldn't be able to use anyway. 
+ spec.cluster + .settings + .find("superuser_reserved_connections") + .iter() + .filter_map(|val| val.parse::().ok()) + .map(|val| if val > 1 { val - 1 } else { 1 }) + .last() + .unwrap_or(3) + } + } +} #[derive(Clone)] pub enum DB { @@ -57,7 +469,7 @@ pub enum PerDatabasePhase { #[derive(Clone, Debug)] pub enum ApplySpecPhase { - CreateSuperUser, + CreateNeonSuperuser, DropInvalidDatabases, RenameRoles, CreateAndAlterRoles, @@ -65,6 +477,9 @@ pub enum ApplySpecPhase { CreateAndAlterDatabases, CreateSchemaNeon, RunInEachDatabase { db: DB, subphase: PerDatabasePhase }, + CreatePgauditExtension, + CreatePgauditlogtofileExtension, + DisablePostgresDBPgAudit, HandleOtherExtensions, HandleNeonExtension, CreateAvailabilityCheck, @@ -181,14 +596,10 @@ async fn get_operations<'a>( apply_spec_phase: &'a ApplySpecPhase, ) -> Result + 'a + Send>> { match apply_spec_phase { - ApplySpecPhase::CreateSuperUser => { - let query = construct_superuser_query(spec); - - Ok(Box::new(once(Operation { - query, - comment: None, - }))) - } + ApplySpecPhase::CreateNeonSuperuser => Ok(Box::new(once(Operation { + query: include_str!("sql/create_neon_superuser.sql").to_string(), + comment: None, + }))), ApplySpecPhase::DropInvalidDatabases => { let mut ctx = ctx.write().await; let databases = &mut ctx.dbs; @@ -322,14 +733,15 @@ async fn get_operations<'a>( // We do not check whether the DB exists or not, // Postgres will take care of it for us "delete_db" => { + let (db_name, outer_tag) = op.name.pg_quote_dollar(); // In Postgres we can't drop a database if it is a template. // So we need to unset the template flag first, but it could // be a retry, so we could've already dropped the database. // Check that database exists first to make it idempotent. let unset_template_query: String = format!( include_str!("sql/unset_template_for_drop_dbs.sql"), - datname_str = escape_literal(&op.name), - datname = &op.name.pg_quote() + datname = db_name, + outer_tag = outer_tag, ); // Use FORCE to drop database even if there are active connections. 
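The `max_service_connections` code above turns a configured `max_connections` into a concurrency budget using exclusive range patterns (Rust 1.80+). A standalone sketch of just that mapping, with the same thresholds and a few illustrative checks (the function name is made up):

```rust
/// Map a configured `max_connections` to a spec-apply concurrency budget:
/// tiny limits are used as-is, small limits are capped at 10, and larger
/// limits allow roughly a third of the slots.
fn concurrency_budget(max_connections: Option<usize>) -> usize {
    max_connections
        .map(|limit| match limit {
            0..10 => limit,
            10..30 => 10,
            30.. => limit / 3,
        })
        .unwrap_or(10)
}

fn main() {
    assert_eq!(concurrency_budget(None), 10); // setting not found: default to 10
    assert_eq!(concurrency_budget(Some(5)), 5); // tiny limit: use all of it
    assert_eq!(concurrency_budget(Some(25)), 10); // small limit: cap at 10
    assert_eq!(concurrency_budget(Some(300)), 100); // large limit: roughly a third
}
```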
@@ -436,6 +848,8 @@ async fn get_operations<'a>( comment: None, }, Operation { + // ALL PRIVILEGES grants CREATE, CONNECT, and TEMPORARY on the database + // (see https://www.postgresql.org/docs/current/ddl-priv.html) query: format!( "GRANT ALL PRIVILEGES ON DATABASE {} TO neon_superuser", db.name.pg_quote() @@ -474,7 +888,10 @@ async fn get_operations<'a>( let edb = match databases.get(&db.name) { Some(edb) => edb, None => { - warn!("skipping RunInEachDatabase phase {:?}, database {} doesn't exist in PostgreSQL", subphase, db.name); + warn!( + "skipping RunInEachDatabase phase {:?}, database {} doesn't exist in PostgreSQL", + subphase, db.name + ); return Ok(Box::new(empty())); } }; @@ -492,9 +909,11 @@ async fn get_operations<'a>( PerDatabasePhase::DropLogicalSubscriptions => { match &db { DB::UserDB(db) => { + let (db_name, outer_tag) = db.name.pg_quote_dollar(); let drop_subscription_query: String = format!( include_str!("sql/drop_subscriptions.sql"), - datname_str = escape_literal(&db.name), + datname_str = db_name, + outer_tag = outer_tag, ); let operations = vec![Operation { @@ -533,6 +952,7 @@ async fn get_operations<'a>( DB::SystemDB => PgIdent::from("cloud_admin").pg_quote(), DB::UserDB(db) => db.owner.pg_quote(), }; + let (escaped_role, outer_tag) = op.name.pg_quote_dollar(); Some(vec![ // This will reassign all dependent objects to the db owner @@ -547,7 +967,9 @@ async fn get_operations<'a>( Operation { query: format!( include_str!("sql/pre_drop_role_revoke_privileges.sql"), - role_name = quoted, + // N.B. this has to be properly dollar-escaped with `pg_quote_dollar()` + role_name = escaped_role, + outer_tag = outer_tag, ), comment: None, }, @@ -572,12 +994,14 @@ async fn get_operations<'a>( DB::SystemDB => return Ok(Box::new(empty())), DB::UserDB(db) => db, }; + let (db_owner, outer_tag) = db.owner.pg_quote_dollar(); let operations = vec![ Operation { query: format!( include_str!("sql/set_public_schema_owner.sql"), - db_owner = db.owner.pg_quote() + db_owner = db_owner, + outer_tag = outer_tag, ), comment: None, }, @@ -697,6 +1121,25 @@ async fn get_operations<'a>( } Ok(Box::new(empty())) } + ApplySpecPhase::CreatePgauditExtension => Ok(Box::new(once(Operation { + query: String::from("CREATE EXTENSION IF NOT EXISTS pgaudit"), + comment: Some(String::from("create pgaudit extensions")), + }))), + ApplySpecPhase::CreatePgauditlogtofileExtension => Ok(Box::new(once(Operation { + query: String::from("CREATE EXTENSION IF NOT EXISTS pgauditlogtofile"), + comment: Some(String::from("create pgauditlogtofile extensions")), + }))), + // Disable pgaudit logging for postgres database. + // Postgres is neon system database used by monitors + // and compute_ctl tuning functions and thus generates a lot of noise. + // We do not consider data stored in this database as sensitive. 
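The `outer_tag` pattern used in these hunks and in the SQL templates below embeds identifiers as dollar-quoted strings and wraps the surrounding `DO` block with a tag that cannot collide with them; the helper is `PgIdent::pg_quote_dollar`. A hedged sketch of such a tagging scheme, written to reproduce the expectations of the `ident_pg_quote_dollar` test further down (not the actual implementation):

```rust
/// Sketch of the tagging scheme: pick the shortest run of `x` such that `$`
/// followed by that run never occurs in the value, quote the value with it,
/// and hand back a one-`x`-longer tag for the caller to wrap the surrounding
/// `DO ${outer_tag}$ ... ${outer_tag}$` block with.
fn quote_dollar(value: &str) -> (String, String) {
    let mut tag = String::new();
    while value.contains(&format!("${tag}")) {
        tag.push('x');
    }
    let quoted = format!("${tag}${value}${tag}$");
    let outer_tag = format!("{tag}x");
    (quoted, outer_tag)
}

fn main() {
    // Mirrors two of the cases from the `ident_pg_quote_dollar` test below.
    assert_eq!(quote_dollar("name"), ("$$name$$".to_string(), "x".to_string()));
    assert_eq!(quote_dollar("name$x$"), ("$xx$name$x$$xx$".to_string(), "xxx".to_string()));
}
```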
+ ApplySpecPhase::DisablePostgresDBPgAudit => { + let query = "ALTER DATABASE postgres SET pgaudit.log to 'none'"; + Ok(Box::new(once(Operation { + query: query.to_string(), + comment: Some(query.to_string()), + }))) + } ApplySpecPhase::HandleNeonExtension => { let operations = vec![ Operation { diff --git a/compute_tools/src/sql/create_neon_superuser.sql b/compute_tools/src/sql/create_neon_superuser.sql new file mode 100644 index 0000000000..300645627b --- /dev/null +++ b/compute_tools/src/sql/create_neon_superuser.sql @@ -0,0 +1,8 @@ +DO $$ + BEGIN + IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = 'neon_superuser') + THEN + CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN REPLICATION BYPASSRLS IN ROLE pg_read_all_data, pg_write_all_data; + END IF; + END +$$; diff --git a/compute_tools/src/sql/drop_subscriptions.sql b/compute_tools/src/sql/drop_subscriptions.sql index 03e8e158fa..f5d9420130 100644 --- a/compute_tools/src/sql/drop_subscriptions.sql +++ b/compute_tools/src/sql/drop_subscriptions.sql @@ -1,4 +1,4 @@ -DO $$ +DO ${outer_tag}$ DECLARE subname TEXT; BEGIN @@ -9,4 +9,4 @@ BEGIN EXECUTE format('DROP SUBSCRIPTION %I;', subname); END LOOP; END; -$$; +${outer_tag}$; diff --git a/compute_tools/src/sql/pre_drop_role_revoke_privileges.sql b/compute_tools/src/sql/pre_drop_role_revoke_privileges.sql index cdaa7071d3..734607be02 100644 --- a/compute_tools/src/sql/pre_drop_role_revoke_privileges.sql +++ b/compute_tools/src/sql/pre_drop_role_revoke_privileges.sql @@ -1,8 +1,7 @@ -SET SESSION ROLE neon_superuser; - -DO $$ +DO ${outer_tag}$ DECLARE schema TEXT; + grantor TEXT; revoke_query TEXT; BEGIN FOR schema IN @@ -15,14 +14,25 @@ BEGIN -- ii) it's easy to add more schemas to the list if needed. WHERE schema_name IN ('public') LOOP - revoke_query := format( - 'REVOKE ALL PRIVILEGES ON ALL TABLES IN SCHEMA %I FROM {role_name} GRANTED BY neon_superuser;', - schema - ); + FOR grantor IN EXECUTE + format( + 'SELECT DISTINCT rtg.grantor FROM information_schema.role_table_grants AS rtg WHERE grantee = %s', + -- N.B. this has to be properly dollar-escaped with `pg_quote_dollar()` + quote_literal({role_name}) + ) + LOOP + EXECUTE format('SET LOCAL ROLE %I', grantor); - EXECUTE revoke_query; + revoke_query := format( + 'REVOKE ALL PRIVILEGES ON ALL TABLES IN SCHEMA %I FROM %I GRANTED BY %I', + schema, + -- N.B. 
this has to be properly dollar-escaped with `pg_quote_dollar()` + {role_name}, + grantor + ); + + EXECUTE revoke_query; + END LOOP; END LOOP; END; -$$; - -RESET ROLE; +${outer_tag}$; diff --git a/compute_tools/src/sql/set_public_schema_owner.sql b/compute_tools/src/sql/set_public_schema_owner.sql index fd061a713e..dc502c6d2d 100644 --- a/compute_tools/src/sql/set_public_schema_owner.sql +++ b/compute_tools/src/sql/set_public_schema_owner.sql @@ -1,5 +1,4 @@ -DO -$$ +DO ${outer_tag}$ DECLARE schema_owner TEXT; BEGIN @@ -16,8 +15,8 @@ $$ IF schema_owner = 'cloud_admin' OR schema_owner = 'zenith_admin' THEN - ALTER SCHEMA public OWNER TO {db_owner}; + EXECUTE format('ALTER SCHEMA public OWNER TO %I', {db_owner}); END IF; END IF; END -$$; \ No newline at end of file +${outer_tag}$; \ No newline at end of file diff --git a/compute_tools/src/sql/unset_template_for_drop_dbs.sql b/compute_tools/src/sql/unset_template_for_drop_dbs.sql index 6c4343a589..36dc648beb 100644 --- a/compute_tools/src/sql/unset_template_for_drop_dbs.sql +++ b/compute_tools/src/sql/unset_template_for_drop_dbs.sql @@ -1,12 +1,12 @@ -DO $$ +DO ${outer_tag}$ BEGIN IF EXISTS( SELECT 1 FROM pg_catalog.pg_database - WHERE datname = {datname_str} + WHERE datname = {datname} ) THEN - ALTER DATABASE {datname} is_template false; + EXECUTE format('ALTER DATABASE %I is_template false', {datname}); END IF; END -$$; \ No newline at end of file +${outer_tag}$; diff --git a/compute_tools/src/swap.rs b/compute_tools/src/swap.rs index 024c5b338e..ed27a7cba4 100644 --- a/compute_tools/src/swap.rs +++ b/compute_tools/src/swap.rs @@ -1,10 +1,11 @@ use std::path::Path; -use anyhow::{anyhow, Context}; -use tracing::warn; +use anyhow::{Context, anyhow}; +use tracing::{instrument, warn}; pub const RESIZE_SWAP_BIN: &str = "/neonvm/bin/resize-swap"; +#[instrument] pub fn resize_swap(size_bytes: u64) -> anyhow::Result<()> { // run `/neonvm/bin/resize-swap --once {size_bytes}` // diff --git a/compute_tools/tests/config_test.rs b/compute_tools/tests/config_test.rs index 9ab16b1930..7b2bff23d5 100644 --- a/compute_tools/tests/config_test.rs +++ b/compute_tools/tests/config_test.rs @@ -1,7 +1,7 @@ #[cfg(test)] mod config_tests { - use std::fs::{remove_file, File}; + use std::fs::{File, remove_file}; use std::io::{Read, Write}; use std::path::Path; diff --git a/compute_tools/tests/pg_helpers_tests.rs b/compute_tools/tests/pg_helpers_tests.rs index 4961bc293d..f2d74ff384 100644 --- a/compute_tools/tests/pg_helpers_tests.rs +++ b/compute_tools/tests/pg_helpers_tests.rs @@ -61,6 +61,23 @@ test.escaping = 'here''s a backslash \\ and a quote '' and a double-quote " hoor assert_eq!(ident.pg_quote(), "\"\"\"name\"\";\\n select 1;\""); } + #[test] + fn ident_pg_quote_dollar() { + let test_cases = vec![ + ("name", ("$$name$$", "x")), + ("name$$", ("$x$name$$$x$", "xx")), + ("name$$$", ("$x$name$$$$x$", "xx")), + ("name$$$$", ("$x$name$$$$$x$", "xx")), + ("name$x$", ("$xx$name$x$$xx$", "xxx")), + ]; + + for (input, expected) in test_cases { + let (escaped, tag) = PgIdent::from(input).pg_quote_dollar(); + assert_eq!(escaped, expected.0); + assert_eq!(tag, expected.1); + } + } + #[test] fn generic_options_search() { let generic_options: GenericOptions = Some(vec![ diff --git a/control_plane/src/background_process.rs b/control_plane/src/background_process.rs index c668e68402..1eac4f7ff0 100644 --- a/control_plane/src/background_process.rs +++ b/control_plane/src/background_process.rs @@ -25,7 +25,7 @@ use anyhow::Context; use camino::{Utf8Path, Utf8PathBuf}; use 
nix::errno::Errno; use nix::fcntl::{FcntlArg, FdFlag}; -use nix::sys::signal::{kill, Signal}; +use nix::sys::signal::{Signal, kill}; use nix::unistd::Pid; use utils::pid_file::{self, PidFileRead}; diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 7d908ccae9..ba1411b615 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -5,7 +5,16 @@ //! easier to work with locally. The python tests in `test_runner` //! rely on `neon_local` to set up the environment for each test. //! -use anyhow::{anyhow, bail, Context, Result}; +use std::borrow::Cow; +use std::collections::{BTreeSet, HashMap}; +use std::fs::File; +use std::os::fd::AsRawFd; +use std::path::PathBuf; +use std::process::exit; +use std::str::FromStr; +use std::time::Duration; + +use anyhow::{Context, Result, anyhow, bail}; use clap::Parser; use compute_api::spec::ComputeMode; use control_plane::endpoint::ComputeControlPlane; @@ -19,7 +28,7 @@ use control_plane::storage_controller::{ NeonStorageControllerStartArgs, NeonStorageControllerStopArgs, StorageController, }; use control_plane::{broker, local_env}; -use nix::fcntl::{flock, FlockArg}; +use nix::fcntl::{FlockArg, flock}; use pageserver_api::config::{ DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT, DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT, @@ -31,27 +40,18 @@ use pageserver_api::models::{ShardParameters, TimelineCreateRequest, TimelineInf use pageserver_api::shard::{ShardCount, ShardStripeSize, TenantShardId}; use postgres_backend::AuthType; use postgres_connection::parse_host_port; +use safekeeper_api::membership::SafekeeperGeneration; use safekeeper_api::{ DEFAULT_HTTP_LISTEN_PORT as DEFAULT_SAFEKEEPER_HTTP_PORT, DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT, }; -use std::borrow::Cow; -use std::collections::{BTreeSet, HashMap}; -use std::fs::File; -use std::os::fd::AsRawFd; -use std::path::PathBuf; -use std::process::exit; -use std::str::FromStr; -use std::time::Duration; use storage_broker::DEFAULT_LISTEN_ADDR as DEFAULT_BROKER_ADDR; use tokio::task::JoinSet; use url::Host; -use utils::{ - auth::{Claims, Scope}, - id::{NodeId, TenantId, TenantTimelineId, TimelineId}, - lsn::Lsn, - project_git_version, -}; +use utils::auth::{Claims, Scope}; +use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId}; +use utils::lsn::Lsn; +use utils::project_git_version; // Default id of a safekeeper node, if not specified on the command line. const DEFAULT_SAFEKEEPER_ID: NodeId = NodeId(1); @@ -597,7 +597,15 @@ struct EndpointStartCmdArgs { #[clap(long = "pageserver-id")] endpoint_pageserver_id: Option, - #[clap(long)] + #[clap( + long, + help = "Safekeepers membership generation to prefix neon.safekeepers with. Normally neon_local sets it on its own, but this option allows to override. Non zero value forces endpoint to use membership configurations." + )] + safekeepers_generation: Option, + #[clap( + long, + help = "List of safekeepers endpoint will talk to. Normally neon_local chooses them on its own, but this option allows to override." 
+ )] safekeepers: Option, #[clap( @@ -618,9 +626,9 @@ struct EndpointStartCmdArgs { )] allow_multiple: bool, - #[clap(short = 't', long, help = "timeout until we fail the command")] - #[arg(default_value = "10s")] - start_timeout: humantime::Duration, + #[clap(short = 't', long, value_parser= humantime::parse_duration, help = "timeout until we fail the command")] + #[arg(default_value = "90s")] + start_timeout: Duration, } #[derive(clap::Args)] @@ -921,7 +929,9 @@ fn handle_init(args: &InitCmdArgs) -> anyhow::Result { let init_conf: NeonLocalInitConf = if let Some(config_path) = &args.config { // User (likely the Python test suite) provided a description of the environment. if args.num_pageservers.is_some() { - bail!("Cannot specify both --num-pageservers and --config, use key `pageservers` in the --config file instead"); + bail!( + "Cannot specify both --num-pageservers and --config, use key `pageservers` in the --config file instead" + ); } // load and parse the file let contents = std::fs::read_to_string(config_path).with_context(|| { @@ -953,6 +963,7 @@ fn handle_init(args: &InitCmdArgs) -> anyhow::Result { id: pageserver_id, listen_pg_addr: format!("127.0.0.1:{pg_port}"), listen_http_addr: format!("127.0.0.1:{http_port}"), + listen_https_addr: None, pg_auth_type: AuthType::Trust, http_auth_type: AuthType::Trust, other: Default::default(), @@ -967,6 +978,7 @@ fn handle_init(args: &InitCmdArgs) -> anyhow::Result { default_tenant_id: TenantId::from_array(std::array::from_fn(|_| 0)), storage_controller: None, control_plane_compute_hook_api: None, + generate_local_ssl_certs: false, } }; @@ -1315,10 +1327,14 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res match (mode, args.hot_standby) { (ComputeMode::Static(_), true) => { - bail!("Cannot start a node in hot standby mode when it is already configured as a static replica") + bail!( + "Cannot start a node in hot standby mode when it is already configured as a static replica" + ) } (ComputeMode::Primary, true) => { - bail!("Cannot start a node as a hot standby replica, it is already configured as primary node") + bail!( + "Cannot start a node as a hot standby replica, it is already configured as primary node" + ) } _ => {} } @@ -1345,6 +1361,7 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res let pageserver_id = args.endpoint_pageserver_id; let remote_ext_config = &args.remote_ext_config; + let safekeepers_generation = args.safekeepers_generation.map(SafekeeperGeneration::new); // If --safekeepers argument is given, use only the listed // safekeeper nodes; otherwise all from the env. let safekeepers = if let Some(safekeepers) = parse_safekeepers(&args.safekeepers)? { @@ -1420,11 +1437,13 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res endpoint .start( &auth_token, + safekeepers_generation, safekeepers, pageservers, remote_ext_config.as_ref(), stripe_size.0 as usize, args.create_test_user, + args.start_timeout, ) .await?; } diff --git a/control_plane/src/broker.rs b/control_plane/src/broker.rs index c8ac5d8981..1b507bb384 100644 --- a/control_plane/src/broker.rs +++ b/control_plane/src/broker.rs @@ -8,7 +8,6 @@ use std::time::Duration; use anyhow::Context; - use camino::Utf8PathBuf; use crate::{background_process, local_env}; diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 407578abb8..b46d616827 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -37,29 +37,24 @@ //! ``` //! 
use std::collections::BTreeMap; -use std::net::IpAddr; -use std::net::Ipv4Addr; -use std::net::SocketAddr; -use std::net::TcpStream; +use std::net::{IpAddr, Ipv4Addr, SocketAddr, TcpStream}; use std::path::PathBuf; use std::process::Command; use std::str::FromStr; use std::sync::Arc; -use std::time::Duration; -use std::time::SystemTime; -use std::time::UNIX_EPOCH; +use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH}; -use anyhow::{anyhow, bail, Context, Result}; +use anyhow::{Context, Result, anyhow, bail}; use compute_api::requests::ConfigurationRequest; -use compute_api::responses::ComputeCtlConfig; -use compute_api::spec::Database; -use compute_api::spec::PgIdent; -use compute_api::spec::RemoteExtSpec; -use compute_api::spec::Role; -use nix::sys::signal::kill; -use nix::sys::signal::Signal; +use compute_api::responses::{ComputeCtlConfig, ComputeStatus, ComputeStatusResponse}; +use compute_api::spec::{ + Cluster, ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, Database, PgIdent, + RemoteExtSpec, Role, +}; +use nix::sys::signal::{Signal, kill}; use pageserver_api::shard::ShardStripeSize; use reqwest::header::CONTENT_TYPE; +use safekeeper_api::membership::SafekeeperGeneration; use serde::{Deserialize, Serialize}; use tracing::debug; use url::Host; @@ -69,9 +64,6 @@ use crate::local_env::LocalEnv; use crate::postgresql_conf::PostgresConf; use crate::storage_controller::StorageController; -use compute_api::responses::{ComputeStatus, ComputeStatusResponse}; -use compute_api::spec::{Cluster, ComputeFeature, ComputeMode, ComputeSpec}; - // contents of a endpoint.json file #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] pub struct EndpointConf { @@ -237,7 +229,9 @@ impl ComputeControlPlane { }); if let Some((key, _)) = duplicates.next() { - bail!("attempting to create a duplicate primary endpoint on tenant {tenant_id}, timeline {timeline_id}: endpoint {key:?} exists already. please don't do this, it is not supported."); + bail!( + "attempting to create a duplicate primary endpoint on tenant {tenant_id}, timeline {timeline_id}: endpoint {key:?} exists already. please don't do this, it is not supported." 
+ ); } } Ok(()) @@ -584,14 +578,17 @@ impl Endpoint { Ok(safekeeper_connstrings) } + #[allow(clippy::too_many_arguments)] pub async fn start( &self, auth_token: &Option, + safekeepers_generation: Option, safekeepers: Vec, pageservers: Vec<(Host, u16)>, remote_ext_config: Option<&String>, shard_stripe_size: usize, create_test_user: bool, + start_timeout: Duration, ) -> Result<()> { if self.status() == EndpointStatus::Running { anyhow::bail!("The endpoint is already running"); @@ -663,6 +660,7 @@ impl Endpoint { timeline_id: Some(self.timeline_id), mode: self.mode, pageserver_connstring: Some(pageserver_connstring), + safekeepers_generation: safekeepers_generation.map(|g| g.into_inner()), safekeeper_connstrings, storage_auth_token: auth_token.clone(), remote_extensions, @@ -671,6 +669,7 @@ impl Endpoint { local_proxy_config: None, reconfigure_concurrency: self.reconfigure_concurrency, drop_subscriptions_before_start: self.drop_subscriptions_before_start, + audit_log_level: ComputeAudit::Disabled, }; // this strange code is needed to support respec() in tests @@ -778,17 +777,18 @@ impl Endpoint { std::fs::write(pidfile_path, pid.to_string())?; // Wait for it to start - let mut attempt = 0; const ATTEMPT_INTERVAL: Duration = Duration::from_millis(100); - const MAX_ATTEMPTS: u32 = 10 * 90; // Wait up to 1.5 min + let start_at = Instant::now(); loop { - attempt += 1; match self.get_status().await { Ok(state) => { match state.status { ComputeStatus::Init => { - if attempt == MAX_ATTEMPTS { - bail!("compute startup timed out; still in Init state"); + if Instant::now().duration_since(start_at) > start_timeout { + bail!( + "compute startup timed out {:?}; still in Init state", + start_timeout + ); } // keep retrying } @@ -815,8 +815,11 @@ impl Endpoint { } } Err(e) => { - if attempt == MAX_ATTEMPTS { - return Err(e).context("timed out waiting to connect to compute_ctl HTTP"); + if Instant::now().duration_since(start_at) > start_timeout { + return Err(e).context(format!( + "timed out {:?} waiting to connect to compute_ctl HTTP", + start_timeout, + )); } } } diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 2fe4cd5202..ec9eb74e6f 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -3,28 +3,22 @@ //! Now it also provides init method which acts like a stub for proper installation //! script which will use local paths. 
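The endpoint startup wait above replaces a fixed attempt counter with a wall-clock deadline taken from the new `--start-timeout` flag. A minimal sketch of that deadline-retry shape (the `get_status` stand-in and its boolean result are hypothetical):

```rust
use std::time::{Duration, Instant};

// Hypothetical stand-in for polling compute_ctl's status endpoint.
async fn get_status() -> anyhow::Result<bool> {
    Ok(false) // pretend the compute is still initializing
}

async fn wait_until_ready(start_timeout: Duration) -> anyhow::Result<()> {
    const ATTEMPT_INTERVAL: Duration = Duration::from_millis(100);
    let start_at = Instant::now();
    loop {
        match get_status().await {
            Ok(true) => return Ok(()),
            // Not ready yet, or not reachable yet: retry until the deadline passes.
            Ok(false) | Err(_) => {
                if start_at.elapsed() > start_timeout {
                    anyhow::bail!("compute startup timed out after {start_timeout:?}");
                }
            }
        }
        tokio::time::sleep(ATTEMPT_INTERVAL).await;
    }
}
```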
-use anyhow::{bail, Context}; +use std::collections::HashMap; +use std::net::{IpAddr, Ipv4Addr, SocketAddr}; +use std::path::{Path, PathBuf}; +use std::process::{Command, Stdio}; +use std::time::Duration; +use std::{env, fs}; +use anyhow::{Context, bail}; use clap::ValueEnum; use postgres_backend::AuthType; use reqwest::Url; use serde::{Deserialize, Serialize}; -use std::collections::HashMap; -use std::env; -use std::fs; -use std::net::IpAddr; -use std::net::Ipv4Addr; -use std::net::SocketAddr; -use std::path::{Path, PathBuf}; -use std::process::{Command, Stdio}; -use std::time::Duration; -use utils::{ - auth::{encode_from_key_file, Claims}, - id::{NodeId, TenantId, TenantTimelineId, TimelineId}, -}; +use utils::auth::{Claims, encode_from_key_file}; +use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId}; -use crate::pageserver::PageServerNode; -use crate::pageserver::PAGESERVER_REMOTE_STORAGE_DIR; +use crate::pageserver::{PAGESERVER_REMOTE_STORAGE_DIR, PageServerNode}; use crate::safekeeper::SafekeeperNode; pub const DEFAULT_PG_VERSION: u32 = 16; @@ -87,6 +81,10 @@ pub struct LocalEnv { // but deserialization into a generic toml object as `toml::Value::try_from` fails with an error. // https://toml.io/en/v1.0.0 does not contain a concept of "a table inside another table". pub branch_name_mappings: HashMap>, + + /// Flag to generate SSL certificates for components that need it. + /// Also generates root CA certificate that is used to sign all other certificates. + pub generate_local_ssl_certs: bool, } /// On-disk state stored in `.neon/config`. @@ -108,6 +106,10 @@ pub struct OnDiskConfig { pub control_plane_api: Option, pub control_plane_compute_hook_api: Option, branch_name_mappings: HashMap>, + // Note: skip serializing because in compat tests old storage controller fails + // to load new config file. May be removed after this field is in release branch. + #[serde(skip_serializing_if = "std::ops::Not::not")] + pub generate_local_ssl_certs: bool, } fn fail_if_pageservers_field_specified<'de, D>(_: D) -> Result, D::Error> @@ -135,6 +137,7 @@ pub struct NeonLocalInitConf { pub safekeepers: Vec, pub control_plane_api: Option, pub control_plane_compute_hook_api: Option>, + pub generate_local_ssl_certs: bool, } /// Broker config for cluster internal communication. 
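The `OnDiskConfig` hunk above keeps the config file loadable by older binaries by not serializing `generate_local_ssl_certs` when it is false. A small serde sketch of that trick, using JSON instead of the real `.neon/config` TOML and an illustrative struct; `default` is added here so documents without the field also round-trip:

```rust
use serde::{Deserialize, Serialize};

#[derive(Serialize, Deserialize, Debug, Default)]
struct OnDiskConfigSketch {
    name: String,
    // Written only when true, so files produced with the flag off look exactly
    // like what older readers expect; `default` lets files that predate the
    // field still deserialize.
    #[serde(default, skip_serializing_if = "std::ops::Not::not")]
    generate_local_ssl_certs: bool,
}

fn main() -> anyhow::Result<()> {
    let off = OnDiskConfigSketch { name: "pageserver".into(), generate_local_ssl_certs: false };
    // The flag is omitted entirely when false...
    assert_eq!(serde_json::to_string(&off)?, r#"{"name":"pageserver"}"#);
    // ...and an old-style document without the field still parses.
    let old: OnDiskConfigSketch = serde_json::from_str(r#"{"name":"pageserver"}"#)?;
    assert!(!old.generate_local_ssl_certs);
    Ok(())
}
```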
@@ -171,6 +174,11 @@ pub struct NeonStorageControllerConf { #[serde(with = "humantime_serde")] pub long_reconcile_threshold: Option, + + #[serde(default)] + pub use_https_pageserver_api: bool, + + pub timelines_onto_safekeepers: bool, } impl NeonStorageControllerConf { @@ -194,6 +202,8 @@ impl Default for NeonStorageControllerConf { max_secondary_lag_bytes: None, heartbeat_interval: Self::DEFAULT_HEARTBEAT_INTERVAL, long_reconcile_threshold: None, + use_https_pageserver_api: false, + timelines_onto_safekeepers: false, } } } @@ -223,6 +233,7 @@ pub struct PageServerConf { pub id: NodeId, pub listen_pg_addr: String, pub listen_http_addr: String, + pub listen_https_addr: Option, pub pg_auth_type: AuthType, pub http_auth_type: AuthType, pub no_sync: bool, @@ -234,6 +245,7 @@ impl Default for PageServerConf { id: NodeId(0), listen_pg_addr: String::new(), listen_http_addr: String::new(), + listen_https_addr: None, pg_auth_type: AuthType::Trust, http_auth_type: AuthType::Trust, no_sync: false, @@ -249,6 +261,7 @@ pub struct NeonLocalInitPageserverConf { pub id: NodeId, pub listen_pg_addr: String, pub listen_http_addr: String, + pub listen_https_addr: Option, pub pg_auth_type: AuthType, pub http_auth_type: AuthType, #[serde(default, skip_serializing_if = "std::ops::Not::not")] @@ -263,6 +276,7 @@ impl From<&NeonLocalInitPageserverConf> for PageServerConf { id, listen_pg_addr, listen_http_addr, + listen_https_addr, pg_auth_type, http_auth_type, no_sync, @@ -272,6 +286,7 @@ impl From<&NeonLocalInitPageserverConf> for PageServerConf { id: *id, listen_pg_addr: listen_pg_addr.clone(), listen_http_addr: listen_http_addr.clone(), + listen_https_addr: listen_https_addr.clone(), pg_auth_type: *pg_auth_type, http_auth_type: *http_auth_type, no_sync: *no_sync, @@ -416,6 +431,41 @@ impl LocalEnv { } } + pub fn ssl_ca_cert_path(&self) -> Option { + if self.generate_local_ssl_certs { + Some(self.base_data_dir.join("rootCA.crt")) + } else { + None + } + } + + pub fn ssl_ca_key_path(&self) -> Option { + if self.generate_local_ssl_certs { + Some(self.base_data_dir.join("rootCA.key")) + } else { + None + } + } + + pub fn generate_ssl_ca_cert(&self) -> anyhow::Result<()> { + let cert_path = self.ssl_ca_cert_path().unwrap(); + let key_path = self.ssl_ca_key_path().unwrap(); + if !fs::exists(cert_path.as_path())? 
{ + generate_ssl_ca_cert(cert_path.as_path(), key_path.as_path())?; + } + Ok(()) + } + + pub fn generate_ssl_cert(&self, cert_path: &Path, key_path: &Path) -> anyhow::Result<()> { + self.generate_ssl_ca_cert()?; + generate_ssl_cert( + cert_path, + key_path, + self.ssl_ca_cert_path().unwrap().as_path(), + self.ssl_ca_key_path().unwrap().as_path(), + ) + } + /// Inspect the base data directory and extract the instance id and instance directory path /// for all storage controller instances pub async fn storage_controller_instances(&self) -> std::io::Result> { @@ -465,7 +515,9 @@ impl LocalEnv { if old_timeline_id == &timeline_id { Ok(()) } else { - bail!("branch '{branch_name}' is already mapped to timeline {old_timeline_id}, cannot map to another timeline {timeline_id}"); + bail!( + "branch '{branch_name}' is already mapped to timeline {old_timeline_id}, cannot map to another timeline {timeline_id}" + ); } } else { existing_values.push((tenant_id, timeline_id)); @@ -523,6 +575,7 @@ impl LocalEnv { control_plane_api, control_plane_compute_hook_api, branch_name_mappings, + generate_local_ssl_certs, } = on_disk_config; LocalEnv { base_data_dir: repopath.to_owned(), @@ -537,6 +590,7 @@ impl LocalEnv { control_plane_api: control_plane_api.unwrap(), control_plane_compute_hook_api, branch_name_mappings, + generate_local_ssl_certs, } }; @@ -572,6 +626,7 @@ impl LocalEnv { struct PageserverConfigTomlSubset { listen_pg_addr: String, listen_http_addr: String, + listen_https_addr: Option, pg_auth_type: AuthType, http_auth_type: AuthType, #[serde(default)] @@ -596,6 +651,7 @@ impl LocalEnv { let PageserverConfigTomlSubset { listen_pg_addr, listen_http_addr, + listen_https_addr, pg_auth_type, http_auth_type, no_sync, @@ -613,6 +669,7 @@ impl LocalEnv { }, listen_pg_addr, listen_http_addr, + listen_https_addr, pg_auth_type, http_auth_type, no_sync, @@ -640,6 +697,7 @@ impl LocalEnv { control_plane_api: Some(self.control_plane_api.clone()), control_plane_compute_hook_api: self.control_plane_compute_hook_api.clone(), branch_name_mappings: self.branch_name_mappings.clone(), + generate_local_ssl_certs: self.generate_local_ssl_certs, }, ) } @@ -722,6 +780,7 @@ impl LocalEnv { safekeepers, control_plane_api, control_plane_compute_hook_api, + generate_local_ssl_certs, } = conf; // Find postgres binaries. 
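The root CA generated by the helpers above and below is only useful if clients trust it explicitly. A hedged sketch of the client side with reqwest, mirroring what the pageserver and storcon_cli hunks later do with `Certificate::from_pem` (the path argument is illustrative):

```rust
use reqwest::Certificate;

/// Build an HTTPS client that trusts a locally generated root CA,
/// e.g. the `rootCA.crt` written under the base data dir.
async fn https_client_with_local_ca(ca_path: &std::path::Path) -> anyhow::Result<reqwest::Client> {
    let pem = tokio::fs::read(ca_path).await?;
    let ca = Certificate::from_pem(&pem)?;
    let client = reqwest::Client::builder()
        .add_root_certificate(ca)
        .build()?;
    Ok(client)
}
```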
@@ -770,8 +829,13 @@ impl LocalEnv { control_plane_api: control_plane_api.unwrap(), control_plane_compute_hook_api: control_plane_compute_hook_api.unwrap_or_default(), branch_name_mappings: Default::default(), + generate_local_ssl_certs, }; + if generate_local_ssl_certs { + env.generate_ssl_ca_cert()?; + } + // create endpoints dir fs::create_dir_all(env.endpoints_path())?; @@ -855,3 +919,80 @@ fn generate_auth_keys(private_key_path: &Path, public_key_path: &Path) -> anyhow } Ok(()) } + +fn generate_ssl_ca_cert(cert_path: &Path, key_path: &Path) -> anyhow::Result<()> { + // openssl req -x509 -newkey rsa:2048 -nodes -subj "/CN=Neon Local CA" -days 36500 \ + // -out rootCA.crt -keyout rootCA.key + let keygen_output = Command::new("openssl") + .args([ + "req", "-x509", "-newkey", "rsa:2048", "-nodes", "-days", "36500", + ]) + .args(["-subj", "/CN=Neon Local CA"]) + .args(["-out", cert_path.to_str().unwrap()]) + .args(["-keyout", key_path.to_str().unwrap()]) + .output() + .context("failed to generate CA certificate")?; + if !keygen_output.status.success() { + bail!( + "openssl failed: '{}'", + String::from_utf8_lossy(&keygen_output.stderr) + ); + } + Ok(()) +} + +fn generate_ssl_cert( + cert_path: &Path, + key_path: &Path, + ca_cert_path: &Path, + ca_key_path: &Path, +) -> anyhow::Result<()> { + // Generate Certificate Signing Request (CSR). + let mut csr_path = cert_path.to_path_buf(); + csr_path.set_extension(".csr"); + + // openssl req -new -nodes -newkey rsa:2048 -keyout server.key -out server.csr \ + // -subj "/CN=localhost" -addext "subjectAltName=DNS:localhost,IP:127.0.0.1" + let keygen_output = Command::new("openssl") + .args(["req", "-new", "-nodes"]) + .args(["-newkey", "rsa:2048"]) + .args(["-subj", "/CN=localhost"]) + .args(["-addext", "subjectAltName=DNS:localhost,IP:127.0.0.1"]) + .args(["-keyout", key_path.to_str().unwrap()]) + .args(["-out", csr_path.to_str().unwrap()]) + .output() + .context("failed to generate CSR")?; + if !keygen_output.status.success() { + bail!( + "openssl failed: '{}'", + String::from_utf8_lossy(&keygen_output.stderr) + ); + } + + // Sign CSR with CA key. + // + // openssl x509 -req -in server.csr -CA rootCA.crt -CAkey rootCA.key -CAcreateserial \ + // -out server.crt -days 36500 -copy_extensions copyall + let keygen_output = Command::new("openssl") + .args(["x509", "-req"]) + .args(["-in", csr_path.to_str().unwrap()]) + .args(["-CA", ca_cert_path.to_str().unwrap()]) + .args(["-CAkey", ca_key_path.to_str().unwrap()]) + .arg("-CAcreateserial") + .args(["-out", cert_path.to_str().unwrap()]) + .args(["-days", "36500"]) + .args(["-copy_extensions", "copyall"]) + .output() + .context("failed to sign CSR")?; + if !keygen_output.status.success() { + bail!( + "openssl failed: '{}'", + String::from_utf8_lossy(&keygen_output.stderr) + ); + } + + // Remove CSR file as it's not needed anymore. + fs::remove_file(csr_path)?; + + Ok(()) +} diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 2bf89b7bfa..eeaad10d26 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -7,7 +7,6 @@ //! ``` //! 
use std::collections::HashMap; - use std::io; use std::io::Write; use std::num::NonZeroU64; @@ -15,22 +14,20 @@ use std::path::PathBuf; use std::str::FromStr; use std::time::Duration; -use anyhow::{bail, Context}; +use anyhow::{Context, bail}; use camino::Utf8PathBuf; use pageserver_api::models::{self, TenantInfo, TimelineInfo}; use pageserver_api::shard::TenantShardId; use pageserver_client::mgmt_api; use postgres_backend::AuthType; -use postgres_connection::{parse_host_port, PgConnectionConfig}; +use postgres_connection::{PgConnectionConfig, parse_host_port}; +use reqwest::Certificate; use utils::auth::{Claims, Scope}; -use utils::id::NodeId; -use utils::{ - id::{TenantId, TimelineId}, - lsn::Lsn, -}; +use utils::id::{NodeId, TenantId, TimelineId}; +use utils::lsn::Lsn; -use crate::local_env::{NeonLocalInitPageserverConf, PageServerConf}; -use crate::{background_process, local_env::LocalEnv}; +use crate::background_process; +use crate::local_env::{LocalEnv, NeonLocalInitPageserverConf, PageServerConf}; /// Directory within .neon which will be used by default for LocalFs remote storage. pub const PAGESERVER_REMOTE_STORAGE_DIR: &str = "local_fs_remote_storage/pageserver"; @@ -53,12 +50,29 @@ impl PageServerNode { let (host, port) = parse_host_port(&conf.listen_pg_addr).expect("Unable to parse listen_pg_addr"); let port = port.unwrap_or(5432); + + let ssl_ca_cert = env.ssl_ca_cert_path().map(|ssl_ca_file| { + let buf = std::fs::read(ssl_ca_file).expect("SSL root CA file should exist"); + Certificate::from_pem(&buf).expect("CA certificate should be valid") + }); + + let endpoint = if env.storage_controller.use_https_pageserver_api { + format!( + "https://{}", + conf.listen_https_addr.as_ref().expect( + "listen https address should be specified if use_https_pageserver_api is on" + ) + ) + } else { + format!("http://{}", conf.listen_http_addr) + }; + Self { pg_connection_config: PgConnectionConfig::new_host_port(host, port), conf: conf.clone(), env: env.clone(), http_client: mgmt_api::Client::new( - format!("http://{}", conf.listen_http_addr), + endpoint, { match conf.http_auth_type { AuthType::Trust => None, @@ -69,7 +83,9 @@ impl PageServerNode { } } .as_deref(), - ), + ssl_ca_cert, + ) + .expect("Client constructs with no errors"), } } @@ -81,7 +97,11 @@ impl PageServerNode { &self, conf: NeonLocalInitPageserverConf, ) -> anyhow::Result { - assert_eq!(&PageServerConf::from(&conf), &self.conf, "during neon_local init, we derive the runtime state of ps conf (self.conf) from the --config flag fully"); + assert_eq!( + &PageServerConf::from(&conf), + &self.conf, + "during neon_local init, we derive the runtime state of ps conf (self.conf) from the --config flag fully" + ); // TODO(christian): instead of what we do here, create a pageserver_api::config::ConfigToml (PR #7656) @@ -220,6 +240,13 @@ impl PageServerNode { .context("write identity toml")?; drop(identity_toml); + if self.env.generate_local_ssl_certs { + self.env.generate_ssl_cert( + datadir.join("server.crt").as_path(), + datadir.join("server.key").as_path(), + )?; + } + // TODO: invoke a TBD config-check command to validate that pageserver will start with the written config // Write metadata file, used by pageserver on startup to register itself with @@ -230,6 +257,15 @@ impl PageServerNode { parse_host_port(&self.conf.listen_http_addr).expect("Unable to parse listen_http_addr"); let http_port = http_port.unwrap_or(9898); + let https_port = match self.conf.listen_https_addr.as_ref() { + Some(https_addr) => { + let (_https_host, 
https_port) = + parse_host_port(https_addr).expect("Unable to parse listen_https_addr"); + Some(https_port.unwrap_or(9899)) + } + None => None, + }; + // Intentionally hand-craft JSON: this acts as an implicit format compat test // in case the pageserver-side structure is edited, and reflects the real life // situation: the metadata is written by some other script. @@ -240,6 +276,7 @@ impl PageServerNode { postgres_port: self.pg_connection_config.port(), http_host: "localhost".to_string(), http_port, + https_port, other: HashMap::from([( "availability_zone_id".to_string(), serde_json::json!(az_id), diff --git a/control_plane/src/postgresql_conf.rs b/control_plane/src/postgresql_conf.rs index 5aee12dc97..a824af9490 100644 --- a/control_plane/src/postgresql_conf.rs +++ b/control_plane/src/postgresql_conf.rs @@ -1,3 +1,6 @@ +use std::collections::HashMap; +use std::fmt; + /// /// Module for parsing postgresql.conf file. /// @@ -6,8 +9,6 @@ /// funny stuff like include-directives or funny escaping. use once_cell::sync::Lazy; use regex::Regex; -use std::collections::HashMap; -use std::fmt; /// In-memory representation of a postgresql.conf file #[derive(Default, Debug)] diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index ce7751fb14..70915d5aaf 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -14,18 +14,15 @@ use std::{io, result}; use anyhow::Context; use camino::Utf8PathBuf; +use http_utils::error::HttpErrorBody; use postgres_connection::PgConnectionConfig; use reqwest::{IntoUrl, Method}; use thiserror::Error; - -use http_utils::error::HttpErrorBody; use utils::auth::{Claims, Scope}; use utils::id::NodeId; -use crate::{ - background_process, - local_env::{LocalEnv, SafekeeperConf}, -}; +use crate::background_process; +use crate::local_env::{LocalEnv, SafekeeperConf}; #[derive(Error, Debug)] pub enum SafekeeperHttpError { diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index 0fadb9c5fe..439d7936a7 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -1,44 +1,36 @@ -use crate::{ - background_process, - local_env::{LocalEnv, NeonStorageControllerConf}, -}; +use std::ffi::OsStr; +use std::fs; +use std::net::SocketAddr; +use std::path::PathBuf; +use std::process::ExitStatus; +use std::str::FromStr; +use std::sync::OnceLock; +use std::time::{Duration, Instant}; + use camino::{Utf8Path, Utf8PathBuf}; use hyper0::Uri; use nix::unistd::Pid; -use pageserver_api::{ - controller_api::{ - NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, TenantCreateRequest, - TenantCreateResponse, TenantLocateResponse, TenantShardMigrateRequest, - TenantShardMigrateResponse, - }, - models::{ - TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo, - }, - shard::{ShardStripeSize, TenantShardId}, +use pageserver_api::controller_api::{ + NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, TenantCreateRequest, + TenantCreateResponse, TenantLocateResponse, }; +use pageserver_api::models::{TimelineCreateRequest, TimelineInfo}; +use pageserver_api::shard::TenantShardId; use pageserver_client::mgmt_api::ResponseErrorMessageExt; use postgres_backend::AuthType; use reqwest::Method; -use serde::{de::DeserializeOwned, Deserialize, Serialize}; -use std::{ - ffi::OsStr, - fs, - net::SocketAddr, - path::PathBuf, - process::ExitStatus, - str::FromStr, - sync::OnceLock, - time::{Duration, Instant}, -}; +use 
serde::de::DeserializeOwned; +use serde::{Deserialize, Serialize}; use tokio::process::Command; use tracing::instrument; use url::Url; -use utils::{ - auth::{encode_from_key_file, Claims, Scope}, - id::{NodeId, TenantId}, -}; +use utils::auth::{Claims, Scope, encode_from_key_file}; +use utils::id::{NodeId, TenantId}; use whoami::username; +use crate::background_process; +use crate::local_env::{LocalEnv, NeonStorageControllerConf}; + pub struct StorageController { env: LocalEnv, private_key: Option>, @@ -96,7 +88,8 @@ pub struct AttachHookRequest { #[derive(Serialize, Deserialize)] pub struct AttachHookResponse { - pub gen: Option, + #[serde(rename = "gen")] + pub generation: Option, } #[derive(Serialize, Deserialize)] @@ -541,6 +534,14 @@ impl StorageController { args.push("--start-as-candidate".to_string()); } + if self.config.use_https_pageserver_api { + args.push("--use-https-pageserver-api".to_string()); + } + + if let Some(ssl_ca_file) = self.env.ssl_ca_cert_path() { + args.push(format!("--ssl-ca-file={}", ssl_ca_file.to_str().unwrap())); + } + if let Some(private_key) = &self.private_key { let claims = Claims::new(None, Scope::PageServerApi); let jwt_token = @@ -583,6 +584,10 @@ impl StorageController { self.env.base_data_dir.display() )); + if self.config.timelines_onto_safekeepers { + args.push("--timelines-onto-safekeepers".to_string()); + } + background_process::start_process( COMMAND, &instance_dir, @@ -779,7 +784,7 @@ impl StorageController { ) .await?; - Ok(response.gen) + Ok(response.generation) } #[instrument(skip(self))] @@ -829,41 +834,6 @@ impl StorageController { .await } - #[instrument(skip(self))] - pub async fn tenant_migrate( - &self, - tenant_shard_id: TenantShardId, - node_id: NodeId, - ) -> anyhow::Result { - self.dispatch( - Method::PUT, - format!("control/v1/tenant/{tenant_shard_id}/migrate"), - Some(TenantShardMigrateRequest { - node_id, - migration_config: None, - }), - ) - .await - } - - #[instrument(skip(self), fields(%tenant_id, %new_shard_count))] - pub async fn tenant_split( - &self, - tenant_id: TenantId, - new_shard_count: u8, - new_stripe_size: Option, - ) -> anyhow::Result { - self.dispatch( - Method::PUT, - format!("control/v1/tenant/{tenant_id}/shard_split"), - Some(TenantShardSplitRequest { - new_shard_count, - new_stripe_size, - }), - ) - .await - } - #[instrument(skip_all, fields(node_id=%req.node_id))] pub async fn node_register(&self, req: NodeRegisterRequest) -> anyhow::Result<()> { self.dispatch::<_, ()>(Method::POST, "control/v1/node".to_string(), Some(req)) diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index 40b86e4110..b5c4f21e97 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -1,34 +1,28 @@ -use futures::StreamExt; -use std::{ - collections::{HashMap, HashSet}, - str::FromStr, - time::Duration, -}; +use std::collections::{HashMap, HashSet}; +use std::path::PathBuf; +use std::str::FromStr; +use std::time::Duration; use clap::{Parser, Subcommand}; -use pageserver_api::{ - controller_api::{ - AvailabilityZone, NodeAvailabilityWrapper, NodeDescribeResponse, NodeShardResponse, - SafekeeperDescribeResponse, SafekeeperSchedulingPolicyRequest, ShardSchedulingPolicy, - ShardsPreferredAzsRequest, ShardsPreferredAzsResponse, SkSchedulingPolicy, - TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest, - }, - models::{ - EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary, - ShardParameters, TenantConfig, 
TenantConfigPatchRequest, TenantConfigRequest, - TenantShardSplitRequest, TenantShardSplitResponse, - }, - shard::{ShardStripeSize, TenantShardId}, -}; -use pageserver_client::mgmt_api::{self}; -use reqwest::{Method, StatusCode, Url}; -use utils::id::{NodeId, TenantId, TimelineId}; - +use futures::StreamExt; use pageserver_api::controller_api::{ - NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy, + AvailabilityZone, MigrationConfig, NodeAvailabilityWrapper, NodeConfigureRequest, + NodeDescribeResponse, NodeRegisterRequest, NodeSchedulingPolicy, NodeShardResponse, + PlacementPolicy, SafekeeperDescribeResponse, SafekeeperSchedulingPolicyRequest, + ShardSchedulingPolicy, ShardsPreferredAzsRequest, ShardsPreferredAzsResponse, + SkSchedulingPolicy, TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest, TenantShardMigrateRequest, TenantShardMigrateResponse, }; +use pageserver_api::models::{ + EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary, ShardParameters, + TenantConfig, TenantConfigPatchRequest, TenantConfigRequest, TenantShardSplitRequest, + TenantShardSplitResponse, +}; +use pageserver_api::shard::{ShardStripeSize, TenantShardId}; +use pageserver_client::mgmt_api::{self}; +use reqwest::{Method, StatusCode, Url}; use storage_controller_client::control_api::Client; +use utils::id::{NodeId, TenantId, TimelineId}; #[derive(Subcommand, Debug)] enum Command { @@ -119,6 +113,15 @@ enum Command { tenant_shard_id: TenantShardId, #[arg(long)] node: NodeId, + #[arg(long, default_value_t = true, action = clap::ArgAction::Set)] + prewarm: bool, + #[arg(long, default_value_t = false, action = clap::ArgAction::Set)] + override_scheduler: bool, + }, + /// Watch the location of a tenant shard evolve, e.g. while expecting it to migrate + TenantShardWatch { + #[arg(long)] + tenant_shard_id: TenantShardId, }, /// Migrate the secondary location for a tenant shard to a specific pageserver. TenantShardMigrateSecondary { @@ -276,6 +279,10 @@ struct Cli { /// a token with both scopes to use with this tool. jwt: Option, + #[arg(long)] + /// Trusted root CA certificate to use in https APIs. + ssl_ca_file: Option, + #[command(subcommand)] command: Command, } @@ -386,9 +393,17 @@ async fn main() -> anyhow::Result<()> { let storcon_client = Client::new(cli.api.clone(), cli.jwt.clone()); + let ssl_ca_cert = match &cli.ssl_ca_file { + Some(ssl_ca_file) => { + let buf = tokio::fs::read(ssl_ca_file).await?; + Some(reqwest::Certificate::from_pem(&buf)?) 
+ } + None => None, + }; + let mut trimmed = cli.api.to_string(); trimmed.pop(); - let vps_client = mgmt_api::Client::new(trimmed, cli.jwt.as_deref()); + let vps_client = mgmt_api::Client::new(trimmed, cli.jwt.as_deref(), ssl_ca_cert)?; match cli.command { Command::NodeRegister { @@ -626,19 +641,43 @@ async fn main() -> anyhow::Result<()> { Command::TenantShardMigrate { tenant_shard_id, node, + prewarm, + override_scheduler, } => { - let req = TenantShardMigrateRequest { - node_id: node, - migration_config: None, + let migration_config = MigrationConfig { + prewarm, + override_scheduler, + ..Default::default() }; - storcon_client + let req = TenantShardMigrateRequest { + node_id: node, + origin_node_id: None, + migration_config, + }; + + match storcon_client .dispatch::( Method::PUT, format!("control/v1/tenant/{tenant_shard_id}/migrate"), Some(req), ) - .await?; + .await + { + Err(mgmt_api::Error::ApiError(StatusCode::PRECONDITION_FAILED, msg)) => { + anyhow::bail!( + "Migration to {node} rejected, may require `--force` ({}) ", + msg + ); + } + Err(e) => return Err(e.into()), + Ok(_) => {} + } + + watch_tenant_shard(storcon_client, tenant_shard_id, Some(node)).await?; + } + Command::TenantShardWatch { tenant_shard_id } => { + watch_tenant_shard(storcon_client, tenant_shard_id, None).await?; } Command::TenantShardMigrateSecondary { tenant_shard_id, @@ -646,7 +685,8 @@ async fn main() -> anyhow::Result<()> { } => { let req = TenantShardMigrateRequest { node_id: node, - migration_config: None, + origin_node_id: None, + migration_config: MigrationConfig::default(), }; storcon_client @@ -921,7 +961,9 @@ async fn main() -> anyhow::Result<()> { } Command::TenantDrop { tenant_id, unclean } => { if !unclean { - anyhow::bail!("This command is not a tenant deletion, and uncleanly drops all controller state for the tenant. If you know what you're doing, add `--unclean` to proceed.") + anyhow::bail!( + "This command is not a tenant deletion, and uncleanly drops all controller state for the tenant. If you know what you're doing, add `--unclean` to proceed." + ) } storcon_client .dispatch::<(), ()>( @@ -933,7 +975,9 @@ async fn main() -> anyhow::Result<()> { } Command::NodeDrop { node_id, unclean } => { if !unclean { - anyhow::bail!("This command is not a clean node decommission, and uncleanly drops all controller state for the node, without checking if any tenants still refer to it. If you know what you're doing, add `--unclean` to proceed.") + anyhow::bail!( + "This command is not a clean node decommission, and uncleanly drops all controller state for the node, without checking if any tenants still refer to it. If you know what you're doing, add `--unclean` to proceed." 
+ ) } storcon_client .dispatch::<(), ()>(Method::POST, format!("debug/v1/node/{node_id}/drop"), None) @@ -1108,7 +1152,8 @@ async fn main() -> anyhow::Result<()> { format!("control/v1/tenant/{}/migrate", mv.tenant_shard_id), Some(TenantShardMigrateRequest { node_id: mv.to, - migration_config: None, + origin_node_id: Some(mv.from), + migration_config: MigrationConfig::default(), }), ) .await @@ -1287,3 +1332,68 @@ async fn main() -> anyhow::Result<()> { Ok(()) } + +static WATCH_INTERVAL: Duration = Duration::from_secs(5); + +async fn watch_tenant_shard( + storcon_client: Client, + tenant_shard_id: TenantShardId, + until_migrated_to: Option, +) -> anyhow::Result<()> { + if let Some(until_migrated_to) = until_migrated_to { + println!( + "Waiting for tenant shard {} to be migrated to node {}", + tenant_shard_id, until_migrated_to + ); + } + + loop { + let desc = storcon_client + .dispatch::<(), TenantDescribeResponse>( + Method::GET, + format!("control/v1/tenant/{}", tenant_shard_id.tenant_id), + None, + ) + .await?; + + // Output the current state of the tenant shard + let shard = desc + .shards + .iter() + .find(|s| s.tenant_shard_id == tenant_shard_id) + .ok_or(anyhow::anyhow!("Tenant shard not found"))?; + let summary = format!( + "attached: {} secondary: {} {}", + shard + .node_attached + .map(|n| format!("{}", n)) + .unwrap_or("none".to_string()), + shard + .node_secondary + .iter() + .map(|n| n.to_string()) + .collect::>() + .join(","), + if shard.is_reconciling { + "(reconciler active)" + } else { + "(reconciler idle)" + } + ); + println!("{}", summary); + + // Maybe drop out if we finished migration + if let Some(until_migrated_to) = until_migrated_to { + if shard.node_attached == Some(until_migrated_to) && !shard.is_reconciling { + println!( + "Tenant shard {} is now on node {}", + tenant_shard_id, until_migrated_to + ); + break; + } + } + + tokio::time::sleep(WATCH_INTERVAL).await; + } + Ok(()) +} diff --git a/deny.toml b/deny.toml index b551405568..ed7aa9ef9f 100644 --- a/deny.toml +++ b/deny.toml @@ -27,6 +27,10 @@ yanked = "warn" id = "RUSTSEC-2023-0071" reason = "the marvin attack only affects private key decryption, not public key signature verification" +[[advisories.ignore]] +id = "RUSTSEC-2024-0436" +reason = "The paste crate is a build-only dependency with no runtime components. It is unlikely to have any security impact." 
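The `watch_tenant_shard` helper above is an instance of a poll-until-done loop with a fixed interval. A generic sketch of that shape, assuming tokio and an arbitrary async predicate (names are illustrative):

```rust
use std::time::Duration;

/// Poll `check` every `interval` until it reports completion, mirroring the
/// watch loop above, which polls the tenant describe endpoint every 5 seconds.
async fn poll_until<F, Fut>(interval: Duration, mut check: F) -> anyhow::Result<()>
where
    F: FnMut() -> Fut,
    Fut: std::future::Future<Output = anyhow::Result<bool>>,
{
    loop {
        if check().await? {
            return Ok(());
        }
        tokio::time::sleep(interval).await;
    }
}
```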
+ # This section is considered when running `cargo deny check licenses` # More documentation for the licenses section can be found here: # https://embarkstudios.github.io/cargo-deny/checks/licenses/cfg.html diff --git a/docker-compose/docker-compose.yml b/docker-compose/docker-compose.yml index 489d60f38c..95d4ff7b2a 100644 --- a/docker-compose/docker-compose.yml +++ b/docker-compose/docker-compose.yml @@ -186,7 +186,7 @@ services: neon-test-extensions: profiles: ["test-extensions"] - image: ${REPOSITORY:-neondatabase}/neon-test-extensions-v${PG_TEST_VERSION:-16}:${TAG:-latest} + image: ${REPOSITORY:-neondatabase}/neon-test-extensions-v${PG_TEST_VERSION:-16}:${TEST_EXTENSIONS_TAG:-${TAG:-latest}} environment: - PGPASSWORD=cloud_admin entrypoint: diff --git a/docker-compose/ext-src/pgtap-src/test-upgrade.patch b/docker-compose/ext-src/pgtap-src/test-upgrade.patch index a4c46e93ce..c050ab8d00 100644 --- a/docker-compose/ext-src/pgtap-src/test-upgrade.patch +++ b/docker-compose/ext-src/pgtap-src/test-upgrade.patch @@ -7,7 +7,7 @@ index f255fe6..0a0fa65 100644 GENERATED_SCHEDULE_DEPS = $(TB_DIR)/all_tests $(TB_DIR)/exclude_tests REGRESS = --schedule $(TB_DIR)/run.sch # Set this again just to be safe -REGRESS_OPTS = --inputdir=test --max-connections=$(PARALLEL_CONN) --schedule $(SETUP_SCH) $(REGRESS_CONF) -+REGRESS_OPTS = --use-existing --dbname=pgtap_regression --inputdir=test --max-connections=$(PARALLEL_CONN) --schedule $(SETUP_SCH) $(REGRESS_CONF) ++REGRESS_OPTS = --use-existing --dbname=contrib_regression --inputdir=test --max-connections=$(PARALLEL_CONN) --schedule $(SETUP_SCH) $(REGRESS_CONF) SETUP_SCH = test/schedule/main.sch # schedule to use for test setup; this can be forcibly changed by some targets! IGNORE_TESTS = $(notdir $(EXCLUDE_TEST_FILES:.sql=)) PARALLEL_TESTS = $(filter-out $(IGNORE_TESTS),$(filter-out $(SERIAL_TESTS),$(ALL_TESTS))) diff --git a/docker-compose/test_extensions_upgrade.sh b/docker-compose/test_extensions_upgrade.sh index c2168c47af..51d1e40802 100755 --- a/docker-compose/test_extensions_upgrade.sh +++ b/docker-compose/test_extensions_upgrade.sh @@ -6,12 +6,16 @@ generate_id() { local -n resvar=$1 printf -v resvar '%08x%08x%08x%08x' $SRANDOM $SRANDOM $SRANDOM $SRANDOM } -if [ -z ${OLDTAG+x} ] || [ -z ${NEWTAG+x} ] || [ -z "${OLDTAG}" ] || [ -z "${NEWTAG}" ]; then - echo OLDTAG and NEWTAG must be defined +echo "${OLD_COMPUTE_TAG}" +echo "${NEW_COMPUTE_TAG}" +echo "${TEST_EXTENSIONS_TAG}" +if [ -z "${OLD_COMPUTE_TAG:-}" ] || [ -z "${NEW_COMPUTE_TAG:-}" ] || [ -z "${TEST_EXTENSIONS_TAG:-}" ]; then + echo OLD_COMPUTE_TAG, NEW_COMPUTE_TAG and TEST_EXTENSIONS_TAG must be set exit 1 fi export PG_VERSION=${PG_VERSION:-16} export PG_TEST_VERSION=${PG_VERSION} +# Waits for compute node is ready function wait_for_ready { TIME=0 while ! docker compose logs compute_is_ready | grep -q "accepting connections" && [ ${TIME} -le 300 ] ; do @@ -23,11 +27,45 @@ function wait_for_ready { exit 2 fi } +# Creates extensions. Gets a string with space-separated extensions as a parameter function create_extensions() { for ext in ${1}; do docker compose exec neon-test-extensions psql -X -v ON_ERROR_STOP=1 -d contrib_regression -c "CREATE EXTENSION IF NOT EXISTS ${ext} CASCADE" done } +# Creates a new timeline. Gets the parent ID and an extension name as parameters. 
+# Saves the timeline ID in the variable EXT_TIMELINE +function create_timeline() { + generate_id new_timeline_id + + PARAMS=( + -sbf + -X POST + -H "Content-Type: application/json" + -d "{\"new_timeline_id\": \"${new_timeline_id}\", \"pg_version\": ${PG_VERSION}, \"ancestor_timeline_id\": \"${1}\"}" + "http://127.0.0.1:9898/v1/tenant/${tenant_id}/timeline/" + ) + result=$(curl "${PARAMS[@]}") + echo $result | jq . + EXT_TIMELINE[${2}]=${new_timeline_id} +} +# Checks if the timeline ID of the compute node is expected. Gets the timeline ID as a parameter +function check_timeline() { + TID=$(docker compose exec neon-test-extensions psql -Aqt -c "SHOW neon.timeline_id") + if [ "${TID}" != "${1}" ]; then + echo Timeline mismatch + exit 1 + fi +} +# Restarts the compute node with the required compute tag and timeline. +# Accepts the tag for the compute node and the timeline as parameters. +function restart_compute() { + docker compose down compute compute_is_ready + COMPUTE_TAG=${1} TENANT_ID=${tenant_id} TIMELINE_ID=${2} docker compose up --quiet-pull -d --build compute compute_is_ready + wait_for_ready + check_timeline ${2} +} +declare -A EXT_TIMELINE EXTENSIONS='[ {"extname": "plv8", "extdir": "plv8-src"}, {"extname": "vector", "extdir": "pgvector-src"}, @@ -47,7 +85,7 @@ EXTENSIONS='[ {"extname": "pg_repack", "extdir": "pg_repack-src"} ]' EXTNAMES=$(echo ${EXTENSIONS} | jq -r '.[].extname' | paste -sd ' ' -) -TAG=${NEWTAG} docker compose --profile test-extensions up --quiet-pull --build -d +COMPUTE_TAG=${NEW_COMPUTE_TAG} docker compose --profile test-extensions up --quiet-pull --build -d wait_for_ready docker compose exec neon-test-extensions psql -c "DROP DATABASE IF EXISTS contrib_regression" docker compose exec neon-test-extensions psql -c "CREATE DATABASE contrib_regression" @@ -55,12 +93,14 @@ create_extensions "${EXTNAMES}" query="select json_object_agg(extname,extversion) from pg_extension where extname in ('${EXTNAMES// /\',\'}')" new_vers=$(docker compose exec neon-test-extensions psql -Aqt -d contrib_regression -c "$query") docker compose --profile test-extensions down -TAG=${OLDTAG} docker compose --profile test-extensions up --quiet-pull --build -d --force-recreate +COMPUTE_TAG=${OLD_COMPUTE_TAG} docker compose --profile test-extensions up --quiet-pull --build -d --force-recreate wait_for_ready docker compose exec neon-test-extensions psql -c "DROP DATABASE IF EXISTS contrib_regression" docker compose exec neon-test-extensions psql -c "CREATE DATABASE contrib_regression" -docker compose exec neon-test-extensions psql -c "CREATE DATABASE pgtap_regression" -docker compose exec neon-test-extensions psql -d pgtap_regression -c "CREATE EXTENSION pgtap" +tenant_id=$(docker compose exec neon-test-extensions psql -Aqt -c "SHOW neon.tenant_id") +EXT_TIMELINE["main"]=$(docker compose exec neon-test-extensions psql -Aqt -c "SHOW neon.timeline_id") +create_timeline "${EXT_TIMELINE["main"]}" init +restart_compute "${OLD_COMPUTE_TAG}" "${EXT_TIMELINE["init"]}" create_extensions "${EXTNAMES}" if [ "${FORCE_ALL_UPGRADE_TESTS:-false}" = true ]; then exts="${EXTNAMES}" @@ -71,29 +111,13 @@ fi if [ -z "${exts}" ]; then echo "No extensions were upgraded" else - tenant_id=$(docker compose exec neon-test-extensions psql -Aqt -c "SHOW neon.tenant_id") - timeline_id=$(docker compose exec neon-test-extensions psql -Aqt -c "SHOW neon.timeline_id") for ext in ${exts}; do echo Testing ${ext}... 
+        create_timeline "${EXT_TIMELINE["main"]}" ${ext}
         EXTDIR=$(echo ${EXTENSIONS} | jq -r '.[] | select(.extname=="'${ext}'") | .extdir')
-        generate_id new_timeline_id
-        PARAMS=(
-            -sbf
-            -X POST
-            -H "Content-Type: application/json"
-            -d "{\"new_timeline_id\": \"${new_timeline_id}\", \"pg_version\": ${PG_VERSION}, \"ancestor_timeline_id\": \"${timeline_id}\"}"
-            "http://127.0.0.1:9898/v1/tenant/${tenant_id}/timeline/"
-        )
-        result=$(curl "${PARAMS[@]}")
-        echo $result | jq .
-        TENANT_ID=${tenant_id} TIMELINE_ID=${new_timeline_id} TAG=${OLDTAG} docker compose down compute compute_is_ready
-        COMPUTE_TAG=${NEWTAG} TAG=${OLDTAG} TENANT_ID=${tenant_id} TIMELINE_ID=${new_timeline_id} docker compose up --quiet-pull -d --build compute compute_is_ready
-        wait_for_ready
-        TID=$(docker compose exec neon-test-extensions psql -Aqt -c "SHOW neon.timeline_id")
-        if [ ${TID} != ${new_timeline_id} ]; then
-            echo Timeline mismatch
-            exit 1
-        fi
+        restart_compute "${OLD_COMPUTE_TAG}" "${EXT_TIMELINE[${ext}]}"
+        docker compose exec neon-test-extensions psql -d contrib_regression -c "CREATE EXTENSION ${ext} CASCADE"
+        restart_compute "${NEW_COMPUTE_TAG}" "${EXT_TIMELINE[${ext}]}"
         docker compose exec neon-test-extensions psql -d contrib_regression -c "\dx ${ext}"
         if ! docker compose exec neon-test-extensions sh -c /ext-src/${EXTDIR}/test-upgrade.sh; then
             docker compose exec neon-test-extensions cat /ext-src/${EXTDIR}/regression.diffs
diff --git a/docs/rfcs/041-rel-sparse-keyspace.md b/docs/rfcs/041-rel-sparse-keyspace.md
new file mode 100644
index 0000000000..03e68bd5c1
--- /dev/null
+++ b/docs/rfcs/041-rel-sparse-keyspace.md
@@ -0,0 +1,201 @@
+# Sparse Keyspace for Relation Directories
+
+## Summary
+
+This is an RFC describing a new storage strategy for relation directories.
+
+## Motivation
+
+Postgres maintains a directory structure for databases and relations. In Neon, we store this information
+by serializing the directory data into a single key (see `pgdatadir_mapping.rs`).
+
+```rust
+// DbDir:
+// 00 00000000 00000000 00000000 00 00000000
+
+// RelDir:
+// 00 SPCNODE DBNODE 00000000 00 00000001 (Postgres never uses relfilenode 0)
+```
+
+We have a dedicated structure on the ingestion path to serialize the relation directory into this single key.
+
+```rust
+#[derive(Debug, Serialize, Deserialize, Default)]
+pub(crate) struct RelDirectory {
+    // Set of relations that exist. (relfilenode, forknum)
+    //
+    // TODO: Store it as a btree or radix tree or something else that spans multiple
+    // key-value pairs, if you have a lot of relations
+    pub(crate) rels: HashSet<(Oid, u8)>,
+}
+```
+
+The current codebase has the following three access patterns for the relation directory.
+
+1. Check if a relation exists.
+2. List all relations.
+3. Create/drop a relation.
+
+For (1), we currently have to get the reldir key, deserialize it, and check whether the relation exists in the
+hash set. For (2), we get the reldir key and return the hash set. For (3), we first need to get
+and deserialize the key, add the new relation record to the hash set, and then serialize it and write it back.
+
+If we have 100k relations in a database, we have a hash set with 100k entries. Every relation creation or drop
+then deserializes and re-serializes this 100k-entry hash set, which makes the relation create/drop process
+quadratic overall. And when we check whether a relation exists on the ingestion path, we have to deserialize
+this very large key just to answer a lookup for a single relation.
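+
+To make the cost concrete, here is a rough, self-contained sketch of what every relation creation has to do
+under the current scheme. This is illustration only, not the actual `pgdatadir_mapping.rs` code: the in-memory
+map and `bincode` stand in for the timeline's key-value read/write primitives and whatever codec is actually used.
+
+```rust
+use std::collections::{HashMap, HashSet};
+
+/// Simplified model of the current reldir update path. Every create (or drop) pays
+/// O(|rels|) to deserialize and re-serialize the whole set, so creating N relations
+/// one by one costs O(N^2) total work.
+fn create_relation(
+    store: &mut HashMap<Vec<u8>, Vec<u8>>, // stand-in for the key-value timeline
+    reldir_key: &[u8],
+    rel: (u32, u8), // (relfilenode, forknum)
+) {
+    // 1. Read and deserialize the whole directory (100k relations => 100k entries decoded).
+    let mut rels: HashSet<(u32, u8)> = store
+        .get(reldir_key)
+        .map(|buf| bincode::deserialize(buf).unwrap())
+        .unwrap_or_default();
+    // 2. Mutate a single entry.
+    rels.insert(rel);
+    // 3. Serialize the whole directory again and write it back as one value.
+    store.insert(reldir_key.to_vec(), bincode::serialize(&rels).unwrap());
+}
+```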
+
+In this RFC, we propose a new way to store the reldir data in the sparse keyspace and describe how
+to seamlessly migrate users to the new keyspace.
+
+The PoC patch is implemented in [PR10316](https://github.com/neondatabase/neon/pull/10316).
+
+## Key Mapping
+
+We will use the recently introduced sparse keyspace to store the actual data. The sparse keyspace was proposed in
+[038-aux-file-v2.md](038-aux-file-v2.md). The original reldir has one single value of `HashSet<(Oid, u8)>`
+for each of the databases (identified as `spcnode, dbnode`). We encode the `Oid` (`relnode, forknum`)
+into the key.
+
+```plain
+(REL_DIR_KEY_PREFIX, spcnode, dbnode, relnode, forknum, 1) -> deleted
+(REL_DIR_KEY_PREFIX, spcnode, dbnode, relnode, forknum, 1) -> exists
+```
+
+Assuming all reldir data is stored in this new keyspace, the 3 reldir operations we mentioned before can be
+implemented as follows.
+
+1. Check if a relation exists: check if the key maps to "exists".
+2. List all relations: scan the sparse keyspace over the `rel_dir_key_prefix` and extract relnode and forknum from each key.
+3. Create/drop a relation: write "exists" or "deleted" to the corresponding key of the relation. The delete tombstone will
+   be removed during image layer generation upon compaction.
+
+Note that "exists" and "deleted" will each be encoded as a single byte, as two variants of an enum.
+The mapping is implemented as `rel_tag_sparse_key` in the PoC patch.
+
+## Changes to Sparse Keyspace
+
+Previously, we only used sparse keyspaces for the aux files, which are not carried over when branching. The reldir
+information, however, needs to be preserved from the parent branch to the child branch. Therefore, the read path needs
+to be updated accordingly to accommodate such "inherited sparse keys". This is done in
+[PR#10313](https://github.com/neondatabase/neon/pull/10313).
+
+## Coexistence of the Old and New Keyspaces
+
+Migrating to the new keyspace will be done gradually: when we flip a config item to enable the new reldir keyspace, the
+ingestion path starts writing to the new keyspace while the old reldir data is kept in the old one. The read
+path needs to combine the data from both keyspaces.
+
+Theoretically, we could do a rewrite at startup time that scans all relation directories and copies that data into the
+new keyspace. However, this could take a long time, especially if we have thousands of tenants going through the migration
+simultaneously after a pageserver restart. Therefore, we propose the coexistence strategy, so that the
+migration happens seamlessly and imposes no downtime on the user.
+
+With the coexistence assumption, the 3 reldir operations will be implemented as follows:
+
+1. Check if a relation exists
+   - Check whether the key maps to any value in the new keyspace. If it maps to "exists" or "deleted", directly
+     return that to the user.
+   - Otherwise, deserialize the old reldir key and get the result from there.
+2. List all relations: scan the sparse keyspace over the `rel_dir_key_prefix` and deserialize the old reldir key.
+   Combine them to obtain the final result.
+3. Create/drop a relation: write "exists" or "deleted" to the relation's corresponding key in the new keyspace.
+   - We assume relations are never overwritten (i.e., the user won't create a relation at the same Oid). This will be implemented as a runtime check.
+   - For relation creation, we add `sparse_reldir_tableX -> exists` to the keyspace.
+   - For relation drop, we first check if the relation is recorded in the old keyspace.
+     If yes, we deserialize the old reldir key, remove the relation, and then write it back. Otherwise, we put `sparse_reldir_tableX -> deleted` into the keyspace.
+   - The delete tombstone will be removed during image layer generation upon compaction.
+
+This process ensures that the transition introduces no downtime and that all new updates are written to the new keyspace. The total
+amount of data in storage would be `O(relations_modifications)`, and we can guarantee `O(current_relations)` after compaction.
+There could be some relations that stay in the old reldir key for a long time; refer to the "Full Migration" section on how to deal
+with them. Also, relation modifications will have `O(old_relations)` complexity until we do the full migration, after which fully
+opting in to the sparse keyspace gives us `O(1)` complexity.
+
+The process also implies that a relation will only exist either in the old reldir key or in the new sparse keyspace. It is not possible
+for a table to be recorded in the old reldir key while later having a delete tombstone for it in the sparse keyspace at any LSN.
+
+We will introduce a config item and an index_part record to track the current status of the migration process.
+
+- Config item `enable_reldir_v2`: controls whether the ingestion path writes the reldir info into the new keyspace.
+- `index_part.json` field `reldir_v2_status`: whether the timeline has written any key into the new reldir keyspace.
+
+If `enable_reldir_v2` is set to `true` and the timeline ingests its first key into the new reldir keyspace, it will update
+`index_part.json` to set `reldir_v2_status` to `Status::Migrating`. Even if `enable_reldir_v2` gets flipped back to
+`false` (e.g., when the pageserver restarts and the config isn't persisted), the read/write path will still
+read/write the new keyspace to avoid data inconsistency. This also means that the migration is one-way only:
+once v2 is enabled, the user cannot go back to v1.
+
+## Next Steps
+
+### Full Migration
+
+This won't be implemented in the project's first phase but might be implemented in the future. Having both v1 and
+v2 in the system would force us to keep the code that deserializes the old reldir key forever. To entirely deprecate this
+code path, we must ensure the timeline has no old reldir data.
+
+We can trigger a special image layer generation process at the gc-horizon. The generated image layers will cover several keyspaces:
+the old reldir key in each of the databases, and the new reldir sparse keyspace. The process removes the old reldir keys while
+copying their contents into the corresponding keys in the sparse keyspace in the resulting image. This special process happens in
+the background during compaction. For example, assume this special process is triggered at LSN 0/180. The `create_image_layers`
+process discovers the following keys at this LSN.
+
+```plain
+db1/reldir_key -> (table 1, table 2, table 3)
+...db1 rel keys
+db2/reldir_key -> (table 4, table 5, table 6)
+...db2 rel keys
+sparse_reldir_db2_table7 -> exists
+sparse_reldir_db1_table8 -> deleted
+```
+
+It will generate the following keys:
+
+```plain
+db1/reldir_key -> () # we have to keep the key because it is part of `collect_keyspace`.
+...db1 rel keys
+db2/reldir_key -> ()
+...db2 rel keys
+
+-- start image layer for the sparse keyspace at sparse_reldir_prefix at LSN 0/180
+sparse_reldir_db1_table1 -> exists
+sparse_reldir_db1_table2 -> exists
+sparse_reldir_db1_table3 -> exists
+sparse_reldir_db2_table4 -> exists
+sparse_reldir_db2_table5 -> exists
+sparse_reldir_db2_table6 -> exists
+sparse_reldir_db2_table7 -> exists
+-- end image layer for the sparse keyspace at sparse_reldir_prefix+1
+
+# The `sparse_reldir_db1_table8` key gets dropped as part of the image layer generation code for the sparse keyspace.
+# Note that the read path stops reading once a key is not found in the image layer covering the key range, so there
+# are no correctness issues.
+```
+
+We must verify that no pending modifications to the old reldir exist in the delta/image layers above the gc-horizon before
+we start this process (we can do a vectored read to get the full key history of the old reldir key and ensure there are no more images
+above the gc-horizon). Otherwise, we would violate the property that "a relation will only exist either in the old reldir key or
+in the new sparse keyspace". After we run this migration process, we can set `reldir_v2_status` in `index_part.json` to
+`Status::Migrated`, and the read path no longer needs to read from the old reldir. Once the status is set to `Migrated`, we
+don't need to add the old reldir keys to `collect_keyspace` anymore, and therefore they will be removed from all future image layers.
+
+The migration process can be proactively triggered across all attached/detached tenants to help us fully remove the old reldir code.
+
+### Consolidate Relation Size Keys
+
+We store the relsize key at the end of each relation node's key range.
+
+```plain
+// RelSize:
+// 00 SPCNODE DBNODE RELNODE FORK FFFFFFFF
+```
+
+This means that computing the logical size requires us to do many scattered single-key gets across the keyspace,
+potentially downloading many layer files. We could consolidate them into a single
+keyspace, improving logical size calculation performance.
+
+### Migrate DBDir Keys
+
+We assume the number of databases created by users will be small, and therefore the current way
+of storing the database directory is acceptable. In the future, we could also migrate the DBDir keys into
+the sparse keyspace to support a large number of databases.
diff --git a/libs/compute_api/Cargo.toml b/libs/compute_api/Cargo.toml
index c11a1b6688..0d1618c1b2 100644
--- a/libs/compute_api/Cargo.toml
+++ b/libs/compute_api/Cargo.toml
@@ -1,7 +1,7 @@
 [package]
 name = "compute_api"
 version = "0.1.0"
-edition.workspace = true
+edition = "2024"
 license.workspace = true

 [dependencies]
diff --git a/libs/compute_api/src/requests.rs b/libs/compute_api/src/requests.rs
index 0c256cae2e..3fbdfcf83f 100644
--- a/libs/compute_api/src/requests.rs
+++ b/libs/compute_api/src/requests.rs
@@ -1,11 +1,10 @@
 //! Structs representing the JSON formats used in the compute_ctl's HTTP API.
-use crate::{ - privilege::Privilege, - responses::ComputeCtlConfig, - spec::{ComputeSpec, ExtVersion, PgIdent}, -}; use serde::{Deserialize, Serialize}; +use crate::privilege::Privilege; +use crate::responses::ComputeCtlConfig; +use crate::spec::{ComputeSpec, ExtVersion, PgIdent}; + /// Request of the /configure API /// /// We now pass only `spec` in the configuration request, but later we can diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs index a6248019d9..3300fbf7dd 100644 --- a/libs/compute_api/src/responses.rs +++ b/libs/compute_api/src/responses.rs @@ -6,10 +6,8 @@ use chrono::{DateTime, Utc}; use jsonwebtoken::jwk::JwkSet; use serde::{Deserialize, Serialize, Serializer}; -use crate::{ - privilege::Privilege, - spec::{ComputeSpec, Database, ExtVersion, PgIdent, Role}, -}; +use crate::privilege::Privilege; +use crate::spec::{ComputeSpec, Database, ExtVersion, PgIdent, Role}; #[derive(Serialize, Debug, Deserialize)] pub struct GenericAPIError { @@ -136,8 +134,10 @@ pub struct CatalogObjects { pub databases: Vec, } -#[derive(Debug, Deserialize, Serialize)] +#[derive(Clone, Debug, Deserialize, Serialize)] pub struct ComputeCtlConfig { + /// Set of JSON web keys that the compute can use to authenticate + /// communication from the control plane. pub jwks: JwkSet, } diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 8fffae92fb..77f2e1e631 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -5,13 +5,12 @@ //! and connect it to the storage nodes. use std::collections::HashMap; +use regex::Regex; +use remote_storage::RemotePath; use serde::{Deserialize, Serialize}; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; -use regex::Regex; -use remote_storage::RemotePath; - /// String type alias representing Postgres identifier and /// intended to be used for DB / role names. pub type PgIdent = String; @@ -102,6 +101,17 @@ pub struct ComputeSpec { pub timeline_id: Option, pub pageserver_connstring: Option, + /// Safekeeper membership config generation. It is put in + /// neon.safekeepers GUC and serves two purposes: + /// 1) Non zero value forces walproposer to use membership configurations. + /// 2) If walproposer wants to update list of safekeepers to connect to + /// taking them from some safekeeper mconf, it should check what value + /// is newer by comparing the generation. + /// + /// Note: it could be SafekeeperGeneration, but this needs linking + /// compute_ctl with postgres_ffi. + #[serde(default)] + pub safekeepers_generation: Option, #[serde(default)] pub safekeeper_connstrings: Vec, @@ -145,6 +155,16 @@ pub struct ComputeSpec { /// over the same replication content from publisher. #[serde(default)] // Default false pub drop_subscriptions_before_start: bool, + + /// Log level for audit logging: + /// + /// Disabled - no audit logging. This is the default. + /// log - log masked statements to the postgres log using pgaudit extension + /// hipaa - log unmasked statements to the file using pgaudit and pgauditlogtofile extension + /// + /// Extensions should be present in shared_preload_libraries + #[serde(default)] + pub audit_log_level: ComputeAudit, } /// Feature flag to signal `compute_ctl` to enable certain experimental functionality. 
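Since both new `ComputeSpec` fields above carry `#[serde(default)]`, specs produced by an older control plane keep
deserializing unchanged. A minimal, self-contained sketch of that behavior: the `SpecFragment` struct here is a
trimmed, hypothetical stand-in for illustration (not the real `ComputeSpec`), and the `ComputeAudit` enum mirrors the
variant set added in the next hunk; `serde`/`serde_json` are assumed available.

```rust
use serde::Deserialize;

// Mirrors the enum added below: externally tagged unit variants, `Disabled` as the default.
#[derive(Debug, Default, Eq, PartialEq, Deserialize)]
enum ComputeAudit {
    #[default]
    Disabled,
    Log,
    Hipaa,
}

// Trimmed stand-in for ComputeSpec, keeping only the two new fields.
#[derive(Debug, Deserialize)]
struct SpecFragment {
    #[serde(default)]
    safekeepers_generation: Option<u32>,
    #[serde(default)]
    audit_log_level: ComputeAudit,
}

fn main() {
    // A spec from an older control plane omits both fields: serde falls back to the defaults.
    let old: SpecFragment = serde_json::from_str("{}").unwrap();
    assert_eq!(old.safekeepers_generation, None);
    assert_eq!(old.audit_log_level, ComputeAudit::Disabled);

    // A newer spec sets them explicitly; unit variants are spelled by name in JSON.
    let new: SpecFragment =
        serde_json::from_str(r#"{"safekeepers_generation": 2, "audit_log_level": "Hipaa"}"#).unwrap();
    assert_eq!(new.safekeepers_generation, Some(2));
    assert_eq!(new.audit_log_level, ComputeAudit::Hipaa);
}
```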
@@ -252,6 +272,17 @@ pub enum ComputeMode { Replica, } +/// Log level for audit logging +/// Disabled, log, hipaa +/// Default is Disabled +#[derive(Clone, Debug, Default, Eq, PartialEq, Deserialize, Serialize)] +pub enum ComputeAudit { + #[default] + Disabled, + Log, + Hipaa, +} + #[derive(Clone, Debug, Default, Deserialize, Serialize, PartialEq, Eq)] pub struct Cluster { pub cluster_id: Option, @@ -339,9 +370,10 @@ pub struct JwksSettings { #[cfg(test)] mod tests { - use super::*; use std::fs::File; + use super::*; + #[test] fn allow_installing_remote_extensions() { let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({ diff --git a/libs/consumption_metrics/Cargo.toml b/libs/consumption_metrics/Cargo.toml index 0e517e3856..77f130950e 100644 --- a/libs/consumption_metrics/Cargo.toml +++ b/libs/consumption_metrics/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "consumption_metrics" version = "0.1.0" -edition = "2021" +edition = "2024" license = "Apache-2.0" [dependencies] diff --git a/libs/desim/src/chan.rs b/libs/desim/src/chan.rs index 6661d59871..8882cd3b56 100644 --- a/libs/desim/src/chan.rs +++ b/libs/desim/src/chan.rs @@ -1,4 +1,5 @@ -use std::{collections::VecDeque, sync::Arc}; +use std::collections::VecDeque; +use std::sync::Arc; use parking_lot::{Mutex, MutexGuard}; diff --git a/libs/desim/src/executor.rs b/libs/desim/src/executor.rs index 9d44bd7741..df8b071c06 100644 --- a/libs/desim/src/executor.rs +++ b/libs/desim/src/executor.rs @@ -1,11 +1,7 @@ -use std::{ - panic::AssertUnwindSafe, - sync::{ - atomic::{AtomicBool, AtomicU32, AtomicU8, Ordering}, - mpsc, Arc, OnceLock, - }, - thread::JoinHandle, -}; +use std::panic::AssertUnwindSafe; +use std::sync::atomic::{AtomicBool, AtomicU8, AtomicU32, Ordering}; +use std::sync::{Arc, OnceLock, mpsc}; +use std::thread::JoinHandle; use tracing::{debug, error, trace}; diff --git a/libs/desim/src/network.rs b/libs/desim/src/network.rs index e15a714daa..cf096dba80 100644 --- a/libs/desim/src/network.rs +++ b/libs/desim/src/network.rs @@ -1,26 +1,19 @@ -use std::{ - cmp::Ordering, - collections::{BinaryHeap, VecDeque}, - fmt::{self, Debug}, - ops::DerefMut, - sync::{mpsc, Arc}, -}; +use std::cmp::Ordering; +use std::collections::{BinaryHeap, VecDeque}; +use std::fmt::{self, Debug}; +use std::ops::DerefMut; +use std::sync::{Arc, mpsc}; -use parking_lot::{ - lock_api::{MappedMutexGuard, MutexGuard}, - Mutex, RawMutex, -}; +use parking_lot::lock_api::{MappedMutexGuard, MutexGuard}; +use parking_lot::{Mutex, RawMutex}; use rand::rngs::StdRng; use tracing::debug; -use crate::{ - executor::{self, ThreadContext}, - options::NetworkOptions, - proto::NetEvent, - proto::NodeEvent, -}; - -use super::{chan::Chan, proto::AnyMessage}; +use super::chan::Chan; +use super::proto::AnyMessage; +use crate::executor::{self, ThreadContext}; +use crate::options::NetworkOptions; +use crate::proto::{NetEvent, NodeEvent}; pub struct NetworkTask { options: Arc, diff --git a/libs/desim/src/node_os.rs b/libs/desim/src/node_os.rs index 7744a9f5e1..e0cde7b284 100644 --- a/libs/desim/src/node_os.rs +++ b/libs/desim/src/node_os.rs @@ -2,14 +2,11 @@ use std::sync::Arc; use rand::Rng; +use super::chan::Chan; +use super::network::TCP; +use super::world::{Node, NodeId, World}; use crate::proto::NodeEvent; -use super::{ - chan::Chan, - network::TCP, - world::{Node, NodeId, World}, -}; - /// Abstraction with all functions (aka syscalls) available to the node. 
#[derive(Clone)] pub struct NodeOs { diff --git a/libs/desim/src/options.rs b/libs/desim/src/options.rs index 5da7c2c482..9b1a42fd28 100644 --- a/libs/desim/src/options.rs +++ b/libs/desim/src/options.rs @@ -1,4 +1,5 @@ -use rand::{rngs::StdRng, Rng}; +use rand::Rng; +use rand::rngs::StdRng; /// Describes random delays and failures. Delay will be uniformly distributed in [min, max]. /// Connection failure will occur with the probablity fail_prob. diff --git a/libs/desim/src/proto.rs b/libs/desim/src/proto.rs index 92a7e8a27d..31bc29e6a6 100644 --- a/libs/desim/src/proto.rs +++ b/libs/desim/src/proto.rs @@ -3,7 +3,8 @@ use std::fmt::Debug; use bytes::Bytes; use utils::lsn::Lsn; -use crate::{network::TCP, world::NodeId}; +use crate::network::TCP; +use crate::world::NodeId; /// Internal node events. #[derive(Debug)] diff --git a/libs/desim/src/time.rs b/libs/desim/src/time.rs index 7ce605bda8..350d182cc3 100644 --- a/libs/desim/src/time.rs +++ b/libs/desim/src/time.rs @@ -1,12 +1,8 @@ -use std::{ - cmp::Ordering, - collections::BinaryHeap, - ops::DerefMut, - sync::{ - atomic::{AtomicU32, AtomicU64}, - Arc, - }, -}; +use std::cmp::Ordering; +use std::collections::BinaryHeap; +use std::ops::DerefMut; +use std::sync::Arc; +use std::sync::atomic::{AtomicU32, AtomicU64}; use parking_lot::Mutex; use tracing::trace; diff --git a/libs/desim/src/world.rs b/libs/desim/src/world.rs index 7d60be04b5..576ba89cd7 100644 --- a/libs/desim/src/world.rs +++ b/libs/desim/src/world.rs @@ -1,19 +1,18 @@ +use std::ops::DerefMut; +use std::sync::{Arc, mpsc}; + use parking_lot::Mutex; -use rand::{rngs::StdRng, SeedableRng}; -use std::{ - ops::DerefMut, - sync::{mpsc, Arc}, -}; +use rand::SeedableRng; +use rand::rngs::StdRng; -use crate::{ - executor::{ExternalHandle, Runtime}, - network::NetworkTask, - options::NetworkOptions, - proto::{NodeEvent, SimEvent}, - time::Timing, -}; - -use super::{chan::Chan, network::TCP, node_os::NodeOs}; +use super::chan::Chan; +use super::network::TCP; +use super::node_os::NodeOs; +use crate::executor::{ExternalHandle, Runtime}; +use crate::network::NetworkTask; +use crate::options::NetworkOptions; +use crate::proto::{NodeEvent, SimEvent}; +use crate::time::Timing; pub type NodeId = u32; diff --git a/libs/desim/tests/reliable_copy_test.rs b/libs/desim/tests/reliable_copy_test.rs index cf7bff8f5a..1ddf9844de 100644 --- a/libs/desim/tests/reliable_copy_test.rs +++ b/libs/desim/tests/reliable_copy_test.rs @@ -1,14 +1,15 @@ //! Simple test to verify that simulator is working. #[cfg(test)] mod reliable_copy_test { + use std::sync::Arc; + use anyhow::Result; use desim::executor::{self, PollSome}; + use desim::node_os::NodeOs; use desim::options::{Delay, NetworkOptions}; - use desim::proto::{NetEvent, NodeEvent, ReplCell}; + use desim::proto::{AnyMessage, NetEvent, NodeEvent, ReplCell}; use desim::world::{NodeId, World}; - use desim::{node_os::NodeOs, proto::AnyMessage}; use parking_lot::Mutex; - use std::sync::Arc; use tracing::info; /// Disk storage trait and implementation. 
diff --git a/libs/http-utils/Cargo.toml b/libs/http-utils/Cargo.toml index d72e4bd012..00b3777a63 100644 --- a/libs/http-utils/Cargo.toml +++ b/libs/http-utils/Cargo.toml @@ -6,11 +6,9 @@ license.workspace = true [dependencies] anyhow.workspace = true -backtrace.workspace = true bytes.workspace = true -inferno.workspace = true fail.workspace = true -flate2.workspace = true +futures.workspace = true hyper0.workspace = true itertools.workspace = true jemalloc_pprof.workspace = true @@ -24,6 +22,7 @@ serde_path_to_error.workspace = true thiserror.workspace = true tracing.workspace = true tokio.workspace = true +tokio-rustls.workspace = true tokio-util.workspace = true url.workspace = true uuid.workspace = true diff --git a/libs/http-utils/src/endpoint.rs b/libs/http-utils/src/endpoint.rs index be97b341d1..5588f6d87e 100644 --- a/libs/http-utils/src/endpoint.rs +++ b/libs/http-utils/src/endpoint.rs @@ -1,30 +1,28 @@ -use crate::error::{api_error_handler, route_error_handler, ApiError}; -use crate::pprof; -use crate::request::{get_query_param, parse_query_param}; -use ::pprof::protos::Message as _; -use ::pprof::ProfilerGuardBuilder; -use anyhow::{anyhow, Context}; -use bytes::{Bytes, BytesMut}; -use hyper::header::{HeaderName, AUTHORIZATION, CONTENT_DISPOSITION}; -use hyper::http::HeaderValue; -use hyper::Method; -use hyper::{header::CONTENT_TYPE, Body, Request, Response}; -use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder}; -use once_cell::sync::Lazy; -use regex::Regex; -use routerify::ext::RequestExt; -use routerify::{Middleware, RequestInfo, Router, RouterBuilder}; -use tokio::sync::{mpsc, Mutex, Notify}; -use tokio_stream::wrappers::ReceiverStream; -use tokio_util::io::ReaderStream; -use tracing::{debug, info, info_span, warn, Instrument}; -use utils::auth::{AuthError, Claims, SwappableJwtAuth}; - use std::future::Future; use std::io::Write as _; use std::str::FromStr; use std::time::Duration; +use anyhow::{Context, anyhow}; +use bytes::{Bytes, BytesMut}; +use hyper::header::{AUTHORIZATION, CONTENT_DISPOSITION, CONTENT_TYPE, HeaderName}; +use hyper::http::HeaderValue; +use hyper::{Body, Method, Request, Response}; +use metrics::{Encoder, IntCounter, TextEncoder, register_int_counter}; +use once_cell::sync::Lazy; +use pprof::ProfilerGuardBuilder; +use pprof::protos::Message as _; +use routerify::ext::RequestExt; +use routerify::{Middleware, RequestInfo, Router, RouterBuilder}; +use tokio::sync::{Mutex, Notify, mpsc}; +use tokio_stream::wrappers::ReceiverStream; +use tokio_util::io::ReaderStream; +use tracing::{Instrument, debug, info, info_span, warn}; +use utils::auth::{AuthError, Claims, SwappableJwtAuth}; + +use crate::error::{ApiError, api_error_handler, route_error_handler}; +use crate::request::{get_query_param, parse_query_param}; + static SERVE_METRICS_COUNT: Lazy = Lazy::new(|| { register_int_counter!( "libmetrics_metric_handler_requests_total", @@ -375,7 +373,7 @@ pub async fn profile_cpu_handler(req: Request) -> Result, A Err(_) => { return Err(ApiError::Conflict( "profiler already running (use ?force=true to cancel it)".into(), - )) + )); } } tokio::time::sleep(Duration::from_millis(1)).await; // don't busy-wait @@ -401,12 +399,10 @@ pub async fn profile_cpu_handler(req: Request) -> Result, A // Return the report in the requested format. match format { Format::Pprof => { - let mut body = Vec::new(); - report + let body = report .pprof() .map_err(|err| ApiError::InternalServerError(err.into()))? 
- .write_to_vec(&mut body) - .map_err(|err| ApiError::InternalServerError(err.into()))?; + .encode_to_vec(); Response::builder() .status(200) @@ -449,20 +445,6 @@ pub async fn profile_heap_handler(req: Request) -> Result, Some(format) => return Err(ApiError::BadRequest(anyhow!("invalid format {format}"))), }; - // Functions and mappings to strip when symbolizing pprof profiles. If true, - // also remove child frames. - static STRIP_FUNCTIONS: Lazy> = Lazy::new(|| { - vec![ - (Regex::new("^__rust").unwrap(), false), - (Regex::new("^_start$").unwrap(), false), - (Regex::new("^irallocx_prof").unwrap(), true), - (Regex::new("^prof_alloc_prep").unwrap(), true), - (Regex::new("^std::rt::lang_start").unwrap(), false), - (Regex::new("^std::sys::backtrace::__rust").unwrap(), false), - ] - }); - const STRIP_MAPPINGS: &[&str] = &["libc", "libgcc", "pthread", "vdso"]; - // Obtain profiler handle. let mut prof_ctl = jemalloc_pprof::PROF_CTL .as_ref() @@ -495,52 +477,34 @@ pub async fn profile_heap_handler(req: Request) -> Result, } Format::Pprof => { - let data = tokio::task::spawn_blocking(move || { - let bytes = prof_ctl.dump_pprof()?; - // Symbolize the profile. - // TODO: consider moving this upstream to jemalloc_pprof and avoiding the - // serialization roundtrip. - let profile = pprof::decode(&bytes)?; - let profile = pprof::symbolize(profile)?; - let profile = pprof::strip_locations(profile, STRIP_MAPPINGS, &STRIP_FUNCTIONS); - pprof::encode(&profile) - }) - .await - .map_err(|join_err| ApiError::InternalServerError(join_err.into()))? - .map_err(ApiError::InternalServerError)?; + let data = tokio::task::spawn_blocking(move || prof_ctl.dump_pprof()) + .await + .map_err(|join_err| ApiError::InternalServerError(join_err.into()))? + .map_err(ApiError::InternalServerError)?; Response::builder() .status(200) .header(CONTENT_TYPE, "application/octet-stream") - .header(CONTENT_DISPOSITION, "attachment; filename=\"heap.pb\"") + .header(CONTENT_DISPOSITION, "attachment; filename=\"heap.pb.gz\"") .body(Body::from(data)) .map_err(|err| ApiError::InternalServerError(err.into())) } Format::Svg => { - let body = tokio::task::spawn_blocking(move || { - let bytes = prof_ctl.dump_pprof()?; - let profile = pprof::decode(&bytes)?; - let profile = pprof::symbolize(profile)?; - let profile = pprof::strip_locations(profile, STRIP_MAPPINGS, &STRIP_FUNCTIONS); - let mut opts = inferno::flamegraph::Options::default(); - opts.title = "Heap inuse".to_string(); - opts.count_name = "bytes".to_string(); - pprof::flamegraph(profile, &mut opts) - }) - .await - .map_err(|join_err| ApiError::InternalServerError(join_err.into()))? - .map_err(ApiError::InternalServerError)?; + let svg = tokio::task::spawn_blocking(move || prof_ctl.dump_flamegraph()) + .await + .map_err(|join_err| ApiError::InternalServerError(join_err.into()))? 
+ .map_err(ApiError::InternalServerError)?; Response::builder() .status(200) .header(CONTENT_TYPE, "image/svg+xml") - .body(Body::from(body)) + .body(Body::from(svg)) .map_err(|err| ApiError::InternalServerError(err.into())) } } } -pub fn add_request_id_middleware( -) -> Middleware { +pub fn add_request_id_middleware() +-> Middleware { Middleware::pre(move |req| async move { let request_id = match req.headers().get(&X_REQUEST_ID_HEADER) { Some(request_id) => request_id @@ -664,7 +628,7 @@ pub fn auth_middleware( None => { return Err(ApiError::Unauthorized( "missing authorization header".to_string(), - )) + )); } } } @@ -717,12 +681,14 @@ pub fn check_permission_with( #[cfg(test)] mod tests { - use super::*; - use hyper::service::Service; - use routerify::RequestServiceBuilder; use std::future::poll_fn; use std::net::{IpAddr, SocketAddr}; + use hyper::service::Service; + use routerify::RequestServiceBuilder; + + use super::*; + #[tokio::test] async fn test_request_id_returned() { let builder = RequestServiceBuilder::new(make_router().build().unwrap()).unwrap(); diff --git a/libs/http-utils/src/error.rs b/libs/http-utils/src/error.rs index 746305caec..f790dc26ca 100644 --- a/libs/http-utils/src/error.rs +++ b/libs/http-utils/src/error.rs @@ -1,10 +1,10 @@ -use hyper::{header, Body, Response, StatusCode}; -use serde::{Deserialize, Serialize}; use std::borrow::Cow; use std::error::Error as StdError; + +use hyper::{Body, Response, StatusCode, header}; +use serde::{Deserialize, Serialize}; use thiserror::Error; use tracing::{error, info, warn}; - use utils::auth::AuthError; #[derive(Debug, Error)] diff --git a/libs/http-utils/src/failpoints.rs b/libs/http-utils/src/failpoints.rs index 8a1e0c8cf0..984823f4a9 100644 --- a/libs/http-utils/src/failpoints.rs +++ b/libs/http-utils/src/failpoints.rs @@ -1,12 +1,11 @@ -use crate::error::ApiError; -use crate::json::{json_request, json_response}; - use hyper::{Body, Request, Response, StatusCode}; use serde::{Deserialize, Serialize}; use tokio_util::sync::CancellationToken; - use utils::failpoint_support::apply_failpoint; +use crate::error::ApiError; +use crate::json::{json_request, json_response}; + pub type ConfigureFailpointsRequest = Vec; /// Information for configuring a single fail point diff --git a/libs/http-utils/src/json.rs b/libs/http-utils/src/json.rs index e53231f313..14ebac91e6 100644 --- a/libs/http-utils/src/json.rs +++ b/libs/http-utils/src/json.rs @@ -1,6 +1,6 @@ use anyhow::Context; use bytes::Buf; -use hyper::{header, Body, Request, Response, StatusCode}; +use hyper::{Body, Request, Response, StatusCode, header}; use serde::{Deserialize, Serialize}; use super::error::ApiError; diff --git a/libs/http-utils/src/lib.rs b/libs/http-utils/src/lib.rs index ae6a27aaa8..dd520ef69b 100644 --- a/libs/http-utils/src/lib.rs +++ b/libs/http-utils/src/lib.rs @@ -2,11 +2,11 @@ pub mod endpoint; pub mod error; pub mod failpoints; pub mod json; -pub mod pprof; pub mod request; +pub mod server; extern crate hyper0 as hyper; /// Current fast way to apply simple http routing in various Neon binaries. /// Re-exported for sake of uniform approach, that could be later replaced with better alternatives, if needed. 
-pub use routerify::{ext::RequestExt, RouterBuilder, RouterService}; +pub use routerify::{RequestServiceBuilder, RouterBuilder, RouterService, ext::RequestExt}; diff --git a/libs/http-utils/src/pprof.rs b/libs/http-utils/src/pprof.rs deleted file mode 100644 index fe1cc10838..0000000000 --- a/libs/http-utils/src/pprof.rs +++ /dev/null @@ -1,238 +0,0 @@ -use anyhow::bail; -use flate2::write::{GzDecoder, GzEncoder}; -use flate2::Compression; -use itertools::Itertools as _; -use pprof::protos::{Function, Line, Location, Message as _, Profile}; -use regex::Regex; - -use std::borrow::Cow; -use std::collections::{HashMap, HashSet}; -use std::ffi::c_void; -use std::io::Write as _; - -/// Decodes a gzip-compressed Protobuf-encoded pprof profile. -pub fn decode(bytes: &[u8]) -> anyhow::Result { - let mut gz = GzDecoder::new(Vec::new()); - gz.write_all(bytes)?; - Ok(Profile::parse_from_bytes(&gz.finish()?)?) -} - -/// Encodes a pprof profile as gzip-compressed Protobuf. -pub fn encode(profile: &Profile) -> anyhow::Result> { - let mut gz = GzEncoder::new(Vec::new(), Compression::default()); - profile.write_to_writer(&mut gz)?; - Ok(gz.finish()?) -} - -/// Symbolizes a pprof profile using the current binary. -pub fn symbolize(mut profile: Profile) -> anyhow::Result { - if !profile.function.is_empty() { - return Ok(profile); // already symbolized - } - - // Collect function names. - let mut functions: HashMap = HashMap::new(); - let mut strings: HashMap = profile - .string_table - .into_iter() - .enumerate() - .map(|(i, s)| (s, i as i64)) - .collect(); - - // Helper to look up or register a string. - let mut string_id = |s: &str| -> i64 { - // Don't use .entry() to avoid unnecessary allocations. - if let Some(id) = strings.get(s) { - return *id; - } - let id = strings.len() as i64; - strings.insert(s.to_string(), id); - id - }; - - for loc in &mut profile.location { - if !loc.line.is_empty() { - continue; - } - - // Resolve the line and function for each location. - backtrace::resolve(loc.address as *mut c_void, |symbol| { - let Some(symbol_name) = symbol.name() else { - return; - }; - - let function_name = format!("{symbol_name:#}"); - let functions_len = functions.len(); - let function_id = functions - .entry(function_name) - .or_insert_with_key(|function_name| { - let function_id = functions_len as u64 + 1; - let system_name = String::from_utf8_lossy(symbol_name.as_bytes()); - let filename = symbol - .filename() - .map(|path| path.to_string_lossy()) - .unwrap_or(Cow::Borrowed("")); - Function { - id: function_id, - name: string_id(function_name), - system_name: string_id(&system_name), - filename: string_id(&filename), - ..Default::default() - } - }) - .id; - loc.line.push(Line { - function_id, - line: symbol.lineno().unwrap_or(0) as i64, - ..Default::default() - }); - }); - } - - // Store the resolved functions, and mark the mapping as resolved. - profile.function = functions.into_values().sorted_by_key(|f| f.id).collect(); - profile.string_table = strings - .into_iter() - .sorted_by_key(|(_, i)| *i) - .map(|(s, _)| s) - .collect(); - - for mapping in &mut profile.mapping { - mapping.has_functions = true; - mapping.has_filenames = true; - } - - Ok(profile) -} - -/// Strips locations (stack frames) matching the given mappings (substring) or function names -/// (regex). The function bool specifies whether child frames should be stripped as well. -/// -/// The string definitions are left behind in the profile for simplicity, to avoid rewriting all -/// string references. 
-pub fn strip_locations( - mut profile: Profile, - mappings: &[&str], - functions: &[(Regex, bool)], -) -> Profile { - // Strip mappings. - let mut strip_mappings: HashSet = HashSet::new(); - - profile.mapping.retain(|mapping| { - let Some(name) = profile.string_table.get(mapping.filename as usize) else { - return true; - }; - if mappings.iter().any(|substr| name.contains(substr)) { - strip_mappings.insert(mapping.id); - return false; - } - true - }); - - // Strip functions. - let mut strip_functions: HashMap = HashMap::new(); - - profile.function.retain(|function| { - let Some(name) = profile.string_table.get(function.name as usize) else { - return true; - }; - for (regex, strip_children) in functions { - if regex.is_match(name) { - strip_functions.insert(function.id, *strip_children); - return false; - } - } - true - }); - - // Strip locations. The bool specifies whether child frames should be stripped too. - let mut strip_locations: HashMap = HashMap::new(); - - profile.location.retain(|location| { - for line in &location.line { - if let Some(strip_children) = strip_functions.get(&line.function_id) { - strip_locations.insert(location.id, *strip_children); - return false; - } - } - if strip_mappings.contains(&location.mapping_id) { - strip_locations.insert(location.id, false); - return false; - } - true - }); - - // Strip sample locations. - for sample in &mut profile.sample { - // First, find the uppermost function with child removal and truncate the stack. - if let Some(truncate) = sample - .location_id - .iter() - .rposition(|id| strip_locations.get(id) == Some(&true)) - { - sample.location_id.drain(..=truncate); - } - // Next, strip any individual frames without child removal. - sample - .location_id - .retain(|id| !strip_locations.contains_key(id)); - } - - profile -} - -/// Generates an SVG flamegraph from a symbolized pprof profile. -pub fn flamegraph( - profile: Profile, - opts: &mut inferno::flamegraph::Options, -) -> anyhow::Result> { - if profile.mapping.iter().any(|m| !m.has_functions) { - bail!("profile not symbolized"); - } - - // Index locations, functions, and strings. - let locations: HashMap = - profile.location.into_iter().map(|l| (l.id, l)).collect(); - let functions: HashMap = - profile.function.into_iter().map(|f| (f.id, f)).collect(); - let strings = profile.string_table; - - // Resolve stacks as function names, and sum sample values per stack. Also reverse the stack, - // since inferno expects it bottom-up. - let mut stacks: HashMap, i64> = HashMap::new(); - for sample in profile.sample { - let mut stack = Vec::with_capacity(sample.location_id.len()); - for location in sample.location_id.into_iter().rev() { - let Some(location) = locations.get(&location) else { - bail!("missing location {location}"); - }; - for line in location.line.iter().rev() { - let Some(function) = functions.get(&line.function_id) else { - bail!("missing function {}", line.function_id); - }; - let Some(name) = strings.get(function.name as usize) else { - bail!("missing string {}", function.name); - }; - stack.push(name.as_str()); - } - } - let Some(&value) = sample.value.first() else { - bail!("missing value"); - }; - *stacks.entry(stack).or_default() += value; - } - - // Construct stack lines for inferno. - let lines = stacks - .into_iter() - .map(|(stack, value)| (stack.into_iter().join(";"), value)) - .map(|(stack, value)| format!("{stack} {value}")) - .sorted() - .collect_vec(); - - // Construct the flamegraph. 
- let mut bytes = Vec::new(); - let lines = lines.iter().map(|line| line.as_str()); - inferno::flamegraph::from_lines(opts, lines, &mut bytes)?; - Ok(bytes) -} diff --git a/libs/http-utils/src/request.rs b/libs/http-utils/src/request.rs index 7ea71685ec..9024a90a82 100644 --- a/libs/http-utils/src/request.rs +++ b/libs/http-utils/src/request.rs @@ -1,10 +1,13 @@ use core::fmt; -use std::{borrow::Cow, str::FromStr}; +use std::borrow::Cow; +use std::str::FromStr; + +use anyhow::anyhow; +use hyper::body::HttpBody; +use hyper::{Body, Request}; +use routerify::ext::RequestExt; use super::error::ApiError; -use anyhow::anyhow; -use hyper::{body::HttpBody, Body, Request}; -use routerify::ext::RequestExt; pub fn get_request_param<'a>( request: &'a Request, diff --git a/libs/http-utils/src/server.rs b/libs/http-utils/src/server.rs new file mode 100644 index 0000000000..33e4915e99 --- /dev/null +++ b/libs/http-utils/src/server.rs @@ -0,0 +1,155 @@ +use std::{error::Error, sync::Arc}; + +use futures::StreamExt; +use futures::stream::FuturesUnordered; +use hyper0::Body; +use hyper0::server::conn::Http; +use routerify::{RequestService, RequestServiceBuilder}; +use tokio::io::{AsyncRead, AsyncWrite}; +use tokio_rustls::TlsAcceptor; +use tokio_util::sync::CancellationToken; +use tracing::{error, info}; + +use crate::error::ApiError; + +/// A simple HTTP server over hyper library. +/// You may want to use it instead of [`hyper0::server::Server`] because: +/// 1. hyper0's Server was removed from hyper v1. +/// It's recommended to replace hyepr0's Server with a manual loop, which is done here. +/// 2. hyper0's Server doesn't support TLS out of the box, and there is no way +/// to support it efficiently with the Accept trait that hyper0's Server uses. +/// That's one of the reasons why it was removed from v1. +/// +pub struct Server { + request_service: Arc>, + listener: tokio::net::TcpListener, + tls_acceptor: Option, +} + +impl Server { + pub fn new( + request_service: Arc>, + listener: std::net::TcpListener, + tls_acceptor: Option, + ) -> anyhow::Result { + // Note: caller of from_std is responsible for setting nonblocking mode. + listener.set_nonblocking(true)?; + let listener = tokio::net::TcpListener::from_std(listener)?; + + Ok(Self { + request_service, + listener, + tls_acceptor, + }) + } + + pub async fn serve(self, cancel: CancellationToken) -> anyhow::Result<()> { + fn suppress_io_error(err: &std::io::Error) -> bool { + use std::io::ErrorKind::*; + matches!(err.kind(), ConnectionReset | ConnectionAborted | BrokenPipe) + } + fn suppress_hyper_error(err: &hyper0::Error) -> bool { + if err.is_incomplete_message() || err.is_closed() || err.is_timeout() { + return true; + } + if let Some(inner) = err.source() { + if let Some(io) = inner.downcast_ref::() { + return suppress_io_error(io); + } + } + false + } + + let mut connections = FuturesUnordered::new(); + loop { + tokio::select! { + stream = self.listener.accept() => { + let (tcp_stream, remote_addr) = match stream { + Ok(stream) => stream, + Err(err) => { + if !suppress_io_error(&err) { + info!("Failed to accept TCP connection: {err:#}"); + } + continue; + } + }; + + let service = self.request_service.build(remote_addr); + let tls_acceptor = self.tls_acceptor.clone(); + let cancel = cancel.clone(); + + connections.push(tokio::spawn( + async move { + match tls_acceptor { + Some(tls_acceptor) => { + // Handle HTTPS connection. + let tls_stream = tokio::select! 
{ + tls_stream = tls_acceptor.accept(tcp_stream) => tls_stream, + _ = cancel.cancelled() => return, + }; + let tls_stream = match tls_stream { + Ok(tls_stream) => tls_stream, + Err(err) => { + if !suppress_io_error(&err) { + info!("Failed to accept TLS connection: {err:#}"); + } + return; + } + }; + if let Err(err) = Self::serve_connection(tls_stream, service, cancel).await { + if !suppress_hyper_error(&err) { + info!("Failed to serve HTTPS connection: {err:#}"); + } + } + } + None => { + // Handle HTTP connection. + if let Err(err) = Self::serve_connection(tcp_stream, service, cancel).await { + if !suppress_hyper_error(&err) { + info!("Failed to serve HTTP connection: {err:#}"); + } + } + } + }; + })); + } + Some(conn) = connections.next() => { + if let Err(err) = conn { + error!("Connection panicked: {err:#}"); + } + } + _ = cancel.cancelled() => { + // Wait for graceful shutdown of all connections. + while let Some(conn) = connections.next().await { + if let Err(err) = conn { + error!("Connection panicked: {err:#}"); + } + } + break; + } + } + } + Ok(()) + } + + /// Serves HTTP connection with graceful shutdown. + async fn serve_connection( + io: I, + service: RequestService, + cancel: CancellationToken, + ) -> Result<(), hyper0::Error> + where + I: AsyncRead + AsyncWrite + Unpin + Send + 'static, + { + let mut conn = Http::new().serve_connection(io, service).with_upgrades(); + + tokio::select! { + res = &mut conn => res, + _ = cancel.cancelled() => { + Pin::new(&mut conn).graceful_shutdown(); + // Note: connection should still be awaited for graceful shutdown to complete. + conn.await + } + } + } +} diff --git a/libs/metrics/src/hll.rs b/libs/metrics/src/hll.rs index 723916a742..93f6a2b7cc 100644 --- a/libs/metrics/src/hll.rs +++ b/libs/metrics/src/hll.rs @@ -6,17 +6,15 @@ //! Probabilistic cardinality estimators, such as the HyperLogLog algorithm, //! use significantly less memory than this, but can only approximate the cardinality. -use std::{ - hash::{BuildHasher, BuildHasherDefault, Hash}, - sync::atomic::AtomicU8, -}; +use std::hash::{BuildHasher, BuildHasherDefault, Hash}; +use std::sync::atomic::AtomicU8; -use measured::{ - label::{LabelGroupVisitor, LabelName, LabelValue, LabelVisitor}, - metric::{counter::CounterState, name::MetricNameEncoder, Metric, MetricType, MetricVec}, - text::TextEncoder, - LabelGroup, -}; +use measured::LabelGroup; +use measured::label::{LabelGroupVisitor, LabelName, LabelValue, LabelVisitor}; +use measured::metric::counter::CounterState; +use measured::metric::name::MetricNameEncoder; +use measured::metric::{Metric, MetricType, MetricVec}; +use measured::text::TextEncoder; use twox_hash::xxh3; /// Create an [`HyperLogLogVec`] and registers to default registry. @@ -27,9 +25,7 @@ macro_rules! register_hll_vec { $crate::register(Box::new(hll_vec.clone())).map(|_| hll_vec) }}; - ($N:literal, $NAME:expr, $HELP:expr, $LABELS_NAMES:expr $(,)?) => {{ - $crate::register_hll_vec!($N, $crate::opts!($NAME, $HELP), $LABELS_NAMES) - }}; + ($N:literal, $NAME:expr, $HELP:expr, $LABELS_NAMES:expr $(,)?) => {{ $crate::register_hll_vec!($N, $crate::opts!($NAME, $HELP), $LABELS_NAMES) }}; } /// Create an [`HyperLogLog`] and registers to default registry. @@ -40,9 +36,7 @@ macro_rules! register_hll { $crate::register(Box::new(hll.clone())).map(|_| hll) }}; - ($N:literal, $NAME:expr, $HELP:expr $(,)?) => {{ - $crate::register_hll!($N, $crate::opts!($NAME, $HELP)) - }}; + ($N:literal, $NAME:expr, $HELP:expr $(,)?) 
=> {{ $crate::register_hll!($N, $crate::opts!($NAME, $HELP)) }}; } /// HLL is a probabilistic cardinality measure. @@ -195,8 +189,10 @@ impl measured::metric::MetricEncoding); diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs index 0f6c2a0937..4df8d7bc51 100644 --- a/libs/metrics/src/lib.rs +++ b/libs/metrics/src/lib.rs @@ -4,38 +4,26 @@ //! a default registry. #![deny(clippy::undocumented_unsafe_blocks)] -use measured::{ - label::{LabelGroupSet, LabelGroupVisitor, LabelName, NoLabels}, - metric::{ - counter::CounterState, - gauge::GaugeState, - group::Encoding, - name::{MetricName, MetricNameEncoder}, - MetricEncoding, MetricFamilyEncoding, - }, - FixedCardinalityLabel, LabelGroup, MetricGroup, -}; +use measured::label::{LabelGroupSet, LabelGroupVisitor, LabelName, NoLabels}; +use measured::metric::counter::CounterState; +use measured::metric::gauge::GaugeState; +use measured::metric::group::Encoding; +use measured::metric::name::{MetricName, MetricNameEncoder}; +use measured::metric::{MetricEncoding, MetricFamilyEncoding}; +use measured::{FixedCardinalityLabel, LabelGroup, MetricGroup}; use once_cell::sync::Lazy; +use prometheus::Registry; use prometheus::core::{ Atomic, AtomicU64, Collector, GenericCounter, GenericCounterVec, GenericGauge, GenericGaugeVec, }; pub use prometheus::local::LocalHistogram; -pub use prometheus::opts; -pub use prometheus::register; -pub use prometheus::Error; -use prometheus::Registry; -pub use prometheus::{core, default_registry, proto}; -pub use prometheus::{exponential_buckets, linear_buckets}; -pub use prometheus::{register_counter_vec, Counter, CounterVec}; -pub use prometheus::{register_gauge, Gauge}; -pub use prometheus::{register_gauge_vec, GaugeVec}; -pub use prometheus::{register_histogram, Histogram}; -pub use prometheus::{register_histogram_vec, HistogramVec}; -pub use prometheus::{register_int_counter, IntCounter}; -pub use prometheus::{register_int_counter_vec, IntCounterVec}; -pub use prometheus::{register_int_gauge, IntGauge}; -pub use prometheus::{register_int_gauge_vec, IntGaugeVec}; -pub use prometheus::{Encoder, TextEncoder}; +pub use prometheus::{ + Counter, CounterVec, Encoder, Error, Gauge, GaugeVec, Histogram, HistogramVec, IntCounter, + IntCounterVec, IntGauge, IntGaugeVec, TextEncoder, core, default_registry, exponential_buckets, + linear_buckets, opts, proto, register, register_counter_vec, register_gauge, + register_gauge_vec, register_histogram, register_histogram_vec, register_int_counter, + register_int_counter_vec, register_int_gauge, register_int_gauge_vec, +}; pub mod launch_timestamp; mod wrappers; diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 039cc1319e..ce7de1e0c7 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -35,6 +35,7 @@ pub struct NodeMetadata { pub postgres_port: u16, pub http_host: String, pub http_port: u16, + pub https_port: Option, // Deployment tools may write fields to the metadata file beyond what we // use in this type: this type intentionally only names fields that require. 
@@ -57,6 +58,9 @@ pub struct ConfigToml { // types mapped 1:1 into the runtime PageServerConfig type pub listen_pg_addr: String, pub listen_http_addr: String, + pub listen_https_addr: Option, + pub ssl_key_file: Utf8PathBuf, + pub ssl_cert_file: Utf8PathBuf, pub availability_zone: Option, #[serde(with = "humantime_serde")] pub wait_lsn_timeout: Duration, @@ -123,6 +127,10 @@ pub struct ConfigToml { pub enable_read_path_debugging: Option, #[serde(skip_serializing_if = "Option::is_none")] pub validate_wal_contiguity: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub load_previous_heatmap: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub generate_unarchival_heatmap: Option, } #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] @@ -417,6 +425,9 @@ pub mod defaults { pub const DEFAULT_WAL_RECEIVER_PROTOCOL: utils::postgres_client::PostgresClientProtocol = utils::postgres_client::PostgresClientProtocol::Vanilla; + + pub const DEFAULT_SSL_KEY_FILE: &str = "server.key"; + pub const DEFAULT_SSL_CERT_FILE: &str = "server.crt"; } impl Default for ConfigToml { @@ -426,6 +437,9 @@ impl Default for ConfigToml { Self { listen_pg_addr: (DEFAULT_PG_LISTEN_ADDR.to_string()), listen_http_addr: (DEFAULT_HTTP_LISTEN_ADDR.to_string()), + listen_https_addr: (None), + ssl_key_file: Utf8PathBuf::from(DEFAULT_SSL_KEY_FILE), + ssl_cert_file: Utf8PathBuf::from(DEFAULT_SSL_CERT_FILE), availability_zone: (None), wait_lsn_timeout: (humantime::parse_duration(DEFAULT_WAIT_LSN_TIMEOUT) .expect("cannot parse default wait lsn timeout")), @@ -523,6 +537,8 @@ impl Default for ConfigToml { None }, validate_wal_contiguity: None, + load_previous_heatmap: None, + generate_unarchival_heatmap: None, } } } diff --git a/libs/pageserver_api/src/config/tests.rs b/libs/pageserver_api/src/config/tests.rs index edeefc156e..9e61873273 100644 --- a/libs/pageserver_api/src/config/tests.rs +++ b/libs/pageserver_api/src/config/tests.rs @@ -16,6 +16,30 @@ fn test_node_metadata_v1_backward_compatibilty() { postgres_port: 23, http_host: "localhost".to_string(), http_port: 42, + https_port: None, + other: HashMap::new(), + } + ) +} + +#[test] +fn test_node_metadata_v2_backward_compatibilty() { + let v2 = serde_json::to_vec(&serde_json::json!({ + "host": "localhost", + "port": 23, + "http_host": "localhost", + "http_port": 42, + "https_port": 123, + })); + + assert_eq!( + serde_json::from_slice::(&v2.unwrap()).unwrap(), + NodeMetadata { + postgres_host: "localhost".to_string(), + postgres_port: 23, + http_host: "localhost".to_string(), + http_port: 42, + https_port: Some(123), other: HashMap::new(), } ) diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index 2cfe1a85f9..3cb62f9d18 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -182,20 +182,66 @@ pub struct TenantDescribeResponseShard { #[derive(Serialize, Deserialize, Debug)] pub struct TenantShardMigrateRequest { pub node_id: NodeId, + + /// Optionally, callers may specify the node they are migrating _from_, and the server will + /// reject the request if the shard is no longer attached there: this enables writing safer + /// clients that don't risk fighting with some other movement of the shard. 
#[serde(default)] - pub migration_config: Option, + pub origin_node_id: Option, + + #[serde(default)] + pub migration_config: MigrationConfig, } -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, PartialEq, Eq)] pub struct MigrationConfig { + /// If true, the migration will be executed even if it is to a location with a sub-optimal scheduling + /// score: this is usually not what you want, and if you use this then you'll also need to set the + /// tenant's scheduling policy to Essential or Pause to avoid the optimiser reverting your migration. + /// + /// Default: false + #[serde(default)] + pub override_scheduler: bool, + + /// If true, the migration will be done gracefully by creating a secondary location first and + /// waiting for it to warm up before cutting over. If false, if there is no existing secondary + /// location at the destination, the tenant will be migrated immediately. If the tenant's data + /// can't be downloaded within [`Self::secondary_warmup_timeout`], then the migration will go + /// ahead but run with a cold cache that can severely reduce performance until it warms up. + /// + /// When doing a graceful migration, the migration API returns as soon as it is started. + /// + /// Default: true + #[serde(default = "default_prewarm")] + pub prewarm: bool, + + /// For non-prewarm migrations which will immediately enter a cutover to the new node: how long to wait + /// overall for secondary warmup before cutting over #[serde(default)] #[serde(with = "humantime_serde")] pub secondary_warmup_timeout: Option, + /// For non-prewarm migrations which will immediately enter a cutover to the new node: how long to wait + /// within each secondary download poll call to pageserver. #[serde(default)] #[serde(with = "humantime_serde")] pub secondary_download_request_timeout: Option, } +fn default_prewarm() -> bool { + true +} + +impl Default for MigrationConfig { + fn default() -> Self { + Self { + override_scheduler: false, + prewarm: default_prewarm(), + secondary_warmup_timeout: None, + secondary_download_request_timeout: None, + } + } +} + #[derive(Serialize, Clone, Debug)] #[serde(into = "NodeAvailabilityWrapper")] pub enum NodeAvailability { @@ -443,6 +489,7 @@ pub struct SafekeeperDescribeResponse { pub host: String, pub port: i32, pub http_port: i32, + pub https_port: Option, pub availability_zone_id: String, pub scheduling_policy: SkSchedulingPolicy, } @@ -487,4 +534,43 @@ mod test { err ); } + + /// Check that a minimal migrate request with no config results in the expected default settings + #[test] + fn test_migrate_request_decode_defaults() { + let json = r#"{ + "node_id": 123 + }"#; + + let request: TenantShardMigrateRequest = serde_json::from_str(json).unwrap(); + assert_eq!(request.node_id, NodeId(123)); + assert_eq!(request.origin_node_id, None); + assert!(!request.migration_config.override_scheduler); + assert!(request.migration_config.prewarm); + assert_eq!(request.migration_config.secondary_warmup_timeout, None); + assert_eq!( + request.migration_config.secondary_download_request_timeout, + None + ); + } + + /// Check that a partially specified migration config results in the expected default settings + #[test] + fn test_migration_config_decode_defaults() { + // Specify just one field of the config + let json = r#"{ + }"#; + + let config: MigrationConfig = serde_json::from_str(json).unwrap(); + + // Check each field's expected default value + assert!(!config.override_scheduler); + assert!(config.prewarm); + 
assert_eq!(config.secondary_warmup_timeout, None); + assert_eq!(config.secondary_download_request_timeout, None); + assert_eq!(config.secondary_warmup_timeout, None); + + // Consistency check that the Default impl agrees with our serde defaults + assert_eq!(MigrationConfig::default(), config); + } } diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index ea565e7769..13a9b5d89e 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -274,6 +274,31 @@ pub struct TimelineCreateRequest { pub mode: TimelineCreateRequestMode, } +/// Storage controller specific extensions to [`TimelineInfo`]. +#[derive(Serialize, Deserialize, Clone)] +pub struct TimelineCreateResponseStorcon { + #[serde(flatten)] + pub timeline_info: TimelineInfo, + + pub safekeepers: Option, +} + +/// Safekeepers as returned in timeline creation request to storcon or pushed to +/// cplane in the post migration hook. +#[derive(Serialize, Deserialize, Clone)] +pub struct SafekeepersInfo { + pub tenant_id: TenantId, + pub timeline_id: TimelineId, + pub generation: u32, + pub safekeepers: Vec, +} + +#[derive(Serialize, Deserialize, Clone)] +pub struct SafekeeperInfo { + pub id: NodeId, + pub hostname: String, +} + #[derive(Serialize, Deserialize, Clone)] #[serde(untagged)] pub enum TimelineCreateRequestMode { @@ -1146,6 +1171,15 @@ pub struct TimelineArchivalConfigRequest { pub state: TimelineArchivalState, } +#[derive(Serialize, Deserialize, PartialEq, Eq, Clone)] +pub struct TimelinePatchIndexPartRequest { + pub rel_size_migration: Option, + pub gc_compaction_last_completed_lsn: Option, + pub applied_gc_cutoff_lsn: Option, + #[serde(default)] + pub force_index_update: bool, +} + #[derive(Debug, Serialize, Deserialize, Clone)] pub struct TimelinesInfoAndOffloaded { pub timelines: Vec, @@ -1165,6 +1199,21 @@ pub struct OffloadedTimelineInfo { pub archived_at: chrono::DateTime, } +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub enum RelSizeMigration { + /// The tenant is using the old rel_size format. + /// Note that this enum is persisted as `Option` in the index part, so + /// `None` is the same as `Some(RelSizeMigration::Legacy)`. + Legacy, + /// The tenant is migrating to the new rel_size format. Both old and new rel_size format are + /// persisted in the index part. The read path will read both formats and merge them. + Migrating, + /// The tenant has migrated to the new rel_size format. Only the new rel_size format is persisted + /// in the index part, and the read path will not read the old format. + Migrated, +} + /// This represents the output of the "timeline_detail" and "timeline_list" API calls. #[derive(Debug, Serialize, Deserialize, Clone)] pub struct TimelineInfo { @@ -1243,7 +1292,11 @@ pub struct TimelineInfo { // Forward compatibility: a previous version of the pageserver will receive a JSON. serde::Deserialize does // not deny unknown fields by default so it's safe to set the field to some value, though it won't be // read. + /// Whether the timeline is archived. pub is_archived: Option, + + /// The status of the rel_size migration. + pub rel_size_migration: Option, } #[derive(Debug, Clone, Serialize, Deserialize)] diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index f74b229ac4..a0a891f0dc 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -4,28 +4,28 @@ //! is rather narrow, but we can extend it once required. 
#![deny(unsafe_code)] #![deny(clippy::undocumented_unsafe_blocks)] -use anyhow::Context; -use bytes::Bytes; -use serde::{Deserialize, Serialize}; +use std::future::Future; use std::io::ErrorKind; use std::net::SocketAddr; -use std::os::fd::AsRawFd; -use std::os::fd::RawFd; +use std::os::fd::{AsRawFd, RawFd}; use std::pin::Pin; +use std::str::FromStr; use std::sync::Arc; -use std::task::{ready, Poll}; +use std::task::{Poll, ready}; use std::{fmt, io}; -use std::{future::Future, str::FromStr}; -use tokio::io::{AsyncRead, AsyncWrite}; -use tokio_rustls::TlsAcceptor; -use tokio_util::sync::CancellationToken; -use tracing::{debug, error, info, trace, warn}; +use anyhow::Context; +use bytes::Bytes; use pq_proto::framed::{ConnectionError, Framed, FramedReader, FramedWriter}; use pq_proto::{ BeMessage, FeMessage, FeStartupPacket, ProtocolError, SQLSTATE_ADMIN_SHUTDOWN, SQLSTATE_INTERNAL_ERROR, SQLSTATE_SUCCESSFUL_COMPLETION, }; +use serde::{Deserialize, Serialize}; +use tokio::io::{AsyncRead, AsyncWrite}; +use tokio_rustls::TlsAcceptor; +use tokio_util::sync::CancellationToken; +use tracing::{debug, error, info, trace, warn}; /// An error, occurred during query processing: /// either during the connection ([`ConnectionError`]) or before/after it. @@ -746,7 +746,7 @@ impl PostgresBackend { match e { QueryError::Shutdown => return Ok(ProcessMsgResult::Break), QueryError::SimulatedConnectionError => { - return Err(QueryError::SimulatedConnectionError) + return Err(QueryError::SimulatedConnectionError); } err @ QueryError::Reconnect => { // Instruct the client to reconnect, stop processing messages @@ -1020,7 +1020,9 @@ fn log_query_error(query: &str, e: &QueryError) { } } QueryError::Disconnected(other_connection_error) => { - error!("query handler for '{query}' failed with connection error: {other_connection_error:?}") + error!( + "query handler for '{query}' failed with connection error: {other_connection_error:?}" + ) } QueryError::SimulatedConnectionError => { error!("query handler for query '{query}' failed due to a simulated connection error") diff --git a/libs/postgres_backend/tests/simple_select.rs b/libs/postgres_backend/tests/simple_select.rs index 3fcfbf4a03..907ef9eed3 100644 --- a/libs/postgres_backend/tests/simple_select.rs +++ b/libs/postgres_backend/tests/simple_select.rs @@ -1,10 +1,11 @@ +use std::io::Cursor; +use std::sync::Arc; + /// Test postgres_backend_async with tokio_postgres use once_cell::sync::Lazy; use postgres_backend::{AuthType, Handler, PostgresBackend, QueryError}; use pq_proto::{BeMessage, RowDescriptor}; use rustls::crypto::ring; -use std::io::Cursor; -use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite}; use tokio::net::{TcpListener, TcpStream}; use tokio_postgres::config::SslMode; diff --git a/libs/postgres_connection/src/lib.rs b/libs/postgres_connection/src/lib.rs index e3d31c6cfc..cd981b3729 100644 --- a/libs/postgres_connection/src/lib.rs +++ b/libs/postgres_connection/src/lib.rs @@ -1,9 +1,10 @@ #![deny(unsafe_code)] #![deny(clippy::undocumented_unsafe_blocks)] -use anyhow::{bail, Context}; -use itertools::Itertools; use std::borrow::Cow; use std::fmt; + +use anyhow::{Context, bail}; +use itertools::Itertools; use url::Host; /// Parses a string of format either `host:port` or `host` into a corresponding pair. 
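The `use` reshuffles in this file and in most of the hunks that follow share one pattern: `std` imports first, then external crates, then `crate`-local paths, with a blank line between groups and brace lists sorted uppercase-before-lowercase (e.g. `{Context, bail}`). This is the layout rustfmt produces with the 2024 style edition plus grouped imports; the exact rustfmt configuration is an assumption, but the resulting order is visible in every hunk. A minimal sketch of the target layout, with placeholder item names that are not part of this patch:

// Illustrative module only; the names below are placeholders.
use std::collections::HashMap; // group 1: standard library
use std::fmt;

use anyhow::{Context, bail}; // group 2: external crates
use serde::{Deserialize, Serialize};

use crate::parse_host_port; // group 3: crate-local imports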
@@ -29,9 +30,10 @@ pub fn parse_host_port>(host_port: S) -> Result<(Host, Option #[cfg(test)] mod tests_parse_host_port { - use crate::parse_host_port; use url::Host; + use crate::parse_host_port; + #[test] fn test_normal() { let (host, port) = parse_host_port("hello:123").unwrap(); @@ -207,10 +209,11 @@ impl fmt::Debug for PgConnectionConfig { #[cfg(test)] mod tests_pg_connection_config { - use crate::PgConnectionConfig; use once_cell::sync::Lazy; use url::Host; + use crate::PgConnectionConfig; + static STUB_HOST: Lazy = Lazy::new(|| Host::Domain("stub.host.example".to_owned())); #[test] diff --git a/libs/postgres_ffi/benches/waldecoder.rs b/libs/postgres_ffi/benches/waldecoder.rs index c8cf0d322a..2e1d62e452 100644 --- a/libs/postgres_ffi/benches/waldecoder.rs +++ b/libs/postgres_ffi/benches/waldecoder.rs @@ -1,6 +1,6 @@ use std::ffi::CStr; -use criterion::{criterion_group, criterion_main, Bencher, Criterion}; +use criterion::{Bencher, Criterion, criterion_group, criterion_main}; use postgres_ffi::v17::wal_generator::LogicalMessageGenerator; use postgres_ffi::v17::waldecoder_handler::WalStreamDecoderHandler; use postgres_ffi::waldecoder::WalStreamDecoder; diff --git a/libs/postgres_ffi/build.rs b/libs/postgres_ffi/build.rs index d3a85f2683..cdebd43f6f 100644 --- a/libs/postgres_ffi/build.rs +++ b/libs/postgres_ffi/build.rs @@ -4,7 +4,7 @@ use std::env; use std::path::PathBuf; use std::process::Command; -use anyhow::{anyhow, Context}; +use anyhow::{Context, anyhow}; use bindgen::callbacks::{DeriveInfo, ParseCallbacks}; #[derive(Debug)] diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs index 301bc2f16e..05d8de4c7a 100644 --- a/libs/postgres_ffi/src/lib.rs +++ b/libs/postgres_ffi/src/lib.rs @@ -21,7 +21,9 @@ macro_rules! postgres_ffi { pub mod bindings { // bindgen generates bindings for a lot of stuff we don't need #![allow(dead_code)] + #![allow(unsafe_op_in_unsafe_fn)] #![allow(clippy::undocumented_unsafe_blocks)] + #![allow(clippy::ptr_offset_with_cast)] use serde::{Deserialize, Serialize}; include!(concat!( @@ -43,8 +45,7 @@ macro_rules! postgres_ffi { pub const PG_MAJORVERSION: &str = stringify!($version); // Re-export some symbols from bindings - pub use bindings::DBState_DB_SHUTDOWNED; - pub use bindings::{CheckPoint, ControlFileData, XLogRecord}; + pub use bindings::{CheckPoint, ControlFileData, DBState_DB_SHUTDOWNED, XLogRecord}; pub const ZERO_CHECKPOINT: bytes::Bytes = bytes::Bytes::from_static(&[0u8; xlog_utils::SIZEOF_CHECKPOINT]); @@ -221,21 +222,17 @@ pub mod relfile_utils; pub mod walrecord; // Export some widely used datatypes that are unlikely to change across Postgres versions -pub use v14::bindings::RepOriginId; -pub use v14::bindings::{uint32, uint64, Oid}; -pub use v14::bindings::{BlockNumber, OffsetNumber}; -pub use v14::bindings::{MultiXactId, TransactionId}; -pub use v14::bindings::{TimeLineID, TimestampTz, XLogRecPtr, XLogSegNo}; - +pub use v14::bindings::{ + BlockNumber, CheckPoint, ControlFileData, MultiXactId, OffsetNumber, Oid, PageHeaderData, + RepOriginId, TimeLineID, TimestampTz, TransactionId, XLogRecPtr, XLogRecord, XLogSegNo, uint32, + uint64, +}; // Likewise for these, although the assumption that these don't change is a little more iffy. 
pub use v14::bindings::{MultiXactOffset, MultiXactStatus}; -pub use v14::bindings::{PageHeaderData, XLogRecord}; pub use v14::xlog_utils::{ XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD, }; -pub use v14::bindings::{CheckPoint, ControlFileData}; - // from pg_config.h. These can be changed with configure options --with-blocksize=BLOCKSIZE and // --with-segsize=SEGSIZE, but assume the defaults for now. pub const BLCKSZ: u16 = 8192; @@ -246,13 +243,11 @@ pub const WAL_SEGMENT_SIZE: usize = 16 * 1024 * 1024; pub const MAX_SEND_SIZE: usize = XLOG_BLCKSZ * 16; // Export some version independent functions that are used outside of this mod -pub use v14::xlog_utils::encode_logical_message; -pub use v14::xlog_utils::get_current_timestamp; -pub use v14::xlog_utils::to_pg_timestamp; -pub use v14::xlog_utils::try_from_pg_timestamp; -pub use v14::xlog_utils::XLogFileName; - pub use v14::bindings::DBState_DB_SHUTDOWNED; +pub use v14::xlog_utils::{ + XLogFileName, encode_logical_message, get_current_timestamp, to_pg_timestamp, + try_from_pg_timestamp, +}; pub fn bkpimage_is_compressed(bimg_info: u8, version: u32) -> bool { dispatch_pgversion!(version, pgv::bindings::bkpimg_is_compressed(bimg_info)) @@ -355,8 +350,9 @@ pub fn fsm_logical_to_physical(addr: BlockNumber) -> BlockNumber { } pub mod waldecoder { - use bytes::{Buf, Bytes, BytesMut}; use std::num::NonZeroU32; + + use bytes::{Buf, Bytes, BytesMut}; use thiserror::Error; use utils::lsn::Lsn; @@ -400,6 +396,14 @@ pub mod waldecoder { self.lsn + self.inputbuf.remaining() as u64 } + /// Returns the LSN up to which the WAL decoder has processed. + /// + /// If [`Self::poll_decode`] returned a record, then this will return + /// the end LSN of said record. + pub fn lsn(&self) -> Lsn { + self.lsn + } + pub fn feed_bytes(&mut self, buf: &[u8]) { self.inputbuf.extend_from_slice(buf); } diff --git a/libs/postgres_ffi/src/pg_constants.rs b/libs/postgres_ffi/src/pg_constants.rs index e343473d77..b0bdd8a8da 100644 --- a/libs/postgres_ffi/src/pg_constants.rs +++ b/libs/postgres_ffi/src/pg_constants.rs @@ -9,8 +9,7 @@ //! comments on them. //! -use crate::PageHeaderData; -use crate::BLCKSZ; +use crate::{BLCKSZ, PageHeaderData}; // // From pg_tablespace_d.h diff --git a/libs/postgres_ffi/src/walrecord.rs b/libs/postgres_ffi/src/walrecord.rs index fce37e2fdd..1ccf4590a9 100644 --- a/libs/postgres_ffi/src/walrecord.rs +++ b/libs/postgres_ffi/src/walrecord.rs @@ -3,18 +3,16 @@ //! //! 
TODO: Generate separate types for each supported PG version -use crate::pg_constants; -use crate::XLogRecord; -use crate::{ - BlockNumber, MultiXactId, MultiXactOffset, MultiXactStatus, Oid, RepOriginId, TimestampTz, - TransactionId, -}; -use crate::{BLCKSZ, XLOG_SIZE_OF_XLOG_RECORD}; use bytes::{Buf, Bytes}; use serde::{Deserialize, Serialize}; use utils::bin_ser::DeserializeError; use utils::lsn::Lsn; +use crate::{ + BLCKSZ, BlockNumber, MultiXactId, MultiXactOffset, MultiXactStatus, Oid, RepOriginId, + TimestampTz, TransactionId, XLOG_SIZE_OF_XLOG_RECORD, XLogRecord, pg_constants, +}; + #[repr(C)] #[derive(Clone, Debug, Serialize, Deserialize)] pub struct XlMultiXactCreate { @@ -508,9 +506,10 @@ pub fn decode_wal_record( } pub mod v14 { - use crate::{OffsetNumber, TransactionId}; use bytes::{Buf, Bytes}; + use crate::{OffsetNumber, TransactionId}; + #[repr(C)] #[derive(Debug)] pub struct XlHeapInsert { @@ -678,9 +677,10 @@ pub mod v15 { } pub mod v16 { + use bytes::{Buf, Bytes}; + pub use super::v14::{XlHeapInsert, XlHeapLockUpdated, XlHeapMultiInsert, XlParameterChange}; use crate::{OffsetNumber, TransactionId}; - use bytes::{Buf, Bytes}; pub struct XlHeapDelete { pub xmax: TransactionId, @@ -746,9 +746,10 @@ pub mod v16 { /* Since PG16, we have the Neon RMGR (RM_NEON_ID) to manage Neon-flavored WAL. */ pub mod rm_neon { - use crate::{OffsetNumber, TransactionId}; use bytes::{Buf, Bytes}; + use crate::{OffsetNumber, TransactionId}; + #[repr(C)] #[derive(Debug)] pub struct XlNeonHeapInsert { @@ -858,14 +859,14 @@ pub mod v16 { } pub mod v17 { - pub use super::v14::XlHeapLockUpdated; - pub use crate::{TimeLineID, TimestampTz}; use bytes::{Buf, Bytes}; - pub use super::v16::rm_neon; + pub use super::v14::XlHeapLockUpdated; pub use super::v16::{ XlHeapDelete, XlHeapInsert, XlHeapLock, XlHeapMultiInsert, XlHeapUpdate, XlParameterChange, + rm_neon, }; + pub use crate::{TimeLineID, TimestampTz}; #[repr(C)] #[derive(Debug)] diff --git a/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs b/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs index 41afcea6c2..6151ce34ac 100644 --- a/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs +++ b/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs @@ -1,7 +1,9 @@ +use std::path::PathBuf; +use std::str::FromStr; + use anyhow::*; -use clap::{value_parser, Arg, ArgMatches, Command}; +use clap::{Arg, ArgMatches, Command, value_parser}; use postgres::Client; -use std::{path::PathBuf, str::FromStr}; use wal_craft::*; fn main() -> Result<()> { diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs index 77dff4ac99..ca9530faef 100644 --- a/libs/postgres_ffi/wal_craft/src/lib.rs +++ b/libs/postgres_ffi/wal_craft/src/lib.rs @@ -1,17 +1,18 @@ -use anyhow::{bail, ensure}; -use camino_tempfile::{tempdir, Utf8TempDir}; -use log::*; -use postgres::types::PgLsn; -use postgres::Client; -use postgres_ffi::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ}; -use postgres_ffi::{ - XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD, -}; use std::ffi::OsStr; use std::path::{Path, PathBuf}; use std::process::Command; use std::time::{Duration, Instant}; +use anyhow::{bail, ensure}; +use camino_tempfile::{Utf8TempDir, tempdir}; +use log::*; +use postgres::Client; +use postgres::types::PgLsn; +use postgres_ffi::{ + WAL_SEGMENT_SIZE, XLOG_BLCKSZ, XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, + XLOG_SIZE_OF_XLOG_SHORT_PHD, +}; + macro_rules! 
xlog_utils_test { ($version:ident) => { #[path = "."] diff --git a/libs/pq_proto/src/framed.rs b/libs/pq_proto/src/framed.rs index ccbb90e384..8e216d0f44 100644 --- a/libs/pq_proto/src/framed.rs +++ b/libs/pq_proto/src/framed.rs @@ -10,11 +10,10 @@ //! calls. //! //! [Box]: https://docs.rs/futures-util/0.3.26/src/futures_util/lock/bilock.rs.html#107 +use std::future::Future; +use std::io::{self, ErrorKind}; + use bytes::{Buf, BytesMut}; -use std::{ - future::Future, - io::{self, ErrorKind}, -}; use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt, ReadHalf, WriteHalf}; use crate::{BeMessage, FeMessage, FeStartupPacket, ProtocolError}; diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index f99128b76a..e435ffbf7e 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -5,14 +5,15 @@ pub mod framed; +use std::borrow::Cow; +use std::{fmt, io, str}; + use byteorder::{BigEndian, ReadBytesExt}; use bytes::{Buf, BufMut, Bytes, BytesMut}; use itertools::Itertools; -use serde::{Deserialize, Serialize}; -use std::{borrow::Cow, fmt, io, str}; - // re-export for use in utils pageserver_feedback.rs pub use postgres_protocol::PG_EPOCH; +use serde::{Deserialize, Serialize}; pub type Oid = u32; pub type SystemId = u64; @@ -206,8 +207,8 @@ use rand::distributions::{Distribution, Standard}; impl Distribution for Standard { fn sample(&self, rng: &mut R) -> CancelKeyData { CancelKeyData { - backend_pid: rng.gen(), - cancel_key: rng.gen(), + backend_pid: rng.r#gen(), + cancel_key: rng.r#gen(), } } } @@ -1035,7 +1036,7 @@ impl BeMessage<'_> { buf.put_u8(b'd'); write_body(buf, |buf| { buf.put_u8(b'0'); // matches INTERPRETED_WAL_RECORD_TAG in postgres-protocol - // dependency + // dependency buf.put_u64(rec.streaming_lsn); buf.put_u64(rec.commit_lsn); buf.put_slice(rec.data); diff --git a/libs/proxy/tokio-postgres2/src/cancel_query.rs b/libs/proxy/tokio-postgres2/src/cancel_query.rs index b65fb571e6..0bdad0b554 100644 --- a/libs/proxy/tokio-postgres2/src/cancel_query.rs +++ b/libs/proxy/tokio-postgres2/src/cancel_query.rs @@ -34,8 +34,13 @@ where .make_tls_connect(hostname) .map_err(|e| Error::tls(e.into()))?; - let socket = - connect_socket::connect_socket(&config.host, config.port, config.connect_timeout).await?; + let socket = connect_socket::connect_socket( + config.host_addr, + &config.host, + config.port, + config.connect_timeout, + ) + .await?; cancel_query_raw::cancel_query_raw(socket, ssl_mode, tls, process_id, secret_key).await } diff --git a/libs/proxy/tokio-postgres2/src/client.rs b/libs/proxy/tokio-postgres2/src/client.rs index 39b1db75da..c70cb598de 100644 --- a/libs/proxy/tokio-postgres2/src/client.rs +++ b/libs/proxy/tokio-postgres2/src/client.rs @@ -1,5 +1,6 @@ use std::collections::HashMap; use std::fmt; +use std::net::IpAddr; use std::sync::Arc; use std::task::{Context, Poll}; use std::time::Duration; @@ -137,6 +138,7 @@ impl InnerClient { #[derive(Clone, Serialize, Deserialize)] pub struct SocketConfig { + pub host_addr: Option, pub host: Host, pub port: u16, pub connect_timeout: Option, diff --git a/libs/proxy/tokio-postgres2/src/config.rs b/libs/proxy/tokio-postgres2/src/config.rs index 4c25491b67..978d348741 100644 --- a/libs/proxy/tokio-postgres2/src/config.rs +++ b/libs/proxy/tokio-postgres2/src/config.rs @@ -1,5 +1,6 @@ //! Connection configuration. +use std::net::IpAddr; use std::time::Duration; use std::{fmt, str}; @@ -65,6 +66,7 @@ pub enum AuthKeys { /// Connection configuration. 
#[derive(Clone, PartialEq, Eq)] pub struct Config { + pub(crate) host_addr: Option, pub(crate) host: Host, pub(crate) port: u16, @@ -83,6 +85,7 @@ impl Config { /// Creates a new configuration. pub fn new(host: String, port: u16) -> Config { Config { + host_addr: None, host: Host::Tcp(host), port, password: None, @@ -163,6 +166,15 @@ impl Config { self } + pub fn set_host_addr(&mut self, addr: IpAddr) -> &mut Config { + self.host_addr = Some(addr); + self + } + + pub fn get_host_addr(&self) -> Option { + self.host_addr + } + /// Sets the SSL configuration. /// /// Defaults to `prefer`. diff --git a/libs/proxy/tokio-postgres2/src/connect.rs b/libs/proxy/tokio-postgres2/src/connect.rs index d2bd0dfbcd..7c3a358bba 100644 --- a/libs/proxy/tokio-postgres2/src/connect.rs +++ b/libs/proxy/tokio-postgres2/src/connect.rs @@ -1,3 +1,5 @@ +use std::net::IpAddr; + use postgres_protocol2::message::backend::Message; use tokio::net::TcpStream; use tokio::sync::mpsc; @@ -25,13 +27,14 @@ where .make_tls_connect(hostname) .map_err(|e| Error::tls(e.into()))?; - match connect_once(&config.host, config.port, tls, config).await { + match connect_once(config.host_addr, &config.host, config.port, tls, config).await { Ok((client, connection)) => Ok((client, connection)), Err(e) => Err(e), } } async fn connect_once( + host_addr: Option, host: &Host, port: u16, tls: T, @@ -40,7 +43,7 @@ async fn connect_once( where T: TlsConnect, { - let socket = connect_socket(host, port, config.connect_timeout).await?; + let socket = connect_socket(host_addr, host, port, config.connect_timeout).await?; let RawConnection { stream, parameters, @@ -50,6 +53,7 @@ where } = connect_raw(socket, tls, config).await?; let socket_config = SocketConfig { + host_addr, host: host.clone(), port, connect_timeout: config.connect_timeout, diff --git a/libs/proxy/tokio-postgres2/src/connect_socket.rs b/libs/proxy/tokio-postgres2/src/connect_socket.rs index 15411f7ef3..8c7d300451 100644 --- a/libs/proxy/tokio-postgres2/src/connect_socket.rs +++ b/libs/proxy/tokio-postgres2/src/connect_socket.rs @@ -1,5 +1,6 @@ use std::future::Future; use std::io; +use std::net::{IpAddr, SocketAddr}; use std::time::Duration; use tokio::net::{self, TcpStream}; @@ -9,15 +10,20 @@ use crate::Error; use crate::config::Host; pub(crate) async fn connect_socket( + host_addr: Option, host: &Host, port: u16, connect_timeout: Option, ) -> Result { match host { Host::Tcp(host) => { - let addrs = net::lookup_host((&**host, port)) - .await - .map_err(Error::connect)?; + let addrs = match host_addr { + Some(addr) => vec![SocketAddr::new(addr, port)], + None => net::lookup_host((&**host, port)) + .await + .map_err(Error::connect)? 
+ .collect(), + }; let mut last_err = None; diff --git a/libs/safekeeper_api/src/membership.rs b/libs/safekeeper_api/src/membership.rs index 4ccdd491b0..3d4d17096e 100644 --- a/libs/safekeeper_api/src/membership.rs +++ b/libs/safekeeper_api/src/membership.rs @@ -85,12 +85,12 @@ impl MemberSet { Ok(MemberSet { m: members }) } - pub fn contains(&self, sk: &SafekeeperId) -> bool { - self.m.iter().any(|m| m.id == sk.id) + pub fn contains(&self, sk: NodeId) -> bool { + self.m.iter().any(|m| m.id == sk) } pub fn add(&mut self, sk: SafekeeperId) -> anyhow::Result<()> { - if self.contains(&sk) { + if self.contains(sk.id) { bail!(format!( "sk {} is already member of the set {}", sk.id, self @@ -130,6 +130,19 @@ impl Configuration { new_members: None, } } + + pub fn new(members: MemberSet) -> Self { + Configuration { + generation: INITIAL_GENERATION, + members, + new_members: None, + } + } + + /// Is `sk_id` member of the configuration? + pub fn contains(&self, sk_id: NodeId) -> bool { + self.members.contains(sk_id) || self.new_members.as_ref().is_some_and(|m| m.contains(sk_id)) + } } impl Display for Configuration { diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs index 2f2aeaa429..10c703395f 100644 --- a/libs/safekeeper_api/src/models.rs +++ b/libs/safekeeper_api/src/models.rs @@ -18,7 +18,7 @@ pub struct SafekeeperStatus { pub id: NodeId, } -#[derive(Serialize, Deserialize)] +#[derive(Serialize, Deserialize, Clone)] pub struct TimelineCreateRequest { pub tenant_id: TenantId, pub timeline_id: TimelineId, @@ -283,7 +283,7 @@ pub struct SafekeeperUtilization { } /// pull_timeline request body. -#[derive(Debug, Deserialize, Serialize)] +#[derive(Debug, Clone, Deserialize, Serialize)] pub struct PullTimelineRequest { pub tenant_id: TenantId, pub timeline_id: TimelineId, diff --git a/libs/tenant_size_model/src/calculation.rs b/libs/tenant_size_model/src/calculation.rs index be00562219..d54876ba2c 100644 --- a/libs/tenant_size_model/src/calculation.rs +++ b/libs/tenant_size_model/src/calculation.rs @@ -130,11 +130,7 @@ impl StorageModel { break; } } - if possible { - Some(snapshot_later) - } else { - None - } + if possible { Some(snapshot_later) } else { None } } else { None }; diff --git a/libs/tenant_size_model/src/svg.rs b/libs/tenant_size_model/src/svg.rs index 25ebb1c3d8..a3bc937f52 100644 --- a/libs/tenant_size_model/src/svg.rs +++ b/libs/tenant_size_model/src/svg.rs @@ -76,7 +76,10 @@ pub fn draw_svg( let mut result = String::new(); - writeln!(result, "")?; + writeln!( + result, + "" + )?; draw.calculate_svg_layout(); diff --git a/libs/tracing-utils/src/http.rs b/libs/tracing-utils/src/http.rs index 2168beee88..8560d0718c 100644 --- a/libs/tracing-utils/src/http.rs +++ b/libs/tracing-utils/src/http.rs @@ -1,8 +1,8 @@ //! 
Tracing wrapper for Hyper HTTP server -use hyper0::HeaderMap; -use hyper0::{Body, Request, Response}; use std::future::Future; + +use hyper0::{Body, HeaderMap, Request, Response}; use tracing::Instrument; use tracing_opentelemetry::OpenTelemetrySpanExt; diff --git a/libs/tracing-utils/src/lib.rs b/libs/tracing-utils/src/lib.rs index 818d759eac..72f94d61e4 100644 --- a/libs/tracing-utils/src/lib.rs +++ b/libs/tracing-utils/src/lib.rs @@ -36,11 +36,11 @@ pub mod http; -use opentelemetry::trace::TracerProvider; use opentelemetry::KeyValue; +use opentelemetry::trace::TracerProvider; use tracing::Subscriber; -use tracing_subscriber::registry::LookupSpan; use tracing_subscriber::Layer; +use tracing_subscriber::registry::LookupSpan; /// Set up OpenTelemetry exporter, using configuration from environment variables. /// diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 5020d82adf..ac44300a51 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -15,7 +15,6 @@ arc-swap.workspace = true sentry.workspace = true async-compression.workspace = true anyhow.workspace = true -backtrace.workspace = true bincode.workspace = true bytes.workspace = true camino.workspace = true diff --git a/libs/utils/benches/benchmarks.rs b/libs/utils/benches/benchmarks.rs index 348e27ac47..12c620ec87 100644 --- a/libs/utils/benches/benchmarks.rs +++ b/libs/utils/benches/benchmarks.rs @@ -1,6 +1,6 @@ use std::time::Duration; -use criterion::{criterion_group, criterion_main, Bencher, Criterion}; +use criterion::{Bencher, Criterion, criterion_group, criterion_main}; use pprof::criterion::{Output, PProfProfiler}; use utils::id; use utils::logging::log_slow; diff --git a/libs/utils/src/auth.rs b/libs/utils/src/auth.rs index 4bfd0ab055..cc5b0b1d13 100644 --- a/libs/utils/src/auth.rs +++ b/libs/utils/src/auth.rs @@ -1,12 +1,15 @@ // For details about authentication see docs/authentication.md -use arc_swap::ArcSwap; -use std::{borrow::Cow, fmt::Display, fs, sync::Arc}; +use std::borrow::Cow; +use std::fmt::Display; +use std::fs; +use std::sync::Arc; use anyhow::Result; +use arc_swap::ArcSwap; use camino::Utf8Path; use jsonwebtoken::{ - decode, encode, Algorithm, DecodingKey, EncodingKey, Header, TokenData, Validation, + Algorithm, DecodingKey, EncodingKey, Header, TokenData, Validation, decode, encode, }; use serde::{Deserialize, Serialize}; @@ -129,7 +132,9 @@ impl JwtAuth { anyhow::bail!("path is neither a directory or a file") }; if decoding_keys.is_empty() { - anyhow::bail!("Configured for JWT auth with zero decoding keys. All JWT gated requests would be rejected."); + anyhow::bail!( + "Configured for JWT auth with zero decoding keys. All JWT gated requests would be rejected." 
+ ); } Ok(Self::new(decoding_keys)) } @@ -175,9 +180,10 @@ pub fn encode_from_key_file(claims: &Claims, key_data: &[u8]) -> Result #[cfg(test)] mod tests { - use super::*; use std::str::FromStr; + use super::*; + // Generated with: // // openssl genpkey -algorithm ed25519 -out ed25519-priv.pem @@ -215,7 +221,9 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH let encoded_eddsa = "eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJpYXQiOjE2Nzg0NDI0Nzl9.rNheBnluMJNgXzSTTJoTNIGy4P_qe0JUHl_nVEGuDCTgHOThPVr552EnmKccrCKquPeW3c2YUk0Y9Oh4KyASAw"; // Check it can be validated with the public key - let auth = JwtAuth::new(vec![DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519).unwrap()]); + let auth = JwtAuth::new(vec![ + DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519).unwrap(), + ]); let claims_from_token = auth.decode(encoded_eddsa).unwrap().claims; assert_eq!(claims_from_token, expected_claims); } @@ -230,7 +238,9 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH let encoded = encode_from_key_file(&claims, TEST_PRIV_KEY_ED25519).unwrap(); // decode it back - let auth = JwtAuth::new(vec![DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519).unwrap()]); + let auth = JwtAuth::new(vec![ + DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519).unwrap(), + ]); let decoded = auth.decode(&encoded).unwrap(); assert_eq!(decoded.claims, claims); diff --git a/libs/utils/src/backoff.rs b/libs/utils/src/backoff.rs index e6503fe377..4a4c4eedbb 100644 --- a/libs/utils/src/backoff.rs +++ b/libs/utils/src/backoff.rs @@ -121,10 +121,12 @@ where #[cfg(test)] mod tests { - use super::*; use std::io; + use tokio::sync::Mutex; + use super::*; + #[test] fn backoff_defaults_produce_growing_backoff_sequence() { let mut current_backoff_value = None; diff --git a/libs/utils/src/bin_ser.rs b/libs/utils/src/bin_ser.rs index 4d173d0726..2861baeee5 100644 --- a/libs/utils/src/bin_ser.rs +++ b/libs/utils/src/bin_ser.rs @@ -13,9 +13,11 @@ #![warn(missing_docs)] -use bincode::Options; -use serde::{de::DeserializeOwned, Serialize}; use std::io::{self, Read, Write}; + +use bincode::Options; +use serde::Serialize; +use serde::de::DeserializeOwned; use thiserror::Error; /// An error that occurred during a deserialize operation @@ -261,10 +263,12 @@ impl LeSer for T {} #[cfg(test)] mod tests { - use super::DeserializeError; - use serde::{Deserialize, Serialize}; use std::io::Cursor; + use serde::{Deserialize, Serialize}; + + use super::DeserializeError; + #[derive(Debug, PartialEq, Eq, Serialize, Deserialize)] pub struct ShortStruct { a: u8, diff --git a/libs/utils/src/circuit_breaker.rs b/libs/utils/src/circuit_breaker.rs index e1ddfd8650..46a6584d66 100644 --- a/libs/utils/src/circuit_breaker.rs +++ b/libs/utils/src/circuit_breaker.rs @@ -1,7 +1,5 @@ -use std::{ - fmt::Display, - time::{Duration, Instant}, -}; +use std::fmt::Display; +use std::time::{Duration, Instant}; use metrics::IntCounter; diff --git a/libs/utils/src/completion.rs b/libs/utils/src/completion.rs index f65c080ad4..973d754715 100644 --- a/libs/utils/src/completion.rs +++ b/libs/utils/src/completion.rs @@ -1,4 +1,5 @@ -use tokio_util::task::{task_tracker::TaskTrackerToken, TaskTracker}; +use tokio_util::task::TaskTracker; +use tokio_util::task::task_tracker::TaskTrackerToken; /// While a reference is kept around, the associated [`Barrier::wait`] will wait. 
/// diff --git a/libs/utils/src/crashsafe.rs b/libs/utils/src/crashsafe.rs index 5241ab183c..290a5b2686 100644 --- a/libs/utils/src/crashsafe.rs +++ b/libs/utils/src/crashsafe.rs @@ -1,9 +1,7 @@ +use std::borrow::Cow; +use std::fs::{self, File}; +use std::io::{self, Write}; use std::os::fd::AsRawFd; -use std::{ - borrow::Cow, - fs::{self, File}, - io::{self, Write}, -}; use camino::{Utf8Path, Utf8PathBuf}; diff --git a/libs/utils/src/env.rs b/libs/utils/src/env.rs index a1bcec9229..2a85f54a01 100644 --- a/libs/utils/src/env.rs +++ b/libs/utils/src/env.rs @@ -1,6 +1,7 @@ //! Wrapper around `std::env::var` for parsing environment variables. -use std::{fmt::Display, str::FromStr}; +use std::fmt::Display; +use std::str::FromStr; /// For types `V` that implement [`FromStr`]. pub fn var(varname: &str) -> Option diff --git a/libs/utils/src/failpoint_support.rs b/libs/utils/src/failpoint_support.rs index fc998ad9a9..ce014eb0ac 100644 --- a/libs/utils/src/failpoint_support.rs +++ b/libs/utils/src/failpoint_support.rs @@ -127,6 +127,9 @@ pub async fn failpoint_sleep_cancellable_helper( tracing::info!("failpoint {:?}: sleep done", name); } +/// Initialize the configured failpoints +/// +/// You must call this function before any concurrent threads do operations. pub fn init() -> fail::FailScenario<'static> { // The failpoints lib provides support for parsing the `FAILPOINTS` env var. // We want non-default behavior for `exit`, though, so, we handle it separately. @@ -134,7 +137,10 @@ pub fn init() -> fail::FailScenario<'static> { // Format for FAILPOINTS is "name=actions" separated by ";". let actions = std::env::var("FAILPOINTS"); if actions.is_ok() { - std::env::remove_var("FAILPOINTS"); + // SAFETY: this function should before any threads start and access env vars concurrently + unsafe { + std::env::remove_var("FAILPOINTS"); + } } else { // let the library handle non-utf8, or nothing for not present } diff --git a/libs/utils/src/fs_ext.rs b/libs/utils/src/fs_ext.rs index 8e53d2c79b..a406ab0378 100644 --- a/libs/utils/src/fs_ext.rs +++ b/libs/utils/src/fs_ext.rs @@ -58,9 +58,8 @@ where #[cfg(test)] mod test { - use crate::fs_ext::{is_directory_empty, list_dir}; - use super::ignore_absent_files; + use crate::fs_ext::{is_directory_empty, list_dir}; #[test] fn is_empty_dir() { diff --git a/libs/utils/src/fs_ext/rename_noreplace.rs b/libs/utils/src/fs_ext/rename_noreplace.rs index 897e30d7f1..fc6f794b57 100644 --- a/libs/utils/src/fs_ext/rename_noreplace.rs +++ b/libs/utils/src/fs_ext/rename_noreplace.rs @@ -38,7 +38,8 @@ pub fn rename_noreplace( #[cfg(test)] mod test { - use std::{fs, path::PathBuf}; + use std::fs; + use std::path::PathBuf; use super::*; diff --git a/libs/utils/src/generation.rs b/libs/utils/src/generation.rs index 44565ee6a2..b5e4a4644a 100644 --- a/libs/utils/src/generation.rs +++ b/libs/utils/src/generation.rs @@ -169,9 +169,9 @@ mod test { ]; let mut s = String::new(); - for (line, gen, expected) in examples { + for (line, gen_, expected) in examples { s.clear(); - write!(s, "{}", &gen.get_suffix()).expect("string grows"); + write!(s, "{}", &gen_.get_suffix()).expect("string grows"); assert_eq!(s, expected, "example on {line}"); } } diff --git a/libs/utils/src/guard_arc_swap.rs b/libs/utils/src/guard_arc_swap.rs index cec5202460..26cd640d3b 100644 --- a/libs/utils/src/guard_arc_swap.rs +++ b/libs/utils/src/guard_arc_swap.rs @@ -1,8 +1,9 @@ //! A wrapper around `ArcSwap` that ensures there is only one writer at a time and writes //! don't block reads. 
-use arc_swap::ArcSwap; use std::sync::Arc; + +use arc_swap::ArcSwap; use tokio::sync::TryLockError; pub struct GuardArcSwap { diff --git a/libs/utils/src/id.rs b/libs/utils/src/id.rs index eb91839504..6016c23a01 100644 --- a/libs/utils/src/id.rs +++ b/libs/utils/src/id.rs @@ -1,5 +1,6 @@ +use std::fmt; use std::num::ParseIntError; -use std::{fmt, str::FromStr}; +use std::str::FromStr; use anyhow::Context; use hex::FromHex; @@ -215,7 +216,7 @@ macro_rules! id_newtype { impl AsRef<[u8]> for $t { fn as_ref(&self) -> &[u8] { - &self.0 .0 + &self.0.0 } } @@ -367,9 +368,8 @@ impl FromStr for NodeId { mod tests { use serde_assert::{Deserializer, Serializer, Token, Tokens}; - use crate::bin_ser::BeSer; - use super::*; + use crate::bin_ser::BeSer; #[test] fn test_id_serde_non_human_readable() { diff --git a/libs/utils/src/leaky_bucket.rs b/libs/utils/src/leaky_bucket.rs index 0cc58738c0..2398f92766 100644 --- a/libs/utils/src/leaky_bucket.rs +++ b/libs/utils/src/leaky_bucket.rs @@ -21,15 +21,12 @@ //! //! Another explaination can be found here: -use std::{ - sync::{ - atomic::{AtomicU64, Ordering}, - Mutex, - }, - time::Duration, -}; +use std::sync::Mutex; +use std::sync::atomic::{AtomicU64, Ordering}; +use std::time::Duration; -use tokio::{sync::Notify, time::Instant}; +use tokio::sync::Notify; +use tokio::time::Instant; pub struct LeakyBucketConfig { /// This is the "time cost" of a single request unit. diff --git a/libs/utils/src/linux_socket_ioctl.rs b/libs/utils/src/linux_socket_ioctl.rs index 5ae0e86af8..766529838c 100644 --- a/libs/utils/src/linux_socket_ioctl.rs +++ b/libs/utils/src/linux_socket_ioctl.rs @@ -2,21 +2,23 @@ //! //! -use std::{ - io, - mem::MaybeUninit, - os::{fd::RawFd, raw::c_int}, -}; +use std::io; +use std::mem::MaybeUninit; +use std::os::fd::RawFd; +use std::os::raw::c_int; use nix::libc::{FIONREAD, TIOCOUTQ}; unsafe fn do_ioctl(socket_fd: RawFd, cmd: nix::libc::Ioctl) -> io::Result { let mut inq: MaybeUninit = MaybeUninit::uninit(); - let err = nix::libc::ioctl(socket_fd, cmd, inq.as_mut_ptr()); - if err == 0 { - Ok(inq.assume_init()) - } else { - Err(io::Error::last_os_error()) + // SAFETY: encapsulating fn is unsafe, we require `socket_fd` to be a valid file descriptor + unsafe { + let err = nix::libc::ioctl(socket_fd, cmd, inq.as_mut_ptr()); + if err == 0 { + Ok(inq.assume_init()) + } else { + Err(io::Error::last_os_error()) + } } } @@ -24,12 +26,14 @@ unsafe fn do_ioctl(socket_fd: RawFd, cmd: nix::libc::Ioctl) -> io::Result /// /// Caller must ensure that `socket_fd` is a valid TCP socket file descriptor. pub unsafe fn inq(socket_fd: RawFd) -> io::Result { - do_ioctl(socket_fd, FIONREAD) + // SAFETY: encapsulating fn is unsafe + unsafe { do_ioctl(socket_fd, FIONREAD) } } /// # Safety /// /// Caller must ensure that `socket_fd` is a valid TCP socket file descriptor. pub unsafe fn outq(socket_fd: RawFd) -> io::Result { - do_ioctl(socket_fd, TIOCOUTQ) + // SAFETY: encapsulating fn is unsafe + unsafe { do_ioctl(socket_fd, TIOCOUTQ) } } diff --git a/libs/utils/src/lock_file.rs b/libs/utils/src/lock_file.rs index 3a2ed3e830..6aeeeca021 100644 --- a/libs/utils/src/lock_file.rs +++ b/libs/utils/src/lock_file.rs @@ -6,16 +6,15 @@ //! there for potential pitfalls with lock files that are used //! to store PIDs (pidfiles). 
-use std::{ - fs, - io::{Read, Write}, - ops::Deref, - os::unix::prelude::AsRawFd, -}; +use std::fs; +use std::io::{Read, Write}; +use std::ops::Deref; +use std::os::unix::prelude::AsRawFd; use anyhow::Context; use camino::{Utf8Path, Utf8PathBuf}; -use nix::{errno::Errno::EAGAIN, fcntl}; +use nix::errno::Errno::EAGAIN; +use nix::fcntl; use crate::crashsafe; diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs index 2c36942f43..881f1e765d 100644 --- a/libs/utils/src/logging.rs +++ b/libs/utils/src/logging.rs @@ -273,7 +273,9 @@ fn log_panic_to_stderr( location: Option>, backtrace: &std::backtrace::Backtrace, ) { - eprintln!("panic while tracing is unconfigured: thread '{thread}' panicked at '{msg}', {location:?}\nStack backtrace:\n{backtrace}"); + eprintln!( + "panic while tracing is unconfigured: thread '{thread}' panicked at '{msg}', {location:?}\nStack backtrace:\n{backtrace}" + ); } struct PrettyLocation<'a, 'b>(&'a std::panic::Location<'b>); @@ -361,7 +363,8 @@ pub async fn log_slow(name: &str, threshold: Duration, f: impl Future>, extra_options: &[(&str, &str)], ) -> Option { - let dsn = env::var("SENTRY_DSN").ok()?; + let Ok(dsn) = env::var("SENTRY_DSN") else { + info!("not initializing Sentry, no SENTRY_DSN given"); + return None; + }; let environment = env::var("SENTRY_ENVIRONMENT").unwrap_or_else(|_| "development".into()); let guard = sentry::init(( dsn, sentry::ClientOptions { - release: release_name, - environment: Some(environment.into()), + release: release_name.clone(), + environment: Some(environment.clone().into()), ..Default::default() }, )); @@ -25,5 +29,19 @@ pub fn init_sentry( scope.set_extra(key, value.into()); } }); + + if let Some(dsn) = guard.dsn() { + info!( + "initialized Sentry for project {}, environment {}, release {} (using API {})", + dsn.project_id(), + environment, + release_name.unwrap_or(Cow::Borrowed("None")), + dsn.envelope_api_url(), + ); + } else { + // This should panic during sentry::init(), but we may as well cover it. 
+ error!("failed to initialize Sentry, invalid DSN"); + } + Some(guard) } diff --git a/libs/utils/src/seqwait.rs b/libs/utils/src/seqwait.rs index d99dc25769..3c4c7f882d 100644 --- a/libs/utils/src/seqwait.rs +++ b/libs/utils/src/seqwait.rs @@ -5,6 +5,7 @@ use std::collections::BinaryHeap; use std::mem; use std::sync::Mutex; use std::time::Duration; + use tokio::sync::watch::{self, channel}; use tokio::time::timeout; @@ -248,11 +249,7 @@ where let internal = self.internal.lock().unwrap(); let cnt = internal.current.cnt_value(); drop(internal); - if cnt >= num { - Ok(()) - } else { - Err(cnt) - } + if cnt >= num { Ok(()) } else { Err(cnt) } } /// Register and return a channel that will be notified when a number arrives, @@ -325,9 +322,10 @@ where #[cfg(test)] mod tests { - use super::*; use std::sync::Arc; + use super::*; + impl MonotonicCounter for i32 { fn cnt_advance(&mut self, val: i32) { assert!(*self <= val); diff --git a/libs/utils/src/serde_percent.rs b/libs/utils/src/serde_percent.rs index 36e874a161..ca1e7aa25b 100644 --- a/libs/utils/src/serde_percent.rs +++ b/libs/utils/src/serde_percent.rs @@ -12,11 +12,7 @@ pub struct Percent(#[serde(deserialize_with = "deserialize_pct_0_to_100")] u8); impl Percent { pub const fn new(pct: u8) -> Option { - if pct <= 100 { - Some(Percent(pct)) - } else { - None - } + if pct <= 100 { Some(Percent(pct)) } else { None } } pub fn get(&self) -> u8 { diff --git a/libs/utils/src/shard.rs b/libs/utils/src/shard.rs index d98284f969..c8c410a725 100644 --- a/libs/utils/src/shard.rs +++ b/libs/utils/src/shard.rs @@ -1,6 +1,7 @@ //! See `pageserver_api::shard` for description on sharding. -use std::{ops::RangeInclusive, str::FromStr}; +use std::ops::RangeInclusive; +use std::str::FromStr; use hex::FromHex; use serde::{Deserialize, Serialize}; @@ -59,11 +60,7 @@ impl ShardCount { /// This method returns the actual number of shards, i.e. if our internal value is /// zero, we return 1 (unsharded tenants have 1 shard). 
pub fn count(&self) -> u8 { - if self.0 > 0 { - self.0 - } else { - 1 - } + if self.0 > 0 { self.0 } else { 1 } } /// The literal internal value: this is **not** the number of shards in the diff --git a/libs/utils/src/signals.rs b/libs/utils/src/signals.rs index c37e9aea58..f2be1957c4 100644 --- a/libs/utils/src/signals.rs +++ b/libs/utils/src/signals.rs @@ -1,7 +1,7 @@ +pub use signal_hook::consts::TERM_SIGNALS; +pub use signal_hook::consts::signal::*; use signal_hook::iterator::Signals; -pub use signal_hook::consts::{signal::*, TERM_SIGNALS}; - pub enum Signal { Quit, Interrupt, diff --git a/libs/utils/src/simple_rcu.rs b/libs/utils/src/simple_rcu.rs index 6700f86e4a..fabdf9df46 100644 --- a/libs/utils/src/simple_rcu.rs +++ b/libs/utils/src/simple_rcu.rs @@ -44,8 +44,7 @@ #![warn(missing_docs)] use std::ops::Deref; -use std::sync::{Arc, Weak}; -use std::sync::{RwLock, RwLockWriteGuard}; +use std::sync::{Arc, RwLock, RwLockWriteGuard, Weak}; use tokio::sync::watch; @@ -219,10 +218,11 @@ impl RcuWaitList { #[cfg(test)] mod tests { - use super::*; use std::sync::Mutex; use std::time::Duration; + use super::*; + #[tokio::test] async fn two_writers() { let rcu = Rcu::new(1); diff --git a/libs/utils/src/sync/gate.rs b/libs/utils/src/sync/gate.rs index 0a1ed81621..93460785bf 100644 --- a/libs/utils/src/sync/gate.rs +++ b/libs/utils/src/sync/gate.rs @@ -1,10 +1,6 @@ -use std::{ - sync::{ - atomic::{AtomicBool, Ordering}, - Arc, - }, - time::Duration, -}; +use std::sync::Arc; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::time::Duration; /// Gates are a concurrency helper, primarily used for implementing safe shutdown. /// diff --git a/libs/utils/src/sync/heavier_once_cell.rs b/libs/utils/src/sync/heavier_once_cell.rs index 66c2065554..8f8401b35d 100644 --- a/libs/utils/src/sync/heavier_once_cell.rs +++ b/libs/utils/src/sync/heavier_once_cell.rs @@ -1,7 +1,6 @@ -use std::sync::{ - atomic::{AtomicUsize, Ordering}, - Arc, Mutex, MutexGuard, -}; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::{Arc, Mutex, MutexGuard}; + use tokio::sync::Semaphore; /// Custom design like [`tokio::sync::OnceCell`] but using [`OwnedSemaphorePermit`] instead of @@ -301,14 +300,13 @@ impl Drop for InitPermit { #[cfg(test)] mod tests { + use std::convert::Infallible; + use std::pin::{Pin, pin}; + use std::time::Duration; + use futures::Future; use super::*; - use std::{ - convert::Infallible, - pin::{pin, Pin}, - time::Duration, - }; #[tokio::test] async fn many_initializers() { diff --git a/libs/utils/src/sync/spsc_fold.rs b/libs/utils/src/sync/spsc_fold.rs index 0cab291d51..7dfbf40411 100644 --- a/libs/utils/src/sync/spsc_fold.rs +++ b/libs/utils/src/sync/spsc_fold.rs @@ -1,4 +1,5 @@ -use core::{future::poll_fn, task::Poll}; +use core::future::poll_fn; +use core::task::Poll; use std::sync::{Arc, Mutex}; use diatomic_waker::DiatomicWaker; diff --git a/libs/utils/src/tcp_listener.rs b/libs/utils/src/tcp_listener.rs index 6b35d3d63a..6a4a77127d 100644 --- a/libs/utils/src/tcp_listener.rs +++ b/libs/utils/src/tcp_listener.rs @@ -1,9 +1,8 @@ -use std::{ - io, - net::{TcpListener, ToSocketAddrs}, -}; +use std::io; +use std::net::{TcpListener, ToSocketAddrs}; -use nix::sys::socket::{setsockopt, sockopt::ReuseAddr}; +use nix::sys::socket::setsockopt; +use nix::sys::socket::sockopt::ReuseAddr; /// Bind a [`TcpListener`] to addr with `SO_REUSEADDR` set to true. 
pub fn bind(addr: A) -> io::Result { diff --git a/libs/utils/src/tracing_span_assert.rs b/libs/utils/src/tracing_span_assert.rs index add2fa7920..3d15e08400 100644 --- a/libs/utils/src/tracing_span_assert.rs +++ b/libs/utils/src/tracing_span_assert.rs @@ -172,16 +172,14 @@ fn tracing_subscriber_configured() -> bool { #[cfg(test)] mod tests { + use std::collections::HashSet; + use std::fmt::{self}; + use std::hash::{Hash, Hasher}; + use tracing_subscriber::prelude::*; use super::*; - use std::{ - collections::HashSet, - fmt::{self}, - hash::{Hash, Hasher}, - }; - struct MemoryIdentity<'a>(&'a dyn Extractor); impl MemoryIdentity<'_> { diff --git a/libs/utils/src/try_rcu.rs b/libs/utils/src/try_rcu.rs index 6b53ab1316..30540c27d0 100644 --- a/libs/utils/src/try_rcu.rs +++ b/libs/utils/src/try_rcu.rs @@ -44,10 +44,12 @@ where #[cfg(test)] mod tests { - use super::*; - use arc_swap::ArcSwap; use std::sync::Arc; + use arc_swap::ArcSwap; + + use super::*; + #[test] fn test_try_rcu_success() { let swap = ArcSwap::from(Arc::new(42)); diff --git a/libs/utils/src/vec_map.rs b/libs/utils/src/vec_map.rs index 1fe048c6f0..eded86af3e 100644 --- a/libs/utils/src/vec_map.rs +++ b/libs/utils/src/vec_map.rs @@ -1,4 +1,6 @@ -use std::{alloc::Layout, cmp::Ordering, ops::RangeBounds}; +use std::alloc::Layout; +use std::cmp::Ordering; +use std::ops::RangeBounds; #[derive(Clone, Copy, Debug, PartialEq, Eq)] pub enum VecMapOrdering { @@ -214,7 +216,8 @@ fn extract_key(entry: &(K, V)) -> &K { #[cfg(test)] mod tests { - use std::{collections::BTreeMap, ops::Bound}; + use std::collections::BTreeMap; + use std::ops::Bound; use super::{VecMap, VecMapOrdering}; diff --git a/libs/utils/src/zstd.rs b/libs/utils/src/zstd.rs index be2dcc00f5..96c2a83951 100644 --- a/libs/utils/src/zstd.rs +++ b/libs/utils/src/zstd.rs @@ -1,19 +1,14 @@ use std::io::SeekFrom; use anyhow::{Context, Result}; -use async_compression::{ - tokio::{bufread::ZstdDecoder, write::ZstdEncoder}, - zstd::CParameter, - Level, -}; +use async_compression::Level; +use async_compression::tokio::bufread::ZstdDecoder; +use async_compression::tokio::write::ZstdEncoder; +use async_compression::zstd::CParameter; use camino::Utf8Path; use nix::NixPath; -use tokio::{ - fs::{File, OpenOptions}, - io::AsyncBufRead, - io::AsyncSeekExt, - io::AsyncWriteExt, -}; +use tokio::fs::{File, OpenOptions}; +use tokio::io::{AsyncBufRead, AsyncSeekExt, AsyncWriteExt}; use tokio_tar::{Archive, Builder, HeaderMode}; use walkdir::WalkDir; diff --git a/libs/utils/tests/bin_ser_test.rs b/libs/utils/tests/bin_ser_test.rs index b995b61b78..e0c8cdde00 100644 --- a/libs/utils/tests/bin_ser_test.rs +++ b/libs/utils/tests/bin_ser_test.rs @@ -1,7 +1,8 @@ +use std::io::Read; + use bytes::{Buf, BytesMut}; use hex_literal::hex; use serde::Deserialize; -use std::io::Read; use utils::bin_ser::LeSer; #[derive(Debug, PartialEq, Eq, Deserialize)] diff --git a/libs/wal_decoder/benches/bench_interpret_wal.rs b/libs/wal_decoder/benches/bench_interpret_wal.rs index 846904cf87..ed6ba4d267 100644 --- a/libs/wal_decoder/benches/bench_interpret_wal.rs +++ b/libs/wal_decoder/benches/bench_interpret_wal.rs @@ -1,23 +1,25 @@ -use anyhow::Context; -use criterion::{criterion_group, criterion_main, Criterion}; -use futures::{stream::FuturesUnordered, StreamExt}; -use pageserver_api::shard::{ShardIdentity, ShardStripeSize}; -use postgres_ffi::{waldecoder::WalStreamDecoder, MAX_SEND_SIZE, WAL_SEGMENT_SIZE}; -use pprof::criterion::{Output, PProfProfiler}; -use serde::Deserialize; -use std::{env, num::NonZeroUsize, 
sync::Arc}; +use std::env; +use std::num::NonZeroUsize; +use std::sync::Arc; +use anyhow::Context; use camino::{Utf8Path, Utf8PathBuf}; use camino_tempfile::Utf8TempDir; +use criterion::{Criterion, criterion_group, criterion_main}; +use futures::StreamExt; +use futures::stream::FuturesUnordered; +use pageserver_api::shard::{ShardIdentity, ShardStripeSize}; +use postgres_ffi::waldecoder::WalStreamDecoder; +use postgres_ffi::{MAX_SEND_SIZE, WAL_SEGMENT_SIZE}; +use pprof::criterion::{Output, PProfProfiler}; use remote_storage::{ DownloadOpts, GenericRemoteStorage, ListingMode, RemoteStorageConfig, RemoteStorageKind, S3Config, }; +use serde::Deserialize; use tokio_util::sync::CancellationToken; -use utils::{ - lsn::Lsn, - shard::{ShardCount, ShardNumber}, -}; +use utils::lsn::Lsn; +use utils::shard::{ShardCount, ShardNumber}; use wal_decoder::models::InterpretedWalRecord; const S3_BUCKET: &str = "neon-github-public-dev"; @@ -31,7 +33,7 @@ const METADATA_FILENAME: &str = "metadata.json"; static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; #[allow(non_upper_case_globals)] -#[export_name = "malloc_conf"] +#[unsafe(export_name = "malloc_conf")] pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0"; async fn create_s3_client() -> anyhow::Result> { diff --git a/libs/wal_decoder/src/decoder.rs b/libs/wal_decoder/src/decoder.rs index ebb38ceb52..cb0835e894 100644 --- a/libs/wal_decoder/src/decoder.rs +++ b/libs/wal_decoder/src/decoder.rs @@ -3,8 +3,6 @@ use std::collections::HashMap; -use crate::models::*; -use crate::serialized_batch::SerializedValueBatch; use bytes::{Buf, Bytes}; use pageserver_api::key::rel_block_to_key; use pageserver_api::reltag::{RelTag, SlruKind}; @@ -14,6 +12,9 @@ use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM; use postgres_ffi::walrecord::*; use utils::lsn::Lsn; +use crate::models::*; +use crate::serialized_batch::SerializedValueBatch; + impl InterpretedWalRecord { /// Decode and interpreted raw bytes which represent one Postgres WAL record. /// Data blocks which do not match any of the provided shard identities are filtered out. 
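The `#[export_name = "malloc_conf"]` → `#[unsafe(export_name = "malloc_conf")]` change in the benchmark above, like the `rng.r#gen()` rename earlier in the diff, follows from the Rust 2024 edition: `gen` becomes a reserved keyword, and attributes that can break link-time invariants (`no_mangle`, `export_name`, `link_section`) must be written in the `unsafe(...)` form. A minimal sketch of the attribute change, using throwaway symbol names rather than anything from this patch:

// 2021 edition spelling:
#[export_name = "example_symbol"]
pub static EXAMPLE_OLD: u8 = 0;

// 2024 edition spelling -- the attribute itself is marked unsafe because
// exporting an arbitrary symbol name can collide with other definitions:
#[unsafe(export_name = "example_symbol_2024")]
pub static EXAMPLE_NEW: u8 = 0;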
diff --git a/libs/wal_decoder/src/serialized_batch.rs b/libs/wal_decoder/src/serialized_batch.rs index d76f75f51f..b451d6d8e0 100644 --- a/libs/wal_decoder/src/serialized_batch.rs +++ b/libs/wal_decoder/src/serialized_batch.rs @@ -8,20 +8,18 @@ use std::collections::{BTreeSet, HashMap}; use bytes::{Bytes, BytesMut}; -use pageserver_api::key::rel_block_to_key; +use pageserver_api::key::{CompactKey, Key, rel_block_to_key}; use pageserver_api::keyspace::KeySpace; use pageserver_api::record::NeonWalRecord; use pageserver_api::reltag::RelTag; use pageserver_api::shard::ShardIdentity; -use pageserver_api::{key::CompactKey, value::Value}; +use pageserver_api::value::Value; use postgres_ffi::walrecord::{DecodedBkpBlock, DecodedWALRecord}; -use postgres_ffi::{page_is_new, page_set_lsn, pg_constants, BLCKSZ}; +use postgres_ffi::{BLCKSZ, page_is_new, page_set_lsn, pg_constants}; use serde::{Deserialize, Serialize}; use utils::bin_ser::BeSer; use utils::lsn::Lsn; -use pageserver_api::key::Key; - use crate::models::InterpretedWalRecord; static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]); @@ -515,10 +513,11 @@ impl SerializedValueBatch { let empty = self.raw.is_empty(); if cfg!(debug_assertions) && empty { - assert!(self - .metadata - .iter() - .all(|meta| matches!(meta, ValueMeta::Observed(_)))); + assert!( + self.metadata + .iter() + .all(|meta| matches!(meta, ValueMeta::Observed(_))) + ); } !empty diff --git a/libs/wal_decoder/src/wire_format.rs b/libs/wal_decoder/src/wire_format.rs index 52ed5c70b5..5a28128dd8 100644 --- a/libs/wal_decoder/src/wire_format.rs +++ b/libs/wal_decoder/src/wire_format.rs @@ -7,15 +7,12 @@ use utils::lsn::Lsn; use utils::postgres_client::{Compression, InterpretedFormat}; use crate::models::{ - FlushUncommittedRecords, InterpretedWalRecord, InterpretedWalRecords, MetadataRecord, + FlushUncommittedRecords, InterpretedWalRecord, InterpretedWalRecords, MetadataRecord, proto, }; - use crate::serialized_batch::{ ObservedValueMeta, SerializedValueBatch, SerializedValueMeta, ValueMeta, }; -use crate::models::proto; - #[derive(Debug, thiserror::Error)] pub enum ToWireFormatError { #[error("{0}")] @@ -83,8 +80,8 @@ impl ToWireFormat for InterpretedWalRecords { format: InterpretedFormat, compression: Option, ) -> Result { - use async_compression::tokio::write::ZstdEncoder; use async_compression::Level; + use async_compression::tokio::write::ZstdEncoder; let encode_res: Result = match format { InterpretedFormat::Bincode => { diff --git a/libs/walproposer/build.rs b/libs/walproposer/build.rs index 8d5b1ade35..530ceb1327 100644 --- a/libs/walproposer/build.rs +++ b/libs/walproposer/build.rs @@ -1,9 +1,11 @@ //! Links with walproposer, pgcommon, pgport and runs bindgen on walproposer.h //! to generate Rust bindings for it. 
-use std::{env, path::PathBuf, process::Command}; +use std::env; +use std::path::PathBuf; +use std::process::Command; -use anyhow::{anyhow, Context}; +use anyhow::{Context, anyhow}; const WALPROPOSER_PG_VERSION: &str = "v17"; diff --git a/libs/walproposer/src/api_bindings.rs b/libs/walproposer/src/api_bindings.rs index 2fbea3fe45..d660602149 100644 --- a/libs/walproposer/src/api_bindings.rs +++ b/libs/walproposer/src/api_bindings.rs @@ -3,27 +3,14 @@ #![allow(dead_code)] -use std::ffi::CStr; -use std::ffi::CString; +use std::ffi::{CStr, CString}; -use crate::bindings::uint32; -use crate::bindings::walproposer_api; -use crate::bindings::NeonWALReadResult; -use crate::bindings::PGAsyncReadResult; -use crate::bindings::PGAsyncWriteResult; -use crate::bindings::Safekeeper; -use crate::bindings::Size; -use crate::bindings::StringInfoData; -use crate::bindings::TimestampTz; -use crate::bindings::WalProposer; -use crate::bindings::WalProposerConnStatusType; -use crate::bindings::WalProposerConnectPollStatusType; -use crate::bindings::WalProposerExecStatusType; -use crate::bindings::WalproposerShmemState; -use crate::bindings::XLogRecPtr; -use crate::walproposer::ApiImpl; -use crate::walproposer::StreamingCallback; -use crate::walproposer::WaitResult; +use crate::bindings::{ + NeonWALReadResult, PGAsyncReadResult, PGAsyncWriteResult, Safekeeper, Size, StringInfoData, + TimestampTz, WalProposer, WalProposerConnStatusType, WalProposerConnectPollStatusType, + WalProposerExecStatusType, WalproposerShmemState, XLogRecPtr, uint32, walproposer_api, +}; +use crate::walproposer::{ApiImpl, StreamingCallback, WaitResult}; extern "C" fn get_shmem_state(wp: *mut WalProposer) -> *mut WalproposerShmemState { unsafe { diff --git a/libs/walproposer/src/walproposer.rs b/libs/walproposer/src/walproposer.rs index 60b606c64a..4e50c21fca 100644 --- a/libs/walproposer/src/walproposer.rs +++ b/libs/walproposer/src/walproposer.rs @@ -2,15 +2,15 @@ use std::ffi::CString; -use crate::{ - api_bindings::{create_api, take_vec_u8, Level}, - bindings::{ - NeonWALReadResult, Safekeeper, WalProposer, WalProposerBroadcast, WalProposerConfig, - WalProposerCreate, WalProposerFree, WalProposerPoll, WalProposerStart, - }, -}; use postgres_ffi::WAL_SEGMENT_SIZE; -use utils::{id::TenantTimelineId, lsn::Lsn}; +use utils::id::TenantTimelineId; +use utils::lsn::Lsn; + +use crate::api_bindings::{Level, create_api, take_vec_u8}; +use crate::bindings::{ + NeonWALReadResult, Safekeeper, WalProposer, WalProposerBroadcast, WalProposerConfig, + WalProposerCreate, WalProposerFree, WalProposerPoll, WalProposerStart, +}; /// Rust high-level wrapper for C walproposer API. Many methods are not required /// for simple cases, hence todo!() in default implementations. 
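As the module comment above notes, the wrapper trait leans on default methods that just `todo!()`, so a test or a simple caller only overrides the callbacks it actually exercises. A generic sketch of that pattern — the trait and method names here are illustrative, not the real `ApiImpl` surface:

trait Callbacks {
    // Default bodies panic if reached, so unimplemented hooks fail loudly.
    fn log(&self, _msg: &str) {
        todo!("override when the caller needs logging")
    }
    fn wait_event(&self, _timeout_ms: u64) -> u32 {
        todo!("override when the caller drives the event loop")
    }
}

struct NoopCallbacks;

impl Callbacks for NoopCallbacks {
    fn log(&self, msg: &str) {
        eprintln!("walproposer: {msg}");
    }
    // `wait_event` keeps the todo!() default and will panic if called.
}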
@@ -275,22 +275,17 @@ impl StreamingCallback { #[cfg(test)] mod tests { use core::panic; - use std::{ - cell::Cell, - ffi::CString, - sync::{atomic::AtomicUsize, mpsc::sync_channel}, - }; + use std::cell::{Cell, UnsafeCell}; + use std::ffi::CString; + use std::sync::atomic::AtomicUsize; + use std::sync::mpsc::sync_channel; - use std::cell::UnsafeCell; use utils::id::TenantTimelineId; - use crate::{ - api_bindings::Level, - bindings::{NeonWALReadResult, PG_VERSION_NUM}, - walproposer::Wrapper, - }; - use super::ApiImpl; + use crate::api_bindings::Level; + use crate::bindings::{NeonWALReadResult, PG_VERSION_NUM}; + use crate::walproposer::Wrapper; #[derive(Clone, Copy, Debug)] struct WaitEventsData { diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 7330856be4..a372be5044 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -48,6 +48,9 @@ pprof.workspace = true rand.workspace = true range-set-blaze = { version = "0.1.16", features = ["alloc"] } regex.workspace = true +rustls-pemfile.workspace = true +rustls-pki-types.workspace = true +rustls.workspace = true scopeguard.workspace = true send-future.workspace = true serde.workspace = true @@ -62,6 +65,7 @@ tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util" tokio-epoll-uring.workspace = true tokio-io-timeout.workspace = true tokio-postgres.workspace = true +tokio-rustls.workspace = true tokio-stream.workspace = true tokio-util.workspace = true toml_edit = { workspace = true, features = [ "serde" ] } @@ -98,6 +102,7 @@ criterion.workspace = true hex-literal.workspace = true tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] } indoc.workspace = true +uuid.workspace = true [[bench]] name = "bench_layer_map" @@ -115,6 +120,10 @@ harness = false name = "upload_queue" harness = false +[[bench]] +name = "bench_metrics" +harness = false + [[bin]] name = "test_helper_slow_client_reads" required-features = [ "testing" ] diff --git a/pageserver/benches/bench_layer_map.rs b/pageserver/benches/bench_layer_map.rs index e11af49449..e1444778b8 100644 --- a/pageserver/benches/bench_layer_map.rs +++ b/pageserver/benches/bench_layer_map.rs @@ -7,7 +7,6 @@ use std::time::Instant; use criterion::measurement::WallTime; use criterion::{BenchmarkGroup, Criterion, black_box, criterion_group, criterion_main}; -use pageserver::keyspace::{KeyPartitioning, KeySpace}; use pageserver::tenant::layer_map::LayerMap; use pageserver::tenant::storage_layer::{LayerName, PersistentLayerDesc}; use pageserver_api::key::Key; @@ -72,41 +71,6 @@ fn uniform_query_pattern(layer_map: &LayerMap) -> Vec<(Key, Lsn)> { .collect() } -// Construct a partitioning for testing get_difficulty map when we -// don't have an exact result of `collect_keyspace` to work with. -fn uniform_key_partitioning(layer_map: &LayerMap, _lsn: Lsn) -> KeyPartitioning { - let mut parts = Vec::new(); - - // We add a partition boundary at the start of each image layer, - // no matter what lsn range it covers. This is just the easiest - // thing to do. A better thing to do would be to get a real - // partitioning from some database. Even better, remove the need - // for key partitions by deciding where to create image layers - // directly based on a coverage-based difficulty map. 
- let mut keys: Vec<_> = layer_map - .iter_historic_layers() - .filter_map(|l| { - if l.is_incremental() { - None - } else { - let kr = l.get_key_range(); - Some(kr.start.next()) - } - }) - .collect(); - keys.sort(); - - let mut current_key = Key::from_hex("000000000000000000000000000000000000").unwrap(); - for key in keys { - parts.push(KeySpace { - ranges: vec![current_key..key], - }); - current_key = key; - } - - KeyPartitioning { parts } -} - // Benchmark using metadata extracted from our performance test environment, from // a project where we have run pgbench many timmes. The pgbench database was initialized // between each test run. @@ -148,41 +112,6 @@ fn bench_from_real_project(c: &mut Criterion) { // Choose uniformly distributed queries let queries: Vec<(Key, Lsn)> = uniform_query_pattern(&layer_map); - // Choose inputs for get_difficulty_map - let latest_lsn = layer_map - .iter_historic_layers() - .map(|l| l.get_lsn_range().end) - .max() - .unwrap(); - let partitioning = uniform_key_partitioning(&layer_map, latest_lsn); - - // Check correctness of get_difficulty_map - // TODO put this in a dedicated test outside of this mod - { - println!("running correctness check"); - - let now = Instant::now(); - let result_bruteforce = layer_map.get_difficulty_map_bruteforce(latest_lsn, &partitioning); - assert!(result_bruteforce.len() == partitioning.parts.len()); - println!("Finished bruteforce in {:?}", now.elapsed()); - - let now = Instant::now(); - let result_fast = layer_map.get_difficulty_map(latest_lsn, &partitioning, None); - assert!(result_fast.len() == partitioning.parts.len()); - println!("Finished fast in {:?}", now.elapsed()); - - // Assert results are equal. Manually iterate for easier debugging. - let zip = std::iter::zip( - &partitioning.parts, - std::iter::zip(result_bruteforce, result_fast), - ); - for (_part, (bruteforce, fast)) in zip { - assert_eq!(bruteforce, fast); - } - - println!("No issues found"); - } - // Define and name the benchmark function let mut group = c.benchmark_group("real_map"); group.bench_function("uniform_queries", |b| { @@ -192,11 +121,6 @@ fn bench_from_real_project(c: &mut Criterion) { } }); }); - group.bench_function("get_difficulty_map", |b| { - b.iter(|| { - layer_map.get_difficulty_map(latest_lsn, &partitioning, Some(3)); - }); - }); group.finish(); } diff --git a/pageserver/benches/bench_metrics.rs b/pageserver/benches/bench_metrics.rs new file mode 100644 index 0000000000..38025124e1 --- /dev/null +++ b/pageserver/benches/bench_metrics.rs @@ -0,0 +1,366 @@ +use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main}; +use utils::id::{TenantId, TimelineId}; + +// +// Demonstrates that repeat label values lookup is a multicore scalability bottleneck +// that is worth avoiding. 
+// +criterion_group!( + label_values, + label_values::bench_naive_usage, + label_values::bench_cache_label_values_lookup +); +mod label_values { + use super::*; + + pub fn bench_naive_usage(c: &mut Criterion) { + let mut g = c.benchmark_group("label_values__naive_usage"); + + for ntimelines in [1, 4, 8] { + g.bench_with_input( + BenchmarkId::new("ntimelines", ntimelines), + &ntimelines, + |b, ntimelines| { + b.iter_custom(|iters| { + let barrier = std::sync::Barrier::new(*ntimelines + 1); + + let timelines = (0..*ntimelines) + .map(|_| { + ( + TenantId::generate().to_string(), + "0000".to_string(), + TimelineId::generate().to_string(), + ) + }) + .collect::>(); + + let metric_vec = metrics::UIntGaugeVec::new( + metrics::opts!("testmetric", "testhelp"), + &["tenant_id", "shard_id", "timeline_id"], + ) + .unwrap(); + + std::thread::scope(|s| { + for (tenant_id, shard_id, timeline_id) in &timelines { + s.spawn(|| { + barrier.wait(); + for _ in 0..iters { + metric_vec + .with_label_values(&[tenant_id, shard_id, timeline_id]) + .inc(); + } + barrier.wait(); + }); + } + barrier.wait(); + let start = std::time::Instant::now(); + barrier.wait(); + start.elapsed() + }) + }) + }, + ); + } + g.finish(); + } + + pub fn bench_cache_label_values_lookup(c: &mut Criterion) { + let mut g = c.benchmark_group("label_values__cache_label_values_lookup"); + + for ntimelines in [1, 4, 8] { + g.bench_with_input( + BenchmarkId::new("ntimelines", ntimelines), + &ntimelines, + |b, ntimelines| { + b.iter_custom(|iters| { + let barrier = std::sync::Barrier::new(*ntimelines + 1); + + let timelines = (0..*ntimelines) + .map(|_| { + ( + TenantId::generate().to_string(), + "0000".to_string(), + TimelineId::generate().to_string(), + ) + }) + .collect::>(); + + let metric_vec = metrics::UIntGaugeVec::new( + metrics::opts!("testmetric", "testhelp"), + &["tenant_id", "shard_id", "timeline_id"], + ) + .unwrap(); + + std::thread::scope(|s| { + for (tenant_id, shard_id, timeline_id) in &timelines { + s.spawn(|| { + let metric = metric_vec.with_label_values(&[ + tenant_id, + shard_id, + timeline_id, + ]); + barrier.wait(); + for _ in 0..iters { + metric.inc(); + } + barrier.wait(); + }); + } + barrier.wait(); + let start = std::time::Instant::now(); + barrier.wait(); + start.elapsed() + }) + }) + }, + ); + } + g.finish(); + } +} + +// +// Demonstrates that even a single metric can be a scalability bottleneck +// if multiple threads in it concurrently but there's nothing we can do +// about it without changing the metrics framework to use e.g. sharded counte atomics. 
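The "sharded counter atomics" remedy mentioned above is not implemented anywhere in this diff and would have to live inside the metrics framework itself. Purely to illustrate the idea being referred to, a rough std-only sketch (all names invented):

```rust
use std::cell::Cell;
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};
use std::sync::atomic::{AtomicU64, Ordering};

const SHARDS: usize = 16;

/// Each shard gets its own cache line(s) so concurrent writers do not false-share.
#[repr(align(128))]
struct Shard(AtomicU64);

pub struct ShardedCounter {
    shards: [Shard; SHARDS],
}

impl ShardedCounter {
    pub fn new() -> Self {
        Self {
            shards: std::array::from_fn(|_| Shard(AtomicU64::new(0))),
        }
    }

    pub fn inc(&self) {
        self.shards[shard_index()].0.fetch_add(1, Ordering::Relaxed);
    }

    /// Reads are slower: they have to sum every shard.
    pub fn get(&self) -> u64 {
        self.shards.iter().map(|s| s.0.load(Ordering::Relaxed)).sum()
    }
}

/// Stable per-thread shard index, derived once from the thread id.
fn shard_index() -> usize {
    thread_local! {
        static IDX: Cell<Option<usize>> = Cell::new(None);
    }
    IDX.with(|cell| {
        cell.get().unwrap_or_else(|| {
            let mut h = DefaultHasher::new();
            std::thread::current().id().hash(&mut h);
            let idx = h.finish() as usize % SHARDS;
            cell.set(Some(idx));
            idx
        })
    })
}
```

Writers mostly touch distinct cache lines, so increments scale with core count; reads become more expensive because they sum all shards, which is the usual trade-off of this technique.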
+// +criterion_group!( + single_metric_multicore_scalability, + single_metric_multicore_scalability::bench, +); +mod single_metric_multicore_scalability { + use super::*; + + pub fn bench(c: &mut Criterion) { + let mut g = c.benchmark_group("single_metric_multicore_scalability"); + + for nthreads in [1, 4, 8] { + g.bench_with_input( + BenchmarkId::new("nthreads", nthreads), + &nthreads, + |b, nthreads| { + b.iter_custom(|iters| { + let barrier = std::sync::Barrier::new(*nthreads + 1); + + let metric = metrics::UIntGauge::new("testmetric", "testhelp").unwrap(); + + std::thread::scope(|s| { + for _ in 0..*nthreads { + s.spawn(|| { + barrier.wait(); + for _ in 0..iters { + metric.inc(); + } + barrier.wait(); + }); + } + barrier.wait(); + let start = std::time::Instant::now(); + barrier.wait(); + start.elapsed() + }) + }) + }, + ); + } + g.finish(); + } +} + +// +// Demonstrates that even if we cache label value, the propagation of such a cached metric value +// by Clone'ing it is a scalability bottleneck. +// The reason is that it's an Arc internally and thus there's contention on the reference count atomics. +// +// We can avoid that by having long-lived references per thread (= indirection). +// +criterion_group!( + propagation_of_cached_label_value, + propagation_of_cached_label_value::bench_naive, + propagation_of_cached_label_value::bench_long_lived_reference_per_thread, +); +mod propagation_of_cached_label_value { + use std::sync::Arc; + + use super::*; + + pub fn bench_naive(c: &mut Criterion) { + let mut g = c.benchmark_group("propagation_of_cached_label_value__naive"); + + for nthreads in [1, 4, 8] { + g.bench_with_input( + BenchmarkId::new("nthreads", nthreads), + &nthreads, + |b, nthreads| { + b.iter_custom(|iters| { + let barrier = std::sync::Barrier::new(*nthreads + 1); + + let metric = metrics::UIntGauge::new("testmetric", "testhelp").unwrap(); + + std::thread::scope(|s| { + for _ in 0..*nthreads { + s.spawn(|| { + barrier.wait(); + for _ in 0..iters { + // propagating the metric means we'd clone it into the child RequestContext + let propagated = metric.clone(); + // simulate some work + criterion::black_box(propagated); + } + barrier.wait(); + }); + } + barrier.wait(); + let start = std::time::Instant::now(); + barrier.wait(); + start.elapsed() + }) + }) + }, + ); + } + g.finish(); + } + + pub fn bench_long_lived_reference_per_thread(c: &mut Criterion) { + let mut g = + c.benchmark_group("propagation_of_cached_label_value__long_lived_reference_per_thread"); + + for nthreads in [1, 4, 8] { + g.bench_with_input( + BenchmarkId::new("nthreads", nthreads), + &nthreads, + |b, nthreads| { + b.iter_custom(|iters| { + let barrier = std::sync::Barrier::new(*nthreads + 1); + + let metric = metrics::UIntGauge::new("testmetric", "testhelp").unwrap(); + + std::thread::scope(|s| { + for _ in 0..*nthreads { + s.spawn(|| { + // This is the technique. + let this_threads_metric_reference = Arc::new(metric.clone()); + + barrier.wait(); + for _ in 0..iters { + // propagating the metric means we'd clone it into the child RequestContext + let propagated = Arc::clone(&this_threads_metric_reference); + // simulate some work (include the pointer chase!) 
+ criterion::black_box(&*propagated); + } + barrier.wait(); + }); + } + barrier.wait(); + let start = std::time::Instant::now(); + barrier.wait(); + start.elapsed() + }) + }) + }, + ); + } + } +} + +criterion_main!( + label_values, + single_metric_multicore_scalability, + propagation_of_cached_label_value +); + +/* +RUST_BACKTRACE=full cargo bench --bench bench_metrics -- --discard-baseline --noplot + +Results on an im4gn.2xlarge instance + +label_values__naive_usage/ntimelines/1 time: [178.71 ns 178.74 ns 178.76 ns] +label_values__naive_usage/ntimelines/4 time: [532.94 ns 539.59 ns 546.31 ns] +label_values__naive_usage/ntimelines/8 time: [1.1082 ยตs 1.1109 ยตs 1.1135 ยตs] +label_values__cache_label_values_lookup/ntimelines/1 time: [6.4116 ns 6.4119 ns 6.4123 ns] +label_values__cache_label_values_lookup/ntimelines/4 time: [6.3482 ns 6.3819 ns 6.4079 ns] +label_values__cache_label_values_lookup/ntimelines/8 time: [6.4213 ns 6.5279 ns 6.6293 ns] +single_metric_multicore_scalability/nthreads/1 time: [6.0102 ns 6.0104 ns 6.0106 ns] +single_metric_multicore_scalability/nthreads/4 time: [38.127 ns 38.275 ns 38.416 ns] +single_metric_multicore_scalability/nthreads/8 time: [73.698 ns 74.882 ns 75.864 ns] +propagation_of_cached_label_value__naive/nthreads/1 time: [14.424 ns 14.425 ns 14.426 ns] +propagation_of_cached_label_value__naive/nthreads/4 time: [100.71 ns 102.53 ns 104.35 ns] +propagation_of_cached_label_value__naive/nthreads/8 time: [211.50 ns 214.44 ns 216.87 ns] +propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/1 time: [14.135 ns 14.147 ns 14.160 ns] +propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/4 time: [14.243 ns 14.255 ns 14.268 ns] +propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/8 time: [14.470 ns 14.682 ns 14.895 ns] + +Results on an i3en.3xlarge instance + +label_values__naive_usage/ntimelines/1 time: [117.32 ns 117.53 ns 117.74 ns] +label_values__naive_usage/ntimelines/4 time: [736.58 ns 741.12 ns 745.61 ns] +label_values__naive_usage/ntimelines/8 time: [1.4513 ยตs 1.4596 ยตs 1.4665 ยตs] +label_values__cache_label_values_lookup/ntimelines/1 time: [8.0964 ns 8.0979 ns 8.0995 ns] +label_values__cache_label_values_lookup/ntimelines/4 time: [8.1620 ns 8.2912 ns 8.4491 ns] +label_values__cache_label_values_lookup/ntimelines/8 time: [14.148 ns 14.237 ns 14.324 ns] +single_metric_multicore_scalability/nthreads/1 time: [8.0993 ns 8.1013 ns 8.1046 ns] +single_metric_multicore_scalability/nthreads/4 time: [80.039 ns 80.672 ns 81.297 ns] +single_metric_multicore_scalability/nthreads/8 time: [153.58 ns 154.23 ns 154.90 ns] +propagation_of_cached_label_value__naive/nthreads/1 time: [13.924 ns 13.926 ns 13.928 ns] +propagation_of_cached_label_value__naive/nthreads/4 time: [143.66 ns 145.27 ns 146.59 ns] +propagation_of_cached_label_value__naive/nthreads/8 time: [296.51 ns 297.90 ns 299.30 ns] +propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/1 time: [14.013 ns 14.149 ns 14.308 ns] +propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/4 time: [14.311 ns 14.625 ns 14.984 ns] +propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/8 time: [25.981 ns 26.227 ns 26.476 ns] + +Results on an Standard L16s v3 (16 vcpus, 128 GiB memory) Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz + +label_values__naive_usage/ntimelines/1 time: [101.63 ns 101.84 ns 102.06 ns] +label_values__naive_usage/ntimelines/4 time: [417.55 ns 424.73 ns 432.63 ns] 
+label_values__naive_usage/ntimelines/8 time: [874.91 ns 889.51 ns 904.25 ns] +label_values__cache_label_values_lookup/ntimelines/1 time: [5.7724 ns 5.7760 ns 5.7804 ns] +label_values__cache_label_values_lookup/ntimelines/4 time: [7.8878 ns 7.9401 ns 8.0034 ns] +label_values__cache_label_values_lookup/ntimelines/8 time: [7.2621 ns 7.6354 ns 8.0337 ns] +single_metric_multicore_scalability/nthreads/1 time: [5.7710 ns 5.7744 ns 5.7785 ns] +single_metric_multicore_scalability/nthreads/4 time: [66.629 ns 66.994 ns 67.336 ns] +single_metric_multicore_scalability/nthreads/8 time: [130.85 ns 131.98 ns 132.91 ns] +propagation_of_cached_label_value__naive/nthreads/1 time: [11.540 ns 11.546 ns 11.553 ns] +propagation_of_cached_label_value__naive/nthreads/4 time: [131.22 ns 131.90 ns 132.56 ns] +propagation_of_cached_label_value__naive/nthreads/8 time: [260.99 ns 262.75 ns 264.26 ns] +propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/1 time: [11.544 ns 11.550 ns 11.557 ns] +propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/4 time: [11.568 ns 11.642 ns 11.763 ns] +propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/8 time: [13.416 ns 14.121 ns 14.886 ns + +Results on an M4 MAX MacBook Pro Total Number of Cores: 14 (10 performance and 4 efficiency) + +label_values__naive_usage/ntimelines/1 time: [52.711 ns 53.026 ns 53.381 ns] +label_values__naive_usage/ntimelines/4 time: [323.99 ns 330.40 ns 337.53 ns] +label_values__naive_usage/ntimelines/8 time: [1.1615 ยตs 1.1998 ยตs 1.2399 ยตs] +label_values__cache_label_values_lookup/ntimelines/1 time: [1.6635 ns 1.6715 ns 1.6809 ns] +label_values__cache_label_values_lookup/ntimelines/4 time: [1.7786 ns 1.7876 ns 1.8028 ns] +label_values__cache_label_values_lookup/ntimelines/8 time: [1.8195 ns 1.8371 ns 1.8665 ns] +single_metric_multicore_scalability/nthreads/1 time: [1.7764 ns 1.7909 ns 1.8079 ns] +single_metric_multicore_scalability/nthreads/4 time: [33.875 ns 34.868 ns 35.923 ns] +single_metric_multicore_scalability/nthreads/8 time: [226.85 ns 235.30 ns 244.18 ns] +propagation_of_cached_label_value__naive/nthreads/1 time: [3.4337 ns 3.4491 ns 3.4660 ns] +propagation_of_cached_label_value__naive/nthreads/4 time: [69.486 ns 71.937 ns 74.472 ns] +propagation_of_cached_label_value__naive/nthreads/8 time: [434.87 ns 456.47 ns 477.84 ns] +propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/1 time: [3.3767 ns 3.3974 ns 3.4220 ns] +propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/4 time: [3.6105 ns 4.2355 ns 5.1463 ns] +propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/8 time: [4.0889 ns 4.9714 ns 6.0779 ns] + +Results on a Hetzner AX102 AMD Ryzen 9 7950X3D 16-Core Processor + +label_values__naive_usage/ntimelines/1 time: [64.510 ns 64.559 ns 64.610 ns] +label_values__naive_usage/ntimelines/4 time: [309.71 ns 326.09 ns 342.32 ns] +label_values__naive_usage/ntimelines/8 time: [776.92 ns 819.35 ns 856.93 ns] +label_values__cache_label_values_lookup/ntimelines/1 time: [1.2855 ns 1.2943 ns 1.3021 ns] +label_values__cache_label_values_lookup/ntimelines/4 time: [1.3865 ns 1.4139 ns 1.4441 ns] +label_values__cache_label_values_lookup/ntimelines/8 time: [1.5311 ns 1.5669 ns 1.6046 ns] +single_metric_multicore_scalability/nthreads/1 time: [1.1927 ns 1.1981 ns 1.2049 ns] +single_metric_multicore_scalability/nthreads/4 time: [24.346 ns 25.439 ns 26.634 ns] +single_metric_multicore_scalability/nthreads/8 time: [58.666 ns 60.137 ns 
61.486 ns] +propagation_of_cached_label_value__naive/nthreads/1 time: [2.7067 ns 2.7238 ns 2.7402 ns] +propagation_of_cached_label_value__naive/nthreads/4 time: [62.723 ns 66.214 ns 69.787 ns] +propagation_of_cached_label_value__naive/nthreads/8 time: [164.24 ns 170.10 ns 175.68 ns] +propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/1 time: [2.2915 ns 2.2960 ns 2.3012 ns] +propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/4 time: [2.5726 ns 2.6158 ns 2.6624 ns] +propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/8 time: [2.7068 ns 2.8243 ns 2.9824 ns] + +*/ diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index bb0f64ca32..830fd8a531 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -1,17 +1,15 @@ -use std::{collections::HashMap, error::Error as _}; +use std::collections::HashMap; +use std::error::Error as _; use bytes::Bytes; -use reqwest::{IntoUrl, Method, StatusCode}; - use detach_ancestor::AncestorDetached; use http_utils::error::HttpErrorBody; -use pageserver_api::{models::*, shard::TenantShardId}; -use utils::{ - id::{TenantId, TimelineId}, - lsn::Lsn, -}; - +use pageserver_api::models::*; +use pageserver_api::shard::TenantShardId; pub use reqwest::Body as ReqwestBody; +use reqwest::{Certificate, IntoUrl, Method, StatusCode}; +use utils::id::{TenantId, TimelineId}; +use utils::lsn::Lsn; use crate::BlockUnblock; @@ -40,6 +38,9 @@ pub enum Error { #[error("Cancelled")] Cancelled, + + #[error("create client: {0}{}", .0.source().map(|e| format!(": {e}")).unwrap_or_default())] + CreateClient(reqwest::Error), } pub type Result = std::result::Result; @@ -71,8 +72,17 @@ pub enum ForceAwaitLogicalSize { } impl Client { - pub fn new(mgmt_api_endpoint: String, jwt: Option<&str>) -> Self { - Self::from_client(reqwest::Client::new(), mgmt_api_endpoint, jwt) + pub fn new( + mgmt_api_endpoint: String, + jwt: Option<&str>, + ssl_ca_cert: Option, + ) -> Result { + let mut http_client = reqwest::Client::builder(); + if let Some(ssl_ca_cert) = ssl_ca_cert { + http_client = http_client.add_root_certificate(ssl_ca_cert); + } + let http_client = http_client.build().map_err(Error::CreateClient)?; + Ok(Self::from_client(http_client, mgmt_api_endpoint, jwt)) } pub fn from_client( @@ -103,12 +113,10 @@ impl Client { debug_assert!(path.starts_with('/')); let uri = format!("{}{}", self.mgmt_api_endpoint, path); - let req = self.client.request(Method::GET, uri); - let req = if let Some(value) = &self.authorization_header { - req.header(reqwest::header::AUTHORIZATION, value) - } else { - req - }; + let mut req = self.client.request(Method::GET, uri); + if let Some(value) = &self.authorization_header { + req = req.header(reqwest::header::AUTHORIZATION, value); + } req.send().await.map_err(Error::ReceiveBody) } @@ -482,6 +490,7 @@ impl Client { tenant_shard_id: TenantShardId, timeline_id: TimelineId, concurrency: Option, + recurse: bool, ) -> Result<()> { let mut path = reqwest::Url::parse(&format!( "{}/v1/tenant/{}/timeline/{}/download_heatmap_layers", @@ -489,6 +498,9 @@ impl Client { )) .expect("Cannot build URL"); + path.query_pairs_mut() + .append_pair("recurse", &format!("{}", recurse)); + if let Some(concurrency) = concurrency { path.query_pairs_mut() .append_pair("concurrency", &format!("{}", concurrency)); diff --git a/pageserver/client/src/page_service.rs b/pageserver/client/src/page_service.rs index 47da83b0eb..ef35ac2f48 100644 --- 
a/pageserver/client/src/page_service.rs +++ b/pageserver/client/src/page_service.rs @@ -1,23 +1,16 @@ use std::sync::{Arc, Mutex}; -use futures::{ - stream::{SplitSink, SplitStream}, - SinkExt, StreamExt, -}; -use pageserver_api::{ - models::{ - PagestreamBeMessage, PagestreamFeMessage, PagestreamGetPageRequest, - PagestreamGetPageResponse, - }, - reltag::RelTag, +use futures::stream::{SplitSink, SplitStream}; +use futures::{SinkExt, StreamExt}; +use pageserver_api::models::{ + PagestreamBeMessage, PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse, }; +use pageserver_api::reltag::RelTag; use tokio::task::JoinHandle; use tokio_postgres::CopyOutStream; use tokio_util::sync::CancellationToken; -use utils::{ - id::{TenantId, TimelineId}, - lsn::Lsn, -}; +use utils::id::{TenantId, TimelineId}; +use utils::lsn::Lsn; pub struct Client { client: tokio_postgres::Client, diff --git a/pageserver/compaction/src/bin/compaction-simulator.rs b/pageserver/compaction/src/bin/compaction-simulator.rs index c308694ae1..dd35417333 100644 --- a/pageserver/compaction/src/bin/compaction-simulator.rs +++ b/pageserver/compaction/src/bin/compaction-simulator.rs @@ -1,11 +1,11 @@ -use clap::{Parser, Subcommand}; -use pageserver_compaction::helpers::PAGE_SZ; -use pageserver_compaction::simulator::MockTimeline; -use rand::Rng; use std::io::Write; use std::path::{Path, PathBuf}; use std::sync::OnceLock; +use clap::{Parser, Subcommand}; +use pageserver_compaction::helpers::PAGE_SZ; +use pageserver_compaction::simulator::MockTimeline; +use rand::Rng; use utils::project_git_version; project_git_version!(GIT_VERSION); @@ -157,8 +157,9 @@ async fn run_suite() -> anyhow::Result<()> { use std::fs::File; use std::io::Stdout; use std::sync::Mutex; -use tracing_subscriber::fmt::writer::EitherWriter; + use tracing_subscriber::fmt::MakeWriter; +use tracing_subscriber::fmt::writer::EitherWriter; static LOG_FILE: OnceLock>> = OnceLock::new(); fn get_log_output() -> &'static Mutex> { diff --git a/pageserver/compaction/src/compact_tiered.rs b/pageserver/compaction/src/compact_tiered.rs index 7779ffaf8b..75f43d7ff7 100644 --- a/pageserver/compaction/src/compact_tiered.rs +++ b/pageserver/compaction/src/compact_tiered.rs @@ -17,20 +17,19 @@ //! distance of image layers in LSN dimension is roughly equal to the logical //! database size. For example, if the logical database size is 10 GB, we would //! generate new image layers every 10 GB of WAL. -use futures::StreamExt; -use pageserver_api::shard::ShardIdentity; -use tracing::{debug, info}; - use std::collections::{HashSet, VecDeque}; use std::ops::Range; -use crate::helpers::{ - accum_key_values, keyspace_total_size, merge_delta_keys_buffered, overlaps_with, PAGE_SZ, -}; -use crate::interface::*; +use futures::StreamExt; +use pageserver_api::shard::ShardIdentity; +use tracing::{debug, info}; use utils::lsn::Lsn; +use crate::helpers::{ + PAGE_SZ, accum_key_values, keyspace_total_size, merge_delta_keys_buffered, overlaps_with, +}; use crate::identify_levels::identify_level; +use crate::interface::*; /// Main entry point to compaction. /// @@ -307,7 +306,7 @@ where let mut layer_ids: Vec = Vec::new(); for layer_id in &job.input_layers { let layer = &self.layers[layer_id.0].layer; - if let Some(dl) = self.executor.downcast_delta_layer(layer).await? { + if let Some(dl) = self.executor.downcast_delta_layer(layer, ctx).await? 
{ deltas.push(dl.clone()); layer_ids.push(*layer_id); } @@ -536,15 +535,16 @@ where let mut deltas: Vec = Vec::new(); for layer_id in &job.input_layers { let l = &self.layers[layer_id.0]; - if let Some(dl) = self.executor.downcast_delta_layer(&l.layer).await? { + if let Some(dl) = self.executor.downcast_delta_layer(&l.layer, ctx).await? { deltas.push(dl.clone()); } } // Open stream - let key_value_stream = - std::pin::pin!(merge_delta_keys_buffered::(deltas.as_slice(), ctx) + let key_value_stream = std::pin::pin!( + merge_delta_keys_buffered::(deltas.as_slice(), ctx) .await? - .map(Result::<_, anyhow::Error>::Ok)); + .map(Result::<_, anyhow::Error>::Ok) + ); let mut new_jobs = Vec::new(); // Slide a window through the keyspace diff --git a/pageserver/compaction/src/helpers.rs b/pageserver/compaction/src/helpers.rs index 7e4e3042b3..421802eef3 100644 --- a/pageserver/compaction/src/helpers.rs +++ b/pageserver/compaction/src/helpers.rs @@ -1,21 +1,21 @@ //! This file contains generic utility functions over the interface types, //! which could be handy for any compaction implementation. -use crate::interface::*; +use std::collections::{BinaryHeap, VecDeque}; +use std::fmt::Display; +use std::future::Future; +use std::ops::{DerefMut, Range}; +use std::pin::Pin; +use std::task::{Poll, ready}; use futures::future::BoxFuture; use futures::{Stream, StreamExt}; use itertools::Itertools; use pageserver_api::shard::ShardIdentity; use pin_project_lite::pin_project; -use std::collections::BinaryHeap; -use std::collections::VecDeque; -use std::fmt::Display; -use std::future::Future; -use std::ops::{DerefMut, Range}; -use std::pin::Pin; -use std::task::{ready, Poll}; use utils::lsn::Lsn; +use crate::interface::*; + pub const PAGE_SZ: u64 = 8192; pub fn keyspace_total_size( diff --git a/pageserver/compaction/src/identify_levels.rs b/pageserver/compaction/src/identify_levels.rs index e04bd15396..61575e3992 100644 --- a/pageserver/compaction/src/identify_levels.rs +++ b/pageserver/compaction/src/identify_levels.rs @@ -26,15 +26,15 @@ //! file size, the file will still be considered to be part of L0 at the next //! iteration. -use anyhow::bail; use std::collections::BTreeSet; use std::ops::Range; + +use anyhow::bail; +use tracing::{info, trace}; use utils::lsn::Lsn; use crate::interface::*; -use tracing::{info, trace}; - pub struct Level { pub lsn_range: Range, pub layers: Vec, @@ -60,7 +60,11 @@ where if l.lsn_range().start < end_lsn && l.lsn_range().end > end_lsn { // shouldn't happen. Indicates that the caller passed a bogus // end_lsn. - bail!("identify_level() called with end_lsn that does not partition the LSN space: end_lsn {} intersects with layer {}", end_lsn, l.short_id()); + bail!( + "identify_level() called with end_lsn that does not partition the LSN space: end_lsn {} intersects with layer {}", + end_lsn, + l.short_id() + ); } // include image layers sitting exacty at `end_lsn`. let is_image = !l.is_delta(); @@ -246,9 +250,10 @@ impl Level { #[cfg(test)] mod tests { + use std::sync::{Arc, Mutex}; + use super::*; use crate::simulator::{Key, MockDeltaLayer, MockImageLayer, MockLayer}; - use std::sync::{Arc, Mutex}; fn delta(key_range: Range, lsn_range: Range) -> MockLayer { MockLayer::Delta(Arc::new(MockDeltaLayer { diff --git a/pageserver/compaction/src/interface.rs b/pageserver/compaction/src/interface.rs index 8ed393a645..63fbc565cc 100644 --- a/pageserver/compaction/src/interface.rs +++ b/pageserver/compaction/src/interface.rs @@ -3,9 +3,12 @@ //! //! 
All the heavy lifting is done by the create_image and create_delta //! functions that the implementor provides. -use futures::Future; -use pageserver_api::{key::Key, keyspace::ShardedRange, shard::ShardIdentity}; use std::ops::Range; + +use futures::Future; +use pageserver_api::key::Key; +use pageserver_api::keyspace::ShardedRange; +use pageserver_api::shard::ShardIdentity; use utils::lsn::Lsn; /// Public interface. This is the main thing that the implementor needs to provide @@ -55,6 +58,7 @@ pub trait CompactionJobExecutor { fn downcast_delta_layer( &self, layer: &Self::Layer, + ctx: &Self::RequestContext, ) -> impl Future>> + Send; // ---- diff --git a/pageserver/compaction/src/simulator.rs b/pageserver/compaction/src/simulator.rs index 673b80c313..bf9f6f2658 100644 --- a/pageserver/compaction/src/simulator.rs +++ b/pageserver/compaction/src/simulator.rs @@ -1,22 +1,17 @@ mod draw; -use draw::{LayerTraceEvent, LayerTraceFile, LayerTraceOp}; +use std::fmt::Write; +use std::ops::Range; +use std::sync::{Arc, Mutex}; +use draw::{LayerTraceEvent, LayerTraceFile, LayerTraceOp}; use futures::StreamExt; use pageserver_api::shard::ShardIdentity; use rand::Rng; use tracing::info; - use utils::lsn::Lsn; -use std::fmt::Write; -use std::ops::Range; -use std::sync::Arc; -use std::sync::Mutex; - -use crate::helpers::PAGE_SZ; -use crate::helpers::{merge_delta_keys, overlaps_with}; - +use crate::helpers::{PAGE_SZ, merge_delta_keys, overlaps_with}; use crate::interface; use crate::interface::CompactionLayer; @@ -487,6 +482,7 @@ impl interface::CompactionJobExecutor for MockTimeline { async fn downcast_delta_layer( &self, layer: &MockLayer, + _ctx: &MockRequestContext, ) -> anyhow::Result>> { Ok(match layer { MockLayer::Delta(l) => Some(l.clone()), diff --git a/pageserver/compaction/src/simulator/draw.rs b/pageserver/compaction/src/simulator/draw.rs index 4559db09f1..3d35d1b91e 100644 --- a/pageserver/compaction/src/simulator/draw.rs +++ b/pageserver/compaction/src/simulator/draw.rs @@ -1,14 +1,14 @@ -use super::Key; -use anyhow::Result; use std::cmp::Ordering; -use std::{ - collections::{BTreeMap, BTreeSet, HashSet}, - fmt::Write, - ops::Range, -}; -use svg_fmt::{rgb, BeginSvg, EndSvg, Fill, Stroke, Style}; +use std::collections::{BTreeMap, BTreeSet, HashSet}; +use std::fmt::Write; +use std::ops::Range; + +use anyhow::Result; +use svg_fmt::{BeginSvg, EndSvg, Fill, Stroke, Style, rgb}; use utils::lsn::Lsn; +use super::Key; + // Map values to their compressed coordinate - the index the value // would have in a sorted and deduplicated list of all values. struct CoordinateMap { diff --git a/pageserver/ctl/src/draw_timeline_dir.rs b/pageserver/ctl/src/draw_timeline_dir.rs index 177e65ef79..80ca414543 100644 --- a/pageserver/ctl/src/draw_timeline_dir.rs +++ b/pageserver/ctl/src/draw_timeline_dir.rs @@ -50,18 +50,18 @@ //! ``` //! 
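The `CompactionJobExecutor::downcast_delta_layer` change above threads the caller's `RequestContext` through to the executor, presumably so any I/O the executor performs can be attributed to the caller's scope; the `MockTimeline` implementation simply ignores it. For a downstream implementor the adaptation is only a signature change, sketched here with made-up types (nothing below is from the diff except the shape of the method):

```rust
use std::sync::Arc;

use anyhow::Result;

// Stand-ins for an implementor's own associated types.
struct MyLayer;
struct MyDeltaLayer;
struct MyRequestContext;
struct MyExecutor;

impl MyExecutor {
    // After this diff the executor also receives the caller's request context;
    // implementations that do no context-sensitive work can just ignore it.
    async fn downcast_delta_layer(
        &self,
        layer: &MyLayer,
        _ctx: &MyRequestContext,
    ) -> Result<Option<Arc<MyDeltaLayer>>> {
        // A real implementation would inspect `layer`; the mock in this diff
        // matches on its enum variant and clones the inner Arc.
        let _ = layer;
        Ok(None)
    }
}
```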
-use anyhow::{Context, Result}; -use pageserver_api::key::Key; use std::cmp::Ordering; +use std::collections::{BTreeMap, BTreeSet}; use std::io::{self, BufRead}; +use std::ops::Range; use std::path::PathBuf; use std::str::FromStr; -use std::{ - collections::{BTreeMap, BTreeSet}, - ops::Range, -}; -use svg_fmt::{rectangle, rgb, BeginSvg, EndSvg, Fill, Stroke}; -use utils::{lsn::Lsn, project_git_version}; + +use anyhow::{Context, Result}; +use pageserver_api::key::Key; +use svg_fmt::{BeginSvg, EndSvg, Fill, Stroke, rectangle, rgb}; +use utils::lsn::Lsn; +use utils::project_git_version; project_git_version!(GIT_VERSION); diff --git a/pageserver/ctl/src/key.rs b/pageserver/ctl/src/key.rs index c7f0719c41..600f7c412e 100644 --- a/pageserver/ctl/src/key.rs +++ b/pageserver/ctl/src/key.rs @@ -1,11 +1,10 @@ +use std::str::FromStr; + use anyhow::Context; use clap::Parser; -use pageserver_api::{ - key::Key, - reltag::{BlockNumber, RelTag, SlruKind}, - shard::{ShardCount, ShardStripeSize}, -}; -use std::str::FromStr; +use pageserver_api::key::Key; +use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; +use pageserver_api::shard::{ShardCount, ShardStripeSize}; #[derive(Parser)] pub(super) struct DescribeKeyCommand { @@ -394,7 +393,10 @@ mod tests { fn single_positional_spanalike_is_key_material() { // why is this needed? if you are checking many, then copypaste starts to appeal let strings = [ - (line!(), "2024-05-15T15:33:49.873906Z ERROR page_service_conn_main{peer_addr=A:B}:process_query{tenant_id=C timeline_id=D}:handle_pagerequests:handle_get_page_at_lsn_request{rel=1663/208101/2620_fsm blkno=2 req_lsn=0/238D98C8}: error reading relation or page version: Read error: could not find data for key 000000067F00032CE5000000000000000001 (shard ShardNumber(0)) at LSN 0/1D0A16C1, request LSN 0/238D98C8, ancestor 0/0"), + ( + line!(), + "2024-05-15T15:33:49.873906Z ERROR page_service_conn_main{peer_addr=A:B}:process_query{tenant_id=C timeline_id=D}:handle_pagerequests:handle_get_page_at_lsn_request{rel=1663/208101/2620_fsm blkno=2 req_lsn=0/238D98C8}: error reading relation or page version: Read error: could not find data for key 000000067F00032CE5000000000000000001 (shard ShardNumber(0)) at LSN 0/1D0A16C1, request LSN 0/238D98C8, ancestor 0/0", + ), (line!(), "rel=1663/208101/2620_fsm blkno=2"), (line!(), "rel=1663/208101/2620.1 blkno=2"), ]; @@ -420,7 +422,15 @@ mod tests { #[test] fn multiple_spanlike_args() { let strings = [ - (line!(), &["process_query{tenant_id=C", "timeline_id=D}:handle_pagerequests:handle_get_page_at_lsn_request{rel=1663/208101/2620_fsm", "blkno=2", "req_lsn=0/238D98C8}"][..]), + ( + line!(), + &[ + "process_query{tenant_id=C", + "timeline_id=D}:handle_pagerequests:handle_get_page_at_lsn_request{rel=1663/208101/2620_fsm", + "blkno=2", + "req_lsn=0/238D98C8}", + ][..], + ), (line!(), &["rel=1663/208101/2620_fsm", "blkno=2"][..]), (line!(), &["1663/208101/2620_fsm", "2"][..]), ]; diff --git a/pageserver/ctl/src/layer_map_analyzer.rs b/pageserver/ctl/src/layer_map_analyzer.rs index 2c350d6d86..b426f977cf 100644 --- a/pageserver/ctl/src/layer_map_analyzer.rs +++ b/pageserver/ctl/src/layer_map_analyzer.rs @@ -2,27 +2,27 @@ //! //! Currently it only analyzes holes, which are regions within the layer range that the layer contains no updates for. In the future it might do more analysis (maybe key quantiles?) but it should never return sensitive data. 
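`mgmt_api::Client::new` is now fallible and accepts an optional root CA certificate; the pagebench call sites later in this diff pass `None` for now (see their TODO comments). A sketch of how a caller that does have a CA file might build the client; the helper function and its path handling are illustrative, only the `Client::new` signature and the `reqwest::Certificate` type come from this diff:

```rust
use pageserver_client::mgmt_api;

/// Hypothetical helper: load an optional PEM CA bundle and hand it to the
/// management API client so HTTPS endpoints with a private CA can be reached.
fn mgmt_client_with_ca(
    endpoint: String,
    jwt: Option<&str>,
    ssl_ca_file: Option<&std::path::Path>,
) -> anyhow::Result<mgmt_api::Client> {
    let ssl_ca_cert = match ssl_ca_file {
        Some(path) => {
            let pem = std::fs::read(path)?;
            Some(reqwest::Certificate::from_pem(&pem)?)
        }
        None => None,
    };
    // `Client::new` is fallible now because building the reqwest client can fail.
    Ok(mgmt_api::Client::new(endpoint, jwt, ssl_ca_cert)?)
}
```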
-use anyhow::{anyhow, Result}; -use camino::{Utf8Path, Utf8PathBuf}; -use pageserver::context::{DownloadBehavior, RequestContext}; -use pageserver::task_mgr::TaskKind; -use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; -use pageserver::virtual_file::api::IoMode; use std::cmp::Ordering; use std::collections::BinaryHeap; use std::ops::Range; use std::str::FromStr; use std::{fs, str}; +use anyhow::{Result, anyhow}; +use camino::{Utf8Path, Utf8PathBuf}; +use pageserver::context::{DownloadBehavior, RequestContext}; use pageserver::page_cache::{self, PAGE_SZ}; +use pageserver::task_mgr::TaskKind; use pageserver::tenant::block_io::FileBlockReader; use pageserver::tenant::disk_btree::{DiskBtreeReader, VisitDirection}; -use pageserver::tenant::storage_layer::delta_layer::{Summary, DELTA_KEY_SIZE}; -use pageserver::tenant::storage_layer::{range_overlaps, LayerName}; +use pageserver::tenant::storage_layer::delta_layer::{DELTA_KEY_SIZE, Summary}; +use pageserver::tenant::storage_layer::{LayerName, range_overlaps}; +use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; +use pageserver::virtual_file::api::IoMode; use pageserver::virtual_file::{self, VirtualFile}; -use pageserver_api::key::{Key, KEY_SIZE}; - -use utils::{bin_ser::BeSer, lsn::Lsn}; +use pageserver_api::key::{KEY_SIZE, Key}; +use utils::bin_ser::BeSer; +use utils::lsn::Lsn; use crate::AnalyzeLayerMapCmd; diff --git a/pageserver/ctl/src/layers.rs b/pageserver/ctl/src/layers.rs index 4c2c3ab30e..05fb35ff09 100644 --- a/pageserver/ctl/src/layers.rs +++ b/pageserver/ctl/src/layers.rs @@ -1,3 +1,4 @@ +use std::fs::{self, File}; use std::path::{Path, PathBuf}; use anyhow::Result; @@ -5,12 +6,10 @@ use camino::{Utf8Path, Utf8PathBuf}; use clap::Subcommand; use pageserver::context::{DownloadBehavior, RequestContext}; use pageserver::task_mgr::TaskKind; -use pageserver::tenant::storage_layer::{delta_layer, image_layer}; -use pageserver::tenant::storage_layer::{DeltaLayer, ImageLayer}; +use pageserver::tenant::storage_layer::{DeltaLayer, ImageLayer, delta_layer, image_layer}; use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; use pageserver::virtual_file::api::IoMode; use pageserver::{page_cache, virtual_file}; -use std::fs::{self, File}; use utils::id::{TenantId, TimelineId}; use crate::layer_map_analyzer::parse_filename; diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs index 353b4bd2f9..72a120a69b 100644 --- a/pageserver/ctl/src/main.rs +++ b/pageserver/ctl/src/main.rs @@ -11,33 +11,29 @@ mod layer_map_analyzer; mod layers; mod page_trace; -use page_trace::PageTraceCmd; -use std::{ - str::FromStr, - time::{Duration, SystemTime}, -}; +use std::str::FromStr; +use std::time::{Duration, SystemTime}; use camino::{Utf8Path, Utf8PathBuf}; use clap::{Parser, Subcommand}; use index_part::IndexPartCmd; use layers::LayerCmd; -use pageserver::{ - context::{DownloadBehavior, RequestContext}, - page_cache, - task_mgr::TaskKind, - tenant::{dump_layerfile_from_path, metadata::TimelineMetadata}, - virtual_file::{self, api::IoMode}, -}; +use page_trace::PageTraceCmd; +use pageserver::context::{DownloadBehavior, RequestContext}; +use pageserver::page_cache; +use pageserver::task_mgr::TaskKind; +use pageserver::tenant::dump_layerfile_from_path; +use pageserver::tenant::metadata::TimelineMetadata; +use pageserver::virtual_file::api::IoMode; +use pageserver::virtual_file::{self}; use pageserver_api::shard::TenantShardId; use postgres_ffi::ControlFileData; use remote_storage::{RemotePath, 
RemoteStorageConfig}; use tokio_util::sync::CancellationToken; -use utils::{ - id::TimelineId, - logging::{self, LogFormat, TracingErrorLayerEnablement}, - lsn::Lsn, - project_git_version, -}; +use utils::id::TimelineId; +use utils::logging::{self, LogFormat, TracingErrorLayerEnablement}; +use utils::lsn::Lsn; +use utils::project_git_version; project_git_version!(GIT_VERSION); @@ -355,7 +351,9 @@ mod tests { assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines"); assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001-0004/timelines"); assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines/"); - assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines/641e5342083b2235ee3deb8066819683"); + assert_valid( + "pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines/641e5342083b2235ee3deb8066819683", + ); assert_eq!(validate_prefix("pageserver/v1/tenants/"), None); } } diff --git a/pageserver/pagebench/Cargo.toml b/pageserver/pagebench/Cargo.toml index 245d293e4f..5b5ed09a2b 100644 --- a/pageserver/pagebench/Cargo.toml +++ b/pageserver/pagebench/Cargo.toml @@ -15,6 +15,7 @@ hdrhistogram.workspace = true humantime.workspace = true humantime-serde.workspace = true rand.workspace = true +reqwest.workspace=true serde.workspace = true serde_json.workspace = true tracing.workspace = true diff --git a/pageserver/pagebench/src/cmd/aux_files.rs b/pageserver/pagebench/src/cmd/aux_files.rs index b869a0c6c7..394a954c30 100644 --- a/pageserver/pagebench/src/cmd/aux_files.rs +++ b/pageserver/pagebench/src/cmd/aux_files.rs @@ -1,12 +1,12 @@ +use std::collections::HashMap; +use std::sync::Arc; +use std::time::Instant; + use pageserver_api::models::{TenantConfig, TenantConfigRequest}; use pageserver_api::shard::TenantShardId; use utils::id::TenantTimelineId; use utils::lsn::Lsn; -use std::collections::HashMap; -use std::sync::Arc; -use std::time::Instant; - /// Ingest aux files into the pageserver. #[derive(clap::Parser)] pub(crate) struct Args { @@ -36,7 +36,8 @@ async fn main_impl(args: Args) -> anyhow::Result<()> { let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new( args.mgmt_api_endpoint.clone(), args.pageserver_jwt.as_deref(), - )); + None, // TODO: support ssl_ca_file for https APIs in pagebench. 
+ )?); // discover targets let timelines: Vec = crate::util::cli::targets::discover( diff --git a/pageserver/pagebench/src/cmd/basebackup.rs b/pageserver/pagebench/src/cmd/basebackup.rs index 3ae6d99aa7..d3013ded70 100644 --- a/pageserver/pagebench/src/cmd/basebackup.rs +++ b/pageserver/pagebench/src/cmd/basebackup.rs @@ -1,16 +1,3 @@ -use anyhow::Context; -use pageserver_api::shard::TenantShardId; -use pageserver_client::mgmt_api::ForceAwaitLogicalSize; -use pageserver_client::page_service::BasebackupRequest; - -use utils::id::TenantTimelineId; -use utils::lsn::Lsn; - -use rand::prelude::*; -use tokio::sync::Barrier; -use tokio::task::JoinSet; -use tracing::{info, instrument}; - use std::collections::HashMap; use std::num::NonZeroUsize; use std::ops::Range; @@ -18,6 +5,17 @@ use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering}; use std::sync::{Arc, Mutex}; use std::time::Instant; +use anyhow::Context; +use pageserver_api::shard::TenantShardId; +use pageserver_client::mgmt_api::ForceAwaitLogicalSize; +use pageserver_client::page_service::BasebackupRequest; +use rand::prelude::*; +use tokio::sync::Barrier; +use tokio::task::JoinSet; +use tracing::{info, instrument}; +use utils::id::TenantTimelineId; +use utils::lsn::Lsn; + use crate::util::tokio_thread_local_stats::AllThreadLocalStats; use crate::util::{request_stats, tokio_thread_local_stats}; @@ -79,7 +77,8 @@ async fn main_impl( let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new( args.mgmt_api_endpoint.clone(), args.pageserver_jwt.as_deref(), - )); + None, // TODO: support ssl_ca_file for https APIs in pagebench. + )?); // discover targets let timelines: Vec = crate::util::cli::targets::discover( diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs index a60efc7567..969cf24b93 100644 --- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs +++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs @@ -1,18 +1,3 @@ -use anyhow::Context; -use camino::Utf8PathBuf; -use pageserver_api::key::Key; -use pageserver_api::keyspace::KeySpaceAccum; -use pageserver_api::models::{PagestreamGetPageRequest, PagestreamRequest}; - -use pageserver_api::shard::TenantShardId; -use tokio_util::sync::CancellationToken; -use utils::id::TenantTimelineId; -use utils::lsn::Lsn; - -use rand::prelude::*; -use tokio::task::JoinSet; -use tracing::info; - use std::collections::{HashSet, VecDeque}; use std::future::Future; use std::num::NonZeroUsize; @@ -21,6 +6,19 @@ use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::{Arc, Mutex}; use std::time::{Duration, Instant}; +use anyhow::Context; +use camino::Utf8PathBuf; +use pageserver_api::key::Key; +use pageserver_api::keyspace::KeySpaceAccum; +use pageserver_api::models::{PagestreamGetPageRequest, PagestreamRequest}; +use pageserver_api::shard::TenantShardId; +use rand::prelude::*; +use tokio::task::JoinSet; +use tokio_util::sync::CancellationToken; +use tracing::info; +use utils::id::TenantTimelineId; +use utils::lsn::Lsn; + use crate::util::tokio_thread_local_stats::AllThreadLocalStats; use crate::util::{request_stats, tokio_thread_local_stats}; @@ -127,7 +125,8 @@ async fn main_impl( let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new( args.mgmt_api_endpoint.clone(), args.pageserver_jwt.as_deref(), - )); + None, // TODO: support ssl_ca_file for https APIs in pagebench. 
+ )?); if let Some(engine_str) = &args.set_io_engine { mgmt_api_client.put_io_engine(engine_str).await?; diff --git a/pageserver/pagebench/src/cmd/ondemand_download_churn.rs b/pageserver/pagebench/src/cmd/ondemand_download_churn.rs index 1bb71b9353..a77d3000cc 100644 --- a/pageserver/pagebench/src/cmd/ondemand_download_churn.rs +++ b/pageserver/pagebench/src/cmd/ondemand_download_churn.rs @@ -1,23 +1,19 @@ -use pageserver_api::{models::HistoricLayerInfo, shard::TenantShardId}; +use std::f64; +use std::num::NonZeroUsize; +use std::sync::Arc; +use std::sync::atomic::{AtomicU64, Ordering}; +use std::time::{Duration, Instant}; +use pageserver_api::models::HistoricLayerInfo; +use pageserver_api::shard::TenantShardId; use pageserver_client::mgmt_api; use rand::seq::SliceRandom; +use tokio::sync::{OwnedSemaphorePermit, mpsc}; +use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use tracing::{debug, info}; use utils::id::{TenantTimelineId, TimelineId}; -use std::{f64, sync::Arc}; -use tokio::{ - sync::{mpsc, OwnedSemaphorePermit}, - task::JoinSet, -}; - -use std::{ - num::NonZeroUsize, - sync::atomic::{AtomicU64, Ordering}, - time::{Duration, Instant}, -}; - /// Evict & on-demand download random layers. #[derive(clap::Parser)] pub(crate) struct Args { @@ -87,7 +83,8 @@ async fn main_impl(args: Args) -> anyhow::Result<()> { let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new( args.mgmt_api_endpoint.clone(), args.pageserver_jwt.as_deref(), - )); + None, // TODO: support ssl_ca_file for https APIs in pagebench. + )?); if let Some(engine_str) = &args.set_io_engine { mgmt_api_client.put_io_engine(engine_str).await?; diff --git a/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs b/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs index f07beeecfd..2f919ec652 100644 --- a/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs +++ b/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs @@ -2,11 +2,10 @@ use std::sync::Arc; use humantime::Duration; use pageserver_api::shard::TenantShardId; +use pageserver_client::mgmt_api::ForceAwaitLogicalSize; use tokio::task::JoinSet; use utils::id::TenantTimelineId; -use pageserver_client::mgmt_api::ForceAwaitLogicalSize; - #[derive(clap::Parser)] pub(crate) struct Args { #[clap(long, default_value = "http://localhost:9898")] @@ -41,7 +40,8 @@ async fn main_impl(args: Args) -> anyhow::Result<()> { let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new( args.mgmt_api_endpoint.clone(), args.pageserver_jwt.as_deref(), - )); + None, // TODO: support ssl_ca_file for https APIs in pagebench. 
+ )?); // discover targets let timelines: Vec = crate::util::cli::targets::discover( diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index ce54bd9c1c..de527e307b 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -33,8 +33,9 @@ use utils::lsn::Lsn; use crate::context::RequestContext; use crate::pgdatadir_mapping::Version; -use crate::tenant::Timeline; use crate::tenant::storage_layer::IoConcurrency; +use crate::tenant::timeline::GetVectoredError; +use crate::tenant::{PageReconstructError, Timeline}; #[derive(Debug, thiserror::Error)] pub enum BasebackupError { @@ -42,6 +43,26 @@ pub enum BasebackupError { Server(#[from] anyhow::Error), #[error("basebackup client error {0:#} when {1}")] Client(#[source] io::Error, &'static str), + #[error("basebackup during shutdown")] + Shutdown, +} + +impl From for BasebackupError { + fn from(value: PageReconstructError) -> Self { + match value { + PageReconstructError::Cancelled => BasebackupError::Shutdown, + err => BasebackupError::Server(err.into()), + } + } +} + +impl From for BasebackupError { + fn from(value: GetVectoredError) -> Self { + match value { + GetVectoredError::Cancelled => BasebackupError::Shutdown, + err => BasebackupError::Server(err.into()), + } + } } /// Create basebackup with non-rel data in it. @@ -127,7 +148,7 @@ where timeline .gate .enter() - .map_err(|e| BasebackupError::Server(e.into()))?, + .map_err(|_| BasebackupError::Shutdown)?, ), }; basebackup @@ -323,8 +344,7 @@ where let slru_partitions = self .timeline .get_slru_keyspace(Version::Lsn(self.lsn), self.ctx) - .await - .map_err(|e| BasebackupError::Server(e.into()))? + .await? .partition( self.timeline.get_shard_identity(), Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64, @@ -336,11 +356,10 @@ where let blocks = self .timeline .get_vectored(part, self.lsn, self.io_concurrency.clone(), self.ctx) - .await - .map_err(|e| BasebackupError::Server(e.into()))?; + .await?; for (key, block) in blocks { - let block = block.map_err(|e| BasebackupError::Server(e.into()))?; + let block = block?; slru_builder.add_block(&key, block).await?; } } @@ -349,11 +368,8 @@ where let mut min_restart_lsn: Lsn = Lsn::MAX; // Create tablespace directories - for ((spcnode, dbnode), has_relmap_file) in self - .timeline - .list_dbdirs(self.lsn, self.ctx) - .await - .map_err(|e| BasebackupError::Server(e.into()))? + for ((spcnode, dbnode), has_relmap_file) in + self.timeline.list_dbdirs(self.lsn, self.ctx).await? { self.add_dbdir(spcnode, dbnode, has_relmap_file).await?; @@ -362,8 +378,7 @@ where let rels = self .timeline .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx) - .await - .map_err(|e| BasebackupError::Server(e.into()))?; + .await?; for &rel in rels.iter() { // Send init fork as main fork to provide well formed empty // contents of UNLOGGED relations. Postgres copies it in @@ -391,8 +406,7 @@ where let aux_files = self .timeline .list_aux_files(self.lsn, self.ctx, self.io_concurrency.clone()) - .await - .map_err(|e| BasebackupError::Server(e.into()))?; + .await?; let aux_scan_time = start_time.elapsed(); let aux_estimated_size = aux_files .values() @@ -451,16 +465,14 @@ where for xid in self .timeline .list_twophase_files(self.lsn, self.ctx) - .await - .map_err(|e| BasebackupError::Server(e.into()))? + .await? 
{ self.add_twophase_file(xid).await?; } let repl_origins = self .timeline .get_replorigins(self.lsn, self.ctx, self.io_concurrency.clone()) - .await - .map_err(|e| BasebackupError::Server(e.into()))?; + .await?; let n_origins = repl_origins.len(); if n_origins != 0 { // @@ -505,8 +517,7 @@ where let nblocks = self .timeline .get_rel_size(src, Version::Lsn(self.lsn), self.ctx) - .await - .map_err(|e| BasebackupError::Server(e.into()))?; + .await?; // If the relation is empty, create an empty file if nblocks == 0 { @@ -532,8 +543,7 @@ where // TODO: investigate using get_vectored for the entire startblk..endblk range. // But this code path is not on the critical path for most basebackups (?). .get(rel_block_to_key(src, blknum), self.lsn, self.ctx) - .await - .map_err(|e| BasebackupError::Server(e.into()))?; + .await?; segment_data.extend_from_slice(&img[..]); } @@ -567,8 +577,7 @@ where let img = self .timeline .get_relmap_file(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx) - .await - .map_err(|e| BasebackupError::Server(e.into()))?; + .await?; if img.len() != dispatch_pgversion!(self.timeline.pg_version, pgv::bindings::SIZEOF_RELMAPFILE) @@ -622,8 +631,7 @@ where && self .timeline .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx) - .await - .map_err(|e| BasebackupError::Server(e.into()))? + .await? .is_empty() { return Ok(()); @@ -674,8 +682,7 @@ where let img = self .timeline .get_twophase_file(xid, self.lsn, self.ctx) - .await - .map_err(|e| BasebackupError::Server(e.into()))?; + .await?; let mut buf = BytesMut::new(); buf.extend_from_slice(&img[..]); diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index ab8d37df2e..c4af0d5d41 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -14,6 +14,7 @@ use camino::Utf8Path; use clap::{Arg, ArgAction, Command}; use metrics::launch_timestamp::{LaunchTimestamp, set_launch_timestamp_metric}; use metrics::set_build_info_metric; +use nix::sys::socket::{setsockopt, sockopt}; use pageserver::config::{PageServerConf, PageserverIdentity}; use pageserver::controller_upcall_client::ControllerUpcallClient; use pageserver::deletion_queue::DeletionQueue; @@ -24,11 +25,12 @@ use pageserver::task_mgr::{ }; use pageserver::tenant::{TenantSharedResources, mgr, secondary}; use pageserver::{ - CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener, http, page_cache, page_service, - task_mgr, virtual_file, + CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener, HttpsEndpointListener, http, + page_cache, page_service, task_mgr, virtual_file, }; use postgres_backend::AuthType; use remote_storage::GenericRemoteStorage; +use rustls_pki_types::{CertificateDer, PrivateKeyDer}; use tokio::signal::unix::SignalKind; use tokio::time::Instant; use tokio_util::sync::CancellationToken; @@ -342,11 +344,25 @@ fn start_pageserver( info!("Starting pageserver http handler on {http_addr}"); let http_listener = tcp_listener::bind(http_addr)?; - let pg_addr = &conf.listen_pg_addr; + let https_listener = match conf.listen_https_addr.as_ref() { + Some(https_addr) => { + info!("Starting pageserver https handler on {https_addr}"); + Some(tcp_listener::bind(https_addr)?) + } + None => None, + }; + let pg_addr = &conf.listen_pg_addr; info!("Starting pageserver pg protocol handler on {pg_addr}"); let pageserver_listener = tcp_listener::bind(pg_addr)?; + // Enable SO_KEEPALIVE on the socket, to detect dead connections faster. + // These are configured via net.ipv4.tcp_keepalive_* sysctls. 
+ // + // TODO: also set this on the walreceiver socket, but tokio-postgres doesn't + // support enabling keepalives while using the default OS sysctls. + setsockopt(&pageserver_listener, sockopt::KeepAlive, &true)?; + // Launch broker client // The storage_broker::connect call needs to happen inside a tokio runtime thread. let broker_client = WALRECEIVER_RUNTIME @@ -567,9 +583,8 @@ fn start_pageserver( // Start up the service to handle HTTP mgmt API request. We created the // listener earlier already. - let http_endpoint_listener = { + let (http_endpoint_listener, https_endpoint_listener) = { let _rt_guard = MGMT_REQUEST_RUNTIME.enter(); // for hyper - let cancel = CancellationToken::new(); let router_state = Arc::new( http::routes::State::new( @@ -584,22 +599,51 @@ fn start_pageserver( ) .context("Failed to initialize router state")?, ); + let router = http::make_router(router_state, launch_ts, http_auth.clone())? .build() .map_err(|err| anyhow!(err))?; - let service = http_utils::RouterService::new(router).unwrap(); - let server = hyper0::Server::from_tcp(http_listener)? - .serve(service) - .with_graceful_shutdown({ - let cancel = cancel.clone(); - async move { cancel.clone().cancelled().await } - }); - let task = MGMT_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error( - "http endpoint listener", - server, - )); - HttpEndpointListener(CancellableTask { task, cancel }) + let service = + Arc::new(http_utils::RequestServiceBuilder::new(router).map_err(|err| anyhow!(err))?); + + let http_task = { + let server = + http_utils::server::Server::new(Arc::clone(&service), http_listener, None)?; + let cancel = CancellationToken::new(); + + let task = MGMT_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error( + "http endpoint listener", + server.serve(cancel.clone()), + )); + HttpEndpointListener(CancellableTask { task, cancel }) + }; + + let https_task = match https_listener { + Some(https_listener) => { + let certs = load_certs(&conf.ssl_cert_file)?; + let key = load_private_key(&conf.ssl_key_file)?; + + let server_config = rustls::ServerConfig::builder() + .with_no_client_auth() + .with_single_cert(certs, key)?; + + let tls_acceptor = tokio_rustls::TlsAcceptor::from(Arc::new(server_config)); + + let server = + http_utils::server::Server::new(service, https_listener, Some(tls_acceptor))?; + let cancel = CancellationToken::new(); + + let task = MGMT_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error( + "https endpoint listener", + server.serve(cancel.clone()), + )); + Some(HttpsEndpointListener(CancellableTask { task, cancel })) + } + None => None, + }; + + (http_task, https_task) }; let consumption_metrics_tasks = { @@ -675,6 +719,7 @@ fn start_pageserver( shutdown_pageserver.cancel(); pageserver::shutdown_pageserver( http_endpoint_listener, + https_endpoint_listener, page_service, consumption_metrics_tasks, disk_usage_eviction_task, @@ -689,6 +734,25 @@ fn start_pageserver( }) } +fn load_certs(filename: &Utf8Path) -> std::io::Result>> { + let file = std::fs::File::open(filename)?; + let mut reader = std::io::BufReader::new(file); + + rustls_pemfile::certs(&mut reader).collect() +} + +fn load_private_key(filename: &Utf8Path) -> anyhow::Result> { + let file = std::fs::File::open(filename)?; + let mut reader = std::io::BufReader::new(file); + + let key = rustls_pemfile::private_key(&mut reader)?; + + key.ok_or(anyhow::anyhow!( + "no private key found in {}", + filename.as_str(), + )) +} + async fn create_remote_storage_client( conf: &'static PageServerConf, ) -> anyhow::Result { diff 
--git a/pageserver/src/config.rs b/pageserver/src/config.rs index 64d00882b9..562a16a14e 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -53,6 +53,11 @@ pub struct PageServerConf { pub listen_pg_addr: String, /// Example (default): 127.0.0.1:9898 pub listen_http_addr: String, + /// Example: 127.0.0.1:9899 + pub listen_https_addr: Option, + + pub ssl_key_file: Utf8PathBuf, + pub ssl_cert_file: Utf8PathBuf, /// Current availability zone. Used for traffic metrics. pub availability_zone: Option, @@ -194,6 +199,13 @@ pub struct PageServerConf { /// Interpreted protocol feature: if enabled, validate that the logical WAL received from /// safekeepers does not have gaps. pub validate_wal_contiguity: bool, + + /// When set, the previously written to disk heatmap is loaded on tenant attach and used + /// to avoid clobbering the heatmap from new, cold, attached locations. + pub load_previous_heatmap: bool, + + /// When set, include visible layers in the next uploaded heatmaps of an unarchived timeline. + pub generate_unarchival_heatmap: bool, } /// Token for authentication to safekeepers @@ -310,6 +322,9 @@ impl PageServerConf { let pageserver_api::config::ConfigToml { listen_pg_addr, listen_http_addr, + listen_https_addr, + ssl_key_file, + ssl_cert_file, availability_zone, wait_lsn_timeout, wal_redo_timeout, @@ -358,6 +373,8 @@ impl PageServerConf { get_vectored_concurrent_io, enable_read_path_debugging, validate_wal_contiguity, + load_previous_heatmap, + generate_unarchival_heatmap, } = config_toml; let mut conf = PageServerConf { @@ -366,6 +383,9 @@ impl PageServerConf { // ------------------------------------------------------------ listen_pg_addr, listen_http_addr, + listen_https_addr, + ssl_key_file, + ssl_cert_file, availability_zone, wait_lsn_timeout, wal_redo_timeout, @@ -447,6 +467,8 @@ impl PageServerConf { no_sync: no_sync.unwrap_or(false), enable_read_path_debugging: enable_read_path_debugging.unwrap_or(false), validate_wal_contiguity: validate_wal_contiguity.unwrap_or(false), + load_previous_heatmap: load_previous_heatmap.unwrap_or(true), + generate_unarchival_heatmap: generate_unarchival_heatmap.unwrap_or(true), }; // ------------------------------------------------------------ @@ -480,7 +502,9 @@ impl PageServerConf { #[cfg(test)] pub fn test_repo_dir(test_name: &str) -> Utf8PathBuf { let test_output_dir = std::env::var("TEST_OUTPUT").unwrap_or("../tmp_check".into()); - Utf8PathBuf::from(format!("{test_output_dir}/test_{test_name}")) + + let test_id = uuid::Uuid::new_v4(); + Utf8PathBuf::from(format!("{test_output_dir}/test_{test_name}_{test_id}")) } pub fn dummy_conf(repo_dir: Utf8PathBuf) -> Self { @@ -493,6 +517,8 @@ impl PageServerConf { metric_collection_interval: Duration::from_secs(60), synthetic_size_calculation_interval: Duration::from_secs(60), background_task_maximum_delay: Duration::ZERO, + load_previous_heatmap: Some(true), + generate_unarchival_heatmap: Some(true), ..Default::default() }; PageServerConf::parse_and_validate(NodeId(0), config_toml, &repo_dir).unwrap() diff --git a/pageserver/src/context.rs b/pageserver/src/context.rs index da9c095a15..e2a84d0c24 100644 --- a/pageserver/src/context.rs +++ b/pageserver/src/context.rs @@ -89,16 +89,112 @@ //! [`RequestContext`] argument. Functions in the middle of the call chain //! only need to pass it on. 
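Tying together the new `listen_https_addr` / `ssl_cert_file` / `ssl_key_file` settings from `config.rs` above with the listener wiring in `bin/pageserver.rs`: a condensed, self-contained sketch of the same rustls / rustls-pemfile / tokio-rustls plumbing. The address, file paths and the accept loop are illustrative; the real code hands the accepted stream to `http_utils::server::Server`:

```rust
use std::fs::File;
use std::io::BufReader;
use std::sync::Arc;

use anyhow::Context;
use tokio::net::TcpListener;
use tokio_rustls::TlsAcceptor;

/// Read a PEM cert chain and private key, build a rustls ServerConfig, and wrap
/// accepted TCP connections in a TlsAcceptor, as the new https endpoint does.
async fn serve_https(addr: &str, cert_path: &str, key_path: &str) -> anyhow::Result<()> {
    let certs = rustls_pemfile::certs(&mut BufReader::new(File::open(cert_path)?))
        .collect::<Result<Vec<_>, _>>()?;
    let key = rustls_pemfile::private_key(&mut BufReader::new(File::open(key_path)?))?
        .with_context(|| format!("no private key found in {key_path}"))?;

    let server_config = rustls::ServerConfig::builder()
        .with_no_client_auth()
        .with_single_cert(certs, key)?;
    let acceptor = TlsAcceptor::from(Arc::new(server_config));

    let listener = TcpListener::bind(addr).await?;
    loop {
        let (tcp, peer) = listener.accept().await?;
        let acceptor = acceptor.clone();
        tokio::spawn(async move {
            match acceptor.accept(tcp).await {
                // A real server would serve HTTP over `_tls` here.
                Ok(_tls) => {}
                Err(err) => eprintln!("TLS handshake with {peer} failed: {err}"),
            }
        });
    }
}
```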
-use crate::task_mgr::TaskKind; +use std::sync::Arc; + +use once_cell::sync::Lazy; +use tracing::warn; +use utils::{id::TimelineId, shard::TenantShardId}; + +use crate::{ + metrics::{StorageIoSizeMetrics, TimelineMetrics}, + task_mgr::TaskKind, + tenant::Timeline, +}; // The main structure of this module, see module-level comment. -#[derive(Debug)] pub struct RequestContext { task_kind: TaskKind, download_behavior: DownloadBehavior, access_stats_behavior: AccessStatsBehavior, page_content_kind: PageContentKind, read_path_debug: bool, + scope: Scope, +} + +#[derive(Clone)] +pub(crate) enum Scope { + Global { + io_size_metrics: &'static crate::metrics::StorageIoSizeMetrics, + }, + SecondaryTenant { + io_size_metrics: &'static crate::metrics::StorageIoSizeMetrics, + }, + SecondaryTimeline { + io_size_metrics: crate::metrics::StorageIoSizeMetrics, + }, + Timeline { + // We wrap the `Arc`s inside another Arc to avoid child + // context creation contending for the ref counters of the Arc, + // which are shared among all tasks that operate on the timeline, especially + // concurrent page_service connections. + #[allow(clippy::redundant_allocation)] + arc_arc: Arc>, + }, + #[cfg(test)] + UnitTest { + io_size_metrics: &'static crate::metrics::StorageIoSizeMetrics, + }, +} + +static GLOBAL_IO_SIZE_METRICS: Lazy = + Lazy::new(|| crate::metrics::StorageIoSizeMetrics::new("*", "*", "*")); + +impl Scope { + pub(crate) fn new_global() -> Self { + Scope::Global { + io_size_metrics: &GLOBAL_IO_SIZE_METRICS, + } + } + /// NB: this allocates, so, use only at relatively long-lived roots, e.g., at start + /// of a compaction iteration. + pub(crate) fn new_timeline(timeline: &Timeline) -> Self { + Scope::Timeline { + arc_arc: Arc::new(Arc::clone(&timeline.metrics)), + } + } + pub(crate) fn new_page_service_pagestream( + timeline_handle: &crate::tenant::timeline::handle::Handle< + crate::page_service::TenantManagerTypes, + >, + ) -> Self { + Scope::Timeline { + arc_arc: Arc::clone(&timeline_handle.metrics), + } + } + pub(crate) fn new_secondary_timeline( + tenant_shard_id: &TenantShardId, + timeline_id: &TimelineId, + ) -> Self { + // TODO(https://github.com/neondatabase/neon/issues/11156): secondary timelines have no infrastructure for metrics lifecycle. + + let tenant_id = tenant_shard_id.tenant_id.to_string(); + let shard_id = tenant_shard_id.shard_slug().to_string(); + let timeline_id = timeline_id.to_string(); + + let io_size_metrics = + crate::metrics::StorageIoSizeMetrics::new(&tenant_id, &shard_id, &timeline_id); + Scope::SecondaryTimeline { io_size_metrics } + } + pub(crate) fn new_secondary_tenant(_tenant_shard_id: &TenantShardId) -> Self { + // Before propagating metrics via RequestContext, the labels were inferred from file path. + // The only user of VirtualFile at tenant scope is the heatmap download & read. + // The inferred labels for the path of the heatmap file on local disk were that of the global metric (*,*,*). + // Thus, we do the same here, and extend that for anything secondary-tenant scoped. + // + // If we want to have (tenant_id, shard_id, '*') labels for secondary tenants in the future, + // we will need to think about the metric lifecycle, i.e., remove them during secondary tenant shutdown, + // like we do for attached timelines. (We don't have attached-tenant-scoped usage of VirtualFile + // at this point, so, we were able to completely side-step tenant-scoped stuff there). 
+ Scope::SecondaryTenant { + io_size_metrics: &GLOBAL_IO_SIZE_METRICS, + } + } + #[cfg(test)] + pub(crate) fn new_unit_test() -> Self { + Scope::UnitTest { + io_size_metrics: &GLOBAL_IO_SIZE_METRICS, + } + } } /// The kind of access to the page cache. @@ -157,6 +253,7 @@ impl RequestContextBuilder { access_stats_behavior: AccessStatsBehavior::Update, page_content_kind: PageContentKind::Unknown, read_path_debug: false, + scope: Scope::new_global(), }, } } @@ -171,10 +268,16 @@ impl RequestContextBuilder { access_stats_behavior: original.access_stats_behavior, page_content_kind: original.page_content_kind, read_path_debug: original.read_path_debug, + scope: original.scope.clone(), }, } } + pub fn task_kind(mut self, k: TaskKind) -> Self { + self.inner.task_kind = k; + self + } + /// Configure the DownloadBehavior of the context: whether to /// download missing layers, and/or warn on the download. pub fn download_behavior(mut self, b: DownloadBehavior) -> Self { @@ -199,6 +302,11 @@ impl RequestContextBuilder { self } + pub(crate) fn scope(mut self, s: Scope) -> Self { + self.inner.scope = s; + self + } + pub fn build(self) -> RequestContext { self.inner } @@ -281,7 +389,50 @@ impl RequestContext { } fn child_impl(&self, task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self { - Self::new(task_kind, download_behavior) + RequestContextBuilder::extend(self) + .task_kind(task_kind) + .download_behavior(download_behavior) + .build() + } + + pub fn with_scope_timeline(&self, timeline: &Arc) -> Self { + RequestContextBuilder::extend(self) + .scope(Scope::new_timeline(timeline)) + .build() + } + + pub(crate) fn with_scope_page_service_pagestream( + &self, + timeline_handle: &crate::tenant::timeline::handle::Handle< + crate::page_service::TenantManagerTypes, + >, + ) -> Self { + RequestContextBuilder::extend(self) + .scope(Scope::new_page_service_pagestream(timeline_handle)) + .build() + } + + pub fn with_scope_secondary_timeline( + &self, + tenant_shard_id: &TenantShardId, + timeline_id: &TimelineId, + ) -> Self { + RequestContextBuilder::extend(self) + .scope(Scope::new_secondary_timeline(tenant_shard_id, timeline_id)) + .build() + } + + pub fn with_scope_secondary_tenant(&self, tenant_shard_id: &TenantShardId) -> Self { + RequestContextBuilder::extend(self) + .scope(Scope::new_secondary_tenant(tenant_shard_id)) + .build() + } + + #[cfg(test)] + pub fn with_scope_unit_test(&self) -> Self { + RequestContextBuilder::new(TaskKind::UnitTest) + .scope(Scope::new_unit_test()) + .build() } pub fn task_kind(&self) -> TaskKind { @@ -303,4 +454,38 @@ impl RequestContext { pub(crate) fn read_path_debug(&self) -> bool { self.read_path_debug } + + pub(crate) fn io_size_metrics(&self) -> &StorageIoSizeMetrics { + match &self.scope { + Scope::Global { io_size_metrics } => { + let is_unit_test = cfg!(test); + let is_regress_test_build = cfg!(feature = "testing"); + if is_unit_test || is_regress_test_build { + panic!("all VirtualFile instances are timeline-scoped"); + } else { + use once_cell::sync::Lazy; + use std::sync::Mutex; + use std::time::Duration; + use utils::rate_limit::RateLimit; + static LIMIT: Lazy> = + Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(1)))); + let mut guard = LIMIT.lock().unwrap(); + guard.call2(|rate_limit_stats| { + warn!( + %rate_limit_stats, + backtrace=%std::backtrace::Backtrace::force_capture(), + "all VirtualFile instances are timeline-scoped", + ); + }); + + io_size_metrics + } + } + Scope::Timeline { arc_arc } => &arc_arc.storage_io_size, + 
Scope::SecondaryTimeline { io_size_metrics } => io_size_metrics, + Scope::SecondaryTenant { io_size_metrics } => io_size_metrics, + #[cfg(test)] + Scope::UnitTest { io_size_metrics } => io_size_metrics, + } + } } diff --git a/pageserver/src/controller_upcall_client.rs b/pageserver/src/controller_upcall_client.rs index 8462594607..745d04cf62 100644 --- a/pageserver/src/controller_upcall_client.rs +++ b/pageserver/src/controller_upcall_client.rs @@ -84,6 +84,7 @@ impl ControllerUpcallClient { }) } + #[tracing::instrument(skip_all)] async fn retry_http_forever( &self, url: &url::Url, @@ -108,7 +109,7 @@ impl ControllerUpcallClient { |_| false, 3, u32::MAX, - "calling control plane generation validation API", + "storage controller upcall", &self.cancel, ) .await @@ -125,11 +126,12 @@ impl ControllerUpcallClient { impl ControlPlaneGenerationsApi for ControllerUpcallClient { /// Block until we get a successful response, or error out if we are shut down + #[tracing::instrument(skip_all)] // so that warning logs from retry_http_forever have context async fn re_attach( &self, conf: &PageServerConf, ) -> Result, RetryForeverError> { - let re_attach_path = self + let url = self .base_url .join("re-attach") .expect("Failed to build re-attach path"); @@ -179,7 +181,7 @@ impl ControlPlaneGenerationsApi for ControllerUpcallClient { listen_pg_port: m.postgres_port, listen_http_addr: m.http_host, listen_http_port: m.http_port, - listen_https_port: None, // TODO: Support https. + listen_https_port: m.https_port, availability_zone_id: az_id.expect("Checked above"), }) } @@ -205,7 +207,7 @@ impl ControlPlaneGenerationsApi for ControllerUpcallClient { register: register.clone(), }; - let response: ReAttachResponse = self.retry_http_forever(&re_attach_path, request).await?; + let response: ReAttachResponse = self.retry_http_forever(&url, request).await?; tracing::info!( "Received re-attach response with {} tenants (node {}, register: {:?})", response.tenants.len(), @@ -223,11 +225,12 @@ impl ControlPlaneGenerationsApi for ControllerUpcallClient { } /// Block until we get a successful response, or error out if we are shut down + #[tracing::instrument(skip_all)] // so that warning logs from retry_http_forever have context async fn validate( &self, tenants: Vec<(TenantShardId, Generation)>, ) -> Result, RetryForeverError> { - let re_attach_path = self + let url = self .base_url .join("validate") .expect("Failed to build validate path"); @@ -257,8 +260,7 @@ impl ControlPlaneGenerationsApi for ControllerUpcallClient { return Err(RetryForeverError::ShuttingDown); } - let response: ValidateResponse = - self.retry_http_forever(&re_attach_path, request).await?; + let response: ValidateResponse = self.retry_http_forever(&url, request).await?; for rt in response.tenants { result.insert(rt.id, rt.valid); } diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 12252739fd..0fb9a240d5 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -842,6 +842,12 @@ paths: required: false schema: type: integer + - name: recurse + description: When set, will recurse with the downloads into ancestor timelines + in: query + required: false + schema: + type: boolean post: description: | Download all layers in the specified timeline's heatmap. 
The `tenant_shard_id` parameter diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index dd5a24a41f..77bfab47e0 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -37,7 +37,8 @@ use pageserver_api::models::{ TenantShardSplitResponse, TenantSorting, TenantState, TenantWaitLsnRequest, TimelineArchivalConfigRequest, TimelineCreateRequest, TimelineCreateRequestMode, TimelineCreateRequestModeImportPgdata, TimelineGcRequest, TimelineInfo, - TimelinesInfoAndOffloaded, TopTenantShardItem, TopTenantShardsRequest, TopTenantShardsResponse, + TimelinePatchIndexPartRequest, TimelinesInfoAndOffloaded, TopTenantShardItem, + TopTenantShardsRequest, TopTenantShardsResponse, }; use pageserver_api::shard::{ShardCount, TenantShardId}; use remote_storage::{DownloadError, GenericRemoteStorage, TimeTravelError}; @@ -54,6 +55,7 @@ use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; use crate::config::PageServerConf; +use crate::context; use crate::context::{DownloadBehavior, RequestContext, RequestContextBuilder}; use crate::deletion_queue::DeletionQueueClient; use crate::pgdatadir_mapping::LsnForTimestamp; @@ -63,6 +65,7 @@ use crate::tenant::mgr::{ GetActiveTenantError, GetTenantError, TenantManager, TenantMapError, TenantMapInsertError, TenantSlot, TenantSlotError, TenantSlotUpsertError, TenantStateError, UpsertLocationError, }; +use crate::tenant::remote_timeline_client::index::GcCompactionState; use crate::tenant::remote_timeline_client::{ download_index_part, list_remote_tenant_shards, list_remote_timelines, }; @@ -481,6 +484,7 @@ async fn build_timeline_info_common( state, is_archived: Some(is_archived), + rel_size_migration: Some(timeline.get_rel_size_v2_status()), walreceiver_status, }; @@ -857,6 +861,75 @@ async fn timeline_archival_config_handler( json_response(StatusCode::OK, ()) } +/// This API is used to patch the index part of a timeline. You must ensure such patches are safe to apply. Use this API as an emergency +/// measure only. +/// +/// Some examples of safe patches: +/// - Increase the gc_cutoff and gc_compaction_cutoff to a larger value in case of a bug that didn't bump the cutoff and cause read errors. +/// - Force set the index part to use reldir v2 (migrating/migrated). +/// +/// Some examples of unsafe patches: +/// - Force set the index part from v2 to v1 (legacy). This will cause the code path to ignore anything written to the new keyspace and cause +/// errors. +/// - Decrease the gc_cutoff without validating the data really exists. It will cause read errors in the background. 
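For orientation, a hedged sketch of the JSON body the endpoint documented above accepts. The field names mirror what the handler below reads out of TimelinePatchIndexPartRequest; the exact wire representation of RelSizeMigration and Lsn values is defined in pageserver_api and is an assumption here:

// Sketch only: the value spellings ("migrating", the "0/169AD58" LSN string) are
// assumptions; check pageserver_api::models for the authoritative serde format.
fn main() {
    let body = serde_json::json!({
        "rel_size_migration": "migrating",
        "gc_compaction_last_completed_lsn": "0/169AD58",
        "applied_gc_cutoff_lsn": null,
        "force_index_update": false,
    });
    // POSTed to /v1/tenant/:tenant_shard_id/timeline/:timeline_id/patch_index_part,
    // the route registered further down in this file.
    println!("{body}");
}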
+async fn timeline_patch_index_part_handler( + mut request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + + let request_data: TimelinePatchIndexPartRequest = json_request(&mut request).await?; + check_permission(&request, None)?; // require global permission for this request + let state = get_state(&request); + + async { + let timeline = + active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; + + if let Some(rel_size_migration) = request_data.rel_size_migration { + timeline + .update_rel_size_v2_status(rel_size_migration) + .map_err(ApiError::InternalServerError)?; + } + + if let Some(gc_compaction_last_completed_lsn) = + request_data.gc_compaction_last_completed_lsn + { + timeline + .update_gc_compaction_state(GcCompactionState { + last_completed_lsn: gc_compaction_last_completed_lsn, + }) + .map_err(ApiError::InternalServerError)?; + } + + if let Some(applied_gc_cutoff_lsn) = request_data.applied_gc_cutoff_lsn { + { + let guard = timeline.applied_gc_cutoff_lsn.lock_for_write(); + guard.store_and_unlock(applied_gc_cutoff_lsn); + } + } + + if request_data.force_index_update { + timeline + .remote_client + .force_schedule_index_upload() + .context("force schedule index upload") + .map_err(ApiError::InternalServerError)?; + } + + Ok::<_, ApiError>(()) + } + .instrument(info_span!("timeline_patch_index_part", + tenant_id = %tenant_shard_id.tenant_id, + shard_id = %tenant_shard_id.shard_slug(), + %timeline_id)) + .await?; + + json_response(StatusCode::OK, ()) +} + async fn timeline_detail_handler( request: Request, _cancel: CancellationToken, @@ -881,12 +954,13 @@ async fn timeline_detail_handler( tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; let timeline = tenant.get_timeline(timeline_id, false)?; + let ctx = &ctx.with_scope_timeline(&timeline); let timeline_info = build_timeline_info( &timeline, include_non_incremental_logical_size.unwrap_or(false), force_await_initial_logical_size.unwrap_or(false), - &ctx, + ctx, ) .await .context("get local timeline info") @@ -927,11 +1001,11 @@ async fn get_lsn_by_timestamp_handler( let with_lease = parse_query_param(&request, "with_lease")?.unwrap_or(false); - let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) .await?; + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download) + .with_scope_timeline(&timeline); let result = timeline .find_lsn_for_timestamp(timestamp_pg, &cancel, &ctx) .await?; @@ -1000,10 +1074,11 @@ async fn get_timestamp_of_lsn_handler( .with_context(|| format!("Invalid LSN: {lsn_str:?}")) .map_err(ApiError::BadRequest)?; - let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) .await?; + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download) + .with_scope_timeline(&timeline); let result = timeline.get_timestamp_for_lsn(lsn, &ctx).await?; match result { @@ -1358,7 +1433,8 @@ async fn timeline_layer_scan_disposable_keys( active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) .await?; - let ctx = RequestContext::new(TaskKind::MgmtRequest, 
DownloadBehavior::Download); + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download) + .with_scope_timeline(&timeline); let guard = timeline.layers.read().await; let Some(layer) = guard.try_get_from_key(&layer_name.clone().into()) else { @@ -1368,7 +1444,7 @@ async fn timeline_layer_scan_disposable_keys( }; let resident_layer = layer - .download_and_keep_resident() + .download_and_keep_resident(&ctx) .await .map_err(|err| match err { tenant::storage_layer::layer::DownloadError::TimelineShutdown @@ -1436,6 +1512,7 @@ async fn timeline_download_heatmap_layers_handler( let desired_concurrency = parse_query_param(&request, "concurrency")?.unwrap_or(DEFAULT_CONCURRENCY); + let recurse = parse_query_param(&request, "recurse")?.unwrap_or(false); check_permission(&request, Some(tenant_shard_id.tenant_id))?; @@ -1443,6 +1520,8 @@ async fn timeline_download_heatmap_layers_handler( let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) .await?; + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download) + .with_scope_timeline(&timeline); let max_concurrency = get_config(&request) .remote_storage_config @@ -1451,7 +1530,7 @@ async fn timeline_download_heatmap_layers_handler( .unwrap_or(DEFAULT_MAX_CONCURRENCY); let concurrency = std::cmp::min(max_concurrency, desired_concurrency); - timeline.start_heatmap_layers_download(concurrency).await?; + timeline.start_heatmap_layers_download(concurrency, recurse, &ctx)?; json_response(StatusCode::ACCEPTED, ()) } @@ -1490,8 +1569,10 @@ async fn layer_download_handler( let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) .await?; + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download) + .with_scope_timeline(&timeline); let downloaded = timeline - .download_layer(&layer_name) + .download_layer(&layer_name, &ctx) .await .map_err(|e| match e { tenant::storage_layer::layer::DownloadError::TimelineShutdown @@ -2225,8 +2306,8 @@ async fn timeline_compact_handler( .unwrap_or(false); async { - let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?; + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download).with_scope_timeline(&timeline); if scheduled { let tenant = state .tenant_manager @@ -2333,8 +2414,8 @@ async fn timeline_checkpoint_handler( parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false); async { - let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?; + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download).with_scope_timeline(&timeline); if wait_until_flushed { timeline.freeze_and_flush().await } else { @@ -2389,7 +2470,9 @@ async fn timeline_download_remote_layers_handler_post( let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) .await?; - match timeline.spawn_download_all_remote_layers(body).await { + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download) + .with_scope_timeline(&timeline); + match timeline.spawn_download_all_remote_layers(body, &ctx).await { Ok(st) => json_response(StatusCode::ACCEPTED, st), Err(st) => json_response(StatusCode::CONFLICT, st), } @@ -2471,6 
+2554,7 @@ async fn timeline_detach_ancestor_handler( tracing::info!("all timeline upload queues are drained"); let timeline = tenant.get_timeline(timeline_id, true)?; + let ctx = &ctx.with_scope_timeline(&timeline); let progress = timeline .prepare_to_detach_from_ancestor(&tenant, options, ctx) @@ -2577,8 +2661,9 @@ async fn getpage_at_lsn_handler_inner( async { let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); // Enable read path debugging - let ctx = RequestContextBuilder::extend(&ctx).read_path_debug(true).build(); let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?; + let ctx = RequestContextBuilder::extend(&ctx).read_path_debug(true) + .scope(context::Scope::new_timeline(&timeline)).build(); // Use last_record_lsn if no lsn is provided let lsn = lsn.unwrap_or_else(|| timeline.get_last_record_lsn()); @@ -2612,8 +2697,8 @@ async fn timeline_collect_keyspace( let at_lsn: Option = parse_query_param(&request, "at_lsn")?; async { - let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?; + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download).with_scope_timeline(&timeline); let at_lsn = at_lsn.unwrap_or_else(|| timeline.get_last_record_lsn()); let (dense_ks, sparse_ks) = timeline .collect_keyspace(at_lsn, &ctx) @@ -3250,7 +3335,7 @@ async fn put_tenant_timeline_import_basebackup( tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; - let timeline = tenant + let (timeline, timeline_ctx) = tenant .create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx) .map_err(ApiError::InternalServerError) .await?; @@ -3269,7 +3354,13 @@ async fn put_tenant_timeline_import_basebackup( info!("importing basebackup"); timeline - .import_basebackup_from_tar(tenant.clone(), &mut body, base_lsn, broker_client, &ctx) + .import_basebackup_from_tar( + tenant.clone(), + &mut body, + base_lsn, + broker_client, + &timeline_ctx, + ) .await .map_err(ApiError::InternalServerError)?; @@ -3309,6 +3400,7 @@ async fn put_tenant_timeline_import_wal( let state = get_state(&request); let timeline = active_timeline_of_active_tenant(&state.tenant_manager, TenantShardId::unsharded(tenant_id), timeline_id).await?; + let ctx = RequestContextBuilder::extend(&ctx).scope(context::Scope::new_timeline(&timeline)).build(); let mut body = StreamReader::new(request.into_body().map(|res| { res.map_err(|error| { @@ -3625,6 +3717,10 @@ pub fn make_router( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/get_timestamp_of_lsn", |r| api_handler(r, get_timestamp_of_lsn_handler), ) + .post( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/patch_index_part", + |r| api_handler(r, timeline_patch_index_part_handler), + ) .post( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/lsn_lease", |r| api_handler(r, lsn_lease_handler), diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 02767055fb..8373d0bd87 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -64,6 +64,7 @@ pub struct CancellableTask { pub cancel: CancellationToken, } pub struct HttpEndpointListener(pub CancellableTask); +pub struct HttpsEndpointListener(pub CancellableTask); pub struct ConsumptionMetricsTasks(pub CancellableTask); pub struct DiskUsageEvictionTask(pub CancellableTask); impl CancellableTask { @@ -77,6 +78,7 @@ impl CancellableTask { #[allow(clippy::too_many_arguments)] pub async fn 
shutdown_pageserver( http_listener: HttpEndpointListener, + https_listener: Option, page_service: page_service::Listener, consumption_metrics_worker: ConsumptionMetricsTasks, disk_usage_eviction_task: Option, @@ -213,6 +215,15 @@ pub async fn shutdown_pageserver( ) .await; + if let Some(https_listener) = https_listener { + timed( + https_listener.0.shutdown(), + "shutdown https", + Duration::from_secs(1), + ) + .await; + } + // Shut down the HTTP endpoint last, so that you can still check the server's // status while it's shutting down. // FIXME: We should probably stop accepting commands like attach/detach earlier. diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index eb8a9b8e24..fd90ef8cd7 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -143,6 +143,29 @@ pub(crate) static LAYERS_PER_READ_GLOBAL: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +pub(crate) static LAYERS_PER_READ_BATCH_GLOBAL: Lazy = Lazy::new(|| { + register_histogram!( + "pageserver_layers_per_read_batch_global", + "Layers visited to serve a single read batch (read amplification), regardless of number of reads.", + vec![ + 1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0 + ], + ) + .expect("failed to define a metric") +}); + +pub(crate) static LAYERS_PER_READ_AMORTIZED_GLOBAL: Lazy = Lazy::new(|| { + register_histogram!( + "pageserver_layers_per_read_amortized_global", + "Layers visited to serve a single read (read amplification). Amortized across a batch: \ + all visited layers are divided by number of reads.", + vec![ + 1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0 + ], + ) + .expect("failed to define a metric") +}); + pub(crate) static DELTAS_PER_READ_GLOBAL: Lazy = Lazy::new(|| { // We expect this to be low because of Postgres checkpoints. Let's see if that holds. 
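The two histograms added above split read amplification into a per-batch view and an amortized per-read view. A small sketch of the amortized accounting, assuming each read in a batch observes layers_visited / reads_in_batch (the exact call site is not part of this hunk):

// Sketch of the amortization described in the metric help text above.
// `observe` stands in for Histogram::observe.
fn observe_amortized(layers_visited: usize, reads_in_batch: usize, mut observe: impl FnMut(f64)) {
    assert!(reads_in_batch > 0);
    let per_read = layers_visited as f64 / reads_in_batch as f64;
    for _ in 0..reads_in_batch {
        // The batch-level histogram would instead record `layers_visited` once per batch.
        observe(per_read);
    }
}

fn main() {
    let mut samples = Vec::new();
    // A batch of 4 reads that together visited 8 layers contributes 2.0 four times.
    observe_amortized(8, 4, |v| samples.push(v));
    assert_eq!(samples, vec![2.0; 4]);
}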
register_histogram!( @@ -1204,11 +1227,24 @@ impl StorageIoTime { pub(crate) static STORAGE_IO_TIME_METRIC: Lazy = Lazy::new(StorageIoTime::new); -const STORAGE_IO_SIZE_OPERATIONS: &[&str] = &["read", "write"]; +#[derive(Clone, Copy)] +#[repr(usize)] +enum StorageIoSizeOperation { + Read, + Write, +} + +impl StorageIoSizeOperation { + const VARIANTS: &'static [&'static str] = &["read", "write"]; + + fn as_str(&self) -> &'static str { + Self::VARIANTS[*self as usize] + } +} // Needed for the https://neonprod.grafana.net/d/5uK9tHL4k/picking-tenant-for-relocation?orgId=1 -pub(crate) static STORAGE_IO_SIZE: Lazy = Lazy::new(|| { - register_int_gauge_vec!( +static STORAGE_IO_SIZE: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( "pageserver_io_operations_bytes_total", "Total amount of bytes read/written in IO operations", &["operation", "tenant_id", "shard_id", "timeline_id"] @@ -1216,6 +1252,34 @@ pub(crate) static STORAGE_IO_SIZE: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +#[derive(Clone, Debug)] +pub(crate) struct StorageIoSizeMetrics { + pub read: UIntGauge, + pub write: UIntGauge, +} + +impl StorageIoSizeMetrics { + pub(crate) fn new(tenant_id: &str, shard_id: &str, timeline_id: &str) -> Self { + let read = STORAGE_IO_SIZE + .get_metric_with_label_values(&[ + StorageIoSizeOperation::Read.as_str(), + tenant_id, + shard_id, + timeline_id, + ]) + .unwrap(); + let write = STORAGE_IO_SIZE + .get_metric_with_label_values(&[ + StorageIoSizeOperation::Write.as_str(), + tenant_id, + shard_id, + timeline_id, + ]) + .unwrap(); + Self { read, write } + } +} + #[cfg(not(test))] pub(crate) mod virtual_file_descriptor_cache { use super::*; @@ -2798,6 +2862,7 @@ pub(crate) struct TimelineMetrics { /// Number of valid LSN leases. pub valid_lsn_lease_count_gauge: UIntGauge, pub wal_records_received: IntCounter, + pub storage_io_size: StorageIoSizeMetrics, shutdown: std::sync::atomic::AtomicBool, } @@ -2933,6 +2998,8 @@ impl TimelineMetrics { .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); + let storage_io_size = StorageIoSizeMetrics::new(&tenant_id, &shard_id, &timeline_id); + TimelineMetrics { tenant_id, shard_id, @@ -2962,6 +3029,7 @@ impl TimelineMetrics { evictions_with_low_residence_duration: std::sync::RwLock::new( evictions_with_low_residence_duration, ), + storage_io_size, valid_lsn_lease_count_gauge, wal_records_received, shutdown: std::sync::atomic::AtomicBool::default(), @@ -3152,7 +3220,7 @@ impl TimelineMetrics { ]); } - for op in STORAGE_IO_SIZE_OPERATIONS { + for op in StorageIoSizeOperation::VARIANTS { let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, shard_id, timeline_id]); } @@ -4074,6 +4142,8 @@ pub fn preinitialize_metrics(conf: &'static PageServerConf) { // histograms [ &LAYERS_PER_READ_GLOBAL, + &LAYERS_PER_READ_BATCH_GLOBAL, + &LAYERS_PER_READ_AMORTIZED_GLOBAL, &DELTAS_PER_READ_GLOBAL, &WAIT_LSN_TIME, &WAL_REDO_TIME, diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 8972515163..f2d2ab05ad 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -56,6 +56,7 @@ use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; use crate::metrics::{ self, COMPUTE_COMMANDS_COUNTERS, ComputeCommandKind, LIVE_CONNECTIONS, SmgrOpTimer, + TimelineMetrics, }; use crate::pgdatadir_mapping::Version; use crate::span::{ @@ -392,10 +393,6 @@ impl TimelineHandles { .await .map_err(|e| match e { timeline::handle::GetError::TenantManager(e) => e, - 
timeline::handle::GetError::TimelineGateClosed => { - trace!("timeline gate closed"); - GetActiveTimelineError::Timeline(GetTimelineError::ShuttingDown) - } timeline::handle::GetError::PerTimelineStateShutDown => { trace!("per-timeline state shut down"); GetActiveTimelineError::Timeline(GetTimelineError::ShuttingDown) @@ -422,24 +419,36 @@ pub(crate) struct TenantManagerTypes; impl timeline::handle::Types for TenantManagerTypes { type TenantManagerError = GetActiveTimelineError; type TenantManager = TenantManagerWrapper; - type Timeline = Arc; + type Timeline = TenantManagerCacheItem; } -impl timeline::handle::ArcTimeline for Arc { - fn gate(&self) -> &utils::sync::gate::Gate { - &self.gate - } +pub(crate) struct TenantManagerCacheItem { + pub(crate) timeline: Arc, + // allow() for cheap propagation through RequestContext inside a task + #[allow(clippy::redundant_allocation)] + pub(crate) metrics: Arc>, + #[allow(dead_code)] // we store it to keep the gate open + pub(crate) gate_guard: GateGuard, +} +impl std::ops::Deref for TenantManagerCacheItem { + type Target = Arc; + fn deref(&self) -> &Self::Target { + &self.timeline + } +} + +impl timeline::handle::Timeline for TenantManagerCacheItem { fn shard_timeline_id(&self) -> timeline::handle::ShardTimelineId { - Timeline::shard_timeline_id(self) + Timeline::shard_timeline_id(&self.timeline) } fn per_timeline_state(&self) -> &timeline::handle::PerTimelineState { - &self.handles + &self.timeline.handles } fn get_shard_identity(&self) -> &pageserver_api::shard::ShardIdentity { - Timeline::get_shard_identity(self) + Timeline::get_shard_identity(&self.timeline) } } @@ -448,7 +457,7 @@ impl timeline::handle::TenantManager for TenantManagerWrappe &self, timeline_id: TimelineId, shard_selector: ShardSelector, - ) -> Result, GetActiveTimelineError> { + ) -> Result { let tenant_id = self.tenant_id.get().expect("we set this in get()"); let timeout = ACTIVE_TENANT_TIMEOUT; let wait_start = Instant::now(); @@ -491,7 +500,23 @@ impl timeline::handle::TenantManager for TenantManagerWrappe let timeline = tenant_shard .get_timeline(timeline_id, true) .map_err(GetActiveTimelineError::Timeline)?; - Ok(timeline) + + let gate_guard = match timeline.gate.enter() { + Ok(guard) => guard, + Err(_) => { + return Err(GetActiveTimelineError::Timeline( + GetTimelineError::ShuttingDown, + )); + } + }; + + let metrics = Arc::new(Arc::clone(&timeline.metrics)); + + Ok(TenantManagerCacheItem { + timeline, + metrics, + gate_guard, + }) } } @@ -1220,6 +1245,14 @@ impl PageServerHandler { ), QueryError, > { + macro_rules! 
upgrade_handle_and_set_context { + ($shard:ident) => {{ + let weak_handle = &$shard; + let handle = weak_handle.upgrade()?; + let ctx = ctx.with_scope_page_service_pagestream(&handle); + (handle, ctx) + }}; + } Ok(match batch { BatchedFeMessage::Exists { span, @@ -1228,9 +1261,10 @@ impl PageServerHandler { req, } => { fail::fail_point!("ps::handle-pagerequest-message::exists"); + let (shard, ctx) = upgrade_handle_and_set_context!(shard); ( vec![ - self.handle_get_rel_exists_request(&*shard.upgrade()?, &req, ctx) + self.handle_get_rel_exists_request(&shard, &req, &ctx) .instrument(span.clone()) .await .map(|msg| (msg, timer)) @@ -1246,9 +1280,10 @@ impl PageServerHandler { req, } => { fail::fail_point!("ps::handle-pagerequest-message::nblocks"); + let (shard, ctx) = upgrade_handle_and_set_context!(shard); ( vec![ - self.handle_get_nblocks_request(&*shard.upgrade()?, &req, ctx) + self.handle_get_nblocks_request(&shard, &req, &ctx) .instrument(span.clone()) .await .map(|msg| (msg, timer)) @@ -1264,17 +1299,18 @@ impl PageServerHandler { pages, } => { fail::fail_point!("ps::handle-pagerequest-message::getpage"); + let (shard, ctx) = upgrade_handle_and_set_context!(shard); ( { let npages = pages.len(); trace!(npages, "handling getpage request"); let res = self .handle_get_page_at_lsn_request_batched( - &*shard.upgrade()?, + &shard, effective_request_lsn, pages, io_concurrency, - ctx, + &ctx, ) .instrument(span.clone()) .await; @@ -1291,9 +1327,10 @@ impl PageServerHandler { req, } => { fail::fail_point!("ps::handle-pagerequest-message::dbsize"); + let (shard, ctx) = upgrade_handle_and_set_context!(shard); ( vec![ - self.handle_db_size_request(&*shard.upgrade()?, &req, ctx) + self.handle_db_size_request(&shard, &req, &ctx) .instrument(span.clone()) .await .map(|msg| (msg, timer)) @@ -1309,9 +1346,10 @@ impl PageServerHandler { req, } => { fail::fail_point!("ps::handle-pagerequest-message::slrusegment"); + let (shard, ctx) = upgrade_handle_and_set_context!(shard); ( vec![ - self.handle_get_slru_segment_request(&*shard.upgrade()?, &req, ctx) + self.handle_get_slru_segment_request(&shard, &req, &ctx) .instrument(span.clone()) .await .map(|msg| (msg, timer)) @@ -1327,12 +1365,13 @@ impl PageServerHandler { requests, } => { fail::fail_point!("ps::handle-pagerequest-message::test"); + let (shard, ctx) = upgrade_handle_and_set_context!(shard); ( { let npages = requests.len(); trace!(npages, "handling getpage request"); let res = self - .handle_test_request_batch(&*shard.upgrade()?, requests, ctx) + .handle_test_request_batch(&shard, requests, &ctx) .instrument(span.clone()) .await; assert_eq!(res.len(), npages); @@ -2095,6 +2134,7 @@ impl PageServerHandler { // TODO: passthrough the error site to the final error message? 
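The TenantManagerCacheItem introduced above pairs the strong timeline reference with a GateGuard, so a cached page_service handle keeps the timeline gate open for its whole lifetime while Deref keeps existing call sites unchanged. A stripped-down sketch of that ownership pattern (Timeline, GateGuard and CacheItem here are stand-ins, not the real types):

use std::ops::Deref;
use std::sync::Arc;

struct Timeline { name: &'static str } // stand-in for pageserver's Timeline
struct GateGuard;                      // stand-in for utils::sync::gate::GateGuard

struct CacheItem {
    timeline: Arc<Timeline>,
    // Held only to keep the gate open; dropped together with the cache entry.
    _gate_guard: GateGuard,
}

impl Deref for CacheItem {
    type Target = Arc<Timeline>;
    fn deref(&self) -> &Self::Target {
        &self.timeline
    }
}

fn main() {
    let item = CacheItem {
        timeline: Arc::new(Timeline { name: "main" }),
        _gate_guard: GateGuard,
    };
    // Deref lets callers treat the cache item like the Arc<Timeline> they held before.
    let timeline: &Arc<Timeline> = &item;
    assert_eq!(timeline.name, "main");
}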
BasebackupError::Client(e, _) => QueryError::Disconnected(ConnectionError::Io(e)), BasebackupError::Server(e) => QueryError::Other(e), + BasebackupError::Shutdown => QueryError::Shutdown, } } @@ -2107,6 +2147,7 @@ impl PageServerHandler { .get(tenant_id, timeline_id, ShardSelector::Zero) .await?; set_tracing_field_shard_id(&timeline); + let ctx = ctx.with_scope_timeline(&timeline); if timeline.is_archived() == Some(true) { tracing::info!( @@ -2124,7 +2165,7 @@ impl PageServerHandler { lsn, crate::tenant::timeline::WaitLsnWaiter::PageService, crate::tenant::timeline::WaitLsnTimeout::Default, - ctx, + &ctx, ) .await?; timeline @@ -2150,7 +2191,7 @@ impl PageServerHandler { prev_lsn, full_backup, replica, - ctx, + &ctx, ) .await .map_err(map_basebackup_error)?; @@ -2173,7 +2214,7 @@ impl PageServerHandler { prev_lsn, full_backup, replica, - ctx, + &ctx, ) .await .map_err(map_basebackup_error)?; @@ -2190,7 +2231,7 @@ impl PageServerHandler { prev_lsn, full_backup, replica, - ctx, + &ctx, ) .await .map_err(map_basebackup_error)?; diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 787b1b895c..4685f9383b 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -21,6 +21,7 @@ use pageserver_api::key::{ slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range, }; use pageserver_api::keyspace::SparseKeySpace; +use pageserver_api::models::RelSizeMigration; use pageserver_api::record::NeonWalRecord; use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; use pageserver_api::shard::ShardIdentity; @@ -492,7 +493,9 @@ impl Timeline { // Otherwise, read the old reldir keyspace. // TODO: if IndexPart::rel_size_migration is `Migrated`, we only need to read from v2. - if self.get_rel_size_v2_enabled() { + if let RelSizeMigration::Migrated | RelSizeMigration::Migrating = + self.get_rel_size_v2_status() + { // fetch directory listing (new) let key = rel_tag_sparse_key(tag.spcnode, tag.dbnode, tag.relnode, tag.forknum); let buf = RelDirExists::decode_option(version.sparse_get(self, key, ctx).await?) @@ -544,7 +547,7 @@ impl Timeline { forknum: *forknum, })); - if !self.get_rel_size_v2_enabled() { + if let RelSizeMigration::Legacy = self.get_rel_size_v2_status() { return Ok(rels_v1); } @@ -599,28 +602,36 @@ impl Timeline { let n_blocks = self .get_slru_segment_size(kind, segno, Version::Lsn(lsn), ctx) .await?; - let mut segment = BytesMut::with_capacity(n_blocks as usize * BLCKSZ as usize); - for blkno in 0..n_blocks { - let block = self - .get_slru_page_at_lsn(kind, segno, blkno, lsn, ctx) - .await?; - segment.extend_from_slice(&block[..BLCKSZ as usize]); - } - Ok(segment.freeze()) - } - /// Look up given SLRU page version. 
- pub(crate) async fn get_slru_page_at_lsn( - &self, - kind: SlruKind, - segno: u32, - blknum: BlockNumber, - lsn: Lsn, - ctx: &RequestContext, - ) -> Result { - assert!(self.tenant_shard_id.is_shard_zero()); - let key = slru_block_to_key(kind, segno, blknum); - self.get(key, lsn, ctx).await + let keyspace = KeySpace::single( + slru_block_to_key(kind, segno, 0)..slru_block_to_key(kind, segno, n_blocks), + ); + + let batches = keyspace.partition( + self.get_shard_identity(), + Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64, + ); + + let io_concurrency = IoConcurrency::spawn_from_conf( + self.conf, + self.gate + .enter() + .map_err(|_| PageReconstructError::Cancelled)?, + ); + + let mut segment = BytesMut::with_capacity(n_blocks as usize * BLCKSZ as usize); + for batch in batches.parts { + let blocks = self + .get_vectored(batch, lsn, io_concurrency.clone(), ctx) + .await?; + + for (_key, block) in blocks { + let block = block?; + segment.extend_from_slice(&block[..BLCKSZ as usize]); + } + } + + Ok(segment.freeze()) } /// Get size of an SLRU segment @@ -829,19 +840,41 @@ impl Timeline { let nblocks = self .get_slru_segment_size(SlruKind::Clog, segno, Version::Lsn(probe_lsn), ctx) .await?; - for blknum in (0..nblocks).rev() { - let clog_page = self - .get_slru_page_at_lsn(SlruKind::Clog, segno, blknum, probe_lsn, ctx) + + let keyspace = KeySpace::single( + slru_block_to_key(SlruKind::Clog, segno, 0) + ..slru_block_to_key(SlruKind::Clog, segno, nblocks), + ); + + let batches = keyspace.partition( + self.get_shard_identity(), + Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64, + ); + + let io_concurrency = IoConcurrency::spawn_from_conf( + self.conf, + self.gate + .enter() + .map_err(|_| PageReconstructError::Cancelled)?, + ); + + for batch in batches.parts.into_iter().rev() { + let blocks = self + .get_vectored(batch, probe_lsn, io_concurrency.clone(), ctx) .await?; - if clog_page.len() == BLCKSZ as usize + 8 { - let mut timestamp_bytes = [0u8; 8]; - timestamp_bytes.copy_from_slice(&clog_page[BLCKSZ as usize..]); - let timestamp = TimestampTz::from_be_bytes(timestamp_bytes); + for (_key, clog_page) in blocks.into_iter().rev() { + let clog_page = clog_page?; - match f(timestamp) { - ControlFlow::Break(b) => return Ok(b), - ControlFlow::Continue(()) => (), + if clog_page.len() == BLCKSZ as usize + 8 { + let mut timestamp_bytes = [0u8; 8]; + timestamp_bytes.copy_from_slice(&clog_page[BLCKSZ as usize..]); + let timestamp = TimestampTz::from_be_bytes(timestamp_bytes); + + match f(timestamp) { + ControlFlow::Break(b) => return Ok(b), + ControlFlow::Continue(()) => (), + } } } } @@ -1052,6 +1085,8 @@ impl Timeline { ) -> Result { debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id(); + fail::fail_point!("skip-logical-size-calculation", |_| { Ok(0) }); + // Fetch list of database dirs and iterate them let buf = self.get(DBDIR_KEY, lsn, ctx).await?; let dbdir = DbDirectory::des(&buf)?; @@ -1718,6 +1753,35 @@ impl DatadirModification<'_> { Ok(()) } + /// Returns `true` if the rel_size_v2 write path is enabled. If it is the first time that + /// we enable it, we also need to persist it in `index_part.json`. 
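The SLRU reads above now go through get_vectored in batches bounded by Timeline::MAX_GET_VECTORED_KEYS blocks instead of issuing one get() per block. A rough sketch of the batching arithmetic, with MAX_KEYS as a stand-in constant:

// Sketch: split n_blocks contiguous SLRU blocks into ranges of at most MAX_KEYS
// blocks, the way the keyspace partition above bounds each get_vectored call.
const MAX_KEYS: u32 = 32; // stand-in; the real bound is Timeline::MAX_GET_VECTORED_KEYS

fn batches(n_blocks: u32) -> Vec<std::ops::Range<u32>> {
    (0..n_blocks)
        .step_by(MAX_KEYS as usize)
        .map(|start| start..(start + MAX_KEYS).min(n_blocks))
        .collect()
}

fn main() {
    // Each range becomes one vectored read; the code above concatenates the
    // returned blocks in key order to rebuild the segment.
    assert_eq!(batches(70), vec![0..32, 32..64, 64..70]);
}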
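maybe_enable_rel_size_v2 below is essentially a small decision table over the tenant-config flag and the status persisted in index_part.json. A condensed restatement of that table, with a local enum standing in for pageserver_api::models::RelSizeMigration:

// Decision table mirrored from maybe_enable_rel_size_v2 below: the persisted
// status wins once the timeline has started migrating, and enabling the tenant
// flag on a Legacy timeline is what flips it to Migrating.
#[derive(Clone, Copy, Debug, PartialEq)]
enum RelSizeMigration { Legacy, Migrating, Migrated } // local stand-in

fn write_v2(config_enabled: bool, status: RelSizeMigration) -> (bool, Option<RelSizeMigration>) {
    // Returns (use the v2 write path?, new status to persist in index_part, if any).
    match (config_enabled, status) {
        (false, RelSizeMigration::Legacy) => (false, None),
        (false, _) => (true, None), // already migrating/migrated: config can't turn it back off
        (true, RelSizeMigration::Legacy) => (true, Some(RelSizeMigration::Migrating)),
        (true, _) => (true, None),
    }
}

fn main() {
    assert_eq!(write_v2(false, RelSizeMigration::Legacy), (false, None));
    assert_eq!(write_v2(true, RelSizeMigration::Legacy), (true, Some(RelSizeMigration::Migrating)));
    assert_eq!(write_v2(false, RelSizeMigration::Migrating), (true, None));
}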
+ pub fn maybe_enable_rel_size_v2(&mut self) -> anyhow::Result { + let status = self.tline.get_rel_size_v2_status(); + let config = self.tline.get_rel_size_v2_enabled(); + match (config, status) { + (false, RelSizeMigration::Legacy) => { + // tenant config didn't enable it and we didn't write any reldir_v2 key yet + Ok(false) + } + (false, RelSizeMigration::Migrating | RelSizeMigration::Migrated) => { + // index_part already persisted that the timeline has enabled rel_size_v2 + Ok(true) + } + (true, RelSizeMigration::Legacy) => { + // The first time we enable it, we need to persist it in `index_part.json` + self.tline + .update_rel_size_v2_status(RelSizeMigration::Migrating)?; + tracing::info!("enabled rel_size_v2"); + Ok(true) + } + (true, RelSizeMigration::Migrating | RelSizeMigration::Migrated) => { + // index_part already persisted that the timeline has enabled rel_size_v2 + // and we don't need to do anything + Ok(true) + } + } + } + /// Store a relmapper file (pg_filenode.map) in the repository pub async fn put_relmap_file( &mut self, @@ -1726,6 +1790,8 @@ impl DatadirModification<'_> { img: Bytes, ctx: &RequestContext, ) -> anyhow::Result<()> { + let v2_enabled = self.maybe_enable_rel_size_v2()?; + // Add it to the directory (if it doesn't exist already) let buf = self.get(DBDIR_KEY, ctx).await?; let mut dbdir = DbDirectory::des(&buf)?; @@ -1746,7 +1812,7 @@ impl DatadirModification<'_> { })?; self.pending_directory_entries .push((DirectoryKind::Rel, MetricsUpdate::Set(0))); - if self.tline.get_rel_size_v2_enabled() { + if v2_enabled { self.pending_directory_entries .push((DirectoryKind::RelV2, MetricsUpdate::Set(0))); } @@ -1898,12 +1964,12 @@ impl DatadirModification<'_> { .context("deserialize db")? }; - // Add the new relation to the rel directory entry, and write it back - if !rel_dir.rels.insert((rel.relnode, rel.forknum)) { - return Err(RelationError::AlreadyExists); - } + let v2_enabled = self.maybe_enable_rel_size_v2()?; - if self.tline.get_rel_size_v2_enabled() { + if v2_enabled { + if rel_dir.rels.contains(&(rel.relnode, rel.forknum)) { + return Err(RelationError::AlreadyExists); + } let sparse_rel_dir_key = rel_tag_sparse_key(rel.spcnode, rel.dbnode, rel.relnode, rel.forknum); // check if the rel_dir_key exists in v2 @@ -1938,6 +2004,10 @@ impl DatadirModification<'_> { self.pending_directory_entries .push((DirectoryKind::RelV2, MetricsUpdate::Add(1))); } else { + // Add the new relation to the rel directory entry, and write it back + if !rel_dir.rels.insert((rel.relnode, rel.forknum)) { + return Err(RelationError::AlreadyExists); + } if !dbdir_exists { self.pending_directory_entries .push((DirectoryKind::Rel, MetricsUpdate::Set(0))) @@ -1951,6 +2021,7 @@ impl DatadirModification<'_> { )), ); } + // Put size let size_key = rel_size_to_key(rel); let buf = nblocks.to_le_bytes(); @@ -2029,6 +2100,7 @@ impl DatadirModification<'_> { drop_relations: HashMap<(u32, u32), Vec>, ctx: &RequestContext, ) -> anyhow::Result<()> { + let v2_enabled = self.maybe_enable_rel_size_v2()?; for ((spc_node, db_node), rel_tags) in drop_relations { let dir_key = rel_dir_to_key(spc_node, db_node); let buf = self.get(dir_key, ctx).await?; @@ -2041,7 +2113,7 @@ impl DatadirModification<'_> { .push((DirectoryKind::Rel, MetricsUpdate::Sub(1))); dirty = true; true - } else if self.tline.get_rel_size_v2_enabled() { + } else if v2_enabled { // The rel is not found in the old reldir key, so we need to check the new sparse keyspace. 
// Note that a relation can only exist in one of the two keyspaces (guaranteed by the ingestion // logic). @@ -2072,7 +2144,7 @@ impl DatadirModification<'_> { // Remove entry from relation size cache self.tline.remove_cached_rel_size(&rel_tag); - // Delete size entry, as well as all blocks + // Delete size entry, as well as all blocks; this is currently a no-op because we haven't implemented tombstones in storage. self.delete(rel_key_range(rel_tag)); } } @@ -2686,7 +2758,7 @@ mod tests { TimelineId::from_array(hex!("11223344556677881122334455667788")); let (tenant, ctx) = harness.load().await; - let tline = tenant + let (tline, ctx) = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await?; let tline = tline.raw_timeline().unwrap(); diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 71dc3c9075..3a34c8e254 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -31,8 +31,8 @@ use futures::StreamExt; use futures::stream::FuturesUnordered; use itertools::Itertools as _; use once_cell::sync::Lazy; -use pageserver_api::models; pub use pageserver_api::models::TenantState; +use pageserver_api::models::{self, RelSizeMigration}; use pageserver_api::models::{ CompactInfoResponse, LsnLease, TimelineArchivalState, TimelineState, TopTenantShardItem, WalRedoManagerStatus, @@ -77,6 +77,8 @@ use self::timeline::{ EvictionTaskTenantState, GcCutoffs, TimelineDeleteProgress, TimelineResources, WaitLsnError, }; use crate::config::PageServerConf; +use crate::context; +use crate::context::RequestContextBuilder; use crate::context::{DownloadBehavior, RequestContext}; use crate::deletion_queue::{DeletionQueueClient, DeletionQueueError}; use crate::l0_flush::L0FlushGlobalState; @@ -1114,7 +1116,7 @@ impl Tenant { } }; - let timeline = self.create_timeline_struct( + let (timeline, timeline_ctx) = self.create_timeline_struct( timeline_id, &metadata, previous_heatmap, @@ -1123,6 +1125,8 @@ impl Tenant { CreateTimelineCause::Load, idempotency.clone(), index_part.gc_compaction.clone(), + index_part.rel_size_migration.clone(), + ctx, )?; let disk_consistent_lsn = timeline.get_disk_consistent_lsn(); anyhow::ensure!( @@ -1149,16 +1153,19 @@ impl Tenant { // a previous heatmap which contains all visible layers in the layer map. // This previous heatmap will be used whenever a fresh heatmap is generated // for the timeline. - if matches!(cause, LoadTimelineCause::Unoffload) { + if self.conf.generate_unarchival_heatmap && matches!(cause, LoadTimelineCause::Unoffload) { let mut tline_ending_at = Some((&timeline, timeline.get_last_record_lsn())); while let Some((tline, end_lsn)) = tline_ending_at { let unarchival_heatmap = tline.generate_unarchival_heatmap(end_lsn).await; - if !tline.is_previous_heatmap_active() { + // Another unearchived timeline might have generated a heatmap for this ancestor. + // If the current branch point greater than the previous one use the the heatmap + // we just generated - it should include more layers. + if !tline.should_keep_previous_heatmap(end_lsn) { tline .previous_heatmap .store(Some(Arc::new(unarchival_heatmap))); } else { - tracing::info!("Previous heatmap still active. Dropping unarchival heatmap.") + tracing::info!("Previous heatmap preferred. 
Dropping unarchival heatmap.") } match tline.ancestor_timeline() { @@ -1253,7 +1260,7 @@ impl Tenant { match activate { ActivateTimelineArgs::Yes { broker_client } => { info!("activating timeline after reload from pgdata import task"); - timeline.activate(self.clone(), broker_client, None, ctx); + timeline.activate(self.clone(), broker_client, None, &timeline_ctx); } ActivateTimelineArgs::No => (), } @@ -1578,6 +1585,10 @@ impl Tenant { } async fn read_on_disk_heatmap(&self) -> Option<(HeatMapTenant, std::time::Instant)> { + if !self.conf.load_previous_heatmap { + return None; + } + let on_disk_heatmap_path = self.conf.tenant_heatmap_path(&self.tenant_shard_id); match tokio::fs::read_to_string(on_disk_heatmap_path).await { Ok(heatmap) => match serde_json::from_str::(&heatmap) { @@ -1757,6 +1768,7 @@ impl Tenant { import_pgdata, ActivateTimelineArgs::No, guard, + ctx.detached_child(TaskKind::ImportPgdata, DownloadBehavior::Warn), )); } } @@ -1774,6 +1786,7 @@ impl Tenant { timeline_id, &index_part.metadata, remote_timeline_client, + ctx, ) .instrument(tracing::info_span!("timeline_delete", %timeline_id)) .await @@ -1939,6 +1952,7 @@ impl Tenant { hs.0.remove(&timeline_id).map(|h| PreviousHeatmap::Active { heatmap: h, read_at: hs.1, + end_lsn: None, }) }); part_downloads.spawn( @@ -2210,7 +2224,7 @@ impl Tenant { self.clone(), broker_client.clone(), background_jobs_can_start, - &ctx, + &ctx.with_scope_timeline(&timeline), ); } @@ -2407,8 +2421,8 @@ impl Tenant { new_timeline_id: TimelineId, initdb_lsn: Lsn, pg_version: u32, - _ctx: &RequestContext, - ) -> anyhow::Result { + ctx: &RequestContext, + ) -> anyhow::Result<(UninitializedTimeline, RequestContext)> { anyhow::ensure!( self.is_active(), "Cannot create empty timelines on inactive tenant" @@ -2442,6 +2456,8 @@ impl Tenant { create_guard, initdb_lsn, None, + None, + ctx, ) .await } @@ -2459,7 +2475,7 @@ impl Tenant { pg_version: u32, ctx: &RequestContext, ) -> anyhow::Result> { - let uninit_tl = self + let (uninit_tl, ctx) = self .create_empty_timeline(new_timeline_id, initdb_lsn, pg_version, ctx) .await?; let tline = uninit_tl.raw_timeline().expect("we just created it"); @@ -2471,7 +2487,7 @@ impl Tenant { .init_empty_test_timeline() .context("init_empty_test_timeline")?; modification - .commit(ctx) + .commit(&ctx) .await .context("commit init_empty_test_timeline modification")?; @@ -2497,6 +2513,7 @@ impl Tenant { initdb_lsn: Lsn, pg_version: u32, ctx: &RequestContext, + in_memory_layer_desc: Vec, delta_layer_desc: Vec, image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>, end_lsn: Lsn, @@ -2518,6 +2535,11 @@ impl Tenant { .force_create_image_layer(lsn, images, Some(initdb_lsn), ctx) .await?; } + for in_memory in in_memory_layer_desc { + tline + .force_create_in_memory_layer(in_memory, Some(initdb_lsn), ctx) + .await?; + } let layer_names = tline .layers .read() @@ -2683,7 +2705,12 @@ impl Tenant { // doing stuff before the IndexPart is durable in S3, which is done by the previous section. 
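Earlier in this hunk, unoffloading a timeline walks its ancestors and only installs the freshly generated unarchival heatmap when it is preferred over the stored previous one. A hedged sketch of the "higher branch point wins" rule described in the comment there; the real check is Timeline::should_keep_previous_heatmap, whose exact comparison is not shown in this diff:

// Hedged sketch: replace the stored heatmap only when the new branch point (end
// LSN) is higher, since the higher branch point should cover more layers.
#[derive(Clone, Copy, PartialEq, PartialOrd)]
struct Lsn(u64); // stand-in for utils::lsn::Lsn

fn prefer_new_heatmap(previous_end_lsn: Option<Lsn>, new_end_lsn: Lsn) -> bool {
    match previous_end_lsn {
        None => true, // nothing recorded yet: take the new heatmap
        Some(prev) => new_end_lsn > prev,
    }
}

fn main() {
    assert!(prefer_new_heatmap(None, Lsn(0x40)));
    assert!(prefer_new_heatmap(Some(Lsn(0x30)), Lsn(0x40)));
    assert!(!prefer_new_heatmap(Some(Lsn(0x60)), Lsn(0x40)));
}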
let activated_timeline = match result { CreateTimelineResult::Created(timeline) => { - timeline.activate(self.clone(), broker_client, None, ctx); + timeline.activate( + self.clone(), + broker_client, + None, + &ctx.with_scope_timeline(&timeline), + ); timeline } CreateTimelineResult::Idempotent(timeline) => { @@ -2745,10 +2772,9 @@ impl Tenant { } }; - let mut uninit_timeline = { + let (mut uninit_timeline, timeline_ctx) = { let this = &self; let initdb_lsn = Lsn(0); - let _ctx = ctx; async move { let new_metadata = TimelineMetadata::new( // Initialize disk_consistent LSN to 0, The caller must import some data to @@ -2767,6 +2793,8 @@ impl Tenant { timeline_create_guard, initdb_lsn, None, + None, + ctx, ) .await } @@ -2796,6 +2824,7 @@ impl Tenant { index_part, activate, timeline_create_guard, + timeline_ctx.detached_child(TaskKind::ImportPgdata, DownloadBehavior::Warn), )); // NB: the timeline doesn't exist in self.timelines at this point @@ -2809,6 +2838,7 @@ impl Tenant { index_part: import_pgdata::index_part_format::Root, activate: ActivateTimelineArgs, timeline_create_guard: TimelineCreateGuard, + ctx: RequestContext, ) { debug_assert_current_span_has_tenant_and_timeline_id(); info!("starting"); @@ -2820,6 +2850,7 @@ impl Tenant { index_part, activate, timeline_create_guard, + ctx, ) .await; if let Err(err) = &res { @@ -2835,9 +2866,8 @@ impl Tenant { index_part: import_pgdata::index_part_format::Root, activate: ActivateTimelineArgs, timeline_create_guard: TimelineCreateGuard, + ctx: RequestContext, ) -> Result<(), anyhow::Error> { - let ctx = RequestContext::new(TaskKind::ImportPgdata, DownloadBehavior::Warn); - info!("importing pgdata"); import_pgdata::doit(&timeline, index_part, &ctx, self.cancel.clone()) .await @@ -3046,6 +3076,7 @@ impl Tenant { let mut has_pending_l0 = false; for timeline in compact_l0 { + let ctx = &ctx.with_scope_timeline(&timeline); let outcome = timeline .compact(cancel, CompactFlags::OnlyL0Compaction.into(), ctx) .instrument(info_span!("compact_timeline", timeline_id = %timeline.timeline_id)) @@ -3079,6 +3110,7 @@ impl Tenant { if !timeline.is_active() { continue; } + let ctx = &ctx.with_scope_timeline(&timeline); let mut outcome = timeline .compact(cancel, EnumSet::default(), ctx) @@ -3141,11 +3173,13 @@ impl Tenant { /// Trips the compaction circuit breaker if appropriate. pub(crate) fn maybe_trip_compaction_breaker(&self, err: &CompactionError) { match err { + err if err.is_cancel() => {} CompactionError::ShuttingDown => (), // Offload failures don't trip the circuit breaker, since they're cheap to retry and // shouldn't block compaction. CompactionError::Offload(_) => {} CompactionError::CollectKeySpaceError(err) => { + // CollectKeySpaceError::Cancelled and PageRead::Cancelled are handled in `err.is_cancel` branch. 
self.compaction_circuit_breaker .lock() .unwrap() @@ -3302,7 +3336,7 @@ impl Tenant { self.clone(), broker_client.clone(), background_jobs_can_start, - ctx, + &ctx.with_scope_timeline(timeline), ); activated_timelines += 1; } @@ -4116,7 +4150,9 @@ impl Tenant { cause: CreateTimelineCause, create_idempotency: CreateTimelineIdempotency, gc_compaction_state: Option, - ) -> anyhow::Result> { + rel_size_v2_status: Option, + ctx: &RequestContext, + ) -> anyhow::Result<(Arc, RequestContext)> { let state = match cause { CreateTimelineCause::Load => { let ancestor_id = new_metadata.ancestor_timeline(); @@ -4148,10 +4184,15 @@ impl Tenant { self.attach_wal_lag_cooldown.clone(), create_idempotency, gc_compaction_state, + rel_size_v2_status, self.cancel.child_token(), ); - Ok(timeline) + let timeline_ctx = RequestContextBuilder::extend(ctx) + .scope(context::Scope::new_timeline(&timeline)) + .build(); + + Ok((timeline, timeline_ctx)) } /// [`Tenant::shutdown`] must be called before dropping the returned [`Tenant`] object @@ -4567,6 +4608,7 @@ impl Tenant { // Ensures all timelines use the same start time when computing the time cutoff. let now_ts_for_pitr_calc = SystemTime::now(); for timeline in timelines.iter() { + let ctx = &ctx.with_scope_timeline(timeline); let cutoff = timeline .get_last_record_lsn() .checked_sub(horizon) @@ -4740,7 +4782,7 @@ impl Tenant { src_timeline: &Arc, dst_id: TimelineId, start_lsn: Option, - _ctx: &RequestContext, + ctx: &RequestContext, ) -> Result { let src_id = src_timeline.timeline_id; @@ -4843,13 +4885,15 @@ impl Tenant { src_timeline.pg_version, ); - let uninitialized_timeline = self + let (uninitialized_timeline, _timeline_ctx) = self .prepare_new_timeline( dst_id, &metadata, timeline_create_guard, start_lsn + 1, Some(Arc::clone(src_timeline)), + Some(src_timeline.get_rel_size_v2_status()), + ctx, ) .await?; @@ -5116,13 +5160,15 @@ impl Tenant { pgdata_lsn, pg_version, ); - let mut raw_timeline = self + let (mut raw_timeline, timeline_ctx) = self .prepare_new_timeline( timeline_id, &new_metadata, timeline_create_guard, pgdata_lsn, None, + None, + ctx, ) .await?; @@ -5133,7 +5179,7 @@ impl Tenant { &unfinished_timeline, &pgdata_path, pgdata_lsn, - ctx, + &timeline_ctx, ) .await .with_context(|| { @@ -5194,6 +5240,7 @@ impl Tenant { /// An empty layer map is initialized, and new data and WAL can be imported starting /// at 'disk_consistent_lsn'. After any initial data has been imported, call /// `finish_creation` to insert the Timeline into the timelines map. 
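create_timeline_struct above and prepare_new_timeline below now hand back the new timeline together with a RequestContext that is already scoped to it, so callers stop reusing the tenant-level context by accident. A stripped-down sketch of that constructor pattern, with stand-in types:

use std::sync::Arc;

struct Timeline { id: u32 }                  // stand-in
struct RequestContext { scope: Option<u32> } // stand-in: Some(timeline id) or None for global

impl RequestContext {
    fn with_scope_timeline(&self, timeline: &Arc<Timeline>) -> Self {
        RequestContext { scope: Some(timeline.id) }
    }
}

// Mirrors the shape of create_timeline_struct in this diff: the scoped context is
// produced once, right next to the object it is scoped to.
fn create_timeline_struct(id: u32, ctx: &RequestContext) -> (Arc<Timeline>, RequestContext) {
    let timeline = Arc::new(Timeline { id });
    let timeline_ctx = ctx.with_scope_timeline(&timeline);
    (timeline, timeline_ctx)
}

fn main() {
    let root = RequestContext { scope: None };
    let (timeline, timeline_ctx) = create_timeline_struct(7, &root);
    assert_eq!(timeline_ctx.scope, Some(timeline.id));
}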
+ #[allow(clippy::too_many_arguments)] async fn prepare_new_timeline<'a>( &'a self, new_timeline_id: TimelineId, @@ -5201,15 +5248,17 @@ impl Tenant { create_guard: TimelineCreateGuard, start_lsn: Lsn, ancestor: Option>, - ) -> anyhow::Result> { + rel_size_v2_status: Option, + ctx: &RequestContext, + ) -> anyhow::Result<(UninitializedTimeline<'a>, RequestContext)> { let tenant_shard_id = self.tenant_shard_id; let resources = self.build_timeline_resources(new_timeline_id); resources .remote_client - .init_upload_queue_for_empty_remote(new_metadata)?; + .init_upload_queue_for_empty_remote(new_metadata, rel_size_v2_status.clone())?; - let timeline_struct = self + let (timeline_struct, timeline_ctx) = self .create_timeline_struct( new_timeline_id, new_metadata, @@ -5219,6 +5268,8 @@ impl Tenant { CreateTimelineCause::Load, create_guard.idempotency.clone(), None, + rel_size_v2_status, + ctx, ) .context("Failed to create timeline data structure")?; @@ -5239,10 +5290,13 @@ impl Tenant { "Successfully created initial files for timeline {tenant_shard_id}/{new_timeline_id}" ); - Ok(UninitializedTimeline::new( - self, - new_timeline_id, - Some((timeline_struct, create_guard)), + Ok(( + UninitializedTimeline::new( + self, + new_timeline_id, + Some((timeline_struct, create_guard)), + ), + timeline_ctx, )) } @@ -5777,7 +5831,8 @@ pub(crate) mod harness { } pub(crate) async fn load(&self) -> (Arc, RequestContext) { - let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error) + .with_scope_unit_test(); ( self.do_try_load(&ctx) .await @@ -5907,6 +5962,8 @@ mod tests { #[cfg(feature = "testing")] use timeline::GcInfo; #[cfg(feature = "testing")] + use timeline::InMemoryLayerTestDesc; + #[cfg(feature = "testing")] use timeline::compaction::{KeyHistoryRetention, KeyLogAtLsn}; use timeline::{CompactOptions, DeltaLayerTestDesc}; use utils::id::TenantId; @@ -6798,7 +6855,7 @@ mod tests { let (tenant, ctx) = harness.load().await; let io_concurrency = IoConcurrency::spawn_for_test(); - let tline = tenant + let (tline, ctx) = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx) .await?; let tline = tline.raw_timeline().unwrap(); @@ -7420,7 +7477,7 @@ mod tests { .await; let initdb_lsn = Lsn(0x20); - let utline = tenant + let (utline, ctx) = tenant .create_empty_timeline(TIMELINE_ID, initdb_lsn, DEFAULT_PG_VERSION, &ctx) .await?; let tline = utline.raw_timeline().unwrap(); @@ -7487,7 +7544,7 @@ mod tests { let harness = TenantHarness::create(name).await?; { let (tenant, ctx) = harness.load().await; - let tline = tenant + let (tline, _ctx) = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx) .await?; // Leave the timeline ID in [`Tenant::timelines_creating`] to exclude attempting to create it again @@ -7919,6 +7976,7 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, + Vec::new(), // in-memory layers Vec::new(), // delta layers vec![(Lsn(0x20), vec![(base_key, test_img("data key 1"))])], // image layers Lsn(0x20), // it's fine to not advance LSN to 0x30 while using 0x30 to get below because `get_vectored_impl` does not wait for LSN @@ -8006,6 +8064,7 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, + Vec::new(), // in-memory layers Vec::new(), // delta layers vec![( Lsn(0x20), @@ -8221,6 +8280,7 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, + Vec::new(), // in-memory layers // delta layers vec![ DeltaLayerTestDesc::new_with_inferred_key_range( @@ -8301,6 +8361,7 @@ 
mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, + Vec::new(), // in-memory layers // delta layers vec![ DeltaLayerTestDesc::new_with_inferred_key_range( @@ -8374,6 +8435,7 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, + Vec::new(), // in-memory layers // delta layers vec![ DeltaLayerTestDesc::new_with_inferred_key_range( @@ -8506,6 +8568,7 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, + Vec::new(), // in-memory layers vec![ DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta1), DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta2), @@ -8699,6 +8762,7 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, + Vec::new(), // in-memory layers vec![DeltaLayerTestDesc::new_with_inferred_key_range( Lsn(0x10)..Lsn(0x40), delta1, @@ -8755,6 +8819,7 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, + Vec::new(), // in-memory layers Vec::new(), image_layers, end_lsn, @@ -8961,6 +9026,7 @@ mod tests { Lsn(0x08), DEFAULT_PG_VERSION, &ctx, + Vec::new(), // in-memory layers vec![ DeltaLayerTestDesc::new_with_inferred_key_range( Lsn(0x08)..Lsn(0x10), @@ -8979,7 +9045,7 @@ mod tests { delta3, ), ], // delta layers - vec![], // image layers + vec![], // image layers Lsn(0x50), ) .await? @@ -8990,6 +9056,7 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, + Vec::new(), // in-memory layers vec![ DeltaLayerTestDesc::new_with_inferred_key_range( Lsn(0x10)..Lsn(0x48), @@ -9540,6 +9607,7 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, + Vec::new(), // in-memory layers vec![ DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta1), DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta2), @@ -9787,6 +9855,7 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, + Vec::new(), // in-memory layers vec![ // delta1 and delta 2 only contain a single key but multiple updates DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x30), delta1), @@ -10022,6 +10091,7 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, + vec![], // in-memory layers vec![], // delta layers vec![(Lsn(0x18), img_layer)], // image layers Lsn(0x18), @@ -10268,6 +10338,7 @@ mod tests { baseline_image_layer_lsn, DEFAULT_PG_VERSION, &ctx, + vec![], // in-memory layers vec![DeltaLayerTestDesc::new_with_inferred_key_range( delta_layer_start_lsn..delta_layer_end_lsn, delta_layer_spec, @@ -10299,6 +10370,158 @@ mod tests { Ok(()) } + #[cfg(feature = "testing")] + #[tokio::test] + async fn test_vectored_read_with_image_layer_inside_inmem() -> anyhow::Result<()> { + let harness = + TenantHarness::create("test_vectored_read_with_image_layer_inside_inmem").await?; + let (tenant, ctx) = harness.load().await; + + let will_init_keys = [2, 6]; + fn get_key(id: u32) -> Key { + let mut key = Key::from_hex("110000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + + let mut expected_key_values = HashMap::new(); + + let baseline_image_layer_lsn = Lsn(0x10); + let mut baseline_img_layer = Vec::new(); + for i in 0..5 { + let key = get_key(i); + let value = format!("value {i}@{baseline_image_layer_lsn}"); + + let removed = expected_key_values.insert(key, value.clone()); + assert!(removed.is_none()); + + baseline_img_layer.push((key, Bytes::from(value))); + } + + let nested_image_layer_lsn = Lsn(0x50); + let mut nested_img_layer = Vec::new(); + for i in 5..10 { + let key = get_key(i); + let value = format!("value {i}@{nested_image_layer_lsn}"); + + let removed = expected_key_values.insert(key, value.clone()); + 
assert!(removed.is_none()); + + nested_img_layer.push((key, Bytes::from(value))); + } + + let frozen_layer = { + let lsn_range = Lsn(0x40)..Lsn(0x60); + let mut data = Vec::new(); + for i in 0..10 { + let key = get_key(i); + let key_in_nested = nested_img_layer + .iter() + .any(|(key_with_img, _)| *key_with_img == key); + let lsn = { + if key_in_nested { + Lsn(nested_image_layer_lsn.0 + 5) + } else { + lsn_range.start + } + }; + + let will_init = will_init_keys.contains(&i); + if will_init { + data.push((key, lsn, Value::WalRecord(NeonWalRecord::wal_init("")))); + + expected_key_values.insert(key, "".to_string()); + } else { + let delta = format!("@{lsn}"); + data.push(( + key, + lsn, + Value::WalRecord(NeonWalRecord::wal_append(&delta)), + )); + + expected_key_values + .get_mut(&key) + .expect("An image exists for each key") + .push_str(delta.as_str()); + } + } + + InMemoryLayerTestDesc { + lsn_range, + is_open: false, + data, + } + }; + + let (open_layer, last_record_lsn) = { + let start_lsn = Lsn(0x70); + let mut data = Vec::new(); + let mut end_lsn = Lsn(0); + for i in 0..10 { + let key = get_key(i); + let lsn = Lsn(start_lsn.0 + i as u64); + let delta = format!("@{lsn}"); + data.push(( + key, + lsn, + Value::WalRecord(NeonWalRecord::wal_append(&delta)), + )); + + expected_key_values + .get_mut(&key) + .expect("An image exists for each key") + .push_str(delta.as_str()); + + end_lsn = std::cmp::max(end_lsn, lsn); + } + + ( + InMemoryLayerTestDesc { + lsn_range: start_lsn..Lsn::MAX, + is_open: true, + data, + }, + end_lsn, + ) + }; + + assert!( + nested_image_layer_lsn > frozen_layer.lsn_range.start + && nested_image_layer_lsn < frozen_layer.lsn_range.end + ); + + let tline = tenant + .create_test_timeline_with_layers( + TIMELINE_ID, + baseline_image_layer_lsn, + DEFAULT_PG_VERSION, + &ctx, + vec![open_layer, frozen_layer], // in-memory layers + Vec::new(), // delta layers + vec![ + (baseline_image_layer_lsn, baseline_img_layer), + (nested_image_layer_lsn, nested_img_layer), + ], // image layers + last_record_lsn, + ) + .await?; + + let keyspace = KeySpace::single(get_key(0)..get_key(10)); + let results = tline + .get_vectored(keyspace, last_record_lsn, IoConcurrency::sequential(), &ctx) + .await + .expect("No vectored errors"); + for (key, res) in results { + let value = res.expect("No key errors"); + let expected_value = expected_key_values.remove(&key).expect("No unknown keys"); + assert_eq!(value, Bytes::from(expected_value.clone())); + + tracing::info!("key={key} value={expected_value}"); + } + + Ok(()) + } + fn sort_layer_key(k1: &PersistentLayerKey, k2: &PersistentLayerKey) -> std::cmp::Ordering { ( k1.is_delta, @@ -10414,6 +10637,7 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, + vec![], // in-memory layers vec![ DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta1), DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta2), @@ -10798,6 +11022,7 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, + vec![], // in-memory layers vec![ // delta1/2/4 only contain a single key but multiple updates DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x28), delta1), @@ -11049,6 +11274,7 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, + vec![], // in-memory layers vec![ // delta1/2/4 only contain a single key but multiple updates DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x28), delta1), diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index ad66f7b4a7..97978aefb9 100644 --- 
a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -382,7 +382,8 @@ pub(crate) mod tests { } async fn round_trip_test_compressed(blobs: &[Vec], compression: bool) -> Result<(), Error> { - let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + let ctx = + RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test(); let (_temp_dir, pathbuf, offsets) = write_maybe_compressed(blobs, compression, &ctx).await?; diff --git a/pageserver/src/tenant/disk_btree.rs b/pageserver/src/tenant/disk_btree.rs index cdee42239f..419befa41b 100644 --- a/pageserver/src/tenant/disk_btree.rs +++ b/pageserver/src/tenant/disk_btree.rs @@ -32,8 +32,7 @@ use hex; use thiserror::Error; use tracing::error; -use crate::context::{DownloadBehavior, RequestContext}; -use crate::task_mgr::TaskKind; +use crate::context::RequestContext; use crate::tenant::block_io::{BlockReader, BlockWriter}; use crate::virtual_file::{IoBuffer, IoBufferMut, owned_buffers_io::write::Buffer}; @@ -478,16 +477,15 @@ where } #[allow(dead_code)] - pub async fn dump(&self) -> Result<()> { + pub async fn dump(&self, ctx: &RequestContext) -> Result<()> { let mut stack = Vec::new(); - let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); stack.push((self.root_blk, String::new(), 0, 0, 0)); let block_cursor = self.reader.block_cursor(); while let Some((blknum, path, depth, child_idx, key_off)) = stack.pop() { - let blk = block_cursor.read_blk(self.start_blk + blknum, &ctx).await?; + let blk = block_cursor.read_blk(self.start_blk + blknum, ctx).await?; let buf: &[u8] = blk.as_ref(); let node = OnDiskNode::::deparse(buf)?; @@ -836,6 +834,8 @@ pub(crate) mod tests { use rand::Rng; use super::*; + use crate::context::DownloadBehavior; + use crate::task_mgr::TaskKind; use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReaderRef}; #[derive(Clone, Default)] @@ -870,7 +870,8 @@ pub(crate) mod tests { let mut disk = TestDisk::new(); let mut writer = DiskBtreeBuilder::<_, 6>::new(&mut disk); - let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + let ctx = + RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test(); let all_keys: Vec<&[u8; 6]> = vec![ b"xaaaaa", b"xaaaba", b"xaaaca", b"xabaaa", b"xababa", b"xabaca", b"xabada", b"xabadb", @@ -888,7 +889,7 @@ pub(crate) mod tests { let reader = DiskBtreeReader::new(0, root_offset, disk); - reader.dump().await?; + reader.dump(&ctx).await?; // Test the `get` function on all the keys. 
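// Illustrative sketch, not part of the diff: the context-plumbing change above,
// reduced to standalone stand-ins. `ReqCtx` and `BtreeDumper` are hypothetical
// placeholders for RequestContext and DiskBtreeReader; the point is that `dump`
// now borrows the caller's context (with its task/scope attribution) instead of
// minting an ad-hoc context internally.
#[derive(Debug, Clone)]
struct ReqCtx {
    task: &'static str,
    scope: Option<&'static str>,
}

impl ReqCtx {
    fn new(task: &'static str) -> Self {
        Self { task, scope: None }
    }
    // Plays the role of the `.with_scope_unit_test()` builder used by the tests.
    fn with_scope(mut self, scope: &'static str) -> Self {
        self.scope = Some(scope);
        self
    }
}

struct BtreeDumper {
    blocks: Vec<String>,
}

impl BtreeDumper {
    // Every block read is attributed to the caller's context, not a fresh one.
    fn dump(&self, ctx: &ReqCtx) {
        for blk in &self.blocks {
            println!("[{} / {:?}] {blk}", ctx.task, ctx.scope);
        }
    }
}

fn main() {
    let ctx = ReqCtx::new("UnitTest").with_scope("unit-test");
    let reader = BtreeDumper {
        blocks: vec!["root".into(), "leaf 0".into()],
    };
    reader.dump(&ctx);
}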
for (key, val) in all_data.iter() { @@ -980,7 +981,8 @@ pub(crate) mod tests { async fn lots_of_keys() -> Result<()> { let mut disk = TestDisk::new(); let mut writer = DiskBtreeBuilder::<_, 8>::new(&mut disk); - let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + let ctx = + RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test(); const NUM_KEYS: u64 = 1000; @@ -998,7 +1000,7 @@ pub(crate) mod tests { let reader = DiskBtreeReader::new(0, root_offset, disk); - reader.dump().await?; + reader.dump(&ctx).await?; use std::sync::Mutex; @@ -1168,7 +1170,8 @@ pub(crate) mod tests { // Build a tree from it let mut disk = TestDisk::new(); let mut writer = DiskBtreeBuilder::<_, 26>::new(&mut disk); - let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + let ctx = + RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test(); for (key, val) in disk_btree_test_data::TEST_DATA { writer.append(&key, val)?; @@ -1199,7 +1202,7 @@ pub(crate) mod tests { .await?; assert_eq!(count, disk_btree_test_data::TEST_DATA.len()); - reader.dump().await?; + reader.dump(&ctx).await?; Ok(()) } diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index 17d6acafd8..ee4eb15748 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -352,7 +352,8 @@ mod tests { let timeline_id = TimelineId::from_str("22000000000000000000000000000000").unwrap(); fs::create_dir_all(conf.timeline_path(&tenant_shard_id, &timeline_id))?; - let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + let ctx = + RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test(); Ok((conf, tenant_shard_id, timeline_id, ctx)) } diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 59f5a6bd90..2b04e53f10 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -62,8 +62,7 @@ use utils::lsn::Lsn; use super::storage_layer::{LayerVisibilityHint, PersistentLayerDesc}; use crate::context::RequestContext; -use crate::keyspace::KeyPartitioning; -use crate::tenant::storage_layer::InMemoryLayer; +use crate::tenant::storage_layer::{InMemoryLayer, ReadableLayerWeak}; /// /// LayerMap tracks what layers exist on a timeline. @@ -167,7 +166,7 @@ impl Drop for BatchedUpdates<'_> { /// Return value of LayerMap::search #[derive(Eq, PartialEq, Debug, Hash)] pub struct SearchResult { - pub layer: Arc, + pub layer: ReadableLayerWeak, pub lsn_floor: Lsn, } @@ -175,19 +174,37 @@ pub struct SearchResult { /// /// Contains a mapping from a layer description to a keyspace /// accumulator that contains all the keys which intersect the layer -/// from the original search space. Keys that were not found are accumulated -/// in a separate key space accumulator. +/// from the original search space. 
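// Illustrative sketch, not part of the diff: why `not_found` can be dropped from
// RangeSearchResult. With in-memory layers participating in the search, a key
// range that no persistent layer covers is attributed to the in-memory candidate
// (if any) instead of being reported back as "not found". The types below are
// hypothetical stand-ins for SearchResult/KeySpaceAccum.
use std::collections::HashMap;
use std::ops::Range;

type Key = u32;

#[derive(Default, Debug)]
struct RangeResult {
    // layer label -> key ranges that should be read from it
    found: HashMap<&'static str, Vec<Range<Key>>>,
}

fn pad_range(result: &mut RangeResult, in_memory: Option<&'static str>, range: Range<Key>) {
    if range.is_empty() {
        return;
    }
    if let Some(inmem) = in_memory {
        // Same shape as `RangeSearchCollector::pad_range` above: uncovered keys
        // fall through to the in-memory layer rather than a `not_found` accumulator.
        result.found.entry(inmem).or_default().push(range);
    }
    // With no in-memory layer either, the range is simply absent from `found`.
}

fn main() {
    let mut res = RangeResult::default();
    pad_range(&mut res, Some("open-inmem"), 100u32..200);
    assert_eq!(res.found["open-inmem"], vec![100u32..200]);
    println!("{res:?}");
}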
#[derive(Debug)] pub struct RangeSearchResult { pub found: HashMap, - pub not_found: KeySpaceAccum, } impl RangeSearchResult { fn new() -> Self { Self { found: HashMap::new(), - not_found: KeySpaceAccum::new(), + } + } + + fn map_to_in_memory_layer( + in_memory_layer: Option, + range: Range, + ) -> RangeSearchResult { + match in_memory_layer { + Some(inmem) => { + let search_result = SearchResult { + lsn_floor: inmem.get_lsn_range().start, + layer: ReadableLayerWeak::InMemoryLayer(inmem), + }; + + let mut accum = KeySpaceAccum::new(); + accum.add_range(range); + RangeSearchResult { + found: HashMap::from([(search_result, accum)]), + } + } + None => RangeSearchResult::new(), } } } @@ -199,6 +216,7 @@ struct RangeSearchCollector where Iter: Iterator>)>, { + in_memory_layer: Option, delta_coverage: Peekable, image_coverage: Peekable, key_range: Range, @@ -234,10 +252,12 @@ where fn new( key_range: Range, end_lsn: Lsn, + in_memory_layer: Option, delta_coverage: Iter, image_coverage: Iter, ) -> Self { Self { + in_memory_layer, delta_coverage: delta_coverage.peekable(), image_coverage: image_coverage.peekable(), key_range, @@ -266,8 +286,7 @@ where return self.result; } Some(layer_type) => { - // Changes for the range exist. Record anything before the first - // coverage change as not found. + // Changes for the range exist. let coverage_start = layer_type.next_change_at_key(); let range_before = self.key_range.start..coverage_start; self.pad_range(range_before); @@ -297,10 +316,22 @@ where self.result } - /// Mark a range as not found (i.e. no layers intersect it) + /// Map a range which does not intersect any persistent layers to + /// the in-memory layer candidate. fn pad_range(&mut self, key_range: Range) { if !key_range.is_empty() { - self.result.not_found.add_range(key_range); + if let Some(ref inmem) = self.in_memory_layer { + let search_result = SearchResult { + layer: ReadableLayerWeak::InMemoryLayer(inmem.clone()), + lsn_floor: inmem.get_lsn_range().start, + }; + + self.result + .found + .entry(search_result) + .or_default() + .add_range(key_range); + } } } @@ -310,6 +341,7 @@ where let selected = LayerMap::select_layer( self.current_delta.clone(), self.current_image.clone(), + self.in_memory_layer.clone(), self.end_lsn, ); @@ -365,6 +397,24 @@ where } } +#[derive(Debug, PartialEq, Eq, Clone, Hash)] +pub struct InMemoryLayerDesc { + handle: InMemoryLayerHandle, + lsn_range: Range, +} + +impl InMemoryLayerDesc { + pub(crate) fn get_lsn_range(&self) -> Range { + self.lsn_range.clone() + } +} + +#[derive(Debug, PartialEq, Eq, Clone, Hash)] +enum InMemoryLayerHandle { + Open, + Frozen(usize), +} + impl LayerMap { /// /// Find the latest layer (by lsn.end) that covers the given @@ -394,69 +444,161 @@ impl LayerMap { /// layer result, or simplify the api to `get_latest_image` and /// `get_latest_delta`, and only call `get_latest_image` once. /// - /// NOTE: This only searches the 'historic' layers, *not* the - /// 'open' and 'frozen' layers! 
- /// pub fn search(&self, key: Key, end_lsn: Lsn) -> Option { - let version = self.historic.get().unwrap().get_version(end_lsn.0 - 1)?; + let in_memory_layer = self.search_in_memory_layer(end_lsn); + + let version = match self.historic.get().unwrap().get_version(end_lsn.0 - 1) { + Some(version) => version, + None => { + return in_memory_layer.map(|desc| SearchResult { + lsn_floor: desc.get_lsn_range().start, + layer: ReadableLayerWeak::InMemoryLayer(desc), + }); + } + }; + let latest_delta = version.delta_coverage.query(key.to_i128()); let latest_image = version.image_coverage.query(key.to_i128()); - Self::select_layer(latest_delta, latest_image, end_lsn) + Self::select_layer(latest_delta, latest_image, in_memory_layer, end_lsn) } + /// Select a layer from three potential candidates (in-memory, delta and image layer). + /// The candidates represent the first layer of each type which intersect a key range. + /// + /// Layer types have an in implicit priority (image > delta > in-memory). For instance, + /// if we have the option of reading an LSN range from both an image and a delta, we + /// should read from the image. fn select_layer( delta_layer: Option>, image_layer: Option>, + in_memory_layer: Option, end_lsn: Lsn, ) -> Option { assert!(delta_layer.as_ref().is_none_or(|l| l.is_delta())); assert!(image_layer.as_ref().is_none_or(|l| !l.is_delta())); - match (delta_layer, image_layer) { - (None, None) => None, - (None, Some(image)) => { + match (delta_layer, image_layer, in_memory_layer) { + (None, None, None) => None, + (None, Some(image), None) => { let lsn_floor = image.get_lsn_range().start; Some(SearchResult { - layer: image, + layer: ReadableLayerWeak::PersistentLayer(image), lsn_floor, }) } - (Some(delta), None) => { + (Some(delta), None, None) => { let lsn_floor = delta.get_lsn_range().start; Some(SearchResult { - layer: delta, + layer: ReadableLayerWeak::PersistentLayer(delta), lsn_floor, }) } - (Some(delta), Some(image)) => { + (Some(delta), Some(image), None) => { let img_lsn = image.get_lsn_range().start; let image_is_newer = image.get_lsn_range().end >= delta.get_lsn_range().end; let image_exact_match = img_lsn + 1 == end_lsn; if image_is_newer || image_exact_match { Some(SearchResult { - layer: image, + layer: ReadableLayerWeak::PersistentLayer(image), + lsn_floor: img_lsn, + }) + } else { + // If the delta overlaps with the image in the LSN dimension, do a partial + // up to the image layer. + let lsn_floor = + std::cmp::max(delta.get_lsn_range().start, image.get_lsn_range().start + 1); + Some(SearchResult { + layer: ReadableLayerWeak::PersistentLayer(delta), + lsn_floor, + }) + } + } + (None, None, Some(inmem)) => { + let lsn_floor = inmem.get_lsn_range().start; + Some(SearchResult { + layer: ReadableLayerWeak::InMemoryLayer(inmem), + lsn_floor, + }) + } + (None, Some(image), Some(inmem)) => { + // If the in-memory layer overlaps with the image in the LSN dimension, do a partial + // up to the image layer. 
+ let img_lsn = image.get_lsn_range().start; + let image_is_newer = image.get_lsn_range().end >= inmem.get_lsn_range().end; + let image_exact_match = img_lsn + 1 == end_lsn; + if image_is_newer || image_exact_match { + Some(SearchResult { + layer: ReadableLayerWeak::PersistentLayer(image), lsn_floor: img_lsn, }) } else { let lsn_floor = - std::cmp::max(delta.get_lsn_range().start, image.get_lsn_range().start + 1); + std::cmp::max(inmem.get_lsn_range().start, image.get_lsn_range().start + 1); Some(SearchResult { - layer: delta, + layer: ReadableLayerWeak::InMemoryLayer(inmem), lsn_floor, }) } } + (Some(delta), None, Some(inmem)) => { + // Overlaps between delta and in-memory layers are not a valid + // state, but we handle them here for completeness. + let delta_end = delta.get_lsn_range().end; + let delta_is_newer = delta_end >= inmem.get_lsn_range().end; + let delta_exact_match = delta_end == end_lsn; + if delta_is_newer || delta_exact_match { + Some(SearchResult { + lsn_floor: delta.get_lsn_range().start, + layer: ReadableLayerWeak::PersistentLayer(delta), + }) + } else { + // If the in-memory layer overlaps with the delta in the LSN dimension, do a partial + // up to the delta layer. + let lsn_floor = + std::cmp::max(inmem.get_lsn_range().start, delta.get_lsn_range().end); + Some(SearchResult { + layer: ReadableLayerWeak::InMemoryLayer(inmem), + lsn_floor, + }) + } + } + (Some(delta), Some(image), Some(inmem)) => { + // Determine the preferred persistent layer without taking the in-memory layer + // into consideration. + let persistent_res = + Self::select_layer(Some(delta.clone()), Some(image.clone()), None, end_lsn) + .unwrap(); + let persistent_l = match persistent_res.layer { + ReadableLayerWeak::PersistentLayer(l) => l, + ReadableLayerWeak::InMemoryLayer(_) => unreachable!(), + }; + + // Now handle the in-memory layer overlaps. + let inmem_res = if persistent_l.is_delta() { + Self::select_layer(Some(persistent_l), None, Some(inmem.clone()), end_lsn) + .unwrap() + } else { + Self::select_layer(None, Some(persistent_l), Some(inmem.clone()), end_lsn) + .unwrap() + }; + + Some(SearchResult { + layer: inmem_res.layer, + // Use the more restrictive LSN floor + lsn_floor: std::cmp::max(persistent_res.lsn_floor, inmem_res.lsn_floor), + }) + } } } pub fn range_search(&self, key_range: Range, end_lsn: Lsn) -> RangeSearchResult { + let in_memory_layer = self.search_in_memory_layer(end_lsn); + let version = match self.historic.get().unwrap().get_version(end_lsn.0 - 1) { Some(version) => version, None => { - let mut result = RangeSearchResult::new(); - result.not_found.add_range(key_range); - return result; + return RangeSearchResult::map_to_in_memory_layer(in_memory_layer, key_range); } }; @@ -464,7 +606,13 @@ impl LayerMap { let delta_changes = version.delta_coverage.range_overlaps(&raw_range); let image_changes = version.image_coverage.range_overlaps(&raw_range); - let collector = RangeSearchCollector::new(key_range, end_lsn, delta_changes, image_changes); + let collector = RangeSearchCollector::new( + key_range, + end_lsn, + in_memory_layer, + delta_changes, + image_changes, + ); collector.collect() } @@ -571,17 +719,36 @@ impl LayerMap { } /// Get a ref counted pointer for the first in memory layer that matches the provided predicate. 
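// Illustrative sketch, not part of the diff: the image-vs-delta arm of
// `select_layer` above, reduced to plain LSN numbers. `pick_persistent` is a
// hypothetical name; an image layer is modelled by the LSN it was taken at
// (covering lsn..lsn+1) and a delta layer by its LSN range. The in-memory
// candidate is resolved with the same shape of comparison, except that a
// persistent layer wins whenever its end LSN is at least as high.
use std::cmp::max;
use std::ops::Range;

fn pick_persistent(
    delta: Option<Range<u64>>,
    image_lsn: Option<u64>,
    end_lsn: u64,
) -> Option<(&'static str, u64)> {
    match (delta, image_lsn) {
        (None, None) => None,
        (None, Some(img)) => Some(("image", img)),
        (Some(d), None) => Some(("delta", d.start)),
        (Some(d), Some(img)) => {
            // The image is preferred when it is at least as new as the delta
            // (img + 1 >= d.end) or is an exact match for the read LSN.
            if img + 1 >= d.end || img + 1 == end_lsn {
                Some(("image", img))
            } else {
                // Partial read: use the delta, but stop just above the image.
                Some(("delta", max(d.start, img + 1)))
            }
        }
    }
}

fn main() {
    // Mirrors `test_select_layer_image_inside_delta`: delta 10..20, image at 15.
    let (layer, floor) = pick_persistent(Some(10..20), Some(15), 100).unwrap();
    assert_eq!((layer, floor), ("delta", 16)); // read the delta down to LSN 16
    let (layer, floor) = pick_persistent(Some(10..20), Some(15), floor).unwrap();
    assert_eq!((layer, floor), ("image", 15)); // then fall back to the image
    println!("ok");
}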
- pub fn find_in_memory_layer(&self, mut pred: Pred) -> Option> - where - Pred: FnMut(&Arc) -> bool, - { + pub(crate) fn search_in_memory_layer(&self, below: Lsn) -> Option { + let is_below = |l: &Arc| { + let start_lsn = l.get_lsn_range().start; + below > start_lsn + }; + if let Some(open) = &self.open_layer { - if pred(open) { - return Some(open.clone()); + if is_below(open) { + return Some(InMemoryLayerDesc { + handle: InMemoryLayerHandle::Open, + lsn_range: open.get_lsn_range(), + }); } } - self.frozen_layers.iter().rfind(|l| pred(l)).cloned() + self.frozen_layers + .iter() + .enumerate() + .rfind(|(_idx, l)| is_below(l)) + .map(|(idx, l)| InMemoryLayerDesc { + handle: InMemoryLayerHandle::Frozen(idx), + lsn_range: l.get_lsn_range(), + }) + } + + pub(crate) fn in_memory_layer(&self, desc: &InMemoryLayerDesc) -> Arc { + match desc.handle { + InMemoryLayerHandle::Open => self.open_layer.as_ref().unwrap().clone(), + InMemoryLayerHandle::Frozen(idx) => self.frozen_layers[idx].clone(), + } } /// @@ -737,136 +904,6 @@ impl LayerMap { max_stacked_deltas } - /// Count how many reimage-worthy layers we need to visit for given key-lsn pair. - /// - /// The `partition_range` argument is used as context for the reimage-worthiness decision. - /// - /// Used as a helper for correctness checks only. Performance not critical. - pub fn get_difficulty(&self, lsn: Lsn, key: Key, partition_range: &Range) -> usize { - match self.search(key, lsn) { - Some(search_result) => { - if search_result.layer.is_incremental() { - (Self::is_reimage_worthy(&search_result.layer, partition_range) as usize) - + self.get_difficulty(search_result.lsn_floor, key, partition_range) - } else { - 0 - } - } - None => 0, - } - } - - /// Used for correctness checking. Results are expected to be identical to - /// self.get_difficulty_map. Assumes self.search is correct. - pub fn get_difficulty_map_bruteforce( - &self, - lsn: Lsn, - partitioning: &KeyPartitioning, - ) -> Vec { - // Looking at the difficulty as a function of key, it could only increase - // when a delta layer starts or an image layer ends. Therefore it's sufficient - // to check the difficulties at: - // - the key.start for each non-empty part range - // - the key.start for each delta - // - the key.end for each image - let keys_iter: Box> = { - let mut keys: Vec = self - .iter_historic_layers() - .map(|layer| { - if layer.is_incremental() { - layer.get_key_range().start - } else { - layer.get_key_range().end - } - }) - .collect(); - keys.sort(); - Box::new(keys.into_iter()) - }; - let mut keys_iter = keys_iter.peekable(); - - // Iter the partition and keys together and query all the necessary - // keys, computing the max difficulty for each part. - partitioning - .parts - .iter() - .map(|part| { - let mut difficulty = 0; - // Partition ranges are assumed to be sorted and disjoint - // TODO assert it - for range in &part.ranges { - if !range.is_empty() { - difficulty = - std::cmp::max(difficulty, self.get_difficulty(lsn, range.start, range)); - } - while let Some(key) = keys_iter.peek() { - if key >= &range.end { - break; - } - let key = keys_iter.next().unwrap(); - if key < range.start { - continue; - } - difficulty = - std::cmp::max(difficulty, self.get_difficulty(lsn, key, range)); - } - } - difficulty - }) - .collect() - } - - /// For each part of a keyspace partitioning, return the maximum number of layers - /// that would be needed for page reconstruction in that part at the given LSN. - /// - /// If `limit` is provided we don't try to count above that number. 
- /// - /// This method is used to decide where to create new image layers. Computing the - /// result for the entire partitioning at once allows this function to be more - /// efficient, and further optimization is possible by using iterators instead, - /// to allow early return. - /// - /// TODO actually use this method instead of count_deltas. Currently we only use - /// it for benchmarks. - pub fn get_difficulty_map( - &self, - lsn: Lsn, - partitioning: &KeyPartitioning, - limit: Option, - ) -> Vec { - // TODO This is a naive implementation. Perf improvements to do: - // 1. Instead of calling self.image_coverage and self.count_deltas, - // iterate the image and delta coverage only once. - partitioning - .parts - .iter() - .map(|part| { - let mut difficulty = 0; - for range in &part.ranges { - if limit == Some(difficulty) { - break; - } - for (img_range, last_img) in self.image_coverage(range, lsn) { - if limit == Some(difficulty) { - break; - } - let img_lsn = if let Some(last_img) = last_img { - last_img.get_lsn_range().end - } else { - Lsn(0) - }; - - if img_lsn < lsn { - let num_deltas = self.count_deltas(&img_range, &(img_lsn..lsn), limit); - difficulty = std::cmp::max(difficulty, num_deltas); - } - } - } - difficulty - }) - .collect() - } - /// Return all L0 delta layers pub fn level0_deltas(&self) -> &Vec> { &self.l0_delta_layers @@ -1069,6 +1106,10 @@ mod tests { use std::collections::HashMap; use std::path::PathBuf; + use crate::{ + DEFAULT_PG_VERSION, + tenant::{harness::TenantHarness, storage_layer::LayerName}, + }; use pageserver_api::key::DBDIR_KEY; use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum}; use utils::id::{TenantId, TimelineId}; @@ -1076,7 +1117,6 @@ mod tests { use super::*; use crate::tenant::IndexPart; - use crate::tenant::storage_layer::LayerName; #[derive(Clone)] struct LayerDesc { @@ -1101,7 +1141,6 @@ mod tests { } fn assert_range_search_result_eq(lhs: RangeSearchResult, rhs: RangeSearchResult) { - assert_eq!(lhs.not_found.to_keyspace(), rhs.not_found.to_keyspace()); let lhs: HashMap = lhs .found .into_iter() @@ -1127,17 +1166,12 @@ mod tests { let mut key = key_range.start; while key != key_range.end { let res = layer_map.search(key, end_lsn); - match res { - Some(res) => { - range_search_result - .found - .entry(res) - .or_default() - .add_key(key); - } - None => { - range_search_result.not_found.add_key(key); - } + if let Some(res) = res { + range_search_result + .found + .entry(res) + .or_default() + .add_key(key); } key = key.next(); @@ -1152,20 +1186,49 @@ mod tests { let range = Key::from_i128(100)..Key::from_i128(200); let res = layer_map.range_search(range.clone(), Lsn(100)); - assert_eq!( - res.not_found.to_keyspace(), - KeySpace { - ranges: vec![range] - } - ); + assert_range_search_result_eq(res, RangeSearchResult::new()); } - #[test] - fn ranged_search() { + #[tokio::test] + async fn ranged_search() { + let harness = TenantHarness::create("ranged_search").await.unwrap(); + let (tenant, ctx) = harness.load().await; + let timeline_id = TimelineId::generate(); + // Create the timeline such that the in-memory layers can be written + // to the timeline directory. 
+ tenant + .create_test_timeline(timeline_id, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + let gate = utils::sync::gate::Gate::default(); + let add_in_memory_layer = async |layer_map: &mut LayerMap, lsn_range: Range| { + let layer = InMemoryLayer::create( + harness.conf, + timeline_id, + harness.tenant_shard_id, + lsn_range.start, + &gate, + &ctx, + ) + .await + .unwrap(); + + layer.freeze(lsn_range.end).await; + + layer_map.frozen_layers.push_back(Arc::new(layer)); + }; + + let in_memory_layer_configurations = [ + vec![], + // Overlaps with the top-most image + vec![Lsn(35)..Lsn(50)], + ]; + let layers = vec![ LayerDesc { key_range: Key::from_i128(15)..Key::from_i128(50), - lsn_range: Lsn(0)..Lsn(5), + lsn_range: Lsn(5)..Lsn(6), is_delta: false, }, LayerDesc { @@ -1185,19 +1248,27 @@ mod tests { }, LayerDesc { key_range: Key::from_i128(35)..Key::from_i128(40), - lsn_range: Lsn(35)..Lsn(40), + lsn_range: Lsn(40)..Lsn(41), is_delta: false, }, ]; - let layer_map = create_layer_map(layers.clone()); - for start in 0..60 { - for end in (start + 1)..60 { - let range = Key::from_i128(start)..Key::from_i128(end); - let result = layer_map.range_search(range.clone(), Lsn(100)); - let expected = brute_force_range_search(&layer_map, range, Lsn(100)); + let mut layer_map = create_layer_map(layers.clone()); + for in_memory_layers in in_memory_layer_configurations { + for in_mem_layer_range in in_memory_layers { + add_in_memory_layer(&mut layer_map, in_mem_layer_range).await; + } - assert_range_search_result_eq(result, expected); + for start in 0..60 { + for end in (start + 1)..60 { + let range = Key::from_i128(start)..Key::from_i128(end); + let result = layer_map.range_search(range.clone(), Lsn(100)); + let expected = brute_force_range_search(&layer_map, range, Lsn(100)); + + eprintln!("{start}..{end}: {result:?}"); + + assert_range_search_result_eq(result, expected); + } } } } @@ -1490,12 +1561,348 @@ mod tests { // Sanity: the layer that holds latest data for the DBDIR key should always be visible // (just using this key as a key that will always exist for any layermap fixture) - let dbdir_layer = layer_map - .search(DBDIR_KEY, index.metadata.disk_consistent_lsn()) - .unwrap(); + let dbdir_layer = { + let readable_layer = layer_map + .search(DBDIR_KEY, index.metadata.disk_consistent_lsn()) + .unwrap(); + + match readable_layer.layer { + ReadableLayerWeak::PersistentLayer(desc) => desc, + ReadableLayerWeak::InMemoryLayer(_) => unreachable!(""), + } + }; assert!(matches!( - layer_visibilities.get(&dbdir_layer.layer).unwrap(), + layer_visibilities.get(&dbdir_layer).unwrap(), LayerVisibilityHint::Visible )); } } + +#[cfg(test)] +mod select_layer_tests { + use super::*; + + fn create_persistent_layer( + start_lsn: u64, + end_lsn: u64, + is_delta: bool, + ) -> Arc { + if !is_delta { + assert_eq!(end_lsn, start_lsn + 1); + } + + Arc::new(PersistentLayerDesc::new_test( + Key::MIN..Key::MAX, + Lsn(start_lsn)..Lsn(end_lsn), + is_delta, + )) + } + + fn create_inmem_layer(start_lsn: u64, end_lsn: u64) -> InMemoryLayerDesc { + InMemoryLayerDesc { + handle: InMemoryLayerHandle::Open, + lsn_range: Lsn(start_lsn)..Lsn(end_lsn), + } + } + + #[test] + fn test_select_layer_empty() { + assert!(LayerMap::select_layer(None, None, None, Lsn(100)).is_none()); + } + + #[test] + fn test_select_layer_only_delta() { + let delta = create_persistent_layer(10, 20, true); + let result = LayerMap::select_layer(Some(delta.clone()), None, None, Lsn(100)).unwrap(); + + assert_eq!(result.lsn_floor, Lsn(10)); + assert!( 
+ matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &delta)) + ); + } + + #[test] + fn test_select_layer_only_image() { + let image = create_persistent_layer(10, 11, false); + let result = LayerMap::select_layer(None, Some(image.clone()), None, Lsn(100)).unwrap(); + + assert_eq!(result.lsn_floor, Lsn(10)); + assert!( + matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &image)) + ); + } + + #[test] + fn test_select_layer_only_inmem() { + let inmem = create_inmem_layer(10, 20); + let result = LayerMap::select_layer(None, None, Some(inmem.clone()), Lsn(100)).unwrap(); + + assert_eq!(result.lsn_floor, Lsn(10)); + assert!(matches!(result.layer, ReadableLayerWeak::InMemoryLayer(l) if l == inmem)); + } + + #[test] + fn test_select_layer_image_inside_delta() { + let delta = create_persistent_layer(10, 20, true); + let image = create_persistent_layer(15, 16, false); + + let result = + LayerMap::select_layer(Some(delta.clone()), Some(image.clone()), None, Lsn(100)) + .unwrap(); + + assert_eq!(result.lsn_floor, Lsn(16)); + assert!( + matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &delta)) + ); + + let result = LayerMap::select_layer( + Some(delta.clone()), + Some(image.clone()), + None, + result.lsn_floor, + ) + .unwrap(); + + assert_eq!(result.lsn_floor, Lsn(15)); + assert!( + matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &image)) + ); + } + + #[test] + fn test_select_layer_newer_image() { + let delta = create_persistent_layer(10, 20, true); + let image = create_persistent_layer(25, 26, false); + + let result = + LayerMap::select_layer(Some(delta.clone()), Some(image.clone()), None, Lsn(30)) + .unwrap(); + + assert_eq!(result.lsn_floor, Lsn(25)); + assert!( + matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &image)) + ); + + let result = + LayerMap::select_layer(Some(delta.clone()), None, None, result.lsn_floor).unwrap(); + + assert_eq!(result.lsn_floor, Lsn(10)); + assert!( + matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &delta)) + ); + } + + #[test] + fn test_select_layer_delta_with_older_image() { + let delta = create_persistent_layer(15, 25, true); + let image = create_persistent_layer(10, 11, false); + + let result = + LayerMap::select_layer(Some(delta.clone()), Some(image.clone()), None, Lsn(30)) + .unwrap(); + + assert_eq!(result.lsn_floor, Lsn(15)); + assert!( + matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &delta)) + ); + + let result = + LayerMap::select_layer(None, Some(image.clone()), None, result.lsn_floor).unwrap(); + + assert_eq!(result.lsn_floor, Lsn(10)); + assert!( + matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &image)) + ); + } + + #[test] + fn test_select_layer_image_inside_inmem() { + let image = create_persistent_layer(15, 16, false); + let inmem = create_inmem_layer(10, 25); + + let result = + LayerMap::select_layer(None, Some(image.clone()), Some(inmem.clone()), Lsn(30)) + .unwrap(); + + assert_eq!(result.lsn_floor, Lsn(16)); + assert!(matches!(result.layer, ReadableLayerWeak::InMemoryLayer(l) if l == inmem)); + + let result = LayerMap::select_layer( + None, + Some(image.clone()), + Some(inmem.clone()), + result.lsn_floor, + ) + .unwrap(); + + assert_eq!(result.lsn_floor, Lsn(15)); + assert!( + matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &image)) + ); + + let result = + 
LayerMap::select_layer(None, None, Some(inmem.clone()), result.lsn_floor).unwrap(); + assert_eq!(result.lsn_floor, Lsn(10)); + assert!(matches!(result.layer, ReadableLayerWeak::InMemoryLayer(l) if l == inmem)); + } + + #[test] + fn test_select_layer_delta_inside_inmem() { + let delta_top = create_persistent_layer(15, 20, true); + let delta_bottom = create_persistent_layer(10, 15, true); + let inmem = create_inmem_layer(15, 25); + + let result = + LayerMap::select_layer(Some(delta_top.clone()), None, Some(inmem.clone()), Lsn(30)) + .unwrap(); + + assert_eq!(result.lsn_floor, Lsn(20)); + assert!(matches!(result.layer, ReadableLayerWeak::InMemoryLayer(l) if l == inmem)); + + let result = LayerMap::select_layer( + Some(delta_top.clone()), + None, + Some(inmem.clone()), + result.lsn_floor, + ) + .unwrap(); + + assert_eq!(result.lsn_floor, Lsn(15)); + assert!( + matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &delta_top)) + ); + + let result = LayerMap::select_layer( + Some(delta_bottom.clone()), + None, + Some(inmem.clone()), + result.lsn_floor, + ) + .unwrap(); + assert_eq!(result.lsn_floor, Lsn(10)); + assert!( + matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &delta_bottom)) + ); + } + + #[test] + fn test_select_layer_all_overlap_1() { + let inmem = create_inmem_layer(10, 30); + let delta = create_persistent_layer(15, 25, true); + let image = create_persistent_layer(20, 21, false); + + let result = LayerMap::select_layer( + Some(delta.clone()), + Some(image.clone()), + Some(inmem.clone()), + Lsn(50), + ) + .unwrap(); + + assert_eq!(result.lsn_floor, Lsn(25)); + assert!(matches!(result.layer, ReadableLayerWeak::InMemoryLayer(l) if l == inmem)); + + let result = LayerMap::select_layer( + Some(delta.clone()), + Some(image.clone()), + Some(inmem.clone()), + result.lsn_floor, + ) + .unwrap(); + + assert_eq!(result.lsn_floor, Lsn(21)); + assert!( + matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &delta)) + ); + + let result = LayerMap::select_layer( + Some(delta.clone()), + Some(image.clone()), + Some(inmem.clone()), + result.lsn_floor, + ) + .unwrap(); + + assert_eq!(result.lsn_floor, Lsn(20)); + assert!( + matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &image)) + ); + } + + #[test] + fn test_select_layer_all_overlap_2() { + let inmem = create_inmem_layer(20, 30); + let delta = create_persistent_layer(10, 40, true); + let image = create_persistent_layer(25, 26, false); + + let result = LayerMap::select_layer( + Some(delta.clone()), + Some(image.clone()), + Some(inmem.clone()), + Lsn(50), + ) + .unwrap(); + + assert_eq!(result.lsn_floor, Lsn(26)); + assert!( + matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &delta)) + ); + + let result = LayerMap::select_layer( + Some(delta.clone()), + Some(image.clone()), + Some(inmem.clone()), + result.lsn_floor, + ) + .unwrap(); + + assert_eq!(result.lsn_floor, Lsn(25)); + assert!( + matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &image)) + ); + } + + #[test] + fn test_select_layer_all_overlap_3() { + let inmem = create_inmem_layer(30, 40); + let delta = create_persistent_layer(10, 30, true); + let image = create_persistent_layer(20, 21, false); + + let result = LayerMap::select_layer( + Some(delta.clone()), + Some(image.clone()), + Some(inmem.clone()), + Lsn(50), + ) + .unwrap(); + + assert_eq!(result.lsn_floor, Lsn(30)); + assert!(matches!(result.layer, 
ReadableLayerWeak::InMemoryLayer(l) if l == inmem)); + + let result = LayerMap::select_layer( + Some(delta.clone()), + Some(image.clone()), + None, + result.lsn_floor, + ) + .unwrap(); + + assert_eq!(result.lsn_floor, Lsn(21)); + assert!( + matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &delta)) + ); + + let result = LayerMap::select_layer( + Some(delta.clone()), + Some(image.clone()), + None, + result.lsn_floor, + ) + .unwrap(); + + assert_eq!(result.lsn_floor, Lsn(20)); + assert!( + matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &image)) + ); + } +} diff --git a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs index f8bec48886..b3dc8e56a3 100644 --- a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs +++ b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs @@ -63,6 +63,8 @@ pub struct HistoricLayerCoverage { /// The latest state head: LayerCoverageTuple, + /// TODO: this could be an ordered vec using binary search. + /// We push into this map everytime we add a layer, so might see some benefit /// All previous states historic: BTreeMap>, } @@ -419,6 +421,10 @@ pub struct BufferedHistoricLayerCoverage { buffer: BTreeMap>, /// All current layers. This is not used for search. Only to make rebuilds easier. + // TODO: This map is never cleared. Rebuilds could use the post-trim last entry of + // [`Self::historic_coverage`] instead of doubling memory usage. + // [`Self::len`]: can require rebuild and serve from latest historic + // [`Self::iter`]: already requires rebuild => can serve from latest historic layers: BTreeMap, } diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 4ba5844fea..891760b499 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -194,7 +194,7 @@ pub(crate) use download::{ }; use index::GcCompactionState; pub(crate) use index::LayerFileMetadata; -use pageserver_api::models::TimelineArchivalState; +use pageserver_api::models::{RelSizeMigration, TimelineArchivalState}; use pageserver_api::shard::{ShardIndex, TenantShardId}; use regex::Regex; use remote_storage::{ @@ -437,9 +437,13 @@ impl RemoteTimelineClient { /// Initialize the upload queue for the case where the remote storage is empty, /// i.e., it doesn't have an `IndexPart`. + /// + /// `rel_size_v2_status` needs to be carried over during branching, and that's why + /// it's passed in here. pub fn init_upload_queue_for_empty_remote( &self, local_metadata: &TimelineMetadata, + rel_size_v2_status: Option, ) -> anyhow::Result<()> { // Set the maximum number of inprogress tasks to the remote storage concurrency. There's // certainly no point in starting more upload tasks than this. @@ -449,7 +453,9 @@ impl RemoteTimelineClient { .as_ref() .map_or(0, |r| r.concurrency_limit()); let mut upload_queue = self.upload_queue.lock().unwrap(); - upload_queue.initialize_empty_remote(local_metadata, inprogress_limit)?; + let initialized_queue = + upload_queue.initialize_empty_remote(local_metadata, inprogress_limit)?; + initialized_queue.dirty.rel_size_migration = rel_size_v2_status; self.update_remote_physical_size_gauge(None); info!("initialized upload queue as empty"); Ok(()) @@ -900,7 +906,7 @@ impl RemoteTimelineClient { Ok(()) } - /// Launch an index-file upload operation in the background, setting `import_pgdata` field. 
+ /// Launch an index-file upload operation in the background, setting `gc_compaction_state` field. pub(crate) fn schedule_index_upload_for_gc_compaction_state_update( self: &Arc, gc_compaction_state: GcCompactionState, @@ -912,6 +918,21 @@ impl RemoteTimelineClient { Ok(()) } + /// Launch an index-file upload operation in the background, setting `rel_size_v2_status` field. + pub(crate) fn schedule_index_upload_for_rel_size_v2_status_update( + self: &Arc, + rel_size_v2_status: RelSizeMigration, + ) -> anyhow::Result<()> { + let mut guard = self.upload_queue.lock().unwrap(); + let upload_queue = guard.initialized_mut()?; + upload_queue.dirty.rel_size_migration = Some(rel_size_v2_status); + // TODO: allow this operation to bypass the validation check because we might upload the index part + // with no layers but the flag updated. For now, we just modify the index part in memory and the next + // upload will include the flag. + // self.schedule_index_upload(upload_queue); + Ok(()) + } + /// /// Launch an index-file upload operation in the background, if necessary. /// @@ -933,6 +954,14 @@ impl RemoteTimelineClient { Ok(()) } + /// Only used in the `patch_index_part` HTTP API to force trigger an index upload. + pub fn force_schedule_index_upload(self: &Arc) -> Result<(), NotInitialized> { + let mut guard = self.upload_queue.lock().unwrap(); + let upload_queue = guard.initialized_mut()?; + self.schedule_index_upload(upload_queue); + Ok(()) + } + /// Launch an index-file upload operation in the background (internal function) fn schedule_index_upload(self: &Arc, upload_queue: &mut UploadQueueInitialized) { let disk_consistent_lsn = upload_queue.dirty.metadata.disk_consistent_lsn(); diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index ceaed58bbd..16c38be907 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -7,6 +7,7 @@ use std::collections::HashMap; use chrono::NaiveDateTime; use pageserver_api::models::AuxFilePolicy; +use pageserver_api::models::RelSizeMigration; use pageserver_api::shard::ShardIndex; use serde::{Deserialize, Serialize}; use utils::id::TimelineId; @@ -117,21 +118,6 @@ pub struct GcCompactionState { pub(crate) last_completed_lsn: Lsn, } -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] -#[serde(rename_all = "camelCase")] -pub enum RelSizeMigration { - /// The tenant is using the old rel_size format. - /// Note that this enum is persisted as `Option` in the index part, so - /// `None` is the same as `Some(RelSizeMigration::Legacy)`. - Legacy, - /// The tenant is migrating to the new rel_size format. Both old and new rel_size format are - /// persisted in the index part. The read path will read both formats and merge them. - Migrating, - /// The tenant has migrated to the new rel_size format. Only the new rel_size format is persisted - /// in the index part, and the read path will not read the old format. - Migrated, -} - impl IndexPart { /// When adding or modifying any parts of `IndexPart`, increment the version so that it can be /// used to understand later versions. 
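// Illustrative sketch, not part of the diff: the `RelSizeMigration` states and
// the "absent means Legacy" convention described by the enum that moved to
// `pageserver_api::models`. The helper name `effective_status` is hypothetical.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum RelSizeMigration {
    /// Old rel_size format only (the default when the index part carries no value).
    Legacy,
    /// Both formats are persisted; the read path merges them.
    Migrating,
    /// Only the new format is persisted and read.
    Migrated,
}

fn effective_status(persisted: Option<RelSizeMigration>) -> RelSizeMigration {
    // `None` in the index part is equivalent to `Some(Legacy)`.
    persisted.unwrap_or(RelSizeMigration::Legacy)
}

fn main() {
    assert_eq!(effective_status(None), RelSizeMigration::Legacy);
    assert_eq!(
        effective_status(Some(RelSizeMigration::Migrating)),
        RelSizeMigration::Migrating
    );
    // During branch creation the parent's status is carried over into the child's
    // empty upload queue, as done via `init_upload_queue_for_empty_remote` above.
    let parent = Some(RelSizeMigration::Migrating);
    let child = parent; // rel_size_v2_status passed through prepare_new_timeline
    assert_eq!(effective_status(child), RelSizeMigration::Migrating);
    println!("ok");
}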
diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index a13b9323ac..1cf0241631 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -491,7 +491,10 @@ impl JobGenerator TenantDownloader<'a> { // Download the layers in the heatmap for timeline in heatmap.timelines { + let ctx = &ctx.with_scope_secondary_timeline(tenant_shard_id, &timeline.timeline_id); let timeline_state = timeline_states .remove(&timeline.timeline_id) .expect("Just populated above"); @@ -869,8 +873,7 @@ impl<'a> TenantDownloader<'a> { let heatmap_timeline = heatmap.timelines.get(heatmap_timeline_index).unwrap(); let layers_in_heatmap = heatmap_timeline - .layers - .iter() + .hot_layers() .map(|l| (&l.name, l.metadata.generation)) .collect::>(); let layers_on_disk = timeline_state @@ -1015,7 +1018,8 @@ impl<'a> TenantDownloader<'a> { // Accumulate updates to the state let mut touched = Vec::new(); - for layer in timeline.layers { + let timeline_id = timeline.timeline_id; + for layer in timeline.into_hot_layers() { if self.secondary_state.cancel.is_cancelled() { tracing::debug!("Cancelled -- dropping out of layer loop"); return (Err(UpdateError::Cancelled), touched); @@ -1040,7 +1044,7 @@ impl<'a> TenantDownloader<'a> { } match self - .download_layer(tenant_shard_id, &timeline.timeline_id, layer, ctx) + .download_layer(tenant_shard_id, &timeline_id, layer, ctx) .await { Ok(Some(layer)) => touched.push(layer), @@ -1148,7 +1152,7 @@ impl<'a> TenantDownloader<'a> { let tenant_shard_id = self.secondary_state.get_tenant_shard_id(); let timeline_id = timeline.timeline_id; - tracing::debug!(timeline_id=%timeline_id, "Downloading layers, {} in heatmap", timeline.layers.len()); + tracing::debug!(timeline_id=%timeline_id, "Downloading layers, {} in heatmap", timeline.hot_layers().count()); let (result, touched) = self .download_timeline_layers(tenant_shard_id, timeline, timeline_state, deadline, ctx) @@ -1316,11 +1320,11 @@ async fn init_timeline_state( // As we iterate through layers found on disk, we will look up their metadata from this map. // Layers not present in metadata will be discarded. let heatmap_metadata: HashMap<&LayerName, &HeatMapLayer> = - heatmap.layers.iter().map(|l| (&l.name, l)).collect(); + heatmap.hot_layers().map(|l| (&l.name, l)).collect(); let last_heatmap_metadata: HashMap<&LayerName, &HeatMapLayer> = if let Some(last_heatmap) = last_heatmap { - last_heatmap.layers.iter().map(|l| (&l.name, l)).collect() + last_heatmap.hot_layers().map(|l| (&l.name, l)).collect() } else { HashMap::new() }; diff --git a/pageserver/src/tenant/secondary/heatmap.rs b/pageserver/src/tenant/secondary/heatmap.rs index 4a938e9095..6dbb3f091f 100644 --- a/pageserver/src/tenant/secondary/heatmap.rs +++ b/pageserver/src/tenant/secondary/heatmap.rs @@ -42,7 +42,7 @@ pub(crate) struct HeatMapTimeline { #[serde_as(as = "DisplayFromStr")] pub(crate) timeline_id: TimelineId, - pub(crate) layers: Vec, + layers: Vec, } #[serde_as] @@ -53,8 +53,10 @@ pub(crate) struct HeatMapLayer { #[serde_as(as = "TimestampSeconds")] pub(crate) access_time: SystemTime, - // TODO: an actual 'heat' score that would let secondary locations prioritize downloading - // the hottest layers, rather than trying to simply mirror whatever layers are on-disk on the primary. 
+ + #[serde(default)] + pub(crate) cold: bool, // TODO: an actual 'heat' score that would let secondary locations prioritize downloading + // the hottest layers, rather than trying to simply mirror whatever layers are on-disk on the primary. } impl HeatMapLayer { @@ -62,11 +64,13 @@ impl HeatMapLayer { name: LayerName, metadata: LayerFileMetadata, access_time: SystemTime, + cold: bool, ) -> Self { Self { name, metadata, access_time, + cold, } } } @@ -78,6 +82,18 @@ impl HeatMapTimeline { layers, } } + + pub(crate) fn into_hot_layers(self) -> impl Iterator { + self.layers.into_iter().filter(|l| !l.cold) + } + + pub(crate) fn hot_layers(&self) -> impl Iterator { + self.layers.iter().filter(|l| !l.cold) + } + + pub(crate) fn all_layers(&self) -> impl Iterator { + self.layers.iter() + } } pub(crate) struct HeatMapStats { @@ -92,7 +108,7 @@ impl HeatMapTenant { layers: 0, }; for timeline in &self.timelines { - for layer in &timeline.layers { + for layer in timeline.hot_layers() { stats.layers += 1; stats.bytes += layer.metadata.file_size; } diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index ed6b351c75..8cc94b4e4d 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -474,7 +474,7 @@ async fn fill_logical_sizes( if cached_size.is_none() { let timeline = Arc::clone(timeline_hash.get(&timeline_id).unwrap()); let parallel_size_calcs = Arc::clone(limit); - let ctx = ctx.attached_child(); + let ctx = ctx.attached_child().with_scope_timeline(&timeline); joinset.spawn( calculate_logical_size(parallel_size_calcs, timeline, lsn, cause, ctx) .in_current_span(), diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 7f313f46a2..ece163b24a 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -40,6 +40,7 @@ use utils::sync::gate::GateGuard; use self::inmemory_layer::InMemoryLayerFileId; use super::PageReconstructError; +use super::layer_map::InMemoryLayerDesc; use super::timeline::{GetVectoredError, ReadPath}; use crate::config::PageServerConf; use crate::context::{AccessStatsBehavior, RequestContext}; @@ -721,6 +722,12 @@ struct LayerToVisitId { lsn_floor: Lsn, } +#[derive(Debug, PartialEq, Eq, Hash)] +pub enum ReadableLayerWeak { + PersistentLayer(Arc), + InMemoryLayer(InMemoryLayerDesc), +} + /// Layer wrapper for the read path. Note that it is valid /// to use these layers even after external operations have /// been performed on them (compaction, freeze, etc.). 
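// Illustrative sketch, not part of the diff: the heatmap hot/cold split added
// above, with stand-in types. In the real struct, `#[serde(default)]` keeps
// older heatmaps (which have no `cold` field) deserializing as all-hot. Names
// other than `cold`/`hot_layers`/`all_layers` are hypothetical.
#[derive(Debug, Clone)]
struct HeatLayer {
    name: String,
    // Defaults to false when absent, so pre-existing heatmaps stay fully "hot".
    cold: bool,
}

#[derive(Debug, Default)]
struct HeatTimeline {
    layers: Vec<HeatLayer>,
}

impl HeatTimeline {
    // Secondary downloads and the heatmap stats only look at hot layers...
    fn hot_layers(&self) -> impl Iterator<Item = &HeatLayer> {
        self.layers.iter().filter(|l| !l.cold)
    }
    // ...while callers that need the full picture can still see everything.
    fn all_layers(&self) -> impl Iterator<Item = &HeatLayer> {
        self.layers.iter()
    }
}

fn main() {
    let tl = HeatTimeline {
        layers: vec![
            HeatLayer { name: "recent-delta".into(), cold: false },
            HeatLayer { name: "evicted-image".into(), cold: true },
        ],
    };
    assert_eq!(tl.hot_layers().count(), 1);
    assert_eq!(tl.all_layers().count(), 2);
    for l in tl.hot_layers() {
        println!("would download: {}", l.name);
    }
}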
@@ -873,7 +880,7 @@ impl ReadableLayer { } ReadableLayer::InMemoryLayer(layer) => { layer - .get_values_reconstruct_data(keyspace, lsn_range.end, reconstruct_state, ctx) + .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_state, ctx) .await } } diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index b061bfab34..1d6e57fda5 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -1385,7 +1385,7 @@ impl DeltaLayerInner { block_reader, ); - tree_reader.dump().await?; + tree_reader.dump(ctx).await?; let keys = self.index_entries(ctx).await?; @@ -2024,6 +2024,7 @@ pub(crate) mod test { .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, ctx) .await .unwrap(); + let ctx = &ctx.with_scope_timeline(&timeline); let initdb_layer = timeline .layers @@ -2136,7 +2137,7 @@ pub(crate) mod test { .await .unwrap(); - let new_layer = new_layer.download_and_keep_resident().await.unwrap(); + let new_layer = new_layer.download_and_keep_resident(ctx).await.unwrap(); new_layer .copy_delta_prefix(&mut writer, truncate_at, ctx) diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 4d66843718..aa0f3fbff6 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -208,7 +208,7 @@ impl ImageLayerInner { block_reader, ); - tree_reader.dump().await?; + tree_reader.dump(ctx).await?; tree_reader .visit( diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 54c82914d5..f3a45a4976 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -416,7 +416,7 @@ impl InMemoryLayer { pub(crate) async fn get_values_reconstruct_data( self: &Arc, keyspace: KeySpace, - end_lsn: Lsn, + lsn_range: Range, reconstruct_state: &mut ValuesReconstructState, ctx: &RequestContext, ) -> Result<(), GetVectoredError> { @@ -433,8 +433,6 @@ impl InMemoryLayer { let mut reads: HashMap> = HashMap::new(); let mut ios: HashMap<(Key, Lsn), OnDiskValueIo> = Default::default(); - let lsn_range = self.start_lsn..end_lsn; - for range in keyspace.ranges.iter() { for (key, vec_map) in inner .index diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index ae06aca63b..247092bf45 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -324,16 +324,16 @@ impl Layer { reconstruct_data: &mut ValuesReconstructState, ctx: &RequestContext, ) -> Result<(), GetVectoredError> { - let downloaded = self - .0 - .get_or_maybe_download(true, Some(ctx)) - .await - .map_err(|err| match err { - DownloadError::TimelineShutdown | DownloadError::DownloadCancelled => { - GetVectoredError::Cancelled - } - other => GetVectoredError::Other(anyhow::anyhow!(other)), - })?; + let downloaded = + self.0 + .get_or_maybe_download(true, ctx) + .await + .map_err(|err| match err { + DownloadError::TimelineShutdown | DownloadError::DownloadCancelled => { + GetVectoredError::Cancelled + } + other => GetVectoredError::Other(anyhow::anyhow!(other)), + })?; let this = ResidentLayer { downloaded: downloaded.clone(), owner: self.clone(), @@ -356,8 +356,8 @@ impl Layer { /// Download the layer if evicted. /// /// Will not error when the layer is already downloaded. 
- pub(crate) async fn download(&self) -> Result<(), DownloadError> { - self.0.get_or_maybe_download(true, None).await?; + pub(crate) async fn download(&self, ctx: &RequestContext) -> Result<(), DownloadError> { + self.0.get_or_maybe_download(true, ctx).await?; Ok(()) } @@ -392,8 +392,11 @@ impl Layer { } /// Downloads if necessary and creates a guard, which will keep this layer from being evicted. - pub(crate) async fn download_and_keep_resident(&self) -> Result { - let downloaded = self.0.get_or_maybe_download(true, None).await?; + pub(crate) async fn download_and_keep_resident( + &self, + ctx: &RequestContext, + ) -> Result { + let downloaded = self.0.get_or_maybe_download(true, ctx).await?; Ok(ResidentLayer { downloaded, @@ -446,7 +449,7 @@ impl Layer { if verbose { // for now, unconditionally download everything, even if that might not be wanted. - let l = self.0.get_or_maybe_download(true, Some(ctx)).await?; + let l = self.0.get_or_maybe_download(true, ctx).await?; l.dump(&self.0, ctx).await? } @@ -945,7 +948,7 @@ impl LayerInner { async fn get_or_maybe_download( self: &Arc, allow_download: bool, - ctx: Option<&RequestContext>, + ctx: &RequestContext, ) -> Result, DownloadError> { let (weak, permit) = { // get_or_init_detached can: @@ -1035,21 +1038,14 @@ impl LayerInner { return Err(DownloadError::NotFile(ft)); } - if let Some(ctx) = ctx { - self.check_expected_download(ctx)?; - } + self.check_expected_download(ctx)?; if !allow_download { // this is only used from tests, but it is hard to test without the boolean return Err(DownloadError::DownloadRequired); } - let download_ctx = ctx - .map(|ctx| ctx.detached_child(TaskKind::LayerDownload, DownloadBehavior::Download)) - .unwrap_or(RequestContext::new( - TaskKind::LayerDownload, - DownloadBehavior::Download, - )); + let download_ctx = ctx.detached_child(TaskKind::LayerDownload, DownloadBehavior::Download); async move { tracing::info!(%reason, "downloading on-demand"); @@ -1567,10 +1563,10 @@ impl LayerInner { self.access_stats.record_residence_event(); - self.status.as_ref().unwrap().send_replace(Status::Evicted); - *self.last_evicted_at.lock().unwrap() = Some(std::time::Instant::now()); + self.status.as_ref().unwrap().send_replace(Status::Evicted); + Ok(()) } diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs index 724150d27f..7086429bfe 100644 --- a/pageserver/src/tenant/storage_layer/layer/tests.rs +++ b/pageserver/src/tenant/storage_layer/layer/tests.rs @@ -8,7 +8,6 @@ use utils::id::TimelineId; use super::failpoints::{Failpoint, FailpointKind}; use super::*; use crate::context::DownloadBehavior; -use crate::task_mgr::TaskKind; use crate::tenant::harness::{TenantHarness, test_img}; use crate::tenant::storage_layer::{IoConcurrency, LayerVisibilityHint}; @@ -27,11 +26,9 @@ async fn smoke_test() { let h = TenantHarness::create("smoke_test").await.unwrap(); let span = h.span(); let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1)); - let (tenant, _) = h.load().await; + let (tenant, ctx) = h.load().await; let io_concurrency = IoConcurrency::spawn_for_test(); - let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Download); - let image_layers = vec![( Lsn(0x40), vec![( @@ -49,12 +46,14 @@ async fn smoke_test() { Lsn(0x10), 14, &ctx, + Default::default(), // in-memory layers Default::default(), image_layers, Lsn(0x100), ) .await .unwrap(); + let ctx = &ctx.with_scope_timeline(&timeline); // Grab one of the timeline's 
layers to exercise in the test, and the other layer that is just // there to avoid the timeline being illegally empty @@ -93,7 +92,7 @@ async fn smoke_test() { controlfile_keyspace.clone(), Lsn(0x10)..Lsn(0x11), &mut data, - &ctx, + ctx, ) .await .unwrap(); @@ -128,7 +127,7 @@ async fn smoke_test() { controlfile_keyspace.clone(), Lsn(0x10)..Lsn(0x11), &mut data, - &ctx, + ctx, ) .instrument(download_span.clone()) .await @@ -178,7 +177,7 @@ async fn smoke_test() { // plain downloading is rarely needed layer - .download_and_keep_resident() + .download_and_keep_resident(ctx) .instrument(download_span) .await .unwrap(); @@ -340,6 +339,7 @@ fn read_wins_pending_eviction() { .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx) .await .unwrap(); + let ctx = ctx.with_scope_timeline(&timeline); let layer = { let mut layers = { @@ -379,7 +379,7 @@ fn read_wins_pending_eviction() { // because no actual eviction happened, we get to just reinitialize the DownloadedLayer layer .0 - .get_or_maybe_download(false, None) + .get_or_maybe_download(false, &ctx) .instrument(download_span) .await .expect("should had reinitialized without downloading"); @@ -472,6 +472,7 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) { .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx) .await .unwrap(); + let ctx = ctx.with_scope_timeline(&timeline); let layer = { let mut layers = { @@ -514,7 +515,7 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) { // because no actual eviction happened, we get to just reinitialize the DownloadedLayer layer .0 - .get_or_maybe_download(false, None) + .get_or_maybe_download(false, &ctx) .instrument(download_span) .await .expect("should had reinitialized without downloading"); @@ -641,7 +642,12 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() { .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx) .await .unwrap(); + let ctx = ctx.with_scope_timeline(&timeline); + // This test does downloads + let ctx = RequestContextBuilder::extend(&ctx) + .download_behavior(DownloadBehavior::Download) + .build(); let layer = { let mut layers = { let layers = timeline.layers.read().await; @@ -674,7 +680,7 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() { // simulate a cancelled read which is cancelled before it gets to re-initialize let e = layer .0 - .get_or_maybe_download(false, None) + .get_or_maybe_download(false, &ctx) .await .unwrap_err(); assert!( @@ -698,7 +704,7 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() { // failpoint is still enabled, but it is not hit let e = layer .0 - .get_or_maybe_download(false, None) + .get_or_maybe_download(false, &ctx) .await .unwrap_err(); assert!(matches!(e, DownloadError::DownloadRequired), "{e:?}"); @@ -721,6 +727,12 @@ async fn evict_and_wait_does_not_wait_for_download() { .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx) .await .unwrap(); + let ctx = ctx.with_scope_timeline(&timeline); + + // This test does downloads + let ctx = RequestContextBuilder::extend(&ctx) + .download_behavior(DownloadBehavior::Download) + .build(); let layer = { let mut layers = { @@ -768,7 +780,7 @@ async fn evict_and_wait_does_not_wait_for_download() { let mut download = std::pin::pin!( layer .0 - .get_or_maybe_download(true, None) + .get_or_maybe_download(true, &ctx) .instrument(download_span) ); diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 670f9ad87f..589ac5ae88 
100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -289,15 +289,14 @@ fn log_compaction_error( ) { use CompactionError::*; - use crate::pgdatadir_mapping::CollectKeySpaceError; use crate::tenant::PageReconstructError; use crate::tenant::upload_queue::NotInitialized; let level = match err { + e if e.is_cancel() => return, ShuttingDown => return, Offload(_) => Level::ERROR, AlreadyRunning(_) => Level::ERROR, - CollectKeySpaceError(CollectKeySpaceError::Cancelled) => Level::INFO, CollectKeySpaceError(_) => Level::ERROR, _ if task_cancelled => Level::INFO, Other(err) => { @@ -474,21 +473,15 @@ async fn wait_for_active_tenant( } let mut update_rx = tenant.subscribe_for_state_updates(); - loop { - tokio::select! { - _ = cancel.cancelled() => return ControlFlow::Break(()), - result = update_rx.changed() => if result.is_err() { + tokio::select! { + result = update_rx.wait_for(|s| s == &TenantState::Active) => { + if result.is_err() { return ControlFlow::Break(()); } - } - - match &*update_rx.borrow() { - TenantState::Active => { - debug!("Tenant state changed to active, continuing the task loop"); - return ControlFlow::Continue(()); - } - state => debug!("Not running the task loop, tenant is not active: {state:?}"), - } + debug!("Tenant state changed to active, continuing the task loop"); + ControlFlow::Continue(()) + }, + _ = cancel.cancelled() => ControlFlow::Break(()), } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index abcce23d83..c78c17c9bb 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -46,7 +46,7 @@ use pageserver_api::keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPart use pageserver_api::models::{ CompactKeyRange, CompactLsnRange, CompactionAlgorithm, CompactionAlgorithmSettings, DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, - InMemoryLayerInfo, LayerMapInfo, LsnLease, PageTraceEvent, TimelineState, + InMemoryLayerInfo, LayerMapInfo, LsnLease, PageTraceEvent, RelSizeMigration, TimelineState, }; use pageserver_api::reltag::{BlockNumber, RelTag}; use pageserver_api::shard::{ShardIdentity, ShardIndex, ShardNumber, TenantShardId}; @@ -99,7 +99,8 @@ use crate::disk_usage_eviction_task::{DiskUsageEvictionInfo, EvictionCandidate, use crate::keyspace::{KeyPartitioning, KeySpace}; use crate::l0_flush::{self, L0FlushGlobalState}; use crate::metrics::{ - DELTAS_PER_READ_GLOBAL, LAYERS_PER_READ_GLOBAL, ScanLatencyOngoingRecording, TimelineMetrics, + DELTAS_PER_READ_GLOBAL, LAYERS_PER_READ_AMORTIZED_GLOBAL, LAYERS_PER_READ_BATCH_GLOBAL, + LAYERS_PER_READ_GLOBAL, ScanLatencyOngoingRecording, TimelineMetrics, }; use crate::page_service::TenantManagerTypes; use crate::pgdatadir_mapping::{ @@ -286,7 +287,7 @@ pub struct Timeline { // The LSN of gc-compaction that was last applied to this timeline. gc_compaction_state: ArcSwap>, - pub(super) metrics: TimelineMetrics, + pub(crate) metrics: Arc, // `Timeline` doesn't write these metrics itself, but it manages the lifetime. Code // in `crate::page_service` writes these metrics. @@ -436,12 +437,16 @@ pub struct Timeline { /// May host a background Tokio task which downloads all the layers from the current /// heatmap on demand. 
heatmap_layers_downloader: Mutex>, + + pub(crate) rel_size_v2_status: ArcSwapOption, } pub(crate) enum PreviousHeatmap { Active { heatmap: HeatMapTimeline, read_at: std::time::Instant, + // End LSN covered by the heatmap if known + end_lsn: Option, }, Obsolete, } @@ -1326,10 +1331,6 @@ impl Timeline { // (this is a requirement, not a bug). Skip updating the metric in these cases // to avoid infinite results. if !results.is_empty() { - // Record the total number of layers visited towards each key in the batch. While some - // layers may not intersect with a given read, and the cost of layer visits are - // amortized across the batch, each visited layer contributes directly to the observed - // latency for every read in the batch, which is what we care about. if layers_visited >= Self::LAYERS_VISITED_WARN_THRESHOLD { static LOG_PACER: Lazy> = Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(60)))); @@ -1344,9 +1345,23 @@ impl Timeline { }); } + // Records the number of layers visited in a few different ways: + // + // * LAYERS_PER_READ: all layers count towards every read in the batch, because each + // layer directly affects its observed latency. + // + // * LAYERS_PER_READ_BATCH: all layers count towards each batch, to get the per-batch + // layer visits and access cost. + // + // * LAYERS_PER_READ_AMORTIZED: the average layer count per read, to get the amortized + // read amplification after batching. + let layers_visited = layers_visited as f64; + let avg_layers_visited = layers_visited / results.len() as f64; + LAYERS_PER_READ_BATCH_GLOBAL.observe(layers_visited); for _ in &results { - self.metrics.layers_per_read.observe(layers_visited as f64); - LAYERS_PER_READ_GLOBAL.observe(layers_visited as f64); + self.metrics.layers_per_read.observe(layers_visited); + LAYERS_PER_READ_GLOBAL.observe(layers_visited); + LAYERS_PER_READ_AMORTIZED_GLOBAL.observe(avg_layers_visited); } } @@ -1864,16 +1879,25 @@ impl Timeline { }; // Signal compaction failure to avoid L0 flush stalls when it's broken. - match result { + match &result { Ok(_) => self.compaction_failed.store(false, AtomicOrdering::Relaxed), - Err(CompactionError::Other(_)) | Err(CompactionError::CollectKeySpaceError(_)) => { + Err(e) if e.is_cancel() => {} + Err(CompactionError::ShuttingDown) => { + // Covered by the `Err(e) if e.is_cancel()` branch. + } + Err(CompactionError::AlreadyRunning(_)) => { + // Covered by the `Err(e) if e.is_cancel()` branch. + } + Err(CompactionError::Other(_)) => { + self.compaction_failed.store(true, AtomicOrdering::Relaxed) + } + Err(CompactionError::CollectKeySpaceError(_)) => { + // Cancelled errors are covered by the `Err(e) if e.is_cancel()` branch. self.compaction_failed.store(true, AtomicOrdering::Relaxed) } // Don't change the current value on offload failure or shutdown. We don't want to // abruptly stall nor resume L0 flushes in these cases. 
Err(CompactionError::Offload(_)) => {} - Err(CompactionError::ShuttingDown) => {} - Err(CompactionError::AlreadyRunning(_)) => {} }; result @@ -2188,6 +2212,7 @@ impl Timeline { pub(crate) async fn download_layer( &self, layer_file_name: &LayerName, + ctx: &RequestContext, ) -> Result, super::storage_layer::layer::DownloadError> { let Some(layer) = self .find_layer(layer_file_name) @@ -2201,7 +2226,7 @@ impl Timeline { return Ok(None); }; - layer.download().await?; + layer.download(ctx).await?; Ok(Some(true)) } @@ -2356,6 +2381,9 @@ impl Timeline { .unwrap_or(self.conf.default_tenant_conf.compaction_threshold) } + /// Returns `true` if the rel_size_v2 config is enabled. NOTE: the write path and read path + /// should look at `get_rel_size_v2_status()` to get the actual status of the timeline. It is + /// possible that the index part persists the state while the config doesn't get persisted. pub(crate) fn get_rel_size_v2_enabled(&self) -> bool { let tenant_conf = self.tenant_conf.load(); tenant_conf @@ -2364,6 +2392,14 @@ impl Timeline { .unwrap_or(self.conf.default_tenant_conf.rel_size_v2_enabled) } + pub(crate) fn get_rel_size_v2_status(&self) -> RelSizeMigration { + self.rel_size_v2_status + .load() + .as_ref() + .map(|s| s.as_ref().clone()) + .unwrap_or(RelSizeMigration::Legacy) + } + fn get_compaction_upper_limit(&self) -> usize { let tenant_conf = self.tenant_conf.load(); tenant_conf @@ -2624,6 +2660,7 @@ impl Timeline { attach_wal_lag_cooldown: Arc>, create_idempotency: crate::tenant::CreateTimelineIdempotency, gc_compaction_state: Option, + rel_size_v2_status: Option, cancel: CancellationToken, ) -> Arc { let disk_consistent_lsn = metadata.disk_consistent_lsn(); @@ -2648,14 +2685,14 @@ impl Timeline { } Arc::new_cyclic(|myself| { - let metrics = TimelineMetrics::new( + let metrics = Arc::new(TimelineMetrics::new( &tenant_shard_id, &timeline_id, crate::metrics::EvictionsWithLowResidenceDurationBuilder::new( "mtime", evictions_low_residence_duration_metric_threshold, ), - ); + )); let aux_file_metrics = metrics.aux_file_size_gauge.clone(); let mut result = Timeline { @@ -2782,6 +2819,8 @@ impl Timeline { previous_heatmap: ArcSwapOption::from_pointee(previous_heatmap), heatmap_layers_downloader: Mutex::new(None), + + rel_size_v2_status: ArcSwapOption::from_pointee(rel_size_v2_status), }; result.repartition_threshold = @@ -2837,7 +2876,7 @@ impl Timeline { "layer flush task", async move { let _guard = guard; - let background_ctx = RequestContext::todo_child(TaskKind::LayerFlushTask, DownloadBehavior::Error); + let background_ctx = RequestContext::todo_child(TaskKind::LayerFlushTask, DownloadBehavior::Error).with_scope_timeline(&self_clone); self_clone.flush_loop(layer_flush_start_rx, &background_ctx).await; let mut flush_loop_state = self_clone.flush_loop_state.lock().unwrap(); assert!(matches!(*flush_loop_state, FlushLoopState::Running{..})); @@ -2858,6 +2897,16 @@ impl Timeline { .schedule_index_upload_for_gc_compaction_state_update(gc_compaction_state) } + pub(crate) fn update_rel_size_v2_status( + &self, + rel_size_v2_status: RelSizeMigration, + ) -> anyhow::Result<()> { + self.rel_size_v2_status + .store(Some(Arc::new(rel_size_v2_status.clone()))); + self.remote_client + .schedule_index_upload_for_rel_size_v2_status_update(rel_size_v2_status) + } + pub(crate) fn get_gc_compaction_state(&self) -> Option { self.gc_compaction_state.load_full().as_ref().clone() } @@ -3560,12 +3609,16 @@ impl Timeline { Ok(layer) } - pub(super) fn is_previous_heatmap_active(&self) -> bool { - 
self.previous_heatmap - .load() - .as_ref() - .map(|prev| matches!(**prev, PreviousHeatmap::Active { .. })) - .unwrap_or(false) + pub(super) fn should_keep_previous_heatmap(&self, new_heatmap_end_lsn: Lsn) -> bool { + let crnt = self.previous_heatmap.load(); + match crnt.as_deref() { + Some(PreviousHeatmap::Active { end_lsn, .. }) => match end_lsn { + Some(crnt_end_lsn) => *crnt_end_lsn > new_heatmap_end_lsn, + None => true, + }, + Some(PreviousHeatmap::Obsolete) => false, + None => false, + } } /// The timeline heatmap is a hint to secondary locations from the primary location, @@ -3593,26 +3646,26 @@ impl Timeline { // heatamp. let previous_heatmap = self.previous_heatmap.load(); let visible_non_resident = match previous_heatmap.as_deref() { - Some(PreviousHeatmap::Active { heatmap, read_at }) => { - Some(heatmap.layers.iter().filter_map(|hl| { - let desc: PersistentLayerDesc = hl.name.clone().into(); - let layer = guard.try_get_from_key(&desc.key())?; + Some(PreviousHeatmap::Active { + heatmap, read_at, .. + }) => Some(heatmap.all_layers().filter_map(|hl| { + let desc: PersistentLayerDesc = hl.name.clone().into(); + let layer = guard.try_get_from_key(&desc.key())?; - if layer.visibility() == LayerVisibilityHint::Covered { - return None; - } + if layer.visibility() == LayerVisibilityHint::Covered { + return None; + } - if layer.is_likely_resident() { - return None; - } + if layer.is_likely_resident() { + return None; + } - if layer.last_evicted_at().happened_after(*read_at) { - return None; - } + if layer.last_evicted_at().happened_after(*read_at) { + return None; + } - Some((desc, hl.metadata.clone(), hl.access_time)) - })) - } + Some((desc, hl.metadata.clone(), hl.access_time, hl.cold)) + })), Some(PreviousHeatmap::Obsolete) => None, None => None, }; @@ -3627,6 +3680,7 @@ impl Timeline { layer.layer_desc().clone(), layer.metadata(), last_activity_ts, + false, // these layers are not cold )) } LayerVisibilityHint::Covered => { @@ -3653,12 +3707,14 @@ impl Timeline { // Sort layers in order of which to download first. For a large set of layers to download, we // want to prioritize those layers which are most likely to still be in the resident many minutes // or hours later: + // - Cold layers go last for convenience when a human inspects the heatmap. // - Download L0s last, because they churn the fastest: L0s on a fast-writing tenant might // only exist for a few minutes before being compacted into L1s. // - For L1 & image layers, download most recent LSNs first: the older the LSN, the sooner // the layer is likely to be covered by an image layer during compaction. 
- layers.sort_by_key(|(desc, _meta, _atime)| { + layers.sort_by_key(|(desc, _meta, _atime, cold)| { std::cmp::Reverse(( + *cold, !LayerMap::is_l0(&desc.key_range, desc.is_delta), desc.lsn_range.end, )) @@ -3666,7 +3722,9 @@ impl Timeline { let layers = layers .into_iter() - .map(|(desc, meta, atime)| HeatMapLayer::new(desc.layer_name(), meta, atime)) + .map(|(desc, meta, atime, cold)| { + HeatMapLayer::new(desc.layer_name(), meta, atime, cold) + }) .collect(); Some(HeatMapTimeline::new(self.timeline_id, layers)) @@ -3686,6 +3744,7 @@ impl Timeline { name: vl.layer_desc().layer_name(), metadata: vl.metadata(), access_time: now, + cold: true, }; heatmap_layers.push(hl); } @@ -3699,6 +3758,7 @@ impl Timeline { PreviousHeatmap::Active { heatmap, read_at: Instant::now(), + end_lsn: Some(end_lsn), } } @@ -3897,39 +3957,22 @@ impl Timeline { let guard = timeline.layers.read().await; let layers = guard.layer_map()?; - let in_memory_layer = layers.find_in_memory_layer(|l| { - let start_lsn = l.get_lsn_range().start; - cont_lsn > start_lsn - }); + for range in unmapped_keyspace.ranges.iter() { + let results = layers.range_search(range.clone(), cont_lsn); - match in_memory_layer { - Some(l) => { - let lsn_range = l.get_lsn_range().start..cont_lsn; - fringe.update( - ReadableLayer::InMemoryLayer(l), - unmapped_keyspace.clone(), - lsn_range, - ); - } - None => { - for range in unmapped_keyspace.ranges.iter() { - let results = layers.range_search(range.clone(), cont_lsn); - - results - .found - .into_iter() - .map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| { - ( - ReadableLayer::PersistentLayer(guard.get_from_desc(&layer)), - keyspace_accum.to_keyspace(), - lsn_floor..cont_lsn, - ) - }) - .for_each(|(layer, keyspace, lsn_range)| { - fringe.update(layer, keyspace, lsn_range) - }); - } - } + results + .found + .into_iter() + .map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| { + ( + guard.upgrade(layer), + keyspace_accum.to_keyspace(), + lsn_floor..cont_lsn, + ) + }) + .for_each(|(layer, keyspace, lsn_range)| { + fringe.update(layer, keyspace, lsn_range) + }); } // It's safe to drop the layer map lock after planning the next round of reads. @@ -4202,10 +4245,6 @@ impl Timeline { // Stall flushes to backpressure if compaction can't keep up. This is propagated up // to WAL ingestion by having ephemeral layer rolls wait for flushes. - // - // NB: the compaction loop only checks `compaction_threshold` every 20 seconds, so - // we can end up stalling before compaction even starts. Consider making it more - // responsive (e.g. via `watch_level0_deltas`). if let Some(stall_threshold) = self.get_l0_flush_stall_threshold() { if l0_count >= stall_threshold { warn!( @@ -4693,10 +4732,7 @@ impl Timeline { )); } - let (dense_ks, sparse_ks) = self - .collect_keyspace(lsn, ctx) - .await - .map_err(CompactionError::CollectKeySpaceError)?; + let (dense_ks, sparse_ks) = self.collect_keyspace(lsn, ctx).await?; let dense_partitioning = dense_ks.partition(&self.shard_identity, partition_size); let sparse_partitioning = SparseKeyPartitioning { parts: vec![sparse_ks], @@ -5423,13 +5459,42 @@ pub(crate) enum CompactionError { Offload(OffloadError), /// Compaction cannot be done right now; page reconstruction and so on. 
#[error("Failed to collect keyspace: {0}")] - CollectKeySpaceError(CollectKeySpaceError), + CollectKeySpaceError(#[from] CollectKeySpaceError), #[error(transparent)] Other(anyhow::Error), #[error("Compaction already running: {0}")] AlreadyRunning(&'static str), } +impl CompactionError { + /// Errors that can be ignored, i.e., cancel and shutdown. + pub fn is_cancel(&self) -> bool { + matches!( + self, + Self::ShuttingDown + | Self::AlreadyRunning(_) + | Self::CollectKeySpaceError(CollectKeySpaceError::Cancelled) + | Self::CollectKeySpaceError(CollectKeySpaceError::PageRead( + PageReconstructError::Cancelled + )) + | Self::Offload(OffloadError::Cancelled) + ) + } + + /// Critical errors that indicate data corruption. + pub fn is_critical(&self) -> bool { + matches!( + self, + Self::CollectKeySpaceError( + CollectKeySpaceError::Decode(_) + | CollectKeySpaceError::PageRead( + PageReconstructError::MissingKey(_) | PageReconstructError::WalRedo(_), + ) + ) + ) + } +} + impl From for CompactionError { fn from(e: OffloadError) -> Self { match e { @@ -5439,18 +5504,6 @@ impl From for CompactionError { } } -impl From for CompactionError { - fn from(err: CollectKeySpaceError) -> Self { - match err { - CollectKeySpaceError::Cancelled - | CollectKeySpaceError::PageRead(PageReconstructError::Cancelled) => { - CompactionError::ShuttingDown - } - e => CompactionError::Other(e.into()), - } - } -} - impl From for CompactionError { fn from(value: super::upload_queue::NotInitialized) -> Self { match value { @@ -5534,6 +5587,14 @@ pub struct DeltaLayerTestDesc { pub data: Vec<(Key, Lsn, Value)>, } +#[cfg(test)] +#[derive(Clone)] +pub struct InMemoryLayerTestDesc { + pub lsn_range: Range, + pub data: Vec<(Key, Lsn, Value)>, + pub is_open: bool, +} + #[cfg(test)] impl DeltaLayerTestDesc { pub fn new(lsn_range: Range, key_range: Range, data: Vec<(Key, Lsn, Value)>) -> Self { @@ -6193,6 +6254,7 @@ impl Timeline { pub(crate) async fn spawn_download_all_remote_layers( self: Arc, request: DownloadRemoteLayersTaskSpawnRequest, + ctx: &RequestContext, ) -> Result { use pageserver_api::models::DownloadRemoteLayersTaskState; @@ -6213,6 +6275,10 @@ impl Timeline { } let self_clone = Arc::clone(&self); + let task_ctx = ctx.detached_child( + TaskKind::DownloadAllRemoteLayers, + DownloadBehavior::Download, + ); let task_id = task_mgr::spawn( task_mgr::BACKGROUND_RUNTIME.handle(), task_mgr::TaskKind::DownloadAllRemoteLayers, @@ -6220,7 +6286,7 @@ impl Timeline { Some(self.timeline_id), "download all remote layers task", async move { - self_clone.download_all_remote_layers(request).await; + self_clone.download_all_remote_layers(request, &task_ctx).await; let mut status_guard = self_clone.download_all_remote_layers_task_info.write().unwrap(); match &mut *status_guard { None => { @@ -6255,6 +6321,7 @@ impl Timeline { async fn download_all_remote_layers( self: &Arc, request: DownloadRemoteLayersTaskSpawnRequest, + ctx: &RequestContext, ) { use pageserver_api::models::DownloadRemoteLayersTaskState; @@ -6311,9 +6378,10 @@ impl Timeline { let span = tracing::info_span!("download", layer = %next); + let ctx = ctx.attached_child(); js.spawn( async move { - let res = next.download().await; + let res = next.download(&ctx).await; (next, res) } .instrument(span), @@ -6541,6 +6609,92 @@ impl Timeline { Ok(()) } + /// Force create an in-memory layer and place them into the layer map. 
+ #[cfg(test)] + pub(super) async fn force_create_in_memory_layer( + self: &Arc, + mut in_memory: InMemoryLayerTestDesc, + check_start_lsn: Option, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + use utils::bin_ser::BeSer; + + // Validate LSNs + if let Some(check_start_lsn) = check_start_lsn { + assert!(in_memory.lsn_range.start >= check_start_lsn); + } + + let last_record_lsn = self.get_last_record_lsn(); + let layer_end_lsn = if in_memory.is_open { + in_memory + .data + .iter() + .map(|(_key, lsn, _value)| lsn) + .max() + .cloned() + } else { + Some(in_memory.lsn_range.end) + }; + + if let Some(end) = layer_end_lsn { + assert!( + end <= last_record_lsn, + "advance last record lsn before inserting a layer, end_lsn={}, last_record_lsn={}", + end, + last_record_lsn, + ); + } + + in_memory.data.iter().for_each(|(_key, lsn, _value)| { + assert!(*lsn >= in_memory.lsn_range.start); + assert!(*lsn < in_memory.lsn_range.end); + }); + + // Build the batch + in_memory + .data + .sort_unstable_by(|(ka, la, _), (kb, lb, _)| (ka, la).cmp(&(kb, lb))); + + let data = in_memory + .data + .into_iter() + .map(|(key, lsn, value)| { + let value_size = value.serialized_size().unwrap() as usize; + (key.to_compact(), lsn, value_size, value) + }) + .collect::>(); + + let batch = SerializedValueBatch::from_values(data); + + // Create the in-memory layer and write the batch into it + let layer = InMemoryLayer::create( + self.conf, + self.timeline_id, + self.tenant_shard_id, + in_memory.lsn_range.start, + &self.gate, + ctx, + ) + .await + .unwrap(); + + layer.put_batch(batch, ctx).await.unwrap(); + if !in_memory.is_open { + layer.freeze(in_memory.lsn_range.end).await; + } + + info!("force created in-memory layer {:?}", in_memory.lsn_range); + + // Link the layer to the layer map + { + let mut guard = self.layers.write().await; + let layer_map = guard.open_mut().unwrap(); + layer_map.force_insert_in_memory_layer(Arc::new(layer)); + } + + Ok(()) + } + /// Return all keys at the LSN in the image layers #[cfg(test)] pub(crate) async fn inspect_image_layers( @@ -6900,11 +7054,13 @@ mod tests { use pageserver_api::key::Key; use pageserver_api::value::Value; + use std::iter::Iterator; use tracing::Instrument; use utils::id::TimelineId; use utils::lsn::Lsn; use super::HeatMapTimeline; + use crate::context::RequestContextBuilder; use crate::tenant::harness::{TenantHarness, test_img}; use crate::tenant::layer_map::LayerMap; use crate::tenant::storage_layer::{Layer, LayerName, LayerVisibilityHint}; @@ -6912,8 +7068,8 @@ mod tests { use crate::tenant::{PreviousHeatmap, Timeline}; fn assert_heatmaps_have_same_layers(lhs: &HeatMapTimeline, rhs: &HeatMapTimeline) { - assert_eq!(lhs.layers.len(), rhs.layers.len()); - let lhs_rhs = lhs.layers.iter().zip(rhs.layers.iter()); + assert_eq!(lhs.all_layers().count(), rhs.all_layers().count()); + let lhs_rhs = lhs.all_layers().zip(rhs.all_layers()); for (l, r) in lhs_rhs { assert_eq!(l.name, r.name); assert_eq!(l.metadata, r.metadata); @@ -6972,12 +7128,14 @@ mod tests { Lsn(0x10), 14, &ctx, + Vec::new(), // in-memory layers delta_layers, image_layers, Lsn(0x100), ) .await .unwrap(); + let ctx = &ctx.with_scope_timeline(&timeline); // Layer visibility is an input to heatmap generation, so refresh it first timeline.update_layer_visibility().await.unwrap(); @@ -6990,10 +7148,11 @@ mod tests { assert_eq!(heatmap.timeline_id, timeline.timeline_id); // L0 should come last - assert_eq!(heatmap.layers.last().unwrap().name, l0_delta.layer_name()); + let heatmap_layers = 
heatmap.all_layers().collect::>(); + assert_eq!(heatmap_layers.last().unwrap().name, l0_delta.layer_name()); let mut last_lsn = Lsn::MAX; - for layer in &heatmap.layers { + for layer in heatmap_layers { // Covered layer should be omitted assert!(layer.name != covered_delta.layer_name()); @@ -7026,6 +7185,7 @@ mod tests { .store(Some(Arc::new(PreviousHeatmap::Active { heatmap: heatmap.clone(), read_at: std::time::Instant::now(), + end_lsn: None, }))); // Generate a new heatmap and assert that it contains the same layers as the old one. @@ -7041,8 +7201,12 @@ mod tests { eprintln!("Downloading {layer} and re-generating heatmap"); + let ctx = &RequestContextBuilder::extend(ctx) + .download_behavior(crate::context::DownloadBehavior::Download) + .build(); + let _resident = layer - .download_and_keep_resident() + .download_and_keep_resident(ctx) .instrument(tracing::info_span!( parent: None, "download_layer", @@ -7100,6 +7264,7 @@ mod tests { Lsn(0x10), 14, &ctx, + Vec::new(), // in-memory layers delta_layers, image_layers, Lsn(0x100), @@ -7116,7 +7281,7 @@ mod tests { .expect("Infallible while timeline is not shut down"); // Both layers should be in the heatmap - assert!(!heatmap.layers.is_empty()); + assert!(heatmap.all_layers().count() > 0); // Now simulate a migration. timeline @@ -7124,6 +7289,7 @@ mod tests { .store(Some(Arc::new(PreviousHeatmap::Active { heatmap: heatmap.clone(), read_at: std::time::Instant::now(), + end_lsn: None, }))); // Evict all the layers in the previous heatmap @@ -7141,7 +7307,7 @@ mod tests { .await .expect("Infallible while timeline is not shut down"); - assert!(post_eviction_heatmap.layers.is_empty()); + assert_eq!(post_eviction_heatmap.all_layers().count(), 0); assert!(matches!( timeline.previous_heatmap.load().as_deref(), Some(PreviousHeatmap::Obsolete) diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 1f746930d5..87e0d02773 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -7,11 +7,20 @@ use std::collections::{BinaryHeap, HashMap, HashSet, VecDeque}; use std::ops::{Deref, Range}; use std::sync::Arc; +use std::time::Instant; -use anyhow::{Context, anyhow, bail}; +use super::layer_manager::LayerManager; +use super::{ + CompactFlags, CompactOptions, CompactionError, CreateImageLayersError, DurationRecorder, + GetVectoredError, ImageLayerCreationMode, LastImageLayerCreationStatus, RecordedDuration, + Timeline, +}; + +use anyhow::{Context, anyhow}; use bytes::Bytes; use enumset::EnumSet; use fail::fail_point; +use futures::FutureExt; use itertools::Itertools; use once_cell::sync::Lazy; use pageserver_api::config::tenant_conf_defaults::DEFAULT_CHECKPOINT_DISTANCE; @@ -31,15 +40,8 @@ use utils::critical; use utils::id::TimelineId; use utils::lsn::Lsn; -use super::layer_manager::LayerManager; -use super::{ - CompactFlags, CompactOptions, CompactionError, CreateImageLayersError, DurationRecorder, - GetVectoredError, ImageLayerCreationMode, LastImageLayerCreationStatus, PageReconstructError, - RecordedDuration, Timeline, -}; use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder}; use crate::page_cache; -use crate::pgdatadir_mapping::CollectKeySpaceError; use crate::statvfs::Statvfs; use crate::tenant::checks::check_valid_layermap; use crate::tenant::gc_block::GcBlock; @@ -213,30 +215,39 @@ impl GcCompactionQueue { } /// Trigger an auto compaction. 
- pub async fn trigger_auto_compaction(&self, timeline: &Arc) { + pub async fn trigger_auto_compaction( + &self, + timeline: &Arc, + ) -> Result<(), CompactionError> { let GcCompactionCombinedSettings { gc_compaction_enabled, gc_compaction_initial_threshold_kb, gc_compaction_ratio_percent, } = timeline.get_gc_compaction_settings(); if !gc_compaction_enabled { - return; + return Ok(()); } if self.remaining_jobs_num() > 0 { // Only schedule auto compaction when the queue is empty - return; + return Ok(()); } if timeline.ancestor_timeline().is_some() { // Do not trigger auto compaction for child timelines. We haven't tested // it enough in staging yet. - return; + return Ok(()); + } + if timeline.get_gc_compaction_watermark() == Lsn::INVALID { + // If the gc watermark is not set, we don't need to trigger auto compaction. + // This check is the same as in `gc_compaction_split_jobs` but we don't log + // here and we can also skip the computation of the trigger condition earlier. + return Ok(()); } let Ok(permit) = CONCURRENT_GC_COMPACTION_TASKS.clone().try_acquire_owned() else { // Only allow one compaction run at a time. TODO: As we do `try_acquire_owned`, we cannot ensure // the fairness of the lock across timelines. We should listen for both `acquire` and `l0_compaction_trigger` // to ensure the fairness while avoid starving other tasks. - return; + return Ok(()); }; let gc_compaction_state = timeline.get_gc_compaction_state(); @@ -246,7 +257,7 @@ impl GcCompactionQueue { let layers = { let guard = timeline.layers.read().await; - let layer_map = guard.layer_map().unwrap(); + let layer_map = guard.layer_map()?; layer_map.iter_historic_layers().collect_vec() }; let mut l2_size: u64 = 0; @@ -318,11 +329,12 @@ impl GcCompactionQueue { l1_size, l2_size, l2_lsn, gc_cutoff ); } else { - info!( + debug!( "did not trigger auto gc-compaction: l1_size={}, l2_size={}, l2_lsn={}, gc_cutoff={}", l1_size, l2_size, l2_lsn, gc_cutoff ); } + Ok(()) } /// Notify the caller the job has finished and unblock GC. @@ -353,8 +365,7 @@ impl GcCompactionQueue { GcCompactJob::from_compact_options(options.clone()), options.sub_compaction_max_job_size_mb, ) - .await - .map_err(CompactionError::Other)?; + .await?; if jobs.is_empty() { info!("no jobs to run, skipping scheduled compaction task"); self.notify_and_unblock(id); @@ -433,6 +444,7 @@ impl GcCompactionQueue { )); }; let has_pending_tasks; + let mut yield_for_l0 = false; let Some((id, item)) = ({ let mut guard = self.inner.lock().unwrap(); if let Some((id, item)) = guard.queued.pop_front() { @@ -444,7 +456,7 @@ impl GcCompactionQueue { None } }) else { - self.trigger_auto_compaction(timeline).await; + self.trigger_auto_compaction(timeline).await?; // Always yield after triggering auto-compaction. Gc-compaction is a low-priority task and we // have not implemented preemption mechanism yet. We always want to yield it to more important // tasks if there is one. @@ -482,13 +494,23 @@ impl GcCompactionQueue { let mut guard = self.inner.lock().unwrap(); guard.guards.entry(id).or_default().gc_guard = Some(gc_guard); } - let _ = timeline.compact_with_options(cancel, options, ctx).await?; + let compaction_result = + timeline.compact_with_options(cancel, options, ctx).await?; self.notify_and_unblock(id); + if compaction_result == CompactionOutcome::YieldForL0 { + yield_for_l0 = true; + } } } GcCompactionQueueItem::SubCompactionJob(options) => { // TODO: error handling, clear the queue if any task fails? 
- let _ = timeline.compact_with_options(cancel, options, ctx).await?; + let compaction_result = timeline.compact_with_options(cancel, options, ctx).await?; + if compaction_result == CompactionOutcome::YieldForL0 { + // We will permanently give up a task if we yield for L0 compaction: the preempted subcompaction job won't be running + // again. This ensures that we don't keep doing duplicated work within gc-compaction. Not directly returning here because + // we need to clean things up before returning from the function. + yield_for_l0 = true; + } } GcCompactionQueueItem::Notify(id, l2_lsn) => { self.notify_and_unblock(id); @@ -517,7 +539,10 @@ impl GcCompactionQueue { let mut guard = self.inner.lock().unwrap(); guard.running = None; } - Ok(if has_pending_tasks { + Ok(if yield_for_l0 { + tracing::info!("give up gc-compaction: yield for L0 compaction"); + CompactionOutcome::YieldForL0 + } else if has_pending_tasks { CompactionOutcome::Pending } else { CompactionOutcome::Done @@ -716,17 +741,41 @@ struct CompactionStatisticsNumSize { #[derive(Debug, Serialize, Default)] pub struct CompactionStatistics { + /// Delta layer visited (maybe compressed, physical size) delta_layer_visited: CompactionStatisticsNumSize, + /// Image layer visited (maybe compressed, physical size) image_layer_visited: CompactionStatisticsNumSize, + /// Delta layer produced (maybe compressed, physical size) delta_layer_produced: CompactionStatisticsNumSize, + /// Image layer produced (maybe compressed, physical size) image_layer_produced: CompactionStatisticsNumSize, - num_delta_layer_discarded: usize, - num_image_layer_discarded: usize, + /// Delta layer discarded (maybe compressed, physical size of the layer being discarded instead of the original layer) + delta_layer_discarded: CompactionStatisticsNumSize, + /// Image layer discarded (maybe compressed, physical size of the layer being discarded instead of the original layer) + image_layer_discarded: CompactionStatisticsNumSize, num_unique_keys_visited: usize, + /// Delta visited (uncompressed, original size) wal_keys_visited: CompactionStatisticsNumSize, + /// Image visited (uncompressed, original size) image_keys_visited: CompactionStatisticsNumSize, + /// Delta produced (uncompressed, original size) wal_produced: CompactionStatisticsNumSize, + /// Image produced (uncompressed, original size) image_produced: CompactionStatisticsNumSize, + + // Time spent in each phase + time_acquire_lock_secs: f64, + time_analyze_secs: f64, + time_download_layer_secs: f64, + time_main_loop_secs: f64, + time_final_phase_secs: f64, + time_total_secs: f64, + + // Summary + /// Ratio of the key-value size before/after gc-compaction. + uncompressed_size_ratio: f64, + /// Ratio of the physical size before/after gc-compaction.
+ physical_size_ratio: f64, } impl CompactionStatistics { @@ -776,11 +825,13 @@ impl CompactionStatistics { self.image_produced.num += 1; self.image_produced.size += val.len() as u64 + Self::estimated_size_of_key() as u64; } - fn discard_delta_layer(&mut self) { - self.num_delta_layer_discarded += 1; + fn discard_delta_layer(&mut self, original_size: u64) { + self.delta_layer_discarded.num += 1; + self.delta_layer_discarded.size += original_size; } - fn discard_image_layer(&mut self) { - self.num_image_layer_discarded += 1; + fn discard_image_layer(&mut self, original_size: u64) { + self.image_layer_discarded.num += 1; + self.image_layer_discarded.size += original_size; } fn produce_delta_layer(&mut self, size: u64) { self.delta_layer_produced.num += 1; @@ -790,6 +841,19 @@ impl CompactionStatistics { self.image_layer_produced.num += 1; self.image_layer_produced.size += size; } + fn finalize(&mut self) { + let original_key_value_size = self.image_keys_visited.size + self.wal_keys_visited.size; + let produced_key_value_size = self.image_produced.size + self.wal_produced.size; + self.uncompressed_size_ratio = + original_key_value_size as f64 / (produced_key_value_size as f64 + 1.0); // avoid div by 0 + let original_physical_size = self.image_layer_visited.size + self.delta_layer_visited.size; + let produced_physical_size = self.image_layer_produced.size + + self.delta_layer_produced.size + + self.image_layer_discarded.size + + self.delta_layer_discarded.size; // Also include the discarded layers to make the ratio accurate + self.physical_size_ratio = + original_physical_size as f64 / (produced_physical_size as f64 + 1.0); // avoid div by 0 + } } #[derive(Default, Debug, Clone, Copy, PartialEq, Eq)] @@ -822,9 +886,7 @@ impl Timeline { .flags .contains(CompactFlags::EnhancedGcBottomMostCompaction) { - self.compact_with_gc(cancel, options, ctx) - .await - .map_err(CompactionError::Other)?; + self.compact_with_gc(cancel, options, ctx).await?; return Ok(CompactionOutcome::Done); } @@ -976,18 +1038,12 @@ impl Timeline { // Suppress errors when cancelled. Err(_) if self.cancel.is_cancelled() => {} - Err(CompactionError::ShuttingDown) => {} - Err(CompactionError::CollectKeySpaceError(CollectKeySpaceError::Cancelled)) => {} + Err(err) if err.is_cancel() => {} // Alert on critical errors that indicate data corruption. - Err( - err @ CompactionError::CollectKeySpaceError( - CollectKeySpaceError::Decode(_) - | CollectKeySpaceError::PageRead( - PageReconstructError::MissingKey(_) | PageReconstructError::WalRedo(_), - ), - ), - ) => critical!("could not compact, repartitioning keyspace failed: {err:?}"), + Err(err) if err.is_critical() => { + critical!("could not compact, repartitioning keyspace failed: {err:?}"); + } // Log other errors. No partitioning? This is normal, if the timeline was just created // as an empty timeline. Also in unit tests, when we use the timeline as a simple @@ -1161,7 +1217,7 @@ impl Timeline { // - We do not run concurrently with other kinds of compaction, so the only layer map writes we race with are: // - GC, which at worst witnesses us "undelete" a layer that they just deleted. // - ingestion, which only inserts layers, therefore cannot collide with us. 
- let resident = layer.download_and_keep_resident().await?; + let resident = layer.download_and_keep_resident(ctx).await?; let keys_written = resident .filter(&self.shard_identity, &mut image_layer_writer, ctx) @@ -1389,14 +1445,14 @@ impl Timeline { let mut fully_compacted = true; - deltas_to_compact.push(first_level0_delta.download_and_keep_resident().await?); + deltas_to_compact.push(first_level0_delta.download_and_keep_resident(ctx).await?); for l in level0_deltas_iter { let lsn_range = &l.layer_desc().lsn_range; if lsn_range.start != prev_lsn_end { break; } - deltas_to_compact.push(l.download_and_keep_resident().await?); + deltas_to_compact.push(l.download_and_keep_resident(ctx).await?); deltas_to_compact_bytes += l.metadata().file_size; prev_lsn_end = lsn_range.end; @@ -2350,12 +2406,19 @@ impl Timeline { async fn check_compaction_space( self: &Arc, layer_selection: &[Layer], - ) -> anyhow::Result<()> { - let available_space = self.check_available_space().await?; + ) -> Result<(), CompactionError> { + let available_space = self + .check_available_space() + .await + .map_err(CompactionError::Other)?; let mut remote_layer_size = 0; let mut all_layer_size = 0; for layer in layer_selection { - let needs_download = layer.needs_download().await?; + let needs_download = layer + .needs_download() + .await + .context("failed to check if layer needs download") + .map_err(CompactionError::Other)?; if needs_download.is_some() { remote_layer_size += layer.layer_desc().file_size; } @@ -2364,14 +2427,14 @@ impl Timeline { let allocated_space = (available_space as f64 * 0.8) as u64; /* reserve 20% space for other tasks */ if all_layer_size /* space needed for newly-generated file */ + remote_layer_size /* space for downloading layers */ > allocated_space { - return Err(anyhow!( + return Err(CompactionError::Other(anyhow!( "not enough space for compaction: available_space={}, allocated_space={}, all_layer_size={}, remote_layer_size={}, required_space={}", available_space, allocated_space, all_layer_size, remote_layer_size, all_layer_size + remote_layer_size - )); + ))); } Ok(()) } @@ -2402,7 +2465,7 @@ impl Timeline { self: &Arc, job: GcCompactJob, sub_compaction_max_job_size_mb: Option, - ) -> anyhow::Result> { + ) -> Result, CompactionError> { let compact_below_lsn = if job.compact_lsn_range.end != Lsn::MAX { job.compact_lsn_range.end } else { @@ -2553,7 +2616,7 @@ impl Timeline { cancel: &CancellationToken, options: CompactOptions, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result { let sub_compaction = options.sub_compaction; let job = GcCompactJob::from_compact_options(options.clone()); if sub_compaction { @@ -2575,7 +2638,7 @@ impl Timeline { if jobs_len == 0 { info!("no jobs to run, skipping gc bottom-most compaction"); } - return Ok(()); + return Ok(CompactionOutcome::Done); } self.compact_with_gc_inner(cancel, job, ctx).await } @@ -2585,19 +2648,24 @@ impl Timeline { cancel: &CancellationToken, job: GcCompactJob, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result { // Block other compaction/GC tasks from running for now. GC-compaction could run along // with legacy compaction tasks in the future. Always ensure the lock order is compaction -> gc. // Note that we already acquired the compaction lock when the outer `compact` function gets called. + let timer = Instant::now(); + let begin_timer = timer; + let gc_lock = async { tokio::select! 
{ guard = self.gc_lock.lock() => Ok(guard), - // TODO: refactor to CompactionError to correctly pass cancelled error - _ = cancel.cancelled() => Err(anyhow!("cancelled")), + _ = cancel.cancelled() => Err(CompactionError::ShuttingDown), } }; + let time_acquire_lock = timer.elapsed(); + let timer = Instant::now(); + let gc_lock = crate::timed( gc_lock, "acquires gc lock", @@ -2649,7 +2717,7 @@ impl Timeline { tracing::warn!( "no layers to compact with gc: gc_cutoff not generated yet, skipping gc bottom-most compaction" ); - return Ok(()); + return Ok(CompactionOutcome::Skipped); } real_gc_cutoff } else { @@ -2687,7 +2755,7 @@ impl Timeline { "no layers to compact with gc: no historic layers below gc_cutoff, gc_cutoff={}", gc_cutoff ); - return Ok(()); + return Ok(CompactionOutcome::Done); }; // Next, if the user specifies compact_lsn_range.start, we need to filter some layers out. All the layers (strictly) below // the min_layer_lsn computed as below will be filtered out and the data will be accessed using the normal read path, as if @@ -2708,7 +2776,7 @@ impl Timeline { "no layers to compact with gc: no historic layers above compact_above_lsn, compact_above_lsn={}", compact_lsn_range.end ); - return Ok(()); + return Ok(CompactionOutcome::Done); }; // Then, pick all the layers that are below the max_layer_lsn. This is to ensure we can pick all single-key // layers to compact. @@ -2734,7 +2802,7 @@ impl Timeline { "no layers to compact with gc: no layers within the key range, gc_cutoff={}, key_range={}..{}", gc_cutoff, compact_key_range.start, compact_key_range.end ); - return Ok(()); + return Ok(CompactionOutcome::Done); } retain_lsns_below_horizon.sort(); GcCompactionJobDescription { @@ -2787,6 +2855,9 @@ impl Timeline { has_data_below, ); + let time_analyze = timer.elapsed(); + let timer = Instant::now(); + for layer in &job_desc.selected_layers { debug!("read layer: {}", layer.layer_desc().key()); } @@ -2815,10 +2886,10 @@ impl Timeline { .map(|layer| layer.layer_desc().layer_name()) .collect_vec(); if let Some(err) = check_valid_layermap(&layer_names) { - bail!( + return Err(CompactionError::Other(anyhow!( "gc-compaction layer map check failed because {}, cannot proceed with compaction due to potential data loss", err - ); + ))); } // The maximum LSN we are processing in this compaction loop let end_lsn = job_desc @@ -2833,11 +2904,33 @@ impl Timeline { let mut total_downloaded_size = 0; let mut total_layer_size = 0; for layer in &job_desc.selected_layers { - if layer.needs_download().await?.is_some() { + if layer + .needs_download() + .await + .context("failed to check if layer needs download") + .map_err(CompactionError::Other)? 
+ .is_some() + { total_downloaded_size += layer.layer_desc().file_size; } total_layer_size += layer.layer_desc().file_size; - let resident_layer = layer.download_and_keep_resident().await?; + if cancel.is_cancelled() { + return Err(CompactionError::ShuttingDown); + } + let should_yield = self + .l0_compaction_trigger + .notified() + .now_or_never() + .is_some(); + if should_yield { + tracing::info!("preempt gc-compaction when downloading layers: too many L0 layers"); + return Ok(CompactionOutcome::YieldForL0); + } + let resident_layer = layer + .download_and_keep_resident(ctx) + .await + .context("failed to download and keep resident layer") + .map_err(CompactionError::Other)?; downloaded_layers.push(resident_layer); } info!( @@ -2848,19 +2941,36 @@ impl Timeline { ); for resident_layer in &downloaded_layers { if resident_layer.layer_desc().is_delta() { - let layer = resident_layer.get_as_delta(ctx).await?; + let layer = resident_layer + .get_as_delta(ctx) + .await + .context("failed to get delta layer") + .map_err(CompactionError::Other)?; delta_layers.push(layer); } else { - let layer = resident_layer.get_as_image(ctx).await?; + let layer = resident_layer + .get_as_image(ctx) + .await + .context("failed to get image layer") + .map_err(CompactionError::Other)?; image_layers.push(layer); } } - let (dense_ks, sparse_ks) = self.collect_gc_compaction_keyspace().await?; + let (dense_ks, sparse_ks) = self + .collect_gc_compaction_keyspace() + .await + .context("failed to collect gc compaction keyspace") + .map_err(CompactionError::Other)?; let mut merge_iter = FilterIterator::create( MergeIterator::create(&delta_layers, &image_layers, ctx), dense_ks, sparse_ks, - )?; + ) + .context("failed to create filter iterator") + .map_err(CompactionError::Other)?; + + let time_download_layer = timer.elapsed(); + let timer = Instant::now(); // Step 2: Produce images+deltas. let mut accumulated_values = Vec::new(); @@ -2880,7 +2990,9 @@ impl Timeline { &self.gate, ctx, ) - .await?, + .await + .context("failed to create image layer writer") + .map_err(CompactionError::Other)?, ) } else { None @@ -2893,7 +3005,9 @@ impl Timeline { lowest_retain_lsn..end_lsn, self.get_compaction_target_size(), ) - .await?; + .await + .context("failed to create delta layer writer") + .map_err(CompactionError::Other)?; #[derive(Default)] struct RewritingLayers { @@ -2933,9 +3047,28 @@ impl Timeline { // the key and LSN range are determined. However, to keep things simple here, we still // create this writer, and discard the writer in the end. - while let Some(((key, lsn, val), desc)) = merge_iter.next_with_trace().await? { + let mut keys_processed = 0; + + while let Some(((key, lsn, val), desc)) = merge_iter + .next_with_trace() + .await + .context("failed to get next key-value pair") + .map_err(CompactionError::Other)? + { if cancel.is_cancelled() { - return Err(anyhow!("cancelled")); // TODO: refactor to CompactionError and pass cancel error + return Err(CompactionError::ShuttingDown); + } + keys_processed += 1; + if keys_processed % 1000 == 0 { + let should_yield = self + .l0_compaction_trigger + .notified() + .now_or_never() + .is_some(); + if should_yield { + tracing::info!("preempt gc-compaction in the main loop: too many L0 layers"); + return Ok(CompactionOutcome::YieldForL0); + } } if self.shard_identity.is_key_disposable(&key) { // If this shard does not need to store this key, simply skip it. 
@@ -2967,7 +3100,9 @@ impl Timeline { &self.gate, ctx, ) - .await?, + .await + .context("failed to create delta layer writer") + .map_err(CompactionError::Other)?, ); } rewriter.before.as_mut().unwrap() @@ -2983,14 +3118,20 @@ impl Timeline { &self.gate, ctx, ) - .await?, + .await + .context("failed to create delta layer writer") + .map_err(CompactionError::Other)?, ); } rewriter.after.as_mut().unwrap() } else { unreachable!() }; - rewriter.put_value(key, lsn, val, ctx).await?; + rewriter + .put_value(key, lsn, val, ctx) + .await + .context("failed to put value") + .map_err(CompactionError::Other)?; continue; } match val { @@ -3013,9 +3154,13 @@ impl Timeline { &job_desc.retain_lsns_below_horizon, COMPACTION_DELTA_THRESHOLD, get_ancestor_image(self, *last_key, ctx, has_data_below, lowest_retain_lsn) - .await?, + .await + .context("failed to get ancestor image") + .map_err(CompactionError::Other)?, ) - .await?; + .await + .context("failed to generate key retention") + .map_err(CompactionError::Other)?; retention .pipe_to( *last_key, @@ -3025,7 +3170,9 @@ impl Timeline { &self.gate, ctx, ) - .await?; + .await + .context("failed to pipe to delta layer writer") + .map_err(CompactionError::Other)?; accumulated_values.clear(); *last_key = key; accumulated_values.push((key, lsn, val)); @@ -3043,9 +3190,14 @@ impl Timeline { job_desc.gc_cutoff, &job_desc.retain_lsns_below_horizon, COMPACTION_DELTA_THRESHOLD, - get_ancestor_image(self, last_key, ctx, has_data_below, lowest_retain_lsn).await?, + get_ancestor_image(self, last_key, ctx, has_data_below, lowest_retain_lsn) + .await + .context("failed to get ancestor image") + .map_err(CompactionError::Other)?, ) - .await?; + .await + .context("failed to generate key retention") + .map_err(CompactionError::Other)?; retention .pipe_to( last_key, @@ -3055,21 +3207,36 @@ impl Timeline { &self.gate, ctx, ) - .await?; + .await + .context("failed to pipe to delta layer writer") + .map_err(CompactionError::Other)?; // end: move the above part to the loop body + let time_main_loop = timer.elapsed(); + let timer = Instant::now(); + let mut rewrote_delta_layers = Vec::new(); for (key, writers) in delta_layer_rewriters { if let Some(delta_writer_before) = writers.before { let (desc, path) = delta_writer_before .finish(job_desc.compaction_key_range.start, ctx) - .await?; - let layer = Layer::finish_creating(self.conf, self, desc, &path)?; + .await + .context("failed to finish delta layer writer") + .map_err(CompactionError::Other)?; + let layer = Layer::finish_creating(self.conf, self, desc, &path) + .context("failed to finish creating delta layer") + .map_err(CompactionError::Other)?; rewrote_delta_layers.push(layer); } if let Some(delta_writer_after) = writers.after { - let (desc, path) = delta_writer_after.finish(key.key_range.end, ctx).await?; - let layer = Layer::finish_creating(self.conf, self, desc, &path)?; + let (desc, path) = delta_writer_after + .finish(key.key_range.end, ctx) + .await + .context("failed to finish delta layer writer") + .map_err(CompactionError::Other)?; + let layer = Layer::finish_creating(self.conf, self, desc, &path) + .context("failed to finish creating delta layer") + .map_err(CompactionError::Other)?; rewrote_delta_layers.push(layer); } } @@ -3084,7 +3251,9 @@ impl Timeline { let end_key = job_desc.compaction_key_range.end; writer .finish_with_discard_fn(self, ctx, end_key, discard) - .await? + .await + .context("failed to finish image layer writer") + .map_err(CompactionError::Other)? 
} else { drop(writer); Vec::new() @@ -3096,7 +3265,9 @@ impl Timeline { let produced_delta_layers = if !dry_run { delta_layer_writer .finish_with_discard_fn(self, ctx, discard) - .await? + .await + .context("failed to finish delta layer writer") + .map_err(CompactionError::Other)? } else { drop(delta_layer_writer); Vec::new() @@ -3108,6 +3279,13 @@ impl Timeline { let mut keep_layers = HashSet::new(); let produced_delta_layers_len = produced_delta_layers.len(); let produced_image_layers_len = produced_image_layers.len(); + + let layer_selection_by_key = job_desc + .selected_layers + .iter() + .map(|l| (l.layer_desc().key(), l.layer_desc().clone())) + .collect::>(); + for action in produced_delta_layers { match action { BatchWriterResult::Produced(layer) => { @@ -3121,8 +3299,16 @@ impl Timeline { if cfg!(debug_assertions) { info!("discarded delta layer: {}", l); } + if let Some(layer_desc) = layer_selection_by_key.get(&l) { + stat.discard_delta_layer(layer_desc.file_size()); + } else { + tracing::warn!( + "discarded delta layer not in layer_selection: {}, produced a layer outside of the compaction key range?", + l + ); + stat.discard_delta_layer(0); + } keep_layers.insert(l); - stat.discard_delta_layer(); } } } @@ -3131,6 +3317,9 @@ impl Timeline { "produced rewritten delta layer: {}", layer.layer_desc().key() ); + // For now, we include rewritten delta layer size in the "produce_delta_layer". We could + // make it a separate statistics in the future. + stat.produce_delta_layer(layer.layer_desc().file_size()); } compact_to.extend(rewrote_delta_layers); for action in produced_image_layers { @@ -3142,8 +3331,16 @@ impl Timeline { } BatchWriterResult::Discarded(l) => { debug!("discarded image layer: {}", l); + if let Some(layer_desc) = layer_selection_by_key.get(&l) { + stat.discard_image_layer(layer_desc.file_size()); + } else { + tracing::warn!( + "discarded image layer not in layer_selection: {}, produced a layer outside of the compaction key range?", + l + ); + stat.discard_image_layer(0); + } keep_layers.insert(l); - stat.discard_image_layer(); } } } @@ -3176,7 +3373,9 @@ impl Timeline { &layer.layer_desc().key_range, &job_desc.compaction_key_range, ) { - bail!("violated constraint: image layer outside of compaction key range"); + return Err(CompactionError::Other(anyhow!( + "violated constraint: image layer outside of compaction key range" + ))); } if !fully_contains( &job_desc.compaction_key_range, @@ -3189,13 +3388,25 @@ impl Timeline { layer_selection.retain(|x| !keep_layers.contains(&x.layer_desc().key())); + let time_final_phase = timer.elapsed(); + + stat.time_final_phase_secs = time_final_phase.as_secs_f64(); + stat.time_main_loop_secs = time_main_loop.as_secs_f64(); + stat.time_acquire_lock_secs = time_acquire_lock.as_secs_f64(); + stat.time_download_layer_secs = time_download_layer.as_secs_f64(); + stat.time_analyze_secs = time_analyze.as_secs_f64(); + stat.time_total_secs = begin_timer.elapsed().as_secs_f64(); + stat.finalize(); + info!( "gc-compaction statistics: {}", - serde_json::to_string(&stat)? + serde_json::to_string(&stat) + .context("failed to serialize gc-compaction statistics") + .map_err(CompactionError::Other)? ); if dry_run { - return Ok(()); + return Ok(CompactionOutcome::Done); } info!( @@ -3230,10 +3441,10 @@ impl Timeline { // the writer, so potentially, we will need a function like `ImageLayerBatchWriter::get_all_pending_layer_keys` to get all the keys that are // in the writer before finalizing the persistent layers. 
Now we would leave some dangling layers on the disk if the check fails. if let Some(err) = check_valid_layermap(&final_layers) { - bail!( + return Err(CompactionError::Other(anyhow!( "gc-compaction layer map check failed after compaction because {}, compaction result not applied to the layer map due to potential data loss", err - ); + ))); } // Between the sanity check and this compaction update, there could be new layers being flushed, but it should be fine because we only @@ -3285,7 +3496,9 @@ impl Timeline { // find_gc_cutoffs will try accessing things below the cutoff. TODO: ideally, this should // be batched into `schedule_compaction_update`. let disk_consistent_lsn = self.disk_consistent_lsn.load(); - self.schedule_uploads(disk_consistent_lsn, None)?; + self.schedule_uploads(disk_consistent_lsn, None) + .context("failed to schedule uploads") + .map_err(CompactionError::Other)?; // If a layer gets rewritten throughout gc-compaction, we need to keep that layer only in `compact_to` instead // of `compact_from`. let compact_from = { @@ -3312,7 +3525,7 @@ impl Timeline { drop(gc_lock); - Ok(()) + Ok(CompactionOutcome::Done) } } @@ -3418,6 +3631,7 @@ impl CompactionJobExecutor for TimelineAdaptor { async fn downcast_delta_layer( &self, layer: &OwnArc, + ctx: &RequestContext, ) -> anyhow::Result> { // this is a lot more complex than a simple downcast... if layer.is_delta() { @@ -3425,7 +3639,7 @@ impl CompactionJobExecutor for TimelineAdaptor { let guard = self.timeline.layers.read().await; guard.get_from_desc(layer) }; - let result = l.download_and_keep_resident().await?; + let result = l.download_and_keep_resident(ctx).await?; Ok(Some(ResidentDeltaLayer(result))) } else { diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 7cdc69e55f..740f590735 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -11,6 +11,7 @@ use utils::id::TimelineId; use utils::{crashsafe, fs_ext, pausable_failpoint}; use crate::config::PageServerConf; +use crate::context::RequestContext; use crate::task_mgr::{self, TaskKind}; use crate::tenant::metadata::TimelineMetadata; use crate::tenant::remote_timeline_client::{ @@ -291,10 +292,11 @@ impl DeleteTimelineFlow { timeline_id: TimelineId, local_metadata: &TimelineMetadata, remote_client: RemoteTimelineClient, + ctx: &RequestContext, ) -> anyhow::Result<()> { // Note: here we even skip populating layer map. Timeline is essentially uninitialized. // RemoteTimelineClient is the only functioning part. 
- let timeline = tenant + let (timeline, _timeline_ctx) = tenant .create_timeline_struct( timeline_id, local_metadata, @@ -306,6 +308,8 @@ impl DeleteTimelineFlow { CreateTimelineCause::Delete, crate::tenant::CreateTimelineIdempotency::FailWithConflict, // doesn't matter what we put here None, // doesn't matter what we put here + None, // doesn't matter what we put here + ctx, ) .context("create_timeline_struct")?; diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index cad4c3ac64..81d94105ee 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -12,6 +12,7 @@ use utils::completion; use utils::generation::Generation; use utils::id::TimelineId; use utils::lsn::Lsn; +use utils::sync::gate::GateError; use super::layer_manager::LayerManager; use super::{FlushLayerError, Timeline}; @@ -363,14 +364,25 @@ pub(super) async fn prepare( let mut tasks = tokio::task::JoinSet::new(); let limiter = Arc::new(Semaphore::new(options.copy_concurrency.get())); + let cancel_eval = CancellationToken::new(); for adopted in rest_of_historic { let limiter = limiter.clone(); let timeline = detached.clone(); + let cancel_eval = cancel_eval.clone(); tasks.spawn( async move { - let _permit = limiter.acquire().await; + let _permit = tokio::select! { + permit = limiter.acquire() => { + permit + } + // Wait for the cancellation here instead of letting the entire task be cancelled. + // Cancellations are racy in that they might leave layers on disk. + _ = cancel_eval.cancelled() => { + Err(Error::ShuttingDown)? + } + }; let (owned, did_hardlink) = remote_copy( &adopted, &timeline, @@ -386,7 +398,22 @@ pub(super) async fn prepare( ); } + fn delete_layers(timeline: &Timeline, layers: Vec) -> Result<(), Error> { + // We are deleting layers, so we must hold the gate + let _gate = timeline.gate.enter().map_err(|e| match e { + GateError::GateClosed => Error::ShuttingDown, + })?; + { + layers.into_iter().for_each(|l: Layer| { + l.delete_on_drop(); + std::mem::drop(l); + }); + } + Ok(()) + } + let mut should_fsync = false; + let mut first_err = None; while let Some(res) = tasks.join_next().await { match res { Ok(Ok((owned, did_hardlink))) => { @@ -395,13 +422,24 @@ pub(super) async fn prepare( } new_layers.push(owned); } + + // Don't stop the evaluation on errors, so that we get the full set of hardlinked layers to delete. 
Ok(Err(failed)) => { - return Err(failed); + cancel_eval.cancel(); + first_err.get_or_insert(failed); + } + Err(je) => { + cancel_eval.cancel(); + first_err.get_or_insert(Error::Prepare(je.into())); } - Err(je) => return Err(Error::Prepare(je.into())), } } + if let Some(failed) = first_err { + delete_layers(detached, new_layers)?; + return Err(failed); + } + // fsync directory again if we hardlinked something if should_fsync { fsync_timeline_dir(detached, ctx).await; @@ -592,7 +630,7 @@ async fn copy_lsn_prefix( .with_context(|| format!("prepare to copy lsn prefix of ancestors {layer}")) .map_err(Error::Prepare)?; - let resident = layer.download_and_keep_resident().await.map_err(|e| { + let resident = layer.download_and_keep_resident(ctx).await.map_err(|e| { if e.is_cancelled() { Error::ShuttingDown } else { @@ -650,6 +688,11 @@ async fn remote_copy( let conf = adoptee.conf; let file_name = adopted.layer_desc().layer_name(); + // We don't want to shut the timeline down during this operation because we do `delete_on_drop` below + let _gate = adoptee.gate.enter().map_err(|e| match e { + GateError::GateClosed => Error::ShuttingDown, + })?; + // depending if Layer::keep_resident, do a hardlink let did_hardlink; let owned = if let Some(adopted_resident) = adopted.keep_resident().await { @@ -661,8 +704,32 @@ async fn remote_copy( &file_name, &metadata.generation, ); - std::fs::hard_link(adopted_path, &adoptee_path) - .map_err(|e| Error::launder(e.into(), Error::Prepare))?; + + match std::fs::hard_link(adopted_path, &adoptee_path) { + Ok(()) => {} + Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => { + // In theory we should not get into this situation as we are doing cleanups of the layer file after errors. + // However, we don't do cleanups for errors past `prepare`, so there is the slight chance to get to this branch. 
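The `ErrorKind::AlreadyExists` handling that follows boils down to: verify that the pre-existing destination file is untracked (an orphan from an earlier attempt), remove it, and hard-link again. A hedged, stand-alone sketch of that idea (the `dst_is_referenced` flag stands in for the layer-map lookup; names are illustrative):

```rust
use std::io::ErrorKind;
use std::path::Path;

/// Hard-link `src` to `dst`, tolerating a leftover `dst` from an earlier attempt.
fn hard_link_replacing_orphan(src: &Path, dst: &Path, dst_is_referenced: bool) -> std::io::Result<()> {
    match std::fs::hard_link(src, dst) {
        Ok(()) => Ok(()),
        Err(e) if e.kind() == ErrorKind::AlreadyExists => {
            if dst_is_referenced {
                // The destination is tracked (e.g. present in the layer map): refuse to clobber it.
                return Err(e);
            }
            // Delete the orphan destination and retry, so the layer has a well-understood source.
            std::fs::remove_file(dst)?;
            std::fs::hard_link(src, dst)
        }
        Err(e) => Err(e),
    }
}
```

Note that it is the destination path that must be removed before retrying; deleting the source would make the subsequent `hard_link` fail and would throw away a resident layer of the adopted timeline.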
+ + // Double check that the file is orphan (probably from an earlier attempt), then delete it + let key = file_name.clone().into(); + if adoptee.layers.read().await.contains_key(&key) { + // We are supposed to filter out such cases before coming to this function + return Err(Error::Prepare(anyhow::anyhow!( + "layer file {file_name} already present and inside layer map" + ))); + } + tracing::info!("Deleting orphan layer file to make way for hard linking"); + // Delete orphan layer file and try again, to ensure this layer has a well understood source + std::fs::remove_file(&adoptee_path) + .map_err(|e| Error::launder(e.into(), Error::Prepare))?; + std::fs::hard_link(adopted_path, &adoptee_path) + .map_err(|e| Error::launder(e.into(), Error::Prepare))?; + } + Err(e) => { + return Err(Error::launder(e.into(), Error::Prepare)); + } + }; did_hardlink = true; Layer::for_resident(conf, adoptee, adoptee_path, file_name, metadata).drop_eviction_guard() } else { @@ -670,12 +737,21 @@ Layer::for_evicted(conf, adoptee, file_name, metadata) }; - let layer = adoptee + let layer = match adoptee .remote_client .copy_timeline_layer(adopted, &owned, cancel) .await - .map(move |()| owned) - .map_err(|e| Error::launder(e, Error::Prepare))?; + { + Ok(()) => owned, + Err(e) => { + { + // Clean up the layer so that on a retry we don't get errors that the file already exists + owned.delete_on_drop(); + std::mem::drop(owned); + } + return Err(Error::launder(e, Error::Prepare)); + } + }; Ok((layer, did_hardlink)) } diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 187d9f248e..397e8e8978 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -93,7 +93,8 @@ impl Timeline { } } - let ctx = RequestContext::new(TaskKind::Eviction, DownloadBehavior::Warn); + let ctx = RequestContext::new(TaskKind::Eviction, DownloadBehavior::Warn) + .with_scope_timeline(&self); loop { let policy = self.get_eviction_policy(); let cf = self diff --git a/pageserver/src/tenant/timeline/handle.rs b/pageserver/src/tenant/timeline/handle.rs index 67fb89c433..809b350f38 100644 --- a/pageserver/src/tenant/timeline/handle.rs +++ b/pageserver/src/tenant/timeline/handle.rs @@ -1,5 +1,4 @@ -//! An efficient way to keep the timeline gate open without preventing -//! timeline shutdown for longer than a single call to a timeline method. +//! A cache for [`crate::tenant::mgr`]+`Tenant::get_timeline`+`Timeline::gate.enter()`. //! //! # Motivation //! @@ -19,27 +18,32 @@ //! we hold the Timeline gate open while we're invoking the method on the //! Timeline object. //! -//! However, we want to avoid the overhead of entering the gate for every -//! method invocation. -//! -//! Further, for shard routing, we want to avoid calling the tenant manager to -//! resolve the shard for every request. Instead, we want to cache the -//! routing result so we can bypass the tenant manager for all subsequent requests -//! that get routed to that shard. +//! We want to avoid the overhead of doing, for each incoming request, +//! - tenant manager lookup (global rwlock + btreemap lookup for shard routing) +//! - cloning the `Arc` out of the tenant manager so we can +//! release the mgr rwlock before doing any request processing work +//! - re-entering the Timeline gate for each Timeline method invocation. //! //! Regardless of how we accomplish the above, it should not //! prevent the Timeline from shutting down promptly. //!
+//! //! # Design //! //! ## Data Structures //! -//! There are three user-facing data structures: +//! There are two concepts expressed as associated types in the `Types` trait: +//! - `TenantManager`: the thing that performs the expensive work. It produces +//! a `Timeline` object, which is the other associated type. +//! - `Timeline`: the item that we cache for fast (TenantTimelineId,ShardSelector) lookup. +//! +//! There are three user-facing data structures exposed by this module: //! - `PerTimelineState`: a struct embedded into each Timeline struct. Lifetime == Timeline lifetime. //! - `Cache`: a struct private to each connection handler; Lifetime == connection lifetime. -//! - `Handle`: a smart pointer that holds the Timeline gate open and derefs to `&Timeline`. +//! - `Handle`: a smart pointer that derefs to the Types::Timeline. //! - `WeakHandle`: downgrade of a `Handle` that does not keep the gate open, but allows -//! trying to ugprade back to a `Handle`, guaranteeing it's the same `Timeline` *object*. +//! trying to upgrade back to a `Handle`. If successful, a re-upgraded Handle will always +//! point to the same cached `Types::Timeline`. Upgrades never invoke the `TenantManager`. //! //! Internally, there is 0 or 1 `HandleInner` per `(Cache,Timeline)`. //! Since Cache:Connection is 1:1, there is 0 or 1 `HandleInner` per `(Connection,Timeline)`. @@ -64,11 +68,14 @@ //! //! To dispatch a request, the page service connection calls `Cache::get`. //! -//! A cache miss means we consult the tenant manager for shard routing, -//! resulting in an `Arc`. We enter its gate _once_ and store it in the the -//! `Arc>>`. A weak ref is stored in the `Cache` +//! A cache miss means we call Types::TenantManager::resolve for shard routing, +//! cloning the `Arc` out of it, and entering the gate. The result of +//! resolve() is the object we want to cache, and return `Handle`s to for subsequent `Cache::get` calls. +//! +//! We wrap the object returned from resolve() in an `Arc` and store that inside the +//! `Arc>>`. A weak ref to the HandleInner is stored in the `Cache` //! and a strong ref in the `PerTimelineState`. -//! A strong ref is returned wrapped in a `Handle`. +//! Another strong ref is returned wrapped in a `Handle`. //! //! For subsequent requests, `Cache::get` will perform a "fast path" shard routing //! and find the weak ref in the cache. @@ -78,51 +85,51 @@ //! While a request is batching, the `Handle` is downgraded to a `WeakHandle`. //! When the batch is ready to be executed, the `WeakHandle` is upgraded back to a `Handle` //! and the request handler dispatches the request to the right `>::$request_method`. -//! It then drops the `Handle`, which drops the `Arc`. +//! It then drops the `Handle`, and thus the `Arc>` inside it. //! //! # Performance //! //! Remember from the introductory section: //! -//! > However, we want to avoid the overhead of entering the gate for every -//! > method invocation. +//! > We want to avoid the overhead of doing, for each incoming request, +//! > - tenant manager lookup (global rwlock + btreemap lookup for shard routing) +//! > - cloning the `Arc` out of the tenant manager so we can +//! > release the mgr rwlock before doing any request processing work +//! > - re-entering the Timeline gate for each Timeline method invocation. //! -//! Why do we want to avoid that? -//! Because the gate is a shared location in memory and entering it involves -//! bumping refcounts, which leads to cache contention if done frequently -//!
from multiple cores in parallel. +//! All of these boil down to some state that is either globally shared among all shards +//! or state shared among all tasks that serve a particular timeline. +//! It is either protected by RwLock or manipulated via atomics. +//! Even atomics are costly when shared across multiple cores. +//! So, we want to avoid any permanent need for coordination between page_service tasks. //! -//! So, we only acquire the `GateGuard` once on `Cache` miss, and wrap it in an `Arc`. -//! That `Arc` is private to the `HandleInner` and hence to the connection. +//! The solution is to add indirection: we wrap the Types::Timeline object that is +//! returned by Types::TenantManager into an Arc that is private to the `HandleInner` +//! and hence to the single Cache / page_service connection. //! (Review the "Data Structures" section if that is unclear to you.) //! -//! A `WeakHandle` is a weak ref to the `HandleInner`. -//! When upgrading a `WeakHandle`, we upgrade to a strong ref to the `HandleInner` and -//! further acquire an additional strong ref to the `Arc` inside it. -//! Again, this manipulation of ref counts is is cheap because `Arc` is private to the connection. //! -//! When downgrading a `Handle` to a `WeakHandle`, we drop the `Arc`. -//! Again, this is cheap because the `Arc` is private to the connection. +//! When upgrading a `WeakHandle`, we upgrade its weak to a strong ref (of the `Mutex`), +//! lock the mutex, take out a clone of the `Arc`, and drop the Mutex. +//! The Mutex is not contended because it is private to the connection. +//! And again, the `Arc` clone is cheap because that wrapper +//! Arc's refcounts are private to the connection. +//! +//! Downgrading drops these two Arcs, which, again, manipulates refcounts that are private to the connection. //! -//! In addition to the GateGuard, we need to provide `Deref` impl. -//! For this, both `Handle` need infallible access to an `Arc`. -//! We could clone the `Arc` when upgrading a `WeakHandle`, but that would cause contention -//! on the shared memory location that trakcs the refcount of the `Arc`. -//! Instead, we wrap the `Arc` into another `Arc`. -//! so that we can clone it cheaply when upgrading a `WeakHandle`. //! //! # Shutdown //! //! The attentive reader may have noticed the following reference cycle around the `Arc`: //! //! ```text -//! Timeline --owns--> PerTimelineState --strong--> HandleInner --strong--> Timeline +//! Timeline --owns--> PerTimelineState --strong--> HandleInner --strong--> Types::Timeline --strong--> Timeline //! ``` //! //! Further, there is this cycle: //! //! ```text -//! Timeline --owns--> PerTimelineState --strong--> HandleInner --strong--> GateGuard --keepalive--> Timeline +//! Timeline --owns--> PerTimelineState --strong--> HandleInner --strong--> Types::Timeline --strong--> GateGuard --keepalive--> Timeline //! ``` //! //! The former cycle is a memory leak if not broken. @@ -135,9 +142,12 @@ //! - Timeline shutdown (=> `PerTimelineState::shutdown`) //! - Connection shutdown (=> dropping the `Cache`). //! -//! Both transition the `HandleInner` from [`HandleInner::KeepingTimelineGateOpen`] to -//! [`HandleInner::ShutDown`], which drops the only long-lived strong ref to the -//! `Arc`. +//! Both transition the `HandleInner` from [`HandleInner::Open`] to +//! [`HandleInner::ShutDown`], which drops the only long-lived +//! `Arc`. Once the last short-lived Arc +//! is dropped, the `Types::Timeline` gets dropped and thereby +//!
the `GateGuard` and the `Arc` that it stores, +//! thereby breaking both cycles. //! //! `PerTimelineState::shutdown` drops all the `HandleInners` it contains, //! thereby breaking the cycle. @@ -216,7 +226,7 @@ use crate::tenant::mgr::ShardSelector; pub(crate) trait Types: Sized + std::fmt::Debug { type TenantManagerError: Sized + std::fmt::Debug; type TenantManager: TenantManager + Sized; - type Timeline: ArcTimeline + Sized; + type Timeline: Timeline + Sized; } /// Uniquely identifies a [`Cache`] instance over the lifetime of the process. @@ -261,20 +271,15 @@ pub(crate) struct ShardTimelineId { /// See module-level comment. pub(crate) struct Handle { - timeline: Arc, - #[allow(dead_code)] // the field exists to keep the gate open - gate_guard: Arc, inner: Arc>>, + open: Arc, } pub(crate) struct WeakHandle { inner: Weak>>, } + enum HandleInner { - KeepingTimelineGateOpen { - #[allow(dead_code)] - gate_guard: Arc, - timeline: Arc, - }, + Open(Arc), ShutDown, } @@ -307,8 +312,7 @@ pub(crate) trait TenantManager { } /// Abstract view of an [`Arc`], for testability. -pub(crate) trait ArcTimeline: Clone { - fn gate(&self) -> &utils::sync::gate::Gate; +pub(crate) trait Timeline { fn shard_timeline_id(&self) -> ShardTimelineId; fn get_shard_identity(&self) -> &ShardIdentity; fn per_timeline_state(&self) -> &PerTimelineState; @@ -318,7 +322,6 @@ pub(crate) trait ArcTimeline: Clone { #[derive(Debug)] pub(crate) enum GetError { TenantManager(T::TenantManagerError), - TimelineGateClosed, PerTimelineStateShutDown, } @@ -434,21 +437,9 @@ impl Cache { } trace!("creating new HandleInner"); - let handle_inner_arc = Arc::new(Mutex::new(HandleInner::KeepingTimelineGateOpen { - gate_guard: Arc::new( - // this enter() is expensive in production code because - // it hits the global Arc::gate refcounts - match timeline.gate().enter() { - Ok(guard) => guard, - Err(_) => { - return Err(GetError::TimelineGateClosed); - } - }, - ), - // this clone is expensive in production code because - // it hits the global Arc::clone refcounts - timeline: Arc::new(timeline.clone()), - })); + let timeline = Arc::new(timeline); + let handle_inner_arc = + Arc::new(Mutex::new(HandleInner::Open(Arc::clone(&timeline)))); let handle_weak = WeakHandle { inner: Arc::downgrade(&handle_inner_arc), }; @@ -503,18 +494,10 @@ impl WeakHandle { }; let lock_guard = inner.lock().expect("poisoned"); match &*lock_guard { - HandleInner::KeepingTimelineGateOpen { - timeline, - gate_guard, - } => { - let gate_guard = Arc::clone(gate_guard); - let timeline = Arc::clone(timeline); + HandleInner::Open(open) => { + let open = Arc::clone(open); drop(lock_guard); - Ok(Handle { - timeline, - gate_guard, - inner, - }) + Ok(Handle { open, inner }) } HandleInner::ShutDown => Err(HandleUpgradeError::ShutDown), } @@ -528,7 +511,7 @@ impl WeakHandle { impl std::ops::Deref for Handle { type Target = T::Timeline; fn deref(&self) -> &Self::Target { - &self.timeline + &self.open } } @@ -545,7 +528,7 @@ impl PerTimelineState { /// to the [`Types::Timeline`] that embeds this per-timeline state. /// Even if [`TenantManager::resolve`] would still resolve to it. /// - /// Already-alive [`Handle`]s for will remain open, usable, and keeping the [`ArcTimeline`] alive. + /// Already-alive [`Handle`]s for will remain open, usable, and keeping the [`Types::Timeline`] alive. /// That's ok because they're short-lived. See module-level comment for details. 
#[instrument(level = "trace", skip_all)] pub(super) fn shutdown(&self) { @@ -611,7 +594,7 @@ impl Drop for Cache { impl HandleInner { fn shutdown(&mut self) -> Option> { match std::mem::replace(self, HandleInner::ShutDown) { - HandleInner::KeepingTimelineGateOpen { timeline, .. } => Some(timeline), + HandleInner::Open(timeline) => Some(timeline), HandleInner::ShutDown => { // Duplicate shutdowns are possible because both Cache::drop and PerTimelineState::shutdown // may do it concurrently, but locking rules disallow holding per-timeline-state lock and @@ -631,6 +614,7 @@ mod tests { use pageserver_api::reltag::RelTag; use pageserver_api::shard::ShardStripeSize; use utils::shard::ShardCount; + use utils::sync::gate::GateGuard; use super::*; @@ -641,7 +625,7 @@ mod tests { impl Types for TestTypes { type TenantManagerError = anyhow::Error; type TenantManager = StubManager; - type Timeline = Arc; + type Timeline = Entered; } struct StubManager { @@ -656,17 +640,19 @@ mod tests { myself: Weak, } + struct Entered { + timeline: Arc, + #[allow(dead_code)] // it's stored here to keep the gate open + gate_guard: Arc, + } + impl StubTimeline { fn getpage(&self) { // do nothing } } - impl ArcTimeline for Arc { - fn gate(&self) -> &utils::sync::gate::Gate { - &self.gate - } - + impl Timeline for Entered { fn shard_timeline_id(&self) -> ShardTimelineId { ShardTimelineId { shard_index: self.shard.shard_index(), @@ -688,20 +674,34 @@ mod tests { &self, timeline_id: TimelineId, shard_selector: ShardSelector, - ) -> anyhow::Result> { + ) -> anyhow::Result { for timeline in &self.shards { if timeline.id == timeline_id { + let enter_gate = || { + let gate_guard = timeline.gate.enter()?; + let gate_guard = Arc::new(gate_guard); + anyhow::Ok(gate_guard) + }; match &shard_selector { ShardSelector::Zero if timeline.shard.is_shard_zero() => { - return Ok(Arc::clone(timeline)); + return Ok(Entered { + timeline: Arc::clone(timeline), + gate_guard: enter_gate()?, + }); } ShardSelector::Zero => continue, ShardSelector::Page(key) if timeline.shard.is_key_local(key) => { - return Ok(Arc::clone(timeline)); + return Ok(Entered { + timeline: Arc::clone(timeline), + gate_guard: enter_gate()?, + }); } ShardSelector::Page(_) => continue, ShardSelector::Known(idx) if idx == &timeline.shard.shard_index() => { - return Ok(Arc::clone(timeline)); + return Ok(Entered { + timeline: Arc::clone(timeline), + gate_guard: enter_gate()?, + }); } ShardSelector::Known(_) => continue, } @@ -711,6 +711,13 @@ mod tests { } } + impl std::ops::Deref for Entered { + type Target = StubTimeline; + fn deref(&self) -> &Self::Target { + &self.timeline + } + } + #[tokio::test(start_paused = true)] async fn test_timeline_shutdown() { crate::tenant::harness::setup_logging(); @@ -1038,7 +1045,6 @@ mod tests { let key = DBDIR_KEY; // Simulate 10 connections that's opened, used, and closed - let mut used_handles = vec![]; for _ in 0..10 { let mut cache = Cache::::default(); let handle = { @@ -1050,7 +1056,6 @@ mod tests { handle }; handle.getpage(); - used_handles.push(Arc::downgrade(&handle.timeline)); } // No handles exist, thus gates are closed and don't require shutdown. 
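To make the new contract concrete: after this refactor, the cached item itself is what keeps the timeline gate open, which is exactly the shape of the test-only `Entered` wrapper above. A condensed, hypothetical sketch of that wrapper outside the test harness (the `Gate`, `GateGuard`, and `Timeline` types below are simplified stand-ins, not the real pageserver definitions):

```rust
use std::ops::Deref;
use std::sync::Arc;

struct GateGuard;            // stand-in for the RAII guard returned by Gate::enter()
struct Gate;                 // stand-in for utils::sync::gate::Gate
struct Timeline { gate: Gate }

impl Gate {
    fn enter(&self) -> Result<GateGuard, ()> {
        Ok(GateGuard) // the real gate refuses entry once the timeline is shutting down
    }
}

/// What the handle cache stores now: the timeline *plus* its gate guard.
/// Holding the cached object is what blocks shutdown, not the cache machinery.
struct Entered {
    timeline: Arc<Timeline>,
    _gate_guard: Arc<GateGuard>,
}

impl Deref for Entered {
    type Target = Timeline;
    fn deref(&self) -> &Timeline {
        &self.timeline
    }
}

/// Done once per cache miss; WeakHandle upgrades afterwards only clone Arcs
/// that are private to the connection and never touch the gate or tenant manager again.
fn resolve(timeline: &Arc<Timeline>) -> Result<Entered, ()> {
    let guard = timeline.gate.enter()?;
    Ok(Entered {
        timeline: Arc::clone(timeline),
        _gate_guard: Arc::new(guard),
    })
}

fn main() {
    let tl = Arc::new(Timeline { gate: Gate });
    let entered = resolve(&tl).unwrap();
    let _ = &entered.gate; // Deref gives transparent access to the Timeline's fields and methods
}
```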
diff --git a/pageserver/src/tenant/timeline/heatmap_layers_downloader.rs b/pageserver/src/tenant/timeline/heatmap_layers_downloader.rs index 27243ba378..11df232a10 100644 --- a/pageserver/src/tenant/timeline/heatmap_layers_downloader.rs +++ b/pageserver/src/tenant/timeline/heatmap_layers_downloader.rs @@ -10,6 +10,8 @@ use http_utils::error::ApiError; use tokio_util::sync::CancellationToken; use utils::sync::gate::Gate; +use crate::context::RequestContext; + use super::Timeline; // This status is not strictly necessary now, but gives us a nice place @@ -30,6 +32,8 @@ impl HeatmapLayersDownloader { fn new( timeline: Arc, concurrency: usize, + recurse: bool, + ctx: RequestContext, ) -> Result { let tl_guard = timeline.gate.enter().map_err(|_| ApiError::Cancelled)?; @@ -57,12 +61,13 @@ impl HeatmapLayersDownloader { tracing::info!( resident_size=%timeline.resident_physical_size(), - heatmap_layers=%heatmap.layers.len(), + heatmap_layers=%heatmap.all_layers().count(), "Starting heatmap layers download" ); - let stream = futures::stream::iter(heatmap.layers.into_iter().filter_map( + let stream = futures::stream::iter(heatmap.all_layers().cloned().filter_map( |layer| { + let ctx = ctx.attached_child(); let tl = timeline.clone(); let dl_guard = match downloads_guard.enter() { Ok(g) => g, @@ -75,7 +80,7 @@ impl HeatmapLayersDownloader { Some(async move { let _dl_guard = dl_guard; - let res = tl.download_layer(&layer.name).await; + let res = tl.download_layer(&layer.name, &ctx).await; if let Err(err) = res { if !err.is_cancelled() { tracing::warn!(layer=%layer.name,"Failed to download heatmap layer: {err}") @@ -94,6 +99,20 @@ impl HeatmapLayersDownloader { }, _ = cancel.cancelled() => { tracing::info!("Heatmap layers download cancelled"); + return; + } + } + + if recurse { + if let Some(ancestor) = timeline.ancestor_timeline() { + let ctx = ctx.attached_child(); + let res = + ancestor.start_heatmap_layers_download(concurrency, recurse, &ctx); + if let Err(err) = res { + tracing::info!( + "Failed to start heatmap layers download for ancestor: {err}" + ); + } } } } @@ -136,13 +155,20 @@ impl HeatmapLayersDownloader { } impl Timeline { - pub(crate) async fn start_heatmap_layers_download( + pub(crate) fn start_heatmap_layers_download( self: &Arc, concurrency: usize, + recurse: bool, + ctx: &RequestContext, ) -> Result<(), ApiError> { let mut locked = self.heatmap_layers_downloader.lock().unwrap(); if locked.as_ref().map(|dl| dl.is_complete()).unwrap_or(true) { - let dl = HeatmapLayersDownloader::new(self.clone(), concurrency)?; + let dl = HeatmapLayersDownloader::new( + self.clone(), + concurrency, + recurse, + ctx.attached_child(), + )?; *locked = Some(dl); Ok(()) } else { diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index e552ea83de..1b489028dc 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -8,14 +8,14 @@ use tracing::trace; use utils::id::TimelineId; use utils::lsn::{AtomicLsn, Lsn}; -use super::TimelineWriterState; +use super::{ReadableLayer, TimelineWriterState}; use crate::config::PageServerConf; use crate::context::RequestContext; use crate::metrics::TimelineMetrics; use crate::tenant::layer_map::{BatchedUpdates, LayerMap}; use crate::tenant::storage_layer::{ AsLayerDesc, InMemoryLayer, Layer, LayerVisibilityHint, PersistentLayerDesc, - PersistentLayerKey, ResidentLayer, + PersistentLayerKey, ReadableLayerWeak, ResidentLayer, }; /// Provides semantic APIs to 
manipulate the layer map. @@ -37,6 +37,21 @@ impl Default for LayerManager { } impl LayerManager { + pub(crate) fn upgrade(&self, weak: ReadableLayerWeak) -> ReadableLayer { + match weak { + ReadableLayerWeak::PersistentLayer(desc) => { + ReadableLayer::PersistentLayer(self.get_from_desc(&desc)) + } + ReadableLayerWeak::InMemoryLayer(desc) => { + let inmem = self + .layer_map() + .expect("no concurrent shutdown") + .in_memory_layer(&desc); + ReadableLayer::InMemoryLayer(inmem) + } + } + } + pub(crate) fn get_from_key(&self, key: &PersistentLayerKey) -> Layer { // The assumption for the `expect()` is that all code maintains the following invariant: // A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor. @@ -470,6 +485,25 @@ impl OpenLayerManager { mapping.remove(layer); layer.delete_on_drop(); } + + #[cfg(test)] + pub(crate) fn force_insert_in_memory_layer(&mut self, layer: Arc) { + use pageserver_api::models::InMemoryLayerInfo; + + match layer.info() { + InMemoryLayerInfo::Open { .. } => { + assert!(self.layer_map.open_layer.is_none()); + self.layer_map.open_layer = Some(layer); + } + InMemoryLayerInfo::Frozen { lsn_start, .. } => { + if let Some(last) = self.layer_map.frozen_layers.back() { + assert!(last.get_lsn_range().end <= lsn_start); + } + + self.layer_map.frozen_layers.push_back(layer); + } + } + } } pub(crate) struct LayerFileManager(HashMap); diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs index c017383121..ef42940481 100644 --- a/pageserver/src/tenant/vectored_blob_io.rs +++ b/pageserver/src/tenant/vectored_blob_io.rs @@ -961,7 +961,8 @@ mod tests { } async fn round_trip_test_compressed(blobs: &[Vec], compression: bool) -> Result<(), Error> { - let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + let ctx = + RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test(); let (_temp_dir, pathbuf, offsets) = write_maybe_compressed(blobs, compression, &ctx).await?; diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 1f5a820ce7..2ed035b489 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -26,15 +26,14 @@ use owned_buffers_io::io_buf_aligned::{IoBufAligned, IoBufAlignedMut}; use owned_buffers_io::io_buf_ext::FullSlice; use pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT; pub use pageserver_api::models::virtual_file as api; -use pageserver_api::shard::TenantShardId; use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard}; use tokio::time::Instant; use tokio_epoll_uring::{BoundedBuf, IoBuf, IoBufMut, Slice}; +use crate::assert_u64_eq_usize::UsizeIsU64; use crate::context::RequestContext; -use crate::metrics::{STORAGE_IO_SIZE, STORAGE_IO_TIME_METRIC, StorageIoOperation}; +use crate::metrics::{STORAGE_IO_TIME_METRIC, StorageIoOperation}; use crate::page_cache::{PAGE_SZ, PageWriteGuard}; -use crate::tenant::TENANTS_SEGMENT_NAME; pub(crate) mod io_engine; pub use io_engine::{ FeatureTestResult as IoEngineFeatureTestResult, feature_test as io_engine_feature_test, @@ -121,7 +120,7 @@ impl VirtualFile { pub async fn open_with_options>( path: P, open_options: &OpenOptions, - ctx: &RequestContext, /* TODO: carry a pointer to the metrics in the RequestContext instead of the parsing https://github.com/neondatabase/neon/issues/6107 */ + ctx: &RequestContext, ) -> Result { let inner = VirtualFileInner::open_with_options(path, open_options, ctx).await?; 
Ok(VirtualFile { @@ -133,7 +132,7 @@ impl VirtualFile { pub async fn open_with_options_v2>( path: P, open_options: &OpenOptions, - ctx: &RequestContext, /* TODO: carry a pointer to the metrics in the RequestContext instead of the parsing https://github.com/neondatabase/neon/issues/6107 */ + ctx: &RequestContext, ) -> Result { let file = match get_io_mode() { IoMode::Buffered => { @@ -300,13 +299,6 @@ pub struct VirtualFileInner { /// storing it here. pub path: Utf8PathBuf, open_options: OpenOptions, - - // These are strings becase we only use them for metrics, and those expect strings. - // It makes no sense for us to constantly turn the `TimelineId` and `TenantId` into - // strings. - tenant_id: String, - shard_id: String, - timeline_id: String, } #[derive(Debug, PartialEq, Clone, Copy)] @@ -588,36 +580,16 @@ impl VirtualFileInner { pub async fn open_with_options>( path: P, open_options: &OpenOptions, - _ctx: &RequestContext, /* TODO: carry a pointer to the metrics in the RequestContext instead of the parsing https://github.com/neondatabase/neon/issues/6107 */ + _ctx: &RequestContext, ) -> Result { - let path_ref = path.as_ref(); - let path_str = path_ref.to_string(); - let parts = path_str.split('/').collect::>(); - let (tenant_id, shard_id, timeline_id) = - if parts.len() > 5 && parts[parts.len() - 5] == TENANTS_SEGMENT_NAME { - let tenant_shard_part = parts[parts.len() - 4]; - let (tenant_id, shard_id) = match tenant_shard_part.parse::() { - Ok(tenant_shard_id) => ( - tenant_shard_id.tenant_id.to_string(), - format!("{}", tenant_shard_id.shard_slug()), - ), - Err(_) => { - // Malformed path: this ID is just for observability, so tolerate it - // and pass through - (tenant_shard_part.to_string(), "*".to_string()) - } - }; - (tenant_id, shard_id, parts[parts.len() - 2].to_string()) - } else { - ("*".to_string(), "*".to_string(), "*".to_string()) - }; + let path = path.as_ref(); let (handle, mut slot_guard) = get_open_files().find_victim_slot().await; // NB: there is also StorageIoOperation::OpenAfterReplace which is for the case // where our caller doesn't get to use the returned VirtualFile before its // slot gets re-used by someone else. let file = observe_duration!(StorageIoOperation::Open, { - open_options.open(path_ref.as_std_path()).await? + open_options.open(path.as_std_path()).await? }); // Strip all options other than read and write. 
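The `virtual_file.rs` changes above drop the per-open path parsing that existed only to label the `STORAGE_IO_SIZE` metric with tenant/shard/timeline, and instead read pre-resolved counters off the `RequestContext`. A rough sketch of the idea with hypothetical simplified types (the real code uses prometheus counters scoped via `with_scope_timeline` / `with_scope_unit_test`, not plain atomics):

```rust
use std::sync::Arc;
use std::sync::atomic::{AtomicU64, Ordering};

/// Counters resolved once, when the context is scoped to a timeline,
/// so the per-IO hot path never parses paths or does label lookups.
#[derive(Default)]
struct IoSizeMetrics {
    read: AtomicU64,
    write: AtomicU64,
}

struct RequestContext {
    io_size_metrics: Arc<IoSizeMetrics>,
}

impl RequestContext {
    fn io_size_metrics(&self) -> &IoSizeMetrics {
        &self.io_size_metrics
    }
}

fn on_read_completed(ctx: &RequestContext, bytes: usize) {
    // Equivalent of `ctx.io_size_metrics().read.add(size.into_u64())` in the diff.
    ctx.io_size_metrics()
        .read
        .fetch_add(bytes as u64, Ordering::Relaxed);
}

fn main() {
    let ctx = RequestContext { io_size_metrics: Arc::new(IoSizeMetrics::default()) };
    on_read_completed(&ctx, 8192);
    println!("read bytes: {}", ctx.io_size_metrics().read.load(Ordering::Relaxed));
}
```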
@@ -633,11 +605,8 @@ impl VirtualFileInner { let vfile = VirtualFileInner { handle: RwLock::new(handle), pos: 0, - path: path_ref.to_path_buf(), + path: path.to_owned(), open_options: reopen_options, - tenant_id, - shard_id, - timeline_id, }; // TODO: Under pressure, it's likely the slot will get re-used and @@ -934,7 +903,7 @@ impl VirtualFileInner { &self, buf: tokio_epoll_uring::Slice, offset: u64, - _ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */ + ctx: &RequestContext, ) -> (tokio_epoll_uring::Slice, Result) where Buf: tokio_epoll_uring::IoBufMut + Send, @@ -952,14 +921,7 @@ impl VirtualFileInner { let ((_file_guard, buf), res) = io_engine::get().read_at(file_guard, offset, buf).await; let res = res.maybe_fatal_err("io_engine read_at inside VirtualFileInner::read_at"); if let Ok(size) = res { - STORAGE_IO_SIZE - .with_label_values(&[ - "read", - &self.tenant_id, - &self.shard_id, - &self.timeline_id, - ]) - .add(size as i64); + ctx.io_size_metrics().read.add(size.into_u64()); } (buf, res) }) @@ -970,9 +932,9 @@ impl VirtualFileInner { &self, buf: FullSlice, offset: u64, - _ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */ + ctx: &RequestContext, ) -> (FullSlice, Result) { - let (slice, result) = self.write_at_inner(buf, offset, _ctx).await; + let (slice, result) = self.write_at_inner(buf, offset, ctx).await; let result = result.maybe_fatal_err("write_at"); (slice, result) } @@ -981,7 +943,7 @@ impl VirtualFileInner { &self, buf: FullSlice, offset: u64, - _ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */ + ctx: &RequestContext, ) -> (FullSlice, Result) { let file_guard = match self.lock_file().await { Ok(file_guard) => file_guard, @@ -991,14 +953,7 @@ impl VirtualFileInner { let ((_file_guard, buf), result) = io_engine::get().write_at(file_guard, offset, buf).await; if let Ok(size) = result { - STORAGE_IO_SIZE - .with_label_values(&[ - "write", - &self.tenant_id, - &self.shard_id, - &self.timeline_id, - ]) - .add(size as i64); + ctx.io_size_metrics().write.add(size.into_u64()); } (buf, result) }) @@ -1584,7 +1539,8 @@ mod tests { where A: Adapter, { - let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + let ctx = + RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test(); let testdir = crate::config::PageServerConf::test_repo_dir(testname); std::fs::create_dir_all(&testdir)?; @@ -1711,7 +1667,8 @@ mod tests { const THREADS: usize = 100; const SAMPLE: [u8; SIZE] = [0xADu8; SIZE]; - let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + let ctx = + RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test(); let testdir = crate::config::PageServerConf::test_repo_dir("vfile_concurrency"); std::fs::create_dir_all(&testdir)?; @@ -1770,7 +1727,8 @@ mod tests { #[tokio::test] async fn test_atomic_overwrite_basic() { - let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + let ctx = + RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test(); let testdir = crate::config::PageServerConf::test_repo_dir("test_atomic_overwrite_basic"); std::fs::create_dir_all(&testdir).unwrap(); @@ -1798,7 +1756,8 @@ mod tests { #[tokio::test] async fn test_atomic_overwrite_preexisting_tmp() { - let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + let ctx = + 
RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test(); let testdir = crate::config::PageServerConf::test_repo_dir("test_atomic_overwrite_preexisting_tmp"); std::fs::create_dir_all(&testdir).unwrap(); diff --git a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs index a90226b783..6804146d9a 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs @@ -181,7 +181,8 @@ where Err(self .shutdown() .await - .expect_err("flush task only disconnects duplex if it exits with an error")) + .err() + .expect("flush task only disconnects duplex if it exits with an error")) } /// Cleans up the channel, join the flush task. diff --git a/pageserver/src/walredo/process.rs b/pageserver/src/walredo/process.rs index 5a9fc63e63..6d4a38d4ff 100644 --- a/pageserver/src/walredo/process.rs +++ b/pageserver/src/walredo/process.rs @@ -136,7 +136,9 @@ impl WalRedoProcess { Ok(0) => break Ok(()), // eof Ok(num_bytes) => { let output = String::from_utf8_lossy(&buf[..num_bytes]); - error!(%output, "received output"); + if !output.contains("LOG:") { + error!(%output, "received output"); + } } Err(e) => { break Err(e); diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index f6a577abfc..f13522e55b 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -1195,9 +1195,11 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, state = GET_STATE(entry, chunk_offs + i); if (state == PENDING) { SET_STATE(entry, chunk_offs + i, REQUESTED); - } else if (state != REQUESTED) { + } else if (state == UNAVAILABLE) { SET_STATE(entry, chunk_offs + i, PENDING); break; + } else if (state == AVAILABLE) { + break; } if (!sleeping) { @@ -1369,6 +1371,10 @@ neon_get_lfc_stats(PG_FUNCTION_ARGS) if (lfc_ctl) value = lfc_ctl->limit; break; + case 8: + key = "file_cache_chunk_size_pages"; + value = BLOCKS_PER_CHUNK; + break; default: SRF_RETURN_DONE(funcctx); } diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index f71f11ff93..637281fe4a 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -16,6 +16,8 @@ #include +#include "libpq-int.h" + #include "access/xlog.h" #include "common/hashfn.h" #include "fmgr.h" @@ -815,9 +817,10 @@ retry: get_socket_stats(PQsocket(pageserver_conn), &sndbuf, &recvbuf); neon_shard_log(shard_no, LOG, - "no response received from pageserver for %0.3f s, still waiting (sent " UINT64_FORMAT " requests, received " UINT64_FORMAT " responses) (socket sndbuf=%d recvbuf=%d)", + "no response received from pageserver for %0.3f s, still waiting (sent " UINT64_FORMAT " requests, received " UINT64_FORMAT " responses) (socket sndbuf=%d recvbuf=%d) (conn start=%d end=%d)", INSTR_TIME_GET_DOUBLE(since_start), - shard->nrequests_sent, shard->nresponses_received, sndbuf, recvbuf); + shard->nrequests_sent, shard->nresponses_received, sndbuf, recvbuf, + pageserver_conn->inStart, pageserver_conn->inEnd); shard->receive_last_log_time = now; shard->receive_logged = true; } @@ -1099,6 +1102,10 @@ pageserver_try_receive(shardno_t shard_no) { neon_shard_log(shard_no, LOG, "pageserver_receive: disconnect due to failure while parsing response"); pageserver_disconnect(shard_no); + /* + * Malformed responses from PageServer are a reason to raise + * errors and cancel transactions. 
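Back to the `pgxn/neon/file_cache.c` hunk near the start of this region: the per-block wait loop in `lfc_writev` now distinguishes `UNAVAILABLE` (claim the block by marking it `PENDING` and stop) from `AVAILABLE` (leave it alone and stop), instead of treating everything that is not `REQUESTED` the same way. A loose Rust rendering of that state step (the enum and helper are invented for illustration; the real states live in the LFC chunk entry):

```rust
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum BlockState {
    Unavailable, // block not present in the local file cache
    Available,   // block already present
    Pending,     // another backend intends to fill this block
    Requested,   // interest in the pending block has been flagged
}

/// Returns (new state, stop waiting for this block?), mirroring the control
/// flow of the updated branch in lfc_writev.
fn writev_wait_step(state: BlockState) -> (BlockState, bool) {
    match state {
        // Someone is already working on the block: flag our interest and keep waiting.
        BlockState::Pending => (BlockState::Requested, false),
        // Nothing useful cached: claim the block ourselves and stop waiting.
        BlockState::Unavailable => (BlockState::Pending, true),
        // Already cached: previously this was downgraded to PENDING, now it is left as is.
        BlockState::Available => (BlockState::Available, true),
        // Interest already flagged: keep waiting, as before.
        BlockState::Requested => (BlockState::Requested, false),
    }
}

fn main() {
    assert_eq!(writev_wait_step(BlockState::Available), (BlockState::Available, true));
    assert_eq!(writev_wait_step(BlockState::Unavailable), (BlockState::Pending, true));
}
```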
+ */ PG_RE_THROW(); } PG_END_TRY(); @@ -1122,7 +1129,8 @@ pageserver_try_receive(shardno_t shard_no) char *msg = pchomp(PQerrorMessage(pageserver_conn)); pageserver_disconnect(shard_no); - neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect: could not read COPY data: %s", msg); + neon_shard_log(shard_no, LOG, "pageserver_receive disconnect: could not read COPY data: %s", msg); + resp = NULL; } else { @@ -1321,6 +1329,16 @@ pg_init_libpagestore(void) PGC_USERSET, 0, /* no flags required */ NULL, (GucIntAssignHook) &readahead_buffer_resize, NULL); + DefineCustomIntVariable("neon.readahead_getpage_pull_timeout", + "readahead response pull timeout", + "Time between active tries to pull data from the " + "PageStream connection when we have pages which " + "were read ahead but not yet received.", + &readahead_getpage_pull_timeout_ms, + 0, 0, 5 * 60 * 1000, + PGC_USERSET, + GUC_UNIT_MS, + NULL, NULL, NULL); DefineCustomIntVariable("neon.protocol_version", "Version of compute<->page server protocol", NULL, @@ -1334,7 +1352,7 @@ pg_init_libpagestore(void) DefineCustomIntVariable("neon.pageserver_response_log_timeout", "pageserver response log timeout", - "If the pageserver doesn't respond to a request within this timeout," + "If the pageserver doesn't respond to a request within this timeout, " "a message is printed to the log.", &pageserver_response_log_timeout, 10000, 100, INT_MAX, @@ -1344,7 +1362,7 @@ pg_init_libpagestore(void) DefineCustomIntVariable("neon.pageserver_response_disconnect_timeout", "pageserver response diconnect timeout", - "If the pageserver doesn't respond to a request within this timeout," + "If the pageserver doesn't respond to a request within this timeout, " "disconnect and reconnect.", &pageserver_response_disconnect_timeout, 120000, 100, INT_MAX, diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index 768d7ae9e8..0f226cc9e2 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -437,6 +437,7 @@ _PG_init(void) pg_init_libpagestore(); pg_init_walproposer(); + pagestore_smgr_init(); Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines; InitUnstableExtensionsSupport(); @@ -456,6 +457,15 @@ _PG_init(void) PGC_SIGHUP, 0, NULL, NULL, NULL); + DefineCustomBoolVariable( + "neon.disable_wal_prevlink_checks", + "Disable validation of prev link in WAL records", + NULL, + &disable_wal_prev_lsn_checks, + false, + PGC_SIGHUP, + 0, + NULL, NULL, NULL); DefineCustomBoolVariable( "neon.allow_replica_misconfig", diff --git a/pgxn/neon/neon.h b/pgxn/neon/neon.h index 912e09c3d3..c9beb8c318 100644 --- a/pgxn/neon/neon.h +++ b/pgxn/neon/neon.h @@ -22,6 +22,8 @@ extern char *neon_tenant; extern char *wal_acceptors_list; extern int wal_acceptor_reconnect_timeout; extern int wal_acceptor_connection_timeout; +extern int readahead_getpage_pull_timeout_ms; +extern bool disable_wal_prev_lsn_checks; #if PG_MAJORVERSION_NUM >= 17 extern uint32 WAIT_EVENT_NEON_LFC_MAINTENANCE; @@ -49,6 +51,7 @@ extern uint32 WAIT_EVENT_NEON_WAL_DL; extern void pg_init_libpagestore(void); extern void pg_init_walproposer(void); +extern void pagestore_smgr_init(void); extern uint64 BackpressureThrottlingTime(void); extern void SetNeonCurrentClusterSize(uint64 size); diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index 9faab1e4f0..475697f9c0 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -209,7 +209,11 @@ typedef struct NeonResponse *(*receive) (shardno_t shard_no); /* * Try get the next response from the TCP buffers, if any. 
- * Returns NULL when the data is not yet available. + * Returns NULL when the data is not yet available. + * + * This will raise errors only for malformed responses (we can't put them + * back into connection). All other error conditions are soft errors and + * return NULL as "no response available". */ NeonResponse *(*try_receive) (shardno_t shard_no); /* diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 091ad555e0..0414661a5f 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -65,10 +65,12 @@ #include "storage/fsm_internals.h" #include "storage/md.h" #include "storage/smgr.h" +#include "utils/timeout.h" +#include "bitmap.h" +#include "neon.h" #include "neon_perf_counters.h" #include "pagestore_client.h" -#include "bitmap.h" #if PG_VERSION_NUM >= 150000 #include "access/xlogrecovery.h" @@ -123,6 +125,45 @@ static BlockNumber neon_nblocks(SMgrRelation reln, ForkNumber forknum); static uint32 local_request_counter; #define GENERATE_REQUEST_ID() (((NeonRequestId)MyProcPid << 32) | ++local_request_counter) +/* + * Various settings related to prompt (fast) handling of PageStream responses + * at any CHECK_FOR_INTERRUPTS point. + */ +int readahead_getpage_pull_timeout_ms = 0; +static int PS_TIMEOUT_ID = 0; +static bool timeout_set = false; +static bool timeout_signaled = false; + +/* + * We have a CHECK_FOR_INTERRUPTS in page_server->receive(), and we don't want + * that to handle any getpage responses if we're already working on the + * backlog of those, as we'd hit issues with determining which prefetch slot + * we just got a response for. + * + * To protect against that, we have this variable that's set whenever we start + * receiving data for prefetch slots, so that we don't get confused. + * + * Note that in certain error cases during readpage we may leak r_r_g=true, + * which results in a failure to pick up further responses until we first + * actively try to receive new getpage responses. + */ +static bool readpage_reentrant_guard = false; + +static void reconfigure_timeout_if_needed(void); +static void pagestore_timeout_handler(void); + +#define START_PREFETCH_RECEIVE_WORK() \ + do { \ + readpage_reentrant_guard = true; \ + } while (false) + +#define END_PREFETCH_RECEIVE_WORK() \ + do { \ + readpage_reentrant_guard = false; \ + if (unlikely(timeout_signaled && !InterruptPending)) \ + InterruptPending = true; \ + } while (false) + /* * Prefetch implementation: * @@ -221,7 +262,6 @@ typedef struct PrfHashEntry #define SH_DEFINE #define SH_DECLARE #include "lib/simplehash.h" -#include "neon.h" /* * PrefetchState maintains the state of (prefetch) getPage@LSN requests. @@ -407,17 +447,26 @@ compact_prefetch_buffers(void) } /* - * If there might be responses still in the TCP buffer, then - * we should try to use those, so as to reduce any TCP backpressure - * on the OS/PS side. + * If there might be responses still in the TCP buffer, then we should try to + * use those, to reduce any TCP backpressure on the OS/PS side. * * This procedure handles that. * - * Note that this is only valid as long as the only pipelined - * operations in the TCP buffer are getPage@Lsn requests. + * Note that this works because we don't pipeline non-getPage requests. + * + * NOTE: This procedure is not allowed to throw errors that should be handled + * by SMGR-related code, as this can be called from every CHECK_FOR_INTERRUPTS + * point inside and outside PostgreSQL. + * + * This still does throw errors when it receives malformed responses from PS. 
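The contract described above separates soft conditions ("nothing to hand out yet") from malformed responses, which are the only case that may raise. A minimal Rust sketch of that shape, and of the non-blocking drain that `prefetch_pump_state` performs, with all types simplified stand-ins for the real ones:

```rust
struct NeonResponse;           // placeholder for the real response payload
#[derive(Debug)]
struct MalformedResponse;      // the only hard failure: the stream cannot be re-synchronized

/// Minimal stand-in for the page-server connection.
struct Connection {
    buffered: Vec<Result<NeonResponse, MalformedResponse>>,
}

impl Connection {
    /// Soft conditions (no data buffered, clean disconnect, transient read error)
    /// all surface as `Ok(None)`; only an unparseable response becomes `Err`.
    fn try_receive(&mut self) -> Result<Option<NeonResponse>, MalformedResponse> {
        match self.buffered.pop() {
            None => Ok(None),
            Some(Ok(resp)) => Ok(Some(resp)),
            Some(Err(e)) => Err(e),
        }
    }
}

/// Drain whatever responses are already buffered without ever blocking,
/// roughly what prefetch_pump_state does at CHECK_FOR_INTERRUPTS points.
fn pump(conn: &mut Connection, mut on_response: impl FnMut(NeonResponse)) -> Result<(), MalformedResponse> {
    while let Some(resp) = conn.try_receive()? {
        on_response(resp);
    }
    Ok(())
}

fn main() {
    let mut conn = Connection { buffered: vec![Ok(NeonResponse), Ok(NeonResponse)] };
    let mut received = 0;
    pump(&mut conn, |_| received += 1).unwrap();
    assert_eq!(received, 2);
}
```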
+ * + * When we're not called from CHECK_FOR_INTERRUPTS (indicated by + * IsHandlingInterrupts) we also report we've ended prefetch receive work, + * just in case state tracking was lost due to an error in the sync getPage + * response code. */ static void -prefetch_pump_state(void) +prefetch_pump_state(bool IsHandlingInterrupts) { while (MyPState->ring_receive != MyPState->ring_flush) { @@ -466,6 +515,12 @@ prefetch_pump_state(void) } } } + + /* We never pump the prefetch state while handling other pages */ + if (!IsHandlingInterrupts) + END_PREFETCH_RECEIVE_WORK(); + + reconfigure_timeout_if_needed(); } void @@ -581,8 +636,8 @@ readahead_buffer_resize(int newsize, void *extra) /* * Make sure that there are no responses still in the buffer. * - * NOTE: this function may indirectly update MyPState->pfs_hash; which - * invalidates any active pointers into the hash table. + * This function may indirectly update MyPState->pfs_hash; which invalidates + * any active pointers into the hash table. */ static void consume_prefetch_responses(void) @@ -639,6 +694,7 @@ static bool prefetch_wait_for(uint64 ring_index) { PrefetchRequest *entry; + bool result = true; if (MyPState->ring_flush <= ring_index && MyPState->ring_unused > MyPState->ring_flush) @@ -652,13 +708,21 @@ prefetch_wait_for(uint64 ring_index) while (MyPState->ring_receive <= ring_index) { + START_PREFETCH_RECEIVE_WORK(); entry = GetPrfSlot(MyPState->ring_receive); Assert(entry->status == PRFS_REQUESTED); if (!prefetch_read(entry)) - return false; + { + result = false; + break; + } + + END_PREFETCH_RECEIVE_WORK(); + CHECK_FOR_INTERRUPTS(); } - return true; + + return result; } /* @@ -962,6 +1026,19 @@ prefetch_lookupv(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blocknum, n if (!neon_prefetch_response_usable(&lsns[i], slot)) continue; + /* + * Ignore errors + */ + if (slot->response->tag != T_NeonGetPageResponse) + { + if (slot->response->tag != T_NeonErrorResponse) + { + NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC, + "Expected GetPage (0x%02x) or Error (0x%02x) response to GetPageRequest, but got 0x%02x", + T_NeonGetPageResponse, T_NeonErrorResponse, slot->response->tag); + } + continue; + } memcpy(buffers[i], ((NeonGetPageResponse*)slot->response)->page, BLCKSZ); prefetch_set_unused(ring_index); BITMAP_SET(mask, i); @@ -1316,6 +1393,12 @@ page_server_request(void const *req) page_server->disconnect(shard_no); MyNeonCounters->pageserver_open_requests = 0; + /* + * We know for sure we're not working on any prefetch pages after + * this. 
+ */ + END_PREFETCH_RECEIVE_WORK(); + PG_RE_THROW(); } PG_END_TRY(); @@ -2943,7 +3026,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, MyPState->ring_last <= ring_index); } - prefetch_pump_state(); + prefetch_pump_state(false); return false; } @@ -2986,7 +3069,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) Assert(ring_index < MyPState->ring_unused && MyPState->ring_last <= ring_index); - prefetch_pump_state(); + prefetch_pump_state(false); return false; } @@ -3030,7 +3113,7 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum, */ neon_log(SmgrTrace, "writeback noop"); - prefetch_pump_state(); + prefetch_pump_state(false); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) @@ -3278,7 +3361,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer } /* Try to read PS results if they are available */ - prefetch_pump_state(); + prefetch_pump_state(false); neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1); @@ -3300,7 +3383,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer /* * Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes. */ - prefetch_pump_state(); + prefetch_pump_state(false); #ifdef DEBUG_COMPARE_LOCAL if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) @@ -3411,7 +3494,7 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, nblocks, PG_IOV_MAX); /* Try to read PS results if they are available */ - prefetch_pump_state(); + prefetch_pump_state(false); neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns, nblocks); @@ -3456,7 +3539,7 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, /* * Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes. 
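The `START_PREFETCH_RECEIVE_WORK` / `END_PREFETCH_RECEIVE_WORK` pair used throughout these hunks is a hand-rolled reentrancy guard: while a backend is already consuming prefetch responses, the interrupt path must not start doing the same work underneath it. The same intent written as an RAII guard, purely as a loose Rust analogy (the C code keeps a plain boolean and, as its comment notes, can leak the flag on some error paths):

```rust
use std::cell::Cell;

thread_local! {
    // Per-backend flag, analogous to `readpage_reentrant_guard`.
    static RECEIVING_PREFETCH: Cell<bool> = Cell::new(false);
}

struct PrefetchReceiveGuard;

impl PrefetchReceiveGuard {
    /// Returns None if prefetch-receive work is already in progress on this thread.
    fn try_enter() -> Option<Self> {
        RECEIVING_PREFETCH.with(|flag| {
            if flag.get() {
                None
            } else {
                flag.set(true);
                Some(PrefetchReceiveGuard)
            }
        })
    }
}

impl Drop for PrefetchReceiveGuard {
    fn drop(&mut self) {
        // Cleared even on early return or panic, so the flag cannot leak.
        RECEIVING_PREFETCH.with(|flag| flag.set(false));
    }
}

fn pump_from_interrupt(mut pump: impl FnMut()) {
    // Equivalent of checking `readpage_reentrant_guard` before pumping.
    if let Some(_guard) = PrefetchReceiveGuard::try_enter() {
        pump();
    }
}

fn main() {
    let mut pumped = 0;
    pump_from_interrupt(|| pumped += 1);
    assert_eq!(pumped, 1);
}
```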
*/ - prefetch_pump_state(); + prefetch_pump_state(false); #ifdef DEBUG_COMPARE_LOCAL if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) @@ -3626,7 +3709,7 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer); - prefetch_pump_state(); + prefetch_pump_state(false); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) @@ -3681,7 +3764,7 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, lfc_writev(InfoFromSMgrRel(reln), forknum, blkno, buffers, nblocks); - prefetch_pump_state(); + prefetch_pump_state(false); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) @@ -3972,7 +4055,7 @@ neon_immedsync(SMgrRelation reln, ForkNumber forknum) neon_log(SmgrTrace, "[NEON_SMGR] immedsync noop"); - prefetch_pump_state(); + prefetch_pump_state(false); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) @@ -4273,6 +4356,7 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf } pfree(resp); + reconfigure_timeout_if_needed(); return n_blocks; } @@ -4308,6 +4392,7 @@ AtEOXact_neon(XactEvent event, void *arg) } break; } + reconfigure_timeout_if_needed(); } static const struct f_smgr neon_smgr = @@ -4564,3 +4649,94 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id) } return no_redo_needed; } + +static void +reconfigure_timeout_if_needed(void) +{ + bool needs_set = MyPState->ring_receive != MyPState->ring_unused && + readahead_getpage_pull_timeout_ms > 0; + + if (needs_set != timeout_set) + { + /* The background writer doens't (shouldn't) read any pages */ + Assert(!AmBackgroundWriterProcess()); + /* The checkpointer doens't (shouldn't) read any pages */ + Assert(!AmCheckpointerProcess()); + + if (unlikely(PS_TIMEOUT_ID == 0)) + { + PS_TIMEOUT_ID = RegisterTimeout(USER_TIMEOUT, pagestore_timeout_handler); + } + + if (needs_set) + { +#if PG_MAJORVERSION_NUM <= 14 + enable_timeout_after(PS_TIMEOUT_ID, readahead_getpage_pull_timeout_ms); +#else + enable_timeout_every( + PS_TIMEOUT_ID, + TimestampTzPlusMilliseconds(GetCurrentTimestamp(), + readahead_getpage_pull_timeout_ms), + readahead_getpage_pull_timeout_ms + ); +#endif + timeout_set = true; + } + else + { + Assert(timeout_set); + disable_timeout(PS_TIMEOUT_ID, false); + timeout_set = false; + } + } +} + +static void +pagestore_timeout_handler(void) +{ +#if PG_MAJORVERSION_NUM <= 14 + /* + * PG14: Setting a repeating timeout is not possible, so we signal here + * that the timeout has already been reset, and by telling the system + * that system will re-schedule it later if we need to. + */ + timeout_set = false; +#endif + timeout_signaled = true; + InterruptPending = true; +} + +static process_interrupts_callback_t prev_interrupt_cb; + +/* + * Process new data received in our active PageStream sockets. + * + * This relies on the invariant that all pipelined yet-to-be-received requests + * are getPage requests managed by MyPState. This is currently true, any + * modification will probably require some stuff to make it work again. 
+ */ +static bool +pagestore_smgr_processinterrupts(void) +{ + if (timeout_signaled) + { + if (!readpage_reentrant_guard && readahead_getpage_pull_timeout_ms > 0) + prefetch_pump_state(true); + + timeout_signaled = false; + reconfigure_timeout_if_needed(); + } + + if (!prev_interrupt_cb) + return false; + + return prev_interrupt_cb(); +} + + +void +pagestore_smgr_init(void) +{ + prev_interrupt_cb = ProcessInterruptsCallback; + ProcessInterruptsCallback = pagestore_smgr_processinterrupts; +} diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index d7604e30d7..7ec4ec99fc 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -83,6 +83,7 @@ static void AssertEventsOkForState(uint32 events, Safekeeper *sk); static char *FormatEvents(WalProposer *wp, uint32 events); static void UpdateDonorShmem(WalProposer *wp); static char *MembershipConfigurationToString(MembershipConfiguration *mconf); +static void MembershipConfigurationCopy(MembershipConfiguration *src, MembershipConfiguration *dst); static void MembershipConfigurationFree(MembershipConfiguration *mconf); WalProposer * @@ -97,7 +98,32 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api) wp->config = config; wp->api = api; - for (host = wp->config->safekeepers_list; host != NULL && *host != '\0'; host = sep) + wp_log(LOG, "neon.safekeepers=%s", wp->config->safekeepers_list); + + /* + * If safekeepers list starts with g# parse generation number followed by + * : + */ + if (strncmp(wp->config->safekeepers_list, "g#", 2) == 0) + { + char *endptr; + + errno = 0; + wp->safekeepers_generation = strtoul(wp->config->safekeepers_list + 2, &endptr, 10); + if (errno != 0) + { + wp_log(FATAL, "failed to parse neon.safekeepers generation number: %m"); + } + /* Skip past : to the first hostname. */ + host = endptr + 1; + } + else + { + host = wp->config->safekeepers_list; + } + wp_log(LOG, "safekeepers_generation=%u", wp->safekeepers_generation); + + for (; host != NULL && *host != '\0'; host = sep) { port = strchr(host, ':'); if (port == NULL) @@ -183,6 +209,12 @@ WalProposerFree(WalProposer *wp) pfree(wp); } +static bool +WalProposerGenerationsEnabled(WalProposer *wp) +{ + return wp->safekeepers_generation != 0; +} + /* * Create new AppendRequest message and start sending it. This function is * called from walsender every time the new WAL is available. @@ -600,10 +632,14 @@ static void SendStartWALPush(Safekeeper *sk) { WalProposer *wp = sk->wp; + + /* Forbid implicit timeline creation if generations are enabled. */ + char *allow_timeline_creation = WalProposerGenerationsEnabled(wp) ? "false" : "true"; #define CMD_LEN 512 char cmd[CMD_LEN]; - snprintf(cmd, CMD_LEN, "START_WAL_PUSH (proto_version '%d')", wp->config->proto_version); + + snprintf(cmd, CMD_LEN, "START_WAL_PUSH (proto_version '%d', allow_timeline_creation '%s')", wp->config->proto_version, allow_timeline_creation); if (!wp->api.conn_send_query(sk, cmd)) { wp_log(WARNING, "failed to send '%s' query to safekeeper %s:%s: %s", @@ -705,6 +741,18 @@ RecvAcceptorGreeting(Safekeeper *sk) sk->host, sk->port, sk->greetResponse.nodeId, mconf_toml, sk->greetResponse.term); pfree(mconf_toml); + /* + * Adopt mconf of safekeepers if it is higher. TODO: mconf change should + * restart wp if it started voting. 
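Earlier in this walproposer hunk, `neon.safekeepers` learns an optional `g#<generation>:` prefix in front of the usual `host:port,host:port` list. A hedged Rust sketch of just that parse step (error handling is simplified; the real code uses `strtoul` and treats generation 0 as generations being disabled):

```rust
/// Split an optional `g#<generation>:` prefix off a safekeepers list.
/// Returns (generation, remaining comma-separated host:port list);
/// generation 0 means generations are not enabled.
fn parse_safekeepers_list(list: &str) -> Result<(u32, &str), String> {
    match list.strip_prefix("g#") {
        Some(rest) => {
            let (generation, hosts) = rest
                .split_once(':')
                .ok_or_else(|| format!("missing ':' after generation in {list:?}"))?;
            let generation = generation
                .parse::<u32>()
                .map_err(|e| format!("bad generation number {generation:?}: {e}"))?;
            Ok((generation, hosts))
        }
        None => Ok((0, list)),
    }
}

fn main() {
    assert_eq!(
        parse_safekeepers_list("g#3:sk1:5454,sk2:5454"),
        Ok((3, "sk1:5454,sk2:5454"))
    );
    assert_eq!(parse_safekeepers_list("sk1:5454"), Ok((0, "sk1:5454")));
}
```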
+ */ + if (sk->greetResponse.mconf.generation > wp->mconf.generation) + { + MembershipConfigurationFree(&wp->mconf); + MembershipConfigurationCopy(&sk->greetResponse.mconf, &wp->mconf); + /* full conf was just logged above */ + wp_log(LOG, "changed mconf to generation %u", wp->mconf.generation); + } + /* Protocol is all good, move to voting. */ sk->state = SS_VOTING; @@ -1896,7 +1944,13 @@ PAMessageSerialize(WalProposer *wp, ProposerAcceptorMessage *msg, StringInfo buf pq_sendint64_le(buf, m->termHistory->entries[i].term); pq_sendint64_le(buf, m->termHistory->entries[i].lsn); } - pq_sendint64_le(buf, 0); /* removed timeline_start_lsn */ + + /* + * Removed timeline_start_lsn. Still send it as a valid + * value until safekeepers taking it from term history are + * deployed. + */ + pq_sendint64_le(buf, m->termHistory->entries[0].lsn); break; } case 'a': @@ -2157,7 +2211,7 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg) } } wp_log(FATAL, "unsupported proto_version %d", wp->config->proto_version); - return false; /* keep the compiler quiet */ + return false; /* keep the compiler quiet */ } /* @@ -2565,6 +2619,18 @@ MembershipConfigurationToString(MembershipConfiguration *mconf) return s.data; } +static void +MembershipConfigurationCopy(MembershipConfiguration *src, MembershipConfiguration *dst) +{ + dst->generation = src->generation; + dst->members.len = src->members.len; + dst->members.m = palloc0(sizeof(SafekeeperId) * dst->members.len); + memcpy(dst->members.m, src->members.m, sizeof(SafekeeperId) * dst->members.len); + dst->new_members.len = src->new_members.len; + dst->new_members.m = palloc0(sizeof(SafekeeperId) * dst->new_members.len); + memcpy(dst->new_members.m, src->new_members.m, sizeof(SafekeeperId) * dst->new_members.len); +} + static void MembershipConfigurationFree(MembershipConfiguration *mconf) { diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index eee55f924f..8d1ae26cac 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -160,7 +160,10 @@ typedef struct MemberSet SafekeeperId *m; /* ids themselves */ } MemberSet; -/* Timeline safekeeper membership configuration. */ +/* + * Timeline safekeeper membership configuration as sent in the + * protocol. + */ typedef struct MembershipConfiguration { Generation generation; @@ -761,8 +764,22 @@ typedef struct WalProposer /* (n_safekeepers / 2) + 1 */ int quorum; + /* + * Generation of the membership conf of which safekeepers[] are presumably + * members. To make cplane life a bit easier and have more control in + * tests with which sks walproposer gets connected neon.safekeepers GUC + * doesn't provide full mconf, only the list of endpoints to connect to. + * We still would like to know generation associated with it because 1) we + * need some handle to enforce using generations in walproposer, and + * non-zero value of this serves the purpose; 2) currently we don't do + * that, but in theory walproposer can update list of safekeepers to + * connect to upon receiving mconf from safekeepers, and generation number + * must be checked to see which list is newer. + */ + Generation safekeepers_generation; /* Number of occupied slots in safekeepers[] */ int n_safekeepers; + /* Safekeepers walproposer is connecting to. 
*/ Safekeeper safekeeper[MAX_SAFEKEEPERS]; /* WAL has been generated up to this point */ diff --git a/pgxn/neon/walsender_hooks.c b/pgxn/neon/walsender_hooks.c index a0fe3822cc..81198d6c8d 100644 --- a/pgxn/neon/walsender_hooks.c +++ b/pgxn/neon/walsender_hooks.c @@ -32,6 +32,8 @@ extern XLogRecPtr WalSndWaitForWal(XLogRecPtr loc); extern bool GetDonorShmem(XLogRecPtr *donor_lsn); extern XLogRecPtr GetXLogReplayRecPtr(TimeLineID *replayTLI); +bool disable_wal_prev_lsn_checks = false; + static XLogRecPtr NeonWALReadWaitForWAL(XLogRecPtr loc) { @@ -82,6 +84,8 @@ NeonWALPageRead( if (flushptr < targetPagePtr + reqLen) return -1; + xlogreader->skip_lsn_checks = disable_wal_prev_lsn_checks; + /* Read at most XLOG_BLCKSZ bytes */ if (targetPagePtr + XLOG_BLCKSZ <= flushptr) count = XLOG_BLCKSZ; diff --git a/pgxn/neon_walredo/inmem_smgr.c b/pgxn/neon_walredo/inmem_smgr.c index 74cd5ac601..75b9ab4464 100644 --- a/pgxn/neon_walredo/inmem_smgr.c +++ b/pgxn/neon_walredo/inmem_smgr.c @@ -32,7 +32,7 @@ #include "inmem_smgr.h" -/* Size of the in-memory smgr */ +/* Size of the in-memory smgr: XLR_MAX_BLOCK_ID is 32, so assume that 64 will be enough */ #define MAX_PAGES 64 /* If more than WARN_PAGES are used, print a warning in the log */ @@ -174,10 +174,7 @@ static void inmem_zeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks, bool skipFsync) { - char buffer[BLCKSZ] = {0}; - - for (int i = 0; i < nblocks; i++) - inmem_extend(reln, forknum, blocknum + i, buffer, skipFsync); + /* Do nothing: inmem_read will return zero page in any case */ } #endif @@ -285,12 +282,12 @@ inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, * WARN_PAGES, print a warning so that we get alerted and get to * investigate why we're accessing so many buffers. */ - elog(used_pages >= WARN_PAGES ? WARNING : DEBUG1, - "inmem_write() called for %u/%u/%u.%u blk %u: used_pages %u", - RelFileInfoFmt(InfoFromSMgrRel(reln)), - forknum, - blocknum, - used_pages); + if (used_pages >= WARN_PAGES) + ereport(WARNING, (errmsg("inmem_write() called for %u/%u/%u.%u blk %u: used_pages %u", + RelFileInfoFmt(InfoFromSMgrRel(reln)), + forknum, + blocknum, + used_pages), errbacktrace())); if (used_pages == MAX_PAGES) elog(ERROR, "Inmem storage overflow"); diff --git a/pgxn/neon_walredo/walredoproc.c b/pgxn/neon_walredo/walredoproc.c index 619b7255ae..4673de778c 100644 --- a/pgxn/neon_walredo/walredoproc.c +++ b/pgxn/neon_walredo/walredoproc.c @@ -142,7 +142,7 @@ static BufferTag target_redo_tag; static XLogReaderState *reader_state; -#define TRACE LOG +#define TRACE DEBUG1 #ifdef HAVE_LIBSECCOMP @@ -194,6 +194,7 @@ static PgSeccompRule allowed_syscalls[] = * is stored in MyProcPid anyway. */ PG_SCMP_ALLOW(getpid), + PG_SCMP_ALLOW(futex), /* needed for errbacktrace */ /* Enable those for a proper shutdown. */ #if 0 @@ -253,7 +254,7 @@ WalRedoMain(int argc, char *argv[]) * which is super strange but that's not something we can solve * for here. ยฏ\_(-_-)_/ยฏ */ - SetConfigOption("log_min_messages", "FATAL", PGC_SUSET, PGC_S_OVERRIDE); + SetConfigOption("log_min_messages", "WARNING", PGC_SUSET, PGC_S_OVERRIDE); SetConfigOption("client_min_messages", "ERROR", PGC_SUSET, PGC_S_OVERRIDE); diff --git a/poetry.lock b/poetry.lock index ba3b0535e4..03aa543b06 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1414,14 +1414,14 @@ files = [ [[package]] name = "jinja2" -version = "3.1.5" +version = "3.1.6" description = "A very fast and expressive template engine." 
optional = false python-versions = ">=3.7" groups = ["main"] files = [ - {file = "jinja2-3.1.5-py3-none-any.whl", hash = "sha256:aba0f4dc9ed8013c424088f68a5c226f7d6097ed89b246d7749c2ec4175c6adb"}, - {file = "jinja2-3.1.5.tar.gz", hash = "sha256:8fefff8dc3034e27bb80d67c671eb8a9bc424c0ef4c0826edbff304cceff43bb"}, + {file = "jinja2-3.1.6-py3-none-any.whl", hash = "sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67"}, + {file = "jinja2-3.1.6.tar.gz", hash = "sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d"}, ] [package.dependencies] @@ -3820,4 +3820,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.1" python-versions = "^3.11" -content-hash = "9711c5479c867fa614ce3d352f1bbc63dba1cb2376d347f96fbeda6f512ee308" +content-hash = "010ffce959bb256880ab5a267048c182e4612b3151f9a94e3bf5d3a7807962fe" diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 5964b76ecf..b6e3f03a81 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -53,7 +53,7 @@ measured = { workspace = true, features = ["lasso"] } metrics.workspace = true once_cell.workspace = true opentelemetry = { workspace = true, features = ["trace"] } -papaya = "0.1.8" +papaya = "0.2.0" parking_lot.workspace = true parquet.workspace = true parquet_derive.workspace = true diff --git a/proxy/src/auth/backend/local.rs b/proxy/src/auth/backend/local.rs index 9c3a3772cd..7a6dceb194 100644 --- a/proxy/src/auth/backend/local.rs +++ b/proxy/src/auth/backend/local.rs @@ -35,6 +35,7 @@ impl LocalBackend { endpoint_id: EndpointIdTag::get_interner().get_or_intern("local"), project_id: ProjectIdTag::get_interner().get_or_intern("local"), branch_id: BranchIdTag::get_interner().get_or_intern("local"), + compute_id: "local".into(), cold_start_info: ColdStartInfo::WarmCached, }, }, diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 5447a4a4c0..3852bfe348 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -1,3 +1,4 @@ +use std::fmt::Debug; use std::io; use std::net::SocketAddr; use std::time::Duration; @@ -10,7 +11,7 @@ use postgres_protocol::message::backend::NoticeResponseBody; use pq_proto::StartupMessageParams; use rustls::pki_types::InvalidDnsNameError; use thiserror::Error; -use tokio::net::TcpStream; +use tokio::net::{TcpStream, lookup_host}; use tracing::{debug, error, info, warn}; use crate::auth::backend::ComputeUserInfo; @@ -180,21 +181,19 @@ impl ConnCfg { use postgres_client::config::Host; // wrap TcpStream::connect with timeout - let connect_with_timeout = |host, port| { - tokio::time::timeout(timeout, TcpStream::connect((host, port))).map( - move |res| match res { - Ok(tcpstream_connect_res) => tcpstream_connect_res, - Err(_) => Err(io::Error::new( - io::ErrorKind::TimedOut, - format!("exceeded connection timeout {timeout:?}"), - )), - }, - ) + let connect_with_timeout = |addrs| { + tokio::time::timeout(timeout, TcpStream::connect(addrs)).map(move |res| match res { + Ok(tcpstream_connect_res) => tcpstream_connect_res, + Err(_) => Err(io::Error::new( + io::ErrorKind::TimedOut, + format!("exceeded connection timeout {timeout:?}"), + )), + }) }; - let connect_once = |host, port| { - debug!("trying to connect to compute node at {host}:{port}"); - connect_with_timeout(host, port).and_then(|stream| async { + let connect_once = |addrs| { + debug!("trying to connect to compute node at {addrs:?}"); + connect_with_timeout(addrs).and_then(|stream| async { let socket_addr = stream.peer_addr()?; let socket = socket2::SockRef::from(&stream); // Disable Nagle's algorithm to not introduce 
latency between @@ -216,7 +215,12 @@ impl ConnCfg { Host::Tcp(host) => host.as_str(), }; - match connect_once(host, port).await { + let addrs = match self.0.get_host_addr() { + Some(addr) => vec![SocketAddr::new(addr, port)], + None => lookup_host((host, port)).await?.collect(), + }; + + match connect_once(&*addrs).await { Ok((sockaddr, stream)) => Ok((sockaddr, stream, host)), Err(err) => { warn!("couldn't connect to compute node at {host}:{port}: {err}"); @@ -277,13 +281,16 @@ impl ConnCfg { } = connection; tracing::Span::current().record("pid", tracing::field::display(process_id)); + tracing::Span::current().record("compute_id", tracing::field::display(&aux.compute_id)); let stream = stream.into_inner(); // TODO: lots of useful info but maybe we can move it elsewhere (eg traces?) info!( cold_start_info = ctx.cold_start_info().as_str(), - "connected to compute node at {host} ({socket_addr}) sslmode={:?}", - self.0.get_ssl_mode() + "connected to compute node at {host} ({socket_addr}) sslmode={:?}, latency={}, query_id={}", + self.0.get_ssl_mode(), + ctx.get_proxy_latency(), + ctx.get_testodrome_id(), ); // NB: CancelToken is supposed to hold socket_addr, but we use connect_raw. diff --git a/proxy/src/context/mod.rs b/proxy/src/context/mod.rs index 74b48a1bea..4f72a86f30 100644 --- a/proxy/src/context/mod.rs +++ b/proxy/src/context/mod.rs @@ -17,7 +17,8 @@ use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::error::ErrorKind; use crate::intern::{BranchIdInt, ProjectIdInt}; use crate::metrics::{ - ConnectOutcome, InvalidEndpointsGroup, LatencyTimer, Metrics, Protocol, Waiting, + ConnectOutcome, InvalidEndpointsGroup, LatencyAccumulated, LatencyTimer, Metrics, Protocol, + Waiting, }; use crate::protocol2::{ConnectionInfo, ConnectionInfoExtra}; use crate::types::{DbName, EndpointId, RoleName}; @@ -55,12 +56,14 @@ struct RequestContextInner { dbname: Option, user: Option, application: Option, + user_agent: Option, error_kind: Option, pub(crate) auth_method: Option, jwt_issuer: Option, success: bool, pub(crate) cold_start_info: ColdStartInfo, pg_options: Option, + testodrome_query_id: Option, // extra // This sender is here to keep the request monitoring channel open while requests are taking place. 
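For reference: a later hunk in this file (proxy/src/context/mod.rs) scans the startup `options` string for a `neon_query_id:` token and records it as the testodrome query id. The following is a minimal standalone sketch of that extraction, assuming a space-separated options string; the function name and example values are illustrative and not part of the patch.

// Sketch only: mirrors the `neon_query_id:` scan added later in this file.
fn extract_neon_query_id(options: &str) -> Option<&str> {
    options
        .split_whitespace()
        .find_map(|opt| opt.strip_prefix("neon_query_id:"))
}

fn main() {
    // A client could pass the id through libpq startup options, e.g.
    // `options=-c search_path=public neon_query_id:abc123`.
    assert_eq!(
        extract_neon_query_id("-c search_path=public neon_query_id:abc123"),
        Some("abc123")
    );
    assert_eq!(extract_neon_query_id("-c search_path=public"), None);
}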
@@ -100,6 +103,7 @@ impl Clone for RequestContext { dbname: inner.dbname.clone(), user: inner.user.clone(), application: inner.application.clone(), + user_agent: inner.user_agent.clone(), error_kind: inner.error_kind, auth_method: inner.auth_method.clone(), jwt_issuer: inner.jwt_issuer.clone(), @@ -107,6 +111,7 @@ impl Clone for RequestContext { rejected: inner.rejected, cold_start_info: inner.cold_start_info, pg_options: inner.pg_options.clone(), + testodrome_query_id: inner.testodrome_query_id.clone(), sender: None, disconnect_sender: None, @@ -149,6 +154,7 @@ impl RequestContext { dbname: None, user: None, application: None, + user_agent: None, error_kind: None, auth_method: None, jwt_issuer: None, @@ -156,6 +162,7 @@ impl RequestContext { rejected: None, cold_start_info: ColdStartInfo::Unknown, pg_options: None, + testodrome_query_id: None, sender: LOG_CHAN.get().and_then(|tx| tx.upgrade()), disconnect_sender: LOG_CHAN_DISCONNECT.get().and_then(|tx| tx.upgrade()), @@ -206,6 +213,19 @@ impl RequestContext { this.set_dbname(dbname.into()); } + // Try to get testodrome_query_id directly from parameters + if let Some(options_str) = options.get("options") { + // If not found directly, try to extract it from the options string + for option in options_str.split_whitespace() { + if option.starts_with("neon_query_id:") { + if let Some(value) = option.strip_prefix("neon_query_id:") { + this.set_testodrome_id(value.to_string()); + break; + } + } + } + } + this.pg_options = Some(options); } @@ -245,6 +265,13 @@ impl RequestContext { .set_user(user); } + pub(crate) fn set_user_agent(&self, user_agent: Option) { + self.0 + .try_lock() + .expect("should not deadlock") + .set_user_agent(user_agent); + } + pub(crate) fn set_auth_method(&self, auth_method: AuthMethod) { let mut this = self.0.try_lock().expect("should not deadlock"); this.auth_method = Some(auth_method); @@ -336,6 +363,23 @@ impl RequestContext { } } + pub(crate) fn get_proxy_latency(&self) -> LatencyAccumulated { + self.0 + .try_lock() + .expect("should not deadlock") + .latency_timer + .accumulated() + } + + pub(crate) fn get_testodrome_id(&self) -> String { + self.0 + .try_lock() + .expect("should not deadlock") + .testodrome_query_id + .clone() + .unwrap_or_default() + } + pub(crate) fn success(&self) { self.0 .try_lock() @@ -384,6 +428,10 @@ impl RequestContextInner { } } + fn set_user_agent(&mut self, user_agent: Option) { + self.user_agent = user_agent; + } + fn set_dbname(&mut self, dbname: DbName) { self.dbname = Some(dbname); } @@ -393,6 +441,10 @@ impl RequestContextInner { self.user = Some(user); } + fn set_testodrome_id(&mut self, query_id: String) { + self.testodrome_query_id = Some(query_id); + } + fn has_private_peer_addr(&self) -> bool { match self.conn_info.addr.ip() { IpAddr::V4(ip) => ip.is_private(), diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index f029327266..bfab5f34f9 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -82,6 +82,7 @@ pub(crate) struct RequestData { peer_addr: String, username: Option, application_name: Option, + user_agent: Option, endpoint_id: Option, database: Option, project: Option, @@ -128,6 +129,7 @@ impl From<&RequestContextInner> for RequestData { timestamp: value.first_packet.naive_utc(), username: value.user.as_deref().map(String::from), application_name: value.application.as_deref().map(String::from), + user_agent: value.user_agent.as_deref().map(String::from), endpoint_id: value.endpoint_id.as_deref().map(String::from), 
database: value.dbname.as_deref().map(String::from), project: value.project.as_deref().map(String::from), @@ -522,6 +524,7 @@ mod tests { .unwrap() .naive_utc(), application_name: Some("test".to_owned()), + user_agent: Some("test-user-agent".to_owned()), username: Some(hex::encode(rng.r#gen::<[u8; 4]>())), endpoint_id: Some(hex::encode(rng.r#gen::<[u8; 16]>())), database: Some(hex::encode(rng.r#gen::<[u8; 16]>())), @@ -610,15 +613,15 @@ mod tests { assert_eq!( file_stats, [ - (1313105, 3, 6000), - (1313094, 3, 6000), - (1313153, 3, 6000), - (1313110, 3, 6000), - (1313246, 3, 6000), - (1313083, 3, 6000), - (1312877, 3, 6000), - (1313112, 3, 6000), - (438020, 1, 2000) + (1313953, 3, 6000), + (1313942, 3, 6000), + (1314001, 3, 6000), + (1313958, 3, 6000), + (1314094, 3, 6000), + (1313931, 3, 6000), + (1313725, 3, 6000), + (1313960, 3, 6000), + (438318, 1, 2000) ] ); @@ -650,11 +653,11 @@ mod tests { assert_eq!( file_stats, [ - (1204324, 5, 10000), - (1204048, 5, 10000), - (1204349, 5, 10000), - (1204334, 5, 10000), - (1204588, 5, 10000) + (1205810, 5, 10000), + (1205534, 5, 10000), + (1205835, 5, 10000), + (1205820, 5, 10000), + (1206074, 5, 10000) ] ); @@ -679,15 +682,15 @@ mod tests { assert_eq!( file_stats, [ - (1313105, 3, 6000), - (1313094, 3, 6000), - (1313153, 3, 6000), - (1313110, 3, 6000), - (1313246, 3, 6000), - (1313083, 3, 6000), - (1312877, 3, 6000), - (1313112, 3, 6000), - (438020, 1, 2000) + (1313953, 3, 6000), + (1313942, 3, 6000), + (1314001, 3, 6000), + (1313958, 3, 6000), + (1314094, 3, 6000), + (1313931, 3, 6000), + (1313725, 3, 6000), + (1313960, 3, 6000), + (438318, 1, 2000) ] ); @@ -724,7 +727,7 @@ mod tests { // files are smaller than the size threshold, but they took too long to fill so were flushed early assert_eq!( file_stats, - [(658014, 2, 3001), (657728, 2, 3000), (657524, 2, 2999)] + [(658584, 2, 3001), (658298, 2, 3000), (658094, 2, 2999)] ); tmpdir.close().unwrap(); diff --git a/proxy/src/control_plane/client/cplane_proxy_v1.rs b/proxy/src/control_plane/client/cplane_proxy_v1.rs index 977fcf4727..2765aaa462 100644 --- a/proxy/src/control_plane/client/cplane_proxy_v1.rs +++ b/proxy/src/control_plane/client/cplane_proxy_v1.rs @@ -1,5 +1,7 @@ //! Production console backend. +use std::net::IpAddr; +use std::str::FromStr; use std::sync::Arc; use std::time::Duration; @@ -274,11 +276,27 @@ impl NeonControlPlaneClient { Some(x) => x, }; + let host_addr = IpAddr::from_str(host).ok(); + + let ssl_mode = match &body.server_name { + Some(_) => SslMode::Require, + None => SslMode::Disable, + }; + let host_name = match body.server_name { + Some(host) => host, + None => host.to_owned(), + }; + // Don't set anything but host and port! This config will be cached. // We'll set username and such later using the startup message. // TODO: add more type safety (in progress). - let mut config = compute::ConnCfg::new(host.to_owned(), port); - config.ssl_mode(SslMode::Disable); // TLS is not configured on compute nodes. + let mut config = compute::ConnCfg::new(host_name, port); + + if let Some(addr) = host_addr { + config.set_host_addr(addr); + } + + config.ssl_mode(ssl_mode); let node = NodeInfo { config, diff --git a/proxy/src/control_plane/client/mock.rs b/proxy/src/control_plane/client/mock.rs index 7da5464aa5..ee722e839e 100644 --- a/proxy/src/control_plane/client/mock.rs +++ b/proxy/src/control_plane/client/mock.rs @@ -1,5 +1,6 @@ //! Mock console backend which relies on a user-provided postgres instance. 
+use std::net::{IpAddr, Ipv4Addr}; use std::str::FromStr; use std::sync::Arc; @@ -167,10 +168,22 @@ impl MockControlPlane { } async fn do_wake_compute(&self) -> Result { - let mut config = compute::ConnCfg::new( - self.endpoint.host_str().unwrap_or("localhost").to_owned(), - self.endpoint.port().unwrap_or(5432), - ); + let port = self.endpoint.port().unwrap_or(5432); + let mut config = match self.endpoint.host_str() { + None => { + let mut config = compute::ConnCfg::new("localhost".to_string(), port); + config.set_host_addr(IpAddr::V4(Ipv4Addr::LOCALHOST)); + config + } + Some(host) => { + let mut config = compute::ConnCfg::new(host.to_string(), port); + if let Ok(addr) = IpAddr::from_str(host) { + config.set_host_addr(addr); + } + config + } + }; + config.ssl_mode(postgres_client::config::SslMode::Disable); let node = NodeInfo { @@ -179,6 +192,7 @@ impl MockControlPlane { endpoint_id: (&EndpointId::from("endpoint")).into(), project_id: (&ProjectId::from("project")).into(), branch_id: (&BranchId::from("branch")).into(), + compute_id: "compute".into(), cold_start_info: crate::control_plane::messages::ColdStartInfo::Warm, }, }; diff --git a/proxy/src/control_plane/messages.rs b/proxy/src/control_plane/messages.rs index 8d6b2e96f5..ec4554eab5 100644 --- a/proxy/src/control_plane/messages.rs +++ b/proxy/src/control_plane/messages.rs @@ -2,6 +2,7 @@ use std::fmt::{self, Display}; use measured::FixedCardinalityLabel; use serde::{Deserialize, Serialize}; +use smol_str::SmolStr; use crate::auth::IpPattern; use crate::intern::{AccountIdInt, BranchIdInt, EndpointIdInt, ProjectIdInt, RoleNameInt}; @@ -239,6 +240,7 @@ pub(crate) struct GetEndpointAccessControl { #[derive(Debug, Deserialize)] pub(crate) struct WakeCompute { pub(crate) address: Box, + pub(crate) server_name: Option, pub(crate) aux: MetricsAuxInfo, } @@ -312,6 +314,9 @@ pub(crate) struct MetricsAuxInfo { pub(crate) endpoint_id: EndpointIdInt, pub(crate) project_id: ProjectIdInt, pub(crate) branch_id: BranchIdInt, + // note: we don't use interned strings for compute IDs. + // they churn too quickly and we have no way to clean up interned strings. + pub(crate) compute_id: SmolStr, #[serde(default)] pub(crate) cold_start_info: ColdStartInfo, } @@ -378,6 +383,7 @@ mod tests { "endpoint_id": "endpoint", "project_id": "project", "branch_id": "branch", + "compute_id": "compute", "cold_start_info": "unknown", }) } diff --git a/proxy/src/logging.rs b/proxy/src/logging.rs index 3c34918d84..6f9845fd6e 100644 --- a/proxy/src/logging.rs +++ b/proxy/src/logging.rs @@ -1,9 +1,11 @@ use std::cell::{Cell, RefCell}; use std::collections::HashMap; use std::hash::BuildHasher; -use std::{env, io}; +use std::sync::atomic::{AtomicU32, Ordering}; +use std::{array, env, fmt, io}; use chrono::{DateTime, Utc}; +use indexmap::IndexSet; use opentelemetry::trace::TraceContextExt; use scopeguard::defer; use serde::ser::{SerializeMap, Serializer}; @@ -17,6 +19,7 @@ use tracing_subscriber::fmt::{FormatEvent, FormatFields}; use tracing_subscriber::layer::{Context, Layer}; use tracing_subscriber::prelude::*; use tracing_subscriber::registry::{LookupSpan, SpanRef}; +use try_lock::TryLock; /// Initialize logging and OpenTelemetry tracing and exporter. 
/// @@ -46,13 +49,13 @@ pub async fn init() -> anyhow::Result { let otlp_layer = tracing_utils::init_tracing("proxy").await; let json_log_layer = if logfmt == LogFormat::Json { - Some(JsonLoggingLayer { - clock: RealClock, - skipped_field_indices: papaya::HashMap::default(), - writer: StderrWriter { + Some(JsonLoggingLayer::new( + RealClock, + StderrWriter { stderr: std::io::stderr(), }, - }) + ["request_id", "session_id", "conn_id"], + )) } else { None }; @@ -191,13 +194,39 @@ thread_local! { } /// Implements tracing layer to handle events specific to logging. -struct JsonLoggingLayer { +struct JsonLoggingLayer { clock: C, skipped_field_indices: papaya::HashMap, + callsite_ids: papaya::HashMap, writer: W, + // We use a const generic and arrays to bypass one heap allocation. + extract_fields: IndexSet<&'static str>, + _marker: std::marker::PhantomData<[&'static str; F]>, } -impl Layer for JsonLoggingLayer +impl JsonLoggingLayer { + fn new(clock: C, writer: W, extract_fields: [&'static str; F]) -> Self { + JsonLoggingLayer { + clock, + skipped_field_indices: papaya::HashMap::default(), + callsite_ids: papaya::HashMap::default(), + writer, + extract_fields: IndexSet::from_iter(extract_fields), + _marker: std::marker::PhantomData, + } + } + + #[inline] + fn callsite_id(&self, cs: callsite::Identifier) -> CallsiteId { + *self + .callsite_ids + .pin() + .get_or_insert_with(cs, CallsiteId::next) + } +} + +impl Layer + for JsonLoggingLayer where S: Subscriber + for<'a> LookupSpan<'a>, { @@ -211,7 +240,14 @@ where let res: io::Result<()> = REENTRANCY_GUARD.with(move |entered| { if entered.get() { let mut formatter = EventFormatter::new(); - formatter.format(now, event, &ctx, &self.skipped_field_indices)?; + formatter.format::( + now, + event, + &ctx, + &self.skipped_field_indices, + &self.callsite_ids, + &self.extract_fields, + )?; self.writer.make_writer().write_all(formatter.buffer()) } else { entered.set(true); @@ -219,7 +255,14 @@ where EVENT_FORMATTER.with_borrow_mut(move |formatter| { formatter.reset(); - formatter.format(now, event, &ctx, &self.skipped_field_indices)?; + formatter.format::( + now, + event, + &ctx, + &self.skipped_field_indices, + &self.callsite_ids, + &self.extract_fields, + )?; self.writer.make_writer().write_all(formatter.buffer()) }) } @@ -246,10 +289,13 @@ where let span = ctx.span(id).expect("span must exist"); let fields = SpanFields::default(); fields.record_fields(attrs); + // This could deadlock when there's a panic somewhere in the tracing // event handling and a read or write guard is still held. This includes // the OTel subscriber. - span.extensions_mut().insert(fields); + let mut exts = span.extensions_mut(); + + exts.insert(fields); } fn on_record(&self, id: &span::Id, values: &span::Record<'_>, ctx: Context<'_, S>) { @@ -265,6 +311,7 @@ where /// wins. fn register_callsite(&self, metadata: &'static Metadata<'static>) -> Interest { if !metadata.is_event() { + self.callsite_id(metadata.callsite()); // Must not be never because we wouldn't get trace and span data. return Interest::always(); } @@ -297,6 +344,26 @@ where } } +#[derive(Copy, Clone, Debug, Default)] +#[repr(transparent)] +struct CallsiteId(u32); + +impl CallsiteId { + #[inline] + fn next() -> Self { + // Start at 1 to reserve 0 for default. 
+ static COUNTER: AtomicU32 = AtomicU32::new(1); + CallsiteId(COUNTER.fetch_add(1, Ordering::Relaxed)) + } +} + +impl fmt::Display for CallsiteId { + #[inline] + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + self.0.fmt(f) + } +} + /// Stores span field values recorded during the spans lifetime. #[derive(Default)] struct SpanFields { @@ -448,12 +515,14 @@ impl EventFormatter { self.logline_buffer.clear(); } - fn format( + fn format( &mut self, now: DateTime, event: &Event<'_>, ctx: &Context<'_, S>, skipped_field_indices: &papaya::HashMap, + callsite_ids: &papaya::HashMap, + extract_fields: &IndexSet<&'static str>, ) -> io::Result<()> where S: Subscriber + for<'a> LookupSpan<'a>, @@ -485,6 +554,7 @@ impl EventFormatter { event.record(&mut message_extractor); let mut serializer = message_extractor.into_serializer()?; + // Direct message fields. let mut fields_present = FieldsPresent(false, skipped_field_indices); event.record(&mut fields_present); if fields_present.0 { @@ -494,7 +564,16 @@ impl EventFormatter { )?; } + let spans = SerializableSpans { + ctx, + callsite_ids, + extract: ExtractedSpanFields::<'_, F>::new(extract_fields), + }; + serializer.serialize_entry("spans", &spans)?; + + // TODO: thread-local cache? let pid = std::process::id(); + // Skip adding pid 1 to reduce noise for services running in containers. if pid != 1 { serializer.serialize_entry("process_id", &pid)?; } @@ -514,6 +593,7 @@ impl EventFormatter { serializer.serialize_entry("target", meta.target())?; + // Skip adding module if it's the same as target. if let Some(module) = meta.module_path() { if module != meta.target() { serializer.serialize_entry("module", module)?; @@ -540,7 +620,10 @@ impl EventFormatter { } } - serializer.serialize_entry("spans", &SerializableSpanStack(ctx))?; + if spans.extract.has_values() { + // TODO: add fields from event, too? + serializer.serialize_entry("extract", &spans.extract)?; + } serializer.end() }; @@ -818,15 +901,20 @@ impl tracing::field::Visit for MessageFieldSkipper< } } -/// Serializes the span stack from root to leaf (parent of event) enumerated -/// inside an object where the keys are just the number padded with zeroes -/// to retain sorting order. -// The object is necessary because Loki cannot flatten arrays. -struct SerializableSpanStack<'a, 'b, Span>(&'b Context<'a, Span>) +/// Serializes the span stack from root to leaf (parent of event) as object +/// with the span names as keys. To prevent collision we append a numberic value +/// to the name. Also, collects any span fields we're interested in. Last one +/// wins. 
+struct SerializableSpans<'a, 'ctx, Span, const F: usize> where - Span: Subscriber + for<'lookup> LookupSpan<'lookup>; + Span: Subscriber + for<'lookup> LookupSpan<'lookup>, +{ + ctx: &'a Context<'ctx, Span>, + callsite_ids: &'a papaya::HashMap, + extract: ExtractedSpanFields<'a, F>, +} -impl serde::ser::Serialize for SerializableSpanStack<'_, '_, Span> +impl serde::ser::Serialize for SerializableSpans<'_, '_, Span, F> where Span: Subscriber + for<'lookup> LookupSpan<'lookup>, { @@ -836,9 +924,24 @@ where { let mut serializer = serializer.serialize_map(None)?; - if let Some(leaf_span) = self.0.lookup_current() { - for (i, span) in leaf_span.scope().from_root().enumerate() { - serializer.serialize_entry(&format_args!("{i:02}"), &SerializableSpan(&span))?; + if let Some(leaf_span) = self.ctx.lookup_current() { + for span in leaf_span.scope().from_root() { + // Append a numeric callsite ID to the span name to keep the name unique + // in the JSON object. + let cid = self + .callsite_ids + .pin() + .get(&span.metadata().callsite()) + .copied() + .unwrap_or_default(); + + // Loki turns the # into an underscore during field name concatenation. + serializer.serialize_key(&format_args!("{}#{}", span.metadata().name(), &cid))?; + + serializer.serialize_value(&SerializableSpanFields { + span: &span, + extract: &self.extract, + })?; } } @@ -846,28 +949,79 @@ where } } -/// Serializes a single span. Include the span ID, name and its fields as -/// recorded up to this point. -struct SerializableSpan<'a, 'b, Span>(&'b SpanRef<'a, Span>) -where - Span: for<'lookup> LookupSpan<'lookup>; - -impl serde::ser::Serialize for SerializableSpan<'_, '_, Span> +/// Serializes the span fields as object. +struct SerializableSpanFields<'a, 'span, Span, const F: usize> where Span: for<'lookup> LookupSpan<'lookup>, { - fn serialize(&self, serializer: Ser) -> Result + span: &'a SpanRef<'span, Span>, + extract: &'a ExtractedSpanFields<'a, F>, +} + +impl serde::ser::Serialize for SerializableSpanFields<'_, '_, Span, F> +where + Span: for<'lookup> LookupSpan<'lookup>, +{ + fn serialize(&self, serializer: S) -> Result where - Ser: serde::ser::Serializer, + S: serde::ser::Serializer, { let mut serializer = serializer.serialize_map(None)?; - // TODO: the span ID is probably only useful for debugging tracing. - serializer.serialize_entry("span_id", &format_args!("{:016x}", self.0.id().into_u64()))?; - serializer.serialize_entry("span_name", self.0.metadata().name())?; - let ext = self.0.extensions(); + let ext = self.span.extensions(); if let Some(data) = ext.get::() { - for (key, value) in &data.fields.pin() { + for (name, value) in &data.fields.pin() { + serializer.serialize_entry(name, value)?; + // TODO: replace clone with reference, if possible. + self.extract.set(name, value.clone()); + } + } + + serializer.end() + } +} + +struct ExtractedSpanFields<'a, const F: usize> { + names: &'a IndexSet<&'static str>, + // TODO: replace TryLock with something local thread and interior mutability. + // serde API doesn't let us use `mut`. 
+ values: TryLock<([Option; F], bool)>, +} + +impl<'a, const F: usize> ExtractedSpanFields<'a, F> { + fn new(names: &'a IndexSet<&'static str>) -> Self { + ExtractedSpanFields { + names, + values: TryLock::new((array::from_fn(|_| Option::default()), false)), + } + } + + #[inline] + fn set(&self, name: &'static str, value: serde_json::Value) { + if let Some((index, _)) = self.names.get_full(name) { + let mut fields = self.values.try_lock().expect("thread-local use"); + fields.0[index] = Some(value); + fields.1 = true; + } + } + + #[inline] + fn has_values(&self) -> bool { + self.values.try_lock().expect("thread-local use").1 + } +} + +impl serde::ser::Serialize for ExtractedSpanFields<'_, F> { + fn serialize(&self, serializer: S) -> Result + where + S: serde::ser::Serializer, + { + let mut serializer = serializer.serialize_map(None)?; + + let values = self.values.try_lock().expect("thread-local use"); + for (i, value) in values.0.iter().enumerate() { + if let Some(value) = value { + let key = self.names[i]; serializer.serialize_entry(key, value)?; } } @@ -879,6 +1033,7 @@ where #[cfg(test)] #[allow(clippy::unwrap_used)] mod tests { + use std::marker::PhantomData; use std::sync::{Arc, Mutex, MutexGuard}; use assert_json_diff::assert_json_eq; @@ -927,14 +1082,17 @@ mod tests { let log_layer = JsonLoggingLayer { clock: clock.clone(), skipped_field_indices: papaya::HashMap::default(), + callsite_ids: papaya::HashMap::default(), writer: buffer.clone(), + extract_fields: IndexSet::from_iter(["x"]), + _marker: PhantomData::<[&'static str; 1]>, }; let registry = tracing_subscriber::Registry::default().with(log_layer); tracing::subscriber::with_default(registry, || { - info_span!("span1", x = 40, x = 41, x = 42).in_scope(|| { - info_span!("span2").in_scope(|| { + info_span!("some_span", x = 24).in_scope(|| { + info_span!("some_span", x = 40, x = 41, x = 42).in_scope(|| { tracing::error!( a = 1, a = 2, @@ -960,16 +1118,16 @@ mod tests { "a": 3, }, "spans": { - "00":{ - "span_id": "0000000000000001", - "span_name": "span1", - "x": 42, + "some_span#1":{ + "x": 24, }, - "01": { - "span_id": "0000000000000002", - "span_name": "span2", + "some_span#2": { + "x": 42, } }, + "extract": { + "x": 42, + }, "src": actual.as_object().unwrap().get("src").unwrap().as_str().unwrap(), "target": "proxy::logging::tests", "process_id": actual.as_object().unwrap().get("process_id").unwrap().as_number().unwrap(), diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index db1f096de1..29834760c0 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -394,21 +394,34 @@ pub enum RedisMsgKind { HDel, } -#[derive(Default)] -struct Accumulated { +#[derive(Default, Clone)] +pub struct LatencyAccumulated { cplane: time::Duration, client: time::Duration, compute: time::Duration, retry: time::Duration, } +impl std::fmt::Display for LatencyAccumulated { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "client: {}, cplane: {}, compute: {}, retry: {}", + self.client.as_micros(), + self.cplane.as_micros(), + self.compute.as_micros(), + self.retry.as_micros() + ) + } +} + pub struct LatencyTimer { // time since the stopwatch was started start: time::Instant, // time since the stopwatch was stopped stop: Option, // accumulated time on the stopwatch - accumulated: Accumulated, + accumulated: LatencyAccumulated, // label data protocol: Protocol, cold_start_info: ColdStartInfo, @@ -422,7 +435,7 @@ impl LatencyTimer { Self { start: time::Instant::now(), stop: None, - accumulated: 
Accumulated::default(), + accumulated: LatencyAccumulated::default(), protocol, cold_start_info: ColdStartInfo::Unknown, // assume failed unless otherwise specified @@ -435,7 +448,7 @@ impl LatencyTimer { Self { start: time::Instant::now(), stop: None, - accumulated: Accumulated::default(), + accumulated: LatencyAccumulated::default(), protocol, cold_start_info: ColdStartInfo::Unknown, // assume failed unless otherwise specified @@ -465,6 +478,10 @@ impl LatencyTimer { // success self.outcome = ConnectOutcome::Success; } + + pub fn accumulated(&self) -> LatencyAccumulated { + self.accumulated.clone() + } } #[derive(FixedCardinalityLabel, Clone, Copy, Debug)] @@ -511,7 +528,7 @@ impl Drop for LatencyTimer { duration.saturating_sub(accumulated_total).as_secs_f64(), ); - // Exclude client cplane, compue communication from the accumulated time. + // Exclude client, cplane, compute communication from the accumulated time. let accumulated_total = self.accumulated.client + self.accumulated.cplane + self.accumulated.compute; metric.observe( @@ -524,7 +541,7 @@ impl Drop for LatencyTimer { duration.saturating_sub(accumulated_total).as_secs_f64(), ); - // Exclude client cplane, compue, retry communication from the accumulated time. + // Exclude client, cplane, compute, retry communication from the accumulated time. let accumulated_total = self.accumulated.client + self.accumulated.cplane + self.accumulated.compute diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index b8b39fa121..e013fbbe2e 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -81,7 +81,10 @@ impl ConnectMechanism for TcpMechanism<'_> { type ConnectError = compute::ConnectionError; type Error = compute::ConnectionError; - #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)] + #[tracing::instrument(skip_all, fields( + pid = tracing::field::Empty, + compute_id = tracing::field::Empty + ))] async fn connect_once( &self, ctx: &RequestContext, diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index 171f539b1e..e0b7539538 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -555,6 +555,7 @@ fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeIn endpoint_id: (&EndpointId::from("endpoint")).into(), project_id: (&ProjectId::from("project")).into(), branch_id: (&BranchId::from("branch")).into(), + compute_id: "compute".into(), cold_start_info: crate::control_plane::messages::ColdStartInfo::Warm, }, }; diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 72029102e0..b55661cec8 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -1,4 +1,5 @@ use std::io; +use std::net::{IpAddr, SocketAddr}; use std::sync::Arc; use std::time::Duration; @@ -6,11 +7,15 @@ use async_trait::async_trait; use ed25519_dalek::SigningKey; use hyper_util::rt::{TokioExecutor, TokioIo, TokioTimer}; use jose_jwk::jose_b64; +use postgres_client::config::SslMode; use rand::rngs::OsRng; +use rustls::pki_types::{DnsName, ServerName}; use tokio::net::{TcpStream, lookup_host}; +use tokio_rustls::TlsConnector; use tracing::field::display; use tracing::{debug, info}; +use super::AsyncRW; use super::conn_pool::poll_client; use super::conn_pool_lib::{Client, ConnInfo, EndpointConnPool, GlobalConnPool}; use super::http_conn_pool::{self, HttpConnPool, Send, poll_http2_client}; @@ -190,7 +195,11 @@ impl PoolingBackend { // Wake up the destination if needed. 
Code here is a bit involved because // we reuse the code from the usual proxy and we need to prepare few structures // that this code expects. - #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)] + #[tracing::instrument(skip_all, fields( + pid = tracing::field::Empty, + compute_id = tracing::field::Empty, + conn_id = tracing::field::Empty, + ))] pub(crate) async fn connect_to_compute( &self, ctx: &RequestContext, @@ -229,7 +238,10 @@ impl PoolingBackend { } // Wake up the destination if needed - #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)] + #[tracing::instrument(skip_all, fields( + compute_id = tracing::field::Empty, + conn_id = tracing::field::Empty, + ))] pub(crate) async fn connect_to_local_proxy( &self, ctx: &RequestContext, @@ -276,7 +288,10 @@ impl PoolingBackend { /// # Panics /// /// Panics if called with a non-local_proxy backend. - #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)] + #[tracing::instrument(skip_all, fields( + pid = tracing::field::Empty, + conn_id = tracing::field::Empty, + ))] pub(crate) async fn connect_to_local_postgres( &self, ctx: &RequestContext, @@ -552,6 +567,10 @@ impl ConnectMechanism for TokioMechanism { let (client, connection) = permit.release_result(res)?; tracing::Span::current().record("pid", tracing::field::display(client.get_process_id())); + tracing::Span::current().record( + "compute_id", + tracing::field::display(&node_info.aux.compute_id), + ); Ok(poll_client( self.pool.clone(), ctx, @@ -587,16 +606,28 @@ impl ConnectMechanism for HyperMechanism { node_info: &CachedNodeInfo, config: &ComputeConfig, ) -> Result { + let host_addr = node_info.config.get_host_addr(); let host = node_info.config.get_host(); let permit = self.locks.get_permit(&host).await?; let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); + let tls = if node_info.config.get_ssl_mode() == SslMode::Disable { + None + } else { + Some(&config.tls) + }; + let port = node_info.config.get_port(); - let res = connect_http2(&host, port, config.timeout).await; + let res = connect_http2(host_addr, &host, port, config.timeout, tls).await; drop(pause); let (client, connection) = permit.release_result(res)?; + tracing::Span::current().record( + "compute_id", + tracing::field::display(&node_info.aux.compute_id), + ); + Ok(poll_http2_client( self.pool.clone(), ctx, @@ -612,18 +643,22 @@ impl ConnectMechanism for HyperMechanism { } async fn connect_http2( + host_addr: Option, host: &str, port: u16, timeout: Duration, + tls: Option<&Arc>, ) -> Result<(http_conn_pool::Send, http_conn_pool::Connect), LocalProxyConnError> { - // assumption: host is an ip address so this should not actually perform any requests. - // todo: add that assumption as a guarantee in the control-plane API. - let mut addrs = lookup_host((host, port)) - .await - .map_err(LocalProxyConnError::Io)?; - + let addrs = match host_addr { + Some(addr) => vec![SocketAddr::new(addr, port)], + None => lookup_host((host, port)) + .await + .map_err(LocalProxyConnError::Io)? + .collect(), + }; let mut last_err = None; + let mut addrs = addrs.into_iter(); let stream = loop { let Some(addr) = addrs.next() else { return Err(last_err.unwrap_or_else(|| { @@ -651,6 +686,20 @@ async fn connect_http2( } }; + let stream = if let Some(tls) = tls { + let host = DnsName::try_from(host) + .map_err(io::Error::other) + .map_err(LocalProxyConnError::Io)? 
+ .to_owned(); + let stream = TlsConnector::from(tls.clone()) + .connect(ServerName::DnsName(host), stream) + .await + .map_err(LocalProxyConnError::Io)?; + Box::pin(stream) as AsyncRW + } else { + Box::pin(stream) as AsyncRW + }; + let (client, connection) = hyper::client::conn::http2::Builder::new(TokioExecutor::new()) .timer(TokioTimer::new()) .keep_alive_interval(Duration::from_secs(20)) diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index 6a9089fc2a..516d474a11 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -221,6 +221,7 @@ mod tests { endpoint_id: (&EndpointId::from("endpoint")).into(), project_id: (&ProjectId::from("project")).into(), branch_id: (&BranchId::from("branch")).into(), + compute_id: "compute".into(), cold_start_info: crate::control_plane::messages::ColdStartInfo::Warm, }, conn_id: uuid::Uuid::new_v4(), diff --git a/proxy/src/serverless/http_conn_pool.rs b/proxy/src/serverless/http_conn_pool.rs index 338a79b4b3..bca2d4c165 100644 --- a/proxy/src/serverless/http_conn_pool.rs +++ b/proxy/src/serverless/http_conn_pool.rs @@ -6,9 +6,9 @@ use hyper::client::conn::http2; use hyper_util::rt::{TokioExecutor, TokioIo}; use parking_lot::RwLock; use smol_str::ToSmolStr; -use tokio::net::TcpStream; use tracing::{Instrument, debug, error, info, info_span}; +use super::AsyncRW; use super::backend::HttpConnError; use super::conn_pool_lib::{ ClientDataEnum, ClientInnerCommon, ClientInnerExt, ConnInfo, ConnPoolEntry, @@ -22,8 +22,7 @@ use crate::types::EndpointCacheKey; use crate::usage_metrics::{Ids, MetricCounter, TrafficDirection, USAGE_METRICS}; pub(crate) type Send = http2::SendRequest; -pub(crate) type Connect = - http2::Connection, hyper::body::Incoming, TokioExecutor>; +pub(crate) type Connect = http2::Connection, hyper::body::Incoming, TokioExecutor>; #[derive(Clone)] pub(crate) struct ClientDataHttp(); diff --git a/proxy/src/serverless/mod.rs b/proxy/src/serverless/mod.rs index dd0fb9c5b4..acd6a05718 100644 --- a/proxy/src/serverless/mod.rs +++ b/proxy/src/serverless/mod.rs @@ -438,6 +438,14 @@ async fn request_handler( &config.region, ); + ctx.set_user_agent( + request + .headers() + .get(hyper::header::USER_AGENT) + .and_then(|h| h.to_str().ok()) + .map(Into::into), + ); + let span = ctx.span(); info!(parent: &span, "performing websocket upgrade"); diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 8babfb5cd2..93dd531f70 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -228,6 +228,13 @@ fn get_conn_info( } } + ctx.set_user_agent( + headers + .get(hyper::header::USER_AGENT) + .and_then(|h| h.to_str().ok()) + .map(Into::into), + ); + let user_info = ComputeUserInfo { endpoint, user: username, diff --git a/proxy/src/tls/client_config.rs b/proxy/src/tls/client_config.rs index a2d695aae1..ce873e678e 100644 --- a/proxy/src/tls/client_config.rs +++ b/proxy/src/tls/client_config.rs @@ -1,17 +1,49 @@ +use std::env; +use std::io::Cursor; +use std::path::PathBuf; use std::sync::Arc; -use anyhow::bail; +use anyhow::{Context, bail}; use rustls::crypto::ring; -pub(crate) fn load_certs() -> anyhow::Result> { +/// We use an internal certificate authority when establishing a TLS connection with compute. 
+fn load_internal_certs(store: &mut rustls::RootCertStore) -> anyhow::Result<()> { + let Some(ca_file) = env::var_os("NEON_INTERNAL_CA_FILE") else { + return Ok(()); + }; + let ca_file = PathBuf::from(ca_file); + + let ca = std::fs::read(&ca_file) + .with_context(|| format!("could not read CA from {}", ca_file.display()))?; + + for cert in rustls_pemfile::certs(&mut Cursor::new(&*ca)) { + store + .add(cert.context("could not parse internal CA certificate")?) + .context("could not parse internal CA certificate")?; + } + + Ok(()) +} + +/// For console redirect proxy, we need to establish a connection to compute via pg-sni-router. +/// pg-sni-router needs TLS and uses a Let's Encrypt signed certificate, so we +/// load certificates from our native store. +fn load_native_certs(store: &mut rustls::RootCertStore) -> anyhow::Result<()> { let der_certs = rustls_native_certs::load_native_certs(); if !der_certs.errors.is_empty() { bail!("could not parse certificates: {:?}", der_certs.errors); } - let mut store = rustls::RootCertStore::empty(); store.add_parsable_certificates(der_certs.certs); + + Ok(()) +} + +fn load_compute_certs() -> anyhow::Result> { + let mut store = rustls::RootCertStore::empty(); + load_native_certs(&mut store)?; + load_internal_certs(&mut store)?; Ok(Arc::new(store)) } @@ -22,7 +54,7 @@ pub fn compute_client_config_with_root_certs() -> anyhow::Result = std::result::Result; @@ -65,11 +68,7 @@ impl ResponseErrorMessageExt for reqwest::Response { } impl Client { - pub fn new(mgmt_api_endpoint: String, jwt: Option) -> Self { - Self::from_client(reqwest::Client::new(), mgmt_api_endpoint, jwt) - } - - pub fn from_client( + pub fn new( client: reqwest::Client, mgmt_api_endpoint: String, jwt: Option, @@ -173,12 +172,10 @@ impl Client { uri: U, body: B, ) -> Result { - let req = self.client.request(method, uri); - let req = if let Some(value) = &self.authorization_header { - req.header(reqwest::header::AUTHORIZATION, value.get_contents()) - } else { - req - }; + let mut req = self.client.request(method, uri); + if let Some(value) = &self.authorization_header { + req = req.header(reqwest::header::AUTHORIZATION, value.get_contents()) + } req.json(&body).send().await.map_err(Error::ReceiveBody) } } diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index dd7008c87d..5ca3d1b7c2 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -23,7 +23,6 @@ use utils::postgres_client::PostgresClientProtocol; use utils::shard::{ShardCount, ShardNumber}; use crate::auth::check_permission; -use crate::json_ctrl::{AppendLogicalMessage, handle_json_ctrl}; use crate::metrics::{PG_QUERIES_GAUGE, TrafficMetrics}; use crate::timeline::TimelineError; use crate::{GlobalTimelines, SafeKeeperConf}; @@ -62,9 +61,6 @@ enum SafekeeperPostgresCommand { }, IdentifySystem, TimelineStatus, - JSONCtrl { - cmd: AppendLogicalMessage, - }, } fn parse_cmd(cmd: &str) -> anyhow::Result { @@ -134,11 +130,6 @@ fn parse_cmd(cmd: &str) -> anyhow::Result { Ok(SafekeeperPostgresCommand::IdentifySystem) } else if cmd.starts_with("TIMELINE_STATUS") { Ok(SafekeeperPostgresCommand::TimelineStatus) - } else if cmd.starts_with("JSON_CTRL") { - let cmd = cmd.strip_prefix("JSON_CTRL").context("invalid prefix")?; - Ok(SafekeeperPostgresCommand::JSONCtrl { - cmd: serde_json::from_str(cmd)?, - }) } else { anyhow::bail!("unsupported command {cmd}"); } @@ -150,7 +141,6 @@ fn cmd_to_string(cmd: &SafekeeperPostgresCommand) -> &str { SafekeeperPostgresCommand::StartReplication { .. 
} => "START_REPLICATION", SafekeeperPostgresCommand::TimelineStatus => "TIMELINE_STATUS", SafekeeperPostgresCommand::IdentifySystem => "IDENTIFY_SYSTEM", - SafekeeperPostgresCommand::JSONCtrl { .. } => "JSON_CTRL", } } @@ -359,9 +349,6 @@ impl postgres_backend::Handler } SafekeeperPostgresCommand::IdentifySystem => self.handle_identify_system(pgb).await, SafekeeperPostgresCommand::TimelineStatus => self.handle_timeline_status(pgb).await, - SafekeeperPostgresCommand::JSONCtrl { ref cmd } => { - handle_json_ctrl(self, pgb, cmd).await - } } }) } diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 3b3bc71ac4..4f47331c85 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -19,7 +19,7 @@ use safekeeper_api::models::{ AcceptorStateStatus, PullTimelineRequest, SafekeeperStatus, SkTimelineInfo, TermSwitchApiEntry, TimelineCopyRequest, TimelineCreateRequest, TimelineStatus, TimelineTermBumpRequest, }; -use safekeeper_api::{ServerInfo, models}; +use safekeeper_api::{ServerInfo, membership, models}; use storage_broker::proto::{SafekeeperTimelineInfo, TenantTimelineId as ProtoTenantTimelineId}; use tokio::sync::mpsc; use tokio::task; @@ -32,7 +32,7 @@ use utils::lsn::Lsn; use crate::debug_dump::TimelineDigestRequest; use crate::safekeeper::TermLsn; -use crate::timelines_global_map::TimelineDeleteForceResult; +use crate::timelines_global_map::{DeleteOrExclude, TimelineDeleteResult}; use crate::{ GlobalTimelines, SafeKeeperConf, copy_timeline, debug_dump, patch_control_file, pull_timeline, }; @@ -73,10 +73,13 @@ async fn tenant_delete_handler(mut request: Request) -> Result) -> Result>(), + .collect::>(), ) } @@ -208,12 +211,15 @@ async fn timeline_delete_handler(mut request: Request) -> Result) -> Result for ApiError { + fn from(de: DeleteOrExcludeError) -> ApiError { + match de { + DeleteOrExcludeError::Conflict { + requested: _, + current: _, + } => ApiError::Conflict(de.to_string()), + DeleteOrExcludeError::Other(e) => ApiError::InternalServerError(e), + } + } +} + +/// Remove timeline locally after this node has been excluded from the +/// membership configuration. The body is the same as in the membership endpoint +/// -- conf where node is excluded -- and in principle single ep could be used +/// for both actions, but since this is a data deletion op let's keep them +/// separate. +async fn timeline_exclude_handler(mut request: Request) -> Result, ApiError> { + let ttid = TenantTimelineId::new( + parse_request_param(&request, "tenant_id")?, + parse_request_param(&request, "timeline_id")?, + ); + check_permission(&request, Some(ttid.tenant_id))?; + + let global_timelines = get_global_timelines(&request); + let data: models::TimelineMembershipSwitchRequest = json_request(&mut request).await?; + let my_id = get_conf(&request).my_id; + // If request doesn't exclude us, membership switch endpoint should be used + // instead. + if data.mconf.contains(my_id) { + return Err(ApiError::Forbidden(format!( + "refused to switch into {}, node {} is member of it", + data.mconf, my_id + ))); + } + let action = DeleteOrExclude::Exclude(data.mconf); + + let resp = global_timelines + .delete_or_exclude(&ttid, action) + .await + .map_err(ApiError::from)?; + json_response(StatusCode::OK, resp) +} + /// Consider switching timeline membership configuration to the provided one. 
async fn timeline_membership_handler( mut request: Request, @@ -281,12 +345,29 @@ async fn timeline_membership_handler( let tli = global_timelines.get(ttid).map_err(ApiError::from)?; let data: models::TimelineMembershipSwitchRequest = json_request(&mut request).await?; + let my_id = get_conf(&request).my_id; + // If request excludes us, exclude endpoint should be used instead. + if !data.mconf.contains(my_id) { + return Err(ApiError::Forbidden(format!( + "refused to switch into {}, node {} is not a member of it", + data.mconf, my_id + ))); + } + let req_gen = data.mconf.generation; let response = tli .membership_switch(data.mconf) .await .map_err(ApiError::InternalServerError)?; - json_response(StatusCode::OK, response) + // Return 409 if request was ignored. + if req_gen == response.current_conf.generation { + json_response(StatusCode::OK, response) + } else { + Err(ApiError::Conflict(format!( + "request to switch into {} ignored, current generation {}", + req_gen, response.current_conf.generation + ))) + } } async fn timeline_copy_handler(mut request: Request) -> Result, ApiError> { @@ -637,11 +718,14 @@ pub fn make_router( .post("/v1/pull_timeline", |r| { request_span(r, timeline_pull_handler) }) + .put("/v1/tenant/:tenant_id/timeline/:timeline_id/exclude", |r| { + request_span(r, timeline_exclude_handler) + }) .get( "/v1/tenant/:tenant_id/timeline/:timeline_id/snapshot/:destination_id", |r| request_span(r, timeline_snapshot_handler), ) - .post( + .put( "/v1/tenant/:tenant_id/timeline/:timeline_id/membership", |r| request_span(r, timeline_membership_handler), ) diff --git a/safekeeper/src/json_ctrl.rs b/safekeeper/src/json_ctrl.rs deleted file mode 100644 index 793ea9c3e9..0000000000 --- a/safekeeper/src/json_ctrl.rs +++ /dev/null @@ -1,192 +0,0 @@ -//! -//! This module implements JSON_CTRL protocol, which allows exchange -//! JSON messages over psql for testing purposes. -//! -//! Currently supports AppendLogicalMessage, which is used for WAL -//! modifications in tests. -//! 
- -use anyhow::Context; -use postgres_backend::{PostgresBackend, QueryError}; -use postgres_ffi::{WAL_SEGMENT_SIZE, encode_logical_message}; -use pq_proto::{BeMessage, RowDescriptor, TEXT_OID}; -use safekeeper_api::membership::{Configuration, INVALID_GENERATION}; -use safekeeper_api::{ServerInfo, Term}; -use serde::{Deserialize, Serialize}; -use tokio::io::{AsyncRead, AsyncWrite}; -use tracing::*; -use utils::lsn::Lsn; - -use crate::handler::SafekeeperPostgresHandler; -use crate::safekeeper::{ - AcceptorProposerMessage, AppendRequest, AppendRequestHeader, AppendResponse, - ProposerAcceptorMessage, ProposerElected, TermHistory, TermLsn, -}; -use crate::state::TimelinePersistentState; -use crate::timeline::WalResidentTimeline; - -#[derive(Serialize, Deserialize, Debug)] -pub struct AppendLogicalMessage { - // prefix and message to build LogicalMessage - pub lm_prefix: String, - pub lm_message: String, - - // if true, commit_lsn will match flush_lsn after append - pub set_commit_lsn: bool, - - // if true, ProposerElected will be sent before append - pub send_proposer_elected: bool, - - // fields from AppendRequestHeader - pub term: Term, - #[serde(with = "utils::lsn::serde_as_u64")] - pub epoch_start_lsn: Lsn, - #[serde(with = "utils::lsn::serde_as_u64")] - pub begin_lsn: Lsn, - #[serde(with = "utils::lsn::serde_as_u64")] - pub truncate_lsn: Lsn, - pub pg_version: u32, -} - -#[derive(Debug, Serialize)] -struct AppendResult { - // safekeeper state after append - state: TimelinePersistentState, - // info about new record in the WAL - inserted_wal: InsertedWAL, -} - -/// Handles command to craft logical message WAL record with given -/// content, and then append it with specified term and lsn. This -/// function is used to test safekeepers in different scenarios. -pub async fn handle_json_ctrl( - spg: &SafekeeperPostgresHandler, - pgb: &mut PostgresBackend, - append_request: &AppendLogicalMessage, -) -> Result<(), QueryError> { - info!("JSON_CTRL request: {append_request:?}"); - - // need to init safekeeper state before AppendRequest - let tli = prepare_safekeeper(spg, append_request.pg_version).await?; - - // if send_proposer_elected is true, we need to update local history - if append_request.send_proposer_elected { - send_proposer_elected(&tli, append_request.term, append_request.epoch_start_lsn).await?; - } - - let inserted_wal = append_logical_message(&tli, append_request).await?; - let response = AppendResult { - state: tli.get_state().await.1, - inserted_wal, - }; - let response_data = serde_json::to_vec(&response) - .with_context(|| format!("Response {response:?} is not a json array"))?; - - pgb.write_message_noflush(&BeMessage::RowDescription(&[RowDescriptor { - name: b"json", - typoid: TEXT_OID, - typlen: -1, - ..Default::default() - }]))? - .write_message_noflush(&BeMessage::DataRow(&[Some(&response_data)]))? - .write_message_noflush(&BeMessage::CommandComplete(b"JSON_CTRL"))?; - Ok(()) -} - -/// Prepare safekeeper to process append requests without crashes, -/// by sending ProposerGreeting with default server.wal_seg_size. 
-async fn prepare_safekeeper( - spg: &SafekeeperPostgresHandler, - pg_version: u32, -) -> anyhow::Result { - let tli = spg - .global_timelines - .create( - spg.ttid, - Configuration::empty(), - ServerInfo { - pg_version, - wal_seg_size: WAL_SEGMENT_SIZE as u32, - system_id: 0, - }, - Lsn::INVALID, - Lsn::INVALID, - ) - .await?; - - tli.wal_residence_guard().await -} - -async fn send_proposer_elected( - tli: &WalResidentTimeline, - term: Term, - lsn: Lsn, -) -> anyhow::Result<()> { - // add new term to existing history - let history = tli.get_state().await.1.acceptor_state.term_history; - let history = history.up_to(lsn.checked_sub(1u64).unwrap()); - let mut history_entries = history.0; - history_entries.push(TermLsn { term, lsn }); - let history = TermHistory(history_entries); - - let proposer_elected_request = ProposerAcceptorMessage::Elected(ProposerElected { - generation: INVALID_GENERATION, - term, - start_streaming_at: lsn, - term_history: history, - }); - - tli.process_msg(&proposer_elected_request).await?; - Ok(()) -} - -#[derive(Debug, Serialize)] -pub struct InsertedWAL { - begin_lsn: Lsn, - pub end_lsn: Lsn, - append_response: AppendResponse, -} - -/// Extend local WAL with new LogicalMessage record. To do that, -/// create AppendRequest with new WAL and pass it to safekeeper. -pub async fn append_logical_message( - tli: &WalResidentTimeline, - msg: &AppendLogicalMessage, -) -> anyhow::Result { - let wal_data = encode_logical_message(&msg.lm_prefix, &msg.lm_message); - let sk_state = tli.get_state().await.1; - - let begin_lsn = msg.begin_lsn; - let end_lsn = begin_lsn + wal_data.len() as u64; - - let commit_lsn = if msg.set_commit_lsn { - end_lsn - } else { - sk_state.commit_lsn - }; - - let append_request = ProposerAcceptorMessage::AppendRequest(AppendRequest { - h: AppendRequestHeader { - generation: INVALID_GENERATION, - term: msg.term, - begin_lsn, - end_lsn, - commit_lsn, - truncate_lsn: msg.truncate_lsn, - }, - wal_data, - }); - - let response = tli.process_msg(&append_request).await?; - - let append_response = match response { - Some(AcceptorProposerMessage::AppendResponse(resp)) => resp, - _ => anyhow::bail!("not AppendResponse"), - }; - - Ok(InsertedWAL { - begin_lsn, - end_lsn, - append_response, - }) -} diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index c52b097066..de3b783508 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -21,7 +21,6 @@ pub mod copy_timeline; pub mod debug_dump; pub mod handler; pub mod http; -pub mod json_ctrl; pub mod metrics; pub mod patch_control_file; pub mod pull_timeline; diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs index fc58b8509a..7d6ce1269c 100644 --- a/safekeeper/src/pull_timeline.rs +++ b/safekeeper/src/pull_timeline.rs @@ -402,12 +402,16 @@ pub async fn handle_request( bail!("Timeline {} already exists", request.timeline_id); } + // TODO(DimasKovas): add ssl root CA certificate when implementing safekeeper's + // part of https support (#24836). + let http_client = reqwest::Client::new(); + let http_hosts = request.http_hosts.clone(); // Figure out statuses of potential donors. 
let responses: Vec> = futures::future::join_all(http_hosts.iter().map(|url| async { - let cclient = Client::new(url.clone(), sk_auth_token.clone()); + let cclient = Client::new(http_client.clone(), url.clone(), sk_auth_token.clone()); let info = cclient .timeline_status(request.tenant_id, request.timeline_id) .await?; @@ -460,8 +464,10 @@ async fn pull_timeline( let conf = &global_timelines.get_global_config(); let (_tmp_dir, tli_dir_path) = create_temp_timeline_dir(conf, ttid).await?; - - let client = Client::new(host.clone(), sk_auth_token.clone()); + // TODO(DimasKovas): add ssl root CA certificate when implementing safekeeper's + // part of https support (#24836). + let http_client = reqwest::Client::new(); + let client = Client::new(http_client, host.clone(), sk_auth_token.clone()); // Request stream with basebackup archive. let bb_resp = client .snapshot(status.tenant_id, status.timeline_id, conf.my_id) diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 0edac04b97..886cac869d 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -279,7 +279,7 @@ pub struct VoteResponse { * Proposer -> Acceptor message announcing proposer is elected and communicating * term history to it. */ -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct ProposerElected { pub generation: Generation, // membership conf generation pub term: Term, @@ -1013,6 +1013,9 @@ where self.state.finish_change(&state).await?; } + // Switch into conf given by proposer conf if it is higher. + self.state.membership_switch(msg.mconf.clone()).await?; + let apg = AcceptorGreeting { node_id: self.node_id, mconf: self.state.mconf.clone(), @@ -1030,16 +1033,18 @@ where &mut self, msg: &VoteRequest, ) -> Result> { + if self.state.mconf.generation != msg.generation { + bail!( + "refusing {:?} due to generation mismatch: sk generation {}", + msg, + self.state.mconf.generation + ); + } // Once voted, we won't accept data from older proposers; flush // everything we've already received so that new proposer starts - // streaming at end of our WAL, without overlap. Currently we truncate - // WAL at streaming point, so this avoids truncating already committed - // WAL. - // - // TODO: it would be smoother to not truncate committed piece at - // handle_elected instead. Currently not a big deal, as proposer is the - // only source of WAL; with peer2peer recovery it would be more - // important. + // streaming at end of our WAL, without overlap. WAL is truncated at + // streaming point and commit_lsn may be advanced from peers, so this + // also avoids possible spurious attempt to truncate committed WAL. self.wal_store.flush_wal().await?; // initialize with refusal let mut resp = VoteResponse { @@ -1093,6 +1098,13 @@ where self.get_last_log_term(), self.flush_lsn() ); + if self.state.mconf.generation != msg.generation { + bail!( + "refusing {:?} due to generation mismatch: sk generation {}", + msg, + self.state.mconf.generation + ); + } if self.state.acceptor_state.term < msg.term { let mut state = self.state.start_change(); state.acceptor_state.term = msg.term; @@ -1263,11 +1275,24 @@ where msg: &AppendRequest, require_flush: bool, ) -> Result> { + // Refuse message on generation mismatch. On reconnect wp will get full + // configuration from greeting. 
+ if self.state.mconf.generation != msg.h.generation { + bail!( + "refusing append request due to generation mismatch: request {}, sk {}", + msg.h.generation, + self.state.mconf.generation + ); + } + if self.state.acceptor_state.term < msg.h.term { bail!("got AppendRequest before ProposerElected"); } - // If our term is higher, immediately refuse the message. + // If our term is higher, immediately refuse the message. Send term only + // response; elected walproposer can never advance the term, so it will + // figure out the refusal from it -- which is important as term change + // should cause not just reconnection but whole walproposer re-election. if self.state.acceptor_state.term > msg.h.term { let resp = AppendResponse::term_only( self.state.mconf.generation, @@ -1468,6 +1493,13 @@ mod tests { let wal_store = DummyWalStore { lsn: Lsn(0) }; let mut sk = SafeKeeper::new(TimelineState::new(storage), wal_store, NodeId(0)).unwrap(); + // Vote with generation mismatch should be rejected. + let gen_mismatch_vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { + generation: SafekeeperGeneration::new(42), + term: 1, + }); + assert!(sk.process_msg(&gen_mismatch_vote_request).await.is_err()); + // check voting for 1 is ok let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { generation: Generation::new(0), @@ -1532,6 +1564,16 @@ mod tests { }, ]), }; + + // check that elected msg with generation mismatch is rejected + let mut pem_gen_mismatch = pem.clone(); + pem_gen_mismatch.generation = SafekeeperGeneration::new(42); + assert!( + sk.process_msg(&ProposerAcceptorMessage::Elected(pem_gen_mismatch)) + .await + .is_err() + ); + sk.process_msg(&ProposerAcceptorMessage::Elected(pem)) .await .unwrap(); @@ -1590,6 +1632,21 @@ mod tests { wal_data: Bytes::from_static(b"b"), }; + // check that append request with generation mismatch is rejected + let mut ar_hdr_gen_mismatch = ar_hdr.clone(); + ar_hdr_gen_mismatch.generation = SafekeeperGeneration::new(42); + let append_request_gen_mismatch = AppendRequest { + h: ar_hdr_gen_mismatch, + wal_data: Bytes::from_static(b"b"), + }; + assert!( + sk.process_msg(&ProposerAcceptorMessage::AppendRequest( + append_request_gen_mismatch + )) + .await + .is_err() + ); + // do write ending at 2, it should be ok sk.process_msg(&ProposerAcceptorMessage::AppendRequest(append_request)) .await diff --git a/safekeeper/src/send_interpreted_wal.rs b/safekeeper/src/send_interpreted_wal.rs index be0c849a5f..2b1fd7b854 100644 --- a/safekeeper/src/send_interpreted_wal.rs +++ b/safekeeper/src/send_interpreted_wal.rs @@ -184,6 +184,16 @@ impl InterpretedWalReaderState { to: *current_position, } } else { + // Edge case: The new shard is at the same current position as + // the reader. Note that the current position is WAL record aligned, + // so the reader might have done some partial reads and updated the + // batch start. If that's the case, adjust the batch start to match + // starting position of the new shard. It can lead to some shards + // seeing overlaps, but in that case the actual record LSNs are checked + // which should be fine based on the filtering logic. 
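That edge case reduces to clamping the in-progress batch start to the earliest position any attached shard still needs. In isolation, treating positions as plain integers for the sake of the sketch:

// Pull the batch start back so a shard attaching slightly earlier than the
// reader's current position misses nothing; other shards may then see a small
// overlap, which the per-record LSN filtering already tolerates.
fn adjust_batch_start(current_batch_wal_start: &mut Option<u64>, new_shard_start_pos: u64) {
    if let Some(start) = current_batch_wal_start {
        *start = std::cmp::min(*start, new_shard_start_pos);
    }
}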
+ if let Some(start) = current_batch_wal_start { + *start = std::cmp::min(*start, new_shard_start_pos); + } CurrentPositionUpdate::NotReset(*current_position) } } @@ -209,12 +219,12 @@ impl InterpretedWalReaderState { } } - fn take_current_batch_wal_start(&mut self) -> Lsn { + fn replace_current_batch_wal_start(&mut self, with: Lsn) -> Lsn { match self { InterpretedWalReaderState::Running { current_batch_wal_start, .. - } => current_batch_wal_start.take().unwrap(), + } => current_batch_wal_start.replace(with).unwrap(), InterpretedWalReaderState::Done => { panic!("take_current_batch_wal_start called on finished reader") } @@ -287,7 +297,13 @@ impl InterpretedWalReader { reader .run_impl(start_pos) .await - .inspect_err(|err| critical!("failed to read WAL record: {err:?}")) + .inspect_err(|err| match err { + // TODO: we may want to differentiate these errors further. + InterpretedWalReaderError::Decode(_) => { + critical!("failed to decode WAL record: {err:?}"); + } + err => error!("failed to read WAL record: {err}"), + }) } .instrument(info_span!("interpreted wal reader")), ); @@ -347,10 +363,12 @@ impl InterpretedWalReader { metric.dec(); } - if let Err(err) = self.run_impl(start_pos).await { - critical!("failed to read WAL record: {err:?}"); - } else { - info!("interpreted wal reader exiting"); + match self.run_impl(start_pos).await { + Err(err @ InterpretedWalReaderError::Decode(_)) => { + critical!("failed to decode WAL record: {err:?}"); + } + Err(err) => error!("failed to read WAL record: {err}"), + Ok(()) => info!("interpreted wal reader exiting"), } Err(CopyStreamHandlerEnd::Other(anyhow!( @@ -398,10 +416,12 @@ impl InterpretedWalReader { let shard_ids = self.shard_senders.keys().copied().collect::>(); let mut records_by_sender: HashMap> = HashMap::new(); let mut max_next_record_lsn = None; + let mut max_end_record_lsn = None; while let Some((next_record_lsn, recdata)) = wal_decoder.poll_decode()? { assert!(next_record_lsn.is_aligned()); max_next_record_lsn = Some(next_record_lsn); + max_end_record_lsn = Some(wal_decoder.lsn()); let interpreted = InterpretedWalRecord::from_bytes_filtered( recdata, @@ -412,7 +432,10 @@ impl InterpretedWalReader { .with_context(|| "Failed to interpret WAL")?; for (shard, record) in interpreted { - if record.is_empty() { + // Shard zero needs to track the start LSN of the latest record + // in adition to the LSN of the next record to ingest. The former + // is included in basebackup persisted by the compute in WAL. + if !shard.is_shard_zero() && record.is_empty() { continue; } @@ -449,7 +472,7 @@ impl InterpretedWalReader { let batch_wal_start_lsn = { let mut guard = self.state.write().unwrap(); guard.update_current_position(max_next_record_lsn); - guard.take_current_batch_wal_start() + guard.replace_current_batch_wal_start(max_end_record_lsn.unwrap()) }; // Send interpreted records downstream. 
Anything that has already been seen @@ -722,7 +745,7 @@ mod tests { .unwrap(); let resident_tli = tli.wal_residence_guard().await.unwrap(); - let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT, None) + let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT, c"neon-file:", None) .await .unwrap(); let end_pos = end_watch.get(); @@ -865,10 +888,16 @@ mod tests { let resident_tli = tli.wal_residence_guard().await.unwrap(); let mut next_record_lsns = Vec::default(); - let end_watch = - Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT, Some(&mut next_record_lsns)) - .await - .unwrap(); + let end_watch = Env::write_wal( + tli, + start_lsn, + SIZE, + MSG_COUNT, + c"neon-file:", + Some(&mut next_record_lsns), + ) + .await + .unwrap(); let end_pos = end_watch.get(); let streaming_wal_reader = StreamingWalReader::new( @@ -1001,17 +1030,28 @@ mod tests { const WAL_READER_BATCH_SIZE: usize = 8192; let start_lsn = Lsn::from_str("0/149FD18").unwrap(); - let shard_0_start_lsn = Lsn::from_str("0/14AFE10").unwrap(); let env = Env::new(true).unwrap(); + let mut next_record_lsns = Vec::default(); let tli = env .make_timeline(NodeId(1), TenantTimelineId::generate(), start_lsn) .await .unwrap(); let resident_tli = tli.wal_residence_guard().await.unwrap(); - let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT, None) - .await - .unwrap(); + let end_watch = Env::write_wal( + tli, + start_lsn, + SIZE, + MSG_COUNT, + c"neon-file:", + Some(&mut next_record_lsns), + ) + .await + .unwrap(); + + assert!(next_record_lsns.len() > 3); + let shard_0_start_lsn = next_record_lsns[3]; + let end_pos = end_watch.get(); let streaming_wal_reader = StreamingWalReader::new( @@ -1064,7 +1104,7 @@ mod tests { ); let reader_state = reader.state(); - let mut reader_fut = std::pin::pin!(reader.run(start_lsn, &None)); + let mut reader_fut = std::pin::pin!(reader.run(shard_0_start_lsn, &None)); loop { let poll = futures::poll!(reader_fut.as_mut()); assert!(poll.is_pending()); @@ -1101,4 +1141,88 @@ mod tests { } } } + + #[tokio::test] + async fn test_shard_zero_does_not_skip_empty_records() { + let _ = env_logger::builder().is_test(true).try_init(); + + const SIZE: usize = 8 * 1024; + const MSG_COUNT: usize = 10; + const PG_VERSION: u32 = 17; + + let start_lsn = Lsn::from_str("0/149FD18").unwrap(); + let env = Env::new(true).unwrap(); + let tli = env + .make_timeline(NodeId(1), TenantTimelineId::generate(), start_lsn) + .await + .unwrap(); + + let resident_tli = tli.wal_residence_guard().await.unwrap(); + let mut next_record_lsns = Vec::new(); + let end_watch = Env::write_wal( + tli, + start_lsn, + SIZE, + MSG_COUNT, + // This is a logical message prefix that is not persisted to key value storage. + // We use it in order to validate that shard zero receives emtpy interpreted records. 
+ c"test:", + Some(&mut next_record_lsns), + ) + .await + .unwrap(); + let end_pos = end_watch.get(); + + let streaming_wal_reader = StreamingWalReader::new( + resident_tli, + None, + start_lsn, + end_pos, + end_watch, + MAX_SEND_SIZE, + ); + + let shard = ShardIdentity::unsharded(); + let (records_tx, mut records_rx) = tokio::sync::mpsc::channel::(MSG_COUNT * 2); + + let handle = InterpretedWalReader::spawn( + streaming_wal_reader, + start_lsn, + records_tx, + shard, + PG_VERSION, + &Some("pageserver".to_string()), + ); + + let mut interpreted_records = Vec::new(); + while let Some(batch) = records_rx.recv().await { + interpreted_records.push(batch.records); + if batch.wal_end_lsn == batch.available_wal_end_lsn { + break; + } + } + + let received_next_record_lsns = interpreted_records + .into_iter() + .flat_map(|b| b.records) + .map(|rec| rec.next_record_lsn) + .collect::>(); + + // By default this also includes the start LSN. Trim it since it shouldn't be received. + let next_record_lsns = next_record_lsns.into_iter().skip(1).collect::>(); + + assert_eq!(received_next_record_lsns, next_record_lsns); + + handle.abort(); + let mut done = false; + for _ in 0..5 { + if handle.current_position().is_none() { + done = true; + break; + } + tokio::time::sleep(Duration::from_millis(1)).await; + } + + assert!(done); + } } diff --git a/safekeeper/src/state.rs b/safekeeper/src/state.rs index e437e6d2cd..7533005c35 100644 --- a/safekeeper/src/state.rs +++ b/safekeeper/src/state.rs @@ -268,7 +268,7 @@ where // Is switch allowed? if to.generation <= self.mconf.generation { info!( - "ignoring request to switch membership conf to lower {}, current conf {}", + "ignoring request to switch membership conf to {}, current conf {}", to, self.mconf ); } else { diff --git a/safekeeper/src/test_utils.rs b/safekeeper/src/test_utils.rs index e6f74185c1..618e2b59d2 100644 --- a/safekeeper/src/test_utils.rs +++ b/safekeeper/src/test_utils.rs @@ -1,3 +1,4 @@ +use std::ffi::CStr; use std::sync::Arc; use camino_tempfile::Utf8TempDir; @@ -124,6 +125,7 @@ impl Env { start_lsn: Lsn, msg_size: usize, msg_count: usize, + prefix: &CStr, mut next_record_lsns: Option<&mut Vec>, ) -> anyhow::Result { let (msg_tx, msg_rx) = tokio::sync::mpsc::channel(receive_wal::MSG_QUEUE_SIZE); @@ -133,7 +135,6 @@ impl Env { WalAcceptor::spawn(tli.wal_residence_guard().await?, msg_rx, reply_tx, Some(0)); - let prefix = c"neon-file:"; let prefixlen = prefix.to_bytes_with_nul().len(); assert!(msg_size >= prefixlen); let message = vec![0; msg_size - prefixlen]; diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index c140f16ced..d3c841ec09 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -415,6 +415,9 @@ impl From for ApiError { } } +/// We run remote deletion in a background task, this is how it sends its results back. +type RemoteDeletionReceiver = tokio::sync::watch::Receiver>>; + /// Timeline struct manages lifecycle (creation, deletion, restore) of a safekeeper timeline. /// It also holds SharedState and provides mutually exclusive access to it. pub struct Timeline { @@ -446,6 +449,8 @@ pub struct Timeline { manager_ctl: ManagerCtl, conf: Arc, + remote_deletion: std::sync::Mutex>, + /// Hold this gate from code that depends on the Timeline's non-shut-down state. 
While holding /// this gate, you must respect [`Timeline::cancel`] pub(crate) gate: Gate, @@ -494,6 +499,7 @@ impl Timeline { walreceivers, gate: Default::default(), cancel: CancellationToken::default(), + remote_deletion: std::sync::Mutex::new(None), manager_ctl: ManagerCtl::new(), conf, broker_active: AtomicBool::new(false), @@ -558,11 +564,18 @@ impl Timeline { }); } - /// Background timeline activities (which hold Timeline::gate) will no - /// longer run once this function completes. - pub async fn shutdown(&self) { + /// Cancel the timeline, requesting background activity to stop. Closing + /// the `self.gate` waits for that. + pub async fn cancel(&self) { info!("timeline {} shutting down", self.ttid); self.cancel.cancel(); + } + + /// Background timeline activities (which hold Timeline::gate) will no + /// longer run once this function completes. `Self::cancel` must have been + /// already called. + pub async fn close(&self) { + assert!(self.cancel.is_cancelled()); // Wait for any concurrent tasks to stop using this timeline, to avoid e.g. attempts // to read deleted files. @@ -574,13 +587,13 @@ impl Timeline { /// Also deletes WAL in s3. Might fail if e.g. s3 is unavailable, but /// deletion API endpoint is retriable. /// - /// Timeline must be in shut-down state (i.e. call [`Self::shutdown`] first) + /// Timeline must be in shut-down state (i.e. call [`Self::close`] first) pub async fn delete( &self, shared_state: &mut WriteGuardSharedState<'_>, only_local: bool, ) -> Result { - // Assert that [`Self::shutdown`] was already called + // Assert that [`Self::close`] was already called assert!(self.cancel.is_cancelled()); assert!(self.gate.close_complete()); @@ -591,15 +604,95 @@ impl Timeline { shared_state.sk.close_wal_store(); if !only_local && self.conf.is_wal_backup_enabled() { - // Note: we concurrently delete remote storage data from multiple - // safekeepers. That's ok, s3 replies 200 if object doesn't exist and we - // do some retries anyway. - wal_backup::delete_timeline(&self.ttid).await?; + self.remote_delete().await?; } let dir_existed = delete_dir(&self.timeline_dir).await?; Ok(dir_existed) } + /// Delete timeline content from remote storage. If the returned future is dropped, + /// deletion will continue in the background. + /// + /// This function ordinarily spawns a task and stashes a result receiver into [`Self::remote_deletion`]. If + /// deletion is already happening, it may simply wait for an existing task's result. + /// + /// Note: we concurrently delete remote storage data from multiple + /// safekeepers. That's ok, s3 replies 200 if object doesn't exist and we + /// do some retries anyway. + async fn remote_delete(&self) -> Result<()> { + // We will start a background task to do the deletion, so that it proceeds even if our + // API request is dropped. Future requests will see the existing deletion task and wait + // for it to complete. + let mut result_rx = { + let mut remote_deletion_state = self.remote_deletion.lock().unwrap(); + let result_rx = if let Some(result_rx) = remote_deletion_state.as_ref() { + if let Some(result) = result_rx.borrow().as_ref() { + if let Err(e) = result { + // A previous remote deletion failed: we will start a new one + tracing::error!("remote deletion failed, will retry ({e})"); + None + } else { + // A previous remote deletion call already succeeded + return Ok(()); + } + } else { + // Remote deletion is still in flight + Some(result_rx.clone()) + } + } else { + // Remote deletion was not attempted yet, start it now. 
+ None + }; + + match result_rx { + Some(result_rx) => result_rx, + None => self.start_remote_delete(&mut remote_deletion_state), + } + }; + + // Wait for a result + let Ok(result) = result_rx.wait_for(|v| v.is_some()).await else { + // Unexpected: sender should always send a result before dropping the channel, even if it has an error + return Err(anyhow::anyhow!( + "remote deletion task future was dropped without sending a result" + )); + }; + + result + .as_ref() + .expect("We did a wait_for on this being Some above") + .as_ref() + .map(|_| ()) + .map_err(|e| anyhow::anyhow!("remote deletion failed: {e}")) + } + + /// Spawn background task to do remote deletion, return a receiver for its outcome + fn start_remote_delete( + &self, + guard: &mut std::sync::MutexGuard>, + ) -> RemoteDeletionReceiver { + tracing::info!("starting remote deletion"); + let (result_tx, result_rx) = tokio::sync::watch::channel(None); + let ttid = self.ttid; + tokio::task::spawn( + async move { + let r = wal_backup::delete_timeline(&ttid).await; + if let Err(e) = &r { + // Log error here in case nobody ever listens for our result (e.g. dropped API request) + tracing::error!("remote deletion failed: {e}"); + } + + // Ignore send results: it's legal for the Timeline to give up waiting for us. + let _ = result_tx.send(Some(r)); + } + .instrument(info_span!("remote_delete", timeline = %self.ttid)), + ); + + **guard = Some(result_rx.clone()); + + result_rx + } + /// Returns if timeline is cancelled. pub fn is_cancelled(&self) -> bool { self.cancel.is_cancelled() @@ -1106,7 +1199,7 @@ impl ManagerTimeline { } /// Deletes directory and it's contents. Returns false if directory does not exist. -async fn delete_dir(path: &Utf8PathBuf) -> Result { +pub async fn delete_dir(path: &Utf8PathBuf) -> Result { match fs::remove_dir_all(path).await { Ok(_) => Ok(true), Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(false), diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs index 1d29030711..858dfce807 100644 --- a/safekeeper/src/timelines_global_map.rs +++ b/safekeeper/src/timelines_global_map.rs @@ -4,16 +4,15 @@ use std::collections::HashMap; use std::str::FromStr; -use std::sync::atomic::Ordering; use std::sync::{Arc, Mutex}; use std::time::{Duration, Instant}; use anyhow::{Context, Result, bail}; use camino::Utf8PathBuf; use camino_tempfile::Utf8TempDir; -use safekeeper_api::ServerInfo; use safekeeper_api::membership::Configuration; use safekeeper_api::models::SafekeeperUtilization; +use safekeeper_api::{ServerInfo, membership}; use serde::Serialize; use tokio::fs; use tracing::*; @@ -22,9 +21,10 @@ use utils::id::{TenantId, TenantTimelineId, TimelineId}; use utils::lsn::Lsn; use crate::defaults::DEFAULT_EVICTION_CONCURRENCY; +use crate::http::routes::DeleteOrExcludeError; use crate::rate_limit::RateLimiter; use crate::state::TimelinePersistentState; -use crate::timeline::{Timeline, TimelineError, get_tenant_dir, get_timeline_dir}; +use crate::timeline::{Timeline, TimelineError, delete_dir, get_tenant_dir, get_timeline_dir}; use crate::timelines_set::TimelinesSet; use crate::wal_storage::Storage; use crate::{SafeKeeperConf, control_file, wal_storage}; @@ -448,23 +448,20 @@ impl GlobalTimelines { .collect() } - /// Cancels timeline, then deletes the corresponding data directory. - /// If only_local, doesn't remove WAL segments in remote storage. 
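The remote-deletion flow above is built around a `tokio::sync::watch` channel: the deletion runs as a detached task, and any number of callers, including retries after a dropped HTTP request, wait on the same published outcome. A reduced sketch of that shape, assuming a running tokio runtime and using `String` as a stand-in error type:

use tokio::sync::watch;

type DeletionOutcome = Option<Result<(), String>>;

fn start_background_delete() -> watch::Receiver<DeletionOutcome> {
    let (tx, rx) = watch::channel(None);
    tokio::spawn(async move {
        // Stand-in for the real remote deletion work.
        let result: Result<(), String> = Ok(());
        // Ignore send errors: it is legal for every receiver to have given up waiting.
        let _ = tx.send(Some(result));
    });
    rx
}

async fn wait_for_delete(mut rx: watch::Receiver<DeletionOutcome>) -> anyhow::Result<()> {
    // Resolves once the background task has published Some(result).
    let guard = rx.wait_for(|v| v.is_some()).await?;
    match &*guard {
        Some(Ok(())) => Ok(()),
        Some(Err(e)) => Err(anyhow::anyhow!("remote deletion failed: {e}")),
        None => unreachable!("wait_for only returns once the value is Some"),
    }
}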
- pub(crate) async fn delete( + /// Delete timeline, only locally on this node or globally (also cleaning + /// remote storage WAL), depending on `action` value. + pub(crate) async fn delete_or_exclude( &self, ttid: &TenantTimelineId, - only_local: bool, - ) -> Result { + action: DeleteOrExclude, + ) -> Result { let tli_res = { let state = self.state.lock().unwrap(); if state.tombstones.contains_key(ttid) { // Presence of a tombstone guarantees that a previous deletion has completed and there is no work to do. info!("Timeline {ttid} was already deleted"); - return Ok(TimelineDeleteForceResult { - dir_existed: false, - was_active: false, - }); + return Ok(TimelineDeleteResult { dir_existed: false }); } state.get(ttid) @@ -472,32 +469,47 @@ impl GlobalTimelines { let result = match tli_res { Ok(timeline) => { - let was_active = timeline.broker_active.load(Ordering::Relaxed); + info!("deleting timeline {}, action={:?}", ttid, action); - info!("deleting timeline {}, only_local={}", ttid, only_local); - timeline.shutdown().await; + // If node is getting excluded, check the generation first. + // Then, while holding the lock cancel the timeline; it will be + // unusable after this point, and if node is added back first + // deletion must be completed and node seeded anew. + // + // We would like to avoid holding the lock while waiting for the + // gate to finish as this is deadlock prone, so for actual + // deletion will take it second time. + if let DeleteOrExclude::Exclude(ref mconf) = action { + let shared_state = timeline.read_shared_state().await; + if shared_state.sk.state().mconf.generation > mconf.generation { + return Err(DeleteOrExcludeError::Conflict { + requested: mconf.clone(), + current: shared_state.sk.state().mconf.clone(), + }); + } + timeline.cancel().await; + } else { + timeline.cancel().await; + } + + timeline.close().await; info!("timeline {ttid} shut down for deletion"); // Take a lock and finish the deletion holding this mutex. let mut shared_state = timeline.write_shared_state().await; + let only_local = !matches!(action, DeleteOrExclude::Delete); let dir_existed = timeline.delete(&mut shared_state, only_local).await?; - Ok(TimelineDeleteForceResult { - dir_existed, - was_active, // TODO: we probably should remove this field - }) + Ok(TimelineDeleteResult { dir_existed }) } Err(_) => { // Timeline is not memory, but it may still exist on disk in broken state. let dir_path = get_timeline_dir(self.state.lock().unwrap().conf.as_ref(), ttid); - let dir_existed = delete_dir(dir_path)?; + let dir_existed = delete_dir(&dir_path).await?; - Ok(TimelineDeleteForceResult { - dir_existed, - was_active: false, - }) + Ok(TimelineDeleteResult { dir_existed }) } }; @@ -515,11 +527,11 @@ impl GlobalTimelines { /// retry tenant deletion again later. /// /// If only_local, doesn't remove WAL segments in remote storage. - pub async fn delete_force_all_for_tenant( + pub async fn delete_all_for_tenant( &self, tenant_id: &TenantId, - only_local: bool, - ) -> Result> { + action: DeleteOrExclude, + ) -> Result> { info!("deleting all timelines for tenant {}", tenant_id); let to_delete = self.get_all_for_tenant(*tenant_id); @@ -527,7 +539,7 @@ impl GlobalTimelines { let mut deleted = HashMap::new(); for tli in &to_delete { - match self.delete(&tli.ttid, only_local).await { + match self.delete_or_exclude(&tli.ttid, action.clone()).await { Ok(result) => { deleted.insert(tli.ttid, result); } @@ -541,17 +553,15 @@ impl GlobalTimelines { // If there was an error, return it. 
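`delete_or_exclude` above replaces the old `only_local` flag with an action enum: only a global `Delete` removes WAL from remote storage, while `DeleteLocal` and `Exclude` touch this node alone, and `Exclude` is additionally refused if it carries an older membership configuration than the timeline currently holds. A reduced sketch of that decision logic, mirroring the `DeleteOrExclude` enum defined further down in this diff but with a stand-in configuration type that carries only a generation number:

#[derive(Clone, Debug)]
struct Configuration {
    generation: u32,
}

#[derive(Clone, Debug)]
enum DeleteOrExclude {
    Delete,
    DeleteLocal,
    Exclude(Configuration),
}

fn plan_deletion(action: &DeleteOrExclude, current_generation: u32) -> Result<bool, String> {
    // An exclude carrying an older configuration than ours is a stale request.
    if let DeleteOrExclude::Exclude(mconf) = action {
        if current_generation > mconf.generation {
            return Err(format!(
                "conflict: requested generation {} is older than current {}",
                mconf.generation, current_generation
            ));
        }
    }
    // Only a global delete is allowed to remove WAL from remote storage.
    let only_local = !matches!(action, DeleteOrExclude::Delete);
    Ok(only_local)
}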
if let Some(e) = err { - return Err(e); + return Err(anyhow::Error::from(e)); } // There may be broken timelines on disk, so delete the whole tenant dir as well. // Note that we could concurrently create new timelines while we were deleting them, // so the directory may be not empty. In this case timelines will have bad state // and timeline background jobs can panic. - delete_dir(get_tenant_dir( - self.state.lock().unwrap().conf.as_ref(), - tenant_id, - ))?; + let tenant_dir = get_tenant_dir(self.state.lock().unwrap().conf.as_ref(), tenant_id); + delete_dir(&tenant_dir).await?; Ok(deleted) } @@ -570,18 +580,20 @@ impl GlobalTimelines { } #[derive(Clone, Copy, Serialize)] -pub struct TimelineDeleteForceResult { +pub struct TimelineDeleteResult { pub dir_existed: bool, - pub was_active: bool, } -/// Deletes directory and it's contents. Returns false if directory does not exist. -fn delete_dir(path: Utf8PathBuf) -> Result { - match std::fs::remove_dir_all(path) { - Ok(_) => Ok(true), - Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(false), - Err(e) => Err(e.into()), - } +/// Action for delete_or_exclude. +#[derive(Clone, Debug)] +pub enum DeleteOrExclude { + /// Delete timeline globally. + Delete, + /// Legacy mode until we fully migrate to generations: like exclude deletes + /// timeline only locally, but ignores generation number. + DeleteLocal, + /// This node is getting excluded, delete timeline locally. + Exclude(membership::Configuration), } /// Create temp directory for a new timeline. It needs to be located on the same diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 6176e64698..56f4a2faf9 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -21,9 +21,9 @@ use tokio::sync::{OnceCell, watch}; use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; use tracing::*; -use utils::backoff; use utils::id::{NodeId, TenantTimelineId}; use utils::lsn::Lsn; +use utils::{backoff, pausable_failpoint}; use crate::metrics::{BACKED_UP_SEGMENTS, BACKUP_ERRORS, WAL_BACKUP_TASKS}; use crate::timeline::WalResidentTimeline; @@ -564,6 +564,12 @@ pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> { // We don't currently have http requests timeout cancellation, but if/once // we have listing should get streaming interface to make progress. 
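The failpoints added just below are compiled in only when the `fail` crate's `failpoints` feature is enabled; a test can then force the remote-deletion path to error out. A sketch of how such a failpoint is typically exercised, assuming that feature and a function shaped roughly like the real one:

fn delete_remote_objects() -> anyhow::Result<()> {
    // With the failpoint configured to "return", the closure runs and we bail early;
    // otherwise the macro is a no-op and the normal deletion path continues.
    fail::fail_point!("sk-delete-timeline-remote", |_| {
        Err(anyhow::anyhow!("failpoint: sk-delete-timeline-remote"))
    });
    Ok(())
}

#[test]
fn failpoint_forces_error() {
    let scenario = fail::FailScenario::setup();
    fail::cfg("sk-delete-timeline-remote", "return").unwrap();
    assert!(delete_remote_objects().is_err());
    scenario.teardown();
}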
+ pausable_failpoint!("sk-delete-timeline-remote-pause"); + + fail::fail_point!("sk-delete-timeline-remote", |_| { + Err(anyhow::anyhow!("failpoint: sk-delete-timeline-remote")) + }); + let cancel = CancellationToken::new(); // not really used backoff::retry( || async { diff --git a/safekeeper/src/wal_reader_stream.rs b/safekeeper/src/wal_reader_stream.rs index cc9d4e6e3b..aab82fedb5 100644 --- a/safekeeper/src/wal_reader_stream.rs +++ b/safekeeper/src/wal_reader_stream.rs @@ -246,7 +246,7 @@ mod tests { .unwrap(); let resident_tli = tli.wal_residence_guard().await.unwrap(); - let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT, None) + let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT, c"neon-file:", None) .await .unwrap(); let end_pos = end_watch.get(); diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index ed197a3f83..f0bac4b40a 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -574,6 +574,7 @@ impl Storage for PhysicalStorage { } self.pending_wal_truncation = false; + info!("truncated WAL to {}", end_pos); Ok(()) } diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml index b63ba154da..8211bdce62 100644 --- a/storage_controller/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -19,8 +19,10 @@ bytes.workspace = true chrono.workspace = true clap.workspace = true cron.workspace = true +clashmap.workspace = true fail.workspace = true futures.workspace = true +governor.workspace = true hex.workspace = true hyper0.workspace = true humantime.workspace = true diff --git a/storage_controller/client/src/control_api.rs b/storage_controller/client/src/control_api.rs index f8a2790769..7888b18aa7 100644 --- a/storage_controller/client/src/control_api.rs +++ b/storage_controller/client/src/control_api.rs @@ -1,6 +1,7 @@ use pageserver_client::mgmt_api::{self, ResponseErrorMessageExt}; use reqwest::{Method, Url}; -use serde::{de::DeserializeOwned, Serialize}; +use serde::Serialize; +use serde::de::DeserializeOwned; pub struct Client { base_url: Url, diff --git a/storage_controller/migrations/2025-02-14-160526_safekeeper_timelines/down.sql b/storage_controller/migrations/2025-02-14-160526_safekeeper_timelines/down.sql new file mode 100644 index 0000000000..8f75e8947e --- /dev/null +++ b/storage_controller/migrations/2025-02-14-160526_safekeeper_timelines/down.sql @@ -0,0 +1,2 @@ +DROP TABLE timelines; +DROP TABLE safekeeper_timeline_pending_ops; diff --git a/storage_controller/migrations/2025-02-14-160526_safekeeper_timelines/up.sql b/storage_controller/migrations/2025-02-14-160526_safekeeper_timelines/up.sql new file mode 100644 index 0000000000..82003ab292 --- /dev/null +++ b/storage_controller/migrations/2025-02-14-160526_safekeeper_timelines/up.sql @@ -0,0 +1,19 @@ +CREATE TABLE timelines ( + tenant_id VARCHAR NOT NULL, + timeline_id VARCHAR NOT NULL, + start_lsn pg_lsn NOT NULL, + generation INTEGER NOT NULL, + sk_set BIGINT[] NOT NULL, + new_sk_set BIGINT[], + cplane_notified_generation INTEGER NOT NULL, + deleted_at timestamptz, + PRIMARY KEY(tenant_id, timeline_id) +); +CREATE TABLE safekeeper_timeline_pending_ops ( + sk_id BIGINT NOT NULL, + tenant_id VARCHAR NOT NULL, + timeline_id VARCHAR NOT NULL, + generation INTEGER NOT NULL, + op_kind VARCHAR NOT NULL, + PRIMARY KEY(tenant_id, timeline_id, sk_id) +); diff --git a/storage_controller/migrations/2025-02-28-141741_safekeeper_use_https/down.sql b/storage_controller/migrations/2025-02-28-141741_safekeeper_use_https/down.sql new file mode 
100644 index 0000000000..378e9f8c16 --- /dev/null +++ b/storage_controller/migrations/2025-02-28-141741_safekeeper_use_https/down.sql @@ -0,0 +1 @@ +ALTER TABLE safekeepers DROP https_port; diff --git a/storage_controller/migrations/2025-02-28-141741_safekeeper_use_https/up.sql b/storage_controller/migrations/2025-02-28-141741_safekeeper_use_https/up.sql new file mode 100644 index 0000000000..bb47b0b256 --- /dev/null +++ b/storage_controller/migrations/2025-02-28-141741_safekeeper_use_https/up.sql @@ -0,0 +1 @@ +ALTER TABLE safekeepers ADD https_port INTEGER; diff --git a/storage_controller/src/heartbeater.rs b/storage_controller/src/heartbeater.rs index 56a331becd..ee4c9ef9cd 100644 --- a/storage_controller/src/heartbeater.rs +++ b/storage_controller/src/heartbeater.rs @@ -8,6 +8,7 @@ use futures::StreamExt; use futures::stream::FuturesUnordered; use pageserver_api::controller_api::{NodeAvailability, SkSchedulingPolicy}; use pageserver_api::models::PageserverUtilization; +use reqwest::Certificate; use safekeeper_api::models::SafekeeperUtilization; use safekeeper_client::mgmt_api; use thiserror::Error; @@ -27,6 +28,7 @@ struct HeartbeaterTask { max_offline_interval: Duration, max_warming_up_interval: Duration, jwt_token: Option, + ssl_ca_cert: Option, } #[derive(Debug, Clone)] @@ -75,6 +77,7 @@ where { pub(crate) fn new( jwt_token: Option, + ssl_ca_cert: Option, max_offline_interval: Duration, max_warming_up_interval: Duration, cancel: CancellationToken, @@ -84,6 +87,7 @@ where let mut heartbeater = HeartbeaterTask::new( receiver, jwt_token, + ssl_ca_cert, max_offline_interval, max_warming_up_interval, cancel, @@ -119,6 +123,7 @@ where fn new( receiver: tokio::sync::mpsc::UnboundedReceiver>, jwt_token: Option, + ssl_ca_cert: Option, max_offline_interval: Duration, max_warming_up_interval: Duration, cancel: CancellationToken, @@ -130,6 +135,7 @@ where max_offline_interval, max_warming_up_interval, jwt_token, + ssl_ca_cert, } } async fn run(&mut self) { @@ -172,6 +178,7 @@ impl HeartBeat for HeartbeaterTask let mut heartbeat_futs = FuturesUnordered::new(); for (node_id, node) in &*pageservers { heartbeat_futs.push({ + let ssl_ca_cert = self.ssl_ca_cert.clone(); let jwt_token = self.jwt_token.clone(); let cancel = self.cancel.clone(); @@ -187,6 +194,7 @@ impl HeartBeat for HeartbeaterTask .with_client_retries( |client| async move { client.get_utilization().await }, &jwt_token, + &ssl_ca_cert, 3, 3, Duration::from_secs(1), @@ -325,6 +333,7 @@ impl HeartBeat for HeartbeaterTask for HeartbeaterTask, auth: Option>, + rate_limiter: governor::DefaultKeyedRateLimiter, neon_metrics: NeonMetrics, allowlist_routes: &'static [&'static str], } @@ -59,9 +61,11 @@ impl HttpState { auth: Option>, build_info: BuildInfo, ) -> Self { + let quota = governor::Quota::per_second(service.get_config().tenant_rate_limit); Self { service, auth, + rate_limiter: governor::RateLimiter::keyed(quota), neon_metrics: NeonMetrics::new(build_info), allowlist_routes: &[ "/status", @@ -82,6 +86,40 @@ fn get_state(request: &Request) -> &HttpState { .as_ref() } +/// Rate limits tenant requests. +/// +/// TODO: this should be a request middleware, but requires us to extract the tenant ID from +/// different URLs in a systematic way. +/// +/// TODO: consider returning a 429 response if these start piling up. +async fn maybe_rate_limit(request: &Request, tenant_id: TenantId) { + // Check if the tenant should be rate-limited. 
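The per-tenant limiter wired into `HttpState` above is `governor`'s keyed rate limiter; `maybe_rate_limit` does a cheap `check_key` on the fast path and falls back to `until_key_ready` so over-quota callers wait instead of failing. A self-contained sketch of that API usage, with an illustrative quota and a plain string key:

use std::num::NonZeroU32;

use governor::{Quota, RateLimiter};

#[tokio::main]
async fn main() {
    // Allow 10 requests per second per tenant; extra requests wait for a free slot.
    let quota = Quota::per_second(NonZeroU32::new(10).unwrap());
    let limiter = RateLimiter::keyed(quota);

    let tenant = "tenant-a".to_string();
    if limiter.check_key(&tenant).is_err() {
        // Over quota: park this request until the tenant's quota refills.
        limiter.until_key_ready(&tenant).await;
    }
    // ...handle the request...
}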
+ let rate_limiter = &get_state(request).rate_limiter; + if rate_limiter.check_key(&tenant_id).is_ok() { + return; + } + + // Measure the rate limiting delay. + let _timer = METRICS_REGISTRY + .metrics_group + .storage_controller_http_request_rate_limited + .start_timer(); + + // Log rate limited tenants once every 10 seconds. + static LOG_RATE_LIMITER: LazyLock> = + LazyLock::new(|| { + let quota = governor::Quota::with_period(Duration::from_secs(10)).unwrap(); + governor::RateLimiter::keyed(quota) + }); + + if LOG_RATE_LIMITER.check_key(&tenant_id).is_ok() { + warn!("tenant {tenant_id} is rate limited") + } + + // Wait for quota. + rate_limiter.until_key_ready(&tenant_id).await; +} + /// Pageserver calls into this on startup, to learn which tenants it should attach async fn handle_re_attach(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::GenerationsApi)?; @@ -247,6 +285,7 @@ async fn handle_tenant_config_get( ) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; check_permissions(&req, Scope::PageServerApi)?; + maybe_rate_limit(&req, tenant_id).await; match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -264,6 +303,7 @@ async fn handle_tenant_time_travel_remote_storage( ) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; check_permissions(&req, Scope::PageServerApi)?; + maybe_rate_limit(&req, tenant_id).await; let mut req = match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -311,6 +351,7 @@ async fn handle_tenant_secondary_download( ) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; let wait = parse_query_param(&req, "wait_ms")?.map(Duration::from_millis); + maybe_rate_limit(&req, tenant_id).await; match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -329,6 +370,7 @@ async fn handle_tenant_delete( ) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; check_permissions(&req, Scope::PageServerApi)?; + maybe_rate_limit(&req, tenant_id).await; match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -356,6 +398,7 @@ async fn handle_tenant_timeline_create( ) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; check_permissions(&req, Scope::PageServerApi)?; + maybe_rate_limit(&req, tenant_id).await; let mut req = match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -381,6 +424,7 @@ async fn handle_tenant_timeline_delete( let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; check_permissions(&req, Scope::PageServerApi)?; + maybe_rate_limit(&req, tenant_id).await; match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -457,6 +501,7 @@ async fn handle_tenant_timeline_archival_config( let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; check_permissions(&req, Scope::PageServerApi)?; + maybe_rate_limit(&req, tenant_id).await; let mut req = match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -482,6 +527,7 @@ async fn handle_tenant_timeline_detach_ancestor( let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; check_permissions(&req, Scope::PageServerApi)?; + maybe_rate_limit(&req, tenant_id).await; match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -504,6 +550,7 @@ async fn handle_tenant_timeline_block_unblock_gc( ) -> Result, ApiError> { let tenant_id: TenantId = 
parse_request_param(&req, "tenant_id")?; check_permissions(&req, Scope::PageServerApi)?; + maybe_rate_limit(&req, tenant_id).await; let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; @@ -521,12 +568,14 @@ async fn handle_tenant_timeline_download_heatmap_layers( let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?; check_permissions(&req, Scope::PageServerApi)?; + maybe_rate_limit(&req, tenant_shard_id.tenant_id).await; let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; let concurrency: Option = parse_query_param(&req, "concurrency")?; + let recurse = parse_query_param(&req, "recurse")?.unwrap_or(false); service - .tenant_timeline_download_heatmap_layers(tenant_shard_id, timeline_id, concurrency) + .tenant_timeline_download_heatmap_layers(tenant_shard_id, timeline_id, concurrency, recurse) .await?; json_response(StatusCode::OK, ()) @@ -547,8 +596,9 @@ async fn handle_tenant_timeline_passthrough( service: Arc, req: Request, ) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + let tenant_or_shard_id: TenantShardId = parse_request_param(&req, "tenant_id")?; check_permissions(&req, Scope::PageServerApi)?; + maybe_rate_limit(&req, tenant_or_shard_id.tenant_id).await; let req = match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -562,15 +612,28 @@ async fn handle_tenant_timeline_passthrough( return Err(ApiError::BadRequest(anyhow::anyhow!("Missing path"))); }; - tracing::info!("Proxying request for tenant {} ({})", tenant_id, path); + tracing::info!( + "Proxying request for tenant {} ({})", + tenant_or_shard_id.tenant_id, + path + ); // Find the node that holds shard zero - let (node, tenant_shard_id) = service.tenant_shard0_node(tenant_id).await?; + let (node, tenant_shard_id) = if tenant_or_shard_id.is_unsharded() { + service + .tenant_shard0_node(tenant_or_shard_id.tenant_id) + .await? + } else { + ( + service.tenant_shard_node(tenant_or_shard_id).await?, + tenant_or_shard_id, + ) + }; // Callers will always pass an unsharded tenant ID. Before proxying, we must // rewrite this to a shard-aware shard zero ID. let path = format!("{}", path); - let tenant_str = tenant_id.to_string(); + let tenant_str = tenant_or_shard_id.tenant_id.to_string(); let tenant_shard_str = format!("{}", tenant_shard_id); let path = path.replace(&tenant_str, &tenant_shard_str); @@ -594,7 +657,9 @@ async fn handle_tenant_timeline_passthrough( let client = mgmt_api::Client::new( node.base_url(), service.get_config().pageserver_jwt_token.as_deref(), - ); + service.get_config().ssl_ca_cert.clone(), + ) + .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?; let resp = client.get_raw(path).await.map_err(|e| // We return 503 here because if we can't successfully send a request to the pageserver, // either we aren't available or the pageserver is unavailable. @@ -610,7 +675,7 @@ async fn handle_tenant_timeline_passthrough( // Transform 404 into 503 if we raced with a migration if resp.status() == reqwest::StatusCode::NOT_FOUND { // Look up node again: if we migrated it will be different - let (new_node, _tenant_shard_id) = service.tenant_shard0_node(tenant_id).await?; + let new_node = service.tenant_shard_node(tenant_shard_id).await?; if new_node.get_id() != node.get_id() { // Rather than retry here, send the client a 503 to prompt a retry: this matches // the pageserver's use of 503, and all clients calling this API should retry on 503. 
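The 404 handling above covers a race with shard migration: a NOT_FOUND from the proxied pageserver may only mean the shard moved while the request was in flight, so the node is re-resolved and, if placement changed, a 503 is returned to prompt a client retry. A condensed sketch with a hypothetical resolver argument:

use reqwest::StatusCode;

async fn map_proxied_status(
    proxied_status: StatusCode,
    node_before: u64,
    resolve_current_node: impl std::future::Future<Output = u64>,
) -> StatusCode {
    if proxied_status == StatusCode::NOT_FOUND {
        let node_now = resolve_current_node.await;
        if node_now != node_before {
            // The shard migrated while the request was in flight; a 503 tells the
            // client to retry against the new placement instead of trusting the 404.
            return StatusCode::SERVICE_UNAVAILABLE;
        }
    }
    proxied_status
}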
@@ -640,6 +705,7 @@ async fn handle_tenant_locate( let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; check_permissions(&req, Scope::Admin)?; + // NB: don't rate limit: admin operation. match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -655,9 +721,9 @@ async fn handle_tenant_describe( service: Arc, req: Request, ) -> Result, ApiError> { - check_permissions(&req, Scope::Scrubber)?; - let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + check_permissions(&req, Scope::Scrubber)?; + // NB: don't rate limit: scrubber operation. match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -992,6 +1058,7 @@ async fn handle_tenant_shard_split( req: Request, ) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + // NB: don't rate limit: admin operation. let mut req = match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -1014,6 +1081,7 @@ async fn handle_tenant_shard_migrate( req: Request, ) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + // NB: don't rate limit: admin operation. let mut req = match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -1037,6 +1105,7 @@ async fn handle_tenant_shard_migrate_secondary( req: Request, ) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + // NB: don't rate limit: admin operation. let mut req = match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -1060,6 +1129,7 @@ async fn handle_tenant_shard_cancel_reconcile( req: Request, ) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + // NB: don't rate limit: admin operation. let req = match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -1079,6 +1149,7 @@ async fn handle_tenant_shard_cancel_reconcile( async fn handle_tenant_update_policy(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + // NB: don't rate limit: admin operation. 
let mut req = match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -1134,9 +1205,9 @@ async fn handle_step_down(req: Request) -> Result, ApiError } async fn handle_tenant_drop(req: Request) -> Result, ApiError> { - check_permissions(&req, Scope::PageServerApi)?; - let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + check_permissions(&req, Scope::PageServerApi)?; + maybe_rate_limit(&req, tenant_id).await; let req = match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -1151,9 +1222,9 @@ async fn handle_tenant_drop(req: Request) -> Result, ApiErr } async fn handle_tenant_import(req: Request) -> Result, ApiError> { - check_permissions(&req, Scope::PageServerApi)?; - let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + check_permissions(&req, Scope::PageServerApi)?; + maybe_rate_limit(&req, tenant_id).await; let req = match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 04dd3bb3f6..46ac1cd7ca 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -1,3 +1,4 @@ +use std::num::NonZeroU32; use std::path::PathBuf; use std::sync::Arc; use std::time::Duration; @@ -7,6 +8,7 @@ use clap::Parser; use hyper0::Uri; use metrics::BuildInfo; use metrics::launch_timestamp::LaunchTimestamp; +use reqwest::Certificate; use storage_controller::http::make_router; use storage_controller::metrics::preinitialize_metrics; use storage_controller::persistence::Persistence; @@ -98,6 +100,10 @@ struct Cli { #[arg(long)] priority_reconciler_concurrency: Option, + /// Tenant API rate limit, as requests per second per tenant. + #[arg(long, default_value = "10")] + tenant_rate_limit: NonZeroU32, + /// How long to wait for the initial database connection to be available. #[arg(long, default_value = "5s")] db_connect_timeout: humantime::Duration, @@ -123,21 +129,33 @@ struct Cli { #[arg(long)] chaos_exit_crontab: Option, - // Maximum acceptable lag for the secondary location while draining - // a pageserver + /// Maximum acceptable lag for the secondary location while draining + /// a pageserver #[arg(long)] max_secondary_lag_bytes: Option, - // Period with which to send heartbeats to registered nodes + /// Period with which to send heartbeats to registered nodes #[arg(long)] heartbeat_interval: Option, #[arg(long)] long_reconcile_threshold: Option, - // Flag to use https for requests to pageserver API. + /// Flag to use https for requests to pageserver API. #[arg(long, default_value = "false")] use_https_pageserver_api: bool, + + // Whether to put timelines onto safekeepers + #[arg(long, default_value = "false")] + timelines_onto_safekeepers: bool, + + /// Flag to use https for requests to safekeeper API. + #[arg(long, default_value = "false")] + use_https_safekeeper_api: bool, + + /// Trusted root CA certificate to use in https APIs. 
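The `--ssl-ca-file` flag introduced below is read into a `reqwest::Certificate`, which is then added as an extra trusted root whenever an HTTP client for a node API is built. Roughly, assuming an async (tokio) context and an illustrative helper name:

use std::path::Path;
use std::time::Duration;

use reqwest::Certificate;

async fn build_node_http_client(ssl_ca_file: Option<&Path>) -> anyhow::Result<reqwest::Client> {
    let mut builder = reqwest::ClientBuilder::new().timeout(Duration::from_secs(10));
    if let Some(path) = ssl_ca_file {
        let pem = tokio::fs::read(path).await?;
        // Trust this root CA in addition to the platform's default store.
        builder = builder.add_root_certificate(Certificate::from_pem(&pem)?);
    }
    Ok(builder.build()?)
}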
+ #[arg(long)] + ssl_ca_file: Option, } enum StrictMode { @@ -281,18 +299,13 @@ async fn async_main() -> anyhow::Result<()> { let secrets = Secrets::load(&args).await?; - // TODO: once we've rolled out the safekeeper JWT token everywhere, put it into the validation code below - tracing::info!( - "safekeeper_jwt_token set: {:?}", - secrets.safekeeper_jwt_token.is_some() - ); - // Validate required secrets and arguments are provided in strict mode match strict_mode { StrictMode::Strict if (secrets.public_key.is_none() || secrets.pageserver_jwt_token.is_none() - || secrets.control_plane_jwt_token.is_none()) => + || secrets.control_plane_jwt_token.is_none() + || secrets.safekeeper_jwt_token.is_none()) => { // Production systems should always have secrets configured: if public_key was not set // then we would implicitly disable auth. @@ -315,6 +328,15 @@ async fn async_main() -> anyhow::Result<()> { } } + let ssl_ca_cert = match args.ssl_ca_file.as_ref() { + Some(ssl_ca_file) => { + tracing::info!("Using ssl root CA file: {ssl_ca_file:?}"); + let buf = tokio::fs::read(ssl_ca_file).await?; + Some(Certificate::from_pem(&buf)?) + } + None => None, + }; + let config = Config { pageserver_jwt_token: secrets.pageserver_jwt_token, safekeeper_jwt_token: secrets.safekeeper_jwt_token, @@ -335,6 +357,7 @@ async fn async_main() -> anyhow::Result<()> { priority_reconciler_concurrency: args .priority_reconciler_concurrency .unwrap_or(PRIORITY_RECONCILER_CONCURRENCY_DEFAULT), + tenant_rate_limit: args.tenant_rate_limit, split_threshold: args.split_threshold, neon_local_repo_dir: args.neon_local_repo_dir, max_secondary_lag_bytes: args.max_secondary_lag_bytes, @@ -350,6 +373,9 @@ async fn async_main() -> anyhow::Result<()> { start_as_candidate: args.start_as_candidate, http_service_port: args.listen.port() as i32, use_https_pageserver_api: args.use_https_pageserver_api, + use_https_safekeeper_api: args.use_https_safekeeper_api, + ssl_ca_cert, + timelines_onto_safekeepers: args.timelines_onto_safekeepers, }; // Validate that we can connect to the database diff --git a/storage_controller/src/metrics.rs b/storage_controller/src/metrics.rs index f490edb68f..ea390df726 100644 --- a/storage_controller/src/metrics.rs +++ b/storage_controller/src/metrics.rs @@ -76,6 +76,10 @@ pub(crate) struct StorageControllerMetricGroup { pub(crate) storage_controller_http_request_latency: measured::HistogramVec, + /// HTTP rate limiting latency across all tenants and endpoints + #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 10.0))] + pub(crate) storage_controller_http_request_rate_limited: measured::Histogram<10>, + /// Count of HTTP requests to the pageserver that resulted in an error, /// broken down by the pageserver node id, request name and method pub(crate) storage_controller_pageserver_request_error: diff --git a/storage_controller/src/node.rs b/storage_controller/src/node.rs index bc7fe8802a..40f3c7c58e 100644 --- a/storage_controller/src/node.rs +++ b/storage_controller/src/node.rs @@ -1,14 +1,13 @@ use std::str::FromStr; use std::time::Duration; -use anyhow::anyhow; use pageserver_api::controller_api::{ AvailabilityZone, NodeAvailability, NodeDescribeResponse, NodeRegisterRequest, NodeSchedulingPolicy, TenantLocateResponseShard, }; use pageserver_api::shard::TenantShardId; use pageserver_client::mgmt_api; -use reqwest::StatusCode; +use reqwest::{Certificate, StatusCode}; use serde::Serialize; use tokio_util::sync::CancellationToken; use utils::backoff; @@ -211,7 +210,10 @@ impl Node { use_https: bool, 
) -> anyhow::Result { if use_https && listen_https_port.is_none() { - return Err(anyhow!("https is enabled, but node has no https port")); + anyhow::bail!( + "cannot create node {id}: \ + https is enabled, but https port is not specified" + ); } Ok(Self { @@ -244,7 +246,11 @@ impl Node { pub(crate) fn from_persistent(np: NodePersistence, use_https: bool) -> anyhow::Result { if use_https && np.listen_https_port.is_none() { - return Err(anyhow!("https is enabled, but node has no https port")); + anyhow::bail!( + "cannot load node {} from persistent: \ + https is enabled, but https port is not specified", + np.node_id, + ); } Ok(Self { @@ -270,10 +276,12 @@ impl Node { /// This will return None to indicate cancellation. Cancellation may happen from /// the cancellation token passed in, or from Self's cancellation token (i.e. node /// going offline). + #[allow(clippy::too_many_arguments)] pub(crate) async fn with_client_retries( &self, mut op: O, jwt: &Option, + ssl_ca_cert: &Option, warn_threshold: u32, max_retries: u32, timeout: Duration, @@ -292,19 +300,26 @@ impl Node { | ApiError(StatusCode::REQUEST_TIMEOUT, _) => false, ApiError(_, _) => true, Cancelled => true, + CreateClient(_) => true, } } + // TODO: refactor PageserverClient and with_client_retires (#11113). + let mut http_client = reqwest::ClientBuilder::new().timeout(timeout); + if let Some(ssl_ca_cert) = ssl_ca_cert.as_ref() { + http_client = http_client.add_root_certificate(ssl_ca_cert.clone()) + } + + let http_client = match http_client.build() { + Ok(http_client) => http_client, + Err(err) => return Some(Err(mgmt_api::Error::CreateClient(err))), + }; + backoff::retry( || { - let http_client = reqwest::ClientBuilder::new() - .timeout(timeout) - .build() - .expect("Failed to construct HTTP client"); - let client = PageserverClient::from_client( self.get_id(), - http_client, + http_client.clone(), self.base_url(), jwt.as_deref(), ); diff --git a/storage_controller/src/pageserver_client.rs b/storage_controller/src/pageserver_client.rs index e9c54414a3..7fd4f37e7e 100644 --- a/storage_controller/src/pageserver_client.rs +++ b/storage_controller/src/pageserver_client.rs @@ -8,7 +8,7 @@ use pageserver_api::models::{ use pageserver_api::shard::TenantShardId; use pageserver_client::BlockUnblock; use pageserver_client::mgmt_api::{Client, Result}; -use reqwest::StatusCode; +use reqwest::{Certificate, StatusCode}; use utils::id::{NodeId, TenantId, TimelineId}; /// Thin wrapper around [`pageserver_client::mgmt_api::Client`]. It allows the storage @@ -46,11 +46,16 @@ macro_rules! 
measured_request { } impl PageserverClient { - pub(crate) fn new(node_id: NodeId, mgmt_api_endpoint: String, jwt: Option<&str>) -> Self { - Self { - inner: Client::from_client(reqwest::Client::new(), mgmt_api_endpoint, jwt), + pub(crate) fn new( + node_id: NodeId, + mgmt_api_endpoint: String, + jwt: Option<&str>, + ssl_ca_cert: Option, + ) -> Result { + Ok(Self { + inner: Client::new(mgmt_api_endpoint, jwt, ssl_ca_cert)?, node_id_label: node_id.0.to_string(), - } + }) } pub(crate) fn from_client( @@ -281,13 +286,19 @@ impl PageserverClient { tenant_shard_id: TenantShardId, timeline_id: TimelineId, concurrency: Option, + recurse: bool, ) -> Result<()> { measured_request!( "download_heatmap_layers", crate::metrics::Method::Post, &self.node_id_label, self.inner - .timeline_download_heatmap_layers(tenant_shard_id, timeline_id, concurrency) + .timeline_download_heatmap_layers( + tenant_shard_id, + timeline_id, + concurrency, + recurse + ) .await ) } diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index d34da0fef0..5146fe472e 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -1,10 +1,15 @@ pub(crate) mod split_state; use std::collections::HashMap; +use std::io::Write; use std::str::FromStr; use std::sync::Arc; use std::time::{Duration, Instant}; +use diesel::deserialize::{FromSql, FromSqlRow}; +use diesel::expression::AsExpression; +use diesel::pg::Pg; use diesel::prelude::*; +use diesel::serialize::{IsNull, ToSql}; use diesel_async::async_connection_wrapper::AsyncConnectionWrapper; use diesel_async::pooled_connection::bb8::Pool; use diesel_async::pooled_connection::{AsyncDieselConnectionManager, ManagerConfig}; @@ -27,7 +32,8 @@ use rustls::crypto::ring; use scoped_futures::ScopedBoxFuture; use serde::{Deserialize, Serialize}; use utils::generation::Generation; -use utils::id::{NodeId, TenantId}; +use utils::id::{NodeId, TenantId, TimelineId}; +use utils::lsn::Lsn; use self::split_state::SplitState; use crate::metrics::{ @@ -115,6 +121,11 @@ pub(crate) enum DatabaseOperation { GetLeader, UpdateLeader, SetPreferredAzs, + InsertTimeline, + GetTimeline, + InsertTimelineReconcile, + RemoveTimelineReconcile, + ListTimelineReconcile, } #[must_use] @@ -1274,6 +1285,166 @@ impl Persistence { }) .await } + + /// Persist timeline. Returns if the timeline was newly inserted. If it wasn't, we haven't done any writes. + pub(crate) async fn insert_timeline(&self, entry: TimelinePersistence) -> DatabaseResult { + use crate::schema::timelines; + + let entry = &entry; + self.with_measured_conn(DatabaseOperation::InsertTimeline, move |conn| { + Box::pin(async move { + let inserted_updated = diesel::insert_into(timelines::table) + .values(entry) + .on_conflict((timelines::tenant_id, timelines::timeline_id)) + .do_nothing() + .execute(conn) + .await?; + + match inserted_updated { + 0 => Ok(false), + 1 => Ok(true), + _ => Err(DatabaseError::Logical(format!( + "unexpected number of rows ({})", + inserted_updated + ))), + } + }) + }) + .await + } + + /// Load timeline from db. Returns `None` if not present. 
+ pub(crate) async fn get_timeline( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> DatabaseResult> { + use crate::schema::timelines::dsl; + + let tenant_id = &tenant_id; + let timeline_id = &timeline_id; + let timeline_from_db = self + .with_measured_conn(DatabaseOperation::GetTimeline, move |conn| { + Box::pin(async move { + let mut from_db: Vec = dsl::timelines + .filter( + dsl::tenant_id + .eq(&tenant_id.to_string()) + .and(dsl::timeline_id.eq(&timeline_id.to_string())), + ) + .load(conn) + .await?; + if from_db.is_empty() { + return Ok(None); + } + if from_db.len() != 1 { + return Err(DatabaseError::Logical(format!( + "unexpected number of rows ({})", + from_db.len() + ))); + } + + Ok(Some(from_db.pop().unwrap().into_persistence())) + }) + }) + .await?; + + Ok(timeline_from_db) + } + /// Persist pending op. Returns if it was newly inserted. If it wasn't, we haven't done any writes. + pub(crate) async fn insert_pending_op( + &self, + entry: TimelinePendingOpPersistence, + ) -> DatabaseResult { + use crate::schema::safekeeper_timeline_pending_ops as skpo; + // This overrides the `filter` fn used in other functions, so contain the mayhem via a function-local use + use diesel::query_dsl::methods::FilterDsl; + + let entry = &entry; + self.with_measured_conn(DatabaseOperation::InsertTimelineReconcile, move |conn| { + Box::pin(async move { + // For simplicity it makes sense to keep only the last operation + // per (tenant, timeline, sk) tuple: if we migrated a timeline + // from node and adding it back it is not necessary to remove + // data on it. Hence, generation is not part of primary key and + // we override any rows with lower generations here. + let inserted_updated = diesel::insert_into(skpo::table) + .values(entry) + .on_conflict((skpo::tenant_id, skpo::timeline_id, skpo::sk_id)) + .do_update() + .set(entry) + .filter(skpo::generation.lt(entry.generation)) + .execute(conn) + .await?; + + match inserted_updated { + 0 => Ok(false), + 1 => Ok(true), + _ => Err(DatabaseError::Logical(format!( + "unexpected number of rows ({})", + inserted_updated + ))), + } + }) + }) + .await + } + /// Remove persisted pending op. + pub(crate) async fn remove_pending_op( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + sk_id: NodeId, + generation: u32, + ) -> DatabaseResult<()> { + use crate::schema::safekeeper_timeline_pending_ops::dsl; + + let tenant_id = &tenant_id; + let timeline_id = &timeline_id; + self.with_measured_conn(DatabaseOperation::RemoveTimelineReconcile, move |conn| { + Box::pin(async move { + diesel::delete(dsl::safekeeper_timeline_pending_ops) + .filter(dsl::tenant_id.eq(tenant_id.to_string())) + .filter(dsl::timeline_id.eq(timeline_id.to_string())) + .filter(dsl::sk_id.eq(sk_id.0 as i64)) + .filter(dsl::generation.eq(generation as i32)) + .execute(conn) + .await?; + Ok(()) + }) + }) + .await + } + + /// Load pending operations from db. 
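`insert_pending_op` above is a conditional upsert: the composite key keeps at most one pending op per (tenant, timeline, safekeeper), and the `filter(generation.lt(...))` clause turns the update into a no-op unless the incoming op carries a strictly higher generation. An in-memory toy model of those semantics (plain Rust, not diesel):

use std::collections::HashMap;

#[derive(Clone, Debug, PartialEq)]
struct PendingOp {
    generation: i32,
    op_kind: String,
}

type Key = (String, String, i64); // (tenant_id, timeline_id, sk_id)

/// Returns true if the table changed, mirroring the "rows affected" check.
fn upsert_pending_op(table: &mut HashMap<Key, PendingOp>, key: Key, op: PendingOp) -> bool {
    match table.get(&key) {
        // An existing row with an equal or newer generation wins: the upsert is filtered out.
        Some(existing) if existing.generation >= op.generation => false,
        _ => {
            table.insert(key, op);
            true
        }
    }
}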
+ pub(crate) async fn list_pending_ops( + &self, + filter_for_sk: Option, + ) -> DatabaseResult> { + use crate::schema::safekeeper_timeline_pending_ops::dsl; + + const FILTER_VAL_1: i64 = 1; + const FILTER_VAL_2: i64 = 2; + let filter_opt = filter_for_sk.map(|id| id.0 as i64); + let timeline_from_db = self + .with_measured_conn(DatabaseOperation::ListTimelineReconcile, move |conn| { + Box::pin(async move { + let from_db: Vec = + dsl::safekeeper_timeline_pending_ops + .filter( + dsl::sk_id + .eq(filter_opt.unwrap_or(FILTER_VAL_1)) + .and(dsl::sk_id.eq(filter_opt.unwrap_or(FILTER_VAL_2))), + ) + .load(conn) + .await?; + Ok(from_db) + }) + }) + .await?; + + Ok(timeline_from_db) + } } pub(crate) fn load_certs() -> anyhow::Result> { @@ -1556,7 +1727,34 @@ pub(crate) struct SafekeeperPersistence { pub(crate) port: i32, pub(crate) http_port: i32, pub(crate) availability_zone_id: String, - pub(crate) scheduling_policy: String, + pub(crate) scheduling_policy: SkSchedulingPolicyFromSql, + pub(crate) https_port: Option, +} + +/// Wrapper struct around [`SkSchedulingPolicy`] because both it and [`FromSql`] are from foreign crates, +/// and we don't want to make [`safekeeper_api`] depend on [`diesel`]. +#[derive(Serialize, Deserialize, FromSqlRow, Eq, PartialEq, Debug, Copy, Clone)] +pub(crate) struct SkSchedulingPolicyFromSql(pub(crate) SkSchedulingPolicy); + +impl From for SkSchedulingPolicyFromSql { + fn from(value: SkSchedulingPolicy) -> Self { + SkSchedulingPolicyFromSql(value) + } +} + +impl FromSql for SkSchedulingPolicyFromSql { + fn from_sql( + bytes: ::RawValue<'_>, + ) -> diesel::deserialize::Result { + let bytes = bytes.as_bytes(); + match core::str::from_utf8(bytes) { + Ok(s) => match SkSchedulingPolicy::from_str(s) { + Ok(policy) => Ok(SkSchedulingPolicyFromSql(policy)), + Err(e) => Err(format!("can't parse: {e}").into()), + }, + Err(e) => Err(format!("invalid UTF-8 for scheduling policy: {e}").into()), + } + } } impl SafekeeperPersistence { @@ -1571,15 +1769,12 @@ impl SafekeeperPersistence { host: upsert.host, port: upsert.port, http_port: upsert.http_port, + https_port: upsert.https_port, availability_zone_id: upsert.availability_zone_id, - scheduling_policy: String::from(scheduling_policy), + scheduling_policy: SkSchedulingPolicyFromSql(scheduling_policy), } } pub(crate) fn as_describe_response(&self) -> Result { - let scheduling_policy = - SkSchedulingPolicy::from_str(&self.scheduling_policy).map_err(|e| { - DatabaseError::Logical(format!("can't construct SkSchedulingPolicy: {e:?}")) - })?; Ok(SafekeeperDescribeResponse { id: NodeId(self.id as u64), region_id: self.region_id.clone(), @@ -1587,8 +1782,9 @@ impl SafekeeperPersistence { host: self.host.clone(), port: self.port, http_port: self.http_port, + https_port: self.https_port, availability_zone_id: self.availability_zone_id.clone(), - scheduling_policy, + scheduling_policy: self.scheduling_policy.0, }) } } @@ -1607,6 +1803,7 @@ pub(crate) struct SafekeeperUpsert { /// The active flag will not be stored in the database and will be ignored. pub(crate) active: Option, pub(crate) http_port: i32, + pub(crate) https_port: Option, pub(crate) availability_zone_id: String, } @@ -1622,6 +1819,7 @@ impl SafekeeperUpsert { host: &self.host, port: self.port, http_port: self.http_port, + https_port: self.https_port, availability_zone_id: &self.availability_zone_id, // None means a wish to not update this column. We expose abilities to update it via other means. 
scheduling_policy: None, @@ -1638,6 +1836,143 @@ struct InsertUpdateSafekeeper<'a> { host: &'a str, port: i32, http_port: i32, + https_port: Option, availability_zone_id: &'a str, scheduling_policy: Option<&'a str>, } + +#[derive(Serialize, Deserialize, FromSqlRow, AsExpression, Eq, PartialEq, Debug, Copy, Clone)] +#[diesel(sql_type = crate::schema::sql_types::PgLsn)] +pub(crate) struct LsnWrapper(pub(crate) Lsn); + +impl From for LsnWrapper { + fn from(value: Lsn) -> Self { + LsnWrapper(value) + } +} + +impl FromSql for LsnWrapper { + fn from_sql( + bytes: ::RawValue<'_>, + ) -> diesel::deserialize::Result { + let byte_arr: diesel::deserialize::Result<[u8; 8]> = bytes + .as_bytes() + .try_into() + .map_err(|_| "Can't obtain lsn from sql".into()); + Ok(LsnWrapper(Lsn(u64::from_be_bytes(byte_arr?)))) + } +} + +impl ToSql for LsnWrapper { + fn to_sql<'b>( + &'b self, + out: &mut diesel::serialize::Output<'b, '_, Pg>, + ) -> diesel::serialize::Result { + out.write_all(&u64::to_be_bytes(self.0.0)) + .map(|_| IsNull::No) + .map_err(Into::into) + } +} + +#[derive(Insertable, AsChangeset, Queryable, Selectable, Clone)] +#[diesel(table_name = crate::schema::timelines)] +pub(crate) struct TimelinePersistence { + pub(crate) tenant_id: String, + pub(crate) timeline_id: String, + pub(crate) start_lsn: LsnWrapper, + pub(crate) generation: i32, + pub(crate) sk_set: Vec, + pub(crate) new_sk_set: Option>, + pub(crate) cplane_notified_generation: i32, + pub(crate) deleted_at: Option>, +} + +/// This is separate from [TimelinePersistence] only because postgres allows NULLs +/// in arrays and there is no way to forbid that at schema level. Hence diesel +/// wants `sk_set` to be `Vec>` instead of `Vec` for +/// Queryable/Selectable. It does however allow insertions without redundant +/// Option(s), so [TimelinePersistence] doesn't have them. +#[derive(Queryable, Selectable)] +#[diesel(table_name = crate::schema::timelines)] +pub(crate) struct TimelineFromDb { + pub(crate) tenant_id: String, + pub(crate) timeline_id: String, + pub(crate) start_lsn: LsnWrapper, + pub(crate) generation: i32, + pub(crate) sk_set: Vec>, + pub(crate) new_sk_set: Option>>, + pub(crate) cplane_notified_generation: i32, + pub(crate) deleted_at: Option>, +} + +impl TimelineFromDb { + fn into_persistence(self) -> TimelinePersistence { + // We should never encounter null entries in the sets, but we need to filter them out. + // There is no way to forbid this in the schema that diesel recognizes (to our knowledge). 
+ let sk_set = self.sk_set.into_iter().flatten().collect::>(); + let new_sk_set = self + .new_sk_set + .map(|s| s.into_iter().flatten().collect::>()); + TimelinePersistence { + tenant_id: self.tenant_id, + timeline_id: self.timeline_id, + start_lsn: self.start_lsn, + generation: self.generation, + sk_set, + new_sk_set, + cplane_notified_generation: self.cplane_notified_generation, + deleted_at: self.deleted_at, + } + } +} + +#[derive(Insertable, AsChangeset, Queryable, Selectable, Clone)] +#[diesel(table_name = crate::schema::safekeeper_timeline_pending_ops)] +pub(crate) struct TimelinePendingOpPersistence { + pub(crate) sk_id: i64, + pub(crate) tenant_id: String, + pub(crate) timeline_id: String, + pub(crate) generation: i32, + pub(crate) op_kind: SafekeeperTimelineOpKind, +} + +#[derive(Serialize, Deserialize, FromSqlRow, AsExpression, Eq, PartialEq, Debug, Copy, Clone)] +#[diesel(sql_type = diesel::sql_types::VarChar)] +pub(crate) enum SafekeeperTimelineOpKind { + Pull, + Exclude, + Delete, +} + +impl FromSql for SafekeeperTimelineOpKind { + fn from_sql( + bytes: ::RawValue<'_>, + ) -> diesel::deserialize::Result { + let bytes = bytes.as_bytes(); + match core::str::from_utf8(bytes) { + Ok(s) => match s { + "pull" => Ok(SafekeeperTimelineOpKind::Pull), + "exclude" => Ok(SafekeeperTimelineOpKind::Exclude), + "delete" => Ok(SafekeeperTimelineOpKind::Delete), + _ => Err(format!("can't parse: {s}").into()), + }, + Err(e) => Err(format!("invalid UTF-8 for op_kind: {e}").into()), + } + } +} + +impl ToSql for SafekeeperTimelineOpKind { + fn to_sql<'b>( + &'b self, + out: &mut diesel::serialize::Output<'b, '_, Pg>, + ) -> diesel::serialize::Result { + let kind_str = match self { + SafekeeperTimelineOpKind::Pull => "pull", + SafekeeperTimelineOpKind::Exclude => "exclude", + SafekeeperTimelineOpKind::Delete => "delete", + }; + out.write_all(kind_str.as_bytes()) + .map(|_| IsNull::No) + .map_err(Into::into) + } +} diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index a327f6f50f..9f0b789f19 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -299,6 +299,7 @@ impl Reconciler { .await }, &self.service_config.pageserver_jwt_token, + &self.service_config.ssl_ca_cert, 1, 3, timeout, @@ -420,7 +421,8 @@ impl Reconciler { node.get_id(), node.base_url(), self.service_config.pageserver_jwt_token.as_deref(), - ); + self.service_config.ssl_ca_cert.clone(), + )?; client .wait_lsn( @@ -443,7 +445,8 @@ impl Reconciler { node.get_id(), node.base_url(), self.service_config.pageserver_jwt_token.as_deref(), - ); + self.service_config.ssl_ca_cert.clone(), + )?; let timelines = client.timeline_list(&tenant_shard_id).await?; Ok(timelines @@ -481,6 +484,7 @@ impl Reconciler { .await }, &self.service_config.pageserver_jwt_token, + &self.service_config.ssl_ca_cert, 1, 3, request_download_timeout * 2, @@ -775,6 +779,7 @@ impl Reconciler { .with_client_retries( |client| async move { client.get_location_config(tenant_shard_id).await }, &self.service_config.pageserver_jwt_token, + &self.service_config.ssl_ca_cert, 1, 1, Duration::from_secs(5), @@ -1123,6 +1128,7 @@ impl Reconciler { .with_client_retries( |client| async move { client.get_location_config(tenant_shard_id).await }, &self.service_config.pageserver_jwt_token, + &self.service_config.ssl_ca_cert, 1, 3, Duration::from_secs(5), diff --git a/storage_controller/src/safekeeper.rs b/storage_controller/src/safekeeper.rs index 546fbf0726..2bd28f29af 100644 --- 
a/storage_controller/src/safekeeper.rs +++ b/storage_controller/src/safekeeper.rs @@ -1,8 +1,7 @@ -use std::str::FromStr; use std::time::Duration; use pageserver_api::controller_api::{SafekeeperDescribeResponse, SkSchedulingPolicy}; -use reqwest::StatusCode; +use reqwest::{Certificate, StatusCode}; use safekeeper_client::mgmt_api; use tokio_util::sync::CancellationToken; use utils::backoff; @@ -19,26 +18,56 @@ pub struct Safekeeper { cancel: CancellationToken, listen_http_addr: String, listen_http_port: u16, + listen_https_port: Option, scheduling_policy: SkSchedulingPolicy, id: NodeId, + /// Heartbeating result. availability: SafekeeperState, + + // Flag from storcon's config to use https for safekeeper API. + // Invariant: if |true|, listen_https_port should contain a value. + use_https: bool, } impl Safekeeper { - pub(crate) fn from_persistence(skp: SafekeeperPersistence, cancel: CancellationToken) -> Self { - let scheduling_policy = SkSchedulingPolicy::from_str(&skp.scheduling_policy).unwrap(); - Self { + pub(crate) fn from_persistence( + skp: SafekeeperPersistence, + cancel: CancellationToken, + use_https: bool, + ) -> anyhow::Result { + if use_https && skp.https_port.is_none() { + anyhow::bail!( + "cannot load safekeeper {} from persistence: \ + https is enabled, but https port is not specified", + skp.id, + ); + } + + let scheduling_policy = skp.scheduling_policy.0; + Ok(Self { cancel, listen_http_addr: skp.host.clone(), listen_http_port: skp.http_port as u16, + listen_https_port: skp.https_port.map(|x| x as u16), id: NodeId(skp.id as u64), skp, availability: SafekeeperState::Offline, scheduling_policy, - } + use_https, + }) } + pub(crate) fn base_url(&self) -> String { - format!("http://{}:{}", self.listen_http_addr, self.listen_http_port) + if self.use_https { + format!( + "https://{}:{}", + self.listen_http_addr, + self.listen_https_port + .expect("https port should be specified if use_https is on"), + ) + } else { + format!("http://{}:{}", self.listen_http_addr, self.listen_http_port) + } } pub(crate) fn get_id(&self) -> NodeId { @@ -55,13 +84,18 @@ impl Safekeeper { } pub(crate) fn set_scheduling_policy(&mut self, scheduling_policy: SkSchedulingPolicy) { self.scheduling_policy = scheduling_policy; - self.skp.scheduling_policy = String::from(scheduling_policy); + self.skp.scheduling_policy = scheduling_policy.into(); + } + pub(crate) fn availability(&self) -> SafekeeperState { + self.availability.clone() } /// Perform an operation (which is given a [`SafekeeperClient`]) with retries + #[allow(clippy::too_many_arguments)] pub(crate) async fn with_client_retries( &self, mut op: O, jwt: &Option, + ssl_ca_cert: &Option, warn_threshold: u32, max_retries: u32, timeout: Duration, @@ -80,19 +114,22 @@ impl Safekeeper { | ApiError(StatusCode::REQUEST_TIMEOUT, _) => false, ApiError(_, _) => true, Cancelled => true, + CreateClient(_) => true, } } + // TODO: refactor SafekeeperClient and with_client_retires (#11113). 
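The client construction just below threads an optional extra root certificate into the `reqwest` builder. A self-contained sketch of that pattern, assuming the `reqwest` (default TLS) and `anyhow` crates; the helper name and the `ca.pem` path are illustrative, not storage-controller API:

    use std::time::Duration;

    use anyhow::Context;
    use reqwest::Certificate;

    /// Hypothetical helper mirroring how an optional root CA is folded into the HTTP client:
    /// trust the extra certificate when one is configured, and always apply the call timeout.
    fn build_client(ca_pem: Option<&[u8]>, timeout: Duration) -> anyhow::Result<reqwest::Client> {
        let mut builder = reqwest::Client::builder().timeout(timeout);
        if let Some(pem) = ca_pem {
            let cert = Certificate::from_pem(pem).context("parsing root CA certificate")?;
            builder = builder.add_root_certificate(cert);
        }
        builder.build().context("building reqwest client")
    }

    fn main() -> anyhow::Result<()> {
        // Plain client, e.g. for http:// safekeeper endpoints.
        let _plain = build_client(None, Duration::from_secs(30))?;

        // Client that also trusts a custom CA, e.g. when https for the safekeeper API is enabled.
        if let Ok(pem) = std::fs::read("ca.pem") {
            let _tls = build_client(Some(&pem), Duration::from_secs(30))?;
        }
        Ok(())
    }

Building the client once, outside the retry closure, as the change below does, also avoids re-parsing the certificate and rebuilding the connector on every retry attempt.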
+ let mut http_client = reqwest::Client::builder().timeout(timeout); + if let Some(ssl_ca_cert) = ssl_ca_cert.as_ref() { + http_client = http_client.add_root_certificate(ssl_ca_cert.clone()); + } + let http_client = http_client.build().map_err(mgmt_api::Error::CreateClient)?; + backoff::retry( || { - let http_client = reqwest::ClientBuilder::new() - .timeout(timeout) - .build() - .expect("Failed to construct HTTP client"); - - let client = SafekeeperClient::from_client( + let client = SafekeeperClient::new( self.get_id(), - http_client, + http_client.clone(), self.base_url(), jwt.clone(), ); @@ -113,8 +150,9 @@ impl Safekeeper { warn_threshold, max_retries, &format!( - "Call to safekeeper {} ({}:{}) management API", - self.id, self.listen_http_addr, self.listen_http_port + "Call to safekeeper {} ({}) management API", + self.id, + self.base_url(), ), cancel, ) @@ -122,12 +160,16 @@ impl Safekeeper { .unwrap_or(Err(mgmt_api::Error::Cancelled)) } - pub(crate) fn update_from_record(&mut self, record: crate::persistence::SafekeeperUpsert) { + pub(crate) fn update_from_record( + &mut self, + record: crate::persistence::SafekeeperUpsert, + ) -> anyhow::Result<()> { let crate::persistence::SafekeeperUpsert { active: _, availability_zone_id: _, host, http_port, + https_port, id, port: _, region_id: _, @@ -140,9 +182,17 @@ impl Safekeeper { self.id.0 ); } + if self.use_https && https_port.is_none() { + anyhow::bail!( + "cannot update safekeeper {id}: \ + https is enabled, but https port is not specified" + ); + } self.skp = crate::persistence::SafekeeperPersistence::from_upsert(record, self.scheduling_policy); self.listen_http_port = http_port as u16; + self.listen_https_port = https_port.map(|x| x as u16); self.listen_http_addr = host; + Ok(()) } } diff --git a/storage_controller/src/safekeeper_client.rs b/storage_controller/src/safekeeper_client.rs index fb5be092a0..1533b6c086 100644 --- a/storage_controller/src/safekeeper_client.rs +++ b/storage_controller/src/safekeeper_client.rs @@ -45,31 +45,18 @@ macro_rules! 
measured_request { } impl SafekeeperClient { - #[allow(dead_code)] pub(crate) fn new( - node_id: NodeId, - mgmt_api_endpoint: String, - jwt: Option, - ) -> Self { - Self { - inner: Client::from_client(reqwest::Client::new(), mgmt_api_endpoint, jwt), - node_id_label: node_id.0.to_string(), - } - } - - pub(crate) fn from_client( node_id: NodeId, raw_client: reqwest::Client, mgmt_api_endpoint: String, jwt: Option, ) -> Self { Self { - inner: Client::from_client(raw_client, mgmt_api_endpoint, jwt), + inner: Client::new(raw_client, mgmt_api_endpoint, jwt), node_id_label: node_id.0.to_string(), } } - #[allow(dead_code)] pub(crate) async fn create_timeline( &self, req: &TimelineCreateRequest, @@ -82,7 +69,6 @@ impl SafekeeperClient { ) } - #[allow(dead_code)] pub(crate) async fn delete_timeline( &self, tenant_id: TenantId, @@ -96,7 +82,6 @@ impl SafekeeperClient { ) } - #[allow(dead_code)] pub(crate) async fn pull_timeline( &self, req: &PullTimelineRequest, diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs index 817cf04fe1..3d5f36fb98 100644 --- a/storage_controller/src/scheduler.rs +++ b/storage_controller/src/scheduler.rs @@ -409,13 +409,14 @@ impl ScheduleContext { } } -pub(crate) enum RefCountUpdate { +pub(crate) enum RefCountUpdate<'a> { PromoteSecondary, Attach, Detach, DemoteAttached, AddSecondary, RemoveSecondary, + ChangePreferredAzFrom(Option<&'a AvailabilityZone>), } impl Scheduler { @@ -578,6 +579,14 @@ impl Scheduler { node.home_shard_count -= 1; } } + RefCountUpdate::ChangePreferredAzFrom(old_az) => { + if Some(&node.az) == old_az { + node.home_shard_count -= 1; + } + if is_home_az { + node.home_shard_count += 1; + } + } } // Maybe update PageserverUtilization @@ -594,7 +603,8 @@ impl Scheduler { RefCountUpdate::PromoteSecondary | RefCountUpdate::Detach | RefCountUpdate::RemoveSecondary - | RefCountUpdate::DemoteAttached => { + | RefCountUpdate::DemoteAttached + | RefCountUpdate::ChangePreferredAzFrom(_) => { // De-referencing the node: leave the utilization's shard_count at a stale higher // value until some future heartbeat after we have physically removed this shard // from the node: this prevents the scheduler over-optimistically trying to schedule @@ -1535,4 +1545,67 @@ mod tests { shard.intent.clear(&mut scheduler); } } + + #[test] + fn change_preferred_az() { + let az_a = AvailabilityZone("az-a".to_string()); + let az_b = AvailabilityZone("az-b".to_string()); + + // 2 nodes: 1 az_a and 1 az_b. + let nodes = test_utils::make_test_nodes(2, &[az_a.clone(), az_b.clone()]); + let mut scheduler = Scheduler::new(nodes.values()); + + let tenant_shard_id = TenantShardId { + tenant_id: TenantId::generate(), + shard_number: ShardNumber(0), + shard_count: ShardCount(1), + }; + let shard_identity = ShardIdentity::new( + tenant_shard_id.shard_number, + tenant_shard_id.shard_count, + pageserver_api::shard::ShardStripeSize(1), + ) + .unwrap(); + // 1 attached and 1 secondary. + let mut shard = TenantShard::new( + tenant_shard_id, + shard_identity, + pageserver_api::controller_api::PlacementPolicy::Attached(1), + Some(az_a.clone()), + ); + + let mut context = ScheduleContext::default(); + shard.schedule(&mut scheduler, &mut context).unwrap(); + eprintln!("Scheduled shard at {:?}", shard.intent); + + for node in scheduler.nodes.values() { + // Only 2 nodes, one tenant shard should be scheduled on each of them. 
+ assert_eq!(node.shard_count, 1); + if node.az == az_a { + assert_eq!(node.home_shard_count, 1); + } else { + assert_eq!(node.home_shard_count, 0); + } + } + + shard.set_preferred_az(&mut scheduler, Some(az_b.clone())); + // Home AZ flipped. + for node in scheduler.nodes.values() { + assert_eq!(node.shard_count, 1); + if node.az == az_a { + assert_eq!(node.home_shard_count, 0); + } else { + assert_eq!(node.home_shard_count, 1); + } + } + + shard.set_preferred_az(&mut scheduler, None); + // No home AZ. + for node in scheduler.nodes.values() { + assert_eq!(node.shard_count, 1); + assert_eq!(node.home_shard_count, 0); + } + + shard.intent.clear(&mut scheduler); + } } diff --git a/storage_controller/src/schema.rs b/storage_controller/src/schema.rs index 361253bd19..9b36376fcb 100644 --- a/storage_controller/src/schema.rs +++ b/storage_controller/src/schema.rs @@ -1,5 +1,11 @@ // @generated automatically by Diesel CLI. +pub mod sql_types { + #[derive(diesel::query_builder::QueryId, diesel::sql_types::SqlType)] + #[diesel(postgres_type(name = "pg_lsn", schema = "pg_catalog"))] + pub struct PgLsn; +} + diesel::table! { controllers (address, started_at) { address -> Varchar, @@ -30,6 +36,16 @@ diesel::table! { } } +diesel::table! { + safekeeper_timeline_pending_ops (tenant_id, timeline_id, sk_id) { + sk_id -> Int8, + tenant_id -> Varchar, + timeline_id -> Varchar, + generation -> Int4, + op_kind -> Varchar, + } +} + diesel::table! { safekeepers (id) { id -> Int8, @@ -40,6 +56,7 @@ diesel::table! { http_port -> Int4, availability_zone_id -> Text, scheduling_policy -> Varchar, + https_port -> Nullable, } } @@ -59,10 +76,28 @@ diesel::table! { } } +diesel::table! { + use diesel::sql_types::*; + use super::sql_types::PgLsn; + + timelines (tenant_id, timeline_id) { + tenant_id -> Varchar, + timeline_id -> Varchar, + start_lsn -> PgLsn, + generation -> Int4, + sk_set -> Array>, + new_sk_set -> Nullable>>, + cplane_notified_generation -> Int4, + deleted_at -> Nullable, + } +} + diesel::allow_tables_to_appear_in_same_query!( controllers, metadata_health, nodes, + safekeeper_timeline_pending_ops, safekeepers, tenant_shards, + timelines, ); diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 8671e340bd..a06748abc6 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -1,11 +1,13 @@ pub mod chaos_injector; mod context_iterator; +pub(crate) mod safekeeper_reconciler; use std::borrow::Cow; use std::cmp::Ordering; use std::collections::{BTreeMap, HashMap, HashSet}; use std::error::Error; -use std::ops::Deref; +use std::num::NonZeroU32; +use std::ops::{Deref, DerefMut}; use std::path::PathBuf; use std::str::FromStr; use std::sync::Arc; @@ -33,11 +35,12 @@ use pageserver_api::controller_api::{ }; use pageserver_api::models::{ self, LocationConfig, LocationConfigListResponse, LocationConfigMode, PageserverUtilization, - SecondaryProgress, ShardParameters, TenantConfig, TenantConfigPatchRequest, - TenantConfigRequest, TenantLocationConfigRequest, TenantLocationConfigResponse, - TenantShardLocation, TenantShardSplitRequest, TenantShardSplitResponse, - TenantTimeTravelRequest, TimelineArchivalConfigRequest, TimelineCreateRequest, TimelineInfo, - TopTenantShardsRequest, + SafekeeperInfo, SafekeepersInfo, SecondaryProgress, ShardParameters, TenantConfig, + TenantConfigPatchRequest, TenantConfigRequest, TenantLocationConfigRequest, + TenantLocationConfigResponse, TenantShardLocation, TenantShardSplitRequest, + TenantShardSplitResponse, 
TenantSorting, TenantTimeTravelRequest, + TimelineArchivalConfigRequest, TimelineCreateRequest, TimelineCreateResponseStorcon, + TimelineInfo, TopTenantShardItem, TopTenantShardsRequest, }; use pageserver_api::shard::{ ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId, @@ -47,15 +50,19 @@ use pageserver_api::upcall_api::{ ValidateResponseTenant, }; use pageserver_client::{BlockUnblock, mgmt_api}; -use reqwest::StatusCode; +use reqwest::{Certificate, StatusCode}; +use safekeeper_api::membership::{MemberSet, SafekeeperId}; use safekeeper_api::models::SafekeeperUtilization; +use safekeeper_reconciler::{SafekeeperReconcilers, ScheduleRequest}; use tokio::sync::TryAcquireError; use tokio::sync::mpsc::error::TrySendError; +use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; -use tracing::{Instrument, instrument}; +use tracing::{Instrument, debug, error, info, info_span, instrument, warn}; use utils::completion::Barrier; use utils::generation::Generation; use utils::id::{NodeId, TenantId, TimelineId}; +use utils::logging::SecretString; use utils::sync::gate::Gate; use utils::{failpoint_support, pausable_failpoint}; @@ -76,15 +83,17 @@ use crate::peer_client::GlobalObservedState; use crate::persistence::split_state::SplitState; use crate::persistence::{ AbortShardSplitStatus, ControllerPersistence, DatabaseError, DatabaseResult, - MetadataHealthPersistence, Persistence, ShardGenerationState, TenantFilter, - TenantShardPersistence, + MetadataHealthPersistence, Persistence, SafekeeperTimelineOpKind, ShardGenerationState, + TenantFilter, TenantShardPersistence, TimelinePendingOpPersistence, TimelinePersistence, }; use crate::reconciler::{ ReconcileError, ReconcileUnits, ReconcilerConfig, ReconcilerConfigBuilder, ReconcilerPriority, attached_location_conf, }; use crate::safekeeper::Safekeeper; -use crate::scheduler::{MaySchedule, ScheduleContext, ScheduleError, ScheduleMode, Scheduler}; +use crate::scheduler::{ + AttachedShardTag, MaySchedule, ScheduleContext, ScheduleError, ScheduleMode, Scheduler, +}; use crate::tenant_shard::{ IntentState, MigrateAttachment, ObservedState, ObservedStateDelta, ObservedStateLocation, ReconcileNeeded, ReconcileResult, ReconcileWaitError, ReconcilerStatus, ReconcilerWaiter, @@ -199,6 +208,8 @@ struct ServiceState { safekeepers: Arc>, + safekeeper_reconcilers: SafekeeperReconcilers, + scheduler: Scheduler, /// Ongoing background operation on the cluster if any is running. @@ -259,6 +270,7 @@ fn passthrough_api_error(node: &Node, e: mgmt_api::Error) -> ApiError { ApiError::Conflict(format!("{node} {status}: {status} {msg}")) } mgmt_api::Error::Cancelled => ApiError::ShuttingDown, + mgmt_api::Error::CreateClient(e) => ApiError::InternalServerError(anyhow::anyhow!(e)), } } @@ -270,6 +282,7 @@ impl ServiceState { scheduler: Scheduler, delayed_reconcile_rx: tokio::sync::mpsc::Receiver, initial_leadership_status: LeadershipStatus, + reconcilers_cancel: CancellationToken, ) -> Self { metrics::update_leadership_status(initial_leadership_status); @@ -278,6 +291,7 @@ impl ServiceState { tenants, nodes: Arc::new(nodes), safekeepers: Arc::new(safekeepers), + safekeeper_reconcilers: SafekeeperReconcilers::new(reconcilers_cancel), scheduler, ongoing_operation: None, delayed_reconcile_rx, @@ -365,8 +379,16 @@ pub struct Config { /// How many high-priority Reconcilers may be spawned concurrently pub priority_reconciler_concurrency: usize, - /// How large must a shard grow in bytes before we split it? - /// None disables auto-splitting. 
+ /// How many API requests per second to allow per tenant, across all + /// tenant-scoped API endpoints. Further API requests queue until ready. + pub tenant_rate_limit: NonZeroU32, + + /// The size at which an unsharded tenant should be split (into 8 shards). This uses the logical + /// size of the largest timeline in the shard (i.e. max_logical_size). + /// + /// None or 0 disables auto-splitting. + /// + /// TODO: consider using total logical size of all timelines instead. pub split_threshold: Option, // TODO: make this cfg(feature = "testing") @@ -389,6 +411,12 @@ pub struct Config { pub long_reconcile_threshold: Duration, pub use_https_pageserver_api: bool, + + pub use_https_safekeeper_api: bool, + + pub ssl_ca_cert: Option, + + pub timelines_onto_safekeepers: bool, } impl From for ApiError { @@ -727,7 +755,27 @@ impl Service { std::process::exit(1); } - self.inner.write().unwrap().become_leader(); + let safekeepers = self.inner.read().unwrap().safekeepers.clone(); + let sk_schedule_requests = + match safekeeper_reconciler::load_schedule_requests(self, &safekeepers).await { + Ok(v) => v, + Err(e) => { + tracing::warn!( + "Failed to load safekeeper pending ops at startup: {e}." // Don't abort for now: " Aborting start-up..." + ); + // std::process::exit(1); + Vec::new() + } + }; + + { + let mut locked = self.inner.write().unwrap(); + locked.become_leader(); + + locked + .safekeeper_reconcilers + .schedule_request_vec(self, sk_schedule_requests); + } // TODO: if any tenant's intent now differs from its loaded generation_pageserver, we should clear that // generation_pageserver in the database. @@ -873,6 +921,7 @@ impl Service { .with_client_retries( |client| async move { client.list_location_config().await }, &self.config.pageserver_jwt_token, + &self.config.ssl_ca_cert, 1, 5, timeout, @@ -970,11 +1019,20 @@ impl Service { break; } - let client = PageserverClient::new( + let client = match PageserverClient::new( node.get_id(), node.base_url(), self.config.pageserver_jwt_token.as_deref(), - ); + self.config.ssl_ca_cert.clone(), + ) { + Ok(client) => client, + Err(e) => { + tracing::error!( + "Failed to create client to detach unknown shard {tenant_shard_id} on pageserver {node_id}: {e}" + ); + continue; + } + }; match client .location_config( tenant_shard_id, @@ -1001,7 +1059,7 @@ impl Service { // Non-fatal error: leaving a tenant shard behind that we are not managing shouldn't // break anything. tracing::error!( - "Failed to detach unknkown shard {tenant_shard_id} on pageserver {node_id}: {e}" + "Failed to detach unknown shard {tenant_shard_id} on pageserver {node_id}: {e}" ); } } @@ -1034,6 +1092,7 @@ impl Service { } } } + /// Heartbeat all storage nodes once in a while. #[instrument(skip_all)] async fn spawn_heartbeat_driver(&self) { self.startup_complete.clone().wait().await; @@ -1409,8 +1468,14 @@ impl Service { .list_safekeepers() .await? 
.into_iter() - .map(|skp| Safekeeper::from_persistence(skp, CancellationToken::new())) - .collect::>(); + .map(|skp| { + Safekeeper::from_persistence( + skp, + CancellationToken::new(), + config.use_https_safekeeper_api, + ) + }) + .collect::>>()?; let safekeepers: HashMap = safekeepers.into_iter().map(|n| (n.get_id(), n)).collect(); tracing::info!("Loaded {} safekeepers from database.", safekeepers.len()); @@ -1548,6 +1613,7 @@ impl Service { let heartbeater_ps = Heartbeater::new( config.pageserver_jwt_token.clone(), + config.ssl_ca_cert.clone(), config.max_offline_interval, config.max_warming_up_interval, cancel.clone(), @@ -1555,6 +1621,7 @@ impl Service { let heartbeater_sk = Heartbeater::new( config.safekeeper_jwt_token.clone(), + config.ssl_ca_cert.clone(), config.max_offline_interval, config.max_warming_up_interval, cancel.clone(), @@ -1574,6 +1641,7 @@ impl Service { scheduler, delayed_reconcile_rx, initial_leadership_status, + reconcilers_cancel.clone(), ))), config: config.clone(), persistence, @@ -1856,7 +1924,7 @@ impl Service { } Ok(AttachHookResponse { - r#gen: attach_req + generation: attach_req .node_id .map(|_| tenant_shard.generation.expect("Test hook, not used on tenants that are mid-onboarding with a NULL generation").into().unwrap()), }) @@ -1902,6 +1970,7 @@ impl Service { .with_client_retries( |client| async move { client.list_location_config().await }, &self.config.pageserver_jwt_token, + &self.config.ssl_ca_cert, 1, 5, SHORT_RECONCILE_TIMEOUT, @@ -1960,6 +2029,7 @@ impl Service { .await }, &self.config.pageserver_jwt_token, + &self.config.ssl_ca_cert, 1, 5, SHORT_RECONCILE_TIMEOUT, @@ -3103,7 +3173,9 @@ impl Service { node.get_id(), node.base_url(), self.config.pageserver_jwt_token.as_deref(), - ); + self.config.ssl_ca_cert.clone(), + ) + .map_err(|e| passthrough_api_error(&node, e))?; tracing::info!("Doing time travel recovery for shard {tenant_shard_id}",); @@ -3164,7 +3236,9 @@ impl Service { node.get_id(), node.base_url(), self.config.pageserver_jwt_token.as_deref(), - ); + self.config.ssl_ca_cert.clone(), + ) + .map_err(|e| passthrough_api_error(&node, e))?; futs.push(async move { let result = client .tenant_secondary_download(tenant_shard_id, wait) @@ -3287,6 +3361,7 @@ impl Service { .await }, &self.config.pageserver_jwt_token, + &self.config.ssl_ca_cert, 1, 3, RECONCILE_TIMEOUT, @@ -3408,7 +3483,7 @@ impl Service { Ok(()) } - pub(crate) async fn tenant_timeline_create( + pub(crate) async fn tenant_timeline_create_pageservers( &self, tenant_id: TenantId, mut create_req: TimelineCreateRequest, @@ -3419,14 +3494,6 @@ impl Service { create_req.new_timeline_id, ); - let _tenant_lock = trace_shared_lock( - &self.tenant_op_locks, - tenant_id, - TenantOperations::TimelineCreate, - ) - .await; - failpoint_support::sleep_millis_async!("tenant-create-timeline-shared-lock"); - self.tenant_remote_mutation(tenant_id, move |mut targets| async move { if targets.0.is_empty() { return Err(ApiError::NotFound( @@ -3442,6 +3509,7 @@ impl Service { tenant_shard_id: TenantShardId, locations: ShardMutationLocations, jwt: Option, + ssl_ca_cert: Option, create_req: TimelineCreateRequest, ) -> Result { let latest = locations.latest.node; @@ -3454,7 +3522,8 @@ impl Service { ); let client = - PageserverClient::new(latest.get_id(), latest.base_url(), jwt.as_deref()); + PageserverClient::new(latest.get_id(), latest.base_url(), jwt.as_deref(), ssl_ca_cert.clone()) + .map_err(|e| passthrough_api_error(&latest, e))?; let timeline_info = client .timeline_create(tenant_shard_id, &create_req) 
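The safekeeper-loading change above maps a now-fallible `Safekeeper::from_persistence` over every database row and collects into `anyhow::Result<Vec<_>>`, so a single row that violates the "https enabled implies https port present" invariant fails start-up. A simplified, self-contained illustration of that shape; the types and port numbers here are made up:

    use anyhow::bail;

    /// Simplified stand-ins for the persistence row and the in-memory safekeeper.
    struct SafekeeperRecord {
        id: u64,
        host: String,
        http_port: u16,
        https_port: Option<u16>,
    }

    struct LoadedSafekeeper {
        id: u64,
        base_url: String,
    }

    /// Mirrors the check in `Safekeeper::from_persistence`: with https enabled, a row
    /// without an https port is a hard error instead of a silent fallback to http.
    fn from_persistence(rec: SafekeeperRecord, use_https: bool) -> anyhow::Result<LoadedSafekeeper> {
        let base_url = match (use_https, rec.https_port) {
            (true, Some(port)) => format!("https://{}:{}", rec.host, port),
            (true, None) => bail!(
                "cannot load safekeeper {}: https is enabled, but https port is not specified",
                rec.id
            ),
            (false, _) => format!("http://{}:{}", rec.host, rec.http_port),
        };
        Ok(LoadedSafekeeper { id: rec.id, base_url })
    }

    fn main() {
        let records = vec![
            SafekeeperRecord { id: 1, host: "sk-1".into(), http_port: 7676, https_port: Some(7677) },
            SafekeeperRecord { id: 2, host: "sk-2".into(), http_port: 7676, https_port: None },
        ];
        // Collecting an iterator of Results into anyhow::Result<Vec<_>> means one bad row
        // fails the whole load, surfacing misconfiguration at start-up rather than later.
        let loaded: anyhow::Result<Vec<LoadedSafekeeper>> = records
            .into_iter()
            .map(|rec| from_persistence(rec, true))
            .collect();
        match loaded {
            Ok(sks) => println!("loaded {} safekeepers, e.g. {} at {}", sks.len(), sks[0].id, sks[0].base_url),
            Err(e) => println!("refusing to start: {e}"),
        }
    }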
@@ -3477,7 +3546,9 @@ impl Service { location.node.get_id(), location.node.base_url(), jwt.as_deref(), - ); + ssl_ca_cert.clone(), + ) + .map_err(|e| passthrough_api_error(&location.node, e))?; let res = client .timeline_create(tenant_shard_id, &create_req) @@ -3506,6 +3577,7 @@ impl Service { shard_zero_tid, shard_zero_locations, self.config.pageserver_jwt_token.clone(), + self.config.ssl_ca_cert.clone(), create_req.clone(), ) .await?; @@ -3535,6 +3607,7 @@ impl Service { tenant_shard_id, mutation_locations, jwt.clone(), + self.config.ssl_ca_cert.clone(), create_req, )) }, @@ -3547,6 +3620,323 @@ impl Service { .await? } + /// Timeline creation on safekeepers + /// + /// Returns `Ok(left)` if the timeline has been created on a quorum of safekeepers, + /// where `left` contains the list of safekeepers that didn't have a successful response. + /// Assumes tenant lock is held while calling this function. + async fn tenant_timeline_create_safekeepers_quorum( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + pg_version: u32, + timeline_persistence: &TimelinePersistence, + ) -> Result, ApiError> { + // If quorum is reached, return if we are outside of a specified timeout + let jwt = self + .config + .safekeeper_jwt_token + .clone() + .map(SecretString::from); + let mut joinset = JoinSet::new(); + + let safekeepers = { + let locked = self.inner.read().unwrap(); + locked.safekeepers.clone() + }; + + let mut members = Vec::new(); + for sk_id in timeline_persistence.sk_set.iter() { + let sk_id = NodeId(*sk_id as u64); + let Some(safekeeper) = safekeepers.get(&sk_id) else { + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "couldn't find entry for safekeeper with id {sk_id}" + )))?; + }; + members.push(SafekeeperId { + id: sk_id, + host: safekeeper.skp.host.clone(), + pg_port: safekeeper.skp.port as u16, + }); + } + let mset = MemberSet::new(members).map_err(ApiError::InternalServerError)?; + let mconf = safekeeper_api::membership::Configuration::new(mset); + + let req = safekeeper_api::models::TimelineCreateRequest { + commit_lsn: None, + mconf, + pg_version, + start_lsn: timeline_persistence.start_lsn.0, + system_id: None, + tenant_id, + timeline_id, + wal_seg_size: None, + }; + const SK_CREATE_TIMELINE_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30); + for sk in timeline_persistence.sk_set.iter() { + let sk_id = NodeId(*sk as u64); + let safekeepers = safekeepers.clone(); + let jwt = jwt.clone(); + let ssl_ca_cert = self.config.ssl_ca_cert.clone(); + let req = req.clone(); + joinset.spawn(async move { + // Unwrap is fine as we already would have returned error above + let sk_p = safekeepers.get(&sk_id).unwrap(); + let res = sk_p + .with_client_retries( + |client| { + let req = req.clone(); + async move { client.create_timeline(&req).await } + }, + &jwt, + &ssl_ca_cert, + 3, + 3, + SK_CREATE_TIMELINE_RECONCILE_TIMEOUT, + &CancellationToken::new(), + ) + .await; + (sk_id, sk_p.skp.host.clone(), res) + }); + } + // After we have built the joinset, we now wait for the tasks to complete, + // but with a specified timeout to make sure we return swiftly, either with + // a failure or success. + let reconcile_deadline = tokio::time::Instant::now() + SK_CREATE_TIMELINE_RECONCILE_TIMEOUT; + + // Wait until all tasks finish or timeout is hit, whichever occurs + // first. 
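The collection loop below drains the `JoinSet` with `timeout_at` against a single deadline, so the creation call returns promptly whether or not every safekeeper answered. A self-contained sketch of that "join until deadline" pattern (tokio with the `full` feature; the spawned tasks are dummies standing in for the safekeeper calls):

    use std::time::Duration;

    use tokio::task::JoinSet;
    use tokio::time::{Instant, sleep, timeout_at};

    #[tokio::main]
    async fn main() {
        let mut joinset = JoinSet::new();
        // Three dummy "safekeeper calls" with different latencies; the slowest one
        // is still pending when the deadline fires.
        for (id, delay_ms) in [(1u64, 10u64), (2, 20), (3, 500)] {
            joinset.spawn(async move {
                sleep(Duration::from_millis(delay_ms)).await;
                (id, Ok::<(), String>(()))
            });
        }

        let deadline = Instant::now() + Duration::from_millis(100);
        let mut results = Vec::new();
        loop {
            match timeout_at(deadline, joinset.join_next()).await {
                // Deadline hit: stop waiting, like the creation call returning after its timeout.
                Err(_elapsed) => break,
                // JoinSet drained: every task finished before the deadline.
                Ok(None) => break,
                // One task finished; collect its result and keep draining.
                Ok(Some(Ok(res))) => results.push(res),
                // A task panicked or was aborted; note it and keep going.
                Ok(Some(Err(join_err))) => eprintln!("join error: {join_err}"),
            }
        }
        println!("collected {} of 3 responses before the deadline", results.len());
    }

Anything still pending when the deadline fires is simply left behind; the caller afterwards counts how many responses arrived and whether they form a quorum.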
+ let mut reconcile_results = Vec::new(); + loop { + if let Ok(res) = tokio::time::timeout_at(reconcile_deadline, joinset.join_next()).await + { + let Some(res) = res else { break }; + match res { + Ok(res) => { + tracing::info!( + "response from safekeeper id:{} at {}: {:?}", + res.0, + res.1, + res.2 + ); + reconcile_results.push(res); + } + Err(join_err) => { + tracing::info!("join_err for task in joinset: {join_err}"); + } + } + } else { + tracing::info!( + "timeout for creation call after {} responses", + reconcile_results.len() + ); + break; + } + } + + // Now check now if quorum was reached in reconcile_results. + let total_result_count = reconcile_results.len(); + let remaining = reconcile_results + .into_iter() + .filter_map(|res| res.2.is_err().then_some(res.0)) + .collect::>(); + tracing::info!( + "Got {} non-successful responses from initial creation request of total {total_result_count} responses", + remaining.len() + ); + if remaining.len() >= 2 { + // Failure + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "not enough successful reconciliations to reach quorum, please retry: {} errored", + remaining.len() + ))); + } + + Ok(remaining) + } + + /// Create timeline in controller database and on safekeepers. + /// `timeline_info` is result of timeline creation on pageserver. + /// + /// All actions must be idempotent as the call is retried until success. It + /// tries to create timeline in the db and on at least majority of + /// safekeepers + queue creation for safekeepers which missed it in the db + /// for infinite retries; after that, call returns Ok. + /// + /// The idea is that once this is reached as long as we have alive majority + /// of safekeepers it is expected to get eventually operational as storcon + /// will be able to seed timeline on nodes which missed creation by making + /// pull_timeline from peers. On the other hand we don't want to fail + /// timeline creation if one safekeeper is down. + async fn tenant_timeline_create_safekeepers( + self: &Arc, + tenant_id: TenantId, + timeline_info: &TimelineInfo, + create_mode: models::TimelineCreateRequestMode, + ) -> Result { + let timeline_id = timeline_info.timeline_id; + let pg_version = timeline_info.pg_version; + // Initially start_lsn is determined by last_record_lsn in pageserver + // response as it does initdb. However, later we persist it and in sk + // creation calls replace with the value from the timeline row if it + // previously existed as on retries in theory endpoint might have + // already written some data and advanced last_record_lsn, while we want + // safekeepers to have consistent start_lsn. + let start_lsn = match create_mode { + models::TimelineCreateRequestMode::Bootstrap { .. } => timeline_info.last_record_lsn, + models::TimelineCreateRequestMode::Branch { .. } => timeline_info.last_record_lsn, + models::TimelineCreateRequestMode::ImportPgdata { .. 
} => { + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "import pgdata doesn't specify the start lsn, aborting creation on safekeepers" + )))?; + } + }; + // Choose initial set of safekeepers respecting affinity + let sks = self.safekeepers_for_new_timeline().await?; + let sks_persistence = sks.iter().map(|sk| sk.id.0 as i64).collect::>(); + // Add timeline to db + let mut timeline_persist = TimelinePersistence { + tenant_id: tenant_id.to_string(), + timeline_id: timeline_id.to_string(), + start_lsn: start_lsn.into(), + generation: 0, + sk_set: sks_persistence.clone(), + new_sk_set: None, + cplane_notified_generation: 0, + deleted_at: None, + }; + let inserted = self + .persistence + .insert_timeline(timeline_persist.clone()) + .await?; + if !inserted { + if let Some(existent_persist) = self + .persistence + .get_timeline(tenant_id, timeline_id) + .await? + { + // Replace with what we have in the db, to get stuff like the generation right. + // We do still repeat the http calls to the safekeepers. After all, we could have + // crashed right after the wrote to the DB. + timeline_persist = existent_persist; + } else { + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "insertion said timeline already in db, but looking it up, it was gone" + ))); + } + } + // Create the timeline on a quorum of safekeepers + let remaining = self + .tenant_timeline_create_safekeepers_quorum( + tenant_id, + timeline_id, + pg_version, + &timeline_persist, + ) + .await?; + + // For the remaining safekeepers, take care of their reconciliation asynchronously + for &remaining_id in remaining.iter() { + let pending_op = TimelinePendingOpPersistence { + tenant_id: tenant_id.to_string(), + timeline_id: timeline_id.to_string(), + generation: timeline_persist.generation, + op_kind: crate::persistence::SafekeeperTimelineOpKind::Pull, + sk_id: remaining_id.0 as i64, + }; + tracing::info!("writing pending op for sk id {remaining_id}"); + self.persistence.insert_pending_op(pending_op).await?; + } + if !remaining.is_empty() { + let mut locked = self.inner.write().unwrap(); + for remaining_id in remaining { + let Some(sk) = locked.safekeepers.get(&remaining_id) else { + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "Couldn't find safekeeper with id {remaining_id}" + ))); + }; + let Ok(host_list) = sks + .iter() + .map(|sk| { + Ok(( + sk.id, + locked + .safekeepers + .get(&sk.id) + .ok_or_else(|| { + ApiError::InternalServerError(anyhow::anyhow!( + "Couldn't find safekeeper with id {remaining_id} to pull from" + )) + })? 
+ .base_url(), + )) + }) + .collect::>() + else { + continue; + }; + let req = ScheduleRequest { + safekeeper: Box::new(sk.clone()), + host_list, + tenant_id, + timeline_id, + generation: timeline_persist.generation as u32, + kind: crate::persistence::SafekeeperTimelineOpKind::Pull, + }; + locked.safekeeper_reconcilers.schedule_request(self, req); + } + } + + Ok(SafekeepersInfo { + generation: timeline_persist.generation as u32, + safekeepers: sks, + tenant_id, + timeline_id, + }) + } + + pub(crate) async fn tenant_timeline_create( + self: &Arc, + tenant_id: TenantId, + create_req: TimelineCreateRequest, + ) -> Result { + let safekeepers = self.config.timelines_onto_safekeepers; + tracing::info!( + %safekeepers, + "Creating timeline {}/{}", + tenant_id, + create_req.new_timeline_id, + ); + + let _tenant_lock = trace_shared_lock( + &self.tenant_op_locks, + tenant_id, + TenantOperations::TimelineCreate, + ) + .await; + failpoint_support::sleep_millis_async!("tenant-create-timeline-shared-lock"); + let create_mode = create_req.mode.clone(); + + let timeline_info = self + .tenant_timeline_create_pageservers(tenant_id, create_req) + .await?; + + let safekeepers = if safekeepers { + let res = self + .tenant_timeline_create_safekeepers(tenant_id, &timeline_info, create_mode) + .instrument(tracing::info_span!("timeline_create_safekeepers", %tenant_id, timeline_id=%timeline_info.timeline_id)) + .await?; + Some(res) + } else { + None + }; + + Ok(TimelineCreateResponseStorcon { + timeline_info, + safekeepers, + }) + } + pub(crate) async fn tenant_timeline_archival_config( &self, tenant_id: TenantId, @@ -3576,13 +3966,15 @@ impl Service { timeline_id: TimelineId, node: Node, jwt: Option, + ssl_ca_cert: Option, req: TimelineArchivalConfigRequest, ) -> Result<(), ApiError> { tracing::info!( "Setting archival config of timeline on shard {tenant_shard_id}/{timeline_id}, attached to node {node}", ); - let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref()); + let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref(), ssl_ca_cert) + .map_err(|e| passthrough_api_error(&node, e))?; client .timeline_archival_config(tenant_shard_id, timeline_id, &req) @@ -3605,6 +3997,7 @@ impl Service { timeline_id, node, self.config.pageserver_jwt_token.clone(), + self.config.ssl_ca_cert.clone(), req.clone(), )) }) @@ -3641,12 +4034,14 @@ impl Service { timeline_id: TimelineId, node: Node, jwt: Option, + ssl_ca_cert: Option, ) -> Result<(ShardNumber, models::detach_ancestor::AncestorDetached), ApiError> { tracing::info!( "Detaching timeline on shard {tenant_shard_id}/{timeline_id}, attached to node {node}", ); - let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref()); + let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref(), ssl_ca_cert) + .map_err(|e| passthrough_api_error(&node, e))?; client .timeline_detach_ancestor(tenant_shard_id, timeline_id) @@ -3686,6 +4081,7 @@ impl Service { timeline_id, node, self.config.pageserver_jwt_token.clone(), + self.config.ssl_ca_cert.clone(), )) }) .await?; @@ -3738,9 +4134,16 @@ impl Service { timeline_id: TimelineId, node: Node, jwt: Option, + ssl_ca_cert: Option, dir: BlockUnblock, ) -> Result<(), ApiError> { - let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref()); + let client = PageserverClient::new( + node.get_id(), + node.base_url(), + jwt.as_deref(), + ssl_ca_cert, + ) + .map_err(|e| passthrough_api_error(&node, e))?; client 
.timeline_block_unblock_gc(tenant_shard_id, timeline_id, dir) @@ -3760,6 +4163,7 @@ impl Service { timeline_id, node, self.config.pageserver_jwt_token.clone(), + self.config.ssl_ca_cert.clone(), dir, )) }) @@ -3774,6 +4178,7 @@ impl Service { tenant_shard_id: TenantShardId, timeline_id: TimelineId, concurrency: Option, + recurse: bool, ) -> Result<(), ApiError> { let _tenant_lock = trace_shared_lock( &self.tenant_op_locks, @@ -3811,7 +4216,12 @@ impl Service { targets, |tenant_shard_id, client| async move { client - .timeline_download_heatmap_layers(tenant_shard_id, timeline_id, concurrency) + .timeline_download_heatmap_layers( + tenant_shard_id, + timeline_id, + concurrency, + recurse, + ) .await }, 1, @@ -3875,6 +4285,7 @@ impl Service { node.with_client_retries( |client| op(tenant_shard_id, client), &self.config.pageserver_jwt_token, + &self.config.ssl_ca_cert, warn_threshold, max_retries, timeout, @@ -4071,7 +4482,7 @@ impl Service { } pub(crate) async fn tenant_timeline_delete( - &self, + self: &Arc, tenant_id: TenantId, timeline_id: TimelineId, ) -> Result { @@ -4083,7 +4494,7 @@ impl Service { ) .await; - self.tenant_remote_mutation(tenant_id, move |mut targets| async move { + let status_code = self.tenant_remote_mutation(tenant_id, move |mut targets| async move { if targets.0.is_empty() { return Err(ApiError::NotFound( anyhow::anyhow!("Tenant not found").into(), @@ -4098,12 +4509,14 @@ impl Service { timeline_id: TimelineId, node: Node, jwt: Option, + ssl_ca_cert: Option, ) -> Result { tracing::info!( "Deleting timeline on shard {tenant_shard_id}/{timeline_id}, attached to node {node}", ); - let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref()); + let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref(), ssl_ca_cert) + .map_err(|e| passthrough_api_error(&node, e))?; let res = client .timeline_delete(tenant_shard_id, timeline_id) .await; @@ -4130,6 +4543,7 @@ impl Service { timeline_id, node, self.config.pageserver_jwt_token.clone(), + self.config.ssl_ca_cert.clone(), )) }) .await?; @@ -4152,22 +4566,81 @@ impl Service { timeline_id, shard_zero_locations.latest.node, self.config.pageserver_jwt_token.clone(), + self.config.ssl_ca_cert.clone(), ) .await?; Ok(shard_zero_status) - }).await? + }).await?; + + self.tenant_timeline_delete_safekeepers(tenant_id, timeline_id) + .await?; + + status_code + } + /// Perform timeline deletion on safekeepers. Will return success: we persist the deletion into the reconciler. 
+ async fn tenant_timeline_delete_safekeepers( + self: &Arc, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> Result<(), ApiError> { + let tl = self + .persistence + .get_timeline(tenant_id, timeline_id) + .await?; + let Some(tl) = tl else { + tracing::info!( + "timeline {tenant_id}/{timeline_id} doesn't exist in timelines table, no deletions on safekeepers needed" + ); + return Ok(()); + }; + let all_sks = tl + .new_sk_set + .iter() + .flat_map(|sks| { + sks.iter() + .map(|sk| (*sk, SafekeeperTimelineOpKind::Exclude)) + }) + .chain( + tl.sk_set + .iter() + .map(|v| (*v, SafekeeperTimelineOpKind::Delete)), + ) + .collect::>(); + + // Schedule reconciliations + { + let mut locked = self.inner.write().unwrap(); + for (sk_id, kind) in all_sks { + let sk_id = NodeId(sk_id as u64); + let Some(sk) = locked.safekeepers.get(&sk_id) else { + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "Couldn't find safekeeper with id {sk_id}" + ))); + }; + + let req = ScheduleRequest { + safekeeper: Box::new(sk.clone()), + // we don't use this for this kind, put a dummy value + host_list: Vec::new(), + tenant_id, + timeline_id, + generation: tl.generation as u32, + kind, + }; + locked.safekeeper_reconcilers.schedule_request(self, req); + } + } + Ok(()) } - /// When you need to send an HTTP request to the pageserver that holds shard0 of a tenant, this - /// function looks up and returns node. If the tenant isn't found, returns Err(ApiError::NotFound) + /// When you know the TenantId but not a specific shard, and would like to get the node holding shard 0. pub(crate) async fn tenant_shard0_node( &self, tenant_id: TenantId, ) -> Result<(Node, TenantShardId), ApiError> { - // Look up in-memory state and maybe use the node from there. - { + let tenant_shard_id = { let locked = self.inner.read().unwrap(); - let Some((tenant_shard_id, shard)) = locked + let Some((tenant_shard_id, _shard)) = locked .tenants .range(TenantShardId::tenant_range(tenant_id)) .next() @@ -4177,6 +4650,29 @@ impl Service { )); }; + *tenant_shard_id + }; + + self.tenant_shard_node(tenant_shard_id) + .await + .map(|node| (node, tenant_shard_id)) + } + + /// When you need to send an HTTP request to the pageserver that holds a shard of a tenant, this + /// function looks up and returns node. If the shard isn't found, returns Err(ApiError::NotFound) + pub(crate) async fn tenant_shard_node( + &self, + tenant_shard_id: TenantShardId, + ) -> Result { + // Look up in-memory state and maybe use the node from there. + { + let locked = self.inner.read().unwrap(); + let Some(shard) = locked.tenants.get(&tenant_shard_id) else { + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant shard {tenant_shard_id} not found").into(), + )); + }; + let Some(intent_node_id) = shard.intent.get_attached() else { tracing::warn!( tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), @@ -4197,7 +4693,7 @@ impl Service { "Shard refers to nonexistent node" ))); }; - return Ok((node.clone(), *tenant_shard_id)); + return Ok(node.clone()); } }; @@ -4205,29 +4701,34 @@ impl Service { // generation state: this will reflect the progress of any ongoing migration. // Note that it is not guaranteed to _stay_ here, our caller must still handle // the case where they call through to the pageserver and get a 404. 
- let db_result = self.persistence.tenant_generations(tenant_id).await?; + let db_result = self + .persistence + .tenant_generations(tenant_shard_id.tenant_id) + .await?; let Some(ShardGenerationState { - tenant_shard_id, + tenant_shard_id: _, generation: _, generation_pageserver: Some(node_id), - }) = db_result.first() + }) = db_result + .into_iter() + .find(|s| s.tenant_shard_id == tenant_shard_id) else { // This can happen if we raced with a tenant deletion or a shard split. On a retry // the caller will either succeed (shard split case), get a proper 404 (deletion case), // or a conflict response (case where tenant was detached in background) return Err(ApiError::ResourceUnavailable( - "Shard {} not found in database, or is not attached".into(), + format!("Shard {tenant_shard_id} not found in database, or is not attached").into(), )); }; let locked = self.inner.read().unwrap(); - let Some(node) = locked.nodes.get(node_id) else { + let Some(node) = locked.nodes.get(&node_id) else { // This should never happen return Err(ApiError::InternalServerError(anyhow::anyhow!( "Shard refers to nonexistent node" ))); }; - Ok((node.clone(), *tenant_shard_id)) + Ok(node.clone()) } pub(crate) fn tenant_locate( @@ -4327,7 +4828,7 @@ impl Service { is_reconciling: shard.reconciler.is_some(), is_pending_compute_notification: shard.pending_compute_notification, is_splitting: matches!(shard.splitting, SplitState::Splitting), - scheduling_policy: *shard.get_scheduling_policy(), + scheduling_policy: shard.get_scheduling_policy(), preferred_az_id: shard.preferred_az().map(ToString::to_string), }) } @@ -4557,6 +5058,7 @@ impl Service { client.location_config(child_id, config, None, false).await }, &self.config.pageserver_jwt_token, + &self.config.ssl_ca_cert, 1, 10, Duration::from_secs(5), @@ -5160,7 +5662,9 @@ impl Service { node.get_id(), node.base_url(), self.config.pageserver_jwt_token.as_deref(), - ); + self.config.ssl_ca_cert.clone(), + ) + .map_err(|e| passthrough_api_error(node, e))?; let response = client .tenant_shard_split( *parent_id, @@ -5258,12 +5762,93 @@ impl Service { Ok((response, waiters)) } + /// A graceful migration: update the preferred node and let optimisation handle the migration + /// in the background (may take a long time as it will fully warm up a location before cutting over) + /// + /// Our external API calls this a 'prewarm=true' migration, but internally it isn't a special prewarm step: it's + /// just a migration that uses the same graceful procedure as our background scheduling optimisations would use. + fn tenant_shard_migrate_with_prewarm( + &self, + migrate_req: &TenantShardMigrateRequest, + shard: &mut TenantShard, + scheduler: &mut Scheduler, + schedule_context: ScheduleContext, + ) -> Result, ApiError> { + shard.set_preferred_node(Some(migrate_req.node_id)); + + // Generate whatever the initial change to the intent is: this could be creation of a secondary, or + // cutting over to an existing secondary. Caller is responsible for validating this before applying it, + // e.g. by checking secondary is warm enough. 
+ Ok(shard.optimize_attachment(scheduler, &schedule_context)) + } + + /// Immediate migration: directly update the intent state and kick off a reconciler + fn tenant_shard_migrate_immediate( + &self, + migrate_req: &TenantShardMigrateRequest, + nodes: &Arc>, + shard: &mut TenantShard, + scheduler: &mut Scheduler, + ) -> Result, ApiError> { + // Non-graceful migration: update the intent state immediately + let old_attached = *shard.intent.get_attached(); + match shard.policy { + PlacementPolicy::Attached(n) => { + // If our new attached node was a secondary, it no longer should be. + shard + .intent + .remove_secondary(scheduler, migrate_req.node_id); + + shard + .intent + .set_attached(scheduler, Some(migrate_req.node_id)); + + // If we were already attached to something, demote that to a secondary + if let Some(old_attached) = old_attached { + if n > 0 { + // Remove other secondaries to make room for the location we'll demote + while shard.intent.get_secondary().len() >= n { + shard.intent.pop_secondary(scheduler); + } + + shard.intent.push_secondary(scheduler, old_attached); + } + } + } + PlacementPolicy::Secondary => { + shard.intent.clear(scheduler); + shard.intent.push_secondary(scheduler, migrate_req.node_id); + } + PlacementPolicy::Detached => { + return Err(ApiError::BadRequest(anyhow::anyhow!( + "Cannot migrate a tenant that is PlacementPolicy::Detached: configure it to an attached policy first" + ))); + } + } + + tracing::info!("Migrating: new intent {:?}", shard.intent); + shard.sequence = shard.sequence.next(); + shard.set_preferred_node(None); // Abort any in-flight graceful migration + Ok(self.maybe_configured_reconcile_shard( + shard, + nodes, + (&migrate_req.migration_config).into(), + )) + } + pub(crate) async fn tenant_shard_migrate( &self, tenant_shard_id: TenantShardId, migrate_req: TenantShardMigrateRequest, ) -> Result { - let waiter = { + // Depending on whether the migration is a change and whether it's graceful or immediate, we might + // get a different outcome to handle + enum MigrationOutcome { + Optimization(Option), + Reconcile(Option), + } + + let outcome = { let mut locked = self.inner.write().unwrap(); let (nodes, tenants, scheduler) = locked.parts_mut(); @@ -5274,71 +5859,139 @@ impl Service { ))); }; + // Migration to unavavailable node requires force flag if !node.is_available() { - // Warn but proceed: the caller may intend to manually adjust the placement of - // a shard even if the node is down, e.g. if intervening during an incident. - tracing::warn!("Migrating to unavailable node {node}"); + if migrate_req.migration_config.override_scheduler { + // Warn but proceed: the caller may intend to manually adjust the placement of + // a shard even if the node is down, e.g. if intervening during an incident. 
+ tracing::warn!("Forcibly migrating to unavailable node {node}"); + } else { + tracing::warn!("Node {node} is unavailable, refusing migration"); + return Err(ApiError::PreconditionFailed( + format!("Node {node} is unavailable").into_boxed_str(), + )); + } } + // Calculate the ScheduleContext for this tenant + let mut schedule_context = ScheduleContext::default(); + for (_shard_id, shard) in + tenants.range(TenantShardId::tenant_range(tenant_shard_id.tenant_id)) + { + schedule_context.avoid(&shard.intent.all_pageservers()); + } + + // Look up the specific shard we will migrate let Some(shard) = tenants.get_mut(&tenant_shard_id) else { return Err(ApiError::NotFound( anyhow::anyhow!("Tenant shard not found").into(), )); }; + // Migration to a node with unfavorable scheduling score requires a force flag, because it might just + // be migrated back by the optimiser. + if let Some(better_node) = shard.find_better_location::( + scheduler, + &schedule_context, + migrate_req.node_id, + &[], + ) { + if !migrate_req.migration_config.override_scheduler { + return Err(ApiError::PreconditionFailed( + "Migration to a worse-scoring node".into(), + )); + } else { + tracing::info!( + "Migrating to a worse-scoring node {} (optimiser would prefer {better_node})", + migrate_req.node_id + ); + } + } + + if let Some(origin_node_id) = migrate_req.origin_node_id { + if shard.intent.get_attached() != &Some(origin_node_id) { + return Err(ApiError::PreconditionFailed( + format!( + "Migration expected to originate from {} but shard is on {:?}", + origin_node_id, + shard.intent.get_attached() + ) + .into(), + )); + } + } + if shard.intent.get_attached() == &Some(migrate_req.node_id) { // No-op case: we will still proceed to wait for reconciliation in case it is // incomplete from an earlier update to the intent. tracing::info!("Migrating: intent is unchanged {:?}", shard.intent); + + // An instruction to migrate to the currently attached node should + // cancel any pending graceful migration + shard.set_preferred_node(None); + + MigrationOutcome::Reconcile(self.maybe_configured_reconcile_shard( + shard, + nodes, + (&migrate_req.migration_config).into(), + )) + } else if migrate_req.migration_config.prewarm { + MigrationOutcome::Optimization(self.tenant_shard_migrate_with_prewarm( + &migrate_req, + shard, + scheduler, + schedule_context, + )?) } else { - let old_attached = *shard.intent.get_attached(); - - match shard.policy { - PlacementPolicy::Attached(n) => { - // If our new attached node was a secondary, it no longer should be. 
- shard - .intent - .remove_secondary(scheduler, migrate_req.node_id); - - shard - .intent - .set_attached(scheduler, Some(migrate_req.node_id)); - - // If we were already attached to something, demote that to a secondary - if let Some(old_attached) = old_attached { - if n > 0 { - // Remove other secondaries to make room for the location we'll demote - while shard.intent.get_secondary().len() >= n { - shard.intent.pop_secondary(scheduler); - } - - shard.intent.push_secondary(scheduler, old_attached); - } - } - } - PlacementPolicy::Secondary => { - shard.intent.clear(scheduler); - shard.intent.push_secondary(scheduler, migrate_req.node_id); - } - PlacementPolicy::Detached => { - return Err(ApiError::BadRequest(anyhow::anyhow!( - "Cannot migrate a tenant that is PlacementPolicy::Detached: configure it to an attached policy first" - ))); - } - } - - tracing::info!("Migrating: new intent {:?}", shard.intent); - shard.sequence = shard.sequence.next(); + MigrationOutcome::Reconcile(self.tenant_shard_migrate_immediate( + &migrate_req, + nodes, + shard, + scheduler, + )?) } - - let reconciler_config = match migrate_req.migration_config { - Some(cfg) => (&cfg).into(), - None => ReconcilerConfig::new(ReconcilerPriority::High), - }; - - self.maybe_configured_reconcile_shard(shard, nodes, reconciler_config) }; + // We may need to validate + apply an optimisation, or we may need to just retrive a reconcile waiter + let waiter = match outcome { + MigrationOutcome::Optimization(Some(optimization)) => { + // Validate and apply the optimization -- this would happen anyway in background reconcile loop, but + // we might as well do it more promptly as this is a direct external request. + let mut validated = self + .optimize_all_validate(vec![(tenant_shard_id, optimization)]) + .await; + if let Some((_shard_id, optimization)) = validated.pop() { + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, scheduler) = locked.parts_mut(); + let Some(shard) = tenants.get_mut(&tenant_shard_id) else { + // Rare but possible: tenant is removed between generating optimisation and validating it. + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant shard not found").into(), + )); + }; + + if !shard.apply_optimization(scheduler, optimization) { + // This can happen but is unusual enough to warn on: something else changed in the shard that made the optimisation stale + // and therefore not applied. + tracing::warn!( + "Schedule optimisation generated during graceful migration was not applied, shard changed?" + ); + } + self.maybe_configured_reconcile_shard( + shard, + nodes, + (&migrate_req.migration_config).into(), + ) + } else { + None + } + } + MigrationOutcome::Optimization(None) => None, + MigrationOutcome::Reconcile(waiter) => waiter, + }; + + // Finally, wait for any reconcile we started to complete. In the case of immediate-mode migrations to cold + // locations, this has a good chance of timing out. 
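To recap the two paths handled above: a prewarm migration only records a preference and lets the optimiser produce the next step, which is then validated and applied like any background optimisation, while an immediate migration rewrites the intent and goes straight to a reconcile; migrating to the already-attached node just cancels any pending graceful migration. A toy model of that control flow; all types here are made up and stand in for `TenantShard`, `ScheduleOptimization`, and the reconciler waiters:

    /// Toy stand-ins for the shard intent and the two migration outcomes.
    #[derive(Debug, PartialEq)]
    enum Outcome {
        /// Graceful path: hand an optimisation step to the normal validate/apply machinery.
        Optimization(Option<&'static str>),
        /// Immediate path (or no-op): kick off a reconcile right away.
        Reconcile,
    }

    struct Shard {
        attached: u64,
        preferred: Option<u64>,
    }

    fn migrate(shard: &mut Shard, target: u64, prewarm: bool) -> Outcome {
        if shard.attached == target {
            // Already attached to the target: cancel any pending graceful migration, just reconcile.
            shard.preferred = None;
            return Outcome::Reconcile;
        }
        if prewarm {
            // Graceful: only record the preference; the optimiser decides the next step,
            // e.g. creating and warming a secondary on the target before any cutover.
            shard.preferred = Some(target);
            Outcome::Optimization(Some("create secondary on target"))
        } else {
            // Immediate: rewrite the intent now and reconcile (possibly to a cold location).
            shard.attached = target;
            shard.preferred = None;
            Outcome::Reconcile
        }
    }

    fn main() {
        let mut shard = Shard { attached: 1, preferred: None };
        assert_eq!(
            migrate(&mut shard, 2, true),
            Outcome::Optimization(Some("create secondary on target"))
        );
        assert_eq!(shard.preferred, Some(2));
        assert_eq!(migrate(&mut shard, 2, false), Outcome::Reconcile);
        assert_eq!((shard.attached, shard.preferred), (2, None));
    }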
if let Some(waiter) = waiter { waiter.wait_timeout(RECONCILE_TIMEOUT).await?; } else { @@ -5495,7 +6148,9 @@ impl Service { node.get_id(), node.base_url(), self.config.pageserver_jwt_token.as_deref(), - ); + self.config.ssl_ca_cert.clone(), + ) + .map_err(|e| passthrough_api_error(&node, e))?; let scan_result = client .tenant_scan_remote_storage(tenant_id) @@ -6918,6 +7573,10 @@ impl Service { ShardSchedulingPolicy::Active => { // Ok to do optimization } + ShardSchedulingPolicy::Essential if shard.get_preferred_node().is_some() => { + // Ok to do optimization: we are executing a graceful migration that + // has set preferred_node + } ShardSchedulingPolicy::Essential | ShardSchedulingPolicy::Pause | ShardSchedulingPolicy::Stop => { @@ -7133,6 +7792,7 @@ impl Service { .with_client_retries( |client| async move { client.tenant_heatmap_upload(tenant_shard_id).await }, &self.config.pageserver_jwt_token, + &self.config.ssl_ca_cert, 3, 10, SHORT_RECONCILE_TIMEOUT, @@ -7169,6 +7829,7 @@ impl Service { .await }, &self.config.pageserver_jwt_token, + &self.config.ssl_ca_cert, 3, 10, SHORT_RECONCILE_TIMEOUT, @@ -7195,86 +7856,57 @@ impl Service { } } - /// Look for shards which are oversized and in need of splitting + /// Asynchronously split a tenant that's eligible for automatic splits: + /// + /// * The tenant is unsharded. + /// * The logical size of its largest timeline exceeds split_threshold. + /// * The tenant's scheduling policy is active. + /// + /// At most one tenant will be split per call: the one with the largest max logical size. It + /// will split 1 โ†’ 8 shards. + /// + /// TODO: consider splitting based on total logical size rather than max logical size. + /// + /// TODO: consider spawning multiple splits in parallel: this is only called once every 20 + /// seconds, so a large backlog can take a long time, and if a tenant fails to split it will + /// block all other splits. async fn autosplit_tenants(self: &Arc) { let Some(split_threshold) = self.config.split_threshold else { - // Auto-splitting is disabled + return; // auto-splits are disabled + }; + if split_threshold == 0 { return; - }; - - let nodes = self.inner.read().unwrap().nodes.clone(); - - const SPLIT_TO_MAX: ShardCount = ShardCount::new(8); - - let mut top_n = Vec::new(); - - // Call into each node to look for big tenants - let top_n_request = TopTenantShardsRequest { - // We currently split based on logical size, for simplicity: logical size is a signal of - // the user's intent to run a large database, whereas physical/resident size can be symptoms - // of compaction issues. Eventually we should switch to using resident size to bound the - // disk space impact of one shard. 
-            order_by: models::TenantSorting::MaxLogicalSize,
-            limit: 10,
-            where_shards_lt: Some(SPLIT_TO_MAX),
-            where_gt: Some(split_threshold),
-        };
-        for node in nodes.values() {
-            let request_ref = &top_n_request;
-            match node
-                .with_client_retries(
-                    |client| async move {
-                        let request = request_ref.clone();
-                        client.top_tenant_shards(request.clone()).await
-                    },
-                    &self.config.pageserver_jwt_token,
-                    3,
-                    3,
-                    Duration::from_secs(5),
-                    &self.cancel,
-                )
-                .await
-            {
-                Some(Ok(node_top_n)) => {
-                    top_n.extend(node_top_n.shards.into_iter());
-                }
-                Some(Err(mgmt_api::Error::Cancelled)) => {
-                    continue;
-                }
-                Some(Err(e)) => {
-                    tracing::warn!("Failed to fetch top N tenants from {node}: {e}");
-                    continue;
-                }
-                None => {
-                    // Node is shutting down
-                    continue;
-                }
-            };
        }
-        // Pick the biggest tenant to split first
-        top_n.sort_by_key(|i| i.resident_size);
+        // Fetch the largest eligible shards by logical size.
+        const MAX_SHARDS: ShardCount = ShardCount::new(8);
-        // Filter out tenants in a prohibiting scheduling mode
+        let mut top_n = self
+            .get_top_tenant_shards(&TopTenantShardsRequest {
+                order_by: TenantSorting::MaxLogicalSize,
+                limit: 10,
+                where_shards_lt: Some(MAX_SHARDS),
+                where_gt: Some(split_threshold),
+            })
+            .await;
+
+        // Filter out tenants in a prohibiting scheduling mode.
        {
-            let locked = self.inner.read().unwrap();
+            let state = self.inner.read().unwrap();
            top_n.retain(|i| {
-                if let Some(shard) = locked.tenants.get(&i.id) {
-                    matches!(shard.get_scheduling_policy(), ShardSchedulingPolicy::Active)
-                } else {
-                    false
-                }
+                let policy = state.tenants.get(&i.id).map(|s| s.get_scheduling_policy());
+                policy == Some(ShardSchedulingPolicy::Active)
            });
        }
        let Some(split_candidate) = top_n.into_iter().next() else {
-            tracing::debug!("No split-elegible shards found");
+            debug!("No split-eligible shards found");
            return;
        };
-        // We spawn a task to run this, so it's exactly like some external API client requesting it. We don't
-        // want to block the background reconcile loop on this.
-        tracing::info!(
+        // We spawn a task to run this, so it's exactly like some external API client requesting it.
+        // We don't want to block the background reconcile loop on this.
+        info!(
            "Auto-splitting tenant for size threshold {split_threshold}: current size {split_candidate:?}"
        );
@@ -7285,29 +7917,71 @@ impl Service {
                .tenant_shard_split(
                    split_candidate.id.tenant_id,
                    TenantShardSplitRequest {
-                        // Always split to the max number of shards: this avoids stepping through
-                        // intervening shard counts and encountering the overrhead of a split+cleanup
-                        // each time as a tenant grows, and is not too expensive because our max shard
-                        // count is relatively low anyway.
-                        // This policy will be adjusted in future once we support higher shard count.
+                        // Always split to the max number of shards: this avoids stepping
+                        // through intervening shard counts and encountering the overhead of a
+                        // split+cleanup each time as a tenant grows, and is not too expensive
+                        // because our max shard count is relatively low anyway. This policy
+                        // will be adjusted in future once we support higher shard count.
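The candidate selection above now leans on get_top_tenant_shards (defined just below): every pageserver is queried concurrently, per-node failures are logged and skipped, and the merged list comes back sorted by max_logical_size in descending order, so the autosplit loop only has to filter for the Active scheduling policy and take the head. A hedged asyncio sketch of that fan-out and selection, with toy types standing in for the real client:

import asyncio
from dataclasses import dataclass

@dataclass
class TopShard:
    id: str
    max_logical_size: int

async def fetch_top(node) -> list[TopShard]:
    # Stand-in for node.with_client_retries(top_tenant_shards, ...): errors are logged and ignored.
    try:
        return await node.top_tenant_shards()
    except Exception as exc:
        print(f"failed to fetch top tenants: {exc}")
        return []

async def get_top_tenant_shards(nodes) -> list[TopShard]:
    per_node = await asyncio.gather(*(fetch_top(n) for n in nodes))
    merged = [shard for shards in per_node for shard in shards]
    merged.sort(key=lambda s: s.max_logical_size, reverse=True)
    return merged

def pick_split_candidate(top: list[TopShard], policies: dict[str, str], threshold: int) -> TopShard | None:
    # Keep only Active tenants above the size threshold, then take the largest.
    eligible = [s for s in top if policies.get(s.id) == "Active" and s.max_logical_size > threshold]
    return eligible[0] if eligible else None

class FakeNode:
    def __init__(self, shards: list[TopShard]):
        self._shards = shards
    async def top_tenant_shards(self) -> list[TopShard]:
        return self._shards

if __name__ == "__main__":
    nodes = [FakeNode([TopShard("t1", 900)]), FakeNode([TopShard("t2", 1500)])]
    top = asyncio.run(get_top_tenant_shards(nodes))
    print(pick_split_candidate(top, {"t1": "Active", "t2": "Active"}, threshold=1000))  # -> t2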
+ new_shard_count: MAX_SHARDS.literal(), new_stripe_size: Some(ShardParameters::DEFAULT_STRIPE_SIZE), }, ) .await { - Ok(_) => { - tracing::info!("Successful auto-split"); - } - Err(e) => { - tracing::error!("Auto-split failed: {e}"); - } + Ok(_) => info!("Successful auto-split"), + Err(err) => error!("Auto-split failed: {err}"), } } - .instrument(tracing::info_span!("auto_split", tenant_id=%split_candidate.id.tenant_id)), + .instrument(info_span!("auto_split", tenant_id=%split_candidate.id.tenant_id)), ); } + /// Fetches the top tenant shards from every node, in descending order of + /// max logical size. Any node errors will be logged and ignored. + async fn get_top_tenant_shards( + &self, + request: &TopTenantShardsRequest, + ) -> Vec { + let nodes = self + .inner + .read() + .unwrap() + .nodes + .values() + .cloned() + .collect_vec(); + + let mut futures = FuturesUnordered::new(); + for node in nodes { + futures.push(async move { + node.with_client_retries( + |client| async move { client.top_tenant_shards(request.clone()).await }, + &self.config.pageserver_jwt_token, + &self.config.ssl_ca_cert, + 3, + 3, + Duration::from_secs(5), + &self.cancel, + ) + .await + }); + } + + let mut top = Vec::new(); + while let Some(output) = futures.next().await { + match output { + Some(Ok(response)) => top.extend(response.shards), + Some(Err(mgmt_api::Error::Cancelled)) => {} + Some(Err(err)) => warn!("failed to fetch top tenants: {err}"), + None => {} // node is shutting down + } + } + + top.sort_by_key(|i| i.max_logical_size); + top.reverse(); + top + } + /// Useful for tests: run whatever work a background [`Self::reconcile_all`] would have done, but /// also wait for any generated Reconcilers to complete. Calling this until it returns zero should /// put the system into a quiescent state where future background reconciliations won't do anything. @@ -7403,6 +8077,7 @@ impl Service { .with_client_retries( |client| async move { client.tenant_secondary_status(tenant_shard_id).await }, &self.config.pageserver_jwt_token, + &self.config.ssl_ca_cert, 1, 3, Duration::from_millis(250), @@ -7991,6 +8666,68 @@ impl Service { global_observed } + /// Choose safekeepers for the new timeline: 3 in different azs. + pub(crate) async fn safekeepers_for_new_timeline( + &self, + ) -> Result, ApiError> { + let mut all_safekeepers = { + let locked = self.inner.read().unwrap(); + locked + .safekeepers + .iter() + .filter_map(|sk| { + if sk.1.scheduling_policy() != SkSchedulingPolicy::Active { + // If we don't want to schedule stuff onto the safekeeper, respect that. + return None; + } + let utilization_opt = if let SafekeeperState::Available { + last_seen_at: _, + utilization, + } = sk.1.availability() + { + Some(utilization) + } else { + // non-available safekeepers still get a chance for new timelines, + // but put them last in the list. 
+ None + }; + let info = SafekeeperInfo { + hostname: sk.1.skp.host.clone(), + id: NodeId(sk.1.skp.id as u64), + }; + Some((utilization_opt, info, sk.1.skp.availability_zone_id.clone())) + }) + .collect::>() + }; + all_safekeepers.sort_by_key(|sk| { + ( + sk.0.as_ref() + .map(|ut| ut.timeline_count) + .unwrap_or(u64::MAX), + // Use the id to decide on equal scores for reliability + sk.1.id.0, + ) + }); + let mut sks = Vec::new(); + let mut azs = HashSet::new(); + for (_sk_util, sk_info, az_id) in all_safekeepers.iter() { + if !azs.insert(az_id) { + continue; + } + sks.push(sk_info.clone()); + if sks.len() == 3 { + break; + } + } + if sks.len() == 3 { + Ok(sks) + } else { + Err(ApiError::InternalServerError(anyhow::anyhow!( + "couldn't find three safekeepers in different AZs for new timeline" + ))) + } + } + pub(crate) async fn safekeepers_list( &self, ) -> Result, DatabaseError> { @@ -8019,24 +8756,41 @@ impl Service { pub(crate) async fn upsert_safekeeper( &self, record: crate::persistence::SafekeeperUpsert, - ) -> Result<(), DatabaseError> { + ) -> Result<(), ApiError> { let node_id = NodeId(record.id as u64); + let use_https = self.config.use_https_safekeeper_api; + + if use_https && record.https_port.is_none() { + return Err(ApiError::PreconditionFailed( + format!( + "cannot upsert safekeeper {node_id}: \ + https is enabled, but https port is not specified" + ) + .into(), + )); + } + self.persistence.safekeeper_upsert(record.clone()).await?; { let mut locked = self.inner.write().unwrap(); let mut safekeepers = (*locked.safekeepers).clone(); match safekeepers.entry(node_id) { - std::collections::hash_map::Entry::Occupied(mut entry) => { - entry.get_mut().update_from_record(record); - } + std::collections::hash_map::Entry::Occupied(mut entry) => entry + .get_mut() + .update_from_record(record) + .expect("all preconditions should be checked before upsert to database"), std::collections::hash_map::Entry::Vacant(entry) => { - entry.insert(Safekeeper::from_persistence( - crate::persistence::SafekeeperPersistence::from_upsert( - record, - SkSchedulingPolicy::Pause, - ), - CancellationToken::new(), - )); + entry.insert( + Safekeeper::from_persistence( + crate::persistence::SafekeeperPersistence::from_upsert( + record, + SkSchedulingPolicy::Pause, + ), + CancellationToken::new(), + use_https, + ) + .expect("all preconditions should be checked before upsert to database"), + ); } } locked.safekeepers = Arc::new(safekeepers); @@ -8062,6 +8816,13 @@ impl Service { .ok_or(DatabaseError::Logical("Not found".to_string()))?; sk.set_scheduling_policy(scheduling_policy); + match scheduling_policy { + SkSchedulingPolicy::Active => (), + SkSchedulingPolicy::Decomissioned | SkSchedulingPolicy::Pause => { + locked.safekeeper_reconcilers.cancel_safekeeper(node_id); + } + } + locked.safekeepers = Arc::new(safekeepers); } Ok(()) @@ -8085,10 +8846,11 @@ impl Service { let mut updated_in_mem_and_db = Vec::default(); let mut locked = self.inner.write().unwrap(); + let state = locked.deref_mut(); for (tid, az_id) in updated { - let shard = locked.tenants.get_mut(&tid); + let shard = state.tenants.get_mut(&tid); if let Some(shard) = shard { - shard.set_preferred_az(az_id); + shard.set_preferred_az(&mut state.scheduler, az_id); updated_in_mem_and_db.push(tid); } } diff --git a/storage_controller/src/service/chaos_injector.rs b/storage_controller/src/service/chaos_injector.rs index 2ff68d7037..a0419e0205 100644 --- a/storage_controller/src/service/chaos_injector.rs +++ 
b/storage_controller/src/service/chaos_injector.rs @@ -46,48 +46,51 @@ impl ChaosInjector { } } + fn get_cron_interval_sleep_future(&self) -> Option { + if let Some(ref chaos_exit_crontab) = self.chaos_exit_crontab { + match cron_to_next_duration(chaos_exit_crontab) { + Ok(interval_exit) => Some(interval_exit), + Err(e) => { + tracing::error!("Error processing the cron schedule: {e}"); + None + } + } + } else { + None + } + } + pub async fn run(&mut self, cancel: CancellationToken) { let mut interval = tokio::time::interval(self.interval); - let cron_interval = { - if let Some(ref chaos_exit_crontab) = self.chaos_exit_crontab { - match cron_to_next_duration(chaos_exit_crontab) { - Ok(interval_exit) => Some(interval_exit), - Err(e) => { - tracing::error!("Error processing the cron schedule: {e}"); - None - } - } - } else { - None - } - }; + #[derive(Debug)] enum ChaosEvent { ShuffleTenant, ForceKill, } - let chaos_type = tokio::select! { - _ = interval.tick() => { - ChaosEvent::ShuffleTenant - } - Some(_) = maybe_sleep(cron_interval) => { - ChaosEvent::ForceKill - } - _ = cancel.cancelled() => { - tracing::info!("Shutting down"); - return; - } - }; - - match chaos_type { - ChaosEvent::ShuffleTenant => { - self.inject_chaos().await; - } - ChaosEvent::ForceKill => { - self.force_kill().await; + loop { + let cron_interval = self.get_cron_interval_sleep_future(); + let chaos_type = tokio::select! { + _ = interval.tick() => { + ChaosEvent::ShuffleTenant + } + Some(_) = maybe_sleep(cron_interval) => { + ChaosEvent::ForceKill + } + _ = cancel.cancelled() => { + tracing::info!("Shutting down"); + return; + } + }; + tracing::info!("Chaos iteration: {chaos_type:?}..."); + match chaos_type { + ChaosEvent::ShuffleTenant => { + self.inject_chaos().await; + } + ChaosEvent::ForceKill => { + self.force_kill().await; + } } } - - tracing::info!("Chaos iteration..."); } /// If a shard has a secondary and attached location, then re-assign the secondary to be diff --git a/storage_controller/src/service/safekeeper_reconciler.rs b/storage_controller/src/service/safekeeper_reconciler.rs new file mode 100644 index 0000000000..4fa465c307 --- /dev/null +++ b/storage_controller/src/service/safekeeper_reconciler.rs @@ -0,0 +1,340 @@ +use std::{collections::HashMap, str::FromStr, sync::Arc, time::Duration}; + +use clashmap::{ClashMap, Entry}; +use safekeeper_api::models::PullTimelineRequest; +use safekeeper_client::mgmt_api; +use tokio::sync::mpsc::{self, UnboundedReceiver, UnboundedSender}; +use tokio_util::sync::CancellationToken; +use tracing::Instrument; +use utils::{ + id::{NodeId, TenantId, TimelineId}, + logging::SecretString, +}; + +use crate::{ + persistence::SafekeeperTimelineOpKind, safekeeper::Safekeeper, + safekeeper_client::SafekeeperClient, +}; + +use super::Service; + +pub(crate) struct SafekeeperReconcilers { + cancel: CancellationToken, + reconcilers: HashMap, +} + +impl SafekeeperReconcilers { + pub fn new(cancel: CancellationToken) -> Self { + SafekeeperReconcilers { + cancel, + reconcilers: HashMap::new(), + } + } + pub(crate) fn schedule_request_vec( + &mut self, + service: &Arc, + reqs: Vec, + ) { + for req in reqs { + self.schedule_request(service, req); + } + } + pub(crate) fn schedule_request(&mut self, service: &Arc, req: ScheduleRequest) { + let node_id = req.safekeeper.get_id(); + let reconciler_handle = self.reconcilers.entry(node_id).or_insert_with(|| { + SafekeeperReconciler::spawn(self.cancel.child_token(), service.clone()) + }); + reconciler_handle.schedule_reconcile(req); + } + 
pub(crate) fn cancel_safekeeper(&mut self, node_id: NodeId) { + if let Some(handle) = self.reconcilers.remove(&node_id) { + handle.cancel.cancel(); + } + } +} + +/// Initial load of the pending operations from the db +pub(crate) async fn load_schedule_requests( + service: &Arc, + safekeepers: &HashMap, +) -> anyhow::Result> { + let pending_ops = service.persistence.list_pending_ops(None).await?; + let mut res = Vec::with_capacity(pending_ops.len()); + for op_persist in pending_ops { + let node_id = NodeId(op_persist.sk_id as u64); + let Some(sk) = safekeepers.get(&node_id) else { + // This shouldn't happen, at least the safekeeper should exist as decomissioned. + tracing::warn!( + tenant_id = op_persist.tenant_id, + timeline_id = op_persist.timeline_id, + "couldn't find safekeeper with pending op id {node_id} in list of stored safekeepers" + ); + continue; + }; + let sk = Box::new(sk.clone()); + let tenant_id = TenantId::from_str(&op_persist.tenant_id)?; + let timeline_id = TimelineId::from_str(&op_persist.timeline_id)?; + let host_list = match op_persist.op_kind { + SafekeeperTimelineOpKind::Delete => Vec::new(), + SafekeeperTimelineOpKind::Exclude => Vec::new(), + SafekeeperTimelineOpKind::Pull => { + // TODO this code is super hacky, it doesn't take migrations into account + let timeline_persist = service + .persistence + .get_timeline(tenant_id, timeline_id) + .await?; + let Some(timeline_persist) = timeline_persist else { + // This shouldn't happen, the timeline should still exist + tracing::warn!( + tenant_id = op_persist.tenant_id, + timeline_id = op_persist.timeline_id, + "couldn't find timeline for corresponding pull op" + ); + continue; + }; + timeline_persist + .sk_set + .iter() + .filter_map(|sk_id| { + let other_node_id = NodeId(*sk_id as u64); + if node_id == other_node_id { + // We obviously don't want to pull from ourselves + return None; + } + let Some(sk) = safekeepers.get(&other_node_id) else { + tracing::warn!( + "couldnt find safekeeper with pending op id {other_node_id}, not pulling from it" + ); + return None; + }; + Some((other_node_id, sk.base_url())) + }) + .collect::>() + } + }; + let req = ScheduleRequest { + safekeeper: sk, + host_list, + tenant_id, + timeline_id, + generation: op_persist.generation as u32, + kind: op_persist.op_kind, + }; + res.push(req); + } + Ok(res) +} + +pub(crate) struct ScheduleRequest { + pub(crate) safekeeper: Box, + pub(crate) host_list: Vec<(NodeId, String)>, + pub(crate) tenant_id: TenantId, + pub(crate) timeline_id: TimelineId, + pub(crate) generation: u32, + pub(crate) kind: SafekeeperTimelineOpKind, +} + +struct ReconcilerHandle { + tx: UnboundedSender<(ScheduleRequest, Arc)>, + ongoing_tokens: Arc>>, + cancel: CancellationToken, +} + +impl ReconcilerHandle { + /// Obtain a new token slot, cancelling any existing reconciliations for that timeline + fn new_token_slot( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> Arc { + let entry = self.ongoing_tokens.entry((tenant_id, timeline_id)); + if let Entry::Occupied(entry) = &entry { + let cancel: &CancellationToken = entry.get(); + cancel.cancel(); + } + entry.insert(Arc::new(self.cancel.child_token())).clone() + } + fn schedule_reconcile(&self, req: ScheduleRequest) { + let cancel = self.new_token_slot(req.tenant_id, req.timeline_id); + let hostname = req.safekeeper.skp.host.clone(); + if let Err(err) = self.tx.send((req, cancel)) { + tracing::info!("scheduling request onto {hostname} returned error: {err}"); + } + } +} + +pub(crate) struct SafekeeperReconciler { + 
service: Arc, + rx: UnboundedReceiver<(ScheduleRequest, Arc)>, + cancel: CancellationToken, +} + +impl SafekeeperReconciler { + fn spawn(cancel: CancellationToken, service: Arc) -> ReconcilerHandle { + // We hold the ServiceInner lock so we don't want to make sending to the reconciler channel to be blocking. + let (tx, rx) = mpsc::unbounded_channel(); + let mut reconciler = SafekeeperReconciler { + service, + rx, + cancel: cancel.clone(), + }; + let handle = ReconcilerHandle { + tx, + ongoing_tokens: Arc::new(ClashMap::new()), + cancel, + }; + tokio::spawn(async move { reconciler.run().await }); + handle + } + async fn run(&mut self) { + loop { + // TODO add parallelism with semaphore here + let req = tokio::select! { + req = self.rx.recv() => req, + _ = self.cancel.cancelled() => break, + }; + let Some((req, req_cancel)) = req else { break }; + if req_cancel.is_cancelled() { + continue; + } + + let kind = req.kind; + let tenant_id = req.tenant_id; + let timeline_id = req.timeline_id; + self.reconcile_one(req, req_cancel) + .instrument(tracing::info_span!( + "reconcile_one", + ?kind, + %tenant_id, + %timeline_id + )) + .await; + } + } + async fn reconcile_one(&self, req: ScheduleRequest, req_cancel: Arc) { + let req_host = req.safekeeper.skp.host.clone(); + match req.kind { + SafekeeperTimelineOpKind::Pull => { + let our_id = req.safekeeper.get_id(); + let http_hosts = req + .host_list + .iter() + .filter(|(node_id, _hostname)| *node_id != our_id) + .map(|(_, hostname)| hostname.clone()) + .collect::>(); + let pull_req = PullTimelineRequest { + http_hosts, + tenant_id: req.tenant_id, + timeline_id: req.timeline_id, + }; + self.reconcile_inner( + req, + async |client| client.pull_timeline(&pull_req).await, + |resp| { + tracing::info!( + "pulled timeline from {} onto {req_host}", + resp.safekeeper_host, + ); + }, + req_cancel, + ) + .await; + } + SafekeeperTimelineOpKind::Exclude => { + // TODO actually exclude instead of delete here + let tenant_id = req.tenant_id; + let timeline_id = req.timeline_id; + self.reconcile_inner( + req, + async |client| client.delete_timeline(tenant_id, timeline_id).await, + |_resp| { + tracing::info!("deleted timeline from {req_host}"); + }, + req_cancel, + ) + .await; + } + SafekeeperTimelineOpKind::Delete => { + let tenant_id = req.tenant_id; + let timeline_id = req.timeline_id; + self.reconcile_inner( + req, + async |client| client.delete_timeline(tenant_id, timeline_id).await, + |_resp| { + tracing::info!("deleted timeline from {req_host}"); + }, + req_cancel, + ) + .await; + } + } + } + async fn reconcile_inner( + &self, + req: ScheduleRequest, + closure: impl Fn(SafekeeperClient) -> F, + log_success: impl FnOnce(T) -> U, + req_cancel: Arc, + ) where + F: Future>, + { + let jwt = self + .service + .config + .safekeeper_jwt_token + .clone() + .map(SecretString::from); + let ssl_ca_cert = self.service.config.ssl_ca_cert.clone(); + loop { + let res = req + .safekeeper + .with_client_retries( + |client| { + let closure = &closure; + async move { closure(client).await } + }, + &jwt, + &ssl_ca_cert, + 3, + 10, + Duration::from_secs(10), + &req_cancel, + ) + .await; + match res { + Ok(resp) => { + log_success(resp); + let res = self + .service + .persistence + .remove_pending_op( + req.tenant_id, + req.timeline_id, + req.safekeeper.get_id(), + req.generation, + ) + .await; + if let Err(err) = res { + tracing::info!( + "couldn't remove reconciliation request onto {} from persistence: {err:?}", + req.safekeeper.skp.host + ); + } + return; + } + 
Err(mgmt_api::Error::Cancelled) => { + // On cancellation, the code that issued it will take care of removing db entries (if needed) + return; + } + Err(e) => { + tracing::info!( + "Reconcile attempt for safekeeper {} failed, retrying after sleep: {e:?}", + req.safekeeper.skp.host + ); + const SLEEP_TIME: Duration = Duration::from_secs(1); + tokio::time::sleep(SLEEP_TIME).await; + } + } + } + } +} diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index 34fd244023..80f42e04a9 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -132,6 +132,10 @@ pub(crate) struct TenantShard { /// of state that we publish externally in an eventually consistent way. pub(crate) pending_compute_notification: bool, + /// To do a graceful migration, set this field to the destination pageserver, and optimization + /// functions will consider this node the best location and react appropriately. + preferred_node: Option, + // Support/debug tool: if something is going wrong or flapping with scheduling, this may // be set to a non-active state to avoid making changes while the issue is fixed. scheduling_policy: ShardSchedulingPolicy, @@ -324,6 +328,37 @@ impl IntentState { false } } + + pub(crate) fn set_preferred_az( + &mut self, + scheduler: &mut Scheduler, + preferred_az: Option, + ) { + let new_az = preferred_az.as_ref(); + let old_az = self.preferred_az_id.as_ref(); + + if old_az != new_az { + if let Some(node_id) = self.attached { + scheduler.update_node_ref_counts( + node_id, + new_az, + RefCountUpdate::ChangePreferredAzFrom(old_az), + ); + } + for node_id in &self.secondary { + scheduler.update_node_ref_counts( + *node_id, + new_az, + RefCountUpdate::ChangePreferredAzFrom(old_az), + ); + } + self.preferred_az_id = preferred_az; + } + } + + pub(crate) fn get_preferred_az(&self) -> Option<&AvailabilityZone> { + self.preferred_az_id.as_ref() + } } impl Drop for IntentState { @@ -555,6 +590,7 @@ impl TenantShard { last_error: Arc::default(), pending_compute_notification: false, scheduling_policy: ShardSchedulingPolicy::default(), + preferred_node: None, } } @@ -809,6 +845,15 @@ impl TenantShard { return None; }; + // If the candidate is our preferred node, then it is better than the current location, as long + // as it is online -- the online check is part of the score calculation we did above, so it's + // important that this check comes after that one. 
+ if let Some(preferred) = self.preferred_node.as_ref() { + if preferred == &candidate { + return Some(true); + } + } + match scheduler.compute_node_score::( current, &self.intent.preferred_az_id, @@ -847,13 +892,22 @@ impl TenantShard { } } - fn find_better_location( + pub(crate) fn find_better_location( &self, scheduler: &mut Scheduler, schedule_context: &ScheduleContext, current: NodeId, hard_exclude: &[NodeId], ) -> Option { + // If we have a migration hint, then that is our better location + if let Some(hint) = self.preferred_node.as_ref() { + if hint == ¤t { + return None; + } + + return Some(*hint); + } + // Look for a lower-scoring location to attach to let Ok(candidate_node) = scheduler.schedule_shard::( hard_exclude, @@ -887,6 +941,13 @@ impl TenantShard { scheduler: &mut Scheduler, schedule_context: &ScheduleContext, ) -> bool { + // Tenant with preferred node: check if it is not already at the preferred node + if let Some(preferred) = self.preferred_node.as_ref() { + if Some(preferred) != self.intent.get_attached().as_ref() { + return true; + } + } + // Sharded tenant: check if any locations have a nonzero affinity score if self.shard.count >= ShardCount(1) { let schedule_context = schedule_context.project_detach(self); @@ -927,6 +988,9 @@ impl TenantShard { /// Optimize attachments: if a shard has a secondary location that is preferable to /// its primary location based on soft constraints, switch that secondary location /// to be attached. + /// + /// `schedule_context` should have been populated with all shards in the tenant, including + /// the one we're trying to optimize (this function will subtract its own contribution before making scoring decisions) #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))] pub(crate) fn optimize_attachment( &self, @@ -1055,7 +1119,8 @@ impl TenantShard { // // This should be a transient state, there should always be capacity eventually in our preferred AZ (even if nodes // there are too overloaded for scheduler to suggest them, more should be provisioned eventually). - if self.intent.preferred_az_id.is_some() + if self.preferred_node.is_none() + && self.intent.preferred_az_id.is_some() && scheduler.get_node_az(&replacement) != self.intent.preferred_az_id { tracing::debug!( @@ -1161,6 +1226,27 @@ impl TenantShard { None } + /// Start or abort a graceful migration of this shard to another pageserver. This works on top of the + /// other optimisation functions, to bias them to move to the destination node. + pub(crate) fn set_preferred_node(&mut self, node: Option) { + if let Some(hint) = self.preferred_node.as_ref() { + if Some(hint) != node.as_ref() { + // This is legal but a bit surprising: we expect that administrators wouldn't usually + // change their mind about where to migrate something. 
+ tracing::warn!( + "Changing migration destination from {hint} to {node:?} (current intent {:?})", + self.intent + ); + } + } + + self.preferred_node = node; + } + + pub(crate) fn get_preferred_node(&self) -> Option { + self.preferred_node + } + /// Return true if the optimization was really applied: it will not be applied if the optimization's /// sequence is behind this tenant shard's pub(crate) fn apply_optimization( @@ -1185,6 +1271,14 @@ impl TenantShard { self.intent.demote_attached(scheduler, old_attached_node_id); self.intent .promote_attached(scheduler, new_attached_node_id); + + if let Some(hint) = self.preferred_node.as_ref() { + if hint == &new_attached_node_id { + // The migration target is not a long term pin: once we are done with the migration, clear it. + tracing::info!("Graceful migration to {hint} complete"); + self.preferred_node = None; + } + } } ScheduleOptimizationAction::ReplaceSecondary(ReplaceSecondary { old_node_id, @@ -1703,6 +1797,10 @@ impl TenantShard { debug_assert!(!self.intent.all_pageservers().contains(&node_id)); + if self.preferred_node == Some(node_id) { + self.preferred_node = None; + } + intent_modified } @@ -1710,8 +1808,8 @@ impl TenantShard { self.scheduling_policy = p; } - pub(crate) fn get_scheduling_policy(&self) -> &ShardSchedulingPolicy { - &self.scheduling_policy + pub(crate) fn get_scheduling_policy(&self) -> ShardSchedulingPolicy { + self.scheduling_policy } pub(crate) fn set_last_error(&mut self, sequence: Sequence, error: ReconcileError) { @@ -1750,6 +1848,7 @@ impl TenantShard { pending_compute_notification: false, delayed_reconcile: false, scheduling_policy: serde_json::from_str(&tsp.scheduling_policy).unwrap(), + preferred_node: None, }) } @@ -1770,11 +1869,15 @@ impl TenantShard { } pub(crate) fn preferred_az(&self) -> Option<&AvailabilityZone> { - self.intent.preferred_az_id.as_ref() + self.intent.get_preferred_az() } - pub(crate) fn set_preferred_az(&mut self, preferred_az_id: Option) { - self.intent.preferred_az_id = preferred_az_id; + pub(crate) fn set_preferred_az( + &mut self, + scheduler: &mut Scheduler, + preferred_az_id: Option, + ) { + self.intent.set_preferred_az(scheduler, preferred_az_id); } /// Returns all the nodes to which this tenant shard is attached according to the @@ -2270,6 +2373,85 @@ pub(crate) mod tests { Ok(()) } + #[test] + /// How the optimisation code handles a shard with a preferred node set; this is an example + /// of the multi-step migration, but driven by a different input. 
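The unit test that follows exercises exactly this: setting preferred_node makes the optimiser treat that pageserver as the best location until the attachment actually lands there, at which point the hint clears itself. A condensed, hypothetical Python model of those semantics (not the real scheduler types):

class ShardHint:
    """Toy model of the preferred_node hint used for graceful migration."""
    def __init__(self, attached: int):
        self.attached = attached
        self.preferred_node: int | None = None

    def find_better_location(self, scored_best: int) -> int | None:
        # The hint wins over normal scoring; once we are already there, no move is suggested.
        if self.preferred_node is not None:
            return None if self.preferred_node == self.attached else self.preferred_node
        return scored_best if scored_best != self.attached else None

    def apply_cutover(self, new_attached: int) -> None:
        self.attached = new_attached
        if self.preferred_node == new_attached:
            # The migration target is not a long-term pin: clear it once reached.
            self.preferred_node = None

# Node 2 is requested as the target even though node 1 scores just as well.
shard = ShardHint(attached=1)
shard.preferred_node = 2
assert shard.find_better_location(scored_best=1) == 2
shard.apply_cutover(2)
assert shard.preferred_node is None
assert shard.find_better_location(scored_best=2) is None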
+ fn optimize_attachment_multi_preferred_node() -> anyhow::Result<()> { + let nodes = make_test_nodes( + 4, + &[ + AvailabilityZone("az-a".to_string()), + AvailabilityZone("az-a".to_string()), + AvailabilityZone("az-b".to_string()), + AvailabilityZone("az-b".to_string()), + ], + ); + let mut scheduler = Scheduler::new(nodes.values()); + + // Two shards of a tenant that wants to be in AZ A + let mut shard_a = make_test_tenant_shard(PlacementPolicy::Attached(1)); + shard_a.intent.preferred_az_id = Some(AvailabilityZone("az-a".to_string())); + + // Initially attached in a stable location + shard_a.intent.set_attached(&mut scheduler, Some(NodeId(1))); + shard_a.intent.push_secondary(&mut scheduler, NodeId(3)); + + // Set the preferred node to node 2, an equally high scoring node to its current location + shard_a.preferred_node = Some(NodeId(2)); + + fn make_schedule_context(shard_a: &TenantShard) -> ScheduleContext { + let mut schedule_context = ScheduleContext::default(); + schedule_context.avoid(&shard_a.intent.all_pageservers()); + schedule_context + } + + let schedule_context = make_schedule_context(&shard_a); + let optimization_a_prepare = shard_a.optimize_attachment(&mut scheduler, &schedule_context); + assert_eq!( + optimization_a_prepare, + Some(ScheduleOptimization { + sequence: shard_a.sequence, + action: ScheduleOptimizationAction::CreateSecondary(NodeId(2)) + }) + ); + shard_a.apply_optimization(&mut scheduler, optimization_a_prepare.unwrap()); + + // The first step of the optimisation should not have cleared the preferred node + assert_eq!(shard_a.preferred_node, Some(NodeId(2))); + + let schedule_context = make_schedule_context(&shard_a); + let optimization_a_migrate = shard_a.optimize_attachment(&mut scheduler, &schedule_context); + assert_eq!( + optimization_a_migrate, + Some(ScheduleOptimization { + sequence: shard_a.sequence, + action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { + old_attached_node_id: NodeId(1), + new_attached_node_id: NodeId(2) + }) + }) + ); + shard_a.apply_optimization(&mut scheduler, optimization_a_migrate.unwrap()); + + // The cutover step of the optimisation should have cleared the preferred node + assert_eq!(shard_a.preferred_node, None); + + let schedule_context = make_schedule_context(&shard_a); + let optimization_a_cleanup = shard_a.optimize_attachment(&mut scheduler, &schedule_context); + assert_eq!( + optimization_a_cleanup, + Some(ScheduleOptimization { + sequence: shard_a.sequence, + action: ScheduleOptimizationAction::RemoveSecondary(NodeId(1)) + }) + ); + shard_a.apply_optimization(&mut scheduler, optimization_a_cleanup.unwrap()); + + shard_a.intent.clear(&mut scheduler); + + Ok(()) + } + #[test] /// Check that multi-step migration works when moving to somewhere that is only better by /// 1 AffinityScore -- this ensures that we don't have a bug like the intermediate secondary diff --git a/test_runner/fixtures/endpoint/http.py b/test_runner/fixtures/endpoint/http.py index cdc162fca2..9b28246f58 100644 --- a/test_runner/fixtures/endpoint/http.py +++ b/test_runner/fixtures/endpoint/http.py @@ -53,6 +53,18 @@ class EndpointHttpClient(requests.Session): res.raise_for_status() return res.text + # Current compute status. + def status(self): + res = self.get(f"http://localhost:{self.external_port}/status") + res.raise_for_status() + return res.json() + + # Compute startup-related metrics. 
+ def metrics_json(self): + res = self.get(f"http://localhost:{self.external_port}/metrics.json") + res.raise_for_status() + return res.json() + def configure_failpoints(self, *args: tuple[str, str]) -> None: body: list[dict[str, str]] = [] diff --git a/test_runner/fixtures/neon_cli.py b/test_runner/fixtures/neon_cli.py index 97a5a36814..6e53987e7c 100644 --- a/test_runner/fixtures/neon_cli.py +++ b/test_runner/fixtures/neon_cli.py @@ -525,12 +525,14 @@ class NeonLocalCli(AbstractNeonCli): def endpoint_start( self, endpoint_id: str, + safekeepers_generation: int | None = None, safekeepers: list[int] | None = None, remote_ext_config: str | None = None, pageserver_id: int | None = None, allow_multiple: bool = False, create_test_user: bool = False, basebackup_request_tries: int | None = None, + timeout: str | None = None, env: dict[str, str] | None = None, ) -> subprocess.CompletedProcess[str]: args = [ @@ -543,6 +545,8 @@ class NeonLocalCli(AbstractNeonCli): if remote_ext_config is not None: args.extend(["--remote-ext-config", remote_ext_config]) + if safekeepers_generation is not None: + args.extend(["--safekeepers-generation", str(safekeepers_generation)]) if safekeepers is not None: args.extend(["--safekeepers", (",".join(map(str, safekeepers)))]) if endpoint_id is not None: @@ -553,6 +557,8 @@ class NeonLocalCli(AbstractNeonCli): args.extend(["--allow-multiple"]) if create_test_user: args.extend(["--create-test-user"]) + if timeout is not None: + args.extend(["--start-timeout", str(timeout)]) res = self.raw_cli(args, extra_env_vars) res.check_returncode() diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 1d282971b1..7bc746d668 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -253,10 +253,15 @@ class PgProtocol: # enough for our tests, but if you need a longer, you can # change it by calling "SET statement_timeout" after # connecting. + # pooler does not support statement_timeout + # Check if the hostname contains the string 'pooler' + hostname = result.get("host", "") + log.info(f"Hostname: {hostname}") options = result.get("options", "") - if "statement_timeout" not in options: + if "statement_timeout" not in options and "pooler" not in hostname: options = f"-cstatement_timeout=120s {options}" result["options"] = options + return result # autocommit=True here by default because that's what we need most of the time @@ -458,6 +463,10 @@ class NeonEnvBuilder: self.control_plane_compute_hook_api: str | None = None self.storage_controller_config: dict[Any, Any] | None = None + # Flag to enable https listener in pageserver, generate local ssl certs, + # and force storage controller to use https for pageserver api. 
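The builder flag introduced just below is what an individual test flips to exercise TLS between the storage controller and the pageservers. A minimal, hypothetical usage sketch (the test name is illustrative, and init_start is assumed to be the usual entry point for building the environment):

from fixtures.neon_fixtures import NeonEnvBuilder

def test_pageserver_https_api(neon_env_builder: NeonEnvBuilder):
    # Generates local certs, adds an https listener to every pageserver, and makes
    # the storage controller call the pageserver API over https.
    neon_env_builder.use_https_pageserver_api = True
    env = neon_env_builder.init_start()
    assert env.ssl_ca_file is not None  # the generated root CA is exposed on the env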
+ self.use_https_pageserver_api: bool = False + self.pageserver_virtual_file_io_engine: str | None = pageserver_virtual_file_io_engine self.pageserver_get_vectored_concurrent_io: str | None = ( pageserver_get_vectored_concurrent_io @@ -1054,6 +1063,11 @@ class NeonEnv: self.initial_tenant = config.initial_tenant self.initial_timeline = config.initial_timeline + self.generate_local_ssl_certs = config.use_https_pageserver_api + self.ssl_ca_file = ( + self.repo_dir.joinpath("rootCA.crt") if self.generate_local_ssl_certs else None + ) + neon_local_env_vars = {} if self.rust_log_override is not None: neon_local_env_vars["RUST_LOG"] = self.rust_log_override @@ -1117,6 +1131,7 @@ class NeonEnv: }, "safekeepers": [], "pageservers": [], + "generate_local_ssl_certs": self.generate_local_ssl_certs, } if self.control_plane_api is not None: @@ -1125,8 +1140,14 @@ class NeonEnv: if self.control_plane_compute_hook_api is not None: cfg["control_plane_compute_hook_api"] = self.control_plane_compute_hook_api - if self.storage_controller_config is not None: - cfg["storage_controller"] = self.storage_controller_config + storage_controller_config = self.storage_controller_config + + if config.use_https_pageserver_api: + storage_controller_config = storage_controller_config or {} + storage_controller_config["use_https_pageserver_api"] = True + + if storage_controller_config is not None: + cfg["storage_controller"] = storage_controller_config # Create config for pageserver http_auth_type = "NeonJWT" if config.auth_enabled else "Trust" @@ -1137,6 +1158,7 @@ class NeonEnv: pageserver_port = PageserverPort( pg=self.port_distributor.get_port(), http=self.port_distributor.get_port(), + https=self.port_distributor.get_port() if config.use_https_pageserver_api else None, ) # Availabilty zones may also be configured manually with `NeonEnvBuilder.pageserver_config_override` @@ -1151,12 +1173,17 @@ class NeonEnv: "id": ps_id, "listen_pg_addr": f"localhost:{pageserver_port.pg}", "listen_http_addr": f"localhost:{pageserver_port.http}", + "listen_https_addr": f"localhost:{pageserver_port.https}" + if config.use_https_pageserver_api + else None, "pg_auth_type": pg_auth_type, "http_auth_type": http_auth_type, "availability_zone": availability_zone, # Disable pageserver disk syncs in tests: when running tests concurrently, this avoids # the pageserver taking a long time to start up due to syncfs flushing other tests' data "no_sync": True, + # Look for gaps in WAL received from safekeepeers + "validate_wal_contiguity": True, } # Batching (https://github.com/neondatabase/neon/issues/9377): @@ -1167,14 +1194,6 @@ class NeonEnv: "max_batch_size": 32, } - if config.test_may_use_compatibility_snapshot_binaries: - log.info( - "Skipping WAL contiguity validation to avoid forward-compatibility related test failures" - ) - else: - # Look for gaps in WAL received from safekeepeers - ps_cfg["validate_wal_contiguity"] = True - get_vectored_concurrent_io = self.pageserver_get_vectored_concurrent_io if get_vectored_concurrent_io is not None: ps_cfg["get_vectored_concurrent_io"] = { @@ -1189,6 +1208,9 @@ class NeonEnv: config.pageserver_default_tenant_config_compaction_algorithm ) + tenant_config = ps_cfg.setdefault("tenant_config", {}) + tenant_config["rel_size_v2_enabled"] = True # Enable relsize_v2 by default in tests + if self.pageserver_remote_storage is not None: ps_cfg["remote_storage"] = remote_storage_to_toml_dict( self.pageserver_remote_storage @@ -1713,8 +1735,12 @@ class StorageControllerLeadershipStatus(StrEnum): @dataclass class 
StorageControllerMigrationConfig: - secondary_warmup_timeout: str | None - secondary_download_request_timeout: str | None + # Unlike the API itself, tests default to prewarm=False because it's a simpler API and doesn't + # require the test to go poll for the migration actually completing. + prewarm: bool = False + override_scheduler: bool = False + secondary_warmup_timeout: str | None = None + secondary_download_request_timeout: str | None = None class NeonStorageController(MetricsGetter, LogUtils): @@ -2118,8 +2144,10 @@ class NeonStorageController(MetricsGetter, LogUtils): config: StorageControllerMigrationConfig | None = None, ): payload = {"tenant_shard_id": str(tenant_shard_id), "node_id": dest_ps_id} - if config is not None: - payload["migration_config"] = dataclasses.asdict(config) + if config is None: + config = StorageControllerMigrationConfig() + + payload["migration_config"] = dataclasses.asdict(config) self.request( "PUT", @@ -2127,8 +2155,13 @@ class NeonStorageController(MetricsGetter, LogUtils): json=payload, headers=self.headers(TokenScope.ADMIN), ) - log.info(f"Migrated tenant {tenant_shard_id} to pageserver {dest_ps_id}") - assert self.env.get_tenant_pageserver(tenant_shard_id).id == dest_ps_id + if config.prewarm: + log.info( + f"Started prewarm migration of tenant {tenant_shard_id} to pageserver {dest_ps_id}" + ) + else: + log.info(f"Migrated tenant {tenant_shard_id} to pageserver {dest_ps_id}") + assert self.env.get_tenant_pageserver(tenant_shard_id).id == dest_ps_id def tenant_policy_update(self, tenant_id: TenantId, body: dict[str, Any]): log.info(f"tenant_policy_update({tenant_id}, {body})") @@ -2469,12 +2502,21 @@ class NeonStorageController(MetricsGetter, LogUtils): response.raise_for_status() return [TenantShardId.parse(tid) for tid in response.json()["updated"]] - def download_heatmap_layers(self, tenant_shard_id: TenantShardId, timeline_id: TimelineId): + def download_heatmap_layers( + self, tenant_shard_id: TenantShardId, timeline_id: TimelineId, recurse: bool | None = None + ): + url = ( + f"{self.api}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/download_heatmap_layers" + ) + if recurse is not None: + url = url + f"?recurse={str(recurse).lower()}" + response = self.request( "POST", - f"{self.api}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/download_heatmap_layers", + url, headers=self.headers(TokenScope.ADMIN), ) + response.raise_for_status() def __enter__(self) -> Self: @@ -3592,6 +3634,7 @@ class NeonProxy(PgProtocol): "project_id": "test_project_id", "endpoint_id": "test_endpoint_id", "branch_id": "test_branch_id", + "compute_id": "test_compute_id", }, } }, @@ -3817,6 +3860,7 @@ def static_auth_broker( { "address": local_proxy_addr, "aux": { + "compute_id": "compute-foo-bar-1234-5678", "endpoint_id": "ep-foo-bar-1234", "branch_id": "br-foo-bar", "project_id": "foo-bar", @@ -3987,10 +4031,12 @@ class Endpoint(PgProtocol, LogUtils): self, remote_ext_config: str | None = None, pageserver_id: int | None = None, + safekeeper_generation: int | None = None, safekeepers: list[int] | None = None, allow_multiple: bool = False, create_test_user: bool = False, basebackup_request_tries: int | None = None, + timeout: str | None = None, env: dict[str, str] | None = None, ) -> Self: """ @@ -4000,19 +4046,21 @@ class Endpoint(PgProtocol, LogUtils): assert self.endpoint_id is not None - # If `safekeepers` is not None, they are remember them as active and use - # in the following commands. 
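Earlier in this hunk, StorageControllerMigrationConfig grows prewarm and override_scheduler fields with test-friendly defaults, and tenant_shard_migrate now always sends a migration_config; with prewarm=True the call only starts the migration, so a test has to poll for completion itself. A hedged sketch of such a polling helper (the helper is illustrative, not part of the fixtures):

import time

from fixtures.neon_fixtures import NeonEnv, StorageControllerMigrationConfig

def migrate_gracefully(env: NeonEnv, tenant_shard_id, dest_ps_id: int, timeout_s: int = 120) -> None:
    """Start a prewarm migration and poll until the shard is attached to dest_ps_id."""
    env.storage_controller.tenant_shard_migrate(
        tenant_shard_id,
        dest_ps_id,
        config=StorageControllerMigrationConfig(prewarm=True, override_scheduler=True),
    )
    deadline = time.monotonic() + timeout_s
    while True:
        ps = env.get_tenant_pageserver(tenant_shard_id)
        if ps is not None and ps.id == dest_ps_id:
            return
        if time.monotonic() > deadline:
            raise TimeoutError(f"shard {tenant_shard_id} did not reach pageserver {dest_ps_id}")
        time.sleep(1)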
+ # If `safekeepers` is not None, remember them as active and use in the + # following commands. if safekeepers is not None: self.active_safekeepers = safekeepers self.env.neon_cli.endpoint_start( self.endpoint_id, + safekeepers_generation=safekeeper_generation, safekeepers=self.active_safekeepers, remote_ext_config=remote_ext_config, pageserver_id=pageserver_id, allow_multiple=allow_multiple, create_test_user=create_test_user, basebackup_request_tries=basebackup_request_tries, + timeout=timeout, env=env, ) self._running.release(1) @@ -4523,33 +4571,6 @@ class Safekeeper(LogUtils): for na in not_allowed: assert not self.log_contains(na) - def append_logical_message( - self, tenant_id: TenantId, timeline_id: TimelineId, request: dict[str, Any] - ) -> dict[str, Any]: - """ - Send JSON_CTRL query to append LogicalMessage to WAL and modify - safekeeper state. It will construct LogicalMessage from provided - prefix and message, and then will write it to WAL. - """ - - # "replication=0" hacks psycopg not to send additional queries - # on startup, see https://github.com/psycopg/psycopg2/pull/482 - token = self.env.auth_keys.generate_tenant_token(tenant_id) - connstr = f"host=localhost port={self.port.pg} password={token} replication=0 options='-c timeline_id={timeline_id} tenant_id={tenant_id}'" - - with closing(psycopg2.connect(connstr)) as conn: - # server doesn't support transactions - conn.autocommit = True - with conn.cursor() as cur: - request_json = json.dumps(request) - log.info(f"JSON_CTRL request on port {self.port.pg}: {request_json}") - cur.execute("JSON_CTRL " + request_json) - all = cur.fetchall() - log.info(f"JSON_CTRL response: {all[0][0]}") - res = json.loads(all[0][0]) - assert isinstance(res, dict) - return res - def http_client( self, auth_token: str | None = None, gen_sk_wide_token: bool = True ) -> SafekeeperHttpClient: diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index 748ac0d569..abddfa2768 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -94,7 +94,9 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = ( ".*Flushed oversized open layer with size.*", # During teardown, we stop the storage controller before the pageservers, so pageservers # can experience connection errors doing background deletion queue work. - ".*WARN deletion backend: calling control plane generation validation API failed.*error sending request.*", + ".*WARN deletion backend:.* storage controller upcall failed, will retry.*error sending request.*", + # Can happen when the pageserver starts faster than the storage controller + ".*WARN init_tenant_mgr:.* storage controller upcall failed, will retry.*error sending request.*", # Can happen when the test shuts down the storage controller while it is calling the utilization API ".*WARN.*path=/v1/utilization .*request was dropped before completing", # Can happen during shutdown @@ -122,6 +124,8 @@ DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS = [ # controller's attempts to notify the endpoint). ".*reconciler.*neon_local notification hook failed.*", ".*reconciler.*neon_local error.*", + # Tenant rate limits may fire in tests that submit lots of API requests. 
+ ".*tenant \\S+ is rate limited.*", ] diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 364aff325d..0efe0b9575 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -375,6 +375,19 @@ class PageserverHttpClient(requests.Session, MetricsGetter): res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/reset", params=params) self.verbose_error(res) + def timeline_patch_index_part( + self, + tenant_id: TenantId | TenantShardId, + timeline_id: TimelineId, + data: dict[str, Any], + ): + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/patch_index_part", + json=data, + ) + self.verbose_error(res) + return res.json() + def tenant_location_conf( self, tenant_id: TenantId | TenantShardId, diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py index 4df2b2df2b..cac84c07e7 100644 --- a/test_runner/fixtures/remote_storage.py +++ b/test_runner/fixtures/remote_storage.py @@ -282,6 +282,17 @@ class S3Storage: def timeline_path(self, tenant_id: TenantShardId | TenantId, timeline_id: TimelineId) -> str: return f"{self.tenant_path(tenant_id)}/timelines/{timeline_id}" + def safekeeper_tenants_path(self) -> str: + return f"{self.prefix_in_bucket}" + + def safekeeper_tenant_path(self, tenant_id: TenantShardId | TenantId) -> str: + return f"{self.safekeeper_tenants_path()}/{tenant_id}" + + def safekeeper_timeline_path( + self, tenant_id: TenantShardId | TenantId, timeline_id: TimelineId + ) -> str: + return f"{self.safekeeper_tenant_path(tenant_id)}/{timeline_id}" + def get_latest_generation_key(self, prefix: str, suffix: str, keys: list[str]) -> str: """ Gets the latest generation key from a list of keys. diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py index 493ce7334e..e409151b76 100644 --- a/test_runner/fixtures/safekeeper/http.py +++ b/test_runner/fixtures/safekeeper/http.py @@ -229,13 +229,14 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter): # only_local doesn't remove segments in the remote storage. 
def timeline_delete( - self, tenant_id: TenantId, timeline_id: TimelineId, only_local: bool = False + self, tenant_id: TenantId, timeline_id: TimelineId, only_local: bool = False, **kwargs ) -> dict[Any, Any]: res = self.delete( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}", params={ "only_local": str(only_local).lower(), }, + **kwargs, ) res.raise_for_status() res_json = res.json() @@ -273,10 +274,22 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter): assert isinstance(res_json, dict) return res_json + def timeline_exclude( + self, tenant_id: TenantId, timeline_id: TimelineId, to: Configuration + ) -> dict[str, Any]: + res = self.put( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/exclude", + data=to.to_json(), + ) + res.raise_for_status() + res_json = res.json() + assert isinstance(res_json, dict) + return res_json + def membership_switch( self, tenant_id: TenantId, timeline_id: TimelineId, to: Configuration ) -> TimelineMembershipSwitchResponse: - res = self.post( + res = self.put( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/membership", data=to.to_json(), ) diff --git a/test_runner/fixtures/safekeeper_utils.py b/test_runner/fixtures/safekeeper_utils.py new file mode 100644 index 0000000000..158baf7bb6 --- /dev/null +++ b/test_runner/fixtures/safekeeper_utils.py @@ -0,0 +1,92 @@ +from fixtures.common_types import Lsn, TenantId, TimelineId +from fixtures.log_helper import log +from fixtures.neon_fixtures import Endpoint, NeonPageserver, Safekeeper +from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload +from fixtures.utils import get_dir_size + + +def is_segment_offloaded( + sk: Safekeeper, tenant_id: TenantId, timeline_id: TimelineId, seg_end: Lsn +): + http_cli = sk.http_client() + tli_status = http_cli.timeline_status(tenant_id, timeline_id) + log.info(f"sk status is {tli_status}") + return tli_status.backup_lsn >= seg_end + + +def is_flush_lsn_caught_up(sk: Safekeeper, tenant_id: TenantId, timeline_id: TimelineId, lsn: Lsn): + http_cli = sk.http_client() + tli_status = http_cli.timeline_status(tenant_id, timeline_id) + log.info(f"sk status is {tli_status}") + return tli_status.flush_lsn >= lsn + + +def is_wal_trimmed(sk: Safekeeper, tenant_id: TenantId, timeline_id: TimelineId, target_size_mb): + http_cli = sk.http_client() + tli_status = http_cli.timeline_status(tenant_id, timeline_id) + sk_wal_size = get_dir_size(sk.timeline_dir(tenant_id, timeline_id)) + sk_wal_size_mb = sk_wal_size / 1024 / 1024 + log.info(f"Safekeeper id={sk.id} wal_size={sk_wal_size_mb:.2f}MB status={tli_status}") + return sk_wal_size_mb <= target_size_mb + + +def wait_lsn_force_checkpoint( + tenant_id: TenantId, + timeline_id: TimelineId, + endpoint: Endpoint, + ps: NeonPageserver, + pageserver_conn_options=None, +): + pageserver_conn_options = pageserver_conn_options or {} + lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + log.info(f"pg_current_wal_flush_lsn is {lsn}, waiting for it on pageserver") + + wait_lsn_force_checkpoint_at(lsn, tenant_id, timeline_id, ps, pageserver_conn_options) + + +def wait_lsn_force_checkpoint_at_sk( + safekeeper: Safekeeper, + tenant_id: TenantId, + timeline_id: TimelineId, + ps: NeonPageserver, + pageserver_conn_options=None, +): + sk_flush_lsn = safekeeper.get_flush_lsn(tenant_id, timeline_id) + wait_lsn_force_checkpoint_at(sk_flush_lsn, tenant_id, timeline_id, ps, pageserver_conn_options) + + +def 
wait_lsn_force_checkpoint_at( + lsn: Lsn, + tenant_id: TenantId, + timeline_id: TimelineId, + ps: NeonPageserver, + pageserver_conn_options=None, +): + """ + Wait until pageserver receives given lsn, force checkpoint and wait for + upload, i.e. remote_consistent_lsn advancement. + """ + pageserver_conn_options = pageserver_conn_options or {} + + auth_token = None + if "password" in pageserver_conn_options: + auth_token = pageserver_conn_options["password"] + + # wait for the pageserver to catch up + wait_for_last_record_lsn( + ps.http_client(auth_token=auth_token), + tenant_id, + timeline_id, + lsn, + ) + + # force checkpoint to advance remote_consistent_lsn + ps.http_client(auth_token).timeline_checkpoint(tenant_id, timeline_id) + + # ensure that remote_consistent_lsn is advanced + wait_for_upload( + ps.http_client(auth_token=auth_token), + tenant_id, + timeline_id, + lsn, + ) diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 84d62fb877..d1b2a5a400 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -337,6 +337,8 @@ def allure_add_grafana_link(host: str, timeline_id: TimelineId, start_ms: int, e """ # We expect host to be in format like ep-holy-mouse-w2u462gi.us-east-2.aws.neon.build endpoint_id, region_id, _ = host.split(".", 2) + # Remove "-pooler" suffix if present + endpoint_id = endpoint_id.removesuffix("-pooler") params = { "orgId": 1, diff --git a/test_runner/performance/large_synthetic_oltp/insert_webhooks.sql b/test_runner/performance/large_synthetic_oltp/insert_webhooks.sql new file mode 100644 index 0000000000..69e6366a53 --- /dev/null +++ b/test_runner/performance/large_synthetic_oltp/insert_webhooks.sql @@ -0,0 +1,47 @@ +\set event_type random(1,10) +\set service_key random(1, 3) + +INSERT INTO webhook.incoming_webhooks ( + created_at, + delivery_id, + upstream_emitted_at, + service_key, + event_id, + source, + body, + json, + additional_data, + is_body_encrypted, + event_type +) VALUES ( + now(), + gen_random_uuid(), + now() - interval '10 minutes', + CASE :service_key::int + WHEN 1 THEN 'shopify' + WHEN 2 THEN 'stripe' + WHEN 3 THEN 'github' + END, + 'evt_' || gen_random_uuid(), -- Ensures uniqueness + CASE :service_key::int + WHEN 1 THEN 'Shopify' + WHEN 2 THEN 'Stripe' + WHEN 3 THEN 'GitHub' + END, + '{"order_id": 987654, "customer": {"name": "John Doe", "email": "john.doe@example.com"}, "items": [{"product_id": 12345, "quantity": 2}, {"product_id": 67890, "quantity": 1}], "total": 199.99}', + '{"order_id": 987654, "customer": {"name": "John Doe", "email": "john.doe@example.com"}, "items": [{"product_id": 12345, "quantity": 2}, {"product_id": 67890, "quantity": 1}], "total": 199.99}'::jsonb, + '{"metadata": {"user_agent": "Mozilla/5.0", "ip_address": "203.0.113.42"}}'::jsonb, + false, + CASE :event_type::int + WHEN 1 THEN 'ORDER_PLACED' + WHEN 2 THEN 'ORDER_CANCELLED' + WHEN 3 THEN 'PAYMENT_SUCCESSFUL' + WHEN 4 THEN 'PAYMENT_FAILED' + WHEN 5 THEN 'CUSTOMER_CREATED' + WHEN 6 THEN 'CUSTOMER_UPDATED' + WHEN 7 THEN 'PRODUCT_UPDATED' + WHEN 8 THEN 'INVENTORY_LOW' + WHEN 9 THEN 'SHIPPING_DISPATCHED' + WHEN 10 THEN 'REFUND_ISSUED' + END +); \ No newline at end of file diff --git a/test_runner/performance/large_synthetic_oltp/select_any_webhook_with_skew.sql b/test_runner/performance/large_synthetic_oltp/select_any_webhook_with_skew.sql new file mode 100644 index 0000000000..b2f173f011 --- /dev/null +++ b/test_runner/performance/large_synthetic_oltp/select_any_webhook_with_skew.sql @@ -0,0 +1,15 @@ +-- Zipfian 
distributions model real-world access patterns where: +-- A few values (popular IDs) are accessed frequently. +-- Many values are accessed rarely. +-- This is useful for simulating realistic workloads, like webhook processing where recent events are more frequently accessed. + +\set alpha 1.2 +\set min_id 1 +\set max_id 135000000 + +\set zipf_random_id random_zipfian(:min_id, :max_id, :alpha) + +SELECT * +FROM webhook.incoming_webhooks +WHERE id = (:zipf_random_id)::bigint +LIMIT 1; \ No newline at end of file diff --git a/test_runner/performance/large_synthetic_oltp/select_recent_webhook.sql b/test_runner/performance/large_synthetic_oltp/select_recent_webhook.sql new file mode 100644 index 0000000000..78a843bf0f --- /dev/null +++ b/test_runner/performance/large_synthetic_oltp/select_recent_webhook.sql @@ -0,0 +1,9 @@ +-- select one of the most recent webhook records (created in the branch timeline during the bench run) +SELECT * +FROM webhook.incoming_webhooks +WHERE id = ( + SELECT (floor(random() * ( + (SELECT last_value FROM webhook.incoming_webhooks_id_seq) - 1350000001 + 1 + ) + 1350000001))::bigint +) +LIMIT 1; \ No newline at end of file diff --git a/test_runner/performance/test_compute_ctl_api.py b/test_runner/performance/test_compute_ctl_api.py new file mode 100644 index 0000000000..87eb1f2c35 --- /dev/null +++ b/test_runner/performance/test_compute_ctl_api.py @@ -0,0 +1,64 @@ +from __future__ import annotations + +import datetime + +import pytest +from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker +from fixtures.neon_fixtures import NeonEnv + + +@pytest.mark.timeout(120) +def test_compute_ctl_api_latencies( + neon_simple_env: NeonEnv, + zenbenchmark: NeonBenchmarker, +): + """ + Test compute_ctl HTTP API performance. Do simple GET requests + to catch any pathological degradations in the HTTP server. 
+ """ + env = neon_simple_env + + endpoint = env.endpoints.create_start("main") + client = endpoint.http_client() + + NUM_REQUESTS = 10000 + + status_response_latency_us = [] + metrics_response_latency_us = [] + + for _i in range(NUM_REQUESTS): + start_time = datetime.datetime.now() + _ = client.status() + status_response_latency_us.append((datetime.datetime.now() - start_time).microseconds) + + start_time = datetime.datetime.now() + _ = client.metrics_json() + metrics_response_latency_us.append((datetime.datetime.now() - start_time).microseconds) + + status_response_latency_us = sorted(status_response_latency_us) + metrics_response_latency_us = sorted(metrics_response_latency_us) + + zenbenchmark.record( + "status_response_latency_p50_us", + status_response_latency_us[len(status_response_latency_us) // 2], + "microseconds", + MetricReport.LOWER_IS_BETTER, + ) + zenbenchmark.record( + "metrics_response_latency_p50_us", + metrics_response_latency_us[len(metrics_response_latency_us) // 2], + "microseconds", + MetricReport.LOWER_IS_BETTER, + ) + zenbenchmark.record( + "status_response_latency_p99_us", + status_response_latency_us[len(status_response_latency_us) * 99 // 100], + "microseconds", + MetricReport.LOWER_IS_BETTER, + ) + zenbenchmark.record( + "metrics_response_latency_p99_us", + metrics_response_latency_us[len(metrics_response_latency_us) * 99 // 100], + "microseconds", + MetricReport.LOWER_IS_BETTER, + ) diff --git a/test_runner/performance/test_cumulative_statistics_persistence.py b/test_runner/performance/test_cumulative_statistics_persistence.py new file mode 100644 index 0000000000..061467bbad --- /dev/null +++ b/test_runner/performance/test_cumulative_statistics_persistence.py @@ -0,0 +1,221 @@ +import math # Add this import +import time +import traceback +from pathlib import Path + +import psycopg2 +import psycopg2.extras +import pytest +from fixtures.benchmark_fixture import NeonBenchmarker +from fixtures.log_helper import log +from fixtures.neon_api import NeonAPI, connection_parameters_to_env +from fixtures.neon_fixtures import PgBin +from fixtures.pg_version import PgVersion + +vacuum_times_sql = """ +SELECT + relname AS table_name, + last_autovacuum, + last_autoanalyze +FROM + pg_stat_user_tables where relname = 'pgbench_accounts' +ORDER BY + last_autovacuum DESC, last_autoanalyze DESC +""" + + +def insert_first_chunk_and_verify_autovacuum_is_not_running( + cur, rows_to_insert, autovacuum_naptime +): + cur.execute(f""" + INSERT INTO pgbench_accounts (aid, bid, abalance, filler) + SELECT + aid, + (random() * 10)::int + 1 AS bid, + (random() * 10000)::int AS abalance, + 'filler text' AS filler + FROM generate_series(6800001, {6800001 + rows_to_insert - 1}) AS aid; + """) + assert cur.rowcount == rows_to_insert + for _ in range(5): + time.sleep(0.5 * autovacuum_naptime) + cur.execute(vacuum_times_sql) + row = cur.fetchall()[0] + log.info(f"last_autovacuum: {row[1]}, last_autoanalyze: {row[2]}") + assert row[1] is None + + +def insert_second_chunk_and_verify_autovacuum_is_now_running( + cur, rows_to_insert, autovacuum_naptime +): + cur.execute(f""" + INSERT INTO pgbench_accounts (aid, bid, abalance, filler) + SELECT + aid, + (random() * 10)::int + 1 AS bid, + (random() * 10000)::int AS abalance, + 'filler text' AS filler + FROM generate_series({6800001 + rows_to_insert}, {6800001 + rows_to_insert * 2 - 1}) AS aid; + """) + assert cur.rowcount == rows_to_insert + for _ in range(5): + time.sleep(0.5 * autovacuum_naptime) + cur.execute(vacuum_times_sql) + row = 
cur.fetchall()[0] + log.info(f"last_autovacuum: {row[1]}, last_autoanalyze: {row[2]}") + assert row[1] is not None + + +@pytest.mark.remote_cluster +@pytest.mark.timeout(60 * 60) +def test_cumulative_statistics_persistence( + pg_bin: PgBin, + test_output_dir: Path, + neon_api: NeonAPI, + pg_version: PgVersion, + zenbenchmark: NeonBenchmarker, +): + """ + Verifies that the cumulative statistics are correctly persisted across restarts. + Cumulative statistics are important to persist across restarts because they are used + when auto-vacuum an auto-analyze trigger conditions are met. + The test performs the following steps: + - Seed a new project using pgbench + - insert tuples that by itself are not enough to trigger auto-vacuum + - suspend the endpoint + - resume the endpoint + - insert additional tuples that by itself are not enough to trigger auto-vacuum but in combination with the previous tuples are + - verify that autovacuum is triggered by the combination of tuples inserted before and after endpoint suspension + """ + project = neon_api.create_project(pg_version) + project_id = project["project"]["id"] + neon_api.wait_for_operation_to_finish(project_id) + endpoint_id = project["endpoints"][0]["id"] + region_id = project["project"]["region_id"] + log.info(f"Created project {project_id} with endpoint {endpoint_id} in region {region_id}") + error_occurred = False + try: + connstr = project["connection_uris"][0]["connection_uri"] + env = connection_parameters_to_env(project["connection_uris"][0]["connection_parameters"]) + # seed about 1 GiB of data into pgbench_accounts + pg_bin.run_capture(["pgbench", "-i", "-s68"], env=env) + + # assert rows in pgbench_accounts is 6800000 rows + conn = psycopg2.connect(connstr) + conn.autocommit = True + with conn.cursor() as cur: + # assert rows in pgbench_accounts is 6800000 rows + cur.execute("select count(*) from pgbench_accounts") + row_count = cur.fetchall()[0][0] + assert row_count == 6800000 + + # verify n_tup_ins, n_live_tup, vacuum_count, analyze_count (manual vacuum and analyze) + cur.execute( + "select n_tup_ins, vacuum_count,analyze_count from pg_stat_user_tables where relname = 'pgbench_accounts'" + ) + row = cur.fetchall()[0] + assert row[0] == 6800000 # n_tup_ins + assert row[1] == 1 # vacuum_count + assert row[2] == 1 # analyze_count + + # retrieve some GUCs (postgres settings) relevant to autovacuum + cur.execute( + "SELECT setting::int AS autovacuum_naptime FROM pg_settings WHERE name = 'autovacuum_naptime'" + ) + autovacuum_naptime = cur.fetchall()[0][0] + assert autovacuum_naptime < 300 and autovacuum_naptime > 0 + cur.execute( + "SELECT setting::float AS autovacuum_vacuum_insert_scale_factor FROM pg_settings WHERE name = 'autovacuum_vacuum_insert_scale_factor'" + ) + autovacuum_vacuum_insert_scale_factor = cur.fetchall()[0][0] + assert ( + autovacuum_vacuum_insert_scale_factor > 0.05 + and autovacuum_vacuum_insert_scale_factor < 1.0 + ) + cur.execute( + "SELECT setting::int AS autovacuum_vacuum_insert_threshold FROM pg_settings WHERE name = 'autovacuum_vacuum_insert_threshold'" + ) + autovacuum_vacuum_insert_threshold = cur.fetchall()[0][0] + cur.execute( + "SELECT setting::int AS pgstat_file_size_limit FROM pg_settings WHERE name = 'neon.pgstat_file_size_limit'" + ) + pgstat_file_size_limit = cur.fetchall()[0][0] + assert pgstat_file_size_limit > 10 * 1024 # at least 10 MB + + # insert rows that by itself are not enough to trigger auto-vacuum + # vacuum insert threshold = vacuum base insert threshold + vacuum insert scale factor 
* number of tuples + # https://www.postgresql.org/docs/17/routine-vacuuming.html + rows_to_insert = int( + math.ceil( + autovacuum_vacuum_insert_threshold / 2 + + row_count * autovacuum_vacuum_insert_scale_factor * 0.6 + ) + ) + + log.info( + f"autovacuum_vacuum_insert_scale_factor: {autovacuum_vacuum_insert_scale_factor}, autovacuum_vacuum_insert_threshold: {autovacuum_vacuum_insert_threshold}, row_count: {row_count}" + ) + log.info( + f"Inserting {rows_to_insert} rows, which is below the 'vacuum insert threshold'" + ) + + insert_first_chunk_and_verify_autovacuum_is_not_running( + cur, rows_to_insert, autovacuum_naptime + ) + + conn.close() + + # suspend the endpoint + log.info(f"Suspending endpoint {endpoint_id}") + neon_api.suspend_endpoint(project_id, endpoint_id) + neon_api.wait_for_operation_to_finish(project_id) + time.sleep(60) # give some time in between suspend and resume + + # resume the endpoint + log.info(f"Starting endpoint {endpoint_id}") + neon_api.start_endpoint(project_id, endpoint_id) + neon_api.wait_for_operation_to_finish(project_id) + + conn = psycopg2.connect(connstr) + conn.autocommit = True + with conn.cursor() as cur: + # insert additional rows that by itself are not enough to trigger auto-vacuum, but in combination + # with the previous rows inserted before the suspension are + log.info( + f"Inserting another {rows_to_insert} rows, which is below the 'vacuum insert threshold'" + ) + insert_second_chunk_and_verify_autovacuum_is_now_running( + cur, rows_to_insert, autovacuum_naptime + ) + + # verify estimatednumber of tuples in pgbench_accounts is within 6800000 + inserted rows +- 2 % + cur.execute( + "select reltuples::bigint from pg_class where relkind = 'r' and relname = 'pgbench_accounts'" + ) + reltuples = cur.fetchall()[0][0] + assert reltuples > 6800000 + rows_to_insert * 2 * 0.98 + assert reltuples < 6800000 + rows_to_insert * 2 * 1.02 + + # verify exact number of pgbench_accounts rows (computed row_count) + cur.execute("select count(*) from pgbench_accounts") + row_count = cur.fetchall()[0][0] + assert row_count == 6800000 + rows_to_insert * 2 + + # verify n_tup_ins, n_live_tup, vacuum_count, analyze_count (manual vacuum and analyze) + cur.execute( + "select n_tup_ins, vacuum_count,analyze_count from pg_stat_user_tables where relname = 'pgbench_accounts'" + ) + row = cur.fetchall()[0] + assert row[0] == 6800000 + rows_to_insert * 2 + assert row[1] == 1 + assert row[2] == 1 + + conn.close() + + except Exception as e: + error_occurred = True + log.error(f"Caught exception: {e}") + log.error(traceback.format_exc()) + finally: + assert not error_occurred # Fail the test if an error occurred + neon_api.delete_project(project_id) diff --git a/test_runner/performance/test_gc_feedback.py b/test_runner/performance/test_gc_feedback.py index acb7b56fd0..7c9e9f47c8 100644 --- a/test_runner/performance/test_gc_feedback.py +++ b/test_runner/performance/test_gc_feedback.py @@ -69,6 +69,9 @@ def gc_feedback_impl(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchma env.create_branch("child") branch_created += 1 + # Ensure L0 layers are compacted so that gc-compaction doesn't get preempted. 
+ client.timeline_checkpoint(tenant_id, timeline_id, force_l0_compaction=True) + max_num_of_deltas_above_image = 0 max_total_num_of_deltas = 0 for key_range in client.perf_info(tenant_id, timeline_id): diff --git a/test_runner/performance/test_perf_many_relations.py b/test_runner/performance/test_perf_many_relations.py index 2570c55f6c..e2f0a79018 100644 --- a/test_runner/performance/test_perf_many_relations.py +++ b/test_runner/performance/test_perf_many_relations.py @@ -83,6 +83,13 @@ def test_perf_simple_many_relations_reldir_v2( ], ) + assert ( + env.pageserver.http_client().timeline_detail(env.initial_tenant, env.initial_timeline)[ + "rel_size_migration" + ] + != "legacy" + ) + n = 100000 step = 5000 # Create many relations diff --git a/test_runner/performance/test_perf_oltp_large_tenant.py b/test_runner/performance/test_perf_oltp_large_tenant.py new file mode 100644 index 0000000000..ae00dbb3b5 --- /dev/null +++ b/test_runner/performance/test_perf_oltp_large_tenant.py @@ -0,0 +1,90 @@ +from __future__ import annotations + +import os +import timeit +from pathlib import Path + +import pytest +from fixtures.benchmark_fixture import PgBenchRunResult +from fixtures.compare_fixtures import PgCompare + +from performance.test_perf_pgbench import get_durations_matrix, utc_now_timestamp + + +def get_custom_scripts( + default: str = "insert_webhooks.sql@2 select_any_webhook_with_skew.sql@4 select_recent_webhook.sql@4", +) -> list[str]: + # We parametrize each run with the custom scripts to run and their weights. + # The custom scripts and their weights are passed through TEST_PGBENCH_CUSTOM_SCRIPTS env variable. + # Delimit the custom scripts for one run by spaces and for different runs by commas, for example: + # "insert_webhooks.sql@2 select_any_webhook_with_skew.sql@4,insert_webhooks.sql@8 select_any_webhook_with_skew.sql@2" + # Databases/branches are pre-created and passed through BENCHMARK_CONNSTR env variable. 
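As an aside on the default script mix above (illustrative only, not part of the patch): pgbench picks one of the supplied -f scripts for each transaction with probability proportional to its @weight, so the shipped default works out to roughly the split sketched in this plain-Python arithmetic:

# Sketch of the @weight arithmetic for the default TEST_PGBENCH_CUSTOM_SCRIPTS value.
weights = {
    "insert_webhooks.sql": 2,
    "select_any_webhook_with_skew.sql": 4,  # zipfian-skewed point reads across the whole id range
    "select_recent_webhook.sql": 4,         # point reads against rows written during the bench run
}
total = sum(weights.values())
for script, weight in weights.items():
    print(f"{script}: ~{100 * weight / total:.0f}% of transactions")
# insert_webhooks.sql: ~20% of transactions
# select_any_webhook_with_skew.sql: ~40% of transactions
# select_recent_webhook.sql: ~40% of transactions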
+ scripts = os.getenv("TEST_PGBENCH_CUSTOM_SCRIPTS", default=str(default)) + rv = [] + for s in scripts.split(","): + rv.append(s) + return rv + + +def run_test_pgbench(env: PgCompare, custom_scripts: str, duration: int): + password = env.pg.default_options.get("password", None) + options = env.pg.default_options.get("options", "") + # drop password from the connection string by passing password=None and set password separately + connstr = env.pg.connstr(password=None, options=options) + # if connstr does not contain pooler we can set statement_timeout to 0 + if "pooler" not in connstr: + options = "-cstatement_timeout=0 " + env.pg.default_options.get("options", "") + connstr = env.pg.connstr(password=None, options=options) + + script_args = [ + "pgbench", + "-n", # no explicit vacuum before the test - we want to rely on auto-vacuum + "-M", + "prepared", + "--client=500", + "--jobs=100", + f"-T{duration}", + "-P60", # progress every minute + "--progress-timestamp", + ] + for script in custom_scripts.split(): + script_args.extend(["-f", f"test_runner/performance/large_synthetic_oltp/{script}"]) + script_args.append(connstr) + + run_pgbench( + env, + "custom-scripts", + script_args, + password=password, + ) + + +def run_pgbench(env: PgCompare, prefix: str, cmdline, password: None): + environ: dict[str, str] = {} + if password is not None: + environ["PGPASSWORD"] = password + + run_start_timestamp = utc_now_timestamp() + t0 = timeit.default_timer() + out = env.pg_bin.run_capture(cmdline, env=environ) + run_duration = timeit.default_timer() - t0 + run_end_timestamp = utc_now_timestamp() + env.flush() + + stdout = Path(f"{out}.stdout").read_text() + + res = PgBenchRunResult.parse_from_stdout( + stdout=stdout, + run_duration=run_duration, + run_start_timestamp=run_start_timestamp, + run_end_timestamp=run_end_timestamp, + ) + env.zenbenchmark.record_pg_bench_result(prefix, res) + + +@pytest.mark.parametrize("custom_scripts", get_custom_scripts()) +@pytest.mark.parametrize("duration", get_durations_matrix()) +@pytest.mark.remote_cluster +def test_perf_oltp_large_tenant(remote_compare: PgCompare, custom_scripts: str, duration: int): + run_test_pgbench(remote_compare, custom_scripts, duration) + # todo: run re-index, analyze, vacuum, etc. 
after the test and measure and report its duration diff --git a/test_runner/performance/test_storage_controller_scale.py b/test_runner/performance/test_storage_controller_scale.py index d45db28c78..777b9e2870 100644 --- a/test_runner/performance/test_storage_controller_scale.py +++ b/test_runner/performance/test_storage_controller_scale.py @@ -16,6 +16,7 @@ from fixtures.neon_fixtures import ( NeonPageserver, PageserverAvailability, PageserverSchedulingPolicy, + StorageControllerMigrationConfig, ) from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient from fixtures.pg_version import PgVersion @@ -362,7 +363,10 @@ def test_storage_controller_many_tenants( dest_ps_id = desc["shards"][shard_number]["node_secondary"][0] f = executor.submit( - env.storage_controller.tenant_shard_migrate, tenant_shard_id, dest_ps_id + env.storage_controller.tenant_shard_migrate, + tenant_shard_id, + dest_ps_id, + StorageControllerMigrationConfig(prewarm=False, override_scheduler=True), ) elif op == Operation.TENANT_PASSTHROUGH: # A passthrough read to shard zero diff --git a/test_runner/regress/data/extension_test/5670669815/v14/ext_index.json b/test_runner/regress/data/extension_test/5670669815/v14/ext_index.json deleted file mode 100644 index af49dfa0c0..0000000000 --- a/test_runner/regress/data/extension_test/5670669815/v14/ext_index.json +++ /dev/null @@ -1,24 +0,0 @@ -{ - "public_extensions": [ - "anon", - "pg_buffercache" - ], - "library_index": { - "anon": "anon", - "pg_buffercache": "pg_buffercache" - }, - "extension_data": { - "pg_buffercache": { - "control_data": { - "pg_buffercache.control": "# pg_buffercache extension \ncomment = 'examine the shared buffer cache' \ndefault_version = '1.3' \nmodule_pathname = '$libdir/pg_buffercache' \nrelocatable = true \ntrusted=true" - }, - "archive_path": "5670669815/v14/extensions/pg_buffercache.tar.zst" - }, - "anon": { - "control_data": { - "anon.control": "# PostgreSQL Anonymizer (anon) extension \ncomment = 'Data anonymization tools' \ndefault_version = '1.1.0' \ndirectory='extension/anon' \nrelocatable = false \nrequires = 'pgcrypto' \nsuperuser = false \nmodule_pathname = '$libdir/anon' \ntrusted = true \n" - }, - "archive_path": "5670669815/v14/extensions/anon.tar.zst" - } - } -} \ No newline at end of file diff --git a/test_runner/regress/data/extension_test/5670669815/v14/extensions/anon.tar.zst b/test_runner/regress/data/extension_test/5670669815/v14/extensions/anon.tar.zst deleted file mode 100644 index 5c17630109..0000000000 Binary files a/test_runner/regress/data/extension_test/5670669815/v14/extensions/anon.tar.zst and /dev/null differ diff --git a/test_runner/regress/data/extension_test/5670669815/v14/extensions/pg_buffercache.tar.zst b/test_runner/regress/data/extension_test/5670669815/v14/extensions/pg_buffercache.tar.zst deleted file mode 100644 index 69648a2f1a..0000000000 Binary files a/test_runner/regress/data/extension_test/5670669815/v14/extensions/pg_buffercache.tar.zst and /dev/null differ diff --git a/test_runner/regress/data/extension_test/5670669815/v15/ext_index.json b/test_runner/regress/data/extension_test/5670669815/v15/ext_index.json deleted file mode 100644 index fd0d1edc3c..0000000000 --- a/test_runner/regress/data/extension_test/5670669815/v15/ext_index.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "public_extensions": [ - "anon" - ], - "library_index": { - "anon": "anon" - }, - "extension_data": { - "anon": { - "control_data": { - "anon.control": "# PostgreSQL Anonymizer (anon) extension \ncomment = 'Data 
anonymization tools' \ndefault_version = '1.1.0' \ndirectory='extension/anon' \nrelocatable = false \nrequires = 'pgcrypto' \nsuperuser = false \nmodule_pathname = '$libdir/anon' \ntrusted = true \n" - }, - "archive_path": "5670669815/v15/extensions/anon.tar.zst" - } - } -} - diff --git a/test_runner/regress/data/extension_test/5670669815/v15/extensions/anon.tar.zst b/test_runner/regress/data/extension_test/5670669815/v15/extensions/anon.tar.zst deleted file mode 100644 index ea7034578f..0000000000 Binary files a/test_runner/regress/data/extension_test/5670669815/v15/extensions/anon.tar.zst and /dev/null differ diff --git a/test_runner/regress/data/extension_test/5670669815/v16/ext_index.json b/test_runner/regress/data/extension_test/5670669815/v16/ext_index.json deleted file mode 100644 index 1157e0d032..0000000000 --- a/test_runner/regress/data/extension_test/5670669815/v16/ext_index.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "public_extensions": [], - "library_index": { - "TODO": "We still need PG16 extensions" - }, - "extension_data": {} -} \ No newline at end of file diff --git a/test_runner/regress/data/extension_test/5670669815/v17/ext_index.json b/test_runner/regress/data/extension_test/5670669815/v17/ext_index.json deleted file mode 100644 index 7990b2c3a2..0000000000 --- a/test_runner/regress/data/extension_test/5670669815/v17/ext_index.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "public_extensions": [], - "library_index": { - "TODO": "We still need PG17 extensions" - }, - "extension_data": {} -} \ No newline at end of file diff --git a/test_runner/regress/data/test_remote_extensions/test_extension/sql/test_extension--1.0--1.1.sql b/test_runner/regress/data/test_remote_extensions/test_extension/sql/test_extension--1.0--1.1.sql new file mode 100644 index 0000000000..1fb183dcae --- /dev/null +++ b/test_runner/regress/data/test_remote_extensions/test_extension/sql/test_extension--1.0--1.1.sql @@ -0,0 +1,10 @@ +\echo Use "ALTER EXTENSION test_extension UPDATE TO '1.1'" to load this file. \quit + +CREATE FUNCTION test_extension.fun_fact() +RETURNS void +IMMUTABLE LEAKPROOF PARALLEL SAFE +AS $$ +BEGIN + RAISE NOTICE 'Neon has a melting point of -246.08 C'; +END; +$$ LANGUAGE 'plpgsql'; diff --git a/test_runner/regress/data/test_remote_extensions/test_extension/sql/test_extension--1.0.sql b/test_runner/regress/data/test_remote_extensions/test_extension/sql/test_extension--1.0.sql new file mode 100644 index 0000000000..b51e3ed19f --- /dev/null +++ b/test_runner/regress/data/test_remote_extensions/test_extension/sql/test_extension--1.0.sql @@ -0,0 +1,12 @@ +\echo Use "CREATE EXTENSION test_extension" to load this file. 
\quit + +CREATE SCHEMA test_extension; + +CREATE FUNCTION test_extension.motd() +RETURNS void +IMMUTABLE LEAKPROOF PARALLEL SAFE +AS $$ +BEGIN + RAISE NOTICE 'Have a great day'; +END; +$$ LANGUAGE 'plpgsql'; diff --git a/test_runner/regress/data/test_remote_extensions/test_extension/test_extension.control b/test_runner/regress/data/test_remote_extensions/test_extension/test_extension.control new file mode 100644 index 0000000000..826f643daf --- /dev/null +++ b/test_runner/regress/data/test_remote_extensions/test_extension/test_extension.control @@ -0,0 +1 @@ +comment = 'Test extension' diff --git a/test_runner/regress/data/test_signed_char.out b/test_runner/regress/data/test_signed_char.out new file mode 100644 index 0000000000..a68876e383 --- /dev/null +++ b/test_runner/regress/data/test_signed_char.out @@ -0,0 +1 @@ +0000000094010815f81f042000000000b89f8000909f5000689f5000489f4000309f3000189f3000009f3000e89e3000d09e3000b89e3000a09e3000889e3000709e3000309e8000189e3000009e3000e89d3000d09d3000b89d3000a09d3000889d3000709d3000589d3000409d3000289d3000109d3000f89c3000e09c3000c89c3000b09c3000989c3000809c3000689c3000509c3000389c3000209c3000089c3000f09b3000d89b3000c09b3000a89b3000909b3000789b3000609b3000489b3000309b3000189b3000009b3000e89a3000d09a3000b89a3000a09a3000889a3000489a8000309a3000189a3000009a3000e8993000d0993000b8993000a09930008899300070993000589930004099300000998000e8983000d0983000b8983000a0983000889830007098300058983000409830002898300010983000f8973000b8978000a09730008897300070973000589730004097300028973000e8968000a89680006896800028968000e8958000a89580009095300050958000389530002095300008953000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000801000010018004c198900000000000000000029000000008010000100180049787f000000000000000000290000000080100001001800727c7000000000000000000029000000008010002800400020766200000000000000000001002700010101010101010101010101010101010101010101010101010101010101010101010101010101000080100001001800207262000000000000000000290000000080100028004000766239000000000000000000010027000101010101010101010101010101010101010101010101010101010101010101010101010101010000801000280040006239380000000000000000000100270001010101010101010101010101010101010101010101010101010101010101010101010101010100008010002800400039383700000000000000000001002700010101010101010101010101010101010101010101010101010101010101010101010101010101000080100028004000383736000000000000000000010027000101010101010101010101010101010101010101010101010101010101010101010101010101010000801000280040003736350000000000000000000100270001010101010101010101010101010101010101010101010101010101010101010101010101010100008010002800400036353400000000000000000001002700010101010101010101010101010101010101010101010101010101010101010101010101010101000080100001001800203034000000000000000000280000000080100001001800203933000000000000000000270000000080100001001800203833000000000000000000260000000080100001001800203733000000000000000000250000000080100001001800203633000000000000000000240000000080100001001800203533000000000000000000230000000080100028004000353433000000000000000000010027000101010101010101010101010101010101010101010101010101010101010101010101010101010000801000010018002034330000000000000000002200000000801000010018002033330000000000000000002100000000801000010018002032330000000000000000002000000000801000010018002031330000000000000000001f00000000801000010018002030330000000000000000001e00000000801000010018002039320000000000000000001d00000000801000010018002038320000000000000000001c00000000801000010018002037320000000000000000001b00000000801000010018002036320000000000000000001a0000000080100001001800203532000000000000000000190000000080100001001800203432000000000000000000180000000080100028004000343332000000000000000000010027000101010101010101010101010101010101010101010101010101010101010101010101010101010000801000010018002033320000000000000000001700000000801000010018002032320000000000000000001600000000801000010018002031320000000000000000001500000000801000010018002030320000000000000000001400000000801000010018002039310000000000000000001300000000801000010018002038310000000000000000001200000000801000010018002037310000000000000000001100000000801000010018002036310000000000000000001000000000801000010018002035310000000000000000000f00000000801000010018002034310000000000000000000e000
00000801000010018002033310000000000000000000d0000000080100028004000333231000000000000000000010027000101010101010101010101010101010101010101010101010101010101010101010101010101010000801000010018002032310000000000000000000c00000000801000010018002031310000000000000000000b00000000801000010018002030310000000000000000000a00000000801000010018002039200000000000000000000900000000801000010018002038200000000000000000000800000000801000010018002037200000000000000000000700000000801000010018002036200000000000000000000600000000801000010018002035200000000000000000000500000000801000010018003034200000000000000000002800000000801000010018002034200000000000000000000400000000801000010018003933200000000000000000002700000000801000010018003833200000000000000000002600000000801000010018003733200000000000000000002500000000801000010018003633200000000000000000002400000000801000010018003533200000000000000000002300000000801000010018003433200000000000000000002200000000801000010018003333200000000000000000002100000000801000010018003233200000000000000000002000000000801000010018003133200000000000000000001f00000000801000010018003033200000000000000000001e00000000801000010018002033200000000000000000000300000000801000010018003932200000000000000000001d00000000801000010018003832200000000000000000001c00000000801000010018003732200000000000000000001b00000000801000010018003632200000000000000000001a00000000801000010018003532200000000000000000001900000000801000010018003432200000000000000000001800000000801000010018003332200000000000000000001700000000801000010018003232200000000000000000001600000000801000010018003132200000000000000000001500000000801000010018003032200000000000000000001400000000801000010018002032200000000000000000000200000000801000010018003931200000000000000000001300000000801000010018003831200000000000000000001200000000801000010018003731200000000000000000001100000000801000010018003631200000000000000000001000000000801000010018003531200000000000000000000f00000000801000010018003431200000000000000000000e00000000801000010018003331200000000000000000000d0000000080100028004000323120000000000000000000010027000101010101010101010101010101010101010101010101010101010101010101010101010101010000801000010018003131200000000000000000000b00000000801000010018003031200000000000000000000a0000000080100001001800203120000000000000000000010000000080100001001800622020000000000000000000290000000080100001001800392020000000000000000000090000000080100001001800382020000000000000000000080000000080100001001800372020000000000000000000070000000080100001001800362020000000000000000000060000000080100001001800352020000000000000000000050000000080100002002000342020000000000000000000040001002400000000000000008010000b00280033202000000000000000000003000a001b010101010101010101000000000000008010000b00280032202000000000000000000002000a001201010101010101010100000000000000801000280040003120200000000000000000000100270001010101010101010101010101010101010101010101010101010101010101010101010101010100ffffffff00000200 \ No newline at end of file diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index ce8ed3c7c5..0df88e14c2 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -319,8 +319,12 @@ def test_pageserver_gc_compaction_idempotent( }, ) wait_until(compaction_finished, timeout=60) + workload.validate(env.pageserver.id) + # Ensure all data are uploaded so that the duplicated layer gets into index_part.json + ps_http.timeline_checkpoint(tenant_id, timeline_id, wait_until_flushed=True) if 
compaction_mode == "after_restart": env.pageserver.restart(True) + workload.validate(env.pageserver.id) ps_http.timeline_gc( tenant_id, timeline_id, None ) # Force refresh gc info to have gc_cutoff generated @@ -335,6 +339,7 @@ def test_pageserver_gc_compaction_idempotent( "sub_compaction_max_job_size_mb": 16, }, ) + workload.validate(env.pageserver.id) wait_until(compaction_finished, timeout=60) # ensure gc_compaction is scheduled and it's actually running (instead of skipping due to no layers picked) diff --git a/test_runner/regress/test_compute_catalog.py b/test_runner/regress/test_compute_catalog.py index 3a08671bbf..2e7da86d9d 100644 --- a/test_runner/regress/test_compute_catalog.py +++ b/test_runner/regress/test_compute_catalog.py @@ -5,34 +5,59 @@ import logging import requests from fixtures.neon_fixtures import NeonEnv, logical_replication_sync +TEST_ROLE_NAMES = [ + {"name": "neondb_owner"}, + {"name": "role with spaces"}, + {"name": "role with%20spaces "}, + {"name": "role with whitespaces "}, + {"name": "injective role with spaces'; SELECT pg_sleep(1000);"}, + {"name": "role with #pound-sign and &ersands=true"}, + {"name": "role with emoji ๐ŸŒ"}, + {"name": "role \";with ';injections $$ $x$ $ %I !/\\&#@"}, + {"name": '"role in double quotes"'}, + {"name": "'role in single quotes'"}, +] + TEST_DB_NAMES = [ { "name": "neondb", - "owner": "cloud_admin", + "owner": "neondb_owner", }, { "name": "db with spaces", - "owner": "cloud_admin", + "owner": "role with spaces", }, { "name": "db with%20spaces ", - "owner": "cloud_admin", + "owner": "role with%20spaces ", }, { "name": "db with whitespaces ", - "owner": "cloud_admin", + "owner": "role with whitespaces ", }, { - "name": "injective db with spaces'; SELECT pg_sleep(10);", - "owner": "cloud_admin", + "name": "injective db with spaces'; SELECT pg_sleep(1000);", + "owner": "injective role with spaces'; SELECT pg_sleep(1000);", }, { "name": "db with #pound-sign and &ersands=true", - "owner": "cloud_admin", + "owner": "role with #pound-sign and &ersands=true", }, { "name": "db with emoji ๐ŸŒ", - "owner": "cloud_admin", + "owner": "role with emoji ๐ŸŒ", + }, + { + "name": "db \";with ';injections $$ $x$ $ %I !/\\&#@", + "owner": "role \";with ';injections $$ $x$ $ %I !/\\&#@", + }, + { + "name": '"db in double quotes"', + "owner": '"role in double quotes"', + }, + { + "name": "'db in single quotes'", + "owner": "'role in single quotes'", }, ] @@ -52,6 +77,7 @@ def test_compute_catalog(neon_simple_env: NeonEnv): **{ "skip_pg_catalog_updates": False, "cluster": { + "roles": TEST_ROLE_NAMES, "databases": TEST_DB_NAMES, }, } @@ -99,10 +125,10 @@ def test_compute_catalog(neon_simple_env: NeonEnv): ), f"Expected 404 status code, but got {e.response.status_code}" -def test_compute_create_databases(neon_simple_env: NeonEnv): +def test_compute_create_drop_dbs_and_roles(neon_simple_env: NeonEnv): """ - Test that compute_ctl can create and work with databases with special - characters (whitespaces, %, tabs, etc.) in the name. + Test that compute_ctl can create and work with databases and roles + with special characters (whitespaces, %, tabs, etc.) in the name. 
""" env = neon_simple_env @@ -116,6 +142,7 @@ def test_compute_create_databases(neon_simple_env: NeonEnv): **{ "skip_pg_catalog_updates": False, "cluster": { + "roles": TEST_ROLE_NAMES, "databases": TEST_DB_NAMES, }, } @@ -139,6 +166,43 @@ def test_compute_create_databases(neon_simple_env: NeonEnv): assert len(curr_db) == 1 assert curr_db[0] == db["name"] + for role in TEST_ROLE_NAMES: + with endpoint.cursor() as cursor: + cursor.execute("SELECT rolname FROM pg_roles WHERE rolname = %s", (role["name"],)) + catalog_role = cursor.fetchone() + assert catalog_role is not None + assert catalog_role[0] == role["name"] + + delta_operations = [] + for db in TEST_DB_NAMES: + delta_operations.append({"action": "delete_db", "name": db["name"]}) + for role in TEST_ROLE_NAMES: + delta_operations.append({"action": "delete_role", "name": role["name"]}) + + endpoint.respec_deep( + **{ + "skip_pg_catalog_updates": False, + "cluster": { + "roles": [], + "databases": [], + }, + "delta_operations": delta_operations, + } + ) + endpoint.reconfigure() + + for db in TEST_DB_NAMES: + with endpoint.cursor() as cursor: + cursor.execute("SELECT datname FROM pg_database WHERE datname = %s", (db["name"],)) + catalog_db = cursor.fetchone() + assert catalog_db is None + + for role in TEST_ROLE_NAMES: + with endpoint.cursor() as cursor: + cursor.execute("SELECT rolname FROM pg_roles WHERE rolname = %s", (role["name"],)) + catalog_role = cursor.fetchone() + assert catalog_role is None + def test_dropdb_with_subscription(neon_simple_env: NeonEnv): """ @@ -150,17 +214,19 @@ def test_dropdb_with_subscription(neon_simple_env: NeonEnv): # stuff into the spec.json file. endpoint = env.endpoints.create_start("main") + SUB_DB_NAME = "';subscriber_db $$ $x$ $;" + PUB_DB_NAME = "publisher_db" TEST_DB_NAMES = [ { "name": "neondb", "owner": "cloud_admin", }, { - "name": "subscriber_db", + "name": SUB_DB_NAME, "owner": "cloud_admin", }, { - "name": "publisher_db", + "name": PUB_DB_NAME, "owner": "cloud_admin", }, ] @@ -177,47 +243,47 @@ def test_dropdb_with_subscription(neon_simple_env: NeonEnv): ) endpoint.reconfigure() - # connect to the publisher_db and create a publication - with endpoint.cursor(dbname="publisher_db") as cursor: + # Connect to the PUB_DB_NAME and create a publication + with endpoint.cursor(dbname=PUB_DB_NAME) as cursor: cursor.execute("CREATE PUBLICATION mypub FOR ALL TABLES") cursor.execute("select pg_catalog.pg_create_logical_replication_slot('mysub', 'pgoutput');") cursor.execute("CREATE TABLE t(a int)") cursor.execute("INSERT INTO t VALUES (1)") cursor.execute("CHECKPOINT") - # connect to the subscriber_db and create a subscription - # Note that we need to create subscription with - connstr = endpoint.connstr(dbname="publisher_db").replace("'", "''") - with endpoint.cursor(dbname="subscriber_db") as cursor: + # Connect to the SUB_DB_NAME and create a subscription + # Note that we need to create subscription with the following connstr: + connstr = endpoint.connstr(dbname=PUB_DB_NAME).replace("'", "''") + with endpoint.cursor(dbname=SUB_DB_NAME) as cursor: cursor.execute("CREATE TABLE t(a int)") cursor.execute( - f"CREATE SUBSCRIPTION mysub CONNECTION '{connstr}' PUBLICATION mypub WITH (create_slot = false) " + f"CREATE SUBSCRIPTION mysub CONNECTION '{connstr}' PUBLICATION mypub WITH (create_slot = false) " ) - # wait for the subscription to be active + # Wait for the subscription to be active logical_replication_sync( endpoint, endpoint, "mysub", - sub_dbname="subscriber_db", - pub_dbname="publisher_db", + 
sub_dbname=SUB_DB_NAME, + pub_dbname=PUB_DB_NAME, ) # Check that replication is working - with endpoint.cursor(dbname="subscriber_db") as cursor: + with endpoint.cursor(dbname=SUB_DB_NAME) as cursor: cursor.execute("SELECT * FROM t") rows = cursor.fetchall() assert len(rows) == 1 assert rows[0][0] == 1 - # drop the subscriber_db from the list + # Drop the SUB_DB_NAME from the list TEST_DB_NAMES_NEW = [ { "name": "neondb", "owner": "cloud_admin", }, { - "name": "publisher_db", + "name": PUB_DB_NAME, "owner": "cloud_admin", }, ] @@ -230,7 +296,7 @@ def test_dropdb_with_subscription(neon_simple_env: NeonEnv): "databases": TEST_DB_NAMES_NEW, }, "delta_operations": [ - {"action": "delete_db", "name": "subscriber_db"}, + {"action": "delete_db", "name": SUB_DB_NAME}, # also test the case when we try to delete a non-existent database # shouldn't happen in normal operation, # but can occur when failed operations are retried @@ -239,32 +305,35 @@ def test_dropdb_with_subscription(neon_simple_env: NeonEnv): } ) - logging.info("Reconfiguring the endpoint to drop the subscriber_db") + logging.info(f"Reconfiguring the endpoint to drop the {SUB_DB_NAME} database") endpoint.reconfigure() - # Check that the subscriber_db is dropped + # Check that the SUB_DB_NAME is dropped with endpoint.cursor() as cursor: - cursor.execute("SELECT datname FROM pg_database WHERE datname = %s", ("subscriber_db",)) + cursor.execute("SELECT datname FROM pg_database WHERE datname = %s", (SUB_DB_NAME,)) catalog_db = cursor.fetchone() assert catalog_db is None - # Check that we can still connect to the publisher_db - with endpoint.cursor(dbname="publisher_db") as cursor: + # Check that we can still connect to the PUB_DB_NAME + with endpoint.cursor(dbname=PUB_DB_NAME) as cursor: cursor.execute("SELECT * FROM current_database()") curr_db = cursor.fetchone() assert curr_db is not None assert len(curr_db) == 1 - assert curr_db[0] == "publisher_db" + assert curr_db[0] == PUB_DB_NAME -def test_compute_drop_role(neon_simple_env: NeonEnv): +def test_drop_role_with_table_privileges_from_neon_superuser(neon_simple_env: NeonEnv): """ Test that compute_ctl can drop a role even if it has some depending objects - like permissions in one of the databases. + like permissions in one of the databases that were granted by + neon_superuser. + Reproduction test for https://github.com/neondatabase/cloud/issues/13582 """ env = neon_simple_env TEST_DB_NAME = "db_with_permissions" + TEST_GRANTEE = "'); MALFORMED SQL $$ $x$ $/;5%$ %I" endpoint = env.endpoints.create_start("main") @@ -301,16 +370,18 @@ def test_compute_drop_role(neon_simple_env: NeonEnv): cursor.execute("create view test_view as select * from test_table") with endpoint.cursor(dbname=TEST_DB_NAME, user="neon") as cursor: - cursor.execute("create role readonly") + cursor.execute(f'create role "{TEST_GRANTEE}"') # We (`compute_ctl`) make 'neon' the owner of schema 'public' in the owned database. 
# Postgres has all sorts of permissions and grants that we may not handle well, # but this is the shortest repro grant for the issue # https://github.com/neondatabase/cloud/issues/13582 - cursor.execute("grant select on all tables in schema public to readonly") + cursor.execute(f'grant select on all tables in schema public to "{TEST_GRANTEE}"') # Check that role was created with endpoint.cursor() as cursor: - cursor.execute("SELECT rolname FROM pg_roles WHERE rolname = 'readonly'") + cursor.execute( + "SELECT rolname FROM pg_roles WHERE rolname = %(role)s", {"role": TEST_GRANTEE} + ) role = cursor.fetchone() assert role is not None @@ -318,7 +389,8 @@ def test_compute_drop_role(neon_simple_env: NeonEnv): # that may block our ability to drop the role. with endpoint.cursor(dbname=TEST_DB_NAME) as cursor: cursor.execute( - "select grantor from information_schema.role_table_grants where grantee = 'readonly'" + "select grantor from information_schema.role_table_grants where grantee = %(grantee)s", + {"grantee": TEST_GRANTEE}, ) res = cursor.fetchall() assert len(res) == 2, f"Expected 2 table grants, got {len(res)}" @@ -332,7 +404,7 @@ def test_compute_drop_role(neon_simple_env: NeonEnv): "delta_operations": [ { "action": "delete_role", - "name": "readonly", + "name": TEST_GRANTEE, }, ], } @@ -341,7 +413,9 @@ def test_compute_drop_role(neon_simple_env: NeonEnv): # Check that role is dropped with endpoint.cursor() as cursor: - cursor.execute("SELECT rolname FROM pg_roles WHERE rolname = 'readonly'") + cursor.execute( + "SELECT rolname FROM pg_roles WHERE rolname = %(role)s", {"role": TEST_GRANTEE} + ) role = cursor.fetchone() assert role is None @@ -370,3 +444,68 @@ def test_compute_drop_role(neon_simple_env: NeonEnv): cursor.execute("SELECT rolname FROM pg_roles WHERE rolname = 'readonly2'") role = cursor.fetchone() assert role is None + + +def test_drop_role_with_table_privileges_from_non_neon_superuser(neon_simple_env: NeonEnv): + """ + Test that compute_ctl can drop a role if the role has previously been + granted table privileges by a role other than neon_superuser. + """ + TEST_DB_NAME = "neondb" + TEST_GRANTOR = "; RAISE EXCEPTION 'SQL injection detected;" + TEST_GRANTEE = "'$$; RAISE EXCEPTION 'SQL injection detected;'" + + env = neon_simple_env + + endpoint = env.endpoints.create_start("main") + endpoint.respec_deep( + **{ + "skip_pg_catalog_updates": False, + "cluster": { + "roles": [ + { + # We need to create role via compute_ctl, because in this case it will receive + # additional grants equivalent to our real environment, so we can repro some + # issues. + "name": TEST_GRANTOR, + # Some autocomplete-suggested hash, no specific meaning. 
+ "encrypted_password": "SCRAM-SHA-256$4096:hBT22QjqpydQWqEulorfXA==$miBogcoj68JWYdsNB5PW1X6PjSLBEcNuctuhtGkb4PY=:hxk2gxkwxGo6P7GCtfpMlhA9zwHvPMsCz+NQf2HfvWk=", + "options": [], + }, + ], + "databases": [ + { + "name": TEST_DB_NAME, + "owner": TEST_GRANTOR, + }, + ], + }, + } + ) + + endpoint.reconfigure() + + with endpoint.cursor(dbname=TEST_DB_NAME, user=TEST_GRANTOR) as cursor: + cursor.execute(f'CREATE USER "{TEST_GRANTEE}"') + cursor.execute("CREATE TABLE test_table(id bigint)") + cursor.execute(f'GRANT ALL ON TABLE test_table TO "{TEST_GRANTEE}"') + + endpoint.respec_deep( + **{ + "skip_pg_catalog_updates": False, + "delta_operations": [ + { + "action": "delete_role", + "name": TEST_GRANTEE, + }, + ], + } + ) + endpoint.reconfigure() + + with endpoint.cursor() as cursor: + cursor.execute( + "SELECT rolname FROM pg_roles WHERE rolname = %(role)s", {"role": TEST_GRANTEE} + ) + role = cursor.fetchone() + assert role is None diff --git a/test_runner/regress/test_download_extensions.py b/test_runner/regress/test_download_extensions.py index 7f12c14073..30f8c65cbd 100644 --- a/test_runner/regress/test_download_extensions.py +++ b/test_runner/regress/test_download_extensions.py @@ -2,26 +2,26 @@ from __future__ import annotations import os import shutil -from contextlib import closing +import tarfile from pathlib import Path from typing import TYPE_CHECKING import pytest +import zstandard from fixtures.log_helper import log from fixtures.metrics import parse_metrics -from fixtures.neon_fixtures import ( - NeonEnvBuilder, -) -from fixtures.pg_version import PgVersion -from fixtures.utils import skip_on_postgres from pytest_httpserver import HTTPServer -from werkzeug.wrappers.request import Request from werkzeug.wrappers.response import Response if TYPE_CHECKING: from typing import Any from fixtures.httpserver import ListenAddress + from fixtures.neon_fixtures import ( + NeonEnvBuilder, + ) + from fixtures.pg_version import PgVersion + from werkzeug.wrappers.request import Request # use neon_env_builder_local fixture to override the default neon_env_builder fixture @@ -31,13 +31,13 @@ def neon_env_builder_local( neon_env_builder: NeonEnvBuilder, test_output_dir: Path, pg_distrib_dir: Path, - pg_version: PgVersion, ) -> NeonEnvBuilder: test_local_pginstall = test_output_dir / "pg_install" log.info(f"copy {pg_distrib_dir} to {test_local_pginstall}") - shutil.copytree( - pg_distrib_dir / pg_version.v_prefixed, test_local_pginstall / pg_version.v_prefixed - ) + + # We can't copy only the version that we are currently testing because other + # binaries like the storage controller need specific Postgres versions. + shutil.copytree(pg_distrib_dir, test_local_pginstall) neon_env_builder.pg_distrib_dir = test_local_pginstall log.info(f"local neon_env_builder.pg_distrib_dir: {neon_env_builder.pg_distrib_dir}") @@ -45,89 +45,92 @@ def neon_env_builder_local( return neon_env_builder -@skip_on_postgres(PgVersion.V16, reason="TODO: PG16 extension building") -@skip_on_postgres(PgVersion.V17, reason="TODO: PG17 extension building") def test_remote_extensions( httpserver: HTTPServer, neon_env_builder_local: NeonEnvBuilder, httpserver_listen_address: ListenAddress, + test_output_dir: Path, + base_dir: Path, pg_version: PgVersion, ): - # setup mock http server - # that expects request for anon.tar.zst - # and returns the requested file + # Setup a mock nginx S3 gateway which will return our test extension. 
(host, port) = httpserver_listen_address extensions_endpoint = f"http://{host}:{port}/pg-ext-s3-gateway" build_tag = os.environ.get("BUILD_TAG", "latest") - archive_path = f"{build_tag}/v{pg_version}/extensions/anon.tar.zst" + archive_route = f"{build_tag}/v{pg_version}/extensions/test_extension.tar.zst" + tarball = test_output_dir / "test_extension.tar" + extension_dir = ( + base_dir / "test_runner" / "regress" / "data" / "test_remote_extensions" / "test_extension" + ) - def endpoint_handler_build_tag(request: Request) -> Response: + # Create tarball + with tarfile.open(tarball, "x") as tarf: + tarf.add( + extension_dir / "sql" / "test_extension--1.0.sql", + arcname="share/extension/test_extension--1.0.sql", + ) + tarf.add( + extension_dir / "sql" / "test_extension--1.0--1.1.sql", + arcname="share/extension/test_extension--1.0--1.1.sql", + ) + + def handler(request: Request) -> Response: log.info(f"request: {request}") - file_name = "anon.tar.zst" - file_path = f"test_runner/regress/data/extension_test/5670669815/v{pg_version}/extensions/anon.tar.zst" - file_size = os.path.getsize(file_path) - fh = open(file_path, "rb") + # Compress tarball + compressor = zstandard.ZstdCompressor() + with open(tarball, "rb") as f: + compressed_data = compressor.compress(f.read()) return Response( - fh, + compressed_data, mimetype="application/octet-stream", headers=[ - ("Content-Length", str(file_size)), - ("Content-Disposition", f'attachment; filename="{file_name}"'), + ("Content-Length", str(len(compressed_data))), ], direct_passthrough=True, ) httpserver.expect_request( - f"/pg-ext-s3-gateway/{archive_path}", method="GET" - ).respond_with_handler(endpoint_handler_build_tag) + f"/pg-ext-s3-gateway/{archive_route}", method="GET" + ).respond_with_handler(handler) # Start a compute node with remote_extension spec # and check that it can download the extensions and use them to CREATE EXTENSION. 
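To make the download path concrete, here is a rough sketch of the consumer side of the archive served by the handler above: fetch, zstd-decompress, and read the tar members. This is illustrative only and not part of the patch; the real download is performed by compute_ctl, and the endpoint/route values below are placeholders mirroring the variables defined earlier in the test.

import io
import tarfile

import requests
import zstandard

# Placeholders; in the test these come from httpserver_listen_address, BUILD_TAG and pg_version.
extensions_endpoint = "http://127.0.0.1:8080/pg-ext-s3-gateway"
archive_route = "latest/v17/extensions/test_extension.tar.zst"

resp = requests.get(f"{extensions_endpoint}/{archive_route}", timeout=10)
resp.raise_for_status()

# The handler serves a single zstd frame; decompress it as a stream and walk the tar.
reader = zstandard.ZstdDecompressor().stream_reader(io.BytesIO(resp.content))
with tarfile.open(fileobj=reader, mode="r|") as tar:
    # Expect paths like share/extension/test_extension--1.0.sql, matching the arcnames used above.
    print(tar.getnames())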
env = neon_env_builder_local.init_start() env.create_branch("test_remote_extensions") - endpoint = env.endpoints.create( - "test_remote_extensions", - config_lines=["log_min_messages=debug3"], - ) + endpoint = env.endpoints.create("test_remote_extensions") + + with open(extension_dir / "test_extension.control", encoding="utf-8") as f: + control_data = f.read() # mock remote_extensions spec spec: dict[str, Any] = { - "public_extensions": ["anon"], + "public_extensions": ["test_extension"], "custom_extensions": None, "library_index": { - "anon": "anon", + "test_extension": "test_extension", }, "extension_data": { - "anon": { + "test_extension": { "archive_path": "", "control_data": { - "anon.control": "# PostgreSQL Anonymizer (anon) extension\ncomment = 'Data anonymization tools'\ndefault_version = '1.1.0'\ndirectory='extension/anon'\nrelocatable = false\nrequires = 'pgcrypto'\nsuperuser = false\nmodule_pathname = '$libdir/anon'\ntrusted = true\n" + "test_extension.control": control_data, }, }, }, } - spec["extension_data"]["anon"]["archive_path"] = archive_path endpoint.create_remote_extension_spec(spec) - endpoint.start( - remote_ext_config=extensions_endpoint, - ) + endpoint.start(remote_ext_config=extensions_endpoint) - # this is expected to fail if there's no pgcrypto extension, that's ok - # we just want to check that the extension was downloaded - try: - with closing(endpoint.connect()) as conn: - with conn.cursor() as cur: - # Check that appropriate files were downloaded - cur.execute("CREATE EXTENSION anon") - res = [x[0] for x in cur.fetchall()] - log.info(res) - except Exception as err: - assert "pgcrypto" in str(err), f"unexpected error creating anon extension {err}" + with endpoint.connect() as conn: + with conn.cursor() as cur: + # Check that appropriate files were downloaded + cur.execute("CREATE EXTENSION test_extension VERSION '1.0'") + cur.execute("SELECT test_extension.motd()") httpserver.check() @@ -137,6 +140,48 @@ def test_remote_extensions( metrics = parse_metrics(raw_metrics) remote_ext_requests = metrics.query_all( "compute_ctl_remote_ext_requests_total", + # Check that we properly report the filename in the metrics + {"filename": "test_extension.tar.zst"}, + ) + assert len(remote_ext_requests) == 1 + for sample in remote_ext_requests: + assert sample.value == 1 + + endpoint.stop() + + # Remove the extension files to force a redownload of the extension. + for file in ( + "test_extension.control", + "test_extension--1.0.sql", + "test_extension--1.0--1.1.sql", + ): + ( + test_output_dir + / "pg_install" + / f"v{pg_version}" + / "share" + / "postgresql" + / "extension" + / file + ).unlink() + + endpoint.start(remote_ext_config=extensions_endpoint) + + # Test that ALTER EXTENSION UPDATE statements also fetch remote extensions. 
+    with endpoint.connect() as conn:
+        with conn.cursor() as cur:
+            # Check that appropriate files were downloaded
+            cur.execute("ALTER EXTENSION test_extension UPDATE TO '1.1'")
+            cur.execute("SELECT test_extension.fun_fact()")
+
+    # Check that we properly recorded downloads in the metrics
+    client = endpoint.http_client()
+    raw_metrics = client.metrics()
+    metrics = parse_metrics(raw_metrics)
+    remote_ext_requests = metrics.query_all(
+        "compute_ctl_remote_ext_requests_total",
+        # Check that we properly report the filename in the metrics
+        {"filename": "test_extension.tar.zst"},
     )
     assert len(remote_ext_requests) == 1
     for sample in remote_ext_requests:
diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py
index 55fd7a8608..17ffeca23b 100644
--- a/test_runner/regress/test_pageserver_api.py
+++ b/test_runner/regress/test_pageserver_api.py
@@ -7,7 +7,7 @@ from fixtures.neon_fixtures import (
     NeonEnvBuilder,
 )
 from fixtures.pageserver.http import PageserverHttpClient
-from fixtures.utils import wait_until
+from fixtures.utils import run_only_on_default_postgres, wait_until


 def check_client(env: NeonEnv, client: PageserverHttpClient):
@@ -138,3 +138,25 @@ def test_pageserver_http_api_client_auth_enabled(neon_env_builder: NeonEnvBuilde
     with env.pageserver.http_client(auth_token=pageserver_token) as client:
         check_client(env, client)
+
+
+@run_only_on_default_postgres("it does not use any postgres functionality")
+def test_pageserver_http_index_part_force_patch(neon_env_builder: NeonEnvBuilder):
+    env = neon_env_builder.init_start()
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+    with env.pageserver.http_client() as client:
+        client.timeline_patch_index_part(
+            tenant_id,
+            timeline_id,
+            {"rel_size_migration": "migrating"},
+        )
+        assert client.timeline_detail(tenant_id, timeline_id)["rel_size_migration"] == "migrating"
+        # This is invalid in practice: we should never roll back the migrating state to legacy.
+        # But we do it here to test the API.
+        client.timeline_patch_index_part(
+            tenant_id,
+            timeline_id,
+            {"rel_size_migration": "legacy"},
+        )
+        assert client.timeline_detail(tenant_id, timeline_id)["rel_size_migration"] == "legacy"
diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py
index a9b897b741..130db009c9 100644
--- a/test_runner/regress/test_pageserver_secondary.py
+++ b/test_runner/regress/test_pageserver_secondary.py
@@ -938,9 +938,12 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder):
     # Expect lots of layers
     assert len(ps_attached.list_layers(tenant_id, timeline_id)) > 10

-    # Simulate large data by making layer downloads artifically slow
     for ps in env.pageservers:
+        # Simulate large data by making layer downloads artificially slow
         ps.http_client().configure_failpoints([("secondary-layer-download-sleep", "return(1000)")])
+        # Make the initial logical size calculation lie. Otherwise it downloads layers
+        # on demand and makes accounting difficult.
+ ps.http_client().configure_failpoints(("skip-logical-size-calculation", "return")) def timeline_heatmap(tlid): assert env.pageserver_remote_storage is not None @@ -952,20 +955,16 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder): raise RuntimeError(f"No heatmap for timeline: {tlid}") - # Upload a heatmap, so that secondaries have something to download - ps_attached.http_client().tenant_heatmap_upload(tenant_id) - heatmap_before_migration = timeline_heatmap(timeline_id) + def count_timeline_heatmap_layers(tlid) -> tuple[int, int]: + cold, hot = 0, 0 + layers = timeline_heatmap(tlid)["layers"] + for layer in layers: + if layer["cold"]: + cold += 1 + else: + hot += 1 - # This has no chance to succeed: we have lots of layers and each one takes at least 1000ms. - # However, it pulls the heatmap, which will be important later. - http_client = env.storage_controller.pageserver_api() - (status, progress) = http_client.tenant_secondary_download(tenant_id, wait_ms=4000) - assert status == 202 - assert progress["heatmap_mtime"] is not None - assert progress["layers_downloaded"] > 0 - assert progress["bytes_downloaded"] > 0 - assert progress["layers_total"] > progress["layers_downloaded"] - assert progress["bytes_total"] > progress["bytes_downloaded"] + return cold, hot env.storage_controller.allowed_errors.extend( [ @@ -975,8 +974,9 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder): # Use a custom configuration that gives up earlier than usual. # We can't hydrate everything anyway because of the failpoints. + # Implicitly, this also uploads a heatmap from the current attached location. config = StorageControllerMigrationConfig( - secondary_warmup_timeout="5s", secondary_download_request_timeout="2s" + secondary_warmup_timeout="5s", secondary_download_request_timeout="2s", prewarm=False ) env.storage_controller.tenant_shard_migrate( TenantShardId(tenant_id, shard_number=0, shard_count=0), ps_secondary.id, config @@ -988,31 +988,33 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder): ps_secondary.http_client().tenant_heatmap_upload(tenant_id) heatmap_after_migration = timeline_heatmap(timeline_id) - assert len(heatmap_before_migration["layers"]) > 0 + local_layers = ps_secondary.list_layers(tenant_id, timeline_id) + # We download 1 layer per second and give up within 5 seconds. + assert len(local_layers) < 10 after_migration_heatmap_layers_count = len(heatmap_after_migration["layers"]) - assert len(heatmap_before_migration["layers"]) <= after_migration_heatmap_layers_count - log.info(f"Heatmap size after cold migration is {after_migration_heatmap_layers_count}") env.storage_controller.download_heatmap_layers( TenantShardId(tenant_id, shard_number=0, shard_count=0), timeline_id ) - # Now simulate the case where a child timeline is archived, parent layers - # are evicted and the child is unarchived. When the child is unarchived, - # itself and the parent update their heatmaps to contain layers needed by the - # child. One can warm up the timeline hierarchy since the heatmaps are ready. 
- - def all_layers_downloaded(expected_layer_count: int): - local_layers_count = len(ps_secondary.list_layers(tenant_id, timeline_id)) + def all_layers_downloaded(node, expected_layer_count: int): + local_layers_count = len(node.list_layers(tenant_id, timeline_id)) log.info(f"{local_layers_count=} {after_migration_heatmap_layers_count=}") assert local_layers_count >= expected_layer_count - wait_until(lambda: all_layers_downloaded(after_migration_heatmap_layers_count)) - ps_secondary.http_client().tenant_heatmap_upload(tenant_id) + def no_layers_downloaded(node): + local_layers_count = len(node.list_layers(tenant_id, timeline_id)) + log.info(f"{local_layers_count=} {after_migration_heatmap_layers_count=}") + assert local_layers_count == 0 + + wait_until(lambda: all_layers_downloaded(ps_secondary, after_migration_heatmap_layers_count)) + + # Read everything and make sure that we're not downloading anything extra. + # All hot layers should be available locally now. before = ( ps_secondary.http_client() .get_metrics() @@ -1030,6 +1032,11 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder): workload.stop() assert before == after + # Now simulate the case where a child timeline is archived, parent layers + # are evicted and the child is unarchived. When the child is unarchived, + # itself and the parent update their heatmaps to contain layers needed by the + # child. One can warm up the timeline hierarchy since the heatmaps are ready. + def check_archival_state(state: TimelineArchivalState, tline): timelines = ( timeline["timeline_id"] @@ -1057,13 +1064,35 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder): wait_until(lambda: check_archival_state(TimelineArchivalState.UNARCHIVED, child_timeline_id)) ps_secondary.http_client().tenant_heatmap_upload(tenant_id) - log.info(f"Parent timeline heatmap size: {len(timeline_heatmap(timeline_id)['layers'])}") - log.info(f"Child timeline heatmap size: {len(timeline_heatmap(child_timeline_id)['layers'])}") - expected_locally = len(timeline_heatmap(timeline_id)["layers"]) - assert expected_locally > 0 + parent_cold, parent_hot = count_timeline_heatmap_layers(timeline_id) + child_cold, child_hot = count_timeline_heatmap_layers(child_timeline_id) + + log.info(f"Parent timeline heatmap size: cold={parent_cold}, hot={parent_hot}") + log.info(f"Child timeline heatmap size: cold={child_cold}, hot={child_hot}") + + # All layers in the heatmap should come from the generation on unarchival. + # Hence, they should be cold. + assert parent_cold > 0 + assert parent_hot == 0 + + expected_locally = parent_cold env.storage_controller.download_heatmap_layers( - TenantShardId(tenant_id, shard_number=0, shard_count=0), timeline_id + TenantShardId(tenant_id, shard_number=0, shard_count=0), child_timeline_id, recurse=True ) - wait_until(lambda: all_layers_downloaded(expected_locally)) + wait_until(lambda: all_layers_downloaded(ps_secondary, expected_locally)) + + for ps in env.pageservers: + ps.http_client().configure_failpoints([("secondary-layer-download-sleep", "off")]) + + # The uploaded heatmap is still empty. Clean up all layers on the secondary. + ps_attached.http_client().tenant_secondary_download(tenant_id, wait_ms=100) + wait_until(lambda: no_layers_downloaded(ps_attached)) + + # Upload a new heatmap. The previously cold layers become hot since they're now resident. + ps_secondary.http_client().tenant_heatmap_upload(tenant_id) + + # Warm up the current secondary. 
+ ps_attached.http_client().tenant_secondary_download(tenant_id, wait_ms=100) + wait_until(lambda: all_layers_downloaded(ps_secondary, expected_locally)) diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index afc7ef3e01..1d9f385358 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -5,7 +5,7 @@ from __future__ import annotations from concurrent.futures import ThreadPoolExecutor from pathlib import Path -from typing import TYPE_CHECKING, cast +from typing import TYPE_CHECKING, Any, cast import pytest from fixtures.log_helper import log @@ -118,10 +118,20 @@ def post_checks(env: NeonEnv, test_output_dir: Path, db_name: str, endpoint: End pageserver.http_client().timeline_gc(shard, env.initial_timeline, None) +def patch_tenant_conf(tenant_conf: dict[str, Any], reldir_type: str) -> dict[str, Any]: + tenant_conf = tenant_conf.copy() + if reldir_type == "v2": + tenant_conf["rel_size_v2_enabled"] = "true" + else: + tenant_conf["rel_size_v2_enabled"] = "false" + return tenant_conf + + # Run the main PostgreSQL regression tests, in src/test/regress. # @pytest.mark.timeout(3000) # Contains many sub-tests, is slow in debug builds @pytest.mark.parametrize("shard_count", [None, 4]) +@pytest.mark.parametrize("reldir_type", ["v1", "v2"]) def test_pg_regress( neon_env_builder: NeonEnvBuilder, test_output_dir: Path, @@ -130,6 +140,7 @@ def test_pg_regress( base_dir: Path, pg_distrib_dir: Path, shard_count: int | None, + reldir_type: str, ): DBNAME = "regression" @@ -142,7 +153,7 @@ def test_pg_regress( neon_env_builder.enable_pageserver_remote_storage(s3_storage()) env = neon_env_builder.init_start( - initial_tenant_conf=TENANT_CONF, + initial_tenant_conf=patch_tenant_conf(TENANT_CONF, reldir_type), initial_tenant_shard_count=shard_count, ) @@ -196,6 +207,7 @@ def test_pg_regress( # @pytest.mark.timeout(1500) # Contains many sub-tests, is slow in debug builds @pytest.mark.parametrize("shard_count", [None, 4]) +@pytest.mark.parametrize("reldir_type", ["v1", "v2"]) def test_isolation( neon_env_builder: NeonEnvBuilder, test_output_dir: Path, @@ -204,6 +216,7 @@ def test_isolation( base_dir: Path, pg_distrib_dir: Path, shard_count: int | None, + reldir_type: str, ): DBNAME = "isolation_regression" @@ -211,7 +224,8 @@ def test_isolation( neon_env_builder.num_pageservers = shard_count neon_env_builder.enable_pageserver_remote_storage(s3_storage()) env = neon_env_builder.init_start( - initial_tenant_conf=TENANT_CONF, initial_tenant_shard_count=shard_count + initial_tenant_conf=patch_tenant_conf(TENANT_CONF, reldir_type), + initial_tenant_shard_count=shard_count, ) # Connect to postgres and create a database called "regression". @@ -267,6 +281,7 @@ def test_isolation( # Run extra Neon-specific pg_regress-based tests. The tests and their # schedule file are in the sql_regress/ directory. 
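(Editorial aside.) A quick illustration of what `patch_tenant_conf` above yields for the two parametrizations; the base dict is a hypothetical stand-in for `TENANT_CONF` and is left untouched because the helper copies its input:

```python
base = {"gc_period": "5 s"}  # hypothetical stand-in for TENANT_CONF

assert patch_tenant_conf(base, "v2")["rel_size_v2_enabled"] == "true"
assert patch_tenant_conf(base, "v1")["rel_size_v2_enabled"] == "false"
assert "rel_size_v2_enabled" not in base  # the copy keeps the original intact
```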
@pytest.mark.parametrize("shard_count", [None, 4]) +@pytest.mark.parametrize("reldir_type", ["v1", "v2"]) def test_sql_regress( neon_env_builder: NeonEnvBuilder, test_output_dir: Path, @@ -275,6 +290,7 @@ def test_sql_regress( base_dir: Path, pg_distrib_dir: Path, shard_count: int | None, + reldir_type: str, ): DBNAME = "regression" @@ -282,7 +298,8 @@ def test_sql_regress( neon_env_builder.num_pageservers = shard_count neon_env_builder.enable_pageserver_remote_storage(s3_storage()) env = neon_env_builder.init_start( - initial_tenant_conf=TENANT_CONF, initial_tenant_shard_count=shard_count + initial_tenant_conf=patch_tenant_conf(TENANT_CONF, reldir_type), + initial_tenant_shard_count=shard_count, ) # Connect to postgres and create a database called "regression". @@ -345,9 +362,7 @@ def test_tx_abort_with_many_relations( """ env = neon_env_builder.init_start( - initial_tenant_conf={ - "rel_size_v2_enabled": "true" if reldir_type == "v2" else "false", - } + initial_tenant_conf=patch_tenant_conf({}, reldir_type), ) ep = env.endpoints.create_start( "main", @@ -358,14 +373,25 @@ def test_tx_abort_with_many_relations( ], ) + if reldir_type == "v1": + assert ( + env.pageserver.http_client().timeline_detail(env.initial_tenant, env.initial_timeline)[ + "rel_size_migration" + ] + == "legacy" + ) + else: + assert ( + env.pageserver.http_client().timeline_detail(env.initial_tenant, env.initial_timeline)[ + "rel_size_migration" + ] + != "legacy" + ) + # How many relations: this number is tuned to be long enough to take tens of seconds # if the rollback code path is buggy, tripping the test's timeout. - if reldir_type == "v1": - n = 4000 - step = 4000 - else: - n = 100000 - step = 5000 + n = 5000 + step = 2500 def create(): # Create many relations diff --git a/test_runner/regress/test_relations.py b/test_runner/regress/test_relations.py index 3e29c92a96..07eacfc775 100644 --- a/test_runner/regress/test_relations.py +++ b/test_runner/regress/test_relations.py @@ -19,6 +19,17 @@ def test_pageserver_reldir_v2( endpoint.safe_psql("CREATE TABLE foo1 (id INTEGER PRIMARY KEY, val text)") endpoint.safe_psql("CREATE TABLE foo2 (id INTEGER PRIMARY KEY, val text)") + assert ( + env.pageserver.http_client().timeline_detail(env.initial_tenant, env.initial_timeline)[ + "rel_size_migration" + ] + == "legacy" + ) + + # Ensure the pageserver accepts the table creation SQLs before the migration. In theory, we can also do + # a "wait_flush_lsn" here, but it's easier to just do a restart. 
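(Editorial aside.) The `rel_size_migration` checks above, and the ones in test_relations.py further down in this diff, repeat the same `timeline_detail` lookup; a hypothetical helper (a sketch only, the tests inline the expression) would read:

```python
def rel_size_migration_state(env: NeonEnv) -> str:
    # Convenience wrapper around the expression the tests repeat inline.
    detail = env.pageserver.http_client().timeline_detail(env.initial_tenant, env.initial_timeline)
    return detail["rel_size_migration"]

# e.g. the v1 branch of the assertion above collapses to:
#     assert rel_size_migration_state(env) == "legacy"
```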
+ env.pageserver.restart() + # Switch to v2 env.pageserver.http_client().update_tenant_config( env.initial_tenant, @@ -27,6 +38,13 @@ def test_pageserver_reldir_v2( }, ) + assert ( + env.pageserver.http_client().timeline_detail(env.initial_tenant, env.initial_timeline)[ + "rel_size_migration" + ] + == "legacy" + ) + # Check if both relations are still accessible endpoint.safe_psql("SELECT * FROM foo1") endpoint.safe_psql("SELECT * FROM foo2") @@ -41,12 +59,14 @@ def test_pageserver_reldir_v2( # Create a relation in v2 endpoint.safe_psql("CREATE TABLE foo3 (id INTEGER PRIMARY KEY, val text)") + endpoint.safe_psql("CREATE TABLE foo4 (id INTEGER PRIMARY KEY, val text)") # Delete a relation in v1 endpoint.safe_psql("DROP TABLE foo1") # Check if both relations are still accessible endpoint.safe_psql("SELECT * FROM foo2") endpoint.safe_psql("SELECT * FROM foo3") + endpoint.safe_psql("SELECT * FROM foo4") # Restart the endpoint endpoint.stop() @@ -57,7 +77,7 @@ def test_pageserver_reldir_v2( endpoint.safe_psql("DROP TABLE IF EXISTS foo1") endpoint.safe_psql("SELECT * FROM foo2") endpoint.safe_psql("SELECT * FROM foo3") - + endpoint.safe_psql("SELECT * FROM foo4") endpoint.safe_psql("DROP TABLE foo3") endpoint.stop() endpoint.start() @@ -66,3 +86,25 @@ def test_pageserver_reldir_v2( endpoint.safe_psql("DROP TABLE IF EXISTS foo1") endpoint.safe_psql("SELECT * FROM foo2") endpoint.safe_psql("DROP TABLE IF EXISTS foo3") + endpoint.safe_psql("SELECT * FROM foo4") + + # Set the config to false to emulate the case where the config is not persisted when the tenant gets detached/attached. + env.pageserver.http_client().update_tenant_config( + env.initial_tenant, + { + "rel_size_v2_enabled": False, + }, + ) + + # Check if the relation is still accessible + endpoint.safe_psql("SELECT * FROM foo2") + endpoint.safe_psql("SELECT * FROM foo4") + + env.pageserver.restart() + + assert ( + env.pageserver.http_client().timeline_detail(env.initial_tenant, env.initial_timeline)[ + "rel_size_migration" + ] + == "migrating" + ) diff --git a/test_runner/regress/test_safekeeper_deletion.py b/test_runner/regress/test_safekeeper_deletion.py new file mode 100644 index 0000000000..b46095d583 --- /dev/null +++ b/test_runner/regress/test_safekeeper_deletion.py @@ -0,0 +1,331 @@ +from __future__ import annotations + +import threading +import time +from contextlib import closing +from enum import StrEnum + +import pytest +import requests +from fixtures.common_types import Lsn, TimelineId +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + Endpoint, + NeonEnvBuilder, +) +from fixtures.remote_storage import S3Storage, s3_storage +from fixtures.safekeeper_utils import is_segment_offloaded +from fixtures.utils import wait_until + + +@pytest.mark.parametrize("auth_enabled", [False, True]) +def test_safekeeper_delete_timeline(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): + neon_env_builder.auth_enabled = auth_enabled + env = neon_env_builder.init_start() + + # FIXME: are these expected? + env.pageserver.allowed_errors.extend( + [ + ".*Timeline .* was not found in global map.*", + ".*Timeline .* was cancelled and cannot be used anymore.*", + ] + ) + + # Create two tenants: one will be deleted, other should be preserved. 
+ tenant_id = env.initial_tenant + timeline_id_1 = env.create_branch("br1") # Active, delete explicitly + timeline_id_2 = env.create_branch("br2") # Inactive, delete explicitly + timeline_id_3 = env.create_branch("br3") # Active, delete with the tenant + timeline_id_4 = env.create_branch("br4") # Inactive, delete with the tenant + + tenant_id_other, timeline_id_other = env.create_tenant() + + # Populate branches + endpoint_1 = env.endpoints.create_start("br1") + endpoint_2 = env.endpoints.create_start("br2") + endpoint_3 = env.endpoints.create_start("br3") + endpoint_4 = env.endpoints.create_start("br4") + endpoint_other = env.endpoints.create_start("main", tenant_id=tenant_id_other) + for endpoint in [endpoint_1, endpoint_2, endpoint_3, endpoint_4, endpoint_other]: + with closing(endpoint.connect()) as conn: + with conn.cursor() as cur: + cur.execute("CREATE TABLE t(key int primary key)") + sk = env.safekeepers[0] + sk_data_dir = sk.data_dir + if not auth_enabled: + sk_http = sk.http_client() + sk_http_other = sk_http + else: + sk_http = sk.http_client(auth_token=env.auth_keys.generate_tenant_token(tenant_id)) + sk_http_other = sk.http_client( + auth_token=env.auth_keys.generate_tenant_token(tenant_id_other) + ) + sk_http_noauth = sk.http_client(gen_sk_wide_token=False) + assert (sk_data_dir / str(tenant_id) / str(timeline_id_1)).is_dir() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_2)).is_dir() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir() + assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() + + # Stop branches which should be inactive and restart Safekeeper to drop its in-memory state. + endpoint_2.stop_and_destroy() + endpoint_4.stop_and_destroy() + sk.stop() + sk.start() + + # Ensure connections to Safekeeper are established + for endpoint in [endpoint_1, endpoint_3, endpoint_other]: + with closing(endpoint.connect()) as conn: + with conn.cursor() as cur: + cur.execute("INSERT INTO t (key) VALUES (1)") + + # Stop all computes gracefully before safekeepers stop responding to them + endpoint_1.stop_and_destroy() + endpoint_3.stop_and_destroy() + + # Remove initial tenant's br1 (active) + assert sk_http.timeline_delete(tenant_id, timeline_id_1)["dir_existed"] + assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_2)).is_dir() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir() + assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() + + # Ensure repeated deletion succeeds + assert not sk_http.timeline_delete(tenant_id, timeline_id_1)["dir_existed"] + assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_2)).is_dir() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir() + assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() + + if auth_enabled: + # Ensure we cannot delete the other tenant + for sk_h in [sk_http, sk_http_noauth]: + with pytest.raises(sk_h.HTTPError, match="Forbidden|Unauthorized"): + assert sk_h.timeline_delete(tenant_id_other, timeline_id_other) + with pytest.raises(sk_h.HTTPError, match="Forbidden|Unauthorized"): + assert sk_h.tenant_delete_force(tenant_id_other) + assert 
(sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() + + # Remove initial tenant's br2 (inactive) + assert sk_http.timeline_delete(tenant_id, timeline_id_2)["dir_existed"] + assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists() + assert not (sk_data_dir / str(tenant_id) / str(timeline_id_2)).exists() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir() + assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() + + # Remove non-existing branch, should succeed + assert not sk_http.timeline_delete(tenant_id, TimelineId("00" * 16))["dir_existed"] + assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists() + assert not (sk_data_dir / str(tenant_id) / str(timeline_id_2)).exists() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).exists() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir() + assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() + + # Remove initial tenant fully (two branches are active) + response = sk_http.tenant_delete_force(tenant_id) + assert response[str(timeline_id_3)]["dir_existed"] + assert not (sk_data_dir / str(tenant_id)).exists() + assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() + + # Remove initial tenant again. + response = sk_http.tenant_delete_force(tenant_id) + # assert response == {} + assert not (sk_data_dir / str(tenant_id)).exists() + assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() + + # Ensure the other tenant still works + sk_http_other.timeline_status(tenant_id_other, timeline_id_other) + with closing(endpoint_other.connect()) as conn: + with conn.cursor() as cur: + cur.execute("INSERT INTO t (key) VALUES (123)") + + +def test_safekeeper_delete_timeline_under_load(neon_env_builder: NeonEnvBuilder): + """ + Test deleting timelines on a safekeeper while they're under load. + + This should not happen under normal operation, but it can happen if + there is some rogue compute/pageserver that is writing/reading to a + safekeeper that we're migrating a timeline away from, or if the timeline + is being deleted while such a rogue client is running. 
+ """ + neon_env_builder.auth_enabled = True + env = neon_env_builder.init_start() + + # Create two endpoints that will generate load + timeline_id_a = env.create_branch("deleteme_a") + timeline_id_b = env.create_branch("deleteme_b") + + endpoint_a = env.endpoints.create("deleteme_a") + endpoint_a.start() + endpoint_b = env.endpoints.create("deleteme_b") + endpoint_b.start() + + # Get tenant and timeline IDs + tenant_id = env.initial_tenant + + # Start generating load on both timelines + def generate_load(endpoint: Endpoint): + with closing(endpoint.connect()) as conn: + with conn.cursor() as cur: + cur.execute("CREATE TABLE IF NOT EXISTS t(key int, value text)") + while True: + try: + cur.execute("INSERT INTO t SELECT generate_series(1,1000), 'data'") + except: # noqa + # Ignore errors since timeline may be deleted + break + + t_a = threading.Thread(target=generate_load, args=(endpoint_a,)) + t_b = threading.Thread(target=generate_load, args=(endpoint_b,)) + try: + t_a.start() + t_b.start() + + # Let the load run for a bit + log.info("Warming up...") + time.sleep(2) + + # Safekeeper errors will propagate to the pageserver: it is correct that these are + # logged at error severity because they indicate the pageserver is trying to read + # a timeline that it shouldn't. + env.pageserver.allowed_errors.extend( + [ + ".*Timeline.*was cancelled.*", + ".*Timeline.*was not found.*", + ] + ) + + # Try deleting timelines while under load + sk = env.safekeepers[0] + sk_http = sk.http_client(auth_token=env.auth_keys.generate_tenant_token(tenant_id)) + + # Delete first timeline + log.info(f"Deleting {timeline_id_a}...") + assert sk_http.timeline_delete(tenant_id, timeline_id_a, only_local=True)["dir_existed"] + + # Delete second timeline + log.info(f"Deleting {timeline_id_b}...") + assert sk_http.timeline_delete(tenant_id, timeline_id_b, only_local=True)["dir_existed"] + + # Verify timelines are gone from disk + sk_data_dir = sk.data_dir + assert not (sk_data_dir / str(tenant_id) / str(timeline_id_a)).exists() + # assert not (sk_data_dir / str(tenant_id) / str(timeline_id_b)).exists() + + finally: + log.info("Stopping endpoints...") + # Stop endpoints with immediate mode because we deleted the timeline out from under the compute, which may cause it to hang + endpoint_a.stop(mode="immediate") + endpoint_b.stop(mode="immediate") + log.info("Joining threads...") + t_a.join() + t_b.join() + + +class RemoteDeleteFailpoint(StrEnum): + PAUSE = "sk-delete-timeline-remote-pause" + FAIL = "sk-delete-timeline-remote" + + +@pytest.mark.parametrize("failpoint", [RemoteDeleteFailpoint.PAUSE, RemoteDeleteFailpoint.FAIL]) +def test_safekeeper_delete_remote_errors( + neon_env_builder: NeonEnvBuilder, failpoint: RemoteDeleteFailpoint +): + """ + Test that errors and delays during remote deletion are handled correctly. 
+ """ + + # Configure safekeepers with ultra-fast eviction policy + neon_env_builder.safekeeper_extra_opts = [ + "--enable-offload", + "--delete-offloaded-wal", + "--control-file-save-interval", + "1s", + ] + neon_env_builder.enable_safekeeper_remote_storage(s3_storage()) + env = neon_env_builder.init_start() + + # FIXME: pageserver is intermittently emitting this + env.pageserver.allowed_errors.extend( + [ + ".*unsupported command START_WAL_PUSH in START_WAL_PUSH.*", + ] + ) + + timeline_id_a = env.create_branch("deleteme_a") + endpoint_a = env.endpoints.create("deleteme_a") + endpoint_a.start() + with closing(endpoint_a.connect()) as conn: + with conn.cursor() as cur: + # roughly fills one segment + cur.execute("create table t(key int, value text)") + cur.execute("insert into t select generate_series(1,250000), 'payload'") + endpoint_a.stop() + + # Ensure something is uploaded to remote storage + def assert_is_uploaded(): + assert is_segment_offloaded( + env.safekeepers[0], env.initial_tenant, timeline_id_a, Lsn("0/2000000") + ) + + wait_until(assert_is_uploaded) + + def list_timeline_remote(): + assert isinstance(env.safekeepers_remote_storage, S3Storage) + prefix = f"{env.safekeepers_remote_storage.safekeeper_timeline_path(env.initial_tenant, timeline_id_a)}/" + + listing = env.safekeepers_remote_storage.client.list_objects_v2( + Bucket=env.safekeepers_remote_storage.bucket_name, + Prefix=prefix, + ) + return listing.get("Contents", []) + + assert list_timeline_remote() != [] + + sk_http = env.safekeepers[0].http_client() + env.pageserver.http_client().timeline_delete(env.initial_tenant, timeline_id_a) + + # Set up failpoint + if failpoint == RemoteDeleteFailpoint.PAUSE: + sk_http.configure_failpoints((failpoint, "pause")) + elif failpoint == RemoteDeleteFailpoint.FAIL: + sk_http.configure_failpoints((failpoint, "return")) + else: + raise NotImplementedError(f"Unknown failpoint: {failpoint}") + + # Delete the timeline - this should hit the configured failpoint + if failpoint == RemoteDeleteFailpoint.PAUSE: + # Expect time out + with pytest.raises(requests.exceptions.ReadTimeout, match="timed out"): + sk_http.timeline_delete(env.initial_tenant, timeline_id_a, timeout=5) + + # Assert deletion didn't happy yet + assert list_timeline_remote() != [] + + # Unblock the background task that should still be running + sk_http.configure_failpoints((failpoint, "off")) + + # Expect that after unblocking, remote deletion proceeds + def assert_remote_deleted(): + assert list_timeline_remote() == [] + + wait_until(assert_remote_deleted) + + elif failpoint == RemoteDeleteFailpoint.FAIL: + # Expect immediate failure + with pytest.raises(sk_http.HTTPError, match="Internal Server Error"): + sk_http.timeline_delete(env.initial_tenant, timeline_id_a) + + sk_http.configure_failpoints((failpoint, "off")) + else: + raise NotImplementedError(f"Unknown failpoint: {failpoint}") + + # Retry should succeed + sk_http.timeline_delete(env.initial_tenant, timeline_id_a) + + # Remote storage should be empty + assert list_timeline_remote() == [] diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index f58bbcd3c0..cb28f5b12d 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -1814,14 +1814,3 @@ def test_sharding_gc( shard_gc_cutoff_lsn = Lsn(shard_index["metadata_bytes"]["latest_gc_cutoff_lsn"]) log.info(f"Shard {shard_number} cutoff LSN: {shard_gc_cutoff_lsn}") assert shard_gc_cutoff_lsn == shard_0_gc_cutoff_lsn - - for ps in 
env.pageservers: - # This is not okay, but it's not a scrubber bug: it's a pageserver issue that is exposed by - # the specific pattern of aggressive checkpointing+image layer generation + GC that this test does. - # TODO: remove when https://github.com/neondatabase/neon/issues/10720 is fixed - ps.allowed_errors.extend( - [ - ".*could not find data for key.*", - ".*could not ingest record.*", - ] - ) diff --git a/test_runner/regress/test_signed_char.py b/test_runner/regress/test_signed_char.py new file mode 100644 index 0000000000..8752a1ff3f --- /dev/null +++ b/test_runner/regress/test_signed_char.py @@ -0,0 +1,64 @@ +from pathlib import Path + +from fixtures.neon_fixtures import NeonEnv + +SIGNED_CHAR_EXTRACT = """ + WITH + -- Generates an intermediate table with block numbers of the index + pagenumbers AS ( + SELECT num FROM generate_series(0, (pg_relation_size('test_payload_idx') / 8192) - 1) it(num) + ) + SELECT num, + -- Gets the data of the page, skipping the first 8 bytes which is the LSN + substr(page, 9, 8192-8), + -- Returns information about the GIN index opaque area + (gin_page_opaque_info(page)).* + FROM pagenumbers, + -- Gets a page from the respective blocks of the table + LATERAL (SELECT get_raw_page('test_payload_idx', num)) AS p(page) + -- Filters to only return leaf pages from the GIN Index + WHERE ARRAY['leaf'] = ((gin_page_opaque_info(page)).flags); + """ + + +def test_signed_char(neon_simple_env: NeonEnv): + """ + Test that postgres was compiled with -fsigned-char. + --- + In multi-character keys, the GIN index creates a CRC Hash of the first 3 bytes of the key. + The hash can have the first bit to be set or unset, needing to have a consistent representation + of char across architectures for consistent results. GIN stores these keys by their hashes + which determines the order in which the keys are obtained from the GIN index. + Using -fsigned-char enforces this order across platforms making this consistent. + The following query gets all the data present in the leaf page of a GIN index, + which is ordered by the CRC hash and is consistent across platforms. 
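    (Editorial aside, not part of the patch.) As a rough illustration of why the flag
    matters: every byte of a multibyte UTF-8 sequence is >= 0x80, so the value a C char
    holds for it depends on signedness, which changes key comparisons and hence the page
    image the test pins down:

        >>> b = 0xB3                      # second byte of the UTF-8 encoding of 'ó'
        >>> b - 256 if b >= 128 else b    # value seen with -fsigned-char
        -77
        >>> b                             # value seen with unsigned char
        179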
+    """
+    env = neon_simple_env
+    endpoint = env.endpoints.create_start("main")
+
+    with endpoint.connect().cursor() as ses1:
+        # Add the required extensions
+        ses1.execute("CREATE EXTENSION pg_trgm;")
+        ses1.execute("CREATE EXTENSION pageinspect;")
+        # Create a test table
+        ses1.execute("CREATE TABLE test (payload text);")
+        # Create a GIN-based index
+        ses1.execute(
+            "CREATE INDEX test_payload_idx ON test USING gin (payload gin_trgm_ops) WITH (gin_pending_list_limit = 64);"
+        )
+        # Insert a multibyte character to trigger order-dependent hashing
+        ses1.execute(
+            "INSERT INTO test SELECT '123456789BV' || CHR(127153) /* ace of spades, a multibyte character */ || i::text from generate_series(1, 40) as i(i);"
+        )
+        ses1.execute("INSERT INTO test SELECT 'Bóbr';")
+        # Clean pending list to flush data to pages
+        ses1.execute("select gin_clean_pending_list('test_payload_idx'::regclass);")
+        ses1.execute(SIGNED_CHAR_EXTRACT)
+        pages = ses1.fetchall()
+        # Compare against the expected output
+        page1 = pages[0]
+        data = bytes(page1[1]).hex()
+        with open(Path(__file__).parent / "data" / "test_signed_char.out", encoding="utf-8") as f:
+            expected = f.read().rstrip()
+
+        assert data == expected
diff --git a/test_runner/regress/test_ssl.py b/test_runner/regress/test_ssl.py
new file mode 100644
index 0000000000..25d839aa42
--- /dev/null
+++ b/test_runner/regress/test_ssl.py
@@ -0,0 +1,15 @@
+import requests
+from fixtures.neon_fixtures import NeonEnvBuilder
+
+
+def test_pageserver_https_api(neon_env_builder: NeonEnvBuilder):
+    """
+    Test the HTTPS pageserver management API.
+    If NeonEnv starts with use_https_pageserver_api with no errors, it's already a success.
+    Make a /v1/status request to the HTTPS API to ensure it's appropriately configured.
+    """
+    neon_env_builder.use_https_pageserver_api = True
+    env = neon_env_builder.init_start()
+
+    addr = f"https://localhost:{env.pageserver.service_port.https}/v1/status"
+    requests.get(addr, verify=str(env.ssl_ca_file)).raise_for_status()
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index d5acc257b2..29919f2fe7 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -26,6 +26,7 @@ from fixtures.neon_fixtures import (
     PgBin,
     StorageControllerApiException,
     StorageControllerLeadershipStatus,
+    StorageControllerMigrationConfig,
     last_flush_lsn_upload,
 )
 from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
@@ -765,7 +766,10 @@ def test_storage_controller_stuck_compute_hook(
         # status is cleared.
handle_params["status"] = 423 migrate_fut = executor.submit( - env.storage_controller.tenant_shard_migrate, shard_0_id, dest_ps_id + env.storage_controller.tenant_shard_migrate, + shard_0_id, + dest_ps_id, + config=StorageControllerMigrationConfig(prewarm=False, override_scheduler=True), ) def logged_stuck(): @@ -793,7 +797,10 @@ def test_storage_controller_stuck_compute_hook( # Now, do a migration in the opposite direction handle_params["status"] = 423 migrate_fut = executor.submit( - env.storage_controller.tenant_shard_migrate, shard_0_id, origin_pageserver.id + env.storage_controller.tenant_shard_migrate, + shard_0_id, + origin_pageserver.id, + config=StorageControllerMigrationConfig(prewarm=False, override_scheduler=True), ) def logged_stuck_again(): @@ -1027,7 +1034,11 @@ def test_storage_controller_compute_hook_revert( with pytest.raises(StorageControllerApiException, match="Timeout waiting for shard"): # We expect the controller to give us an error because its reconciliation timed out # waiting for the compute hook. - env.storage_controller.tenant_shard_migrate(tenant_shard_id, pageserver_b.id) + env.storage_controller.tenant_shard_migrate( + tenant_shard_id, + pageserver_b.id, + config=StorageControllerMigrationConfig(prewarm=False, override_scheduler=True), + ) # Although the migration API failed, the hook should still see pageserver B (it remembers what # was posted even when returning an error code) @@ -1068,7 +1079,11 @@ def test_storage_controller_compute_hook_revert( # Migrate B -> A, with a working compute hook: the controller should notify the hook because the # last update it made that was acked (423) by the compute was for node B. handle_params["status"] = 200 - env.storage_controller.tenant_shard_migrate(tenant_shard_id, pageserver_a.id) + env.storage_controller.tenant_shard_migrate( + tenant_shard_id, + pageserver_a.id, + config=StorageControllerMigrationConfig(prewarm=False, override_scheduler=True), + ) wait_until(lambda: notified_ps(pageserver_a.id)) @@ -1949,6 +1964,9 @@ def test_storcon_cli(neon_env_builder: NeonEnvBuilder): env.storage_controller.tenant_describe(tenant_id)["shards"][0]["node_attached"] ) ), + # A simple migration where we will ignore scheduling (force=true) and do it immediately (prewarm=false) + "--prewarm=false", + "--override-scheduler=true", ] ) @@ -3208,6 +3226,7 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder): "host": "localhost", "port": sk_0.port.pg, "http_port": sk_0.port.http, + "https_port": None, "version": 5957, "availability_zone_id": "us-east-2b", } @@ -3242,6 +3261,24 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder): assert eq_safekeeper_records(body, inserted_now) + # https_port appears during migration + body["https_port"] = 123 + target.on_safekeeper_deploy(fake_id, body) + inserted_now = target.get_safekeeper(fake_id) + assert target.get_safekeepers() == [inserted_now] + assert inserted_now is not None + assert eq_safekeeper_records(body, inserted_now) + env.storage_controller.consistency_check() + + # https_port rollback + body["https_port"] = None + target.on_safekeeper_deploy(fake_id, body) + inserted_now = target.get_safekeeper(fake_id) + assert target.get_safekeepers() == [inserted_now] + assert inserted_now is not None + assert eq_safekeeper_records(body, inserted_now) + env.storage_controller.consistency_check() + # some small tests for the scheduling policy querying and returning APIs newest_info = target.get_safekeeper(inserted["id"]) assert newest_info @@ 
-3774,6 +3811,7 @@ def test_storage_controller_node_flap_detach_race( wait_until(validate_locations, timeout=10) +@run_only_on_default_postgres("this is like a 'unit test' against storcon db") def test_update_node_on_registration(neon_env_builder: NeonEnvBuilder): """ Check that storage controller handles node_register requests with updated fields correctly. @@ -3865,3 +3903,108 @@ def test_storage_controller_location_conf_equivalence(neon_env_builder: NeonEnvB ) assert reconciles_after_restart == 0 + + +@pytest.mark.parametrize("wrong_az", [True, False]) +def test_storage_controller_graceful_migration(neon_env_builder: NeonEnvBuilder, wrong_az: bool): + """ + Test that the graceful migration API goes through the process of + creating a secondary & waiting for it to warm up before cutting over, when + we use the prewarm=True flag to the API. + """ + + # 2 pageservers in 2 AZs, so that each AZ has a pageserver we can migrate to + neon_env_builder.num_pageservers = 4 + neon_env_builder.num_azs = 2 + + env = neon_env_builder.init_start() + + # Enable secondary location (neon_local disables by default) + env.storage_controller.tenant_policy_update(env.initial_tenant, {"placement": {"Attached": 1}}) + env.storage_controller.reconcile_until_idle() + + initial_desc = env.storage_controller.tenant_describe(env.initial_tenant)["shards"][0] + initial_ps_id = initial_desc["node_attached"] + initial_secondary_id = initial_desc["node_secondary"][0] + initial_ps_az = initial_desc["preferred_az_id"] + initial_ps = [ps for ps in env.pageservers if ps.id == initial_ps_id][0] + + if wrong_az: + dest_ps = [ + ps + for ps in env.pageservers + if ps.id != initial_ps_id + and ps.az_id != initial_ps_az + and ps.id != initial_secondary_id + ][0] + else: + dest_ps = [ + ps + for ps in env.pageservers + if ps.id != initial_ps_id + and ps.az_id == initial_ps_az + and ps.id != initial_secondary_id + ][0] + + log.info( + f"Migrating to {dest_ps.id} in AZ {dest_ps.az_id} (from {initial_ps_id} in AZ {initial_ps_az})" + ) + dest_ps_id = dest_ps.id + + # Set a failpoint so that the migration will block at the point it has a secondary location + for ps in env.pageservers: + ps.http_client().configure_failpoints(("secondary-layer-download-pausable", "pause")) + + # Before migration, our destination has no locations. Guaranteed because any secondary for our + # tenant will be in another AZ. 
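(Editorial aside.) For orientation, the three `StorageControllerMigrationConfig` flavours exercised in this diff, with field values taken verbatim from the tests; the variable names are ours:

```python
from fixtures.neon_fixtures import StorageControllerMigrationConfig

# Immediate cut-over that skips scheduler checks (compute-hook and storcon CLI tests above).
immediate = StorageControllerMigrationConfig(prewarm=False, override_scheduler=True)

# Graceful migration: create a secondary and let it warm up before cutting over (this test).
graceful = StorageControllerMigrationConfig(prewarm=True, override_scheduler=False)

# Cold migration with tight timeouts, from test_migration_to_cold_secondary earlier in the diff.
cold = StorageControllerMigrationConfig(
    secondary_warmup_timeout="5s", secondary_download_request_timeout="2s", prewarm=False
)
```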
+ assert dest_ps.http_client().tenant_list_locations()["tenant_shards"] == [] + + if wrong_az: + # If migrating to the wrong AZ, first check that omitting force flag results in rejection + with pytest.raises(StorageControllerApiException, match="worse-scoring node"): + env.storage_controller.tenant_shard_migrate( + TenantShardId(env.initial_tenant, 0, 0), + dest_ps_id, + config=StorageControllerMigrationConfig(prewarm=True, override_scheduler=False), + ) + + # Turn off ordinary optimisations so that our migration will stay put once complete + env.storage_controller.tenant_policy_update(env.initial_tenant, {"scheduling": "Essential"}) + + # We expect this API call to succeed, and result in a new secondary location on the destination + env.storage_controller.tenant_shard_migrate( + TenantShardId(env.initial_tenant, 0, 0), + dest_ps_id, + config=StorageControllerMigrationConfig(prewarm=True, override_scheduler=wrong_az), + ) + + def secondary_at_dest(): + locs = dest_ps.http_client().tenant_list_locations()["tenant_shards"] + assert len(locs) == 1 + assert locs[0][0] == str(env.initial_tenant) + assert locs[0][1]["mode"] == "Secondary" + + wait_until(secondary_at_dest) + + # Unblock secondary downloads + for ps in env.pageservers: + ps.http_client().configure_failpoints(("secondary-layer-download-pausable", "off")) + + # Pump the reconciler to avoid waiting for background reconciles + env.storage_controller.reconcile_until_idle() + + # We should be attached at the destination + locs = dest_ps.http_client().tenant_list_locations()["tenant_shards"] + assert len(locs) == 1 + assert locs[0][1]["mode"] == "AttachedSingle" + + # Nothing left behind at the origin + if wrong_az: + # We're in essential scheduling mode, so the end state should be attached in the migration + # destination and a secondary in the original location + assert ( + initial_ps.http_client().tenant_list_locations()["tenant_shards"][0][1]["mode"] + == "Secondary" + ) + else: + assert initial_ps.http_client().tenant_list_locations()["tenant_shards"] == [] diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py index d44c176b35..0f4e5688a9 100644 --- a/test_runner/regress/test_storage_scrubber.py +++ b/test_runner/regress/test_storage_scrubber.py @@ -312,17 +312,6 @@ def test_scrubber_physical_gc_ancestors(neon_env_builder: NeonEnvBuilder, shard_ drop_local_state(env, tenant_id) workload.validate() - for ps in env.pageservers: - # This is not okay, but it's not a scrubber bug: it's a pageserver issue that is exposed by - # the specific pattern of aggressive checkpointing+image layer generation + GC that this test does. - # TODO: remove when https://github.com/neondatabase/neon/issues/10720 is fixed - ps.allowed_errors.extend( - [ - ".*could not find data for key.*", - ".*could not ingest record.*", - ] - ) - def test_scrubber_physical_gc_timeline_deletion(neon_env_builder: NeonEnvBuilder): """ diff --git a/test_runner/regress/test_vm_bits.py b/test_runner/regress/test_vm_bits.py index 4865178ca8..b30c02e0e4 100644 --- a/test_runner/regress/test_vm_bits.py +++ b/test_runner/regress/test_vm_bits.py @@ -327,9 +327,9 @@ def test_check_visibility_map(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): log.info(f"pgbench run {i+1}/{PGBENCH_RUNS}") endpoint.safe_psql(f"create database {dbname}") connstr = endpoint.connstr(dbname=dbname) - # pgbench -i will automatically vacuum the tables. This creates the visibility map. 
- pg_bin.run(["pgbench", "-i", "-s", "10", connstr]) - # Freeze the tuples to set the initial frozen bit. + # Initialize the data set, but don't vacuum yet. + pg_bin.run(["pgbench", "-i", "-s", "8", "-n", connstr]) + # Vacuum to create the visibility map, and freeze the tuples to set the frozen bit. endpoint.safe_psql("vacuum freeze", dbname=dbname) # Run pgbench. pg_bin.run(["pgbench", "-c", "32", "-j", "8", "-T", "10", connstr]) @@ -354,19 +354,3 @@ def test_check_visibility_map(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): row = cur.fetchone() assert row is not None assert row[0] == 0, f"{row[0]} inconsistent VM pages (frozen)" - - # Vacuum and freeze the tables, and check that the visibility map is still accurate. - for dbname in dbnames: - log.info(f"Vacuuming and checking visibility map for {dbname}") - with endpoint.cursor(dbname=dbname) as cur: - cur.execute("vacuum freeze") - - cur.execute("select count(*) from pg_check_visible('pgbench_accounts')") - row = cur.fetchone() - assert row is not None - assert row[0] == 0, f"{row[0]} inconsistent VM pages (visible)" - - cur.execute("select count(*) from pg_check_frozen('pgbench_accounts')") - row = cur.fetchone() - assert row is not None - assert row[0] == 0, f"{row[0]} inconsistent VM pages (frozen)" diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index c5045fe4a4..0366e88389 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -27,7 +27,6 @@ from fixtures.metrics import parse_metrics from fixtures.neon_fixtures import ( Endpoint, NeonEnvBuilder, - NeonPageserver, PgBin, PgProtocol, Safekeeper, @@ -38,8 +37,6 @@ from fixtures.pageserver.utils import ( assert_prefix_empty, assert_prefix_not_empty, timeline_delete_wait_completed, - wait_for_last_record_lsn, - wait_for_upload, ) from fixtures.pg_version import PgVersion from fixtures.port_distributor import PortDistributor @@ -55,9 +52,16 @@ from fixtures.safekeeper.http import ( TimelineCreateRequest, ) from fixtures.safekeeper.utils import wait_walreceivers_absent +from fixtures.safekeeper_utils import ( + is_flush_lsn_caught_up, + is_segment_offloaded, + is_wal_trimmed, + wait_lsn_force_checkpoint, + wait_lsn_force_checkpoint_at, + wait_lsn_force_checkpoint_at_sk, +) from fixtures.utils import ( PropagatingThread, - get_dir_size, query_scalar, run_only_on_default_postgres, skip_in_debug_build, @@ -69,68 +73,6 @@ if TYPE_CHECKING: from typing import Any, Self -def wait_lsn_force_checkpoint( - tenant_id: TenantId, - timeline_id: TimelineId, - endpoint: Endpoint, - ps: NeonPageserver, - pageserver_conn_options=None, -): - pageserver_conn_options = pageserver_conn_options or {} - lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) - log.info(f"pg_current_wal_flush_lsn is {lsn}, waiting for it on pageserver") - - wait_lsn_force_checkpoint_at(lsn, tenant_id, timeline_id, ps, pageserver_conn_options) - - -def wait_lsn_force_checkpoint_at_sk( - safekeeper: Safekeeper, - tenant_id: TenantId, - timeline_id: TimelineId, - ps: NeonPageserver, - pageserver_conn_options=None, -): - sk_flush_lsn = safekeeper.get_flush_lsn(tenant_id, timeline_id) - wait_lsn_force_checkpoint_at(sk_flush_lsn, tenant_id, timeline_id, ps, pageserver_conn_options) - - -def wait_lsn_force_checkpoint_at( - lsn: Lsn, - tenant_id: TenantId, - timeline_id: TimelineId, - ps: NeonPageserver, - pageserver_conn_options=None, -): - """ - Wait until pageserver receives given lsn, force checkpoint and wait for 
- upload, i.e. remote_consistent_lsn advancement. - """ - pageserver_conn_options = pageserver_conn_options or {} - - auth_token = None - if "password" in pageserver_conn_options: - auth_token = pageserver_conn_options["password"] - - # wait for the pageserver to catch up - wait_for_last_record_lsn( - ps.http_client(auth_token=auth_token), - tenant_id, - timeline_id, - lsn, - ) - - # force checkpoint to advance remote_consistent_lsn - ps.http_client(auth_token).timeline_checkpoint(tenant_id, timeline_id) - - # ensure that remote_consistent_lsn is advanced - wait_for_upload( - ps.http_client(auth_token=auth_token), - tenant_id, - timeline_id, - lsn, - ) - - @dataclass class TimelineMetrics: timeline_id: TimelineId @@ -475,31 +417,6 @@ def wait(f, desc, timeout=30, wait_f=None): wait_f() -def is_segment_offloaded( - sk: Safekeeper, tenant_id: TenantId, timeline_id: TimelineId, seg_end: Lsn -): - http_cli = sk.http_client() - tli_status = http_cli.timeline_status(tenant_id, timeline_id) - log.info(f"sk status is {tli_status}") - return tli_status.backup_lsn >= seg_end - - -def is_flush_lsn_caught_up(sk: Safekeeper, tenant_id: TenantId, timeline_id: TimelineId, lsn: Lsn): - http_cli = sk.http_client() - tli_status = http_cli.timeline_status(tenant_id, timeline_id) - log.info(f"sk status is {tli_status}") - return tli_status.flush_lsn >= lsn - - -def is_wal_trimmed(sk: Safekeeper, tenant_id: TenantId, timeline_id: TimelineId, target_size_mb): - http_cli = sk.http_client() - tli_status = http_cli.timeline_status(tenant_id, timeline_id) - sk_wal_size = get_dir_size(sk.timeline_dir(tenant_id, timeline_id)) - sk_wal_size_mb = sk_wal_size / 1024 / 1024 - log.info(f"Safekeeper id={sk.id} wal_size={sk_wal_size_mb:.2f}MB status={tli_status}") - return sk_wal_size_mb <= target_size_mb - - def test_wal_backup(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 3 remote_storage_kind = s3_storage() @@ -811,60 +728,6 @@ class ProposerPostgres(PgProtocol): self.pg_bin.run(args) -# insert wal in all safekeepers and run sync on proposer -def test_sync_safekeepers( - neon_env_builder: NeonEnvBuilder, - pg_bin: PgBin, - port_distributor: PortDistributor, -): - # We don't really need the full environment for this test, just the - # safekeepers would be enough. 
- neon_env_builder.num_safekeepers = 3 - env = neon_env_builder.init_start() - - tenant_id = TenantId.generate() - timeline_id = TimelineId.generate() - - # write config for proposer - pgdata_dir = os.path.join(env.repo_dir, "proposer_pgdata") - pg = ProposerPostgres( - pgdata_dir, pg_bin, tenant_id, timeline_id, "127.0.0.1", port_distributor.get_port() - ) - pg.create_dir_config(env.get_safekeeper_connstrs()) - - # valid lsn, which is not in the segment start, nor in zero segment - epoch_start_lsn = Lsn("0/16B9188") - begin_lsn = epoch_start_lsn - - # append and commit WAL - lsn_after_append = [] - for i in range(3): - res = env.safekeepers[i].append_logical_message( - tenant_id, - timeline_id, - { - "lm_prefix": "prefix", - "lm_message": "message", - "set_commit_lsn": True, - "send_proposer_elected": True, - "term": 2, - "begin_lsn": int(begin_lsn), - "epoch_start_lsn": int(epoch_start_lsn), - "truncate_lsn": int(epoch_start_lsn), - "pg_version": int(env.pg_version) * 10000, - }, - ) - lsn = Lsn(res["inserted_wal"]["end_lsn"]) - lsn_after_append.append(lsn) - log.info(f"safekeeper[{i}] lsn after append: {lsn}") - - # run sync safekeepers - lsn_after_sync = pg.sync_safekeepers() - log.info(f"lsn after sync = {lsn_after_sync}") - - assert all(lsn_after_sync == lsn for lsn in lsn_after_append) - - @pytest.mark.parametrize("auth_enabled", [False, True]) def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): neon_env_builder.auth_enabled = auth_enabled @@ -1739,214 +1602,6 @@ def test_replace_safekeeper(neon_env_builder: NeonEnvBuilder): show_statuses(env.safekeepers, tenant_id, timeline_id) -@pytest.mark.parametrize("auth_enabled", [False, True]) -def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): - neon_env_builder.auth_enabled = auth_enabled - env = neon_env_builder.init_start() - - # FIXME: are these expected? - env.pageserver.allowed_errors.extend( - [ - ".*Timeline .* was not found in global map.*", - ".*Timeline .* was cancelled and cannot be used anymore.*", - ] - ) - - # Create two tenants: one will be deleted, other should be preserved. 
- tenant_id = env.initial_tenant - timeline_id_1 = env.create_branch("br1") # Active, delete explicitly - timeline_id_2 = env.create_branch("br2") # Inactive, delete explicitly - timeline_id_3 = env.create_branch("br3") # Active, delete with the tenant - timeline_id_4 = env.create_branch("br4") # Inactive, delete with the tenant - - tenant_id_other, timeline_id_other = env.create_tenant() - - # Populate branches - endpoint_1 = env.endpoints.create_start("br1") - endpoint_2 = env.endpoints.create_start("br2") - endpoint_3 = env.endpoints.create_start("br3") - endpoint_4 = env.endpoints.create_start("br4") - endpoint_other = env.endpoints.create_start("main", tenant_id=tenant_id_other) - for endpoint in [endpoint_1, endpoint_2, endpoint_3, endpoint_4, endpoint_other]: - with closing(endpoint.connect()) as conn: - with conn.cursor() as cur: - cur.execute("CREATE TABLE t(key int primary key)") - sk = env.safekeepers[0] - sk_data_dir = sk.data_dir - if not auth_enabled: - sk_http = sk.http_client() - sk_http_other = sk_http - else: - sk_http = sk.http_client(auth_token=env.auth_keys.generate_tenant_token(tenant_id)) - sk_http_other = sk.http_client( - auth_token=env.auth_keys.generate_tenant_token(tenant_id_other) - ) - sk_http_noauth = sk.http_client(gen_sk_wide_token=False) - assert (sk_data_dir / str(tenant_id) / str(timeline_id_1)).is_dir() - assert (sk_data_dir / str(tenant_id) / str(timeline_id_2)).is_dir() - assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir() - assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir() - assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() - - # Stop branches which should be inactive and restart Safekeeper to drop its in-memory state. - endpoint_2.stop_and_destroy() - endpoint_4.stop_and_destroy() - sk.stop() - sk.start() - - # Ensure connections to Safekeeper are established - for endpoint in [endpoint_1, endpoint_3, endpoint_other]: - with closing(endpoint.connect()) as conn: - with conn.cursor() as cur: - cur.execute("INSERT INTO t (key) VALUES (1)") - - # Stop all computes gracefully before safekeepers stop responding to them - endpoint_1.stop_and_destroy() - endpoint_3.stop_and_destroy() - - # Remove initial tenant's br1 (active) - assert sk_http.timeline_delete(tenant_id, timeline_id_1)["dir_existed"] - assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists() - assert (sk_data_dir / str(tenant_id) / str(timeline_id_2)).is_dir() - assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir() - assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir() - assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() - - # Ensure repeated deletion succeeds - assert not sk_http.timeline_delete(tenant_id, timeline_id_1)["dir_existed"] - assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists() - assert (sk_data_dir / str(tenant_id) / str(timeline_id_2)).is_dir() - assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir() - assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir() - assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() - - if auth_enabled: - # Ensure we cannot delete the other tenant - for sk_h in [sk_http, sk_http_noauth]: - with pytest.raises(sk_h.HTTPError, match="Forbidden|Unauthorized"): - assert sk_h.timeline_delete(tenant_id_other, timeline_id_other) - with pytest.raises(sk_h.HTTPError, match="Forbidden|Unauthorized"): - assert sk_h.tenant_delete_force(tenant_id_other) - assert 
(sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() - - # Remove initial tenant's br2 (inactive) - assert sk_http.timeline_delete(tenant_id, timeline_id_2)["dir_existed"] - assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists() - assert not (sk_data_dir / str(tenant_id) / str(timeline_id_2)).exists() - assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir() - assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir() - assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() - - # Remove non-existing branch, should succeed - assert not sk_http.timeline_delete(tenant_id, TimelineId("00" * 16))["dir_existed"] - assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists() - assert not (sk_data_dir / str(tenant_id) / str(timeline_id_2)).exists() - assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).exists() - assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir() - assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() - - # Remove initial tenant fully (two branches are active) - response = sk_http.tenant_delete_force(tenant_id) - assert response[str(timeline_id_3)]["dir_existed"] - assert not (sk_data_dir / str(tenant_id)).exists() - assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() - - # Remove initial tenant again. - response = sk_http.tenant_delete_force(tenant_id) - # assert response == {} - assert not (sk_data_dir / str(tenant_id)).exists() - assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() - - # Ensure the other tenant still works - sk_http_other.timeline_status(tenant_id_other, timeline_id_other) - with closing(endpoint_other.connect()) as conn: - with conn.cursor() as cur: - cur.execute("INSERT INTO t (key) VALUES (123)") - - -def test_delete_timeline_under_load(neon_env_builder: NeonEnvBuilder): - """ - Test deleting timelines on a safekeeper while they're under load. - - This should not happen under normal operation, but it can happen if - there is some rogue compute/pageserver that is writing/reading to a - safekeeper that we're migrating a timeline away from, or if the timeline - is being deleted while such a rogue client is running. 
- """ - neon_env_builder.auth_enabled = True - env = neon_env_builder.init_start() - - # Create two endpoints that will generate load - timeline_id_a = env.create_branch("deleteme_a") - timeline_id_b = env.create_branch("deleteme_b") - - endpoint_a = env.endpoints.create("deleteme_a") - endpoint_a.start() - endpoint_b = env.endpoints.create("deleteme_b") - endpoint_b.start() - - # Get tenant and timeline IDs - tenant_id = env.initial_tenant - - # Start generating load on both timelines - def generate_load(endpoint: Endpoint): - with closing(endpoint.connect()) as conn: - with conn.cursor() as cur: - cur.execute("CREATE TABLE IF NOT EXISTS t(key int, value text)") - while True: - try: - cur.execute("INSERT INTO t SELECT generate_series(1,1000), 'data'") - except: # noqa - # Ignore errors since timeline may be deleted - break - - t_a = threading.Thread(target=generate_load, args=(endpoint_a,)) - t_b = threading.Thread(target=generate_load, args=(endpoint_b,)) - try: - t_a.start() - t_b.start() - - # Let the load run for a bit - log.info("Warming up...") - time.sleep(2) - - # Safekeeper errors will propagate to the pageserver: it is correct that these are - # logged at error severity because they indicate the pageserver is trying to read - # a timeline that it shouldn't. - env.pageserver.allowed_errors.extend( - [ - ".*Timeline.*was cancelled.*", - ".*Timeline.*was not found.*", - ] - ) - - # Try deleting timelines while under load - sk = env.safekeepers[0] - sk_http = sk.http_client(auth_token=env.auth_keys.generate_tenant_token(tenant_id)) - - # Delete first timeline - log.info(f"Deleting {timeline_id_a}...") - assert sk_http.timeline_delete(tenant_id, timeline_id_a, only_local=True)["dir_existed"] - - # Delete second timeline - log.info(f"Deleting {timeline_id_b}...") - assert sk_http.timeline_delete(tenant_id, timeline_id_b, only_local=True)["dir_existed"] - - # Verify timelines are gone from disk - sk_data_dir = sk.data_dir - assert not (sk_data_dir / str(tenant_id) / str(timeline_id_a)).exists() - # assert not (sk_data_dir / str(tenant_id) / str(timeline_id_b)).exists() - - finally: - log.info("Stopping endpoints...") - # Stop endpoints with immediate mode because we deleted the timeline out from under the compute, which may cause it to hang - endpoint_a.stop(mode="immediate") - endpoint_b.stop(mode="immediate") - log.info("Joining threads...") - t_a.join() - t_b.join() - - # Basic pull_timeline test. # When live_sk_change is False, compute is restarted to change set of # safekeepers; otherwise it is live reload. @@ -2269,13 +1924,21 @@ def test_membership_api(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 1 env = neon_env_builder.init_start() + # These are expected after timeline deletion on safekeepers. + env.pageserver.allowed_errors.extend( + [ + ".*Timeline .* was not found in global map.*", + ".*Timeline .* was cancelled and cannot be used anymore.*", + ] + ) + tenant_id = env.initial_tenant timeline_id = env.initial_timeline sk = env.safekeepers[0] http_cli = sk.http_client() - sk_id_1 = SafekeeperId(env.safekeepers[0].id, "localhost", sk.port.pg_tenant_only) + sk_id_1 = SafekeeperId(sk.id, "localhost", sk.port.pg_tenant_only) sk_id_2 = SafekeeperId(11, "localhost", 5434) # just a mock # Request to switch before timeline creation should fail. @@ -2303,19 +1966,76 @@ def test_membership_api(neon_env_builder: NeonEnvBuilder): log.info(f"conf after restart: {after_restart}") assert after_restart.generation == 4 - # Switch into disjoint conf. 
     resp = http_cli.membership_switch(tenant_id, timeline_id, non_joint)
     log.info(f"non joint switch resp: {resp}")
     assert resp.previous_conf.generation == 4
-    assert resp.current_conf.generation == 5
+    assert resp.current_conf.generation == 6
 
-    # Switch request to lower conf should be ignored.
-    lower_conf = Configuration(generation=3, members=[], new_members=None)
-    resp = http_cli.membership_switch(tenant_id, timeline_id, lower_conf)
-    log.info(f"lower switch resp: {resp}")
-    assert resp.previous_conf.generation == 5
-    assert resp.current_conf.generation == 5
+    # Switch request to a lower conf should be rejected.
+    lower_conf = Configuration(generation=3, members=[sk_id_1], new_members=None)
+    with pytest.raises(requests.exceptions.HTTPError):
+        http_cli.membership_switch(tenant_id, timeline_id, lower_conf)
+
+    # Now exclude sk from the membership; the timeline should be deleted.
+    excluded_conf = Configuration(generation=7, members=[sk_id_2], new_members=None)
+    http_cli.timeline_exclude(tenant_id, timeline_id, excluded_conf)
+    with pytest.raises(requests.exceptions.HTTPError):
+        http_cli.timeline_status(tenant_id, timeline_id)
+
+
+def test_explicit_timeline_creation(neon_env_builder: NeonEnvBuilder):
+    """
+    Test that having neon.safekeepers starting with g#n: with non-zero n enables
+    generations, which as a side effect disables automatic timeline creation.
+
+    This is kind of a bootstrapping test: here the membership conf & timeline are
+    created manually; later storcon will do that.
+    """
+    neon_env_builder.num_safekeepers = 3
+    env = neon_env_builder.init_start()
+
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    ps = env.pageservers[0]
+    ps_http_cli = ps.http_client()
+
+    http_clis = [sk.http_client() for sk in env.safekeepers]
+
+    config_lines = [
+        "neon.safekeeper_proto_version = 3",
+    ]
+    ep = env.endpoints.create("main", config_lines=config_lines)
+
+    # expected to fail because the timeline is not created on safekeepers
+    with pytest.raises(Exception, match=r".*timed out.*"):
+        ep.start(safekeeper_generation=1, safekeepers=[1, 2, 3], timeout="2s")
+    # figure out initial LSN.
+    ps_timeline_detail = ps_http_cli.timeline_detail(tenant_id, timeline_id)
+    init_lsn = ps_timeline_detail["last_record_lsn"]
+    log.info(f"initial LSN: {init_lsn}")
+    # sk timeline creation request expects minor version
+    pg_version = ps_timeline_detail["pg_version"] * 10000
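+    # e.g. a pageserver-reported pg_version of 17 becomes 170000 here,
+    # i.e. the numeric form that also encodes the minor version.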
+    # create initial mconf
+    sk_ids = [SafekeeperId(sk.id, "localhost", sk.port.pg_tenant_only) for sk in env.safekeepers]
+    mconf = Configuration(generation=1, members=sk_ids, new_members=None)
+    create_r = TimelineCreateRequest(
+        tenant_id, timeline_id, mconf, pg_version, Lsn(init_lsn), commit_lsn=None
+    )
+    log.info(f"sending timeline create: {create_r.to_json()}")
+
+    for sk_http_cli in http_clis:
+        sk_http_cli.timeline_create(create_r)
+    # Once the timeline is created, the endpoint should start.
+    ep.start(safekeeper_generation=1, safekeepers=[1, 2, 3])
+    ep.safe_psql("CREATE TABLE IF NOT EXISTS t(key int, value text)")
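+    # A basic write confirms that the compute can stream WAL through the
+    # explicitly created timeline.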
["num-traits"] } hashbrown = { version = "0.14", features = ["raw"] } indexmap-dff4ba8e3ae991db = { package = "indexmap", version = "1", default-features = false, features = ["std"] } indexmap-f595c2ba2a3f28df = { package = "indexmap", version = "2", features = ["serde"] } -itertools = { version = "0.12" } +itertools-5ef9efb8ec2df382 = { package = "itertools", version = "0.12" } +itertools-93f6ce9d446188ac = { package = "itertools", version = "0.10" } libc = { version = "0.2", features = ["extra_traits", "use_std"] } log = { version = "0.4", default-features = false, features = ["std"] } memchr = { version = "2" }