diff --git a/.github/actionlint.yml b/.github/actionlint.yml index 2b96ce95da..1e6c2d0aa2 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -28,3 +28,7 @@ config-variables: - DEV_AWS_OIDC_ROLE_MANAGE_BENCHMARK_EC2_VMS_ARN - SLACK_ON_CALL_STORAGE_STAGING_STREAM - SLACK_CICD_CHANNEL_ID + - SLACK_STORAGE_CHANNEL_ID + - NEON_DEV_AWS_ACCOUNT_ID + - NEON_PROD_AWS_ACCOUNT_ID + - AWS_ECR_REGION diff --git a/.github/actions/neon-project-create/action.yml b/.github/actions/neon-project-create/action.yml index c9f6b0832e..a393aa6106 100644 --- a/.github/actions/neon-project-create/action.yml +++ b/.github/actions/neon-project-create/action.yml @@ -19,7 +19,11 @@ inputs: default: '[1, 1]' # settings below only needed if you want the project to be sharded from the beginning shard_split_project: - description: 'by default new projects are not shard-split, specify true to shard-split' + description: 'by default new projects are not shard-split initiailly, but only when shard-split threshold is reached, specify true to explicitly shard-split initially' + required: false + default: 'false' + disable_sharding: + description: 'by default new projects use storage controller default policy to shard-split when shard-split threshold is reached, specify true to explicitly disable sharding' required: false default: 'false' admin_api_key: @@ -107,6 +111,21 @@ runs: -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer ${ADMIN_API_KEY}" \ -d "{\"new_shard_count\": $SHARD_COUNT, \"new_stripe_size\": $STRIPE_SIZE}" fi + if [ "${DISABLE_SHARDING}" = "true" ]; then + # determine tenant ID + TENANT_ID=`${PSQL} ${dsn} -t -A -c "SHOW neon.tenant_id"` + + echo "Explicitly disabling shard-splitting for project ${project_id} with tenant_id ${TENANT_ID}" + + echo "Sending PUT request to https://${API_HOST}/regions/${REGION_ID}/api/v1/admin/storage/proxy/control/v1/tenant/${TENANT_ID}/policy" + echo "with body {\"scheduling\": \"Essential\"}" + + # 
we need an ADMIN API KEY to invoke storage controller API for shard splitting (bash -u above checks that the variable is set) + curl -X PUT \ + "https://${API_HOST}/regions/${REGION_ID}/api/v1/admin/storage/proxy/control/v1/tenant/${TENANT_ID}/policy" \ + -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer ${ADMIN_API_KEY}" \ + -d "{\"scheduling\": \"Essential\"}" + fi env: API_HOST: ${{ inputs.api_host }} @@ -116,6 +135,7 @@ runs: MIN_CU: ${{ fromJSON(inputs.compute_units)[0] }} MAX_CU: ${{ fromJSON(inputs.compute_units)[1] }} SHARD_SPLIT_PROJECT: ${{ inputs.shard_split_project }} + DISABLE_SHARDING: ${{ inputs.disable_sharding }} ADMIN_API_KEY: ${{ inputs.admin_api_key }} SHARD_COUNT: ${{ inputs.shard_count }} STRIPE_SIZE: ${{ inputs.stripe_size }} diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index 86a791497c..3740e6dc9c 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -348,6 +348,10 @@ jobs: rerun_failed: true pg_version: ${{ matrix.pg_version }} aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + # `--session-timeout` is equal to (timeout-minutes - 10 minutes) * 60 seconds. + # Attempt to stop tests gracefully to generate test reports + # until they are forcibly stopped by the stricter `timeout-minutes` limit. 
+ extra_params: --session-timeout=${{ inputs.sanitizers != 'enabled' && 3000 || 10200 }} env: TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} CHECK_ONDISK_DATA_COMPATIBILITY: nonempty diff --git a/.github/workflows/_push-to-container-registry.yml b/.github/workflows/_push-to-container-registry.yml index 3c97c8a67a..c938f62ad5 100644 --- a/.github/workflows/_push-to-container-registry.yml +++ b/.github/workflows/_push-to-container-registry.yml @@ -2,7 +2,7 @@ name: Push images to Container Registry on: workflow_call: inputs: - # Example: {"docker.io/neondatabase/neon:13196061314":["369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:13196061314","neoneastus2.azurecr.io/neondatabase/neon:13196061314"]} + # Example: {"docker.io/neondatabase/neon:13196061314":["${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/neon:13196061314","neoneastus2.azurecr.io/neondatabase/neon:13196061314"]} image-map: description: JSON map of images, mapping from a source image to an array of target images that should be pushed. required: true diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 88cb395958..8f3392ceea 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -68,7 +68,7 @@ jobs: tag: needs: [ check-permissions ] runs-on: [ self-hosted, small ] - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned + container: ${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/base:pinned outputs: build-tag: ${{steps.build-tag.outputs.tag}} @@ -263,8 +263,9 @@ jobs: echo "json=$(jq --compact-output '.' 
/tmp/benchmark_durations.json)" >> $GITHUB_OUTPUT benchmarks: - if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks') - needs: [ check-permissions, build-and-test-locally, build-build-tools-image, get-benchmarks-durations ] + # `!failure() && !cancelled()` is required because the workflow depends on the job that can be skipped: `deploy` in PRs + if: github.ref_name == 'main' || (contains(github.event.pull_request.labels.*.name, 'run-benchmarks') && !failure() && !cancelled()) + needs: [ check-permissions, build-build-tools-image, get-benchmarks-durations, deploy ] permissions: id-token: write # aws-actions/configure-aws-credentials statuses: write @@ -858,14 +859,17 @@ jobs: BRANCH: "${{ github.ref_name }}" DEV_ACR: "${{ vars.AZURE_DEV_REGISTRY_NAME }}" PROD_ACR: "${{ vars.AZURE_PROD_REGISTRY_NAME }}" + DEV_AWS: "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}" + PROD_AWS: "${{ vars.NEON_PROD_AWS_ACCOUNT_ID }}" + AWS_REGION: "${{ vars.AWS_ECR_REGION }}" push-neon-image-dev: needs: [ generate-image-maps, neon-image ] uses: ./.github/workflows/_push-to-container-registry.yml with: image-map: '${{ needs.generate-image-maps.outputs.neon-dev }}' - aws-region: eu-central-1 - aws-account-ids: "369495373322" + aws-region: ${{ vars.AWS_ECR_REGION }} + aws-account-ids: "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}" azure-client-id: ${{ vars.AZURE_DEV_CLIENT_ID }} azure-subscription-id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }} azure-tenant-id: ${{ vars.AZURE_TENANT_ID }} @@ -880,8 +884,8 @@ jobs: uses: ./.github/workflows/_push-to-container-registry.yml with: image-map: '${{ needs.generate-image-maps.outputs.compute-dev }}' - aws-region: eu-central-1 - aws-account-ids: "369495373322" + aws-region: ${{ vars.AWS_ECR_REGION }} + aws-account-ids: "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}" azure-client-id: ${{ vars.AZURE_DEV_CLIENT_ID }} azure-subscription-id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }} azure-tenant-id: ${{ vars.AZURE_TENANT_ID }} @@ -897,8 +901,8 @@ 
jobs: uses: ./.github/workflows/_push-to-container-registry.yml with: image-map: '${{ needs.generate-image-maps.outputs.neon-prod }}' - aws-region: eu-central-1 - aws-account-ids: "093970136003" + aws-region: ${{ vars.AWS_ECR_REGION }} + aws-account-ids: "${{ vars.NEON_PROD_AWS_ACCOUNT_ID }}" azure-client-id: ${{ vars.AZURE_PROD_CLIENT_ID }} azure-subscription-id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }} azure-tenant-id: ${{ vars.AZURE_TENANT_ID }} @@ -914,8 +918,8 @@ jobs: uses: ./.github/workflows/_push-to-container-registry.yml with: image-map: '${{ needs.generate-image-maps.outputs.compute-prod }}' - aws-region: eu-central-1 - aws-account-ids: "093970136003" + aws-region: ${{ vars.AWS_ECR_REGION }} + aws-account-ids: "${{ vars.NEON_PROD_AWS_ACCOUNT_ID }}" azure-client-id: ${{ vars.AZURE_PROD_CLIENT_ID }} azure-subscription-id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }} azure-tenant-id: ${{ vars.AZURE_TENANT_ID }} @@ -1028,7 +1032,7 @@ jobs: statuses: write contents: write runs-on: [ self-hosted, small ] - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest + container: ${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/ansible:latest steps: - uses: actions/checkout@v4 @@ -1177,6 +1181,22 @@ jobs: exit 1 fi + notify-storage-release-deploy-failure: + needs: [ deploy ] + # We want this to run even if (transitive) dependencies are skipped, because deploy should really be successful on release branch workflow runs. 
+ if: github.ref_name == 'release' && needs.deploy.result != 'success' && always() + runs-on: ubuntu-22.04 + steps: + - name: Post release-deploy failure to team-storage slack channel + uses: slackapi/slack-github-action@v2 + with: + method: chat.postMessage + token: ${{ secrets.SLACK_BOT_TOKEN }} + payload: | + channel: ${{ vars.SLACK_STORAGE_CHANNEL_ID }} + text: | + 🔴 @oncall-storage: deploy job on release branch had unexpected status "${{ needs.deploy.result }}" <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>. + # The job runs on `release` branch and copies compatibility data and Neon artifact from the last *release PR* to the latest directory promote-compatibility-data: needs: [ deploy ] @@ -1273,7 +1293,7 @@ jobs: done pin-build-tools-image: - needs: [ build-build-tools-image, push-compute-image-prod, push-neon-image-prod, build-and-test-locally ] + needs: [ build-build-tools-image, test-images, build-and-test-locally ] if: github.ref_name == 'main' uses: ./.github/workflows/pin-build-tools-image.yml with: diff --git a/.github/workflows/build_and_test_with_sanitizers.yml b/.github/workflows/build_and_test_with_sanitizers.yml index 2bc938509f..e40b02b5d2 100644 --- a/.github/workflows/build_and_test_with_sanitizers.yml +++ b/.github/workflows/build_and_test_with_sanitizers.yml @@ -27,7 +27,7 @@ env: jobs: tag: runs-on: [ self-hosted, small ] - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned + container: ${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/base:pinned outputs: build-tag: ${{steps.build-tag.outputs.tag}} diff --git a/.github/workflows/force-test-extensions-upgrade.yml b/.github/workflows/force-test-extensions-upgrade.yml new file mode 100644 index 0000000000..71c5158ef6 --- /dev/null +++ b/.github/workflows/force-test-extensions-upgrade.yml @@ -0,0 +1,76 @@ +name: Force Test Upgrading of Extension +on: + schedule: + # * is a special character 
in YAML so you have to quote this string + # ┌───────────── minute (0 - 59) + # │ ┌───────────── hour (0 - 23) + # │ │ ┌───────────── day of the month (1 - 31) + # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) + # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) + - cron: '45 2 * * *' # run once a day, timezone is utc + workflow_dispatch: # adds ability to run this manually + +defaults: + run: + shell: bash -euxo pipefail {0} + +concurrency: + # Allow only one workflow + group: ${{ github.workflow }} + cancel-in-progress: true + +permissions: + id-token: write # aws-actions/configure-aws-credentials + statuses: write + contents: read + +jobs: + regress: + strategy: + fail-fast: false + matrix: + pg-version: [16, 17] + + runs-on: small + + steps: + - uses: actions/checkout@v4 + with: + submodules: false + + - name: Get the last compute release tag + id: get-last-compute-release-tag + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + tag=$(gh api -q '[.[].tag_name | select(startswith("release-compute"))][0]'\ + -H "Accept: application/vnd.github+json" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + "/repos/${GITHUB_REPOSITORY}/releases") + echo tag=${tag} >> ${GITHUB_OUTPUT} + + - name: Test extension upgrade + timeout-minutes: 20 + env: + NEWTAG: latest + OLDTAG: ${{ steps.get-last-compute-release-tag.outputs.tag }} + PG_VERSION: ${{ matrix.pg-version }} + FORCE_ALL_UPGRADE_TESTS: true + run: ./docker-compose/test_extensions_upgrade.sh + + - name: Print logs and clean up + if: always() + run: | + docker compose --profile test-extensions -f ./docker-compose/docker-compose.yml logs || true + docker compose --profile test-extensions -f ./docker-compose/docker-compose.yml down + + - name: Post to the Slack channel + if: ${{ github.event.schedule && failure() }} + uses: slackapi/slack-github-action@v1 + with: + channel-id: ${{ vars.SLACK_ON_CALL_QA_STAGING_STREAM }} + slack-message: | + Test upgrading of extensions: ${{ job.status }} + <${{ 
github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} diff --git a/.github/workflows/ingest_benchmark.yml b/.github/workflows/ingest_benchmark.yml index 7b303fa37a..c20c5890f9 100644 --- a/.github/workflows/ingest_benchmark.yml +++ b/.github/workflows/ingest_benchmark.yml @@ -32,18 +32,27 @@ jobs: - target_project: new_empty_project_stripe_size_2048 stripe_size: 2048 # 16 MiB postgres_version: 16 + disable_sharding: false - target_project: new_empty_project_stripe_size_32768 stripe_size: 32768 # 256 MiB # note that this is different from null because using null will shard_split the project only if it reaches the threshold # while here it is sharded from the beginning with a shard size of 256 MiB + disable_sharding: false postgres_version: 16 - target_project: new_empty_project stripe_size: null # run with neon defaults which will shard split only when reaching the threshold + disable_sharding: false postgres_version: 16 - target_project: new_empty_project stripe_size: null # run with neon defaults which will shard split only when reaching the threshold + disable_sharding: false postgres_version: 17 - target_project: large_existing_project stripe_size: null # cannot re-shared or choose different stripe size for existing, already sharded project + disable_sharding: false + postgres_version: 16 + - target_project: new_empty_project_unsharded + stripe_size: null # run with neon defaults which will shard split only when reaching the threshold + disable_sharding: true postgres_version: 16 max-parallel: 1 # we want to run each stripe size sequentially to be able to compare the results permissions: @@ -96,6 +105,7 @@ jobs: admin_api_key: ${{ secrets.NEON_STAGING_ADMIN_API_KEY }} shard_count: 8 stripe_size: ${{ matrix.stripe_size }} + disable_sharding: ${{ matrix.disable_sharding }} - name: Initialize Neon project if: ${{ startsWith(matrix.target_project, 'new_empty_project') }} 
diff --git a/.github/workflows/pin-build-tools-image.yml b/.github/workflows/pin-build-tools-image.yml index 626de2b0e0..b305b662ee 100644 --- a/.github/workflows/pin-build-tools-image.yml +++ b/.github/workflows/pin-build-tools-image.yml @@ -33,10 +33,6 @@ concurrency: # No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. permissions: {} -env: - FROM_TAG: ${{ inputs.from-tag }} - TO_TAG: pinned - jobs: check-manifests: runs-on: ubuntu-22.04 @@ -46,11 +42,14 @@ jobs: steps: - name: Check if we really need to pin the image id: check-manifests + env: + FROM_TAG: ${{ inputs.from-tag }} + TO_TAG: pinned run: | - docker manifest inspect neondatabase/build-tools:${FROM_TAG} > ${FROM_TAG}.json - docker manifest inspect neondatabase/build-tools:${TO_TAG} > ${TO_TAG}.json + docker manifest inspect "docker.io/neondatabase/build-tools:${FROM_TAG}" > "${FROM_TAG}.json" + docker manifest inspect "docker.io/neondatabase/build-tools:${TO_TAG}" > "${TO_TAG}.json" - if diff ${FROM_TAG}.json ${TO_TAG}.json; then + if diff "${FROM_TAG}.json" "${TO_TAG}.json"; then skip=true else skip=false @@ -64,55 +63,34 @@ jobs: # use format(..) 
to catch both inputs.force = true AND inputs.force = 'true' if: needs.check-manifests.outputs.skip == 'false' || format('{0}', inputs.force) == 'true' - runs-on: ubuntu-22.04 - permissions: - id-token: write # for `azure/login` and aws auth + id-token: write # Required for aws/azure login - steps: - - uses: docker/login-action@v3 - with: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - - - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@v4 - with: - aws-region: eu-central-1 - role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - role-duration-seconds: 3600 - - - name: Login to Amazon Dev ECR - uses: aws-actions/amazon-ecr-login@v2 - - - name: Azure login - uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # @v2.1.1 - with: - client-id: ${{ secrets.AZURE_DEV_CLIENT_ID }} - tenant-id: ${{ secrets.AZURE_TENANT_ID }} - subscription-id: ${{ secrets.AZURE_DEV_SUBSCRIPTION_ID }} - - - name: Login to ACR - run: | - az acr login --name=neoneastus2 - - - name: Tag build-tools with `${{ env.TO_TAG }}` in Docker Hub, ECR, and ACR - env: - DEFAULT_DEBIAN_VERSION: bookworm - run: | - for debian_version in bullseye bookworm; do - tags=() - - tags+=("-t" "neondatabase/build-tools:${TO_TAG}-${debian_version}") - tags+=("-t" "369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${TO_TAG}-${debian_version}") - tags+=("-t" "neoneastus2.azurecr.io/neondatabase/build-tools:${TO_TAG}-${debian_version}") - - if [ "${debian_version}" == "${DEFAULT_DEBIAN_VERSION}" ]; then - tags+=("-t" "neondatabase/build-tools:${TO_TAG}") - tags+=("-t" "369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${TO_TAG}") - tags+=("-t" "neoneastus2.azurecr.io/neondatabase/build-tools:${TO_TAG}") - fi - - docker buildx imagetools create "${tags[@]}" \ - neondatabase/build-tools:${FROM_TAG}-${debian_version} - done + uses: ./.github/workflows/_push-to-container-registry.yml + with: + image-map: | + { + 
"docker.io/neondatabase/build-tools:${{ inputs.from-tag }}-bullseye": [ + "docker.io/neondatabase/build-tools:pinned-bullseye", + "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/build-tools:pinned-bullseye", + "${{ vars.AZURE_DEV_REGISTRY_NAME }}.azurecr.io/neondatabase/build-tools:pinned-bullseye" + ], + "docker.io/neondatabase/build-tools:${{ inputs.from-tag }}-bookworm": [ + "docker.io/neondatabase/build-tools:pinned-bookworm", + "docker.io/neondatabase/build-tools:pinned", + "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/build-tools:pinned-bookworm", + "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/build-tools:pinned", + "${{ vars.AZURE_DEV_REGISTRY_NAME }}.azurecr.io/neondatabase/build-tools:pinned-bookworm", + "${{ vars.AZURE_DEV_REGISTRY_NAME }}.azurecr.io/neondatabase/build-tools:pinned" + ] + } + aws-region: ${{ vars.AWS_ECR_REGION }} + aws-account-ids: "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}" + azure-client-id: ${{ vars.AZURE_DEV_CLIENT_ID }} + azure-subscription-id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }} + azure-tenant-id: ${{ vars.AZURE_TENANT_ID }} + acr-registry-name: ${{ vars.AZURE_DEV_REGISTRY_NAME }} + secrets: + aws-role-to-assume: "${{ vars.DEV_AWS_OIDC_ROLE_ARN }}" + docker-hub-username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} diff --git a/.github/workflows/regenerate-pg-setting.yml b/.github/workflows/regenerate-pg-setting.yml new file mode 100644 index 0000000000..1e9d2ec5e2 --- /dev/null +++ b/.github/workflows/regenerate-pg-setting.yml @@ -0,0 +1,41 @@ +name: Regenerate Postgres Settings + +on: + pull_request: + types: + - opened + - synchronize + - reopened + paths: + - pgxn/neon/**.c + - vendor/postgres-v* + - vendor/revisions.json + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref }} + cancel-in-progress: true + +permissions: + pull-requests: write + 
+jobs: + regenerate-pg-settings: + runs-on: ubuntu-22.04 + + steps: + - name: Add comment + uses: thollander/actions-comment-pull-request@v3 + with: + comment-tag: ${{ github.job }} + pr-number: ${{ github.event.number }} + message: | + If this PR added a GUC in the Postgres fork or `neon` extension, + please regenerate the Postgres settings in the `cloud` repo: + + ``` + make NEON_WORKDIR=path/to/neon/checkout \ + -C goapp/internal/shareddomain/postgres generate + ``` + + If you're an external contributor, a Neon employee will assist in + making sure this step is done. diff --git a/.github/workflows/trigger-e2e-tests.yml b/.github/workflows/trigger-e2e-tests.yml index 27ed1e4cff..be6a7a7901 100644 --- a/.github/workflows/trigger-e2e-tests.yml +++ b/.github/workflows/trigger-e2e-tests.yml @@ -88,7 +88,7 @@ jobs: BUILD_AND_TEST_RUN_ID=${TAG} while true; do gh run --repo ${GITHUB_REPOSITORY} view ${BUILD_AND_TEST_RUN_ID} --json jobs --jq '[.jobs[] | select((.name | startswith("push-neon-image-dev")) or (.name | startswith("push-compute-image-dev"))) | {"name": .name, "conclusion": .conclusion, "url": .url}]' > jobs.json - if [ $(jq '[.[] | select(.conclusion == "success")]' jobs.json) -eq 2 ]; then + if [ $(jq '[.[] | select(.conclusion == "success")] | length' jobs.json) -eq 2 ]; then break fi jq -c '.[]' jobs.json | while read -r job; do diff --git a/Cargo.lock b/Cargo.lock index 407c8170bb..12232eaece 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -786,7 +786,7 @@ dependencies = [ [[package]] name = "azure_core" version = "0.21.0" -source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#c36ed4c039bb3d59b5a1705f2cc337636c73b541" +source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#f64bd57262ced51afce5d8909c06dcb11a6dd85a" dependencies = [ "async-trait", "base64 0.22.1", @@ -815,7 +815,7 @@ dependencies = [ [[package]] name = "azure_identity" version = "0.21.0" -source = 
"git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#c36ed4c039bb3d59b5a1705f2cc337636c73b541" +source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#f64bd57262ced51afce5d8909c06dcb11a6dd85a" dependencies = [ "async-lock", "async-trait", @@ -834,7 +834,7 @@ dependencies = [ [[package]] name = "azure_storage" version = "0.21.0" -source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#c36ed4c039bb3d59b5a1705f2cc337636c73b541" +source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#f64bd57262ced51afce5d8909c06dcb11a6dd85a" dependencies = [ "RustyXML", "async-lock", @@ -852,7 +852,7 @@ dependencies = [ [[package]] name = "azure_storage_blobs" version = "0.21.0" -source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#c36ed4c039bb3d59b5a1705f2cc337636c73b541" +source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#f64bd57262ced51afce5d8909c06dcb11a6dd85a" dependencies = [ "RustyXML", "azure_core", @@ -872,7 +872,7 @@ dependencies = [ [[package]] name = "azure_svc_blobstorage" version = "0.21.0" -source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#c36ed4c039bb3d59b5a1705f2cc337636c73b541" +source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#f64bd57262ced51afce5d8909c06dcb11a6dd85a" dependencies = [ "azure_core", "bytes", @@ -1029,12 +1029,6 @@ dependencies = [ "generic-array", ] -[[package]] -name = "boxcar" -version = "0.2.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2721c3c5a6f0e7f7e607125d963fedeb765f545f67adc9d71ed934693881eb42" - [[package]] name = "bstr" version = "1.5.0" @@ -1293,6 +1287,7 @@ version = "0.1.0" dependencies = [ "anyhow", "chrono", + "jsonwebtoken", "regex", "remote_storage", "serde", @@ -1308,6 +1303,7 @@ dependencies = [ "aws-config", "aws-sdk-kms", "aws-sdk-s3", + "aws-smithy-types", "axum", "base64 0.13.1", 
"bytes", @@ -1329,7 +1325,6 @@ dependencies = [ "opentelemetry_sdk", "postgres", "postgres_initdb", - "prometheus", "regex", "remote_storage", "reqwest", @@ -1348,13 +1343,13 @@ dependencies = [ "tower 0.5.2", "tower-http", "tracing", - "tracing-opentelemetry", "tracing-subscriber", "tracing-utils", "url", "utils", "uuid", "vm_monitor", + "walkdir", "workspace_hack", "zstd", ] @@ -4927,7 +4922,6 @@ dependencies = [ "aws-sdk-iam", "aws-sigv4", "base64 0.13.1", - "boxcar", "bstr", "bytes", "camino", @@ -4979,7 +4973,6 @@ dependencies = [ "postgres-protocol2", "postgres_backend", "pq_proto", - "prometheus", "rand 0.8.5", "rand_distr", "rcgen", @@ -5004,7 +4997,6 @@ dependencies = [ "smallvec", "smol_str", "socket2", - "strum", "strum_macros", "subtle", "thiserror 1.0.69", @@ -5019,7 +5011,6 @@ dependencies = [ "tracing", "tracing-log", "tracing-opentelemetry", - "tracing-serde", "tracing-subscriber", "tracing-utils", "try-lock", @@ -6460,10 +6451,13 @@ dependencies = [ "pageserver_client", "postgres_connection", "rand 0.8.5", + "regex", "reqwest", "routerify", "rustls 0.23.18", "rustls-native-certs 0.8.0", + "safekeeper_api", + "safekeeper_client", "scoped-futures", "scopeguard", "serde", @@ -6471,6 +6465,7 @@ dependencies = [ "strum", "strum_macros", "thiserror 1.0.69", + "tikv-jemallocator", "tokio", "tokio-postgres", "tokio-postgres-rustls", @@ -7024,14 +7019,11 @@ dependencies = [ name = "tokio-postgres2" version = "0.1.0" dependencies = [ - "async-trait", - "byteorder", "bytes", "fallible-iterator", "futures-util", "log", "parking_lot 0.12.1", - "percent-encoding", "phf", "pin-project-lite", "postgres-protocol2", @@ -7618,7 +7610,6 @@ dependencies = [ "hex", "hex-literal", "humantime", - "inferno 0.12.0", "jsonwebtoken", "metrics", "nix 0.27.1", diff --git a/Dockerfile b/Dockerfile index b399bcf7e4..83ad86badb 100644 --- a/Dockerfile +++ b/Dockerfile @@ -50,6 +50,14 @@ RUN set -e \ && rm -rf pg_install/build \ && tar -C pg_install -czf 
/home/nonroot/postgres_install.tar.gz . +# Prepare cargo-chef recipe +FROM $REPOSITORY/$IMAGE:$TAG AS plan +WORKDIR /home/nonroot + +COPY --chown=nonroot . . + +RUN cargo chef prepare --recipe-path recipe.json + # Build neon binaries FROM $REPOSITORY/$IMAGE:$TAG AS build WORKDIR /home/nonroot @@ -63,9 +71,15 @@ COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_i COPY --from=pg-build /home/nonroot/pg_install/v17/include/postgresql/server pg_install/v17/include/postgresql/server COPY --from=pg-build /home/nonroot/pg_install/v16/lib pg_install/v16/lib COPY --from=pg-build /home/nonroot/pg_install/v17/lib pg_install/v17/lib +COPY --from=plan /home/nonroot/recipe.json recipe.json + +ARG ADDITIONAL_RUSTFLAGS="" + +RUN set -e \ + && RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment -Cforce-frame-pointers=yes ${ADDITIONAL_RUSTFLAGS}" cargo chef cook --locked --release --recipe-path recipe.json + COPY --chown=nonroot . . -ARG ADDITIONAL_RUSTFLAGS RUN set -e \ && RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment -Cforce-frame-pointers=yes ${ADDITIONAL_RUSTFLAGS}" cargo build \ --bin pg_sni_router \ diff --git a/build-tools.Dockerfile b/build-tools.Dockerfile index fa72ca1bc2..317eded26e 100644 --- a/build-tools.Dockerfile +++ b/build-tools.Dockerfile @@ -300,6 +300,7 @@ ARG CARGO_HAKARI_VERSION=0.9.33 ARG CARGO_DENY_VERSION=0.16.2 ARG CARGO_HACK_VERSION=0.6.33 ARG CARGO_NEXTEST_VERSION=0.9.85 +ARG CARGO_CHEF_VERSION=0.1.71 ARG CARGO_DIESEL_CLI_VERSION=2.2.6 RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \ chmod +x rustup-init && \ @@ -314,6 +315,7 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux cargo install cargo-deny --locked --version ${CARGO_DENY_VERSION} && \ cargo install cargo-hack --version ${CARGO_HACK_VERSION} && \ cargo install cargo-nextest --version ${CARGO_NEXTEST_VERSION} 
&& \ + cargo install cargo-chef --locked --version ${CARGO_CHEF_VERSION} && \ cargo install diesel_cli --version ${CARGO_DIESEL_CLI_VERSION} \ --features postgres-bundled --no-default-features && \ rm -rf /home/nonroot/.cargo/registry && \ diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 6814aadcb9..0b3001613d 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -148,7 +148,7 @@ RUN case $DEBIAN_VERSION in \ apt install --no-install-recommends --no-install-suggests -y \ ninja-build git autoconf automake libtool build-essential bison flex libreadline-dev \ zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget ca-certificates pkg-config libssl-dev \ - libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd curl unzip \ + libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd curl unzip g++ \ $VERSION_INSTALLS \ && apt clean && rm -rf /var/lib/apt/lists/* @@ -1464,6 +1464,31 @@ RUN make release -j $(getconf _NPROCESSORS_ONLN) && \ make install -j $(getconf _NPROCESSORS_ONLN) && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_mooncake.control +######################################################################################### +# +# Layer "pg-duckdb-pg-build" +# compile pg_duckdb extension +# +######################################################################################### +FROM build-deps AS pg_duckdb-src +WORKDIR /ext-src +COPY compute/patches/pg_duckdb_v031.patch . 
+# pg_duckdb build requires source dir to be a git repo to get submodules +# allow neon_superuser to execute some functions that in pg_duckdb are available to superuser only: +# - extension management function duckdb.install_extension() +# - access to duckdb.extensions table and its sequence +RUN git clone --depth 1 --branch v0.3.1 https://github.com/duckdb/pg_duckdb.git pg_duckdb-src && \ + cd pg_duckdb-src && \ + git submodule update --init --recursive && \ + patch -p1 < /ext-src/pg_duckdb_v031.patch + +FROM pg-build AS pg_duckdb-build +ARG PG_VERSION +COPY --from=pg_duckdb-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_duckdb-src +RUN make install -j $(getconf _NPROCESSORS_ONLN) && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_duckdb.control + ######################################################################################### # # Layer "pg_repack" @@ -1484,6 +1509,73 @@ WORKDIR /ext-src/pg_repack-src RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install + +######################################################################################### +# +# Layer "pgaudit" +# compile pgaudit extension +# +######################################################################################### + +FROM build-deps AS pgaudit-src +ARG PG_VERSION +WORKDIR /ext-src +RUN case "${PG_VERSION}" in \ + "v14") \ + export PGAUDIT_VERSION=1.6.2 \ + export PGAUDIT_CHECKSUM=1f350d70a0cbf488c0f2b485e3a5c9b11f78ad9e3cbb95ef6904afa1eb3187eb \ + ;; \ + "v15") \ + export PGAUDIT_VERSION=1.7.0 \ + export PGAUDIT_CHECKSUM=8f4a73e451c88c567e516e6cba7dc1e23bc91686bb6f1f77f8f3126d428a8bd8 \ + ;; \ + "v16") \ + export PGAUDIT_VERSION=16.0 \ + export PGAUDIT_CHECKSUM=d53ef985f2d0b15ba25c512c4ce967dce07b94fd4422c95bd04c4c1a055fe738 \ + ;; \ + "v17") \ + export PGAUDIT_VERSION=17.0 \ + export PGAUDIT_CHECKSUM=7d0d08d030275d525f36cd48b38c6455f1023da863385badff0cec44965bfd8c \ + ;; \ + *) \ + echo "pgaudit is not supported on this 
PostgreSQL version" && exit 1;; \ + esac && \ + wget https://github.com/pgaudit/pgaudit/archive/refs/tags/${PGAUDIT_VERSION}.tar.gz -O pgaudit.tar.gz && \ + echo "${PGAUDIT_CHECKSUM} pgaudit.tar.gz" | sha256sum --check && \ + mkdir pgaudit-src && cd pgaudit-src && tar xzf ../pgaudit.tar.gz --strip-components=1 -C . + +FROM pg-build AS pgaudit-build +COPY --from=pgaudit-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pgaudit-src +RUN make install USE_PGXS=1 -j $(getconf _NPROCESSORS_ONLN) + +######################################################################################### +# +# Layer "pgauditlogtofile" +# compile pgauditlogtofile extension +# +######################################################################################### + +FROM build-deps AS pgauditlogtofile-src +ARG PG_VERSION +WORKDIR /ext-src +RUN case "${PG_VERSION}" in \ + "v14" | "v15" | "v16" | "v17") \ + export PGAUDITLOGTOFILE_VERSION=v1.6.4 \ + export PGAUDITLOGTOFILE_CHECKSUM=ef801eb09c26aaa935c0dabd92c81eb9ebe338930daa9674d420a280c6bc2d70 \ + ;; \ + *) \ + echo "pgauditlogtofile is not supported on this PostgreSQL version" && exit 1;; \ + esac && \ + wget https://github.com/fmbiete/pgauditlogtofile/archive/refs/tags/${PGAUDITLOGTOFILE_VERSION}.tar.gz -O pgauditlogtofile.tar.gz && \ + echo "${PGAUDITLOGTOFILE_CHECKSUM} pgauditlogtofile.tar.gz" | sha256sum --check && \ + mkdir pgauditlogtofile-src && cd pgauditlogtofile-src && tar xzf ../pgauditlogtofile.tar.gz --strip-components=1 -C . 
+ +FROM pg-build AS pgauditlogtofile-build +COPY --from=pgauditlogtofile-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pgauditlogtofile-src +RUN make install USE_PGXS=1 -j $(getconf _NPROCESSORS_ONLN) + ######################################################################################### # # Layer "neon-ext-build" @@ -1577,7 +1669,10 @@ COPY --from=pg_anon-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_ivm-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_partman-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_mooncake-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_duckdb-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_repack-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pgaudit-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pgauditlogtofile-build /usr/local/pgsql/ /usr/local/pgsql/ ######################################################################################### # @@ -1669,29 +1764,6 @@ RUN if [ "$TARGETARCH" = "amd64" ]; then\ && echo "${pgbouncer_exporter_sha256} pgbouncer_exporter" | sha256sum -c -\ && echo "${sql_exporter_sha256} sql_exporter" | sha256sum -c - -######################################################################################### -# -# Layer "awscli" -# -######################################################################################### -FROM build-deps AS awscli -ARG TARGETARCH -RUN set -ex; \ - if [ "${TARGETARCH}" = "amd64" ]; then \ - TARGETARCH_ALT="x86_64"; \ - CHECKSUM="c9a9df3770a3ff9259cb469b6179e02829687a464e0824d5c32d378820b53a00"; \ - elif [ "${TARGETARCH}" = "arm64" ]; then \ - TARGETARCH_ALT="aarch64"; \ - CHECKSUM="8181730be7891582b38b028112e81b4899ca817e8c616aad807c9e9d1289223a"; \ - else \ - echo "Unsupported architecture: ${TARGETARCH}"; exit 1; \ - fi; \ - curl --retry 5 -L "https://awscli.amazonaws.com/awscli-exe-linux-${TARGETARCH_ALT}-2.17.5.zip" -o /tmp/awscliv2.zip; \ - echo "${CHECKSUM} /tmp/awscliv2.zip" | sha256sum -c -; \ - unzip 
/tmp/awscliv2.zip -d /tmp/awscliv2; \ - /tmp/awscliv2/aws/install; \ - rm -rf /tmp/awscliv2.zip /tmp/awscliv2 - ######################################################################################### # # Clean up postgres folder before inclusion @@ -1750,7 +1822,7 @@ COPY --from=pg_graphql-src /ext-src/ /ext-src/ COPY --from=hypopg-src /ext-src/ /ext-src/ COPY --from=pg_hashids-src /ext-src/ /ext-src/ COPY --from=rum-src /ext-src/ /ext-src/ -#COPY --from=pgtap-src /ext-src/ /ext-src/ +COPY --from=pgtap-src /ext-src/ /ext-src/ COPY --from=ip4r-src /ext-src/ /ext-src/ COPY --from=prefix-src /ext-src/ /ext-src/ COPY --from=hll-src /ext-src/ /ext-src/ @@ -1775,11 +1847,14 @@ COPY --from=pg_partman-src /ext-src/ /ext-src/ #COPY --from=pg_repack-src /ext-src/ /ext-src/ COPY --chmod=755 docker-compose/run-tests.sh /run-tests.sh +RUN apt-get update && apt-get install -y libtap-parser-sourcehandler-pgtap-perl\ + && apt clean && rm -rf /ext-src/*.tar.gz /var/lib/apt/lists/* ENV PATH=/usr/local/pgsql/bin:$PATH ENV PGHOST=compute ENV PGPORT=55433 ENV PGUSER=cloud_admin ENV PGDATABASE=postgres +ENV PG_VERSION=${PG_VERSION:?} ######################################################################################### # @@ -1861,9 +1936,6 @@ RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ mkdir /usr/local/download_extensions && \ chown -R postgres:postgres /usr/local/download_extensions -# aws cli is used by fast_import -COPY --from=awscli /usr/local/aws-cli /usr/local/aws-cli - # pgbouncer and its config COPY --from=pgbouncer /usr/local/pgbouncer/bin/pgbouncer /usr/local/bin/pgbouncer COPY --chmod=0666 --chown=postgres compute/etc/pgbouncer.ini /etc/pgbouncer.ini diff --git a/compute/patches/pg_duckdb_v031.patch b/compute/patches/pg_duckdb_v031.patch new file mode 100644 index 0000000000..a7e188d69e --- /dev/null +++ b/compute/patches/pg_duckdb_v031.patch @@ -0,0 +1,11 @@ +diff --git a/sql/pg_duckdb--0.2.0--0.3.0.sql b/sql/pg_duckdb--0.2.0--0.3.0.sql +index 
d777d76..af60106 100644 +--- a/sql/pg_duckdb--0.2.0--0.3.0.sql ++++ b/sql/pg_duckdb--0.2.0--0.3.0.sql +@@ -1056,3 +1056,6 @@ GRANT ALL ON FUNCTION duckdb.cache(TEXT, TEXT) TO PUBLIC; + GRANT ALL ON FUNCTION duckdb.cache_info() TO PUBLIC; + GRANT ALL ON FUNCTION duckdb.cache_delete(TEXT) TO PUBLIC; + GRANT ALL ON PROCEDURE duckdb.recycle_ddb() TO PUBLIC; ++GRANT ALL ON FUNCTION duckdb.install_extension(TEXT) TO neon_superuser; ++GRANT ALL ON TABLE duckdb.extensions TO neon_superuser; ++GRANT ALL ON SEQUENCE duckdb.extensions_table_seq TO neon_superuser; diff --git a/compute/vm-image-spec-bullseye.yaml b/compute/vm-image-spec-bullseye.yaml index 124c40cf5d..6617c98599 100644 --- a/compute/vm-image-spec-bullseye.yaml +++ b/compute/vm-image-spec-bullseye.yaml @@ -47,7 +47,9 @@ files: # Allow postgres user (which is what compute_ctl runs as) to run /neonvm/bin/resize-swap # and /neonvm/bin/set-disk-quota as root without requiring entering a password (NOPASSWD), # regardless of hostname (ALL) - postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota + # + # Also allow it to shut down the VM. The fast_import job does that when it's finished. 
+ postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota, /neonvm/bin/poweroff - filename: cgconfig.conf content: | # Configuration for cgroups in VM compute nodes diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index b04f364cbb..c276996df5 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -14,6 +14,7 @@ base64.workspace = true aws-config.workspace = true aws-sdk-s3.workspace = true aws-sdk-kms.workspace = true +aws-smithy-types.workspace = true anyhow.workspace = true axum = { workspace = true, features = [] } camino.workspace = true @@ -46,13 +47,12 @@ tokio-postgres.workspace = true tokio-util.workspace = true tokio-stream.workspace = true tracing.workspace = true -tracing-opentelemetry.workspace = true tracing-subscriber.workspace = true tracing-utils.workspace = true thiserror.workspace = true url.workspace = true uuid.workspace = true -prometheus.workspace = true +walkdir.workspace = true postgres_initdb.workspace = true compute_api.workspace = true diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index df47adda6c..a8803ec793 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -55,7 +55,7 @@ use signal_hook::{consts::SIGINT, iterator::Signals}; use tracing::{error, info, warn}; use url::Url; -use compute_api::responses::ComputeStatus; +use compute_api::responses::{ComputeCtlConfig, ComputeStatus}; use compute_api::spec::ComputeSpec; use compute_tools::compute::{ @@ -281,6 +281,7 @@ fn try_spec_from_cli(cli: &Cli) -> Result { info!("got spec from cli argument {}", spec_json); return Ok(CliSpecParams { spec: Some(serde_json::from_str(spec_json)?), + compute_ctl_config: ComputeCtlConfig::default(), live_config_allowed: false, }); } @@ -290,6 +291,7 @@ fn try_spec_from_cli(cli: &Cli) -> Result { let file = File::open(Path::new(spec_path))?; return Ok(CliSpecParams { spec: Some(serde_json::from_reader(file)?), + 
compute_ctl_config: ComputeCtlConfig::default(), live_config_allowed: true, }); } @@ -299,8 +301,9 @@ fn try_spec_from_cli(cli: &Cli) -> Result { }; match get_spec_from_control_plane(cli.control_plane_uri.as_ref().unwrap(), &cli.compute_id) { - Ok(spec) => Ok(CliSpecParams { - spec, + Ok(resp) => Ok(CliSpecParams { + spec: resp.0, + compute_ctl_config: resp.1, live_config_allowed: true, }), Err(e) => { @@ -317,6 +320,8 @@ fn try_spec_from_cli(cli: &Cli) -> Result { struct CliSpecParams { /// If a spec was provided via CLI or file, the [`ComputeSpec`] spec: Option, + #[allow(dead_code)] + compute_ctl_config: ComputeCtlConfig, live_config_allowed: bool, } @@ -326,6 +331,7 @@ fn wait_spec( CliSpecParams { spec, live_config_allowed, + compute_ctl_config: _, }: CliSpecParams, ) -> Result> { let mut new_state = ComputeState::new(); diff --git a/compute_tools/src/bin/fast_import.rs b/compute_tools/src/bin/fast_import.rs index 27cf1c2317..585f3e4e1d 100644 --- a/compute_tools/src/bin/fast_import.rs +++ b/compute_tools/src/bin/fast_import.rs @@ -25,10 +25,10 @@ //! docker push localhost:3030/localregistry/compute-node-v14:latest //! ``` -use anyhow::Context; +use anyhow::{bail, Context}; use aws_config::BehaviorVersion; use camino::{Utf8Path, Utf8PathBuf}; -use clap::Parser; +use clap::{Parser, Subcommand}; use compute_tools::extension_server::{get_pg_version, PostgresMajorVersion}; use nix::unistd::Pid; use tracing::{error, info, info_span, warn, Instrument}; @@ -44,32 +44,59 @@ mod s3_uri; const PG_WAIT_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(600); const PG_WAIT_RETRY_INTERVAL: std::time::Duration = std::time::Duration::from_millis(300); +#[derive(Subcommand, Debug)] +enum Command { + /// Runs local postgres (neon binary), restores into it, + /// uploads pgdata to s3 to be consumed by pageservers + Pgdata { + /// Raw connection string to the source database. 
Used only in tests, + /// real scenario uses encrypted connection string in spec.json from s3. + #[clap(long)] + source_connection_string: Option, + /// If specified, will not shut down the local postgres after the import. Used in local testing + #[clap(short, long)] + interactive: bool, + /// Port to run postgres on. Default is 5432. + #[clap(long, default_value_t = 5432)] + pg_port: u16, // port to run postgres on, 5432 is default + + /// Number of CPUs in the system. This is used to configure # of + /// parallel worker processes, for index creation. + #[clap(long, env = "NEON_IMPORTER_NUM_CPUS")] + num_cpus: Option, + + /// Amount of RAM in the system. This is used to configure shared_buffers + /// and maintenance_work_mem. + #[clap(long, env = "NEON_IMPORTER_MEMORY_MB")] + memory_mb: Option, + }, + + /// Runs pg_dump-pg_restore from source to destination without running local postgres. + DumpRestore { + /// Raw connection string to the source database. Used only in tests, + /// real scenario uses encrypted connection string in spec.json from s3. + #[clap(long)] + source_connection_string: Option, + /// Raw connection string to the destination database. Used only in tests, + /// real scenario uses encrypted connection string in spec.json from s3. + #[clap(long)] + destination_connection_string: Option, + }, +} + #[derive(clap::Parser)] struct Args { - #[clap(long)] + #[clap(long, env = "NEON_IMPORTER_WORKDIR")] working_directory: Utf8PathBuf, #[clap(long, env = "NEON_IMPORTER_S3_PREFIX")] s3_prefix: Option, - #[clap(long)] - source_connection_string: Option, - #[clap(short, long)] - interactive: bool, - #[clap(long)] + #[clap(long, env = "NEON_IMPORTER_PG_BIN_DIR")] pg_bin_dir: Utf8PathBuf, - #[clap(long)] + #[clap(long, env = "NEON_IMPORTER_PG_LIB_DIR")] pg_lib_dir: Utf8PathBuf, - #[clap(long)] - pg_port: Option, // port to run postgres on, 5432 is default - /// Number of CPUs in the system. 
This is used to configure # of - /// parallel worker processes, for index creation. - #[clap(long, env = "NEON_IMPORTER_NUM_CPUS")] - num_cpus: Option, - - /// Amount of RAM in the system. This is used to configure shared_buffers - /// and maintenance_work_mem. - #[clap(long, env = "NEON_IMPORTER_MEMORY_MB")] - memory_mb: Option, + #[clap(subcommand)] + command: Command, } #[serde_with::serde_as] @@ -78,6 +105,8 @@ struct Spec { encryption_secret: EncryptionSecret, #[serde_as(as = "serde_with::base64::Base64")] source_connstring_ciphertext_base64: Vec, + #[serde_as(as = "Option")] + destination_connstring_ciphertext_base64: Option>, } #[derive(serde::Deserialize)] @@ -93,192 +122,150 @@ const DEFAULT_LOCALE: &str = if cfg!(target_os = "macos") { "C.UTF-8" }; -#[tokio::main] -pub(crate) async fn main() -> anyhow::Result<()> { - utils::logging::init( - utils::logging::LogFormat::Plain, - utils::logging::TracingErrorLayerEnablement::EnableWithRustLogFilter, - utils::logging::Output::Stdout, - )?; - - info!("starting"); - - let args = Args::parse(); - - // Validate arguments - if args.s3_prefix.is_none() && args.source_connection_string.is_none() { - anyhow::bail!("either s3_prefix or source_connection_string must be specified"); - } - if args.s3_prefix.is_some() && args.source_connection_string.is_some() { - anyhow::bail!("only one of s3_prefix or source_connection_string can be specified"); - } - - let working_directory = args.working_directory; - let pg_bin_dir = args.pg_bin_dir; - let pg_lib_dir = args.pg_lib_dir; - let pg_port = args.pg_port.unwrap_or_else(|| { - info!("pg_port not specified, using default 5432"); - 5432 - }); - - // Initialize AWS clients only if s3_prefix is specified - let (aws_config, kms_client) = if args.s3_prefix.is_some() { - let config = aws_config::load_defaults(BehaviorVersion::v2024_03_28()).await; - let kms = aws_sdk_kms::Client::new(&config); - (Some(config), Some(kms)) - } else { - (None, None) - }; - - // Get source connection 
string either from S3 spec or direct argument - let source_connection_string = if let Some(s3_prefix) = &args.s3_prefix { - let spec: Spec = { - let spec_key = s3_prefix.append("/spec.json"); - let s3_client = aws_sdk_s3::Client::new(aws_config.as_ref().unwrap()); - let object = s3_client - .get_object() - .bucket(&spec_key.bucket) - .key(spec_key.key) - .send() - .await - .context("get spec from s3")? - .body - .collect() - .await - .context("download spec body")?; - serde_json::from_slice(&object.into_bytes()).context("parse spec as json")? - }; - - match spec.encryption_secret { - EncryptionSecret::KMS { key_id } => { - let mut output = kms_client - .unwrap() - .decrypt() - .key_id(key_id) - .ciphertext_blob(aws_sdk_s3::primitives::Blob::new( - spec.source_connstring_ciphertext_base64, - )) - .send() - .await - .context("decrypt source connection string")?; - let plaintext = output - .plaintext - .take() - .context("get plaintext source connection string")?; - String::from_utf8(plaintext.into_inner()) - .context("parse source connection string as utf8")? - } - } - } else { - args.source_connection_string.unwrap() - }; - - match tokio::fs::create_dir(&working_directory).await { - Ok(()) => {} - Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => { - if !is_directory_empty(&working_directory) - .await - .context("check if working directory is empty")? 
- { - anyhow::bail!("working directory is not empty"); - } else { - // ok - } - } - Err(e) => return Err(anyhow::Error::new(e).context("create working directory")), - } - - let pgdata_dir = working_directory.join("pgdata"); - tokio::fs::create_dir(&pgdata_dir) +async fn decode_connstring( + kms_client: &aws_sdk_kms::Client, + key_id: &String, + connstring_ciphertext_base64: Vec, +) -> Result { + let mut output = kms_client + .decrypt() + .key_id(key_id) + .ciphertext_blob(aws_sdk_s3::primitives::Blob::new( + connstring_ciphertext_base64, + )) + .send() .await - .context("create pgdata directory")?; + .context("decrypt connection string")?; - let pgbin = pg_bin_dir.join("postgres"); - let pg_version = match get_pg_version(pgbin.as_ref()) { - PostgresMajorVersion::V14 => 14, - PostgresMajorVersion::V15 => 15, - PostgresMajorVersion::V16 => 16, - PostgresMajorVersion::V17 => 17, - }; - let superuser = "cloud_admin"; // XXX: this shouldn't be hard-coded - postgres_initdb::do_run_initdb(postgres_initdb::RunInitdbArgs { - superuser, - locale: DEFAULT_LOCALE, // XXX: this shouldn't be hard-coded, - pg_version, - initdb_bin: pg_bin_dir.join("initdb").as_ref(), - library_search_path: &pg_lib_dir, // TODO: is this right? Prob works in compute image, not sure about neon_local. - pgdata: &pgdata_dir, - }) - .await - .context("initdb")?; + let plaintext = output + .plaintext + .take() + .context("get plaintext connection string")?; - // If the caller didn't specify CPU / RAM to use for sizing, default to - // number of CPUs in the system, and pretty arbitrarily, 256 MB of RAM. - let nproc = args.num_cpus.unwrap_or_else(num_cpus::get); - let memory_mb = args.memory_mb.unwrap_or(256); + String::from_utf8(plaintext.into_inner()).context("parse connection string as utf8") +} - // Somewhat arbitrarily, use 10 % of memory for shared buffer cache, 70% for - // maintenance_work_mem (i.e. 
for sorting during index creation), and leave the rest - // available for misc other stuff that PostgreSQL uses memory for. - let shared_buffers_mb = ((memory_mb as f32) * 0.10) as usize; - let maintenance_work_mem_mb = ((memory_mb as f32) * 0.70) as usize; +struct PostgresProcess { + pgdata_dir: Utf8PathBuf, + pg_bin_dir: Utf8PathBuf, + pgbin: Utf8PathBuf, + pg_lib_dir: Utf8PathBuf, + postgres_proc: Option, +} - // - // Launch postgres process - // - let mut postgres_proc = tokio::process::Command::new(pgbin) - .arg("-D") - .arg(&pgdata_dir) - .args(["-p", &format!("{pg_port}")]) - .args(["-c", "wal_level=minimal"]) - .args(["-c", &format!("shared_buffers={shared_buffers_mb}MB")]) - .args(["-c", "max_wal_senders=0"]) - .args(["-c", "fsync=off"]) - .args(["-c", "full_page_writes=off"]) - .args(["-c", "synchronous_commit=off"]) - .args([ - "-c", - &format!("maintenance_work_mem={maintenance_work_mem_mb}MB"), - ]) - .args(["-c", &format!("max_parallel_maintenance_workers={nproc}")]) - .args(["-c", &format!("max_parallel_workers={nproc}")]) - .args(["-c", &format!("max_parallel_workers_per_gather={nproc}")]) - .args(["-c", &format!("max_worker_processes={nproc}")]) - .args([ - "-c", - &format!( - "effective_io_concurrency={}", - if cfg!(target_os = "macos") { 0 } else { 100 } - ), - ]) - .env_clear() - .env("LD_LIBRARY_PATH", &pg_lib_dir) - .env( - "ASAN_OPTIONS", - std::env::var("ASAN_OPTIONS").unwrap_or_default(), +impl PostgresProcess { + fn new(pgdata_dir: Utf8PathBuf, pg_bin_dir: Utf8PathBuf, pg_lib_dir: Utf8PathBuf) -> Self { + Self { + pgdata_dir, + pgbin: pg_bin_dir.join("postgres"), + pg_bin_dir, + pg_lib_dir, + postgres_proc: None, + } + } + + async fn prepare(&self, initdb_user: &str) -> Result<(), anyhow::Error> { + tokio::fs::create_dir(&self.pgdata_dir) + .await + .context("create pgdata directory")?; + + let pg_version = match get_pg_version(self.pgbin.as_ref()) { + PostgresMajorVersion::V14 => 14, + PostgresMajorVersion::V15 => 15, + 
PostgresMajorVersion::V16 => 16, + PostgresMajorVersion::V17 => 17, + }; + postgres_initdb::do_run_initdb(postgres_initdb::RunInitdbArgs { + superuser: initdb_user, + locale: DEFAULT_LOCALE, // XXX: this shouldn't be hard-coded, + pg_version, + initdb_bin: self.pg_bin_dir.join("initdb").as_ref(), + library_search_path: &self.pg_lib_dir, // TODO: is this right? Prob works in compute image, not sure about neon_local. + pgdata: &self.pgdata_dir, + }) + .await + .context("initdb") + } + + async fn start( + &mut self, + initdb_user: &str, + port: u16, + nproc: usize, + memory_mb: usize, + ) -> Result<&tokio::process::Child, anyhow::Error> { + self.prepare(initdb_user).await?; + + // Somewhat arbitrarily, use 10 % of memory for shared buffer cache, 70% for + // maintenance_work_mem (i.e. for sorting during index creation), and leave the rest + // available for misc other stuff that PostgreSQL uses memory for. + let shared_buffers_mb = ((memory_mb as f32) * 0.10) as usize; + let maintenance_work_mem_mb = ((memory_mb as f32) * 0.70) as usize; + + // + // Launch postgres process + // + let mut proc = tokio::process::Command::new(&self.pgbin) + .arg("-D") + .arg(&self.pgdata_dir) + .args(["-p", &format!("{port}")]) + .args(["-c", "wal_level=minimal"]) + .args(["-c", &format!("shared_buffers={shared_buffers_mb}MB")]) + .args(["-c", "max_wal_senders=0"]) + .args(["-c", "fsync=off"]) + .args(["-c", "full_page_writes=off"]) + .args(["-c", "synchronous_commit=off"]) + .args([ + "-c", + &format!("maintenance_work_mem={maintenance_work_mem_mb}MB"), + ]) + .args(["-c", &format!("max_parallel_maintenance_workers={nproc}")]) + .args(["-c", &format!("max_parallel_workers={nproc}")]) + .args(["-c", &format!("max_parallel_workers_per_gather={nproc}")]) + .args(["-c", &format!("max_worker_processes={nproc}")]) + .args(["-c", "effective_io_concurrency=100"]) + .env_clear() + .env("LD_LIBRARY_PATH", &self.pg_lib_dir) + .env( + "ASAN_OPTIONS", + 
std::env::var("ASAN_OPTIONS").unwrap_or_default(), + ) + .env( + "UBSAN_OPTIONS", + std::env::var("UBSAN_OPTIONS").unwrap_or_default(), + ) + .stdout(std::process::Stdio::piped()) + .stderr(std::process::Stdio::piped()) + .spawn() + .context("spawn postgres")?; + + info!("spawned postgres, waiting for it to become ready"); + tokio::spawn( + child_stdio_to_log::relay_process_output(proc.stdout.take(), proc.stderr.take()) + .instrument(info_span!("postgres")), + ); + + self.postgres_proc = Some(proc); + Ok(self.postgres_proc.as_ref().unwrap()) + } + + async fn shutdown(&mut self) -> Result<(), anyhow::Error> { + let proc: &mut tokio::process::Child = self.postgres_proc.as_mut().unwrap(); + info!("shutdown postgres"); + nix::sys::signal::kill( + Pid::from_raw(i32::try_from(proc.id().unwrap()).expect("convert child pid to i32")), + nix::sys::signal::SIGTERM, ) - .env( - "UBSAN_OPTIONS", - std::env::var("UBSAN_OPTIONS").unwrap_or_default(), - ) - .stdout(std::process::Stdio::piped()) - .stderr(std::process::Stdio::piped()) - .spawn() - .context("spawn postgres")?; - - info!("spawned postgres, waiting for it to become ready"); - tokio::spawn( - child_stdio_to_log::relay_process_output( - postgres_proc.stdout.take(), - postgres_proc.stderr.take(), - ) - .instrument(info_span!("postgres")), - ); + .context("signal postgres to shut down")?; + proc.wait() + .await + .context("wait for postgres to shut down") + .map(|_| ()) + } +} +async fn wait_until_ready(connstring: String, create_dbname: String) { // Create neondb database in the running postgres - let restore_pg_connstring = - format!("host=localhost port={pg_port} user={superuser} dbname=postgres"); - let start_time = std::time::Instant::now(); loop { @@ -289,7 +276,12 @@ pub(crate) async fn main() -> anyhow::Result<()> { std::process::exit(1); } - match tokio_postgres::connect(&restore_pg_connstring, tokio_postgres::NoTls).await { + match tokio_postgres::connect( + &connstring.replace("dbname=neondb", 
"dbname=postgres"), + tokio_postgres::NoTls, + ) + .await + { Ok((client, connection)) => { // Spawn the connection handling task to maintain the connection tokio::spawn(async move { @@ -298,9 +290,12 @@ pub(crate) async fn main() -> anyhow::Result<()> { } }); - match client.simple_query("CREATE DATABASE neondb;").await { + match client + .simple_query(format!("CREATE DATABASE {create_dbname};").as_str()) + .await + { Ok(_) => { - info!("created neondb database"); + info!("created {} database", create_dbname); break; } Err(e) => { @@ -324,10 +319,16 @@ pub(crate) async fn main() -> anyhow::Result<()> { } } } +} - let restore_pg_connstring = restore_pg_connstring.replace("dbname=postgres", "dbname=neondb"); - - let dumpdir = working_directory.join("dumpdir"); +async fn run_dump_restore( + workdir: Utf8PathBuf, + pg_bin_dir: Utf8PathBuf, + pg_lib_dir: Utf8PathBuf, + source_connstring: String, + destination_connstring: String, +) -> Result<(), anyhow::Error> { + let dumpdir = workdir.join("dumpdir"); let common_args = [ // schema mapping (prob suffices to specify them on one side) @@ -356,10 +357,18 @@ pub(crate) async fn main() -> anyhow::Result<()> { .arg("--no-sync") // POSITIONAL args // source db (db name included in connection string) - .arg(&source_connection_string) + .arg(&source_connstring) // how we run it .env_clear() .env("LD_LIBRARY_PATH", &pg_lib_dir) + .env( + "ASAN_OPTIONS", + std::env::var("ASAN_OPTIONS").unwrap_or_default(), + ) + .env( + "UBSAN_OPTIONS", + std::env::var("UBSAN_OPTIONS").unwrap_or_default(), + ) .kill_on_drop(true) .stdout(std::process::Stdio::piped()) .stderr(std::process::Stdio::piped()) @@ -376,24 +385,31 @@ pub(crate) async fn main() -> anyhow::Result<()> { let st = pg_dump.wait().await.context("wait for pg_dump")?; info!(status=?st, "pg_dump exited"); if !st.success() { - warn!(status=%st, "pg_dump failed, restore will likely fail as well"); + error!(status=%st, "pg_dump failed, restore will likely fail as well"); + 
bail!("pg_dump failed"); } } - // TODO: do it in a streaming way, plenty of internal research done on this already + // TODO: maybe do it in a streaming way, plenty of internal research done on this already // TODO: do the unlogged table trick - - info!("restore from working directory into vanilla postgres"); { let mut pg_restore = tokio::process::Command::new(pg_bin_dir.join("pg_restore")) .args(&common_args) .arg("-d") - .arg(&restore_pg_connstring) + .arg(&destination_connstring) // POSITIONAL args .arg(&dumpdir) // how we run it .env_clear() .env("LD_LIBRARY_PATH", &pg_lib_dir) + .env( + "ASAN_OPTIONS", + std::env::var("ASAN_OPTIONS").unwrap_or_default(), + ) + .env( + "UBSAN_OPTIONS", + std::env::var("UBSAN_OPTIONS").unwrap_or_default(), + ) .kill_on_drop(true) .stdout(std::process::Stdio::piped()) .stderr(std::process::Stdio::piped()) @@ -411,48 +427,259 @@ pub(crate) async fn main() -> anyhow::Result<()> { let st = pg_restore.wait().await.context("wait for pg_restore")?; info!(status=?st, "pg_restore exited"); if !st.success() { - warn!(status=%st, "pg_restore failed, restore will likely fail as well"); - } - } - - // If interactive mode, wait for Ctrl+C - if args.interactive { - info!("Running in interactive mode. 
Press Ctrl+C to shut down."); - tokio::signal::ctrl_c().await.context("wait for ctrl-c")?; - } - - info!("shutdown postgres"); - { - nix::sys::signal::kill( - Pid::from_raw( - i32::try_from(postgres_proc.id().unwrap()).expect("convert child pid to i32"), - ), - nix::sys::signal::SIGTERM, - ) - .context("signal postgres to shut down")?; - postgres_proc - .wait() - .await - .context("wait for postgres to shut down")?; - } - - // Only sync if s3_prefix was specified - if let Some(s3_prefix) = args.s3_prefix { - info!("upload pgdata"); - aws_s3_sync::sync(Utf8Path::new(&pgdata_dir), &s3_prefix.append("/pgdata/")) - .await - .context("sync dump directory to destination")?; - - info!("write status"); - { - let status_dir = working_directory.join("status"); - std::fs::create_dir(&status_dir).context("create status directory")?; - let status_file = status_dir.join("pgdata"); - std::fs::write(&status_file, serde_json::json!({"done": true}).to_string()) - .context("write status file")?; - aws_s3_sync::sync(&status_dir, &s3_prefix.append("/status/")) - .await - .context("sync status directory to destination")?; + error!(status=%st, "pg_restore failed, restore will likely fail as well"); + bail!("pg_restore failed"); + } + } + + Ok(()) +} + +#[allow(clippy::too_many_arguments)] +async fn cmd_pgdata( + s3_client: Option, + kms_client: Option, + maybe_s3_prefix: Option, + maybe_spec: Option, + source_connection_string: Option, + interactive: bool, + pg_port: u16, + workdir: Utf8PathBuf, + pg_bin_dir: Utf8PathBuf, + pg_lib_dir: Utf8PathBuf, + num_cpus: Option, + memory_mb: Option, +) -> Result<(), anyhow::Error> { + if maybe_spec.is_none() && source_connection_string.is_none() { + bail!("spec must be provided for pgdata command"); + } + if maybe_spec.is_some() && source_connection_string.is_some() { + bail!("only one of spec or source_connection_string can be provided"); + } + + let source_connection_string = if let Some(spec) = maybe_spec { + match spec.encryption_secret { + 
EncryptionSecret::KMS { key_id } => { + decode_connstring( + kms_client.as_ref().unwrap(), + &key_id, + spec.source_connstring_ciphertext_base64, + ) + .await? + } + } + } else { + source_connection_string.unwrap() + }; + + let superuser = "cloud_admin"; + let destination_connstring = format!( + "host=localhost port={} user={} dbname=neondb", + pg_port, superuser + ); + + let pgdata_dir = workdir.join("pgdata"); + let mut proc = PostgresProcess::new(pgdata_dir.clone(), pg_bin_dir.clone(), pg_lib_dir.clone()); + let nproc = num_cpus.unwrap_or_else(num_cpus::get); + let memory_mb = memory_mb.unwrap_or(256); + proc.start(superuser, pg_port, nproc, memory_mb).await?; + wait_until_ready(destination_connstring.clone(), "neondb".to_string()).await; + + run_dump_restore( + workdir.clone(), + pg_bin_dir, + pg_lib_dir, + source_connection_string, + destination_connstring, + ) + .await?; + + // If interactive mode, wait for Ctrl+C + if interactive { + info!("Running in interactive mode. Press Ctrl+C to shut down."); + tokio::signal::ctrl_c().await.context("wait for ctrl-c")?; + } + + proc.shutdown().await?; + + // Only sync if s3_prefix was specified + if let Some(s3_prefix) = maybe_s3_prefix { + info!("upload pgdata"); + aws_s3_sync::upload_dir_recursive( + s3_client.as_ref().unwrap(), + Utf8Path::new(&pgdata_dir), + &s3_prefix.append("/pgdata/"), + ) + .await + .context("sync dump directory to destination")?; + + info!("write status"); + { + let status_dir = workdir.join("status"); + std::fs::create_dir(&status_dir).context("create status directory")?; + let status_file = status_dir.join("pgdata"); + std::fs::write(&status_file, serde_json::json!({"done": true}).to_string()) + .context("write status file")?; + aws_s3_sync::upload_dir_recursive( + s3_client.as_ref().unwrap(), + &status_dir, + &s3_prefix.append("/status/"), + ) + .await + .context("sync status directory to destination")?; + } + } + + Ok(()) +} + +async fn cmd_dumprestore( + kms_client: Option, + maybe_spec: 
Option, + source_connection_string: Option, + destination_connection_string: Option, + workdir: Utf8PathBuf, + pg_bin_dir: Utf8PathBuf, + pg_lib_dir: Utf8PathBuf, +) -> Result<(), anyhow::Error> { + let (source_connstring, destination_connstring) = if let Some(spec) = maybe_spec { + match spec.encryption_secret { + EncryptionSecret::KMS { key_id } => { + let source = decode_connstring( + kms_client.as_ref().unwrap(), + &key_id, + spec.source_connstring_ciphertext_base64, + ) + .await?; + + let dest = if let Some(dest_ciphertext) = + spec.destination_connstring_ciphertext_base64 + { + decode_connstring(kms_client.as_ref().unwrap(), &key_id, dest_ciphertext) + .await? + } else { + bail!("destination connection string must be provided in spec for dump_restore command"); + }; + + (source, dest) + } + } + } else { + ( + source_connection_string.context("source connection string must be provided for dump_restore command")?, + if let Some(val) = destination_connection_string { + val + } else { + bail!("destination connection string must be provided for dump_restore command"); + }, + ) + }; + + run_dump_restore( + workdir, + pg_bin_dir, + pg_lib_dir, + source_connstring, + destination_connstring, + ) + .await +} + +#[tokio::main] +pub(crate) async fn main() -> anyhow::Result<()> { + utils::logging::init( + utils::logging::LogFormat::Json, + utils::logging::TracingErrorLayerEnablement::EnableWithRustLogFilter, + utils::logging::Output::Stdout, + )?; + + info!("starting"); + + let args = Args::parse(); + + // Initialize AWS clients only if s3_prefix is specified + let (s3_client, kms_client) = if args.s3_prefix.is_some() { + let config = aws_config::load_defaults(BehaviorVersion::v2024_03_28()).await; + let s3_client = aws_sdk_s3::Client::new(&config); + let kms = aws_sdk_kms::Client::new(&config); + (Some(s3_client), Some(kms)) + } else { + (None, None) + }; + + let spec: Option = if let Some(s3_prefix) = &args.s3_prefix { + let spec_key = s3_prefix.append("/spec.json"); + let object = s3_client + .as_ref() + .unwrap() + .get_object() +
.bucket(&spec_key.bucket) + .key(spec_key.key) + .send() + .await + .context("get spec from s3")? + .body + .collect() + .await + .context("download spec body")?; + serde_json::from_slice(&object.into_bytes()).context("parse spec as json")? + } else { + None + }; + + match tokio::fs::create_dir(&args.working_directory).await { + Ok(()) => {} + Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => { + if !is_directory_empty(&args.working_directory) + .await + .context("check if working directory is empty")? + { + bail!("working directory is not empty"); + } else { + // ok + } + } + Err(e) => return Err(anyhow::Error::new(e).context("create working directory")), + } + + match args.command { + Command::Pgdata { + source_connection_string, + interactive, + pg_port, + num_cpus, + memory_mb, + } => { + cmd_pgdata( + s3_client, + kms_client, + args.s3_prefix, + spec, + source_connection_string, + interactive, + pg_port, + args.working_directory, + args.pg_bin_dir, + args.pg_lib_dir, + num_cpus, + memory_mb, + ) + .await?; + } + Command::DumpRestore { + source_connection_string, + destination_connection_string, + } => { + cmd_dumprestore( + kms_client, + spec, + source_connection_string, + destination_connection_string, + args.working_directory, + args.pg_bin_dir, + args.pg_lib_dir, + ) + .await?; } } diff --git a/compute_tools/src/bin/fast_import/aws_s3_sync.rs b/compute_tools/src/bin/fast_import/aws_s3_sync.rs index 5fa58c8f87..1be10b36d6 100644 --- a/compute_tools/src/bin/fast_import/aws_s3_sync.rs +++ b/compute_tools/src/bin/fast_import/aws_s3_sync.rs @@ -1,24 +1,102 @@ -use anyhow::Context; -use camino::Utf8Path; +use camino::{Utf8Path, Utf8PathBuf}; +use tokio::task::JoinSet; +use walkdir::WalkDir; use super::s3_uri::S3Uri; -pub(crate) async fn sync(local: &Utf8Path, remote: &S3Uri) -> anyhow::Result<()> { - let mut builder = tokio::process::Command::new("aws"); - builder - .arg("s3") - .arg("sync") - .arg(local.as_str()) - .arg(remote.to_string()); - let st = 
builder - .spawn() - .context("spawn aws s3 sync")? - .wait() - .await - .context("wait for aws s3 sync")?; - if st.success() { - Ok(()) - } else { - Err(anyhow::anyhow!("aws s3 sync failed")) +use tracing::{info, warn}; + +const MAX_PARALLEL_UPLOADS: usize = 10; + +/// Upload all files from 'local' to 'remote' +pub(crate) async fn upload_dir_recursive( + s3_client: &aws_sdk_s3::Client, + local: &Utf8Path, + remote: &S3Uri, +) -> anyhow::Result<()> { + // Recursively scan directory + let mut dirwalker = WalkDir::new(local) + .into_iter() + .map(|entry| { + let entry = entry?; + let file_type = entry.file_type(); + let path = <&Utf8Path>::try_from(entry.path())?.to_path_buf(); + Ok((file_type, path)) + }) + .filter_map(|e: anyhow::Result<(std::fs::FileType, Utf8PathBuf)>| { + match e { + Ok((file_type, path)) if file_type.is_file() => Some(Ok(path)), + Ok((file_type, _path)) if file_type.is_dir() => { + // The WalkDir iterator will recurse into directories, but we don't want + // to do anything with directories as such. There's no concept of uploading + // an empty directory to S3. + None + } + Ok((file_type, path)) if file_type.is_symlink() => { + // huh, didn't expect a symlink. Can't upload that to S3. Warn and skip. + warn!("cannot upload symlink ({})", path); + None + } + Ok((_file_type, path)) => { + // should not happen + warn!("directory entry has unexpected type ({})", path); + None + } + Err(e) => Some(Err(e)), + } + }); + + // Spawn upload tasks for each file, keeping MAX_PARALLEL_UPLOADS active in + // parallel. + let mut joinset = JoinSet::new(); + loop { + // Could we upload more? 
+ while joinset.len() < MAX_PARALLEL_UPLOADS { + if let Some(full_local_path) = dirwalker.next() { + let full_local_path = full_local_path?; + let relative_local_path = full_local_path + .strip_prefix(local) + .expect("all paths start from the walkdir root"); + let remote_path = remote.append(relative_local_path.as_str()); + info!( + "starting upload of {} to {}", + &full_local_path, &remote_path + ); + let upload_task = upload_file(s3_client.clone(), full_local_path, remote_path); + joinset.spawn(upload_task); + } else { + info!("draining upload tasks"); + break; + } + } + + // Wait for an upload to complete + if let Some(res) = joinset.join_next().await { + let _ = res?; + } else { + // all done! + break; + } } + Ok(()) +} + +pub(crate) async fn upload_file( + s3_client: aws_sdk_s3::Client, + local_path: Utf8PathBuf, + remote: S3Uri, +) -> anyhow::Result<()> { + use aws_smithy_types::byte_stream::ByteStream; + let stream = ByteStream::from_path(&local_path).await?; + + let _result = s3_client + .put_object() + .bucket(remote.bucket) + .key(&remote.key) + .body(stream) + .send() + .await?; + info!("upload of {} to {} finished", &local_path, &remote.key); + + Ok(()) } diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 73950cd95a..6f28bd9733 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -11,7 +11,9 @@ use crate::migration::MigrationRunner; use crate::params::PG_HBA_ALL_MD5; use crate::pg_helpers::*; -use compute_api::responses::{ControlPlaneComputeStatus, ControlPlaneSpecResponse}; +use compute_api::responses::{ + ComputeCtlConfig, ControlPlaneComputeStatus, ControlPlaneSpecResponse, +}; use compute_api::spec::ComputeSpec; // Do control plane request and return response if any. 
In case of error it @@ -73,14 +75,13 @@ fn do_control_plane_request( pub fn get_spec_from_control_plane( base_uri: &str, compute_id: &str, -) -> Result> { +) -> Result<(Option, ComputeCtlConfig)> { let cp_uri = format!("{base_uri}/compute/api/v2/computes/{compute_id}/spec"); let jwt: String = match std::env::var("NEON_CONTROL_PLANE_TOKEN") { Ok(v) => v, Err(_) => "".to_string(), }; let mut attempt = 1; - let mut spec: Result> = Ok(None); info!("getting spec from control plane: {}", cp_uri); @@ -90,7 +91,7 @@ pub fn get_spec_from_control_plane( // - no spec for compute yet (Empty state) -> return Ok(None) // - got spec -> return Ok(Some(spec)) while attempt < 4 { - spec = match do_control_plane_request(&cp_uri, &jwt) { + let result = match do_control_plane_request(&cp_uri, &jwt) { Ok(spec_resp) => { CPLANE_REQUESTS_TOTAL .with_label_values(&[ @@ -99,10 +100,10 @@ pub fn get_spec_from_control_plane( ]) .inc(); match spec_resp.status { - ControlPlaneComputeStatus::Empty => Ok(None), + ControlPlaneComputeStatus::Empty => Ok((None, spec_resp.compute_ctl_config)), ControlPlaneComputeStatus::Attached => { if let Some(spec) = spec_resp.spec { - Ok(Some(spec)) + Ok((Some(spec), spec_resp.compute_ctl_config)) } else { bail!("compute is attached, but spec is empty") } @@ -121,10 +122,10 @@ pub fn get_spec_from_control_plane( } }; - if let Err(e) = &spec { + if let Err(e) = &result { error!("attempt {} to get spec failed with: {}", attempt, e); } else { - return spec; + return result; } attempt += 1; @@ -132,7 +133,9 @@ pub fn get_spec_from_control_plane( } // All attempts failed, return error. - spec + Err(anyhow::anyhow!( + "Exhausted all attempts to retrieve the spec from the control plane" + )) } /// Check `pg_hba.conf` and update if needed to allow external connections. 
diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 3b2634204c..c3c8229c38 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -48,6 +48,8 @@ use std::sync::Arc; use std::time::Duration; use anyhow::{anyhow, bail, Context, Result}; +use compute_api::requests::ConfigurationRequest; +use compute_api::responses::ComputeCtlConfig; use compute_api::spec::Database; use compute_api::spec::PgIdent; use compute_api::spec::RemoteExtSpec; @@ -880,10 +882,13 @@ impl Endpoint { self.external_http_address.port() )) .header(CONTENT_TYPE.as_str(), "application/json") - .body(format!( - "{{\"spec\":{}}}", - serde_json::to_string_pretty(&spec)? - )) + .body( + serde_json::to_string(&ConfigurationRequest { + spec, + compute_ctl_config: ComputeCtlConfig::default(), + }) + .unwrap(), + ) .send() .await?; diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index 9a2d30c861..0fadb9c5fe 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -838,7 +838,10 @@ impl StorageController { self.dispatch( Method::PUT, format!("control/v1/tenant/{tenant_shard_id}/migrate"), - Some(TenantShardMigrateRequest { node_id }), + Some(TenantShardMigrateRequest { + node_id, + migration_config: None, + }), ) .await } diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index 985fe6b3b1..3c574efc63 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -22,7 +22,7 @@ use pageserver_api::{ }; use pageserver_client::mgmt_api::{self}; use reqwest::{Method, StatusCode, Url}; -use utils::id::{NodeId, TenantId}; +use utils::id::{NodeId, TenantId, TimelineId}; use pageserver_api::controller_api::{ NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy, @@ -239,6 +239,19 @@ enum Command { #[arg(long)] scheduling_policy: SkSchedulingPolicyArg, }, + /// Downloads any 
missing heatmap layers for all shards for a given timeline + DownloadHeatmapLayers { + /// Tenant ID or tenant shard ID. When an unsharded tenant ID is specified, + /// the operation is performed on all shards. When a sharded tenant ID is + /// specified, the operation is only performed on the specified shard. + #[arg(long)] + tenant_shard_id: TenantShardId, + #[arg(long)] + timeline_id: TimelineId, + /// Optional: Maximum download concurrency (default is 16) + #[arg(long)] + concurrency: Option, + }, } #[derive(Parser)] @@ -609,7 +622,10 @@ async fn main() -> anyhow::Result<()> { tenant_shard_id, node, } => { - let req = TenantShardMigrateRequest { node_id: node }; + let req = TenantShardMigrateRequest { + node_id: node, + migration_config: None, + }; storcon_client .dispatch::( @@ -623,7 +639,10 @@ async fn main() -> anyhow::Result<()> { tenant_shard_id, node, } => { - let req = TenantShardMigrateRequest { node_id: node }; + let req = TenantShardMigrateRequest { + node_id: node, + migration_config: None, + }; storcon_client .dispatch::( @@ -1082,7 +1101,10 @@ async fn main() -> anyhow::Result<()> { .dispatch::( Method::PUT, format!("control/v1/tenant/{}/migrate", mv.tenant_shard_id), - Some(TenantShardMigrateRequest { node_id: mv.to }), + Some(TenantShardMigrateRequest { + node_id: mv.to, + migration_config: None, + }), ) .await .map_err(|e| (mv.tenant_shard_id, mv.from, mv.to, e)) @@ -1238,6 +1260,24 @@ async fn main() -> anyhow::Result<()> { String::from(scheduling_policy) ); } + Command::DownloadHeatmapLayers { + tenant_shard_id, + timeline_id, + concurrency, + } => { + let mut path = format!( + "/v1/tenant/{}/timeline/{}/download_heatmap_layers", + tenant_shard_id, timeline_id, + ); + + if let Some(c) = concurrency { + path = format!("{path}?concurrency={c}"); + } + + storcon_client + .dispatch::<(), ()>(Method::POST, path, None) + .await?; + } } Ok(()) diff --git a/docker-compose/docker_compose_test.sh b/docker-compose/docker_compose_test.sh index
c4ff86ab66..dd520d4986 100755 --- a/docker-compose/docker_compose_test.sh +++ b/docker-compose/docker_compose_test.sh @@ -71,7 +71,7 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do cat ../compute/patches/contrib_pg${pg_version}.patch | docker exec -i $TEST_CONTAINER_NAME bash -c "(cd /postgres && patch -p1)" # We are running tests now rm -f testout.txt testout_contrib.txt - docker exec -e USE_PGXS=1 -e SKIP=timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,kq_imcx-src,wal2json_2_5-src \ + docker exec -e USE_PGXS=1 -e SKIP=timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pg_tiktoken-src,pg_jsonschema-src,kq_imcx-src,wal2json_2_5-src \ $TEST_CONTAINER_NAME /run-tests.sh /ext-src | tee testout.txt && EXT_SUCCESS=1 || EXT_SUCCESS=0 docker exec -e SKIP=start-scripts,postgres_fdw,ltree_plpython,jsonb_plpython,jsonb_plperl,hstore_plpython,hstore_plperl,dblink,bool_plperl \ $TEST_CONTAINER_NAME /run-tests.sh /postgres/contrib | tee testout_contrib.txt && CONTRIB_SUCCESS=1 || CONTRIB_SUCCESS=0 diff --git a/docker-compose/ext-src/pg_semver-src/test-upgrade.patch b/docker-compose/ext-src/pg_semver-src/test-upgrade-16.patch similarity index 100% rename from docker-compose/ext-src/pg_semver-src/test-upgrade.patch rename to docker-compose/ext-src/pg_semver-src/test-upgrade-16.patch diff --git a/docker-compose/ext-src/pg_semver-src/test-upgrade-17.patch b/docker-compose/ext-src/pg_semver-src/test-upgrade-17.patch new file mode 100644 index 0000000000..2d0bf280db --- /dev/null +++ b/docker-compose/ext-src/pg_semver-src/test-upgrade-17.patch @@ -0,0 +1,24 @@ +diff --git a/test/sql/base.sql b/test/sql/base.sql +index 53adb30..2eed91b 100644 +--- a/test/sql/base.sql ++++ b/test/sql/base.sql +@@ -2,7 +2,6 @@ + BEGIN; + + \i test/pgtap-core.sql +-CREATE EXTENSION semver; + + SELECT plan(334); + --SELECT * FROM no_plan(); +diff --git a/test/sql/corpus.sql b/test/sql/corpus.sql +index c0fe98e..39cdd2e 100644 +--- 
a/test/sql/corpus.sql ++++ b/test/sql/corpus.sql +@@ -4,7 +4,6 @@ BEGIN; + -- Test the SemVer corpus from https://regex101.com/r/Ly7O1x/3/. + + \i test/pgtap-core.sql +-CREATE EXTENSION semver; + + SELECT plan(76); + --SELECT * FROM no_plan(); diff --git a/docker-compose/ext-src/pg_semver-src/test-upgrade.sh b/docker-compose/ext-src/pg_semver-src/test-upgrade.sh index e1541f272a..18b2848fd1 100755 --- a/docker-compose/ext-src/pg_semver-src/test-upgrade.sh +++ b/docker-compose/ext-src/pg_semver-src/test-upgrade.sh @@ -1,6 +1,7 @@ #!/bin/sh set -ex cd "$(dirname ${0})" -patch -p1 , } +#[derive(Debug, Deserialize, Serialize)] +pub struct ComputeCtlConfig { + pub jwks: JwkSet, +} + +impl Default for ComputeCtlConfig { + fn default() -> Self { + Self { + jwks: JwkSet { + keys: Vec::default(), + }, + } + } +} + /// Response of the `/computes/{compute_id}/spec` control-plane API. -/// This is not actually a compute API response, so consider moving -/// to a different place. #[derive(Deserialize, Debug)] pub struct ControlPlaneSpecResponse { pub spec: Option, pub status: ControlPlaneComputeStatus, + pub compute_ctl_config: ComputeCtlConfig, } #[derive(Deserialize, Clone, Copy, Debug, PartialEq, Eq)] diff --git a/libs/http-utils/src/pprof.rs b/libs/http-utils/src/pprof.rs index dd57f9ed4b..fe1cc10838 100644 --- a/libs/http-utils/src/pprof.rs +++ b/libs/http-utils/src/pprof.rs @@ -2,7 +2,6 @@ use anyhow::bail; use flate2::write::{GzDecoder, GzEncoder}; use flate2::Compression; use itertools::Itertools as _; -use once_cell::sync::Lazy; use pprof::protos::{Function, Line, Location, Message as _, Profile}; use regex::Regex; @@ -58,38 +57,30 @@ pub fn symbolize(mut profile: Profile) -> anyhow::Result { // Resolve the line and function for each location. 
backtrace::resolve(loc.address as *mut c_void, |symbol| { - let Some(symname) = symbol.name() else { + let Some(symbol_name) = symbol.name() else { return; }; - let mut name = symname.to_string(); - // Strip the Rust monomorphization suffix from the symbol name. - static SUFFIX_REGEX: Lazy = - Lazy::new(|| Regex::new("::h[0-9a-f]{16}$").expect("invalid regex")); - if let Some(m) = SUFFIX_REGEX.find(&name) { - name.truncate(m.start()); - } - - let function_id = match functions.get(&name) { - Some(function) => function.id, - None => { - let id = functions.len() as u64 + 1; - let system_name = String::from_utf8_lossy(symname.as_bytes()); + let function_name = format!("{symbol_name:#}"); + let functions_len = functions.len(); + let function_id = functions + .entry(function_name) + .or_insert_with_key(|function_name| { + let function_id = functions_len as u64 + 1; + let system_name = String::from_utf8_lossy(symbol_name.as_bytes()); let filename = symbol .filename() .map(|path| path.to_string_lossy()) .unwrap_or(Cow::Borrowed("")); - let function = Function { - id, - name: string_id(&name), + Function { + id: function_id, + name: string_id(function_name), system_name: string_id(&system_name), filename: string_id(&filename), ..Default::default() - }; - functions.insert(name, function); - id - } - }; + } + }) + .id; loc.line.push(Line { function_id, line: symbol.lineno().unwrap_or(0) as i64, diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 79f068a47b..0f33bcf45b 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -351,7 +351,7 @@ pub struct TenantConfigToml { /// Enable rel_size_v2 for this tenant. Once enabled, the tenant will persist this information into /// `index_part.json`, and it cannot be reversed. - pub rel_size_v2_enabled: Option, + pub rel_size_v2_enabled: bool, // gc-compaction related configs /// Enable automatic gc-compaction trigger on this tenant. 
@@ -544,10 +544,11 @@ pub mod tenant_conf_defaults { pub const DEFAULT_COMPACTION_PERIOD: &str = "20 s"; pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10; - // This value needs to be tuned to avoid OOM. We have 3/4 of the total CPU threads to do background works, that's 16*3/4=9 on - // most of our pageservers. Compaction ~50 layers requires about 2GB memory (could be reduced later by optimizing L0 hole - // calculation to avoid loading all keys into the memory). So with this config, we can get a maximum peak compaction usage of 18GB. - pub const DEFAULT_COMPACTION_UPPER_LIMIT: usize = 50; + // This value needs to be tuned to avoid OOM. We have 3/4*CPUs threads for L0 compaction, that's + // 3/4*16=9 on most of our pageservers. Compacting 20 layers requires about 1 GB memory (could + // be reduced later by optimizing L0 hole calculation to avoid loading all keys into memory). So + // with this config, we can get a maximum peak compaction usage of 9 GB. + pub const DEFAULT_COMPACTION_UPPER_LIMIT: usize = 20; pub const DEFAULT_COMPACTION_L0_FIRST: bool = false; pub const DEFAULT_COMPACTION_L0_SEMAPHORE: bool = true; @@ -633,7 +634,7 @@ impl Default for TenantConfigToml { lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS, timeline_offloading: true, wal_receiver_protocol_override: None, - rel_size_v2_enabled: None, + rel_size_v2_enabled: false, gc_compaction_enabled: DEFAULT_GC_COMPACTION_ENABLED, gc_compaction_initial_threshold_kb: DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB, gc_compaction_ratio_percent: DEFAULT_GC_COMPACTION_RATIO_PERCENT, diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index 78e080981a..42f6e47e63 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -182,6 +182,18 @@ pub struct TenantDescribeResponseShard { #[derive(Serialize, Deserialize, Debug)] pub struct TenantShardMigrateRequest { pub node_id: NodeId, + #[serde(default)] + pub 
migration_config: Option, +} + +#[derive(Serialize, Deserialize, Debug)] +pub struct MigrationConfig { + #[serde(default)] + #[serde(with = "humantime_serde")] + pub secondary_warmup_timeout: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub secondary_download_request_timeout: Option, } #[derive(Serialize, Clone, Debug)] diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index dbd45da314..b88a2e46a1 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -1,10 +1,12 @@ use anyhow::{bail, Result}; use byteorder::{ByteOrder, BE}; +use bytes::Bytes; use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::Oid; use postgres_ffi::RepOriginId; use serde::{Deserialize, Serialize}; use std::{fmt, ops::Range}; +use utils::const_assert; use crate::reltag::{BlockNumber, RelTag, SlruKind}; @@ -49,6 +51,64 @@ pub const AUX_KEY_PREFIX: u8 = 0x62; /// The key prefix of ReplOrigin keys. pub const REPL_ORIGIN_KEY_PREFIX: u8 = 0x63; +/// The key prefix of db directory keys. +pub const DB_DIR_KEY_PREFIX: u8 = 0x64; + +/// The key prefix of rel directory keys. +pub const REL_DIR_KEY_PREFIX: u8 = 0x65; + +#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)] +pub enum RelDirExists { + Exists, + Removed, +} + +#[derive(Debug)] +pub struct DecodeError; + +impl fmt::Display for DecodeError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "invalid marker") + } +} + +impl std::error::Error for DecodeError {} + +impl RelDirExists { + /// The value of the rel directory keys that indicates the existence of a relation. 
+ const REL_EXISTS_MARKER: Bytes = Bytes::from_static(b"r"); + + pub fn encode(&self) -> Bytes { + match self { + Self::Exists => Self::REL_EXISTS_MARKER.clone(), + Self::Removed => SPARSE_TOMBSTONE_MARKER.clone(), + } + } + + pub fn decode_option(data: Option>) -> Result { + match data { + Some(marker) if marker.as_ref() == Self::REL_EXISTS_MARKER => Ok(Self::Exists), + // Any other marker is invalid + Some(_) => Err(DecodeError), + None => Ok(Self::Removed), + } + } + + pub fn decode(data: impl AsRef<[u8]>) -> Result { + let data = data.as_ref(); + if data == Self::REL_EXISTS_MARKER { + Ok(Self::Exists) + } else if data == SPARSE_TOMBSTONE_MARKER { + Ok(Self::Removed) + } else { + Err(DecodeError) + } + } +} + +/// A tombstone in the sparse keyspace, which is an empty buffer. +pub const SPARSE_TOMBSTONE_MARKER: Bytes = Bytes::from_static(b""); + /// Check if the key falls in the range of metadata keys. pub const fn is_metadata_key_slice(key: &[u8]) -> bool { key[0] >= METADATA_KEY_BEGIN_PREFIX && key[0] < METADATA_KEY_END_PREFIX @@ -110,6 +170,24 @@ impl Key { } } + pub fn rel_dir_sparse_key_range() -> Range { + Key { + field1: REL_DIR_KEY_PREFIX, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, + }..Key { + field1: REL_DIR_KEY_PREFIX + 1, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, + } + } + /// This function checks more extensively what keys we can take on the write path. /// If a key beginning with 00 does not have a global/default tablespace OID, it /// will be rejected on the write path. 
@@ -440,6 +518,36 @@ pub fn rel_dir_to_key(spcnode: Oid, dbnode: Oid) -> Key { } } +#[inline(always)] +pub fn rel_tag_sparse_key(spcnode: Oid, dbnode: Oid, relnode: Oid, forknum: u8) -> Key { + Key { + field1: REL_DIR_KEY_PREFIX, + field2: spcnode, + field3: dbnode, + field4: relnode, + field5: forknum, + field6: 1, + } +} + +pub fn rel_tag_sparse_key_range(spcnode: Oid, dbnode: Oid) -> Range { + Key { + field1: REL_DIR_KEY_PREFIX, + field2: spcnode, + field3: dbnode, + field4: 0, + field5: 0, + field6: 0, + }..Key { + field1: REL_DIR_KEY_PREFIX, + field2: spcnode, + field3: dbnode, + field4: u32::MAX, + field5: u8::MAX, + field6: u32::MAX, + } // it's fine to exclude the last key b/c we only use field6 == 1 +} + #[inline(always)] pub fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key { Key { @@ -734,9 +842,9 @@ impl Key { self.field1 == RELATION_SIZE_PREFIX } - pub fn sparse_non_inherited_keyspace() -> Range { + pub const fn sparse_non_inherited_keyspace() -> Range { // The two keys are adjacent; if we will have non-adjancent keys in the future, we should return a keyspace - debug_assert_eq!(AUX_KEY_PREFIX + 1, REPL_ORIGIN_KEY_PREFIX); + const_assert!(AUX_KEY_PREFIX + 1 == REPL_ORIGIN_KEY_PREFIX); Key { field1: AUX_KEY_PREFIX, field2: 0, diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 6dbfbec345..dd7bea2916 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -1080,8 +1080,7 @@ pub struct TenantInfo { /// Opaque explanation if gc is being blocked. /// - /// Only looked up for the individual tenant detail, not the listing. This is purely for - /// debugging, not included in openapi. + /// Only looked up for the individual tenant detail, not the listing. 
#[serde(skip_serializing_if = "Option::is_none")] pub gc_blocking: Option, } @@ -1136,7 +1135,26 @@ pub struct TimelineInfo { pub ancestor_lsn: Option, pub last_record_lsn: Lsn, pub prev_record_lsn: Option, + + /// Legacy field for compat with control plane. Synonym of `min_readable_lsn`. + /// TODO: remove once control plane no longer reads it. pub latest_gc_cutoff_lsn: Lsn, + + /// The LSN up to which GC has advanced: older data may still exist but it is not available for clients. + /// This LSN is not suitable for deciding where to create branches etc: use [`TimelineInfo::min_readable_lsn`] instead, + /// as it is easier to reason about. + #[serde(default)] + pub applied_gc_cutoff_lsn: Lsn, + + /// The upper bound of data which is either already GC'ed, or eligible to be GC'ed at any time based on PITR interval. + /// This LSN represents the "end of history" for this timeline, and callers should use it to figure out the oldest + /// LSN at which it is legal to create a branch or ephemeral endpoint. + /// + /// Note that holders of valid LSN leases may be able to create branches and read pages earlier + /// than this LSN, but new leases may not be taken out earlier than this LSN.
+ #[serde(default)] + pub min_readable_lsn: Lsn, + pub disk_consistent_lsn: Lsn, /// The LSN that we have succesfully uploaded to remote storage diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index 8c024375c1..f74b229ac4 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -9,6 +9,8 @@ use bytes::Bytes; use serde::{Deserialize, Serialize}; use std::io::ErrorKind; use std::net::SocketAddr; +use std::os::fd::AsRawFd; +use std::os::fd::RawFd; use std::pin::Pin; use std::sync::Arc; use std::task::{ready, Poll}; @@ -268,6 +270,7 @@ impl MaybeWriteOnly { } pub struct PostgresBackend { + pub socket_fd: RawFd, framed: MaybeWriteOnly, pub state: ProtoState, @@ -293,9 +296,11 @@ impl PostgresBackend { tls_config: Option>, ) -> io::Result { let peer_addr = socket.peer_addr()?; + let socket_fd = socket.as_raw_fd(); let stream = MaybeTlsStream::Unencrypted(socket); Ok(Self { + socket_fd, framed: MaybeWriteOnly::Full(Framed::new(stream)), state: ProtoState::Initialization, auth_type, @@ -307,6 +312,7 @@ impl PostgresBackend { impl PostgresBackend { pub fn new_from_io( + socket_fd: RawFd, socket: IO, peer_addr: SocketAddr, auth_type: AuthType, @@ -315,6 +321,7 @@ impl PostgresBackend { let stream = MaybeTlsStream::Unencrypted(socket); Ok(Self { + socket_fd, framed: MaybeWriteOnly::Full(Framed::new(stream)), state: ProtoState::Initialization, auth_type, diff --git a/libs/proxy/tokio-postgres2/Cargo.toml b/libs/proxy/tokio-postgres2/Cargo.toml index ade0ffc9f6..161c6b8309 100644 --- a/libs/proxy/tokio-postgres2/Cargo.toml +++ b/libs/proxy/tokio-postgres2/Cargo.toml @@ -5,18 +5,15 @@ edition = "2021" license = "MIT/Apache-2.0" [dependencies] -async-trait.workspace = true bytes.workspace = true -byteorder.workspace = true fallible-iterator.workspace = true futures-util = { workspace = true, features = ["sink"] } log = "0.4" parking_lot.workspace = true -percent-encoding = "2.0" pin-project-lite.workspace = true phf 
= "0.11" postgres-protocol2 = { path = "../postgres-protocol2" } postgres-types2 = { path = "../postgres-types2" } tokio = { workspace = true, features = ["io-util", "time", "net"] } tokio-util = { workspace = true, features = ["codec"] } -serde = { workspace = true, features = ["derive"] } \ No newline at end of file +serde = { workspace = true, features = ["derive"] } diff --git a/libs/proxy/tokio-postgres2/src/client.rs b/libs/proxy/tokio-postgres2/src/client.rs index 9bbbd4c260..46151ab924 100644 --- a/libs/proxy/tokio-postgres2/src/client.rs +++ b/libs/proxy/tokio-postgres2/src/client.rs @@ -10,8 +10,8 @@ use crate::simple_query::SimpleQueryStream; use crate::types::{Oid, ToSql, Type}; use crate::{ - prepare, query, simple_query, slice_iter, CancelToken, Error, ReadyForQueryStatus, Row, - SimpleQueryMessage, Statement, ToStatement, Transaction, TransactionBuilder, + query, simple_query, slice_iter, CancelToken, Error, ReadyForQueryStatus, Row, + SimpleQueryMessage, Statement, Transaction, TransactionBuilder, }; use bytes::BytesMut; use fallible_iterator::FallibleIterator; @@ -54,18 +54,18 @@ impl Responses { } /// A cache of type info and prepared statements for fetching type info -/// (corresponding to the queries in the [prepare] module). +/// (corresponding to the queries in the [crate::prepare] module). #[derive(Default)] struct CachedTypeInfo { /// A statement for basic information for a type from its - /// OID. Corresponds to [TYPEINFO_QUERY](prepare::TYPEINFO_QUERY) (or its + /// OID. Corresponds to [TYPEINFO_QUERY](crate::prepare::TYPEINFO_QUERY) (or its /// fallback). typeinfo: Option, /// A statement for getting information for a composite type from its OID. - /// Corresponds to [TYPEINFO_QUERY](prepare::TYPEINFO_COMPOSITE_QUERY). + /// Corresponds to [TYPEINFO_QUERY](crate::prepare::TYPEINFO_COMPOSITE_QUERY). typeinfo_composite: Option, /// A statement for getting information for a composite type from its OID. 
- /// Corresponds to [TYPEINFO_QUERY](prepare::TYPEINFO_COMPOSITE_QUERY) (or + /// Corresponds to [TYPEINFO_QUERY](crate::prepare::TYPEINFO_COMPOSITE_QUERY) (or /// its fallback). typeinfo_enum: Option, @@ -190,26 +190,6 @@ impl Client { &self.inner } - /// Creates a new prepared statement. - /// - /// Prepared statements can be executed repeatedly, and may contain query parameters (indicated by `$1`, `$2`, etc), - /// which are set when executed. Prepared statements can only be used with the connection that created them. - pub async fn prepare(&self, query: &str) -> Result { - self.prepare_typed(query, &[]).await - } - - /// Like `prepare`, but allows the types of query parameters to be explicitly specified. - /// - /// The list of types may be smaller than the number of parameters - the types of the remaining parameters will be - /// inferred. For example, `client.prepare_typed(query, &[])` is equivalent to `client.prepare(query)`. - pub async fn prepare_typed( - &self, - query: &str, - parameter_types: &[Type], - ) -> Result { - prepare::prepare(&self.inner, query, parameter_types).await - } - /// Executes a statement, returning a vector of the resulting rows. /// /// A statement may contain parameters, specified by `$n`, where `n` is the index of the parameter of the list @@ -222,14 +202,11 @@ impl Client { /// # Panics /// /// Panics if the number of parameters provided does not match the number expected. - pub async fn query( + pub async fn query( &self, - statement: &T, + statement: Statement, params: &[&(dyn ToSql + Sync)], - ) -> Result, Error> - where - T: ?Sized + ToStatement, - { + ) -> Result, Error> { self.query_raw(statement, slice_iter(params)) .await? .try_collect() @@ -250,13 +227,15 @@ impl Client { /// Panics if the number of parameters provided does not match the number expected. 
/// /// [`query`]: #method.query - pub async fn query_raw<'a, T, I>(&self, statement: &T, params: I) -> Result + pub async fn query_raw<'a, I>( + &self, + statement: Statement, + params: I, + ) -> Result where - T: ?Sized + ToStatement, I: IntoIterator, I::IntoIter: ExactSizeIterator, { - let statement = statement.__convert().into_statement(self).await?; query::query(&self.inner, statement, params).await } @@ -271,55 +250,6 @@ impl Client { query::query_txt(&self.inner, statement, params).await } - /// Executes a statement, returning the number of rows modified. - /// - /// A statement may contain parameters, specified by `$n`, where `n` is the index of the parameter of the list - /// provided, 1-indexed. - /// - /// The `statement` argument can either be a `Statement`, or a raw query string. If the same statement will be - /// repeatedly executed (perhaps with different query parameters), consider preparing the statement up front - /// with the `prepare` method. - /// - /// If the statement does not modify any rows (e.g. `SELECT`), 0 is returned. - /// - /// # Panics - /// - /// Panics if the number of parameters provided does not match the number expected. - pub async fn execute( - &self, - statement: &T, - params: &[&(dyn ToSql + Sync)], - ) -> Result - where - T: ?Sized + ToStatement, - { - self.execute_raw(statement, slice_iter(params)).await - } - - /// The maximally flexible version of [`execute`]. - /// - /// A statement may contain parameters, specified by `$n`, where `n` is the index of the parameter of the list - /// provided, 1-indexed. - /// - /// The `statement` argument can either be a `Statement`, or a raw query string. If the same statement will be - /// repeatedly executed (perhaps with different query parameters), consider preparing the statement up front - /// with the `prepare` method. - /// - /// # Panics - /// - /// Panics if the number of parameters provided does not match the number expected. 
- /// - /// [`execute`]: #method.execute - pub async fn execute_raw<'a, T, I>(&self, statement: &T, params: I) -> Result - where - T: ?Sized + ToStatement, - I: IntoIterator, - I::IntoIter: ExactSizeIterator, - { - let statement = statement.__convert().into_statement(self).await?; - query::execute(self.inner(), statement, params).await - } - /// Executes a sequence of SQL statements using the simple query protocol, returning the resulting rows. /// /// Statements should be separated by semicolons. If an error occurs, execution of the sequence will stop at that diff --git a/libs/proxy/tokio-postgres2/src/generic_client.rs b/libs/proxy/tokio-postgres2/src/generic_client.rs index 768213f8ed..042b5a675e 100644 --- a/libs/proxy/tokio-postgres2/src/generic_client.rs +++ b/libs/proxy/tokio-postgres2/src/generic_client.rs @@ -1,7 +1,8 @@ +#![allow(async_fn_in_trait)] + use crate::query::RowStream; use crate::types::Type; use crate::{Client, Error, Transaction}; -use async_trait::async_trait; use postgres_protocol2::Oid; mod private { @@ -11,7 +12,6 @@ mod private { /// A trait allowing abstraction over connections and transactions. /// /// This trait is "sealed", and cannot be implemented outside of this crate. -#[async_trait] pub trait GenericClient: private::Sealed { /// Like `Client::query_raw_txt`. 
async fn query_raw_txt(&self, statement: &str, params: I) -> Result @@ -26,7 +26,6 @@ pub trait GenericClient: private::Sealed { impl private::Sealed for Client {} -#[async_trait] impl GenericClient for Client { async fn query_raw_txt(&self, statement: &str, params: I) -> Result where @@ -39,14 +38,12 @@ impl GenericClient for Client { /// Query for type information async fn get_type(&self, oid: Oid) -> Result { - self.get_type(oid).await + crate::prepare::get_type(self.inner(), oid).await } } impl private::Sealed for Transaction<'_> {} -#[async_trait] -#[allow(clippy::needless_lifetimes)] impl GenericClient for Transaction<'_> { async fn query_raw_txt(&self, statement: &str, params: I) -> Result where diff --git a/libs/proxy/tokio-postgres2/src/lib.rs b/libs/proxy/tokio-postgres2/src/lib.rs index 9155dd8279..7426279167 100644 --- a/libs/proxy/tokio-postgres2/src/lib.rs +++ b/libs/proxy/tokio-postgres2/src/lib.rs @@ -14,7 +14,6 @@ pub use crate::row::{Row, SimpleQueryRow}; pub use crate::simple_query::SimpleQueryStream; pub use crate::statement::{Column, Statement}; pub use crate::tls::NoTls; -pub use crate::to_statement::ToStatement; pub use crate::transaction::Transaction; pub use crate::transaction_builder::{IsolationLevel, TransactionBuilder}; use crate::types::ToSql; @@ -65,7 +64,6 @@ pub mod row; mod simple_query; mod statement; pub mod tls; -mod to_statement; mod transaction; mod transaction_builder; pub mod types; diff --git a/libs/proxy/tokio-postgres2/src/prepare.rs b/libs/proxy/tokio-postgres2/src/prepare.rs index da0c755c5b..58bbb26cbc 100644 --- a/libs/proxy/tokio-postgres2/src/prepare.rs +++ b/libs/proxy/tokio-postgres2/src/prepare.rs @@ -1,7 +1,6 @@ use crate::client::InnerClient; use crate::codec::FrontendMessage; use crate::connection::RequestMessages; -use crate::error::SqlState; use crate::types::{Field, Kind, Oid, Type}; use crate::{query, slice_iter}; use crate::{Column, Error, Statement}; @@ -13,7 +12,6 @@ use 
postgres_protocol2::message::backend::Message; use postgres_protocol2::message::frontend; use std::future::Future; use std::pin::Pin; -use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Arc; pub(crate) const TYPEINFO_QUERY: &str = "\ @@ -24,14 +22,6 @@ INNER JOIN pg_catalog.pg_namespace n ON t.typnamespace = n.oid WHERE t.oid = $1 "; -// Range types weren't added until Postgres 9.2, so pg_range may not exist -const TYPEINFO_FALLBACK_QUERY: &str = "\ -SELECT t.typname, t.typtype, t.typelem, NULL::OID, t.typbasetype, n.nspname, t.typrelid -FROM pg_catalog.pg_type t -INNER JOIN pg_catalog.pg_namespace n ON t.typnamespace = n.oid -WHERE t.oid = $1 -"; - const TYPEINFO_ENUM_QUERY: &str = "\ SELECT enumlabel FROM pg_catalog.pg_enum @@ -39,14 +29,6 @@ WHERE enumtypid = $1 ORDER BY enumsortorder "; -// Postgres 9.0 didn't have enumsortorder -const TYPEINFO_ENUM_FALLBACK_QUERY: &str = "\ -SELECT enumlabel -FROM pg_catalog.pg_enum -WHERE enumtypid = $1 -ORDER BY oid -"; - pub(crate) const TYPEINFO_COMPOSITE_QUERY: &str = "\ SELECT attname, atttypid FROM pg_catalog.pg_attribute @@ -56,15 +38,13 @@ AND attnum > 0 ORDER BY attnum "; -static NEXT_ID: AtomicUsize = AtomicUsize::new(0); - pub async fn prepare( client: &Arc, + name: &'static str, query: &str, types: &[Type], ) -> Result { - let name = format!("s{}", NEXT_ID.fetch_add(1, Ordering::SeqCst)); - let buf = encode(client, &name, query, types)?; + let buf = encode(client, name, query, types)?; let mut responses = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)))?; match responses.next().await? 
{ @@ -105,10 +85,11 @@ pub async fn prepare( fn prepare_rec<'a>( client: &'a Arc, + name: &'static str, query: &'a str, types: &'a [Type], ) -> Pin> + 'a + Send>> { - Box::pin(prepare(client, query, types)) + Box::pin(prepare(client, name, query, types)) } fn encode(client: &InnerClient, name: &str, query: &str, types: &[Type]) -> Result { @@ -192,13 +173,8 @@ async fn typeinfo_statement(client: &Arc) -> Result stmt, - Err(ref e) if e.code() == Some(&SqlState::UNDEFINED_TABLE) => { - prepare_rec(client, TYPEINFO_FALLBACK_QUERY, &[]).await? - } - Err(e) => return Err(e), - }; + let typeinfo = "neon_proxy_typeinfo"; + let stmt = prepare_rec(client, typeinfo, TYPEINFO_QUERY, &[]).await?; client.set_typeinfo(&stmt); Ok(stmt) @@ -219,13 +195,8 @@ async fn typeinfo_enum_statement(client: &Arc) -> Result stmt, - Err(ref e) if e.code() == Some(&SqlState::UNDEFINED_COLUMN) => { - prepare_rec(client, TYPEINFO_ENUM_FALLBACK_QUERY, &[]).await? - } - Err(e) => return Err(e), - }; + let typeinfo = "neon_proxy_typeinfo_enum"; + let stmt = prepare_rec(client, typeinfo, TYPEINFO_ENUM_QUERY, &[]).await?; client.set_typeinfo_enum(&stmt); Ok(stmt) @@ -255,7 +226,8 @@ async fn typeinfo_composite_statement(client: &Arc) -> Result( - client: &InnerClient, - statement: Statement, - params: I, -) -> Result -where - I: IntoIterator, - I::IntoIter: ExactSizeIterator, -{ - let buf = if log_enabled!(Level::Debug) { - let params = params.into_iter().collect::>(); - debug!( - "executing statement {} with parameters: {:?}", - statement.name(), - BorrowToSqlParamsDebug(params.as_slice()), - ); - encode(client, &statement, params)? - } else { - encode(client, &statement, params)? - }; - let mut responses = start(client, buf).await?; - - let mut rows = 0; - loop { - match responses.next().await? { - Message::DataRow(_) => {} - Message::CommandComplete(body) => { - rows = body - .tag() - .map_err(Error::parse)? 
- .rsplit(' ') - .next() - .unwrap() - .parse() - .unwrap_or(0); - } - Message::EmptyQueryResponse => rows = 0, - Message::ReadyForQuery(_) => return Ok(rows), - _ => return Err(Error::unexpected_message()), - } - } -} - async fn start(client: &InnerClient, buf: Bytes) -> Result { let mut responses = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)))?; diff --git a/libs/proxy/tokio-postgres2/src/statement.rs b/libs/proxy/tokio-postgres2/src/statement.rs index 22e160fc05..591872fbc5 100644 --- a/libs/proxy/tokio-postgres2/src/statement.rs +++ b/libs/proxy/tokio-postgres2/src/statement.rs @@ -13,7 +13,7 @@ use std::{ struct StatementInner { client: Weak, - name: String, + name: &'static str, params: Vec, columns: Vec, } @@ -22,7 +22,7 @@ impl Drop for StatementInner { fn drop(&mut self) { if let Some(client) = self.client.upgrade() { let buf = client.with_buf(|buf| { - frontend::close(b'S', &self.name, buf).unwrap(); + frontend::close(b'S', self.name, buf).unwrap(); frontend::sync(buf); buf.split().freeze() }); @@ -40,7 +40,7 @@ pub struct Statement(Arc); impl Statement { pub(crate) fn new( inner: &Arc, - name: String, + name: &'static str, params: Vec, columns: Vec, ) -> Statement { @@ -55,14 +55,14 @@ impl Statement { pub(crate) fn new_anonymous(params: Vec, columns: Vec) -> Statement { Statement(Arc::new(StatementInner { client: Weak::new(), - name: String::new(), + name: "", params, columns, })) } pub(crate) fn name(&self) -> &str { - &self.0.name + self.0.name } /// Returns the expected types of the statement's parameters. 
diff --git a/libs/proxy/tokio-postgres2/src/to_statement.rs b/libs/proxy/tokio-postgres2/src/to_statement.rs deleted file mode 100644 index 7e12992728..0000000000 --- a/libs/proxy/tokio-postgres2/src/to_statement.rs +++ /dev/null @@ -1,57 +0,0 @@ -use crate::to_statement::private::{Sealed, ToStatementType}; -use crate::Statement; - -mod private { - use crate::{Client, Error, Statement}; - - pub trait Sealed {} - - pub enum ToStatementType<'a> { - Statement(&'a Statement), - Query(&'a str), - } - - impl ToStatementType<'_> { - pub async fn into_statement(self, client: &Client) -> Result { - match self { - ToStatementType::Statement(s) => Ok(s.clone()), - ToStatementType::Query(s) => client.prepare(s).await, - } - } - } -} - -/// A trait abstracting over prepared and unprepared statements. -/// -/// Many methods are generic over this bound, so that they support both a raw query string as well as a statement which -/// was prepared previously. -/// -/// This trait is "sealed" and cannot be implemented by anything outside this crate. -pub trait ToStatement: Sealed { - #[doc(hidden)] - fn __convert(&self) -> ToStatementType<'_>; -} - -impl ToStatement for Statement { - fn __convert(&self) -> ToStatementType<'_> { - ToStatementType::Statement(self) - } -} - -impl Sealed for Statement {} - -impl ToStatement for str { - fn __convert(&self) -> ToStatementType<'_> { - ToStatementType::Query(self) - } -} - -impl Sealed for str {} - -impl ToStatement for String { - fn __convert(&self) -> ToStatementType<'_> { - ToStatementType::Query(self) - } -} - -impl Sealed for String {} diff --git a/libs/safekeeper_api/src/membership.rs b/libs/safekeeper_api/src/membership.rs index a39fda526f..8b14a4f290 100644 --- a/libs/safekeeper_api/src/membership.rs +++ b/libs/safekeeper_api/src/membership.rs @@ -9,13 +9,43 @@ use anyhow::bail; use serde::{Deserialize, Serialize}; use utils::id::NodeId; -/// Number uniquely identifying safekeeper configuration. 
-/// Note: it is a part of sk control file. -pub type Generation = u32; /// 1 is the first valid generation, 0 is used as /// a placeholder before we fully migrate to generations. -pub const INVALID_GENERATION: Generation = 0; -pub const INITIAL_GENERATION: Generation = 1; +pub const INVALID_GENERATION: SafekeeperGeneration = SafekeeperGeneration::new(0); +pub const INITIAL_GENERATION: SafekeeperGeneration = SafekeeperGeneration::new(1); + +/// Number uniquely identifying safekeeper configuration. +/// Note: it is a part of sk control file. +/// +/// Like tenant generations, but for safekeepers. +#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)] +pub struct SafekeeperGeneration(u32); + +impl SafekeeperGeneration { + pub const fn new(v: u32) -> Self { + Self(v) + } + + #[track_caller] + pub fn previous(&self) -> Option { + Some(Self(self.0.checked_sub(1)?)) + } + + #[track_caller] + pub fn next(&self) -> Self { + Self(self.0 + 1) + } + + pub fn into_inner(self) -> u32 { + self.0 + } +} + +impl Display for SafekeeperGeneration { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.0) + } +} /// Membership is defined by ids so e.g. walproposer uses them to figure out /// quorums, but we also carry host and port to give wp idea where to connect. @@ -89,7 +119,7 @@ impl Display for MemberSet { #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] pub struct Configuration { /// Unique id. - pub generation: Generation, + pub generation: SafekeeperGeneration, /// Current members of the configuration. pub members: MemberSet, /// Some means it is a joint conf. 
diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs index 30418b0efd..41ccdaa428 100644 --- a/libs/safekeeper_api/src/models.rs +++ b/libs/safekeeper_api/src/models.rs @@ -282,3 +282,18 @@ pub struct TimelineTermBumpResponse { pub struct SafekeeperUtilization { pub timeline_count: u64, } + +/// pull_timeline request body. +#[derive(Debug, Deserialize, Serialize)] +pub struct PullTimelineRequest { + pub tenant_id: TenantId, + pub timeline_id: TimelineId, + pub http_hosts: Vec, +} + +#[derive(Debug, Serialize, Deserialize)] +pub struct PullTimelineResponse { + // Donor safekeeper host + pub safekeeper_host: String, + // TODO: add more fields? +} diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 0f10300959..62e0f4cfba 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -24,11 +24,10 @@ diatomic-waker.workspace = true git-version.workspace = true hex = { workspace = true, features = ["serde"] } humantime.workspace = true -inferno.workspace = true fail.workspace = true futures = { workspace = true } jsonwebtoken.workspace = true -nix.workspace = true +nix = {workspace = true, features = [ "ioctl" ] } once_cell.workspace = true pin-project-lite.workspace = true regex.workspace = true diff --git a/libs/utils/src/bin_ser.rs b/libs/utils/src/bin_ser.rs index 42b45eeea0..4d173d0726 100644 --- a/libs/utils/src/bin_ser.rs +++ b/libs/utils/src/bin_ser.rs @@ -286,6 +286,11 @@ mod tests { const SHORT2_ENC_LE: &[u8] = &[8, 0, 0, 3, 7]; const SHORT2_ENC_LE_TRAILING: &[u8] = &[8, 0, 0, 3, 7, 0xff, 0xff, 0xff]; + #[derive(Debug, PartialEq, Eq, Serialize, Deserialize)] + struct NewTypeStruct(u32); + const NT1: NewTypeStruct = NewTypeStruct(414243); + const NT1_INNER: u32 = 414243; + #[derive(Debug, PartialEq, Eq, Serialize, Deserialize)] pub struct LongMsg { pub tag: u8, @@ -408,4 +413,42 @@ mod tests { let msg2 = LongMsg::des(&encoded).unwrap(); assert_eq!(msg, msg2); } + + #[test] + /// Ensure that newtype wrappers around 
u32 don't change the serialization format + fn be_nt() { + use super::BeSer; + + assert_eq!(NT1.serialized_size().unwrap(), 4); + + let msg = NT1; + + let encoded = msg.ser().unwrap(); + let expected = hex_literal::hex!("0006 5223"); + assert_eq!(encoded, expected); + + assert_eq!(encoded, NT1_INNER.ser().unwrap()); + + let msg2 = NewTypeStruct::des(&encoded).unwrap(); + assert_eq!(msg, msg2); + } + + #[test] + /// Ensure that newtype wrappers around u32 don't change the serialization format + fn le_nt() { + use super::LeSer; + + assert_eq!(NT1.serialized_size().unwrap(), 4); + + let msg = NT1; + + let encoded = msg.ser().unwrap(); + let expected = hex_literal::hex!("2352 0600"); + assert_eq!(encoded, expected); + + assert_eq!(encoded, NT1_INNER.ser().unwrap()); + + let msg2 = NewTypeStruct::des(&encoded).unwrap(); + assert_eq!(msg, msg2); + } } diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 820ff2d5ea..9389a27bf3 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -93,6 +93,9 @@ pub mod try_rcu; pub mod guard_arc_swap; +#[cfg(target_os = "linux")] +pub mod linux_socket_ioctl; + // Re-export used in macro. Avoids adding git-version as dep in target crates. #[doc(hidden)] pub use git_version; diff --git a/libs/utils/src/linux_socket_ioctl.rs b/libs/utils/src/linux_socket_ioctl.rs new file mode 100644 index 0000000000..5ae0e86af8 --- /dev/null +++ b/libs/utils/src/linux_socket_ioctl.rs @@ -0,0 +1,35 @@ +//! Linux-specific socket ioctls. +//! +//! 
+ +use std::{ + io, + mem::MaybeUninit, + os::{fd::RawFd, raw::c_int}, +}; + +use nix::libc::{FIONREAD, TIOCOUTQ}; + +unsafe fn do_ioctl(socket_fd: RawFd, cmd: nix::libc::Ioctl) -> io::Result { + let mut inq: MaybeUninit = MaybeUninit::uninit(); + let err = nix::libc::ioctl(socket_fd, cmd, inq.as_mut_ptr()); + if err == 0 { + Ok(inq.assume_init()) + } else { + Err(io::Error::last_os_error()) + } +} + +/// # Safety +/// +/// Caller must ensure that `socket_fd` is a valid TCP socket file descriptor. +pub unsafe fn inq(socket_fd: RawFd) -> io::Result { + do_ioctl(socket_fd, FIONREAD) +} + +/// # Safety +/// +/// Caller must ensure that `socket_fd` is a valid TCP socket file descriptor. +pub unsafe fn outq(socket_fd: RawFd) -> io::Result { + do_ioctl(socket_fd, TIOCOUTQ) +} diff --git a/libs/utils/src/shard.rs b/libs/utils/src/shard.rs index 6352ea9f92..d98284f969 100644 --- a/libs/utils/src/shard.rs +++ b/libs/utils/src/shard.rs @@ -117,6 +117,10 @@ impl TenantShardId { ) } + pub fn range(&self) -> RangeInclusive { + RangeInclusive::new(*self, *self) + } + pub fn shard_slug(&self) -> impl std::fmt::Display + '_ { ShardSlug(self) } diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index da7ec5abce..bb0f64ca32 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -477,6 +477,26 @@ impl Client { self.request(Method::POST, &uri, ()).await.map(|_| ()) } + pub async fn timeline_download_heatmap_layers( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + concurrency: Option, + ) -> Result<()> { + let mut path = reqwest::Url::parse(&format!( + "{}/v1/tenant/{}/timeline/{}/download_heatmap_layers", + self.mgmt_api_endpoint, tenant_shard_id, timeline_id + )) + .expect("Cannot build URL"); + + if let Some(concurrency) = concurrency { + path.query_pairs_mut() + .append_pair("concurrency", &format!("{}", concurrency)); + } + + self.request(Method::POST, path, ()).await.map(|_| ()) + } + pub 
async fn tenant_reset(&self, tenant_shard_id: TenantShardId) -> Result<()> { let uri = format!( "{}/v1/tenant/{}/reset", diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index a6087920fd..e03b1bbe96 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -13,7 +13,7 @@ use anyhow::{anyhow, Context}; use bytes::{BufMut, Bytes, BytesMut}; use fail::fail_point; -use pageserver_api::key::Key; +use pageserver_api::key::{rel_block_to_key, Key}; use postgres_ffi::pg_constants; use std::fmt::Write as FmtWrite; use std::time::{Instant, SystemTime}; @@ -42,8 +42,8 @@ use utils::lsn::Lsn; pub enum BasebackupError { #[error("basebackup pageserver error {0:#}")] Server(#[from] anyhow::Error), - #[error("basebackup client error {0:#}")] - Client(#[source] io::Error), + #[error("basebackup client error {0:#} when {1}")] + Client(#[source] io::Error, &'static str), } /// Create basebackup with non-rel data in it. @@ -234,7 +234,7 @@ where self.ar .append(&header, self.buf.as_slice()) .await - .map_err(BasebackupError::Client)?; + .map_err(|e| BasebackupError::Client(e, "flush"))?; self.total_blocks += nblocks; debug!("Added to basebackup slru {} relsize {}", segname, nblocks); @@ -273,9 +273,9 @@ where for dir in subdirs.iter() { let header = new_tar_header_dir(dir)?; self.ar - .append(&header, &mut io::empty()) + .append(&header, io::empty()) .await - .context("could not add directory to basebackup tarball")?; + .map_err(|e| BasebackupError::Client(e, "send_tarball"))?; } // Send config files. 
@@ -286,13 +286,13 @@ where self.ar .append(&header, data) .await - .context("could not add config file to basebackup tarball")?; + .map_err(|e| BasebackupError::Client(e, "send_tarball,pg_hba.conf"))?; } else { let header = new_tar_header(filepath, 0)?; self.ar - .append(&header, &mut io::empty()) + .append(&header, io::empty()) .await - .context("could not add config file to basebackup tarball")?; + .map_err(|e| BasebackupError::Client(e, "send_tarball,add_config_file"))?; } } if !lazy_slru_download { @@ -406,7 +406,7 @@ where self.ar .append(&header, &*content) .await - .context("could not add aux file to basebackup tarball")?; + .map_err(|e| BasebackupError::Client(e, "send_tarball,add_aux_file"))?; } if min_restart_lsn != Lsn::MAX { @@ -419,7 +419,7 @@ where self.ar .append(&header, &data[..]) .await - .context("could not add restart.lsn file to basebackup tarball")?; + .map_err(|e| BasebackupError::Client(e, "send_tarball,restart.lsn"))?; } for xid in self .timeline @@ -451,9 +451,9 @@ where let crc32 = crc32c::crc32c(&content); content.extend_from_slice(&crc32.to_le_bytes()); let header = new_tar_header("pg_logical/replorigin_checkpoint", content.len() as u64)?; - self.ar.append(&header, &*content).await.context( - "could not add pg_logical/replorigin_checkpoint file to basebackup tarball", - )?; + self.ar.append(&header, &*content).await.map_err(|e| { + BasebackupError::Client(e, "send_tarball,pg_logical/replorigin_checkpoint") + })?; } fail_point!("basebackup-before-control-file", |_| { @@ -464,7 +464,10 @@ where // Generate pg_control and bootstrap WAL segment. 
self.add_pgcontrol_file().await?; - self.ar.finish().await.map_err(BasebackupError::Client)?; + self.ar + .finish() + .await + .map_err(|e| BasebackupError::Client(e, "send_tarball,finish"))?; debug!("all tarred up!"); Ok(()) } @@ -482,9 +485,9 @@ where let file_name = dst.to_segfile_name(0); let header = new_tar_header(&file_name, 0)?; self.ar - .append(&header, &mut io::empty()) + .append(&header, io::empty()) .await - .map_err(BasebackupError::Client)?; + .map_err(|e| BasebackupError::Client(e, "add_rel,empty"))?; return Ok(()); } @@ -498,13 +501,9 @@ where for blknum in startblk..endblk { let img = self .timeline - .get_rel_page_at_lsn( - src, - blknum, - Version::Lsn(self.lsn), - self.ctx, - self.io_concurrency.clone(), - ) + // TODO: investigate using get_vectored for the entire startblk..endblk range. + // But this code path is not on the critical path for most basebackups (?). + .get(rel_block_to_key(src, blknum), self.lsn, self.ctx) .await .map_err(|e| BasebackupError::Server(e.into()))?; segment_data.extend_from_slice(&img[..]); @@ -515,7 +514,7 @@ where self.ar .append(&header, segment_data.as_slice()) .await - .map_err(BasebackupError::Client)?; + .map_err(|e| BasebackupError::Client(e, "add_rel,segment"))?; seg += 1; startblk = endblk; @@ -566,7 +565,7 @@ where self.ar .append(&header, pg_version_str.as_bytes()) .await - .map_err(BasebackupError::Client)?; + .map_err(|e| BasebackupError::Client(e, "add_dbdir,PG_VERSION"))?; info!("timeline.pg_version {}", self.timeline.pg_version); @@ -576,7 +575,7 @@ where self.ar .append(&header, &img[..]) .await - .map_err(BasebackupError::Client)?; + .map_err(|e| BasebackupError::Client(e, "add_dbdir,global/pg_filenode.map"))?; } else { warn!("global/pg_filenode.map is missing"); } @@ -612,9 +611,9 @@ where let path = format!("base/{}", dbnode); let header = new_tar_header_dir(&path)?; self.ar - .append(&header, &mut io::empty()) + .append(&header, io::empty()) .await - .map_err(BasebackupError::Client)?; + 
.map_err(|e| BasebackupError::Client(e, "add_dbdir,base"))?; if let Some(img) = relmap_img { let dst_path = format!("base/{}/PG_VERSION", dbnode); @@ -627,14 +626,14 @@ where self.ar .append(&header, pg_version_str.as_bytes()) .await - .map_err(BasebackupError::Client)?; + .map_err(|e| BasebackupError::Client(e, "add_dbdir,base/PG_VERSION"))?; let relmap_path = format!("base/{}/pg_filenode.map", dbnode); let header = new_tar_header(&relmap_path, img.len() as u64)?; self.ar .append(&header, &img[..]) .await - .map_err(BasebackupError::Client)?; + .map_err(|e| BasebackupError::Client(e, "add_dbdir,base/pg_filenode.map"))?; } }; Ok(()) @@ -663,7 +662,7 @@ where self.ar .append(&header, &buf[..]) .await - .map_err(BasebackupError::Client)?; + .map_err(|e| BasebackupError::Client(e, "add_twophase_file"))?; Ok(()) } @@ -693,7 +692,7 @@ where zenith_signal.as_bytes(), ) .await - .map_err(BasebackupError::Client)?; + .map_err(|e| BasebackupError::Client(e, "add_pgcontrol_file,zenith.signal"))?; let checkpoint_bytes = self .timeline @@ -718,7 +717,7 @@ where self.ar .append(&header, &pg_control_bytes[..]) .await - .map_err(BasebackupError::Client)?; + .map_err(|e| BasebackupError::Client(e, "add_pgcontrol_file,pg_control"))?; //send wal segment let segno = self.lsn.segment_number(WAL_SEGMENT_SIZE); @@ -742,7 +741,7 @@ where self.ar .append(&header, &wal_seg[..]) .await - .map_err(BasebackupError::Client)?; + .map_err(|e| BasebackupError::Client(e, "add_pgcontrol_file,wal_segment"))?; Ok(()) } } diff --git a/pageserver/src/context.rs b/pageserver/src/context.rs index 8f2177fe5b..da9c095a15 100644 --- a/pageserver/src/context.rs +++ b/pageserver/src/context.rs @@ -98,6 +98,7 @@ pub struct RequestContext { download_behavior: DownloadBehavior, access_stats_behavior: AccessStatsBehavior, page_content_kind: PageContentKind, + read_path_debug: bool, } /// The kind of access to the page cache. 
@@ -155,6 +156,7 @@ impl RequestContextBuilder { download_behavior: DownloadBehavior::Download, access_stats_behavior: AccessStatsBehavior::Update, page_content_kind: PageContentKind::Unknown, + read_path_debug: false, }, } } @@ -168,6 +170,7 @@ impl RequestContextBuilder { download_behavior: original.download_behavior, access_stats_behavior: original.access_stats_behavior, page_content_kind: original.page_content_kind, + read_path_debug: original.read_path_debug, }, } } @@ -191,6 +194,11 @@ impl RequestContextBuilder { self } + pub(crate) fn read_path_debug(mut self, b: bool) -> Self { + self.inner.read_path_debug = b; + self + } + pub fn build(self) -> RequestContext { self.inner } @@ -291,4 +299,8 @@ impl RequestContext { pub(crate) fn page_content_kind(&self) -> PageContentKind { self.page_content_kind } + + pub(crate) fn read_path_debug(&self) -> bool { + self.read_path_debug + } } diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 4b976e7f6f..12252739fd 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -824,6 +824,38 @@ paths: schema: $ref: "#/components/schemas/TenantConfigResponse" + /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/download_heatmap_layers: + parameters: + - name: tenant_shard_id + in: path + required: true + schema: + type: string + - name: timeline_id + in: path + required: true + schema: + type: string + - name: concurrency + description: Maximum number of concurrent downloads (capped at remote storage concurrency) + in: query + required: false + schema: + type: integer + post: + description: | + Download all layers in the specified timeline's heatmap. The `tenant_shard_id` parameter + may be used to target all shards of a tenant when the unsharded form is used, or a specific + tenant shard with the sharded form. 
+ responses: + "200": + description: Success + delete: + description: Stop any on-going background downloads of heatmap layers for the specified timeline. + responses: + "200": + description: Success + /v1/utilization: get: description: | @@ -882,6 +914,8 @@ components: properties: reason: type: string + gc_blocking: + type: string TenantCreateRequest: allOf: @@ -1080,9 +1114,15 @@ components: type: integer state: type: string + min_readable_lsn: + type: string + format: hex latest_gc_cutoff_lsn: type: string format: hex + applied_gc_cutoff_lsn: + type: string + format: hex SyntheticSizeResponse: type: object diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index bd196621c1..56a84a98a8 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -68,6 +68,7 @@ use tokio_util::sync::CancellationToken; use tracing::*; use crate::config::PageServerConf; +use crate::context::RequestContextBuilder; use crate::context::{DownloadBehavior, RequestContext}; use crate::deletion_queue::DeletionQueueClient; use crate::pgdatadir_mapping::LsnForTimestamp; @@ -482,6 +483,11 @@ async fn build_timeline_info_common( let (pitr_history_size, within_ancestor_pitr) = timeline.get_pitr_history_stats(); + let min_readable_lsn = std::cmp::max( + timeline.get_gc_cutoff_lsn(), + *timeline.get_applied_gc_cutoff_lsn(), + ); + let info = TimelineInfo { tenant_id: timeline.tenant_shard_id, timeline_id: timeline.timeline_id, @@ -493,7 +499,12 @@ async fn build_timeline_info_common( initdb_lsn, last_record_lsn, prev_record_lsn: Some(timeline.get_prev_record_lsn()), - latest_gc_cutoff_lsn: *timeline.get_latest_gc_cutoff_lsn(), + // Externally, expose the lowest LSN that can be used to create a branch as the "GC cutoff", although internally + // we distinguish between the "planned" GC cutoff (PITR point) and the "latest" GC cutoff (where we + // actually trimmed data to), which can pass each other when PITR is changed. 
+ latest_gc_cutoff_lsn: min_readable_lsn, + min_readable_lsn, + applied_gc_cutoff_lsn: *timeline.get_applied_gc_cutoff_lsn(), current_logical_size: current_logical_size.size_dont_care_about_accuracy(), current_logical_size_is_accurate: match current_logical_size.accuracy() { tenant::timeline::logical_size::Accuracy::Approximate => false, @@ -1453,6 +1464,59 @@ async fn timeline_layer_scan_disposable_keys( ) } +async fn timeline_download_heatmap_layers_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + // Only used in the case where remote storage is not configured. + const DEFAULT_MAX_CONCURRENCY: usize = 100; + // A conservative default. + const DEFAULT_CONCURRENCY: usize = 16; + + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + + let desired_concurrency = + parse_query_param(&request, "concurrency")?.unwrap_or(DEFAULT_CONCURRENCY); + + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + + let state = get_state(&request); + let timeline = + active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; + + let max_concurrency = get_config(&request) + .remote_storage_config + .as_ref() + .map(|c| c.concurrency_limit()) + .unwrap_or(DEFAULT_MAX_CONCURRENCY); + let concurrency = std::cmp::min(max_concurrency, desired_concurrency); + + timeline.start_heatmap_layers_download(concurrency).await?; + + json_response(StatusCode::ACCEPTED, ()) +} + +async fn timeline_shutdown_download_heatmap_layers_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + + let state = get_state(&request); + let timeline = + 
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; + + timeline.stop_and_drain_heatmap_layers_download().await; + + json_response(StatusCode::OK, ()) +} + async fn layer_download_handler( request: Request, _cancel: CancellationToken, @@ -2331,6 +2395,7 @@ async fn timeline_checkpoint_handler( match e { CompactionError::ShuttingDown => ApiError::ShuttingDown, CompactionError::Offload(e) => ApiError::InternalServerError(anyhow::anyhow!(e)), + CompactionError::CollectKeySpaceError(e) => ApiError::InternalServerError(anyhow::anyhow!(e)), CompactionError::Other(e) => ApiError::InternalServerError(e) } )?; @@ -2508,14 +2573,30 @@ async fn deletion_queue_flush( } } -/// Try if `GetPage@Lsn` is successful, useful for manual debugging. async fn getpage_at_lsn_handler( + request: Request, + cancel: CancellationToken, +) -> Result, ApiError> { + getpage_at_lsn_handler_inner(false, request, cancel).await +} + +async fn touchpage_at_lsn_handler( + request: Request, + cancel: CancellationToken, +) -> Result, ApiError> { + getpage_at_lsn_handler_inner(true, request, cancel).await +} + +/// Try if `GetPage@Lsn` is successful, useful for manual debugging. +async fn getpage_at_lsn_handler_inner( + touch: bool, request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; - check_permission(&request, Some(tenant_shard_id.tenant_id))?; + // Require pageserver admin permission for this API instead of only tenant-level token. + check_permission(&request, None)?; let state = get_state(&request); struct Key(pageserver_api::key::Key); @@ -2530,22 +2611,29 @@ async fn getpage_at_lsn_handler( let key: Key = parse_query_param(&request, "key")? .ok_or_else(|| ApiError::BadRequest(anyhow!("missing 'key' query parameter")))?; - let lsn: Lsn = parse_query_param(&request, "lsn")? 
- .ok_or_else(|| ApiError::BadRequest(anyhow!("missing 'lsn' query parameter")))?; + let lsn: Option = parse_query_param(&request, "lsn")?; async { let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); + // Enable read path debugging + let ctx = RequestContextBuilder::extend(&ctx).read_path_debug(true).build(); let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?; + // Use last_record_lsn if no lsn is provided + let lsn = lsn.unwrap_or_else(|| timeline.get_last_record_lsn()); let page = timeline.get(key.0, lsn, &ctx).await?; - Result::<_, ApiError>::Ok( - Response::builder() - .status(StatusCode::OK) - .header(header::CONTENT_TYPE, "application/octet-stream") - .body(hyper::Body::from(page)) - .unwrap(), - ) + if touch { + json_response(StatusCode::OK, ()) + } else { + Result::<_, ApiError>::Ok( + Response::builder() + .status(StatusCode::OK) + .header(header::CONTENT_TYPE, "application/octet-stream") + .body(hyper::Body::from(page)) + .unwrap(), + ) + } } .instrument(info_span!("timeline_get", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id)) .await @@ -3616,6 +3704,14 @@ pub fn make_router( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer", |r| api_handler(r, layer_map_info_handler), ) + .post( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_heatmap_layers", + |r| api_handler(r, timeline_download_heatmap_layers_handler), + ) + .delete( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_heatmap_layers", + |r| api_handler(r, timeline_shutdown_download_heatmap_layers_handler), + ) .get( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer/:layer_file_name", |r| api_handler(r, layer_download_handler), @@ -3672,6 +3768,10 @@ pub fn make_router( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/getpage", |r| testing_api_handler("getpage@lsn", r, getpage_at_lsn_handler), ) + .get( + 
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/touchpage", + |r| api_handler(r, touchpage_at_lsn_handler), + ) .get( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/keyspace", |r| api_handler(r, timeline_collect_keyspace), diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 983a3079e4..e1c26b0684 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1,5 +1,6 @@ use std::collections::HashMap; use std::num::NonZeroUsize; +use std::os::fd::RawFd; use std::pin::Pin; use std::sync::atomic::AtomicU64; use std::sync::{Arc, Mutex}; @@ -129,7 +130,7 @@ pub(crate) static LAYERS_PER_READ: Lazy = Lazy::new(|| { "Layers visited to serve a single read (read amplification). In a batch, all visited layers count towards every read.", &["tenant_id", "shard_id", "timeline_id"], // Low resolution to reduce cardinality. - vec![1.0, 5.0, 10.0, 25.0, 50.0, 100.0], + vec![4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0], ) .expect("failed to define a metric") }); @@ -1439,27 +1440,66 @@ impl Drop for SmgrOpTimer { } impl SmgrOpFlushInProgress { - pub(crate) async fn measure(self, mut started_at: Instant, mut fut: Fut) -> O + /// The caller must guarantee that `socket_fd`` outlives this function. + pub(crate) async fn measure( + self, + started_at: Instant, + mut fut: Fut, + socket_fd: RawFd, + ) -> O where Fut: std::future::Future, { let mut fut = std::pin::pin!(fut); - // Whenever observe_guard gets called, or dropped, - // it adds the time elapsed since its last call to metrics. - // Last call is tracked in `now`. 
+ let mut logged = false; + let mut last_counter_increment_at = started_at; let mut observe_guard = scopeguard::guard( - || { + |is_timeout| { let now = Instant::now(); - let elapsed = now - started_at; - self.global_micros - .inc_by(u64::try_from(elapsed.as_micros()).unwrap()); - self.per_timeline_micros - .inc_by(u64::try_from(elapsed.as_micros()).unwrap()); - started_at = now; + + // Increment counter + { + let elapsed_since_last_observe = now - last_counter_increment_at; + self.global_micros + .inc_by(u64::try_from(elapsed_since_last_observe.as_micros()).unwrap()); + self.per_timeline_micros + .inc_by(u64::try_from(elapsed_since_last_observe.as_micros()).unwrap()); + last_counter_increment_at = now; + } + + // Log something on every timeout, and on completion but only if we hit a timeout. + if is_timeout || logged { + logged = true; + let elapsed_total = now - started_at; + let msg = if is_timeout { + "slow flush ongoing" + } else { + "slow flush completed or cancelled" + }; + + let (inq, outq) = { + // SAFETY: caller guarantees that `socket_fd` outlives this function. 
+ #[cfg(target_os = "linux")] + unsafe { + ( + utils::linux_socket_ioctl::inq(socket_fd).unwrap_or(-2), + utils::linux_socket_ioctl::outq(socket_fd).unwrap_or(-2), + ) + } + #[cfg(not(target_os = "linux"))] + { + _ = socket_fd; // appease unused lint on macOS + (-1, -1) + } + }; + + let elapsed_total_secs = format!("{:.6}", elapsed_total.as_secs_f64()); + tracing::info!(elapsed_total_secs, inq, outq, msg); + } }, |mut observe| { - observe(); + observe(false); }, ); @@ -1467,7 +1507,7 @@ impl SmgrOpFlushInProgress { match tokio::time::timeout(Duration::from_secs(10), &mut fut).await { Ok(v) => return v, Err(_timeout) => { - (*observe_guard)(); + (*observe_guard)(true); } } } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 972dad34d4..0c8da6f2a8 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -73,6 +73,7 @@ use pageserver_api::models::PageTraceEvent; use pageserver_api::reltag::SlruKind; use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID; use postgres_ffi::BLCKSZ; +use std::os::fd::AsRawFd; /// How long we may wait for a [`crate::tenant::mgr::TenantSlot::InProgress`]` and/or a [`crate::tenant::Tenant`] which /// is not yet in state [`TenantState::Active`]. 
@@ -236,7 +237,7 @@ pub async fn libpq_listener_main( type ConnectionHandlerResult = anyhow::Result<()>; -#[instrument(skip_all, fields(peer_addr))] +#[instrument(skip_all, fields(peer_addr, application_name))] #[allow(clippy::too_many_arguments)] async fn page_service_conn_main( conf: &'static PageServerConf, @@ -257,6 +258,8 @@ async fn page_service_conn_main( .set_nodelay(true) .context("could not set TCP_NODELAY")?; + let socket_fd = socket.as_raw_fd(); + let peer_addr = socket.peer_addr().context("get peer address")?; tracing::Span::current().record("peer_addr", field::display(peer_addr)); @@ -305,7 +308,7 @@ async fn page_service_conn_main( cancel.clone(), gate_guard, ); - let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?; + let pgbackend = PostgresBackend::new_from_io(socket_fd, socket, peer_addr, auth_type, None)?; match pgbackend.run(&mut conn_handler, &cancel).await { Ok(()) => { @@ -914,7 +917,7 @@ impl PageServerHandler { &shard, req.hdr.request_lsn, req.hdr.not_modified_since, - &shard.get_latest_gc_cutoff_lsn(), + &shard.get_applied_gc_cutoff_lsn(), ctx, ) // TODO: if we actually need to wait for lsn here, it delays the entire batch which doesn't need to wait @@ -1286,12 +1289,15 @@ impl PageServerHandler { ))?; // what we want to do + let socket_fd = pgb_writer.socket_fd; let flush_fut = pgb_writer.flush(); // metric for how long flushing takes let flush_fut = match flushing_timer { - Some(flushing_timer) => { - futures::future::Either::Left(flushing_timer.measure(Instant::now(), flush_fut)) - } + Some(flushing_timer) => futures::future::Either::Left(flushing_timer.measure( + Instant::now(), + flush_fut, + socket_fd, + )), None => futures::future::Either::Right(flush_fut), }; // do it while respecting cancellation @@ -1793,6 +1799,13 @@ impl PageServerHandler { .as_millis() .to_string() }); + + info!( + "acquired lease for {} until {}", + lsn, + valid_until_str.as_deref().unwrap_or("") + ); + let bytes = 
valid_until_str.as_ref().map(|x| x.as_bytes()); pgb.write_message_noflush(&BeMessage::RowDescription(&[RowDescriptor::text_col( @@ -1810,7 +1823,7 @@ impl PageServerHandler { req: &PagestreamExistsRequest, ctx: &RequestContext, ) -> Result { - let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); + let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( timeline, req.hdr.request_lsn, @@ -1837,7 +1850,7 @@ impl PageServerHandler { req: &PagestreamNblocksRequest, ctx: &RequestContext, ) -> Result { - let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); + let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( timeline, req.hdr.request_lsn, @@ -1864,7 +1877,7 @@ impl PageServerHandler { req: &PagestreamDbSizeRequest, ctx: &RequestContext, ) -> Result { - let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); + let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( timeline, req.hdr.request_lsn, @@ -1954,7 +1967,7 @@ impl PageServerHandler { req: &PagestreamGetSlruSegmentRequest, ctx: &RequestContext, ) -> Result { - let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); + let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( timeline, req.hdr.request_lsn, @@ -2050,7 +2063,8 @@ impl PageServerHandler { { fn map_basebackup_error(err: BasebackupError) -> QueryError { match err { - BasebackupError::Client(e) => QueryError::Disconnected(ConnectionError::Io(e)), + // TODO: passthrough the error site to the final error message? 
+ BasebackupError::Client(e, _) => QueryError::Disconnected(ConnectionError::Io(e)), BasebackupError::Server(e) => QueryError::Other(e), } } @@ -2071,7 +2085,7 @@ impl PageServerHandler { //return Err(QueryError::NotFound("timeline is archived".into())) } - let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); + let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn(); if let Some(lsn) = lsn { // Backup was requested at a particular LSN. Wait for it to arrive. info!("waiting for {}", lsn); @@ -2151,10 +2165,12 @@ impl PageServerHandler { .await .map_err(map_basebackup_error)?; } - writer - .flush() - .await - .map_err(|e| map_basebackup_error(BasebackupError::Client(e)))?; + writer.flush().await.map_err(|e| { + map_basebackup_error(BasebackupError::Client( + e, + "handle_basebackup_request,flush", + )) + })?; } pgb.write_message_noflush(&BeMessage::CopyDone) @@ -2454,9 +2470,16 @@ where fn startup( &mut self, _pgb: &mut PostgresBackend, - _sm: &FeStartupPacket, + sm: &FeStartupPacket, ) -> Result<(), QueryError> { fail::fail_point!("ps::connection-start::startup-packet"); + + if let FeStartupPacket::StartupMessage { params, .. 
} = sm { + if let Some(app_name) = params.get("application_name") { + Span::current().record("application_name", field::display(app_name)); + } + }; + Ok(()) } diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 00f332d797..ae2762bd1e 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -23,13 +23,14 @@ use anyhow::{ensure, Context}; use bytes::{Buf, Bytes, BytesMut}; use enum_map::Enum; use itertools::Itertools; -use pageserver_api::key::Key; use pageserver_api::key::{ dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key, - relmap_file_key, repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key, - slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range, - CompactKey, AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY, + rel_tag_sparse_key_range, relmap_file_key, repl_origin_key, repl_origin_key_range, + slru_block_to_key, slru_dir_to_key, slru_segment_key_range, slru_segment_size_to_key, + twophase_file_key, twophase_key_range, CompactKey, RelDirExists, AUX_FILES_KEY, CHECKPOINT_KEY, + CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY, }; +use pageserver_api::key::{rel_tag_sparse_key, Key}; use pageserver_api::keyspace::SparseKeySpace; use pageserver_api::record::NeonWalRecord; use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; @@ -490,12 +491,33 @@ impl Timeline { if !dbdirs.contains_key(&(tag.spcnode, tag.dbnode)) { return Ok(false); } - // fetch directory listing + + // Read path: first read the new reldir keyspace. Early return if the relation exists. + // Otherwise, read the old reldir keyspace. + // TODO: if IndexPart::rel_size_migration is `Migrated`, we only need to read from v2. 
+ + if self.get_rel_size_v2_enabled() { + // fetch directory listing (new) + let key = rel_tag_sparse_key(tag.spcnode, tag.dbnode, tag.relnode, tag.forknum); + let buf = RelDirExists::decode_option(version.sparse_get(self, key, ctx).await?) + .map_err(|_| PageReconstructError::Other(anyhow::anyhow!("invalid reldir key")))?; + let exists_v2 = buf == RelDirExists::Exists; + // Fast path: if the relation exists in the new format, return true. + // TODO: we should have a verification mode that checks both keyspaces + // to ensure the relation only exists in one of them. + if exists_v2 { + return Ok(true); + } + } + + // fetch directory listing (old) + let key = rel_dir_to_key(tag.spcnode, tag.dbnode); let buf = version.get(self, key, ctx).await?; let dir = RelDirectory::des(&buf)?; - Ok(dir.rels.contains(&(tag.relnode, tag.forknum))) + let exists_v1 = dir.rels.contains(&(tag.relnode, tag.forknum)); + Ok(exists_v1) } /// Get a list of all existing relations in given tablespace and database. @@ -513,12 +535,12 @@ impl Timeline { version: Version<'_>, ctx: &RequestContext, ) -> Result, PageReconstructError> { - // fetch directory listing + // fetch directory listing (old) let key = rel_dir_to_key(spcnode, dbnode); let buf = version.get(self, key, ctx).await?; let dir = RelDirectory::des(&buf)?; - let rels: HashSet = + let rels_v1: HashSet = HashSet::from_iter(dir.rels.iter().map(|(relnode, forknum)| RelTag { spcnode, dbnode, @@ -526,6 +548,46 @@ impl Timeline { forknum: *forknum, })); + if !self.get_rel_size_v2_enabled() { + return Ok(rels_v1); + } + + // scan directory listing (new), merge with the old results + let key_range = rel_tag_sparse_key_range(spcnode, dbnode); + let io_concurrency = IoConcurrency::spawn_from_conf( + self.conf, + self.gate + .enter() + .map_err(|_| PageReconstructError::Cancelled)?, + ); + let results = self + .scan( + KeySpace::single(key_range), + version.get_lsn(), + ctx, + io_concurrency, + ) + .await?; + let mut rels = rels_v1; + for (key, 
val) in results { + let val = RelDirExists::decode(&val?) + .map_err(|_| PageReconstructError::Other(anyhow::anyhow!("invalid reldir key")))?; + assert_eq!(key.field6, 1); + assert_eq!(key.field2, spcnode); + assert_eq!(key.field3, dbnode); + let tag = RelTag { + spcnode, + dbnode, + relnode: key.field4, + forknum: key.field5, + }; + if val == RelDirExists::Removed { + debug_assert!(!rels.contains(&tag), "removed reltag in v2"); + continue; + } + let did_not_contain = rels.insert(tag); + debug_assert!(did_not_contain, "duplicate reltag in v2"); + } Ok(rels) } @@ -611,7 +673,7 @@ impl Timeline { ) -> Result { pausable_failpoint!("find-lsn-for-timestamp-pausable"); - let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn(); + let gc_cutoff_lsn_guard = self.get_applied_gc_cutoff_lsn(); let gc_cutoff_planned = { let gc_info = self.gc_info.read().unwrap(); gc_info.min_cutoff() @@ -1144,7 +1206,11 @@ impl Timeline { let dense_keyspace = result.to_keyspace(); let sparse_keyspace = SparseKeySpace(KeySpace { - ranges: vec![Key::metadata_aux_key_range(), repl_origin_key_range()], + ranges: vec![ + Key::metadata_aux_key_range(), + repl_origin_key_range(), + Key::rel_dir_sparse_key_range(), + ], }); if cfg!(debug_assertions) { @@ -1274,12 +1340,22 @@ pub struct DatadirModification<'a> { /// For special "directory" keys that store key-value maps, track the size of the map /// if it was updated in this modification. - pending_directory_entries: Vec<(DirectoryKind, usize)>, + pending_directory_entries: Vec<(DirectoryKind, MetricsUpdate)>, /// An **approximation** of how many metadata bytes will be written to the EphemeralFile. 
pending_metadata_bytes: usize, } +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum MetricsUpdate { + /// Set the metrics to this value + Set(u64), + /// Increment the metrics by this value + Add(u64), + /// Decrement the metrics by this value + Sub(u64), +} + impl DatadirModification<'_> { // When a DatadirModification is committed, we do a monolithic serialization of all its contents. WAL records can // contain multiple pages, so the pageserver's record-based batch size isn't sufficient to bound this allocation: we @@ -1359,7 +1435,8 @@ impl DatadirModification<'_> { let buf = DbDirectory::ser(&DbDirectory { dbdirs: HashMap::new(), })?; - self.pending_directory_entries.push((DirectoryKind::Db, 0)); + self.pending_directory_entries + .push((DirectoryKind::Db, MetricsUpdate::Set(0))); self.put(DBDIR_KEY, Value::Image(buf.into())); let buf = if self.tline.pg_version >= 17 { @@ -1372,7 +1449,7 @@ impl DatadirModification<'_> { }) }?; self.pending_directory_entries - .push((DirectoryKind::TwoPhase, 0)); + .push((DirectoryKind::TwoPhase, MetricsUpdate::Set(0))); self.put(TWOPHASEDIR_KEY, Value::Image(buf.into())); let buf: Bytes = SlruSegmentDirectory::ser(&SlruSegmentDirectory::default())?.into(); @@ -1382,17 +1459,23 @@ impl DatadirModification<'_> { // harmless but they'd just be dropped on later compaction. 
if self.tline.tenant_shard_id.is_shard_zero() { self.put(slru_dir_to_key(SlruKind::Clog), empty_dir.clone()); - self.pending_directory_entries - .push((DirectoryKind::SlruSegment(SlruKind::Clog), 0)); + self.pending_directory_entries.push(( + DirectoryKind::SlruSegment(SlruKind::Clog), + MetricsUpdate::Set(0), + )); self.put( slru_dir_to_key(SlruKind::MultiXactMembers), empty_dir.clone(), ); - self.pending_directory_entries - .push((DirectoryKind::SlruSegment(SlruKind::Clog), 0)); + self.pending_directory_entries.push(( + DirectoryKind::SlruSegment(SlruKind::Clog), + MetricsUpdate::Set(0), + )); self.put(slru_dir_to_key(SlruKind::MultiXactOffsets), empty_dir); - self.pending_directory_entries - .push((DirectoryKind::SlruSegment(SlruKind::MultiXactOffsets), 0)); + self.pending_directory_entries.push(( + DirectoryKind::SlruSegment(SlruKind::MultiXactOffsets), + MetricsUpdate::Set(0), + )); } Ok(()) @@ -1658,10 +1741,16 @@ impl DatadirModification<'_> { } if r.is_none() { // Create RelDirectory + // TODO: if we have fully migrated to v2, no need to create this directory let buf = RelDirectory::ser(&RelDirectory { rels: HashSet::new(), })?; - self.pending_directory_entries.push((DirectoryKind::Rel, 0)); + self.pending_directory_entries + .push((DirectoryKind::Rel, MetricsUpdate::Set(0))); + if self.tline.get_rel_size_v2_enabled() { + self.pending_directory_entries + .push((DirectoryKind::RelV2, MetricsUpdate::Set(0))); + } self.put( rel_dir_to_key(spcnode, dbnode), Value::Image(Bytes::from(buf)), @@ -1685,8 +1774,10 @@ impl DatadirModification<'_> { if !dir.xids.insert(xid) { anyhow::bail!("twophase file for xid {} already exists", xid); } - self.pending_directory_entries - .push((DirectoryKind::TwoPhase, dir.xids.len())); + self.pending_directory_entries.push(( + DirectoryKind::TwoPhase, + MetricsUpdate::Set(dir.xids.len() as u64), + )); Bytes::from(TwoPhaseDirectoryV17::ser(&dir)?) 
} else { let xid = xid as u32; @@ -1694,8 +1785,10 @@ impl DatadirModification<'_> { if !dir.xids.insert(xid) { anyhow::bail!("twophase file for xid {} already exists", xid); } - self.pending_directory_entries - .push((DirectoryKind::TwoPhase, dir.xids.len())); + self.pending_directory_entries.push(( + DirectoryKind::TwoPhase, + MetricsUpdate::Set(dir.xids.len() as u64), + )); Bytes::from(TwoPhaseDirectory::ser(&dir)?) }; self.put(TWOPHASEDIR_KEY, Value::Image(newdirbuf)); @@ -1744,8 +1837,10 @@ impl DatadirModification<'_> { let mut dir = DbDirectory::des(&buf)?; if dir.dbdirs.remove(&(spcnode, dbnode)).is_some() { let buf = DbDirectory::ser(&dir)?; - self.pending_directory_entries - .push((DirectoryKind::Db, dir.dbdirs.len())); + self.pending_directory_entries.push(( + DirectoryKind::Db, + MetricsUpdate::Set(dir.dbdirs.len() as u64), + )); self.put(DBDIR_KEY, Value::Image(buf.into())); } else { warn!( @@ -1778,39 +1873,85 @@ impl DatadirModification<'_> { // tablespace. Create the reldir entry for it if so. let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await.context("read db")?) .context("deserialize db")?; - let rel_dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode); - let mut rel_dir = + + let dbdir_exists = if let hash_map::Entry::Vacant(e) = dbdir.dbdirs.entry((rel.spcnode, rel.dbnode)) { // Didn't exist. Update dbdir e.insert(false); let buf = DbDirectory::ser(&dbdir).context("serialize db")?; - self.pending_directory_entries - .push((DirectoryKind::Db, dbdir.dbdirs.len())); + self.pending_directory_entries.push(( + DirectoryKind::Db, + MetricsUpdate::Set(dbdir.dbdirs.len() as u64), + )); self.put(DBDIR_KEY, Value::Image(buf.into())); - - // and create the RelDirectory - RelDirectory::default() + false } else { - // reldir already exists, fetch it - RelDirectory::des(&self.get(rel_dir_key, ctx).await.context("read db")?) - .context("deserialize db")? 
+ true }; + let rel_dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode); + let mut rel_dir = if !dbdir_exists { + // Create the RelDirectory + RelDirectory::default() + } else { + // reldir already exists, fetch it + RelDirectory::des(&self.get(rel_dir_key, ctx).await.context("read db")?) + .context("deserialize db")? + }; + // Add the new relation to the rel directory entry, and write it back if !rel_dir.rels.insert((rel.relnode, rel.forknum)) { return Err(RelationError::AlreadyExists); } - self.pending_directory_entries - .push((DirectoryKind::Rel, rel_dir.rels.len())); - - self.put( - rel_dir_key, - Value::Image(Bytes::from( - RelDirectory::ser(&rel_dir).context("serialize")?, - )), - ); - + if self.tline.get_rel_size_v2_enabled() { + let sparse_rel_dir_key = + rel_tag_sparse_key(rel.spcnode, rel.dbnode, rel.relnode, rel.forknum); + // check if the rel_dir_key exists in v2 + let val = self + .sparse_get(sparse_rel_dir_key, ctx) + .await + .map_err(|e| RelationError::Other(e.into()))?; + let val = RelDirExists::decode_option(val) + .map_err(|_| RelationError::Other(anyhow::anyhow!("invalid reldir key")))?; + if val == RelDirExists::Exists { + return Err(RelationError::AlreadyExists); + } + self.put( + sparse_rel_dir_key, + Value::Image(RelDirExists::Exists.encode()), + ); + if !dbdir_exists { + self.pending_directory_entries + .push((DirectoryKind::Rel, MetricsUpdate::Set(0))); + self.pending_directory_entries + .push((DirectoryKind::RelV2, MetricsUpdate::Set(0))); + // We don't write `rel_dir_key -> rel_dir.rels` back to the storage in the v2 path unless it's the initial creation. + // TODO: if we have fully migrated to v2, no need to create this directory. Otherwise, there + // will be key not found errors if we don't create an empty one for rel_size_v2. 
+ self.put( + rel_dir_key, + Value::Image(Bytes::from( + RelDirectory::ser(&RelDirectory::default()).context("serialize")?, + )), + ); + } + self.pending_directory_entries + .push((DirectoryKind::RelV2, MetricsUpdate::Add(1))); + } else { + if !dbdir_exists { + self.pending_directory_entries + .push((DirectoryKind::Rel, MetricsUpdate::Set(0))) + } + self.pending_directory_entries + .push((DirectoryKind::Rel, MetricsUpdate::Add(1))); + self.put( + rel_dir_key, + Value::Image(Bytes::from( + RelDirectory::ser(&rel_dir).context("serialize")?, + )), + ); + } // Put size let size_key = rel_size_to_key(rel); let buf = nblocks.to_le_bytes(); @@ -1896,9 +2037,34 @@ impl DatadirModification<'_> { let mut dirty = false; for rel_tag in rel_tags { - if dir.rels.remove(&(rel_tag.relnode, rel_tag.forknum)) { + let found = if dir.rels.remove(&(rel_tag.relnode, rel_tag.forknum)) { + self.pending_directory_entries + .push((DirectoryKind::Rel, MetricsUpdate::Sub(1))); dirty = true; + true + } else if self.tline.get_rel_size_v2_enabled() { + // The rel is not found in the old reldir key, so we need to check the new sparse keyspace. + // Note that a relation can only exist in one of the two keyspaces (guaranteed by the ingestion + // logic). + let key = + rel_tag_sparse_key(spc_node, db_node, rel_tag.relnode, rel_tag.forknum); + let val = RelDirExists::decode_option(self.sparse_get(key, ctx).await?) 
+ .map_err(|_| RelationError::Other(anyhow::anyhow!("invalid reldir key")))?; + if val == RelDirExists::Exists { + self.pending_directory_entries + .push((DirectoryKind::RelV2, MetricsUpdate::Sub(1))); + // put tombstone + self.put(key, Value::Image(RelDirExists::Removed.encode())); + // no need to set dirty to true + true + } else { + false + } + } else { + false + }; + if found { // update logical size let size_key = rel_size_to_key(rel_tag); let old_size = self.get(size_key, ctx).await?.get_u32_le(); @@ -1914,8 +2080,6 @@ impl DatadirModification<'_> { if dirty { self.put(dir_key, Value::Image(Bytes::from(RelDirectory::ser(&dir)?))); - self.pending_directory_entries - .push((DirectoryKind::Rel, dir.rels.len())); } } @@ -1939,8 +2103,10 @@ impl DatadirModification<'_> { if !dir.segments.insert(segno) { anyhow::bail!("slru segment {kind:?}/{segno} already exists"); } - self.pending_directory_entries - .push((DirectoryKind::SlruSegment(kind), dir.segments.len())); + self.pending_directory_entries.push(( + DirectoryKind::SlruSegment(kind), + MetricsUpdate::Set(dir.segments.len() as u64), + )); self.put( dir_key, Value::Image(Bytes::from(SlruSegmentDirectory::ser(&dir)?)), @@ -1987,8 +2153,10 @@ impl DatadirModification<'_> { if !dir.segments.remove(&segno) { warn!("slru segment {:?}/{} does not exist", kind, segno); } - self.pending_directory_entries - .push((DirectoryKind::SlruSegment(kind), dir.segments.len())); + self.pending_directory_entries.push(( + DirectoryKind::SlruSegment(kind), + MetricsUpdate::Set(dir.segments.len() as u64), + )); self.put( dir_key, Value::Image(Bytes::from(SlruSegmentDirectory::ser(&dir)?)), @@ -2020,8 +2188,10 @@ impl DatadirModification<'_> { if !dir.xids.remove(&xid) { warn!("twophase file for xid {} does not exist", xid); } - self.pending_directory_entries - .push((DirectoryKind::TwoPhase, dir.xids.len())); + self.pending_directory_entries.push(( + DirectoryKind::TwoPhase, + MetricsUpdate::Set(dir.xids.len() as u64), + )); 
Bytes::from(TwoPhaseDirectoryV17::ser(&dir)?) } else { let xid: u32 = u32::try_from(xid)?; @@ -2030,8 +2200,10 @@ impl DatadirModification<'_> { if !dir.xids.remove(&xid) { warn!("twophase file for xid {} does not exist", xid); } - self.pending_directory_entries - .push((DirectoryKind::TwoPhase, dir.xids.len())); + self.pending_directory_entries.push(( + DirectoryKind::TwoPhase, + MetricsUpdate::Set(dir.xids.len() as u64), + )); Bytes::from(TwoPhaseDirectory::ser(&dir)?) }; self.put(TWOPHASEDIR_KEY, Value::Image(newdirbuf)); @@ -2147,7 +2319,7 @@ impl DatadirModification<'_> { } for (kind, count) in std::mem::take(&mut self.pending_directory_entries) { - writer.update_directory_entries_count(kind, count as u64); + writer.update_directory_entries_count(kind, count); } Ok(()) @@ -2233,7 +2405,7 @@ impl DatadirModification<'_> { } for (kind, count) in std::mem::take(&mut self.pending_directory_entries) { - writer.update_directory_entries_count(kind, count as u64); + writer.update_directory_entries_count(kind, count); } self.pending_metadata_bytes = 0; @@ -2297,6 +2469,22 @@ impl DatadirModification<'_> { self.tline.get(key, lsn, ctx).await } + /// Get a key from the sparse keyspace. Automatically converts the missing key error + /// and the empty value into None. + async fn sparse_get( + &self, + key: Key, + ctx: &RequestContext, + ) -> Result, PageReconstructError> { + let val = self.get(key, ctx).await; + match val { + Ok(val) if val.is_empty() => Ok(None), + Ok(val) => Ok(Some(val)), + Err(PageReconstructError::MissingKey(_)) => Ok(None), + Err(e) => Err(e), + } + } + fn put(&mut self, key: Key, val: Value) { if Self::is_data_key(&key) { self.put_data(key.to_compact(), val) @@ -2379,6 +2567,23 @@ impl Version<'_> { } } + /// Get a key from the sparse keyspace. Automatically converts the missing key error + /// and the empty value into None. 
+ async fn sparse_get( + &self, + timeline: &Timeline, + key: Key, + ctx: &RequestContext, + ) -> Result, PageReconstructError> { + let val = self.get(timeline, key, ctx).await; + match val { + Ok(val) if val.is_empty() => Ok(None), + Ok(val) => Ok(Some(val)), + Err(PageReconstructError::MissingKey(_)) => Ok(None), + Err(e) => Err(e), + } + } + fn get_lsn(&self) -> Lsn { match self { Version::Lsn(lsn) => *lsn, @@ -2438,6 +2643,7 @@ pub(crate) enum DirectoryKind { Rel, AuxFiles, SlruSegment(SlruKind), + RelV2, } impl DirectoryKind { diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 4c65991e45..efb35625f2 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -40,6 +40,8 @@ use remote_timeline_client::manifest::{ use remote_timeline_client::UploadQueueNotReadyError; use remote_timeline_client::FAILED_REMOTE_OP_RETRIES; use remote_timeline_client::FAILED_UPLOAD_WARN_THRESHOLD; +use secondary::heatmap::HeatMapTenant; +use secondary::heatmap::HeatMapTimeline; use std::collections::BTreeMap; use std::fmt; use std::future::Future; @@ -55,6 +57,7 @@ use timeline::offload::OffloadError; use timeline::CompactFlags; use timeline::CompactOptions; use timeline::CompactionError; +use timeline::PreviousHeatmap; use timeline::ShutdownMode; use tokio::io::BufReader; use tokio::sync::watch; @@ -262,6 +265,7 @@ struct TimelinePreload { timeline_id: TimelineId, client: RemoteTimelineClient, index_part: Result, + previous_heatmap: Option, } pub(crate) struct TenantPreload { @@ -1128,6 +1132,7 @@ impl Tenant { resources: TimelineResources, mut index_part: IndexPart, metadata: TimelineMetadata, + previous_heatmap: Option, ancestor: Option>, cause: LoadTimelineCause, ctx: &RequestContext, @@ -1158,6 +1163,7 @@ impl Tenant { let timeline = self.create_timeline_struct( timeline_id, &metadata, + previous_heatmap, ancestor.clone(), resources, CreateTimelineCause::Load, @@ -1557,8 +1563,18 @@ impl Tenant { } } + // TODO(vlad): Could go to S3 if the 
secondary is freezing cold and hasn't even + // pulled the first heatmap. Not entirely necessary since the storage controller + // will kick the secondary in any case and cause a download. + let maybe_heatmap_at = self.read_on_disk_heatmap().await; + let timelines = self - .load_timelines_metadata(remote_timeline_ids, remote_storage, cancel) + .load_timelines_metadata( + remote_timeline_ids, + remote_storage, + maybe_heatmap_at, + cancel, + ) .await?; Ok(TenantPreload { @@ -1571,6 +1587,26 @@ impl Tenant { }) } + async fn read_on_disk_heatmap(&self) -> Option<(HeatMapTenant, std::time::Instant)> { + let on_disk_heatmap_path = self.conf.tenant_heatmap_path(&self.tenant_shard_id); + match tokio::fs::read_to_string(on_disk_heatmap_path).await { + Ok(heatmap) => match serde_json::from_str::(&heatmap) { + Ok(heatmap) => Some((heatmap, std::time::Instant::now())), + Err(err) => { + error!("Failed to deserialize old heatmap: {err}"); + None + } + }, + Err(err) => match err.kind() { + std::io::ErrorKind::NotFound => None, + _ => { + error!("Unexpected IO error reading old heatmap: {err}"); + None + } + }, + } + } + /// /// Background task that downloads all data for a tenant and brings it to Active state. /// @@ -1658,7 +1694,10 @@ impl Tenant { match index_part { MaybeDeletedIndexPart::IndexPart(index_part) => { timeline_ancestors.insert(timeline_id, index_part.metadata.clone()); - remote_index_and_client.insert(timeline_id, (index_part, preload.client)); + remote_index_and_client.insert( + timeline_id, + (index_part, preload.client, preload.previous_heatmap), + ); } MaybeDeletedIndexPart::Deleted(index_part) => { info!( @@ -1677,7 +1716,7 @@ impl Tenant { // layer file. 
let sorted_timelines = tree_sort_timelines(timeline_ancestors, |m| m.ancestor_timeline())?; for (timeline_id, remote_metadata) in sorted_timelines { - let (index_part, remote_client) = remote_index_and_client + let (index_part, remote_client, previous_heatmap) = remote_index_and_client .remove(&timeline_id) .expect("just put it in above"); @@ -1697,6 +1736,7 @@ impl Tenant { timeline_id, index_part, remote_metadata, + previous_heatmap, self.get_timeline_resources_for(remote_client), LoadTimelineCause::Attach, ctx, @@ -1846,11 +1886,13 @@ impl Tenant { } #[instrument(skip_all, fields(timeline_id=%timeline_id))] + #[allow(clippy::too_many_arguments)] async fn load_remote_timeline( self: &Arc, timeline_id: TimelineId, index_part: IndexPart, remote_metadata: TimelineMetadata, + previous_heatmap: Option, resources: TimelineResources, cause: LoadTimelineCause, ctx: &RequestContext, @@ -1880,6 +1922,7 @@ impl Tenant { resources, index_part, remote_metadata, + previous_heatmap, ancestor, cause, ctx, @@ -1891,14 +1934,29 @@ impl Tenant { self: &Arc, timeline_ids: HashSet, remote_storage: &GenericRemoteStorage, + heatmap: Option<(HeatMapTenant, std::time::Instant)>, cancel: CancellationToken, ) -> anyhow::Result> { + let mut timeline_heatmaps = heatmap.map(|h| (h.0.into_timelines_index(), h.1)); + let mut part_downloads = JoinSet::new(); for timeline_id in timeline_ids { let cancel_clone = cancel.clone(); + + let previous_timeline_heatmap = timeline_heatmaps.as_mut().and_then(|hs| { + hs.0.remove(&timeline_id).map(|h| PreviousHeatmap::Active { + heatmap: h, + read_at: hs.1, + }) + }); part_downloads.spawn( - self.load_timeline_metadata(timeline_id, remote_storage.clone(), cancel_clone) - .instrument(info_span!("download_index_part", %timeline_id)), + self.load_timeline_metadata( + timeline_id, + remote_storage.clone(), + previous_timeline_heatmap, + cancel_clone, + ) + .instrument(info_span!("download_index_part", %timeline_id)), ); } @@ -1946,6 +2004,7 @@ impl Tenant { 
self: &Arc, timeline_id: TimelineId, remote_storage: GenericRemoteStorage, + previous_heatmap: Option, cancel: CancellationToken, ) -> impl Future { let client = self.build_timeline_client(timeline_id, remote_storage); @@ -1961,6 +2020,7 @@ impl Tenant { client, timeline_id, index_part, + previous_heatmap, } } } @@ -2072,7 +2132,12 @@ impl Tenant { })?; let timeline_preload = self - .load_timeline_metadata(timeline_id, self.remote_storage.clone(), cancel.clone()) + .load_timeline_metadata( + timeline_id, + self.remote_storage.clone(), + None, + cancel.clone(), + ) .await; let index_part = match timeline_preload.index_part { @@ -2106,6 +2171,7 @@ impl Tenant { timeline_id, index_part, remote_metadata, + None, timeline_resources, LoadTimelineCause::Unoffload, &ctx, @@ -2821,7 +2887,7 @@ impl Tenant { }; let metadata = index_part.metadata.clone(); self - .load_remote_timeline(timeline_id, index_part, metadata, resources, LoadTimelineCause::ImportPgdata{ + .load_remote_timeline(timeline_id, index_part, metadata, None, resources, LoadTimelineCause::ImportPgdata{ create_guard: timeline_create_guard, activate, }, &ctx) .await? .ready_to_activate() @@ -3035,6 +3101,9 @@ impl Tenant { if let Some(queue) = queue { outcome = queue .iteration(cancel, ctx, &self.gc_block, &timeline) + .instrument( + info_span!("gc_compact_timeline", timeline_id = %timeline.timeline_id), + ) .await?; } } @@ -3081,6 +3150,12 @@ impl Tenant { // Offload failures don't trip the circuit breaker, since they're cheap to retry and // shouldn't block compaction. 
CompactionError::Offload(_) => {} + CompactionError::CollectKeySpaceError(err) => { + self.compaction_circuit_breaker + .lock() + .unwrap() + .fail(&CIRCUIT_BREAKERS_BROKEN, err); + } CompactionError::Other(err) => { self.compaction_circuit_breaker .lock() @@ -3858,6 +3933,13 @@ impl Tenant { .unwrap_or(self.conf.default_tenant_conf.compaction_threshold) } + pub fn get_rel_size_v2_enabled(&self) -> bool { + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); + tenant_conf + .rel_size_v2_enabled + .unwrap_or(self.conf.default_tenant_conf.rel_size_v2_enabled) + } + pub fn get_compaction_upper_limit(&self) -> usize { let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf @@ -4030,6 +4112,7 @@ impl Tenant { &self, new_timeline_id: TimelineId, new_metadata: &TimelineMetadata, + previous_heatmap: Option, ancestor: Option>, resources: TimelineResources, cause: CreateTimelineCause, @@ -4053,6 +4136,7 @@ impl Tenant { self.conf, Arc::clone(&self.tenant_conf), new_metadata, + previous_heatmap, ancestor, new_timeline_id, self.tenant_shard_id, @@ -4695,24 +4779,24 @@ impl Tenant { // We check it against both the planned GC cutoff stored in 'gc_info', // and the 'latest_gc_cutoff' of the last GC that was performed. The // planned GC cutoff in 'gc_info' is normally larger than - // 'latest_gc_cutoff_lsn', but beware of corner cases like if you just + // 'applied_gc_cutoff_lsn', but beware of corner cases like if you just // changed the GC settings for the tenant to make the PITR window // larger, but some of the data was already removed by an earlier GC // iteration. 
// check against last actual 'latest_gc_cutoff' first - let latest_gc_cutoff_lsn = src_timeline.get_latest_gc_cutoff_lsn(); + let applied_gc_cutoff_lsn = src_timeline.get_applied_gc_cutoff_lsn(); { let gc_info = src_timeline.gc_info.read().unwrap(); let planned_cutoff = gc_info.min_cutoff(); if gc_info.lsn_covered_by_lease(start_lsn) { - tracing::info!("skipping comparison of {start_lsn} with gc cutoff {} and planned gc cutoff {planned_cutoff} due to lsn lease", *latest_gc_cutoff_lsn); + tracing::info!("skipping comparison of {start_lsn} with gc cutoff {} and planned gc cutoff {planned_cutoff} due to lsn lease", *applied_gc_cutoff_lsn); } else { src_timeline - .check_lsn_is_in_scope(start_lsn, &latest_gc_cutoff_lsn) + .check_lsn_is_in_scope(start_lsn, &applied_gc_cutoff_lsn) .context(format!( "invalid branch start lsn: less than latest GC cutoff {}", - *latest_gc_cutoff_lsn, + *applied_gc_cutoff_lsn, )) .map_err(CreateTimelineError::AncestorLsn)?; @@ -4751,7 +4835,7 @@ impl Tenant { dst_prev, Some(src_id), start_lsn, - *src_timeline.latest_gc_cutoff_lsn.read(), // FIXME: should we hold onto this guard longer? + *src_timeline.applied_gc_cutoff_lsn.read(), // FIXME: should we hold onto this guard longer? 
src_timeline.initdb_lsn, src_timeline.pg_version, ); @@ -5124,6 +5208,7 @@ impl Tenant { .create_timeline_struct( new_timeline_id, new_metadata, + None, ancestor, resources, CreateTimelineCause::Load, @@ -5571,7 +5656,7 @@ pub(crate) mod harness { lsn_lease_length_for_ts: Some(tenant_conf.lsn_lease_length_for_ts), timeline_offloading: Some(tenant_conf.timeline_offloading), wal_receiver_protocol_override: tenant_conf.wal_receiver_protocol_override, - rel_size_v2_enabled: tenant_conf.rel_size_v2_enabled, + rel_size_v2_enabled: Some(tenant_conf.rel_size_v2_enabled), gc_compaction_enabled: Some(tenant_conf.gc_compaction_enabled), gc_compaction_initial_threshold_kb: Some( tenant_conf.gc_compaction_initial_threshold_kb, @@ -6130,8 +6215,8 @@ mod tests { make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?; repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO)?; - let latest_gc_cutoff_lsn = tline.get_latest_gc_cutoff_lsn(); - assert!(*latest_gc_cutoff_lsn > Lsn(0x25)); + let applied_gc_cutoff_lsn = tline.get_applied_gc_cutoff_lsn(); + assert!(*applied_gc_cutoff_lsn > Lsn(0x25)); match tline.get(*TEST_KEY, Lsn(0x25)) { Ok(_) => panic!("request for page should have failed"), Err(err) => assert!(err.to_string().contains("not found at")), @@ -7770,18 +7855,6 @@ mod tests { } tline.freeze_and_flush().await?; - // Force layers to L1 - tline - .compact( - &cancel, - { - let mut flags = EnumSet::new(); - flags.insert(CompactFlags::ForceL0Compaction); - flags - }, - &ctx, - ) - .await?; if iter % 5 == 0 { let (_, before_delta_file_accessed) = @@ -7794,7 +7867,6 @@ mod tests { let mut flags = EnumSet::new(); flags.insert(CompactFlags::ForceImageLayerCreation); flags.insert(CompactFlags::ForceRepartition); - flags.insert(CompactFlags::ForceL0Compaction); flags }, &ctx, @@ -8241,8 +8313,6 @@ mod tests { let cancel = CancellationToken::new(); - // Image layer creation happens on the disk_consistent_lsn so we need to force set it now. 
- tline.force_set_disk_consistent_lsn(Lsn(0x40)); tline .compact( &cancel, @@ -8256,7 +8326,8 @@ mod tests { ) .await .unwrap(); - // Image layers are created at repartition LSN + + // Image layers are created at last_record_lsn let images = tline .inspect_image_layers(Lsn(0x40), &ctx, io_concurrency.clone()) .await @@ -8427,7 +8498,7 @@ mod tests { .await?; { tline - .latest_gc_cutoff_lsn + .applied_gc_cutoff_lsn .lock_for_write() .store_and_unlock(Lsn(0x30)) .wait() @@ -8535,7 +8606,7 @@ mod tests { // increase GC horizon and compact again { tline - .latest_gc_cutoff_lsn + .applied_gc_cutoff_lsn .lock_for_write() .store_and_unlock(Lsn(0x40)) .wait() @@ -8703,8 +8774,8 @@ mod tests { // Force set disk consistent lsn so we can get the cutoff at `end_lsn`. info!( - "latest_gc_cutoff_lsn: {}", - *timeline.get_latest_gc_cutoff_lsn() + "applied_gc_cutoff_lsn: {}", + *timeline.get_applied_gc_cutoff_lsn() ); timeline.force_set_disk_consistent_lsn(end_lsn); @@ -8730,7 +8801,7 @@ mod tests { // Make lease on a already GC-ed LSN. 
// 0/80 does not have a valid lease + is below latest_gc_cutoff - assert!(Lsn(0x80) < *timeline.get_latest_gc_cutoff_lsn()); + assert!(Lsn(0x80) < *timeline.get_applied_gc_cutoff_lsn()); timeline .init_lsn_lease(Lsn(0x80), timeline.get_lsn_lease_length(), &ctx) .expect_err("lease request on GC-ed LSN should fail"); @@ -8921,7 +8992,7 @@ mod tests { }; { tline - .latest_gc_cutoff_lsn + .applied_gc_cutoff_lsn .lock_for_write() .store_and_unlock(Lsn(0x30)) .wait() @@ -9008,7 +9079,7 @@ mod tests { // increase GC horizon and compact again { tline - .latest_gc_cutoff_lsn + .applied_gc_cutoff_lsn .lock_for_write() .store_and_unlock(Lsn(0x40)) .wait() @@ -9461,7 +9532,7 @@ mod tests { .await?; { tline - .latest_gc_cutoff_lsn + .applied_gc_cutoff_lsn .lock_for_write() .store_and_unlock(Lsn(0x30)) .wait() @@ -9608,7 +9679,7 @@ mod tests { // increase GC horizon and compact again { tline - .latest_gc_cutoff_lsn + .applied_gc_cutoff_lsn .lock_for_write() .store_and_unlock(Lsn(0x38)) .wait() @@ -9709,7 +9780,7 @@ mod tests { .await?; { tline - .latest_gc_cutoff_lsn + .applied_gc_cutoff_lsn .lock_for_write() .store_and_unlock(Lsn(0x30)) .wait() @@ -9960,7 +10031,7 @@ mod tests { { parent_tline - .latest_gc_cutoff_lsn + .applied_gc_cutoff_lsn .lock_for_write() .store_and_unlock(Lsn(0x10)) .wait() @@ -9980,7 +10051,7 @@ mod tests { { branch_tline - .latest_gc_cutoff_lsn + .applied_gc_cutoff_lsn .lock_for_write() .store_and_unlock(Lsn(0x50)) .wait() @@ -10336,7 +10407,7 @@ mod tests { { tline - .latest_gc_cutoff_lsn + .applied_gc_cutoff_lsn .lock_for_write() .store_and_unlock(Lsn(0x30)) .wait() @@ -10721,7 +10792,7 @@ mod tests { .await?; { tline - .latest_gc_cutoff_lsn + .applied_gc_cutoff_lsn .lock_for_write() .store_and_unlock(Lsn(0x30)) .wait() @@ -10972,7 +11043,7 @@ mod tests { .await?; { tline - .latest_gc_cutoff_lsn + .applied_gc_cutoff_lsn .lock_for_write() .store_and_unlock(Lsn(0x30)) .wait() diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs 
index 7fdfd736ad..c6bcfdf2fb 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -485,7 +485,9 @@ impl TenantConfOpt { wal_receiver_protocol_override: self .wal_receiver_protocol_override .or(global_conf.wal_receiver_protocol_override), - rel_size_v2_enabled: self.rel_size_v2_enabled.or(global_conf.rel_size_v2_enabled), + rel_size_v2_enabled: self + .rel_size_v2_enabled + .unwrap_or(global_conf.rel_size_v2_enabled), gc_compaction_enabled: self .gc_compaction_enabled .unwrap_or(global_conf.gc_compaction_enabled), diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index d281eb305f..15c6955260 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -130,7 +130,10 @@ struct TimelineMetadataBodyV2 { prev_record_lsn: Option, ancestor_timeline: Option, ancestor_lsn: Lsn, + + // The LSN at which GC was last executed. Synonym of [`Timeline::applied_gc_cutoff_lsn`]. latest_gc_cutoff_lsn: Lsn, + initdb_lsn: Lsn, pg_version: u32, } diff --git a/pageserver/src/tenant/secondary/heatmap.rs b/pageserver/src/tenant/secondary/heatmap.rs index 4a8e66d38a..0fa10ca294 100644 --- a/pageserver/src/tenant/secondary/heatmap.rs +++ b/pageserver/src/tenant/secondary/heatmap.rs @@ -1,4 +1,4 @@ -use std::time::SystemTime; +use std::{collections::HashMap, time::SystemTime}; use crate::tenant::{remote_timeline_client::index::LayerFileMetadata, storage_layer::LayerName}; @@ -8,7 +8,7 @@ use serde_with::{serde_as, DisplayFromStr, TimestampSeconds}; use utils::{generation::Generation, id::TimelineId}; #[derive(Serialize, Deserialize)] -pub(super) struct HeatMapTenant { +pub(crate) struct HeatMapTenant { /// Generation of the attached location that uploaded the heatmap: this is not required /// for correctness, but acts as a hint to secondary locations in order to detect thrashing /// in the unlikely event that two attached locations are both uploading conflicting heatmaps. 
@@ -25,8 +25,17 @@ pub(super) struct HeatMapTenant { pub(super) upload_period_ms: Option, } +impl HeatMapTenant { + pub(crate) fn into_timelines_index(self) -> HashMap { + self.timelines + .into_iter() + .map(|htl| (htl.timeline_id, htl)) + .collect() + } +} + #[serde_as] -#[derive(Serialize, Deserialize)] +#[derive(Serialize, Deserialize, Clone)] pub(crate) struct HeatMapTimeline { #[serde_as(as = "DisplayFromStr")] pub(crate) timeline_id: TimelineId, @@ -35,13 +44,13 @@ pub(crate) struct HeatMapTimeline { } #[serde_as] -#[derive(Serialize, Deserialize)] +#[derive(Serialize, Deserialize, Clone)] pub(crate) struct HeatMapLayer { pub(crate) name: LayerName, pub(crate) metadata: LayerFileMetadata, #[serde_as(as = "TimestampSeconds")] - pub(super) access_time: SystemTime, + pub(crate) access_time: SystemTime, // TODO: an actual 'heat' score that would let secondary locations prioritize downloading // the hottest layers, rather than trying to simply mirror whatever layers are on-disk on the primary. 
} diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index 6c3276ea3c..1e84a9d9dc 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -394,7 +394,7 @@ pub(super) async fn gather_inputs( ancestor_lsn, last_record: last_record_lsn, // this is not used above, because it might not have updated recently enough - latest_gc_cutoff: *timeline.get_latest_gc_cutoff_lsn(), + latest_gc_cutoff: *timeline.get_applied_gc_cutoff_lsn(), next_pitr_cutoff, retention_param_cutoff, lease_points, diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 40282defd4..0bf606cf0a 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -136,6 +136,22 @@ pub(crate) fn local_layer_path( } } +pub(crate) enum LastEviction { + Never, + At(std::time::Instant), + Evicting, +} + +impl LastEviction { + pub(crate) fn happened_after(&self, timepoint: std::time::Instant) -> bool { + match self { + LastEviction::Never => false, + LastEviction::At(evicted_at) => evicted_at > &timepoint, + LastEviction::Evicting => true, + } + } +} + impl Layer { /// Creates a layer value for a file we know to not be resident. pub(crate) fn for_evicted( @@ -405,6 +421,17 @@ impl Layer { self.0.metadata() } + pub(crate) fn last_evicted_at(&self) -> LastEviction { + match self.0.last_evicted_at.try_lock() { + Ok(lock) => match *lock { + None => LastEviction::Never, + Some(at) => LastEviction::At(at), + }, + Err(std::sync::TryLockError::WouldBlock) => LastEviction::Evicting, + Err(std::sync::TryLockError::Poisoned(p)) => panic!("Lock poisoned: {p}"), + } + } + pub(crate) fn get_timeline_id(&self) -> Option { self.0 .timeline @@ -656,7 +683,9 @@ struct LayerInner { /// When the Layer was last evicted but has not been downloaded since. /// - /// This is used solely for updating metrics. See [`LayerImplMetrics::redownload_after`]. 
+ /// This is used for skipping evicted layers from the previous heatmap (see + /// `[Timeline::generate_heatmap]`) and for updating metrics + /// (see [`LayerImplMetrics::redownload_after`]). last_evicted_at: std::sync::Mutex>, #[cfg(test)] diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 029444e973..5e63f59fd8 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -287,6 +287,7 @@ fn log_compaction_error( sleep_duration: Duration, task_cancelled: bool, ) { + use crate::pgdatadir_mapping::CollectKeySpaceError; use crate::tenant::upload_queue::NotInitialized; use crate::tenant::PageReconstructError; use CompactionError::*; @@ -294,6 +295,8 @@ fn log_compaction_error( let level = match err { ShuttingDown => return, Offload(_) => Level::ERROR, + CollectKeySpaceError(CollectKeySpaceError::Cancelled) => Level::INFO, + CollectKeySpaceError(_) => Level::ERROR, _ if task_cancelled => Level::INFO, Other(err) => { let root_cause = err.root_cause(); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index aa71ccbbab..48c208d5d7 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -4,6 +4,7 @@ pub mod delete; pub(crate) mod detach_ancestor; mod eviction_task; pub(crate) mod handle; +mod heatmap_layers_downloader; pub(crate) mod import_pgdata; mod init; pub mod layer_manager; @@ -21,6 +22,7 @@ use chrono::{DateTime, Utc}; use compaction::CompactionOutcome; use enumset::EnumSet; use fail::fail_point; +use futures::FutureExt; use futures::{stream::FuturesUnordered, StreamExt}; use handle::ShardTimelineId; use layer_manager::Shutdown; @@ -117,7 +119,7 @@ use pageserver_api::config::tenant_conf_defaults::DEFAULT_PITR_INTERVAL; use crate::config::PageServerConf; use crate::keyspace::{KeyPartitioning, KeySpace}; use crate::metrics::{TimelineMetrics, DELTAS_PER_READ_GLOBAL, LAYERS_PER_READ_GLOBAL}; -use 
crate::pgdatadir_mapping::CalculateLogicalSizeError; +use crate::pgdatadir_mapping::{CalculateLogicalSizeError, MetricsUpdate}; use crate::tenant::config::TenantConfOpt; use pageserver_api::reltag::RelTag; use pageserver_api::shard::ShardIndex; @@ -150,16 +152,15 @@ use super::{ config::TenantConf, storage_layer::LayerVisibilityHint, upload_queue::NotInitialized, MaybeOffloaded, }; -use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf}; +use super::{ + debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf, HeatMapTimeline, +}; use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe}; use super::{ remote_timeline_client::RemoteTimelineClient, remote_timeline_client::WaitCompletionError, storage_layer::ReadableLayer, }; -use super::{ - secondary::heatmap::{HeatMapLayer, HeatMapTimeline}, - GcError, -}; +use super::{secondary::heatmap::HeatMapLayer, GcError}; #[cfg(test)] use pageserver_api::value::Value; @@ -328,6 +329,7 @@ pub struct Timeline { // in `crate::page_service` writes these metrics. pub(crate) query_metrics: crate::metrics::SmgrQueryTimePerTimeline, + directory_metrics_inited: [AtomicBool; DirectoryKind::KINDS_NUM], directory_metrics: [AtomicU64; DirectoryKind::KINDS_NUM], /// Ensures layers aren't frozen by checkpointer between @@ -352,8 +354,11 @@ pub struct Timeline { /// to be notified when layer flushing has finished, subscribe to the layer_flush_done channel layer_flush_done_tx: tokio::sync::watch::Sender<(u64, Result<(), FlushLayerError>)>, - // Needed to ensure that we can't create a branch at a point that was already garbage collected - pub latest_gc_cutoff_lsn: Rcu, + // The LSN at which we have executed GC: whereas [`Self::gc_info`] records the LSN at which + // we _intend_ to GC (i.e. the PITR cutoff), this LSN records where we actually last did it. + // Because PITR interval is mutable, it's possible for this LSN to be earlier or later than + // the planned GC cutoff. 
+ pub applied_gc_cutoff_lsn: Rcu, pub(crate) gc_compaction_layer_update_lock: tokio::sync::RwLock<()>, @@ -462,6 +467,20 @@ pub struct Timeline { /// If Some, collects GetPage metadata for an ongoing PageTrace. pub(crate) page_trace: ArcSwapOption>, + + previous_heatmap: ArcSwapOption, + + /// May host a background Tokio task which downloads all the layers from the current + /// heatmap on demand. + heatmap_layers_downloader: Mutex>, +} + +pub(crate) enum PreviousHeatmap { + Active { + heatmap: HeatMapTimeline, + read_at: std::time::Instant, + }, + Obsolete, } pub type TimelineDeleteProgress = Arc>; @@ -1077,9 +1096,15 @@ impl Timeline { (history, gc_info.within_ancestor_pitr) } - /// Lock and get timeline's GC cutoff - pub(crate) fn get_latest_gc_cutoff_lsn(&self) -> RcuReadGuard { - self.latest_gc_cutoff_lsn.read() + /// Read timeline's GC cutoff: this is the LSN at which GC has started to happen + pub(crate) fn get_applied_gc_cutoff_lsn(&self) -> RcuReadGuard { + self.applied_gc_cutoff_lsn.read() + } + + /// Read timeline's planned GC cutoff: this is the logical end of history that users + /// are allowed to read (based on configured PITR), even if physically we have more history. + pub(crate) fn get_gc_cutoff_lsn(&self) -> Lsn { + self.gc_info.read().unwrap().cutoffs.time } /// Look up given page version. @@ -1274,7 +1299,7 @@ impl Timeline { reconstruct_state: &mut ValuesReconstructState, ctx: &RequestContext, ) -> Result>, GetVectoredError> { - let read_path = if self.conf.enable_read_path_debugging { + let read_path = if self.conf.enable_read_path_debugging || ctx.read_path_debug() { Some(ReadPath::new(keyspace.clone(), lsn)) } else { None @@ -1587,7 +1612,7 @@ impl Timeline { }; if init || validate { - let latest_gc_cutoff_lsn = self.get_latest_gc_cutoff_lsn(); + let latest_gc_cutoff_lsn = self.get_applied_gc_cutoff_lsn(); if lsn < *latest_gc_cutoff_lsn { bail!("tried to request an lsn lease for an lsn below the latest gc cutoff. 
requested at {} gc cutoff {}", lsn, *latest_gc_cutoff_lsn); } @@ -1857,7 +1882,7 @@ impl Timeline { // Signal compaction failure to avoid L0 flush stalls when it's broken. match result { Ok(_) => self.compaction_failed.store(false, AtomicOrdering::Relaxed), - Err(CompactionError::Other(_)) => { + Err(CompactionError::Other(_)) | Err(CompactionError::CollectKeySpaceError(_)) => { self.compaction_failed.store(true, AtomicOrdering::Relaxed) } // Don't change the current value on offload failure or shutdown. We don't want to @@ -2020,6 +2045,11 @@ impl Timeline { tracing::debug!("Cancelling CancellationToken"); self.cancel.cancel(); + // If we have a background task downloading heatmap layers stop it. + // The background downloads are sensitive to timeline cancellation (done above), + // so the drain will be immediate. + self.stop_and_drain_heatmap_layers_download().await; + // Ensure Prevent new page service requests from starting. self.handles.shutdown(); @@ -2337,6 +2367,14 @@ impl Timeline { .unwrap_or(self.conf.default_tenant_conf.compaction_threshold) } + pub(crate) fn get_rel_size_v2_enabled(&self) -> bool { + let tenant_conf = self.tenant_conf.load(); + tenant_conf + .tenant_conf + .rel_size_v2_enabled + .unwrap_or(self.conf.default_tenant_conf.rel_size_v2_enabled) + } + fn get_compaction_upper_limit(&self) -> usize { let tenant_conf = self.tenant_conf.load(); tenant_conf @@ -2559,6 +2597,7 @@ impl Timeline { conf: &'static PageServerConf, tenant_conf: Arc>, metadata: &TimelineMetadata, + previous_heatmap: Option, ancestor: Option>, timeline_id: TimelineId, tenant_shard_id: TenantShardId, @@ -2645,6 +2684,7 @@ impl Timeline { ), directory_metrics: array::from_fn(|_| AtomicU64::new(0)), + directory_metrics_inited: array::from_fn(|_| AtomicBool::new(false)), flush_loop_state: Mutex::new(FlushLoopState::NotStarted), @@ -2659,7 +2699,7 @@ impl Timeline { LastImageLayerCreationStatus::default(), )), - latest_gc_cutoff_lsn: Rcu::new(metadata.latest_gc_cutoff_lsn()), 
+ applied_gc_cutoff_lsn: Rcu::new(metadata.latest_gc_cutoff_lsn()), initdb_lsn: metadata.initdb_lsn(), current_logical_size: if disk_consistent_lsn.is_valid() { @@ -2721,6 +2761,10 @@ impl Timeline { create_idempotency, page_trace: Default::default(), + + previous_heatmap: ArcSwapOption::from_pointee(previous_heatmap), + + heatmap_layers_downloader: Mutex::new(None), }; result.repartition_threshold = @@ -3409,8 +3453,42 @@ impl Timeline { } } - pub(crate) fn update_directory_entries_count(&self, kind: DirectoryKind, count: u64) { - self.directory_metrics[kind.offset()].store(count, AtomicOrdering::Relaxed); + pub(crate) fn update_directory_entries_count(&self, kind: DirectoryKind, count: MetricsUpdate) { + // TODO: this directory metrics is not correct -- we could have multiple reldirs in the system + // for each of the database, but we only store one value, and therefore each pgdirmodification + // would overwrite the previous value if they modify different databases. + + match count { + MetricsUpdate::Set(count) => { + self.directory_metrics[kind.offset()].store(count, AtomicOrdering::Relaxed); + self.directory_metrics_inited[kind.offset()].store(true, AtomicOrdering::Relaxed); + } + MetricsUpdate::Add(count) => { + // TODO: these operations are not atomic; but we only have one writer to the metrics, so + // it's fine. + if self.directory_metrics_inited[kind.offset()].load(AtomicOrdering::Relaxed) { + // The metrics has been initialized with `MetricsUpdate::Set` before, so we can add/sub + // the value reliably. + self.directory_metrics[kind.offset()].fetch_add(count, AtomicOrdering::Relaxed); + } + // Otherwise, ignore this update + } + MetricsUpdate::Sub(count) => { + // TODO: these operations are not atomic; but we only have one writer to the metrics, so + // it's fine. + if self.directory_metrics_inited[kind.offset()].load(AtomicOrdering::Relaxed) { + // The metrics has been initialized with `MetricsUpdate::Set` before. 
+ // The operation could overflow so we need to normalize the value. + let prev_val = + self.directory_metrics[kind.offset()].load(AtomicOrdering::Relaxed); + let res = prev_val.saturating_sub(count); + self.directory_metrics[kind.offset()].store(res, AtomicOrdering::Relaxed); + } + // Otherwise, ignore this update + } + }; + + // TODO: remove this, there's no place in the code that updates this aux metrics. let aux_metric = self.directory_metrics[DirectoryKind::AuxFiles.offset()].load(AtomicOrdering::Relaxed); @@ -3459,12 +3537,52 @@ impl Timeline { let guard = self.layers.read().await; + // Firstly, if there's any heatmap left over from when this location + // was a secondary, take that into account. Keep layers that are: + // * present in the layer map + // * visible + // * non-resident + // * not evicted since we read the heatmap + // + // Without this, a new cold, attached location would clobber the previous + // heatamp. + let previous_heatmap = self.previous_heatmap.load(); + let visible_non_resident = match previous_heatmap.as_deref() { + Some(PreviousHeatmap::Active { heatmap, read_at }) => { + Some(heatmap.layers.iter().filter_map(|hl| { + let desc: PersistentLayerDesc = hl.name.clone().into(); + let layer = guard.try_get_from_key(&desc.key())?; + + if layer.visibility() == LayerVisibilityHint::Covered { + return None; + } + + if layer.is_likely_resident() { + return None; + } + + if layer.last_evicted_at().happened_after(*read_at) { + return None; + } + + Some((desc, hl.metadata.clone(), hl.access_time)) + })) + } + Some(PreviousHeatmap::Obsolete) => None, + None => None, + }; + + // Secondly, all currently visible, resident layers are included. 
let resident = guard.likely_resident_layers().filter_map(|layer| { match layer.visibility() { LayerVisibilityHint::Visible => { // Layer is visible to one or more read LSNs: elegible for inclusion in layer map let last_activity_ts = layer.latest_activity(); - Some((layer.layer_desc(), layer.metadata(), last_activity_ts)) + Some(( + layer.layer_desc().clone(), + layer.metadata(), + last_activity_ts, + )) } LayerVisibilityHint::Covered => { // Layer is resident but unlikely to be read: not elegible for inclusion in heatmap. @@ -3473,7 +3591,18 @@ impl Timeline { } }); - let mut layers = resident.collect::>(); + let mut layers = match visible_non_resident { + Some(non_resident) => { + let mut non_resident = non_resident.peekable(); + if non_resident.peek().is_none() { + self.previous_heatmap + .store(Some(PreviousHeatmap::Obsolete.into())); + } + + non_resident.chain(resident).collect::>() + } + None => resident.collect::>(), + }; // Sort layers in order of which to download first. For a large set of layers to download, we // want to prioritize those layers which are most likely to still be in the resident many minutes @@ -3577,7 +3706,9 @@ impl Timeline { // space. If that's not the case, we had at least one key encounter a gap in the image layer // and stop the search as a result of that. let mut removed = keyspace.remove_overlapping_with(&image_covered_keyspace); - // Do not fire missing key error for sparse keys. + // Do not fire missing key error and end early for sparse keys. Note that we have already removed + // non-inherited keyspaces before, so we can safely do a full `SPARSE_RANGE` remove instead of + // figuring out what is the inherited key range and do a fine-grained pruning. removed.remove_overlapping_with(&KeySpace { ranges: vec![SPARSE_RANGE], }); @@ -3662,7 +3793,7 @@ impl Timeline { // the timeline, then it will remove layers that are required for fulfilling // the current get request (read-path cannot "look back" and notice the new // image layer). 
- let _gc_cutoff_holder = timeline.get_latest_gc_cutoff_lsn(); + let _gc_cutoff_holder = timeline.get_applied_gc_cutoff_lsn(); // See `compaction::compact_with_gc` for why we need this. let _guard = timeline.gc_compaction_layer_update_lock.read().await; @@ -4349,7 +4480,7 @@ impl Timeline { let update = crate::tenant::metadata::MetadataUpdate::new( disk_consistent_lsn, ondisk_prev_record_lsn, - *self.latest_gc_cutoff_lsn.read(), + *self.applied_gc_cutoff_lsn.read(), ); fail_point!("checkpoint-before-saving-metadata", |x| bail!( @@ -4474,7 +4605,10 @@ impl Timeline { )); } - let (dense_ks, sparse_ks) = self.collect_keyspace(lsn, ctx).await?; + let (dense_ks, sparse_ks) = self + .collect_keyspace(lsn, ctx) + .await + .map_err(CompactionError::CollectKeySpaceError)?; let dense_partitioning = dense_ks.partition(&self.shard_identity, partition_size); let sparse_partitioning = SparseKeyPartitioning { parts: vec![sparse_ks], @@ -4995,20 +5129,26 @@ impl Timeline { // image layer generation taking too long time and blocking L0 compaction. So in this // mode, we also inspect the current number of L0 layers and skip image layer generation // if there are too many of them. - let num_of_l0_layers = { - let layers = self.layers.read().await; - layers.layer_map()?.level0_deltas().len() - }; let image_preempt_threshold = self.get_image_creation_preempt_threshold() * self.get_compaction_threshold(); - if image_preempt_threshold != 0 && num_of_l0_layers >= image_preempt_threshold { - tracing::info!( - "preempt image layer generation at {lsn} when processing partition {}..{}: too many L0 layers {}", - partition.start().unwrap(), partition.end().unwrap(), num_of_l0_layers - ); - last_partition_processed = Some(partition.clone()); - all_generated = false; - break; + // TODO: currently we do not respect `get_image_creation_preempt_threshold` and always yield + // when there is a single timeline with more than L0 threshold L0 layers. 
As long as the + // `get_image_creation_preempt_threshold` is set to a value greater than 0, we will yield for L0 compaction. + if image_preempt_threshold != 0 { + let should_yield = self + .l0_compaction_trigger + .notified() + .now_or_never() + .is_some(); + if should_yield { + tracing::info!( + "preempt image layer generation at {lsn} when processing partition {}..{}: too many L0 layers", + partition.start().unwrap(), partition.end().unwrap() + ); + last_partition_processed = Some(partition.clone()); + all_generated = false; + break; + } } } } @@ -5037,14 +5177,16 @@ impl Timeline { .map(|l| l.metadata().file_size) .sum::(); - info!( - "created {} image layers ({} bytes) in {}s, processed {} out of {} partitions", - image_layers.len(), - total_layer_size, - duration.as_secs_f64(), - partition_processed, - total_partitions - ); + if !image_layers.is_empty() { + info!( + "created {} image layers ({} bytes) in {}s, processed {} out of {} partitions", + image_layers.len(), + total_layer_size, + duration.as_secs_f64(), + partition_processed, + total_partitions + ); + } Ok(( image_layers, @@ -5187,6 +5329,8 @@ pub(crate) enum CompactionError { #[error("Failed to offload timeline: {0}")] Offload(OffloadError), /// Compaction cannot be done right now; page reconstruction and so on. + #[error("Failed to collect keyspace: {0}")] + CollectKeySpaceError(CollectKeySpaceError), #[error(transparent)] Other(anyhow::Error), } @@ -5577,7 +5721,7 @@ impl Timeline { // PITR interval is set & we didn't look up a timestamp successfully. Conservatively assume PITR // cannot advance beyond what was already GC'd, and respect space-based retention GcCutoffs { - time: *self.get_latest_gc_cutoff_lsn(), + time: *self.get_applied_gc_cutoff_lsn(), space: space_cutoff, } } @@ -5698,7 +5842,7 @@ impl Timeline { let mut result: GcResult = GcResult::default(); // Nothing to GC. Return early. 
- let latest_gc_cutoff = *self.get_latest_gc_cutoff_lsn(); + let latest_gc_cutoff = *self.get_applied_gc_cutoff_lsn(); if latest_gc_cutoff >= new_gc_cutoff { info!( "Nothing to GC: new_gc_cutoff_lsn {new_gc_cutoff}, latest_gc_cutoff_lsn {latest_gc_cutoff}", @@ -5712,7 +5856,7 @@ impl Timeline { // // The GC cutoff should only ever move forwards. let waitlist = { - let write_guard = self.latest_gc_cutoff_lsn.lock_for_write(); + let write_guard = self.applied_gc_cutoff_lsn.lock_for_write(); if *write_guard > new_gc_cutoff { return Err(GcError::BadLsn { why: format!( @@ -6652,18 +6796,32 @@ fn is_send() { #[cfg(test)] mod tests { + use std::sync::Arc; + use pageserver_api::key::Key; use pageserver_api::value::Value; + use tracing::Instrument; use utils::{id::TimelineId, lsn::Lsn}; use crate::tenant::{ harness::{test_img, TenantHarness}, layer_map::LayerMap, - storage_layer::{Layer, LayerName}, + storage_layer::{Layer, LayerName, LayerVisibilityHint}, timeline::{DeltaLayerTestDesc, EvictionError}, - Timeline, + PreviousHeatmap, Timeline, }; + use super::HeatMapTimeline; + + fn assert_heatmaps_have_same_layers(lhs: &HeatMapTimeline, rhs: &HeatMapTimeline) { + assert_eq!(lhs.layers.len(), rhs.layers.len()); + let lhs_rhs = lhs.layers.iter().zip(rhs.layers.iter()); + for (l, r) in lhs_rhs { + assert_eq!(l.name, r.name); + assert_eq!(l.metadata, r.metadata); + } + } + #[tokio::test] async fn test_heatmap_generation() { let harness = TenantHarness::create("heatmap_generation").await.unwrap(); @@ -6737,7 +6895,7 @@ mod tests { assert_eq!(heatmap.layers.last().unwrap().name, l0_delta.layer_name()); let mut last_lsn = Lsn::MAX; - for layer in heatmap.layers { + for layer in &heatmap.layers { // Covered layer should be omitted assert!(layer.name != covered_delta.layer_name()); @@ -6752,6 +6910,144 @@ mod tests { last_lsn = layer_lsn; } } + + // Evict all the layers and stash the old heatmap in the timeline. + // This simulates a migration to a cold secondary location. 
+ + let guard = timeline.layers.read().await; + let mut all_layers = Vec::new(); + let forever = std::time::Duration::from_secs(120); + for layer in guard.likely_resident_layers() { + all_layers.push(layer.clone()); + layer.evict_and_wait(forever).await.unwrap(); + } + drop(guard); + + timeline + .previous_heatmap + .store(Some(Arc::new(PreviousHeatmap::Active { + heatmap: heatmap.clone(), + read_at: std::time::Instant::now(), + }))); + + // Generate a new heatmap and assert that it contains the same layers as the old one. + let post_migration_heatmap = timeline.generate_heatmap().await.unwrap(); + assert_heatmaps_have_same_layers(&heatmap, &post_migration_heatmap); + + // Download each layer one by one. Generate the heatmap at each step and check + // that it's stable. + for layer in all_layers { + if layer.visibility() == LayerVisibilityHint::Covered { + continue; + } + + eprintln!("Downloading {layer} and re-generating heatmap"); + + let _resident = layer + .download_and_keep_resident() + .instrument(tracing::info_span!( + parent: None, + "download_layer", + tenant_id = %timeline.tenant_shard_id.tenant_id, + shard_id = %timeline.tenant_shard_id.shard_slug(), + timeline_id = %timeline.timeline_id + )) + .await + .unwrap(); + + let post_download_heatmap = timeline.generate_heatmap().await.unwrap(); + assert_heatmaps_have_same_layers(&heatmap, &post_download_heatmap); + } + + // Everything from the post-migration heatmap is now resident. + // Check that we drop it from memory. 
+ assert!(matches!( + timeline.previous_heatmap.load().as_deref(), + Some(PreviousHeatmap::Obsolete) + )); + } + + #[tokio::test] + async fn test_previous_heatmap_obsoletion() { + let harness = TenantHarness::create("heatmap_previous_heatmap_obsoletion") + .await + .unwrap(); + + let l0_delta = DeltaLayerTestDesc::new( + Lsn(0x20)..Lsn(0x30), + Key::from_hex("000000000000000000000000000000000000").unwrap() + ..Key::from_hex("FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF").unwrap(), + vec![( + Key::from_hex("720000000033333333444444445500000000").unwrap(), + Lsn(0x25), + Value::Image(test_img("foo")), + )], + ); + + let image_layer = ( + Lsn(0x40), + vec![( + Key::from_hex("620000000033333333444444445500000000").unwrap(), + test_img("bar"), + )], + ); + + let delta_layers = vec![l0_delta]; + let image_layers = vec![image_layer]; + + let (tenant, ctx) = harness.load().await; + let timeline = tenant + .create_test_timeline_with_layers( + TimelineId::generate(), + Lsn(0x10), + 14, + &ctx, + delta_layers, + image_layers, + Lsn(0x100), + ) + .await + .unwrap(); + + // Layer visibility is an input to heatmap generation, so refresh it first + timeline.update_layer_visibility().await.unwrap(); + + let heatmap = timeline + .generate_heatmap() + .await + .expect("Infallible while timeline is not shut down"); + + // Both layers should be in the heatmap + assert!(!heatmap.layers.is_empty()); + + // Now simulate a migration. + timeline + .previous_heatmap + .store(Some(Arc::new(PreviousHeatmap::Active { + heatmap: heatmap.clone(), + read_at: std::time::Instant::now(), + }))); + + // Evict all the layers in the previous heatmap + let guard = timeline.layers.read().await; + let forever = std::time::Duration::from_secs(120); + for layer in guard.likely_resident_layers() { + layer.evict_and_wait(forever).await.unwrap(); + } + drop(guard); + + // Generate a new heatmap and check that the previous heatmap + // has been marked obsolete. 
+ let post_eviction_heatmap = timeline + .generate_heatmap() + .await + .expect("Infallible while timeline is not shut down"); + + assert!(post_eviction_heatmap.layers.is_empty()); + assert!(matches!( + timeline.previous_heatmap.load().as_deref(), + Some(PreviousHeatmap::Obsolete) + )); } #[tokio::test] diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 5b915c50d3..4e4f906d78 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -11,7 +11,8 @@ use std::sync::Arc; use super::layer_manager::LayerManager; use super::{ CompactFlags, CompactOptions, CreateImageLayersError, DurationRecorder, GetVectoredError, - ImageLayerCreationMode, LastImageLayerCreationStatus, RecordedDuration, Timeline, + ImageLayerCreationMode, LastImageLayerCreationStatus, PageReconstructError, RecordedDuration, + Timeline, }; use anyhow::{anyhow, bail, Context}; @@ -31,6 +32,7 @@ use utils::id::TimelineId; use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder}; use crate::page_cache; +use crate::pgdatadir_mapping::CollectKeySpaceError; use crate::statvfs::Statvfs; use crate::tenant::checks::check_valid_layermap; use crate::tenant::gc_block::GcBlock; @@ -301,18 +303,12 @@ impl GcCompactionQueue { let mut guard = self.inner.lock().unwrap(); guard.gc_guards.insert(id, gc_guard); } - let _ = timeline - .compact_with_options(cancel, options, ctx) - .instrument(info_span!("scheduled_compact_timeline", %timeline.timeline_id)) - .await?; + let _ = timeline.compact_with_options(cancel, options, ctx).await?; self.notify_and_unblock(id); } } GcCompactionQueueItem::SubCompactionJob(options) => { - let _ = timeline - .compact_with_options(cancel, options, ctx) - .instrument(info_span!("scheduled_compact_timeline", %timeline.timeline_id)) - .await?; + let _ = timeline.compact_with_options(cancel, options, ctx).await?; } GcCompactionQueueItem::Notify(id) => { 
self.notify_and_unblock(id); @@ -692,21 +688,6 @@ impl Timeline { // Define partitioning schema if needed - let l0_l1_boundary_lsn = { - // We do the repartition on the L0-L1 boundary. All data below the boundary - // are compacted by L0 with low read amplification, thus making the `repartition` - // function run fast. - let guard = self.layers.read().await; - let l0_min_lsn = guard - .layer_map()? - .level0_deltas() - .iter() - .map(|l| l.get_lsn_range().start) - .min() - .unwrap_or(self.get_disk_consistent_lsn()); - l0_min_lsn.max(self.get_ancestor_lsn()) - }; - // 1. L0 Compact let l0_outcome = { let timer = self.metrics.compact_time_histo.start_timer(); @@ -733,86 +714,89 @@ impl Timeline { return Ok(CompactionOutcome::YieldForL0); } - if l0_l1_boundary_lsn < self.partitioning.read().1 { - // We never go backwards when repartition and create image layers. - info!("skipping image layer generation because repartition LSN is greater than L0-L1 boundary LSN."); - } else { - // 2. Repartition and create image layers if necessary - match self - .repartition( - l0_l1_boundary_lsn, - self.get_compaction_target_size(), - options.flags, - ctx, - ) - .await - { - Ok(((dense_partitioning, sparse_partitioning), lsn)) => { - // Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them - let image_ctx = RequestContextBuilder::extend(ctx) - .access_stats_behavior(AccessStatsBehavior::Skip) - .build(); + // 2. 
Repartition and create image layers if necessary + match self + .repartition( + self.get_last_record_lsn(), + self.get_compaction_target_size(), + options.flags, + ctx, + ) + .await + { + Ok(((dense_partitioning, sparse_partitioning), lsn)) => { + // Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them + let image_ctx = RequestContextBuilder::extend(ctx) + .access_stats_behavior(AccessStatsBehavior::Skip) + .build(); - let mut partitioning = dense_partitioning; - partitioning - .parts - .extend(sparse_partitioning.into_dense().parts); + let mut partitioning = dense_partitioning; + partitioning + .parts + .extend(sparse_partitioning.into_dense().parts); - // 3. Create new image layers for partitions that have been modified "enough". - let (image_layers, outcome) = self - .create_image_layers( - &partitioning, - lsn, - if options - .flags - .contains(CompactFlags::ForceImageLayerCreation) - { - ImageLayerCreationMode::Force - } else { - ImageLayerCreationMode::Try - }, - &image_ctx, - self.last_image_layer_creation_status - .load() - .as_ref() - .clone(), - !options.flags.contains(CompactFlags::NoYield), - ) - .await - .inspect_err(|err| { - if let CreateImageLayersError::GetVectoredError( - GetVectoredError::MissingKey(_), - ) = err - { - critical!("missing key during compaction: {err:?}"); - } - })?; + // 3. Create new image layers for partitions that have been modified "enough". 
+ let (image_layers, outcome) = self + .create_image_layers( + &partitioning, + lsn, + if options + .flags + .contains(CompactFlags::ForceImageLayerCreation) + { + ImageLayerCreationMode::Force + } else { + ImageLayerCreationMode::Try + }, + &image_ctx, + self.last_image_layer_creation_status + .load() + .as_ref() + .clone(), + !options.flags.contains(CompactFlags::NoYield), + ) + .await + .inspect_err(|err| { + if let CreateImageLayersError::GetVectoredError( + GetVectoredError::MissingKey(_), + ) = err + { + critical!("missing key during compaction: {err:?}"); + } + })?; - self.last_image_layer_creation_status - .store(Arc::new(outcome.clone())); + self.last_image_layer_creation_status + .store(Arc::new(outcome.clone())); - self.upload_new_image_layers(image_layers)?; - if let LastImageLayerCreationStatus::Incomplete { .. } = outcome { - // Yield and do not do any other kind of compaction. - info!("skipping shard ancestor compaction due to pending image layer generation tasks (preempted by L0 compaction)."); - return Ok(CompactionOutcome::YieldForL0); - } + self.upload_new_image_layers(image_layers)?; + if let LastImageLayerCreationStatus::Incomplete { .. } = outcome { + // Yield and do not do any other kind of compaction. + info!("skipping shard ancestor compaction due to pending image layer generation tasks (preempted by L0 compaction)."); + return Ok(CompactionOutcome::YieldForL0); } - Err(err) => { - // no partitioning? This is normal, if the timeline was just created - // as an empty timeline. Also in unit tests, when we use the timeline - // as a simple key-value store, ignoring the datadir layout. Log the - // error but continue. - // - // Suppress error when it's due to cancellation - if !self.cancel.is_cancelled() && !err.is_cancelled() { + } + Err(err) => { + // no partitioning? This is normal, if the timeline was just created + // as an empty timeline. 
Also in unit tests, when we use the timeline + // as a simple key-value store, ignoring the datadir layout. Log the + // error but continue. + // + // Suppress error when it's due to cancellation + if !self.cancel.is_cancelled() && !err.is_cancelled() { + if let CompactionError::CollectKeySpaceError( + CollectKeySpaceError::Decode(_) + | CollectKeySpaceError::PageRead(PageReconstructError::MissingKey(_)), + ) = err + { + critical!("could not compact, repartitioning keyspace failed: {err:?}"); + } else { tracing::error!( "could not compact, repartitioning keyspace failed: {err:?}" ); } } - }; - } + } + }; let partition_count = self.partitioning.read().0 .0.parts.len(); @@ -852,7 +836,7 @@ impl Timeline { // // Holding this read guard also blocks [`Self::gc_timeline`] from entering while we // are rewriting layers. - let latest_gc_cutoff = self.get_latest_gc_cutoff_lsn(); + let latest_gc_cutoff = self.get_applied_gc_cutoff_lsn(); tracing::info!( "latest_gc_cutoff: {}, pitr cutoff {}", @@ -2202,7 +2186,7 @@ impl Timeline { // TODO: ensure the child branches will not use anything below the watermark, or consider // them when computing the watermark. - gc_cutoff_lsn.min(*self.get_latest_gc_cutoff_lsn()) + gc_cutoff_lsn.min(*self.get_applied_gc_cutoff_lsn()) } /// Split a gc-compaction job into multiple compaction jobs. The split is based on the key range and the estimated size of the compaction job. diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 93b7efedb8..841b2fa1c7 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -294,6 +294,7 @@ impl DeleteTimelineFlow { timeline_id, local_metadata, None, // Ancestor is not needed for deletion. + None, // Previous heatmap is not needed for deletion tenant.get_timeline_resources_for(remote_client), // Important. We dont pass ancestor above because it can be missing. // Thus we need to skip the validation here. 
diff --git a/pageserver/src/tenant/timeline/heatmap_layers_downloader.rs b/pageserver/src/tenant/timeline/heatmap_layers_downloader.rs new file mode 100644 index 0000000000..0ba9753e85 --- /dev/null +++ b/pageserver/src/tenant/timeline/heatmap_layers_downloader.rs @@ -0,0 +1,162 @@ +//! Timeline utility module to hydrate everything from the current heatmap. +//! +//! Provides utilities to spawn and abort a background task where the downloads happen. +//! See /v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_heatmap_layers. + +use futures::StreamExt; +use http_utils::error::ApiError; +use std::sync::{Arc, Mutex}; +use tokio_util::sync::CancellationToken; +use utils::sync::gate::Gate; + +use super::Timeline; + +// This status is not strictly necessary now, but gives us a nice place +// to store progress information if we ever wish to expose it. +pub(super) enum HeatmapLayersDownloadStatus { + InProgress, + Complete, +} + +pub(super) struct HeatmapLayersDownloader { + handle: tokio::task::JoinHandle<()>, + status: Arc>, + cancel: CancellationToken, + downloads_guard: Arc, +} + +impl HeatmapLayersDownloader { + fn new( + timeline: Arc, + concurrency: usize, + ) -> Result { + let tl_guard = timeline.gate.enter().map_err(|_| ApiError::Cancelled)?; + + let cancel = timeline.cancel.child_token(); + let downloads_guard = Arc::new(Gate::default()); + + let status = Arc::new(Mutex::new(HeatmapLayersDownloadStatus::InProgress)); + + let handle = tokio::task::spawn({ + let status = status.clone(); + let downloads_guard = downloads_guard.clone(); + let cancel = cancel.clone(); + + async move { + let _guard = tl_guard; + + scopeguard::defer! 
{ + *status.lock().unwrap() = HeatmapLayersDownloadStatus::Complete; + } + + let Some(heatmap) = timeline.generate_heatmap().await else { + tracing::info!("Heatmap layers download failed to generate heatmap"); + return; + }; + + tracing::info!( + resident_size=%timeline.resident_physical_size(), + heatmap_layers=%heatmap.layers.len(), + "Starting heatmap layers download" + ); + + let stream = futures::stream::iter(heatmap.layers.into_iter().filter_map( + |layer| { + let tl = timeline.clone(); + let dl_guard = match downloads_guard.enter() { + Ok(g) => g, + Err(_) => { + // [`Self::shutdown`] was called. Don't spawn any more downloads. + return None; + } + }; + + Some(async move { + let _dl_guard = dl_guard; + + let res = tl.download_layer(&layer.name).await; + if let Err(err) = res { + if !err.is_cancelled() { + tracing::warn!(layer=%layer.name,"Failed to download heatmap layer: {err}") + } + } + }) + } + )).buffered(concurrency); + + tokio::select! { + _ = stream.collect::<()>() => { + tracing::info!( + resident_size=%timeline.resident_physical_size(), + "Heatmap layers download completed" + ); + }, + _ = cancel.cancelled() => { + tracing::info!("Heatmap layers download cancelled"); + } + } + } + }); + + Ok(Self { + status, + handle, + cancel, + downloads_guard, + }) + } + + fn is_complete(&self) -> bool { + matches!( + *self.status.lock().unwrap(), + HeatmapLayersDownloadStatus::Complete + ) + } + + /// Drive any in-progress downloads to completion and stop spawning any new ones. + /// + /// This has two callers and they behave differently + /// 1. [`Timeline::shutdown`]: the drain will be immediate since downloads themselves + /// are sensitive to timeline cancellation. + /// + /// 2. Endpoint handler in [`crate::http::routes`]: the drain will wait for any in-progress + /// downloads to complete. + async fn stop_and_drain(self) { + // Counterintuitive: close the guard before cancelling. 
+ // Something needs to poll the already created download futures to completion. + // If we cancel first, then the underlying task exits and we lose + // the poller. + self.downloads_guard.close().await; + self.cancel.cancel(); + if let Err(err) = self.handle.await { + tracing::warn!("Failed to join heatmap layer downloader task: {err}"); + } + } +} + +impl Timeline { + pub(crate) async fn start_heatmap_layers_download( + self: &Arc, + concurrency: usize, + ) -> Result<(), ApiError> { + let mut locked = self.heatmap_layers_downloader.lock().unwrap(); + if locked.as_ref().map(|dl| dl.is_complete()).unwrap_or(true) { + let dl = HeatmapLayersDownloader::new(self.clone(), concurrency)?; + *locked = Some(dl); + Ok(()) + } else { + Err(ApiError::Conflict("Already running".to_string())) + } + } + + pub(crate) async fn stop_and_drain_heatmap_layers_download(&self) { + // This can race with the start of a new downloader and lead to a situation + // where one downloader is shutting down and another one is in-flight. + // The only impact is that we'd end up using more remote storage semaphore + // units than expected. 
+ let downloader = self.heatmap_layers_downloader.lock().unwrap().take(); + if let Some(dl) = downloader { + dl.stop_and_drain().await; + } + } +} diff --git a/pageserver/src/tenant/timeline/offload.rs b/pageserver/src/tenant/timeline/offload.rs index 3b5bf8290c..93e5a1100d 100644 --- a/pageserver/src/tenant/timeline/offload.rs +++ b/pageserver/src/tenant/timeline/offload.rs @@ -7,7 +7,9 @@ use super::Timeline; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::remote_timeline_client::ShutdownIfArchivedError; use crate::tenant::timeline::delete::{make_timeline_delete_guard, TimelineDeleteGuardKind}; -use crate::tenant::{OffloadedTimeline, Tenant, TenantManifestError, TimelineOrOffloaded}; +use crate::tenant::{ + DeleteTimelineError, OffloadedTimeline, Tenant, TenantManifestError, TimelineOrOffloaded, +}; #[derive(thiserror::Error, Debug)] pub(crate) enum OffloadError { @@ -37,12 +39,25 @@ pub(crate) async fn offload_timeline( debug_assert_current_span_has_tenant_and_timeline_id(); tracing::info!("offloading archived timeline"); - let (timeline, guard) = make_timeline_delete_guard( + let delete_guard_res = make_timeline_delete_guard( tenant, timeline.timeline_id, TimelineDeleteGuardKind::Offload, - ) - .map_err(|e| OffloadError::Other(anyhow::anyhow!(e)))?; + ); + if let Err(DeleteTimelineError::HasChildren(children)) = delete_guard_res { + let is_archived = timeline.is_archived(); + if is_archived == Some(true) { + tracing::error!("timeline is archived but has non-archived children: {children:?}"); + return Err(OffloadError::NotArchived); + } + tracing::info!( + ?is_archived, + "timeline is not archived and has unarchived children" + ); + return Err(OffloadError::NotArchived); + }; + let (timeline, guard) = + delete_guard_res.map_err(|e| OffloadError::Other(anyhow::anyhow!(e)))?; let TimelineOrOffloaded::Timeline(timeline) = timeline else { tracing::error!("timeline already offloaded, but given timeline object"); diff --git 
a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 9d539198c7..c966ad813f 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -496,7 +496,8 @@ pub(crate) fn is_fatal_io_error(e: &std::io::Error) -> bool { /// bad storage or bad configuration, and we can't fix that from inside /// a running process. pub(crate) fn on_fatal_io_error(e: &std::io::Error, context: &str) -> ! { - tracing::error!("Fatal I/O error: {e}: {context})"); + let backtrace = std::backtrace::Backtrace::force_capture(); + tracing::error!("Fatal I/O error: {e}: {context})\n{backtrace}"); std::process::abort(); } @@ -947,13 +948,18 @@ impl VirtualFileInner { where Buf: tokio_epoll_uring::IoBufMut + Send, { - let file_guard = match self.lock_file().await { + let file_guard = match self + .lock_file() + .await + .maybe_fatal_err("lock_file inside VirtualFileInner::read_at") + { Ok(file_guard) => file_guard, Err(e) => return (buf, Err(e)), }; observe_duration!(StorageIoOperation::Read, { let ((_file_guard, buf), res) = io_engine::get().read_at(file_guard, offset, buf).await; + let res = res.maybe_fatal_err("io_engine read_at inside VirtualFileInner::read_at"); if let Ok(size) = res { STORAGE_IO_SIZE .with_label_values(&[ diff --git a/pgxn/neon/extension_server.c b/pgxn/neon/extension_server.c index e38af08f89..0331f961b4 100644 --- a/pgxn/neon/extension_server.c +++ b/pgxn/neon/extension_server.c @@ -14,10 +14,12 @@ #include "utils/guc.h" -#include "extension_server.h" +#include "extension_server.h" #include "neon_utils.h" static int extension_server_port = 0; +static int extension_server_request_timeout = 60; +static int extension_server_connect_timeout = 60; static download_extension_file_hook_type prev_download_extension_file_hook = NULL; @@ -34,19 +36,18 @@ static download_extension_file_hook_type prev_download_extension_file_hook = NUL static bool neon_download_extension_file_http(const char *filename, bool is_library) { - static CURL *handle = 
NULL; - CURLcode res; - char *compute_ctl_url; bool ret = false; + CURL *handle = NULL; + char *compute_ctl_url; - if (handle == NULL) - { - handle = alloc_curl_handle(); + handle = alloc_curl_handle(); - curl_easy_setopt(handle, CURLOPT_CUSTOMREQUEST, "POST"); - curl_easy_setopt(handle, CURLOPT_TIMEOUT, 3L /* seconds */ ); - } + curl_easy_setopt(handle, CURLOPT_CUSTOMREQUEST, "POST"); + if (extension_server_request_timeout > 0) + curl_easy_setopt(handle, CURLOPT_TIMEOUT, (long)extension_server_request_timeout /* seconds */ ); + if (extension_server_connect_timeout > 0) + curl_easy_setopt(handle, CURLOPT_CONNECTTIMEOUT, (long)extension_server_connect_timeout /* seconds */ ); compute_ctl_url = psprintf("http://localhost:%d/extension_server/%s%s", extension_server_port, filename, is_library ? "?is_library=true" : ""); @@ -57,6 +58,8 @@ neon_download_extension_file_http(const char *filename, bool is_library) /* Perform the request, res will get the return code */ res = curl_easy_perform(handle); + curl_easy_cleanup(handle); + /* Check for errors */ if (res == CURLE_OK) { @@ -88,6 +91,24 @@ pg_init_extension_server() 0, /* no flags required */ NULL, NULL, NULL); + DefineCustomIntVariable("neon.extension_server_request_timeout", + "timeout for fetching extensions in seconds", + NULL, + &extension_server_request_timeout, + 60, 0, INT_MAX, + PGC_SUSET, + GUC_UNIT_S, + NULL, NULL, NULL); + + DefineCustomIntVariable("neon.extension_server_connect_timeout", + "timeout for connecting to the extension server in seconds", + NULL, + &extension_server_connect_timeout, + 60, 0, INT_MAX, + PGC_SUSET, + GUC_UNIT_S, + NULL, NULL, NULL); + /* set download_extension_file_hook */ prev_download_extension_file_hook = download_extension_file_hook; download_extension_file_hook = neon_download_extension_file_http; diff --git a/pgxn/neon/hll.c b/pgxn/neon/hll.c index 1f53c8fd36..bbaad09f5f 100644 --- a/pgxn/neon/hll.c +++ b/pgxn/neon/hll.c @@ -122,8 +122,8 @@ addSHLL(HyperLogLogState *cState, 
uint32 hash) index = hash >> HLL_C_BITS; /* Compute the rank of the remaining 32 - "k" (registerWidth) bits */ - count = rho(hash << HLL_BIT_WIDTH, HLL_C_BITS); - + count = rho(hash << HLL_BIT_WIDTH, HLL_C_BITS) - 1; + Assert(count <= HLL_C_BITS); cState->regs[index][count] = now; } @@ -136,7 +136,7 @@ getMaximum(const TimestampTz* reg, TimestampTz since) { if (reg[i] >= since) { - max = i; + max = i + 1; } } diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 22aeb2e2d6..fc1aecd340 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -378,8 +378,9 @@ pageserver_connect(shardno_t shard_no, int elevel) { case PS_Disconnected: { - const char *keywords[3]; - const char *values[3]; + const char *keywords[4]; + const char *values[4]; + char pid_str[16]; int n_pgsql_params; TimestampTz now; int64 us_since_last_attempt; @@ -424,14 +425,30 @@ pageserver_connect(shardno_t shard_no, int elevel) * can override the password from the env variable. Seems useful, although * we don't currently use that capability anywhere. */ - keywords[0] = "dbname"; - values[0] = connstr; - n_pgsql_params = 1; + n_pgsql_params = 0; + + /* + * Pageserver logs include this in the connection's tracing span. + * This allows for easier log correlation between compute and pageserver. 
+ */ + keywords[n_pgsql_params] = "application_name"; + { + int ret = snprintf(pid_str, sizeof(pid_str), "%d", MyProcPid); + if (ret < 0 || ret >= (int)(sizeof(pid_str))) + elog(FATAL, "stack-allocated buffer too small to hold pid"); + } + /* lifetime: PQconnectStartParams strdups internally */ + values[n_pgsql_params] = (const char*) pid_str; + n_pgsql_params++; + + keywords[n_pgsql_params] = "dbname"; + values[n_pgsql_params] = connstr; + n_pgsql_params++; if (neon_auth_token) { - keywords[1] = "password"; - values[1] = neon_auth_token; + keywords[n_pgsql_params] = "password"; + values[n_pgsql_params] = neon_auth_token; n_pgsql_params++; } diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 8051970176..f1087a8ccb 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -3765,7 +3765,7 @@ neon_dbsize(Oid dbNode) * neon_truncate() -- Truncate relation to specified number of blocks. */ static void -neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) +neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber old_blocks, BlockNumber nblocks) { XLogRecPtr lsn; @@ -3780,7 +3780,7 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) case RELPERSISTENCE_TEMP: case RELPERSISTENCE_UNLOGGED: - mdtruncate(reln, forknum, nblocks); + mdtruncate(reln, forknum, old_blocks, nblocks); return; default: @@ -3818,7 +3818,7 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) - mdtruncate(reln, forknum, nblocks); + mdtruncate(reln, forknum, old_blocks, nblocks); #endif } diff --git a/pgxn/neon_walredo/inmem_smgr.c b/pgxn/neon_walredo/inmem_smgr.c index a45e8f5c4a..74cd5ac601 100644 --- a/pgxn/neon_walredo/inmem_smgr.c +++ b/pgxn/neon_walredo/inmem_smgr.c @@ -96,7 +96,7 @@ static void inmem_writeback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks); static BlockNumber 
inmem_nblocks(SMgrRelation reln, ForkNumber forknum); static void inmem_truncate(SMgrRelation reln, ForkNumber forknum, - BlockNumber nblocks); + BlockNumber old_blocks, BlockNumber nblocks); static void inmem_immedsync(SMgrRelation reln, ForkNumber forknum); #if PG_MAJORVERSION_NUM >= 17 static void inmem_registersync(SMgrRelation reln, ForkNumber forknum); @@ -345,7 +345,7 @@ inmem_nblocks(SMgrRelation reln, ForkNumber forknum) * inmem_truncate() -- Truncate relation to specified number of blocks. */ static void -inmem_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) +inmem_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber old_blocks, BlockNumber nblocks) { } diff --git a/poetry.lock b/poetry.lock index fd200159b9..d66c3aae7a 100644 --- a/poetry.lock +++ b/poetry.lock @@ -412,6 +412,7 @@ files = [ [package.dependencies] botocore-stubs = "*" +mypy-boto3-kms = {version = ">=1.26.0,<1.27.0", optional = true, markers = "extra == \"kms\""} mypy-boto3-s3 = {version = ">=1.26.0,<1.27.0", optional = true, markers = "extra == \"s3\""} types-s3transfer = "*" typing-extensions = ">=4.1.0" @@ -2022,6 +2023,18 @@ install-types = ["pip"] mypyc = ["setuptools (>=50)"] reports = ["lxml"] +[[package]] +name = "mypy-boto3-kms" +version = "1.26.147" +description = "Type annotations for boto3.KMS 1.26.147 service generated with mypy-boto3-builder 7.14.5" +optional = false +python-versions = ">=3.7" +groups = ["main"] +files = [ + {file = "mypy-boto3-kms-1.26.147.tar.gz", hash = "sha256:816a4d1bb0585e1b9620a3f96c1d69a06f53b7b5621858579dd77c60dbb5fa5c"}, + {file = "mypy_boto3_kms-1.26.147-py3-none-any.whl", hash = "sha256:493f0db674a25c88769f5cb8ab8ac00d3dda5dfc903d5cda34c990ee64689f79"}, +] + [[package]] name = "mypy-boto3-s3" version = "1.26.0.post1" @@ -2758,18 +2771,18 @@ pytest = ">=5,<8" [[package]] name = "pytest-timeout" -version = "2.1.0" +version = "2.3.1" description = "pytest plugin to abort hanging tests" optional = false -python-versions = 
">=3.6" +python-versions = ">=3.7" groups = ["main"] files = [ - {file = "pytest-timeout-2.1.0.tar.gz", hash = "sha256:c07ca07404c612f8abbe22294b23c368e2e5104b521c1790195561f37e1ac3d9"}, - {file = "pytest_timeout-2.1.0-py3-none-any.whl", hash = "sha256:f6f50101443ce70ad325ceb4473c4255e9d74e3c7cd0ef827309dfa4c0d975c6"}, + {file = "pytest-timeout-2.3.1.tar.gz", hash = "sha256:12397729125c6ecbdaca01035b9e5239d4db97352320af155b3f5de1ba5165d9"}, + {file = "pytest_timeout-2.3.1-py3-none-any.whl", hash = "sha256:68188cb703edfc6a18fad98dc25a3c61e9f24d644b0b70f33af545219fc7813e"}, ] [package.dependencies] -pytest = ">=5.0.0" +pytest = ">=7.0.0" [[package]] name = "pytest-xdist" @@ -3807,4 +3820,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.1" python-versions = "^3.11" -content-hash = "4dc3165fe22c0e0f7a030ea0f8a680ae2ff74561d8658c393abbe9112caaf5d7" +content-hash = "00ddc42c32e235b6171845fc066dcab078282ed832cd464d5e8a0afa959dd04a" diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 3aa6ac3a76..6a381bf094 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -19,7 +19,6 @@ aws-config.workspace = true aws-sdk-iam.workspace = true aws-sigv4.workspace = true base64.workspace = true -boxcar = "0.2.8" bstr.workspace = true bytes = { workspace = true, features = ["serde"] } camino.workspace = true @@ -63,7 +62,6 @@ postgres_backend.workspace = true postgres-client = { package = "tokio-postgres2", path = "../libs/proxy/tokio-postgres2" } postgres-protocol = { package = "postgres-protocol2", path = "../libs/proxy/postgres-protocol2" } pq_proto.workspace = true -prometheus.workspace = true rand.workspace = true regex.workspace = true remote_storage = { version = "0.1", path = "../libs/remote_storage/" } @@ -81,7 +79,6 @@ sha2 = { workspace = true, features = ["asm", "oid"] } smol_str.workspace = true smallvec.workspace = true socket2.workspace = true -strum.workspace = true strum_macros.workspace = true subtle.workspace = true thiserror.workspace = true @@ -95,7 
+92,6 @@ tracing-subscriber.workspace = true tracing-utils.workspace = true tracing.workspace = true tracing-log.workspace = true -tracing-serde.workspace = true tracing-opentelemetry.workspace = true try-lock.workspace = true typed-json.workspace = true diff --git a/proxy/src/auth/backend/console_redirect.rs b/proxy/src/auth/backend/console_redirect.rs index 9be29c38c9..7503b4eac9 100644 --- a/proxy/src/auth/backend/console_redirect.rs +++ b/proxy/src/auth/backend/console_redirect.rs @@ -140,9 +140,8 @@ async fn authenticate( let (psql_session_id, waiter) = loop { let psql_session_id = new_psql_session_id(); - match control_plane::mgmt::get_waiter(&psql_session_id) { - Ok(waiter) => break (psql_session_id, waiter), - Err(_e) => continue, + if let Ok(waiter) = control_plane::mgmt::get_waiter(&psql_session_id) { + break (psql_session_id, waiter); } }; diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs index e05a693cee..5d032c0deb 100644 --- a/proxy/src/auth/backend/jwt.rs +++ b/proxy/src/auth/backend/jwt.rs @@ -220,11 +220,11 @@ async fn fetch_jwks( } impl JwkCacheEntryLock { - async fn acquire_permit<'a>(self: &'a Arc) -> JwkRenewalPermit<'a> { + async fn acquire_permit(self: &Arc) -> JwkRenewalPermit<'_> { JwkRenewalPermit::acquire_permit(self).await } - fn try_acquire_permit<'a>(self: &'a Arc) -> Option> { + fn try_acquire_permit(self: &Arc) -> Option> { JwkRenewalPermit::try_acquire_permit(self) } @@ -393,7 +393,7 @@ impl JwkCacheEntryLock { verify_rsa_signature(header_payload.as_bytes(), &sig, key, &header.algorithm)?; } key => return Err(JwtError::UnsupportedKeyType(key.into())), - }; + } tracing::debug!(?payload, "JWT signature valid with claims"); @@ -510,7 +510,7 @@ fn verify_rsa_signature( key.verify(data, &sig)?; } _ => return Err(JwtError::InvalidRsaSigningAlgorithm), - }; + } Ok(()) } diff --git a/proxy/src/binary/local_proxy.rs b/proxy/src/binary/local_proxy.rs index e0d8515375..4ab11f828c 100644 --- 
a/proxy/src/binary/local_proxy.rs +++ b/proxy/src/binary/local_proxy.rs @@ -4,6 +4,20 @@ use std::str::FromStr; use std::sync::Arc; use std::time::Duration; +use anyhow::{bail, ensure, Context}; +use camino::{Utf8Path, Utf8PathBuf}; +use clap::Parser; +use compute_api::spec::LocalProxySpec; +use futures::future::Either; +use thiserror::Error; +use tokio::net::TcpListener; +use tokio::sync::Notify; +use tokio::task::JoinSet; +use tokio_util::sync::CancellationToken; +use tracing::{debug, error, info, warn}; +use utils::sentry_init::init_sentry; +use utils::{pid_file, project_build_tag, project_git_version}; + use crate::auth::backend::jwt::JwkCache; use crate::auth::backend::local::{LocalBackend, JWKS_ROLE_MAP}; use crate::auth::{self}; @@ -25,24 +39,10 @@ use crate::serverless::{self, GlobalConnPoolOptions}; use crate::tls::client_config::compute_client_config_with_root_certs; use crate::types::RoleName; use crate::url::ApiUrl; -use anyhow::{bail, ensure, Context}; -use camino::{Utf8Path, Utf8PathBuf}; -use compute_api::spec::LocalProxySpec; -use futures::future::Either; project_git_version!(GIT_VERSION); project_build_tag!(BUILD_TAG); -use clap::Parser; -use thiserror::Error; -use tokio::net::TcpListener; -use tokio::sync::Notify; -use tokio::task::JoinSet; -use tokio_util::sync::CancellationToken; -use tracing::{debug, error, info, warn}; -use utils::sentry_init::init_sentry; -use utils::{pid_file, project_build_tag, project_git_version}; - /// Neon proxy/router #[derive(Parser)] #[command(version = GIT_VERSION, about)] diff --git a/proxy/src/binary/pg_sni_router.rs b/proxy/src/binary/pg_sni_router.rs index 235e9674c6..94e771a61c 100644 --- a/proxy/src/binary/pg_sni_router.rs +++ b/proxy/src/binary/pg_sni_router.rs @@ -5,12 +5,6 @@ /// the outside. Similar to an ingress controller for HTTPS. 
use std::{net::SocketAddr, sync::Arc}; -use crate::context::RequestContext; -use crate::metrics::{Metrics, ThreadPoolMetrics}; -use crate::protocol2::ConnectionInfo; -use crate::proxy::{copy_bidirectional_client_compute, run_until_cancelled, ErrorSource}; -use crate::stream::{PqStream, Stream}; -use crate::tls::TlsServerEndPoint; use anyhow::{anyhow, bail, ensure, Context}; use clap::Arg; use futures::future::Either; @@ -25,6 +19,13 @@ use tracing::{error, info, Instrument}; use utils::project_git_version; use utils::sentry_init::init_sentry; +use crate::context::RequestContext; +use crate::metrics::{Metrics, ThreadPoolMetrics}; +use crate::protocol2::ConnectionInfo; +use crate::proxy::{copy_bidirectional_client_compute, run_until_cancelled, ErrorSource}; +use crate::stream::{PqStream, Stream}; +use crate::tls::TlsServerEndPoint; + project_git_version!(GIT_VERSION); fn cli() -> clap::Command { diff --git a/proxy/src/binary/proxy.rs b/proxy/src/binary/proxy.rs index e38c49ca10..b72799df54 100644 --- a/proxy/src/binary/proxy.rs +++ b/proxy/src/binary/proxy.rs @@ -3,6 +3,16 @@ use std::pin::pin; use std::sync::Arc; use std::time::Duration; +use anyhow::bail; +use futures::future::Either; +use remote_storage::RemoteStorageConfig; +use tokio::net::TcpListener; +use tokio::task::JoinSet; +use tokio_util::sync::CancellationToken; +use tracing::{info, warn, Instrument}; +use utils::sentry_init::init_sentry; +use utils::{project_build_tag, project_git_version}; + use crate::auth::backend::jwt::JwkCache; use crate::auth::backend::{AuthRateLimiter, ConsoleRedirectBackend, MaybeOwned}; use crate::cancellation::{handle_cancel_messages, CancellationHandler}; @@ -24,15 +34,6 @@ use crate::serverless::cancel_set::CancelSet; use crate::serverless::GlobalConnPoolOptions; use crate::tls::client_config::compute_client_config_with_root_certs; use crate::{auth, control_plane, http, serverless, usage_metrics}; -use anyhow::bail; -use futures::future::Either; -use 
remote_storage::RemoteStorageConfig; -use tokio::net::TcpListener; -use tokio::task::JoinSet; -use tokio_util::sync::CancellationToken; -use tracing::{info, warn, Instrument}; -use utils::sentry_init::init_sentry; -use utils::{project_build_tag, project_git_version}; project_git_version!(GIT_VERSION); project_build_tag!(BUILD_TAG); @@ -303,7 +304,7 @@ pub async fn run() -> anyhow::Result<()> { match auth_backend { Either::Left(auth_backend) => info!("Authentication backend: {auth_backend}"), Either::Right(auth_backend) => info!("Authentication backend: {auth_backend:?}"), - }; + } info!("Using region: {}", args.aws_region); // TODO: untangle the config args @@ -803,9 +804,10 @@ fn build_auth_backend( mod tests { use std::time::Duration; - use crate::rate_limiter::RateBucketInfo; use clap::Parser; + use crate::rate_limiter::RateBucketInfo; + #[test] fn parse_endpoint_rps_limit() { let config = super::ProxyCliArgs::parse_from([ diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs index b5c42cd23d..8ec1a4648b 100644 --- a/proxy/src/cache/endpoints.rs +++ b/proxy/src/cache/endpoints.rs @@ -242,7 +242,7 @@ impl EndpointsCache { }); tracing::error!("error parsing value {value:?}: {err:?}"); } - }; + } } if total.is_power_of_two() { tracing::debug!("endpoints read {}", total); diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index e84f1676e2..1f9c8a48b7 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -501,7 +501,7 @@ impl Session { _guard: Metrics::get() .proxy .cancel_channel_size - .guard(RedisMsgKind::HSet), + .guard(RedisMsgKind::HDel), }; let _ = tx.send_timeout(op, REDIS_SEND_TIMEOUT).await.map_err(|e| { diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index d71465765f..5447a4a4c0 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -137,8 +137,8 @@ impl ConnCfg { match k { // Only set `user` if it's not present in the config. 
// Console redirect auth flow takes username from the console's response. - "user" if self.user_is_set() => continue, - "database" if self.db_is_set() => continue, + "user" if self.user_is_set() => {} + "database" if self.db_is_set() => {} "options" => { if let Some(options) = filtered_options(v) { self.set_param(k, &options); diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs index c4548a7ddd..1044f5f8e2 100644 --- a/proxy/src/console_redirect_proxy.rs +++ b/proxy/src/console_redirect_proxy.rs @@ -82,7 +82,7 @@ pub async fn task_main( error!("per-client task finished with an error: failed to set socket option: {e:#}"); return; } - }; + } let ctx = RequestContext::new( session_id, diff --git a/proxy/src/control_plane/mod.rs b/proxy/src/control_plane/mod.rs index f92e4f3f60..89ec4f9b33 100644 --- a/proxy/src/control_plane/mod.rs +++ b/proxy/src/control_plane/mod.rs @@ -19,8 +19,7 @@ use crate::cache::{Cached, TimedLru}; use crate::config::ComputeConfig; use crate::context::RequestContext; use crate::control_plane::messages::{ControlPlaneErrorMessage, MetricsAuxInfo}; -use crate::intern::AccountIdInt; -use crate::intern::ProjectIdInt; +use crate::intern::{AccountIdInt, ProjectIdInt}; use crate::types::{EndpointCacheKey, EndpointId}; use crate::{compute, scram}; diff --git a/proxy/src/logging.rs b/proxy/src/logging.rs index 97c9f5a59c..fbd4811b54 100644 --- a/proxy/src/logging.rs +++ b/proxy/src/logging.rs @@ -7,9 +7,8 @@ use chrono::{DateTime, Utc}; use opentelemetry::trace::TraceContextExt; use scopeguard::defer; use serde::ser::{SerializeMap, Serializer}; -use tracing::span; use tracing::subscriber::Interest; -use tracing::{callsite, Event, Metadata, Span, Subscriber}; +use tracing::{callsite, span, Event, Metadata, Span, Subscriber}; use tracing_opentelemetry::OpenTelemetrySpanExt; use tracing_subscriber::filter::{EnvFilter, LevelFilter}; use tracing_subscriber::fmt::format::{Format, Full}; diff --git a/proxy/src/protocol2.rs 
b/proxy/src/protocol2.rs index 0dc97b7097..74a15d9bf4 100644 --- a/proxy/src/protocol2.rs +++ b/proxy/src/protocol2.rs @@ -119,7 +119,7 @@ pub(crate) async fn read_proxy_protocol( // if no more bytes available then exit if bytes_read == 0 { return Ok((ChainRW { inner: read, buf }, ConnectHeader::Missing)); - }; + } // check if we have enough bytes to continue if let Some(header) = buf.try_get::() { @@ -169,7 +169,7 @@ fn process_proxy_payload( header.version_and_command ), )), - }; + } let size_err = "invalid proxy protocol length. payload not large enough to fit requested IP addresses"; diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index dd145e6bb2..26fb1754bf 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -198,7 +198,7 @@ where warn!(error = ?e, num_retries, retriable = true, COULD_NOT_CONNECT); } - }; + } let wait_duration = retry_after(num_retries, compute.retry); num_retries += 1; diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index 8a407c8119..2a406fcb34 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -118,7 +118,7 @@ pub async fn task_main( error!("per-client task finished with an error: failed to set socket option: {e:#}"); return; } - }; + } let ctx = RequestContext::new( session_id, diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index 1a7024588a..5f9f2509e2 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -169,7 +169,7 @@ impl MessageHandler { }); tracing::error!("broken message: {e}"); } - }; + } return Ok(()); } Ok(msg) => msg, @@ -180,7 +180,7 @@ impl MessageHandler { match serde_json::from_str::(&payload) { Ok(header) => tracing::error!(topic = header.topic, "broken message: {e}"), Err(_) => tracing::error!("broken message: {e}"), - }; + } return Ok(()); } }; diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 
edc2935618..f35c375ba2 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -372,7 +372,7 @@ impl PoolingBackend { debug!("setting up backend session state"); // initiates the auth session - if let Err(e) = client.execute("select auth.init()", &[]).await { + if let Err(e) = client.batch_execute("select auth.init();").await { discard.discard(); return Err(e.into()); } @@ -651,7 +651,7 @@ async fn connect_http2( e, ))); } - }; + } }; let (client, connection) = hyper::client::conn::http2::Builder::new(TokioExecutor::new()) diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs index fe33f0ff65..137a2d6377 100644 --- a/proxy/src/serverless/local_conn_pool.rs +++ b/proxy/src/serverless/local_conn_pool.rs @@ -23,7 +23,6 @@ use indexmap::IndexMap; use jose_jwk::jose_b64::base64ct::{Base64UrlUnpadded, Encoding}; use parking_lot::RwLock; use postgres_client::tls::NoTlsStream; -use postgres_client::types::ToSql; use postgres_client::AsyncMessage; use serde_json::value::RawValue; use tokio::net::TcpStream; @@ -280,14 +279,13 @@ impl ClientInnerCommon { local_data.jti += 1; let token = resign_jwt(&local_data.key, payload, local_data.jti)?; - // initiates the auth session + // discard all cannot run in a transaction. must be executed alone. self.inner.batch_execute("discard all").await?; - self.inner - .execute( - "select auth.jwt_session_init($1)", - &[&&*token as &(dyn ToSql + Sync)], - ) - .await?; + + // initiates the auth session + // this is safe from query injections as the jwt format free of any escape characters. 
+ let query = format!("select auth.jwt_session_init('{token}')"); + self.inner.batch_execute(&query).await?; let pid = self.inner.get_process_id(); info!(pid, jti = local_data.jti, "user session state init"); diff --git a/pyproject.toml b/pyproject.toml index e299c421e9..92a660c233 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,12 +17,12 @@ Jinja2 = "^3.1.5" types-requests = "^2.31.0.0" types-psycopg2 = "^2.9.21.20241019" boto3 = "^1.34.11" -boto3-stubs = {extras = ["s3"], version = "^1.26.16"} +boto3-stubs = {extras = ["s3", "kms"], version = "^1.26.16"} moto = {extras = ["server"], version = "^5.0.6"} backoff = "^2.2.1" pytest-lazy-fixture = "^0.6.3" prometheus-client = "^0.14.1" -pytest-timeout = "^2.1.0" +pytest-timeout = "^2.3.1" Werkzeug = "^3.0.6" pytest-order = "^1.1.0" allure-pytest = "^2.13.2" diff --git a/pytest.ini b/pytest.ini index 7197b078c6..237066b1f6 100644 --- a/pytest.ini +++ b/pytest.ini @@ -11,7 +11,7 @@ markers = testpaths = test_runner minversion = 6.0 -log_format = %(asctime)s.%(msecs)-3d %(levelname)s [%(filename)s:%(lineno)d] %(message)s +log_format = %(asctime)s.%(msecs)03d %(levelname)s [%(filename)s:%(lineno)d] %(message)s log_date_format = %Y-%m-%d %H:%M:%S log_cli = true timeout = 300 diff --git a/safekeeper/client/src/mgmt_api.rs b/safekeeper/client/src/mgmt_api.rs index df049f3eba..5c305769dd 100644 --- a/safekeeper/client/src/mgmt_api.rs +++ b/safekeeper/client/src/mgmt_api.rs @@ -5,7 +5,10 @@ use http_utils::error::HttpErrorBody; use reqwest::{IntoUrl, Method, StatusCode}; -use safekeeper_api::models::{TimelineCreateRequest, TimelineStatus}; +use safekeeper_api::models::{ + PullTimelineRequest, PullTimelineResponse, SafekeeperUtilization, TimelineCreateRequest, + TimelineStatus, +}; use std::error::Error as _; use utils::{ id::{NodeId, TenantId, TimelineId}, @@ -32,6 +35,9 @@ pub enum Error { /// Status is not ok; parsed error in body as `HttpErrorBody`. 
#[error("safekeeper API: {1}")] ApiError(StatusCode, String), + + #[error("Cancelled")] + Cancelled, } pub type Result = std::result::Result; @@ -85,6 +91,12 @@ impl Client { resp.json().await.map_err(Error::ReceiveBody) } + pub async fn pull_timeline(&self, req: &PullTimelineRequest) -> Result { + let uri = format!("{}/v1/pull_timeline", self.mgmt_api_endpoint); + let resp = self.post(&uri, req).await?; + resp.json().await.map_err(Error::ReceiveBody) + } + pub async fn delete_timeline( &self, tenant_id: TenantId, @@ -124,9 +136,10 @@ impl Client { self.get(&uri).await } - pub async fn utilization(&self) -> Result { - let uri = format!("{}/v1/utilization/", self.mgmt_api_endpoint); - self.get(&uri).await + pub async fn utilization(&self) -> Result { + let uri = format!("{}/v1/utilization", self.mgmt_api_endpoint); + let resp = self.get(&uri).await?; + resp.json().await.map_err(Error::ReceiveBody) } async fn post( diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs index e92ca881e1..35aebfd8ad 100644 --- a/safekeeper/src/control_file.rs +++ b/safekeeper/src/control_file.rs @@ -235,7 +235,7 @@ impl Storage for FileStorage { #[cfg(test)] mod test { use super::*; - use safekeeper_api::membership::{Configuration, MemberSet}; + use safekeeper_api::membership::{Configuration, MemberSet, SafekeeperGeneration}; use tokio::fs; use utils::lsn::Lsn; @@ -246,7 +246,7 @@ mod test { let tempdir = camino_tempfile::tempdir()?; let mut state = TimelinePersistentState::empty(); state.mconf = Configuration { - generation: 42, + generation: SafekeeperGeneration::new(42), members: MemberSet::empty(), new_members: None, }; diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index a64bf1ddd8..cd2ac5f44c 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -2,6 +2,7 @@ use http_utils::failpoints::failpoints_handler; use hyper::{Body, Request, Response, StatusCode}; use safekeeper_api::models; use 
safekeeper_api::models::AcceptorStateStatus; +use safekeeper_api::models::PullTimelineRequest; use safekeeper_api::models::SafekeeperStatus; use safekeeper_api::models::TermSwitchApiEntry; use safekeeper_api::models::TimelineStatus; @@ -230,7 +231,7 @@ async fn timeline_delete_handler(mut request: Request) -> Result) -> Result, ApiError> { check_permission(&request, None)?; - let data: pull_timeline::Request = json_request(&mut request).await?; + let data: PullTimelineRequest = json_request(&mut request).await?; let conf = get_conf(&request); let global_timelines = get_global_timelines(&request); @@ -626,7 +627,7 @@ pub fn make_router( failpoints_handler(r, cancel).await }) }) - .get("/v1/uzilization", |r| request_span(r, utilization_handler)) + .get("/v1/utilization", |r| request_span(r, utilization_handler)) .delete("/v1/tenant/:tenant_id", |r| { request_span(r, tenant_delete_handler) }) diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs index f2d8e4c85f..4827b73074 100644 --- a/safekeeper/src/pull_timeline.rs +++ b/safekeeper/src/pull_timeline.rs @@ -4,10 +4,13 @@ use camino::Utf8PathBuf; use chrono::{DateTime, Utc}; use futures::{SinkExt, StreamExt, TryStreamExt}; use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI}; -use safekeeper_api::{models::TimelineStatus, Term}; +use safekeeper_api::{ + models::{PullTimelineRequest, PullTimelineResponse, TimelineStatus}, + Term, +}; use safekeeper_client::mgmt_api; use safekeeper_client::mgmt_api::Client; -use serde::{Deserialize, Serialize}; +use serde::Deserialize; use std::{ cmp::min, io::{self, ErrorKind}, @@ -33,7 +36,7 @@ use crate::{ }; use utils::{ crashsafe::fsync_async_opt, - id::{NodeId, TenantId, TenantTimelineId, TimelineId}, + id::{NodeId, TenantTimelineId}, logging::SecretString, lsn::Lsn, pausable_failpoint, @@ -378,21 +381,6 @@ impl WalResidentTimeline { } } -/// pull_timeline request body. 
-#[derive(Debug, Deserialize)] -pub struct Request { - pub tenant_id: TenantId, - pub timeline_id: TimelineId, - pub http_hosts: Vec, -} - -#[derive(Debug, Serialize)] -pub struct Response { - // Donor safekeeper host - pub safekeeper_host: String, - // TODO: add more fields? -} - /// Response for debug dump request. #[derive(Debug, Deserialize)] pub struct DebugDumpResponse { @@ -405,10 +393,10 @@ pub struct DebugDumpResponse { /// Find the most advanced safekeeper and pull timeline from it. pub async fn handle_request( - request: Request, + request: PullTimelineRequest, sk_auth_token: Option, global_timelines: Arc, -) -> Result { +) -> Result { let existing_tli = global_timelines.get(TenantTimelineId::new( request.tenant_id, request.timeline_id, @@ -460,7 +448,7 @@ async fn pull_timeline( host: String, sk_auth_token: Option, global_timelines: Arc, -) -> Result { +) -> Result { let ttid = TenantTimelineId::new(status.tenant_id, status.timeline_id); info!( "pulling timeline {} from safekeeper {}, commit_lsn={}, flush_lsn={}, term={}, epoch={}", @@ -535,7 +523,7 @@ async fn pull_timeline( .load_temp_timeline(ttid, &tli_dir_path, false) .await?; - Ok(Response { + Ok(PullTimelineResponse { safekeeper_host: host, }) } diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 45e19c31b6..f816f8459a 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -1004,7 +1004,7 @@ mod tests { use postgres_ffi::{XLogSegNo, WAL_SEGMENT_SIZE}; use safekeeper_api::{ - membership::{Configuration, MemberSet, SafekeeperId}, + membership::{Configuration, MemberSet, SafekeeperGeneration, SafekeeperId}, ServerInfo, }; @@ -1303,7 +1303,7 @@ mod tests { tenant_id, timeline_id, mconf: Configuration { - generation: 42, + generation: SafekeeperGeneration::new(42), members: MemberSet::new(vec![SafekeeperId { id: NodeId(1), host: "hehe.org".to_owned(), diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 
8517fa0344..2f6b91cf47 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -310,9 +310,12 @@ impl WalBackupTask { retry_attempt = 0; } Err(e) => { + // We might have managed to upload some segment even though + // some later in the range failed, so log backup_lsn + // separately. error!( - "failed while offloading range {}-{}: {:?}", - backup_lsn, commit_lsn, e + "failed while offloading range {}-{}, backup_lsn {}: {:?}", + backup_lsn, commit_lsn, backup_lsn, e ); retry_attempt = retry_attempt.saturating_add(1); @@ -338,6 +341,13 @@ async fn backup_lsn_range( let start_lsn = *backup_lsn; let segments = get_segments(start_lsn, end_lsn, wal_seg_size); + info!( + "offloading segnos {:?} of range [{}-{})", + segments.iter().map(|&s| s.seg_no).collect::>(), + start_lsn, + end_lsn, + ); + // Pool of concurrent upload tasks. We use `FuturesOrdered` to // preserve order of uploads, and update `backup_lsn` only after // all previous uploads are finished. @@ -374,10 +384,10 @@ async fn backup_lsn_range( } info!( - "offloaded segnos {:?} up to {}, previous backup_lsn {}", + "offloaded segnos {:?} of range [{}-{})", segments.iter().map(|&s| s.seg_no).collect::>(), - end_lsn, start_lsn, + end_lsn, ); Ok(()) } diff --git a/safekeeper/src/wal_service.rs b/safekeeper/src/wal_service.rs index 1ebcb060e7..e5ccbb3230 100644 --- a/safekeeper/src/wal_service.rs +++ b/safekeeper/src/wal_service.rs @@ -13,6 +13,8 @@ use tokio_util::sync::CancellationToken; use tracing::*; use utils::{auth::Scope, measured_stream::MeasuredStream}; +use std::os::fd::AsRawFd; + use crate::metrics::TrafficMetrics; use crate::SafeKeeperConf; use crate::{handler::SafekeeperPostgresHandler, GlobalTimelines}; @@ -62,6 +64,7 @@ async fn handle_socket( global_timelines: Arc, ) -> Result<(), QueryError> { socket.set_nodelay(true)?; + let socket_fd = socket.as_raw_fd(); let peer_addr = socket.peer_addr()?; // Set timeout on reading from the socket. 
It prevents hanged up connection @@ -107,7 +110,7 @@ async fn handle_socket( auth_pair, global_timelines, ); - let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?; + let pgbackend = PostgresBackend::new_from_io(socket_fd, socket, peer_addr, auth_type, None)?; // libpq protocol between safekeeper and walproposer / pageserver // We don't use shutdown. pgbackend diff --git a/scripts/generate_image_maps.py b/scripts/generate_image_maps.py index a2f553d290..915eb33673 100644 --- a/scripts/generate_image_maps.py +++ b/scripts/generate_image_maps.py @@ -6,6 +6,9 @@ build_tag = os.environ["BUILD_TAG"] branch = os.environ["BRANCH"] dev_acr = os.environ["DEV_ACR"] prod_acr = os.environ["PROD_ACR"] +dev_aws = os.environ["DEV_AWS"] +prod_aws = os.environ["PROD_AWS"] +aws_region = os.environ["AWS_REGION"] components = { "neon": ["neon"], @@ -24,11 +27,11 @@ components = { registries = { "dev": [ "docker.io/neondatabase", - "369495373322.dkr.ecr.eu-central-1.amazonaws.com", + f"{dev_aws}.dkr.ecr.{aws_region}.amazonaws.com", f"{dev_acr}.azurecr.io/neondatabase", ], "prod": [ - "093970136003.dkr.ecr.eu-central-1.amazonaws.com", + f"{prod_aws}.dkr.ecr.{aws_region}.amazonaws.com", f"{prod_acr}.azurecr.io/neondatabase", ], } diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml index 91d8098cb9..73dc1a5c10 100644 --- a/storage_controller/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -32,6 +32,10 @@ postgres_connection.workspace = true rand.workspace = true reqwest = { workspace = true, features = ["stream"] } routerify.workspace = true +safekeeper_api.workspace = true +safekeeper_client.workspace = true +tikv-jemallocator.workspace = true +regex.workspace = true rustls-native-certs.workspace = true serde.workspace = true serde_json.workspace = true diff --git a/storage_controller/src/heartbeater.rs b/storage_controller/src/heartbeater.rs index b7e66d33eb..52b6110667 100644 --- a/storage_controller/src/heartbeater.rs +++ 
b/storage_controller/src/heartbeater.rs @@ -1,23 +1,30 @@ use futures::{stream::FuturesUnordered, StreamExt}; +use safekeeper_api::models::SafekeeperUtilization; +use safekeeper_client::mgmt_api; use std::{ collections::HashMap, + fmt::Debug, + future::Future, sync::Arc, time::{Duration, Instant}, }; use tokio_util::sync::CancellationToken; -use pageserver_api::{controller_api::NodeAvailability, models::PageserverUtilization}; +use pageserver_api::{ + controller_api::{NodeAvailability, SkSchedulingPolicy}, + models::PageserverUtilization, +}; use thiserror::Error; -use utils::id::NodeId; +use utils::{id::NodeId, logging::SecretString}; -use crate::node::Node; +use crate::{node::Node, safekeeper::Safekeeper}; -struct HeartbeaterTask { - receiver: tokio::sync::mpsc::UnboundedReceiver, +struct HeartbeaterTask { + receiver: tokio::sync::mpsc::UnboundedReceiver>, cancel: CancellationToken, - state: HashMap, + state: HashMap, max_offline_interval: Duration, max_warming_up_interval: Duration, @@ -36,8 +43,17 @@ pub(crate) enum PageserverState { Offline, } +#[derive(Debug, Clone)] +pub(crate) enum SafekeeperState { + Available { + last_seen_at: Instant, + utilization: SafekeeperUtilization, + }, + Offline, +} + #[derive(Debug)] -pub(crate) struct AvailablityDeltas(pub Vec<(NodeId, PageserverState)>); +pub(crate) struct AvailablityDeltas(pub Vec<(NodeId, State)>); #[derive(Debug, Error)] pub(crate) enum HeartbeaterError { @@ -45,23 +61,28 @@ pub(crate) enum HeartbeaterError { Cancel, } -struct HeartbeatRequest { - pageservers: Arc>, - reply: tokio::sync::oneshot::Sender>, +struct HeartbeatRequest { + servers: Arc>, + reply: tokio::sync::oneshot::Sender, HeartbeaterError>>, } -pub(crate) struct Heartbeater { - sender: tokio::sync::mpsc::UnboundedSender, +pub(crate) struct Heartbeater { + sender: tokio::sync::mpsc::UnboundedSender>, } -impl Heartbeater { +#[allow(private_bounds)] +impl Heartbeater +where + HeartbeaterTask: HeartBeat, +{ pub(crate) fn new( jwt_token: Option, 
max_offline_interval: Duration, max_warming_up_interval: Duration, cancel: CancellationToken, ) -> Self { - let (sender, receiver) = tokio::sync::mpsc::unbounded_channel::(); + let (sender, receiver) = + tokio::sync::mpsc::unbounded_channel::>(); let mut heartbeater = HeartbeaterTask::new( receiver, jwt_token, @@ -76,12 +97,12 @@ impl Heartbeater { pub(crate) async fn heartbeat( &self, - pageservers: Arc>, - ) -> Result { + servers: Arc>, + ) -> Result, HeartbeaterError> { let (sender, receiver) = tokio::sync::oneshot::channel(); self.sender .send(HeartbeatRequest { - pageservers, + servers, reply: sender, }) .map_err(|_| HeartbeaterError::Cancel)?; @@ -93,9 +114,12 @@ impl Heartbeater { } } -impl HeartbeaterTask { +impl HeartbeaterTask +where + HeartbeaterTask: HeartBeat, +{ fn new( - receiver: tokio::sync::mpsc::UnboundedReceiver, + receiver: tokio::sync::mpsc::UnboundedReceiver>, jwt_token: Option, max_offline_interval: Duration, max_warming_up_interval: Duration, @@ -110,15 +134,19 @@ impl HeartbeaterTask { jwt_token, } } - async fn run(&mut self) { loop { tokio::select! 
{ request = self.receiver.recv() => { match request { Some(req) => { - let res = self.heartbeat(req.pageservers).await; - req.reply.send(res).unwrap(); + if req.reply.is_closed() { + // Prevent a possibly infinite buildup of the receiver channel, if requests arrive faster than we can handle them + continue; + } + let res = self.heartbeat(req.servers).await; + // Ignore the return value in order to not panic if the heartbeat function's future was cancelled + _ = req.reply.send(res); }, None => { return; } } @@ -127,11 +155,20 @@ impl HeartbeaterTask { } } } +} +pub(crate) trait HeartBeat { + fn heartbeat( + &mut self, + pageservers: Arc>, + ) -> impl Future, HeartbeaterError>> + Send; +} + +impl HeartBeat for HeartbeaterTask { async fn heartbeat( &mut self, pageservers: Arc>, - ) -> Result { + ) -> Result, HeartbeaterError> { let mut new_state = HashMap::new(); let mut heartbeat_futs = FuturesUnordered::new(); @@ -272,3 +309,130 @@ impl HeartbeaterTask { Ok(AvailablityDeltas(deltas)) } } + +impl HeartBeat for HeartbeaterTask { + async fn heartbeat( + &mut self, + safekeepers: Arc>, + ) -> Result, HeartbeaterError> { + let mut new_state = HashMap::new(); + + let mut heartbeat_futs = FuturesUnordered::new(); + for (node_id, sk) in &*safekeepers { + if sk.scheduling_policy() == SkSchedulingPolicy::Decomissioned { + continue; + } + heartbeat_futs.push({ + let jwt_token = self + .jwt_token + .as_ref() + .map(|t| SecretString::from(t.to_owned())); + let cancel = self.cancel.clone(); + + async move { + let response = sk + .with_client_retries( + |client| async move { client.get_utilization().await }, + &jwt_token, + 3, + 3, + Duration::from_secs(1), + &cancel, + ) + .await; + + let status = match response { + Ok(utilization) => SafekeeperState::Available { + last_seen_at: Instant::now(), + utilization, + }, + Err(mgmt_api::Error::Cancelled) => { + // This indicates cancellation of the request. + // We ignore the node in this case. 
+ return None; + } + Err(e) => { + tracing::info!( + "Marking safekeeper {} at as offline: {e}", + sk.base_url() + ); + SafekeeperState::Offline + } + }; + + Some((*node_id, status)) + } + }); + + loop { + let maybe_status = tokio::select! { + next = heartbeat_futs.next() => { + match next { + Some(result) => result, + None => { break; } + } + }, + _ = self.cancel.cancelled() => { return Err(HeartbeaterError::Cancel); } + }; + + if let Some((node_id, status)) = maybe_status { + new_state.insert(node_id, status); + } + } + } + + let mut offline = 0; + for state in new_state.values() { + match state { + SafekeeperState::Offline { .. } => offline += 1, + SafekeeperState::Available { .. } => {} + } + } + + tracing::info!( + "Heartbeat round complete for {} safekeepers, {} offline", + new_state.len(), + offline + ); + + let mut deltas = Vec::new(); + let now = Instant::now(); + for (node_id, sk_state) in new_state.iter_mut() { + use std::collections::hash_map::Entry::*; + let entry = self.state.entry(*node_id); + + let mut needs_update = false; + match entry { + Occupied(ref occ) => match (occ.get(), &sk_state) { + (SafekeeperState::Offline, SafekeeperState::Offline) => {} + (SafekeeperState::Available { last_seen_at, .. }, SafekeeperState::Offline) => { + if now - *last_seen_at >= self.max_offline_interval { + deltas.push((*node_id, sk_state.clone())); + needs_update = true; + } + } + _ => { + deltas.push((*node_id, sk_state.clone())); + needs_update = true; + } + }, + Vacant(_) => { + // This is a new node. Always generate a delta so its initial state is reported.
+ deltas.push((*node_id, sk_state.clone())); + } + } + + match entry { + Occupied(mut occ) if needs_update => { + (*occ.get_mut()) = sk_state.clone(); + } + Vacant(vac) => { + vac.insert(sk_state.clone()); + } + _ => {} + } + } + + Ok(AvailablityDeltas(deltas)) + } +} diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 1a56116cad..1cc61a12e8 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -9,7 +9,10 @@ use crate::service::{LeadershipStatus, Service, RECONCILE_TIMEOUT, STARTUP_RECON use anyhow::Context; use futures::Future; use http_utils::{ - endpoint::{self, auth_middleware, check_permission_with, request_span}, + endpoint::{ + self, auth_middleware, check_permission_with, profile_cpu_handler, profile_heap_handler, + request_span, + }, error::ApiError, failpoints::failpoints_handler, json::{json_request, json_response}, @@ -54,7 +57,7 @@ pub struct HttpState { service: Arc, auth: Option>, neon_metrics: NeonMetrics, - allowlist_routes: Vec, + allowlist_routes: &'static [&'static str], } impl HttpState { @@ -63,15 +66,17 @@ impl HttpState { auth: Option>, build_info: BuildInfo, ) -> Self { - let allowlist_routes = ["/status", "/ready", "/metrics"] - .iter() - .map(|v| v.parse().unwrap()) - .collect::>(); Self { service, auth, neon_metrics: NeonMetrics::new(build_info), - allowlist_routes, + allowlist_routes: &[ + "/status", + "/ready", + "/metrics", + "/profile/cpu", + "/profile/heap", + ], } } } @@ -516,6 +521,35 @@ async fn handle_tenant_timeline_block_unblock_gc( json_response(StatusCode::OK, ()) } +async fn handle_tenant_timeline_download_heatmap_layers( + service: Arc, + req: Request, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?; + + check_permissions(&req, Scope::PageServerApi)?; + + let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; + let concurrency: Option = parse_query_param(&req, "concurrency")?; + + 
service + .tenant_timeline_download_heatmap_layers(tenant_shard_id, timeline_id, concurrency) + .await?; + + json_response(StatusCode::OK, ()) +} + +// For metric labels where we would like to include the approximate path, but exclude high-cardinality fields like query parameters +// and tenant/timeline IDs. Since we are proxying to arbitrary paths, we don't have routing templates to +// compare to, so we can just filter out our well known ID format with regexes. +fn path_without_ids(path: &str) -> String { + static ID_REGEX: std::sync::OnceLock = std::sync::OnceLock::new(); + ID_REGEX + .get_or_init(|| regex::Regex::new(r"([0-9a-fA-F]{32}(-[0-9]{4})?|\?.*)").unwrap()) + .replace_all(path, "") + .to_string() +} + async fn handle_tenant_timeline_passthrough( service: Arc, req: Request, @@ -551,10 +585,7 @@ async fn handle_tenant_timeline_passthrough( .metrics_group .storage_controller_passthrough_request_latency; - // This is a bit awkward. We remove the param from the request - // and join the words by '_' to get a label for the request. 
- let just_path = path.replace(&tenant_shard_str, ""); - let path_label = just_path + let path_label = path_without_ids(&path) .split('/') .filter(|token| !token.is_empty()) .collect::>() @@ -1390,23 +1421,26 @@ pub fn prologue_leadership_status_check_middleware< let state = get_state(&req); let leadership_status = state.service.get_leadership_status(); - enum AllowedRoutes<'a> { + enum AllowedRoutes { All, - Some(Vec<&'a str>), + Some(&'static [&'static str]), } let allowed_routes = match leadership_status { LeadershipStatus::Leader => AllowedRoutes::All, LeadershipStatus::SteppedDown => AllowedRoutes::All, - LeadershipStatus::Candidate => { - AllowedRoutes::Some(["/ready", "/status", "/metrics"].to_vec()) - } + LeadershipStatus::Candidate => AllowedRoutes::Some(&[ + "/ready", + "/status", + "/metrics", + "/profile/cpu", + "/profile/heap", + ]), }; - let uri = req.uri().to_string(); match allowed_routes { AllowedRoutes::All => Ok(req), - AllowedRoutes::Some(allowed) if allowed.contains(&uri.as_str()) => Ok(req), + AllowedRoutes::Some(allowed) if allowed.contains(&req.uri().path()) => Ok(req), _ => { tracing::info!( "Request {} not allowed due to current leadership state", @@ -1515,7 +1549,8 @@ enum ForwardOutcome { /// Potentially forward the request to the current storage controler leader. /// More specifically we forward when: -/// 1. Request is not one of ["/control/v1/step_down", "/status", "/ready", "/metrics"] +/// 1. Request is not one of: +/// ["/control/v1/step_down", "/status", "/ready", "/metrics", "/profile/cpu", "/profile/heap"] /// 2. Current instance is in [`LeadershipStatus::SteppedDown`] state /// 3. There is a leader in the database to forward to /// 4. Leader from step (3) is not the current instance @@ -1536,10 +1571,17 @@ enum ForwardOutcome { /// Hence, if we are in the edge case scenario the leader persisted in the database is the /// stepped down instance that received the request. Condition (4) above covers this scenario. 
async fn maybe_forward(req: Request) -> ForwardOutcome { - const NOT_FOR_FORWARD: [&str; 4] = ["/control/v1/step_down", "/status", "/ready", "/metrics"]; + const NOT_FOR_FORWARD: &[&str] = &[ + "/control/v1/step_down", + "/status", + "/ready", + "/metrics", + "/profile/cpu", + "/profile/heap", + ]; - let uri = req.uri().to_string(); - let uri_for_forward = !NOT_FOR_FORWARD.contains(&uri.as_str()); + let uri = req.uri(); + let uri_for_forward = !NOT_FOR_FORWARD.contains(&uri.path()); // Fast return before trying to take any Service locks, if we will never forward anyway if !uri_for_forward { @@ -1739,7 +1781,7 @@ pub fn make_router( if auth.is_some() { router = router.middleware(auth_middleware(|request| { let state = get_state(request); - if state.allowlist_routes.contains(request.uri()) { + if state.allowlist_routes.contains(&request.uri().path()) { None } else { state.auth.as_deref() @@ -1752,13 +1794,19 @@ pub fn make_router( .get("/metrics", |r| { named_request_span(r, measured_metrics_handler, RequestName("metrics")) }) - // Non-prefixed generic endpoints (status, metrics) + // Non-prefixed generic endpoints (status, metrics, profiling) .get("/status", |r| { named_request_span(r, handle_status, RequestName("status")) }) .get("/ready", |r| { named_request_span(r, handle_ready, RequestName("ready")) }) + .get("/profile/cpu", |r| { + named_request_span(r, profile_cpu_handler, RequestName("profile_cpu")) + }) + .get("/profile/heap", |r| { + named_request_span(r, profile_heap_handler, RequestName("profile_heap")) + }) // Upcalls for the pageserver: point the pageserver's `control_plane_api` config to this prefix .post("/upcall/v1/re-attach", |r| { named_request_span(r, handle_re_attach, RequestName("upcall_v1_reattach")) @@ -2070,6 +2118,16 @@ pub fn make_router( ) }, ) + .post( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_heatmap_layers", + |r| { + tenant_service_handler( + r, + handle_tenant_timeline_download_heatmap_layers, + 
RequestName("v1_tenant_timeline_download_heatmap_layers"), + ) + }, + ) // Tenant detail GET passthrough to shard zero: .get("/v1/tenant/:tenant_id", |r| { tenant_service_handler( @@ -2089,3 +2147,16 @@ pub fn make_router( ) }) } + +#[cfg(test)] +mod test { + + use super::path_without_ids; + + #[test] + fn test_path_without_ids() { + assert_eq!(path_without_ids("/v1/tenant/1a2b3344556677881122334455667788/timeline/AA223344556677881122334455667788"), "/v1/tenant//timeline/"); + assert_eq!(path_without_ids("/v1/tenant/1a2b3344556677881122334455667788-0108/timeline/AA223344556677881122334455667788"), "/v1/tenant//timeline/"); + assert_eq!(path_without_ids("/v1/tenant/1a2b3344556677881122334455667788-0108/timeline/AA223344556677881122334455667788?parameter=foo"), "/v1/tenant//timeline/"); + } +} diff --git a/storage_controller/src/lib.rs b/storage_controller/src/lib.rs index f5823935e1..5f2c081927 100644 --- a/storage_controller/src/lib.rs +++ b/storage_controller/src/lib.rs @@ -17,6 +17,8 @@ mod pageserver_client; mod peer_client; pub mod persistence; mod reconciler; +mod safekeeper; +mod safekeeper_client; mod scheduler; mod schema; pub mod service; diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 07279a67ff..9a9958f7a6 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -12,7 +12,8 @@ use storage_controller::persistence::Persistence; use storage_controller::service::chaos_injector::ChaosInjector; use storage_controller::service::{ Config, Service, HEARTBEAT_INTERVAL_DEFAULT, LONG_RECONCILE_THRESHOLD_DEFAULT, - MAX_OFFLINE_INTERVAL_DEFAULT, MAX_WARMING_UP_INTERVAL_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT, + MAX_OFFLINE_INTERVAL_DEFAULT, MAX_WARMING_UP_INTERVAL_DEFAULT, + PRIORITY_RECONCILER_CONCURRENCY_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT, }; use tokio::signal::unix::SignalKind; use tokio_util::sync::CancellationToken; @@ -26,6 +27,16 @@ use utils::{project_build_tag, project_git_version, 
tcp_listener}; project_git_version!(GIT_VERSION); project_build_tag!(BUILD_TAG); +#[global_allocator] +static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; + +/// Configure jemalloc to profile heap allocations by sampling stack traces every 2 MB (1 << 21). +/// This adds roughly 3% overhead for allocations on average, which is acceptable considering +/// performance-sensitive code will avoid allocations as far as possible anyway. +#[allow(non_upper_case_globals)] +#[export_name = "malloc_conf"] +pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0"; + #[derive(Parser)] #[command(author, version, about, long_about = None)] #[command(arg_required_else_help(true))] @@ -75,10 +86,14 @@ struct Cli { #[arg(long)] split_threshold: Option, - /// Maximum number of reconcilers that may run in parallel + /// Maximum number of normal-priority reconcilers that may run in parallel #[arg(long)] reconciler_concurrency: Option, + /// Maximum number of high-priority reconcilers that may run in parallel + #[arg(long)] + priority_reconciler_concurrency: Option, + /// How long to wait for the initial database connection to be available. 
#[arg(long, default_value = "5s")] db_connect_timeout: humantime::Duration, @@ -289,6 +304,9 @@ async fn async_main() -> anyhow::Result<()> { reconciler_concurrency: args .reconciler_concurrency .unwrap_or(RECONCILER_CONCURRENCY_DEFAULT), + priority_reconciler_concurrency: args + .priority_reconciler_concurrency + .unwrap_or(PRIORITY_RECONCILER_CONCURRENCY_DEFAULT), split_threshold: args.split_threshold, neon_local_repo_dir: args.neon_local_repo_dir, max_secondary_lag_bytes: args.max_secondary_lag_bytes, diff --git a/storage_controller/src/metrics.rs b/storage_controller/src/metrics.rs index 4164e3dc2b..6d67e0d130 100644 --- a/storage_controller/src/metrics.rs +++ b/storage_controller/src/metrics.rs @@ -80,6 +80,11 @@ pub(crate) struct StorageControllerMetricGroup { pub(crate) storage_controller_pageserver_request_error: measured::CounterVec, + /// Count of HTTP requests to the safekeeper that resulted in an error, + /// broken down by the safekeeper node id, request name and method + pub(crate) storage_controller_safekeeper_request_error: + measured::CounterVec, + /// Latency of HTTP requests to the pageserver, broken down by pageserver /// node id, request name and method. This include both successful and unsuccessful /// requests. @@ -87,6 +92,13 @@ pub(crate) struct StorageControllerMetricGroup { pub(crate) storage_controller_pageserver_request_latency: measured::HistogramVec, + /// Latency of HTTP requests to the safekeeper, broken down by safekeeper + /// node id, request name and method. This includes both successful and unsuccessful + /// requests.
+ #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))] + pub(crate) storage_controller_safekeeper_request_latency: + measured::HistogramVec, + /// Count of pass-through HTTP requests to the pageserver that resulted in an error, /// broken down by the pageserver node id, request name and method pub(crate) storage_controller_passthrough_request_error: diff --git a/storage_controller/src/pageserver_client.rs b/storage_controller/src/pageserver_client.rs index 141ff6f720..645cbdfce1 100644 --- a/storage_controller/src/pageserver_client.rs +++ b/storage_controller/src/pageserver_client.rs @@ -280,6 +280,22 @@ impl PageserverClient { ) } + pub(crate) async fn timeline_download_heatmap_layers( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + concurrency: Option, + ) -> Result<()> { + measured_request!( + "download_heatmap_layers", + crate::metrics::Method::Post, + &self.node_id_label, + self.inner + .timeline_download_heatmap_layers(tenant_shard_id, timeline_id, concurrency) + .await + ) + } + pub(crate) async fn get_utilization(&self) -> Result { measured_request!( "utilization", diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index c4e5b39589..67b60eadf3 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -1185,23 +1185,6 @@ impl Persistence { Ok(safekeepers) } - pub(crate) async fn safekeeper_get( - &self, - id: i64, - ) -> Result { - use crate::schema::safekeepers::dsl::{id as id_column, safekeepers}; - self.with_conn(move |conn| { - Box::pin(async move { - Ok(safekeepers - .filter(id_column.eq(&id)) - .select(SafekeeperPersistence::as_select()) - .get_result(conn) - .await?) 
- }) - }) - .await - } - pub(crate) async fn safekeeper_upsert( &self, record: SafekeeperUpsert, @@ -1554,6 +1537,21 @@ pub(crate) struct SafekeeperPersistence { } impl SafekeeperPersistence { + pub(crate) fn from_upsert( + upsert: SafekeeperUpsert, + scheduling_policy: SkSchedulingPolicy, + ) -> Self { + crate::persistence::SafekeeperPersistence { + id: upsert.id, + region_id: upsert.region_id, + version: upsert.version, + host: upsert.host, + port: upsert.port, + http_port: upsert.http_port, + availability_zone_id: upsert.availability_zone_id, + scheduling_policy: String::from(scheduling_policy), + } + } pub(crate) fn as_describe_response(&self) -> Result { let scheduling_policy = SkSchedulingPolicy::from_str(&self.scheduling_policy).map_err(|e| { diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index 58bc0ba1cd..48f0804926 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -1,7 +1,7 @@ use crate::pageserver_client::PageserverClient; use crate::persistence::Persistence; use crate::{compute_hook, service}; -use pageserver_api::controller_api::{AvailabilityZone, PlacementPolicy}; +use pageserver_api::controller_api::{AvailabilityZone, MigrationConfig, PlacementPolicy}; use pageserver_api::models::{ LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig, TenantWaitLsnRequest, }; @@ -91,9 +91,10 @@ pub(crate) struct ReconcilerConfigBuilder { } impl ReconcilerConfigBuilder { - pub(crate) fn new() -> Self { + /// Priority is special: you must pick one thoughtfully, do not just use 'normal' as the default + pub(crate) fn new(priority: ReconcilerPriority) -> Self { Self { - config: ReconcilerConfig::default(), + config: ReconcilerConfig::new(priority), } } @@ -129,8 +130,18 @@ impl ReconcilerConfigBuilder { } } -#[derive(Default, Debug, Copy, Clone)] +// Higher priorities are used for user-facing tasks, so that a long backlog of housekeeping work (e.g. 
reconciling on startup, rescheduling +// things on node changes) does not starve user-facing tasks. +#[derive(Debug, Copy, Clone)] +pub(crate) enum ReconcilerPriority { + Normal, + High, +} + +#[derive(Debug, Copy, Clone)] pub(crate) struct ReconcilerConfig { + pub(crate) priority: ReconcilerPriority, + // During live migration give up on warming-up the secondary // after this timeout. secondary_warmup_timeout: Option, @@ -145,6 +156,18 @@ pub(crate) struct ReconcilerConfig { } impl ReconcilerConfig { + /// Configs are always constructed with an explicit priority, to force callers to think about whether + /// the operation they're scheduling is high-priority or not. Normal priority is not a safe default, because + /// scheduling something user-facing at normal priority can result in it getting starved out by background work. + pub(crate) fn new(priority: ReconcilerPriority) -> Self { + Self { + priority, + secondary_warmup_timeout: None, + secondary_download_request_timeout: None, + tenant_creation_hint: false, + } + } + pub(crate) fn get_secondary_warmup_timeout(&self) -> Duration { const SECONDARY_WARMUP_TIMEOUT_DEFAULT: Duration = Duration::from_secs(300); self.secondary_warmup_timeout @@ -162,6 +185,24 @@ impl ReconcilerConfig { } } +impl From<&MigrationConfig> for ReconcilerConfig { + fn from(value: &MigrationConfig) -> Self { + // Run reconciler at high priority because MigrationConfig comes from human requests that should + // be presumed urgent. 
+ let mut builder = ReconcilerConfigBuilder::new(ReconcilerPriority::High); + + if let Some(timeout) = value.secondary_warmup_timeout { + builder = builder.secondary_warmup_timeout(timeout) + } + + if let Some(timeout) = value.secondary_download_request_timeout { + builder = builder.secondary_download_request_timeout(timeout) + } + + builder.build() + } +} + /// RAII resource units granted to a Reconciler, which it should keep alive until it finishes doing I/O pub(crate) struct ReconcileUnits { _sem_units: tokio::sync::OwnedSemaphorePermit, diff --git a/storage_controller/src/safekeeper.rs b/storage_controller/src/safekeeper.rs new file mode 100644 index 0000000000..53cd8a908b --- /dev/null +++ b/storage_controller/src/safekeeper.rs @@ -0,0 +1,147 @@ +use std::{str::FromStr, time::Duration}; + +use pageserver_api::controller_api::{SafekeeperDescribeResponse, SkSchedulingPolicy}; +use reqwest::StatusCode; +use safekeeper_client::mgmt_api; +use tokio_util::sync::CancellationToken; +use utils::{backoff, id::NodeId, logging::SecretString}; + +use crate::{ + heartbeater::SafekeeperState, + persistence::{DatabaseError, SafekeeperPersistence}, + safekeeper_client::SafekeeperClient, +}; + +#[derive(Clone)] +pub struct Safekeeper { + pub(crate) skp: SafekeeperPersistence, + cancel: CancellationToken, + listen_http_addr: String, + listen_http_port: u16, + scheduling_policy: SkSchedulingPolicy, + id: NodeId, + availability: SafekeeperState, +} + +impl Safekeeper { + pub(crate) fn from_persistence(skp: SafekeeperPersistence, cancel: CancellationToken) -> Self { + let scheduling_policy = SkSchedulingPolicy::from_str(&skp.scheduling_policy).unwrap(); + Self { + cancel, + listen_http_addr: skp.host.clone(), + listen_http_port: skp.http_port as u16, + id: NodeId(skp.id as u64), + skp, + availability: SafekeeperState::Offline, + scheduling_policy, + } + } + pub(crate) fn base_url(&self) -> String { + format!("http://{}:{}", self.listen_http_addr, self.listen_http_port) + } + + 
pub(crate) fn get_id(&self) -> NodeId { + self.id + } + pub(crate) fn describe_response(&self) -> Result { + self.skp.as_describe_response() + } + pub(crate) fn set_availability(&mut self, availability: SafekeeperState) { + self.availability = availability; + } + pub(crate) fn scheduling_policy(&self) -> SkSchedulingPolicy { + self.scheduling_policy + } + pub(crate) fn set_scheduling_policy(&mut self, scheduling_policy: SkSchedulingPolicy) { + self.scheduling_policy = scheduling_policy; + self.skp.scheduling_policy = String::from(scheduling_policy); + } + /// Perform an operation (which is given a [`SafekeeperClient`]) with retries + pub(crate) async fn with_client_retries( + &self, + mut op: O, + jwt: &Option, + warn_threshold: u32, + max_retries: u32, + timeout: Duration, + cancel: &CancellationToken, + ) -> mgmt_api::Result + where + O: FnMut(SafekeeperClient) -> F, + F: std::future::Future>, + { + fn is_fatal(e: &mgmt_api::Error) -> bool { + use mgmt_api::Error::*; + match e { + ReceiveBody(_) | ReceiveErrorBody(_) => false, + ApiError(StatusCode::SERVICE_UNAVAILABLE, _) + | ApiError(StatusCode::GATEWAY_TIMEOUT, _) + | ApiError(StatusCode::REQUEST_TIMEOUT, _) => false, + ApiError(_, _) => true, + Cancelled => true, + } + } + + backoff::retry( + || { + let http_client = reqwest::ClientBuilder::new() + .timeout(timeout) + .build() + .expect("Failed to construct HTTP client"); + + let client = SafekeeperClient::from_client( + self.get_id(), + http_client, + self.base_url(), + jwt.clone(), + ); + + let node_cancel_fut = self.cancel.cancelled(); + + let op_fut = op(client); + + async { + tokio::select! 
{ + r = op_fut=> {r}, + _ = node_cancel_fut => { + Err(mgmt_api::Error::Cancelled) + }} + } + }, + is_fatal, + warn_threshold, + max_retries, + &format!( + "Call to safekeeper {} ({}:{}) management API", + self.id, self.listen_http_addr, self.listen_http_port + ), + cancel, + ) + .await + .unwrap_or(Err(mgmt_api::Error::Cancelled)) + } + + pub(crate) fn update_from_record(&mut self, record: crate::persistence::SafekeeperUpsert) { + let crate::persistence::SafekeeperUpsert { + active: _, + availability_zone_id: _, + host, + http_port, + id, + port: _, + region_id: _, + version: _, + } = record.clone(); + if id != self.id.0 as i64 { + // The way the function is called ensures this. If we regress on that, it's a bug. + panic!( + "id can't be changed via update_from_record function: {id} != {}", + self.id.0 + ); + } + self.skp = + crate::persistence::SafekeeperPersistence::from_upsert(record, self.scheduling_policy); + self.listen_http_port = http_port as u16; + self.listen_http_addr = host; + } +} diff --git a/storage_controller/src/safekeeper_client.rs b/storage_controller/src/safekeeper_client.rs new file mode 100644 index 0000000000..f234ab3429 --- /dev/null +++ b/storage_controller/src/safekeeper_client.rs @@ -0,0 +1,121 @@ +use crate::metrics::PageserverRequestLabelGroup; +use safekeeper_api::models::{ + PullTimelineRequest, PullTimelineResponse, SafekeeperUtilization, TimelineCreateRequest, + TimelineStatus, +}; +use safekeeper_client::mgmt_api::{Client, Result}; +use utils::{ + id::{NodeId, TenantId, TimelineId}, + logging::SecretString, +}; + +/// Thin wrapper around [`safekeeper_client::mgmt_api::Client`]. It allows the storage +/// controller to collect metrics in a non-intrusive manner. +/// +/// Analogous to [`crate::pageserver_client::PageserverClient`]. +#[derive(Debug, Clone)] +pub(crate) struct SafekeeperClient { + inner: Client, + node_id_label: String, +} + +macro_rules! 
measured_request { + ($name:literal, $method:expr, $node_id: expr, $invoke:expr) => {{ + let labels = PageserverRequestLabelGroup { + pageserver_id: $node_id, + path: $name, + method: $method, + }; + + let latency = &crate::metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_safekeeper_request_latency; + let _timer_guard = latency.start_timer(labels.clone()); + + let res = $invoke; + + if res.is_err() { + let error_counters = &crate::metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_pageserver_request_error; + error_counters.inc(labels) + } + + res + }}; +} + +impl SafekeeperClient { + #[allow(dead_code)] + pub(crate) fn new( + node_id: NodeId, + mgmt_api_endpoint: String, + jwt: Option, + ) -> Self { + Self { + inner: Client::from_client(reqwest::Client::new(), mgmt_api_endpoint, jwt), + node_id_label: node_id.0.to_string(), + } + } + + pub(crate) fn from_client( + node_id: NodeId, + raw_client: reqwest::Client, + mgmt_api_endpoint: String, + jwt: Option, + ) -> Self { + Self { + inner: Client::from_client(raw_client, mgmt_api_endpoint, jwt), + node_id_label: node_id.0.to_string(), + } + } + + #[allow(dead_code)] + pub(crate) async fn create_timeline( + &self, + req: &TimelineCreateRequest, + ) -> Result { + measured_request!( + "create_timeline", + crate::metrics::Method::Post, + &self.node_id_label, + self.inner.create_timeline(req).await + ) + } + + #[allow(dead_code)] + pub(crate) async fn delete_timeline( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> Result { + measured_request!( + "delete_timeline", + crate::metrics::Method::Delete, + &self.node_id_label, + self.inner.delete_timeline(tenant_id, timeline_id).await + ) + } + + #[allow(dead_code)] + pub(crate) async fn pull_timeline( + &self, + req: &PullTimelineRequest, + ) -> Result { + measured_request!( + "pull_timeline", + crate::metrics::Method::Post, + &self.node_id_label, + self.inner.pull_timeline(req).await + ) + } + + pub(crate) async fn 
get_utilization(&self) -> Result { + measured_request!( + "utilization", + crate::metrics::Method::Get, + &self.node_id_label, + self.inner.utilization().await + ) + } +} diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 6829663a4c..fc6d2f3d29 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -2,6 +2,7 @@ pub mod chaos_injector; mod context_iterator; use hyper::Uri; +use safekeeper_api::models::SafekeeperUtilization; use std::{ borrow::Cow, cmp::Ordering, @@ -20,6 +21,7 @@ use crate::{ }, compute_hook::{self, NotifyError}, drain_utils::{self, TenantShardDrain, TenantShardIterator}, + heartbeater::SafekeeperState, id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, TracingExclusiveGuard}, leadership::Leadership, metrics, @@ -28,7 +30,11 @@ use crate::{ AbortShardSplitStatus, ControllerPersistence, DatabaseResult, MetadataHealthPersistence, ShardGenerationState, TenantFilter, }, - reconciler::{ReconcileError, ReconcileUnits, ReconcilerConfig, ReconcilerConfigBuilder}, + reconciler::{ + ReconcileError, ReconcileUnits, ReconcilerConfig, ReconcilerConfigBuilder, + ReconcilerPriority, + }, + safekeeper::Safekeeper, scheduler::{MaySchedule, ScheduleContext, ScheduleError, ScheduleMode}, tenant_shard::{ MigrateAttachment, ObservedStateDelta, ReconcileNeeded, ReconcilerStatus, @@ -76,7 +82,7 @@ use pageserver_api::{ }, }; use pageserver_client::{mgmt_api, BlockUnblock}; -use tokio::sync::mpsc::error::TrySendError; +use tokio::sync::{mpsc::error::TrySendError, TryAcquireError}; use tokio_util::sync::CancellationToken; use utils::{ completion::Barrier, @@ -156,6 +162,7 @@ enum TenantOperations { TimelineDetachAncestor, TimelineGcBlockUnblock, DropDetached, + DownloadHeatmapLayers, } #[derive(Clone, strum_macros::Display)] @@ -192,6 +199,7 @@ pub(crate) enum LeadershipStatus { } pub const RECONCILER_CONCURRENCY_DEFAULT: usize = 128; +pub const PRIORITY_RECONCILER_CONCURRENCY_DEFAULT: 
usize = 256; // Depth of the channel used to enqueue shards for reconciliation when they can't do it immediately. // This channel is finite-size to avoid using excessive memory if we get into a state where reconciles are finishing more slowly @@ -206,6 +214,8 @@ struct ServiceState { nodes: Arc>, + safekeepers: Arc>, + scheduler: Scheduler, /// Ongoing background operation on the cluster if any is running. @@ -272,6 +282,7 @@ fn passthrough_api_error(node: &Node, e: mgmt_api::Error) -> ApiError { impl ServiceState { fn new( nodes: HashMap, + safekeepers: HashMap, tenants: BTreeMap, scheduler: Scheduler, delayed_reconcile_rx: tokio::sync::mpsc::Receiver, @@ -283,6 +294,7 @@ impl ServiceState { leadership_status: initial_leadership_status, tenants, nodes: Arc::new(nodes), + safekeepers: Arc::new(safekeepers), scheduler, ongoing_operation: None, delayed_reconcile_rx, @@ -299,6 +311,23 @@ impl ServiceState { (&mut self.nodes, &mut self.tenants, &mut self.scheduler) } + #[allow(clippy::type_complexity)] + fn parts_mut_sk( + &mut self, + ) -> ( + &mut Arc>, + &mut Arc>, + &mut BTreeMap, + &mut Scheduler, + ) { + ( + &mut self.nodes, + &mut self.safekeepers, + &mut self.tenants, + &mut self.scheduler, + ) + } + fn get_leadership_status(&self) -> LeadershipStatus { self.leadership_status } @@ -342,9 +371,12 @@ pub struct Config { /// and/or upon handling the re-attach request from a node. pub max_warming_up_interval: Duration, - /// How many Reconcilers may be spawned concurrently + /// How many normal-priority Reconcilers may be spawned concurrently pub reconciler_concurrency: usize, + /// How many high-priority Reconcilers may be spawned concurrently + pub priority_reconciler_concurrency: usize, + /// How large must a shard grow in bytes before we split it? /// None disables auto-splitting. 
pub split_threshold: Option, @@ -397,7 +429,8 @@ pub struct Service { compute_hook: Arc, result_tx: tokio::sync::mpsc::UnboundedSender, - heartbeater: Heartbeater, + heartbeater_ps: Heartbeater, + heartbeater_sk: Heartbeater, // Channel for background cleanup from failed operations that require cleanup, such as shard split abort_tx: tokio::sync::mpsc::UnboundedSender, @@ -411,9 +444,14 @@ pub struct Service { // that transition it to/from Active. node_op_locks: IdLockMap, - // Limit how many Reconcilers we will spawn concurrently + // Limit how many Reconcilers we will spawn concurrently for normal-priority tasks such as background reconciliations + // and reconciliation on startup. reconciler_concurrency: Arc, + // Limit how many Reconcilers we will spawn concurrently for high-priority tasks such as tenant/timeline CRUD, which + // a human user might be waiting for. + priority_reconciler_concurrency: Arc, + /// Queue of tenants who are waiting for concurrency limits to permit them to reconcile /// Send into this queue to promptly attempt to reconcile this shard next time units are available. /// @@ -607,7 +645,8 @@ impl Service { let locked = self.inner.read().unwrap(); locked.nodes.clone() }; - let mut nodes_online = self.initial_heartbeat_round(all_nodes.keys()).await; + let (mut nodes_online, mut sks_online) = + self.initial_heartbeat_round(all_nodes.keys()).await; // List of tenants for which we will attempt to notify compute of their location at startup let mut compute_notifications = Vec::new(); @@ -616,7 +655,7 @@ impl Service { tracing::info!("Populating tenant shards' states from initial pageserver scan..."); let shard_count = { let mut locked = self.inner.write().unwrap(); - let (nodes, tenants, scheduler) = locked.parts_mut(); + let (nodes, safekeepers, tenants, scheduler) = locked.parts_mut_sk(); // Mark nodes online if they responded to us: nodes are offline by default after a restart. 
let mut new_nodes = (**nodes).clone(); @@ -628,6 +667,17 @@ impl Service { } *nodes = Arc::new(new_nodes); + let mut new_sks = (**safekeepers).clone(); + for (node_id, node) in new_sks.iter_mut() { + if let Some((utilization, last_seen_at)) = sks_online.remove(node_id) { + node.set_availability(SafekeeperState::Available { + utilization, + last_seen_at, + }); + } + } + *safekeepers = Arc::new(new_sks); + for (tenant_shard_id, observed_state) in observed.0 { let Some(tenant_shard) = tenants.get_mut(&tenant_shard_id) else { for node_id in observed_state.locations.keys() { @@ -736,7 +786,10 @@ impl Service { async fn initial_heartbeat_round<'a>( &self, node_ids: impl Iterator, - ) -> HashMap { + ) -> ( + HashMap, + HashMap, + ) { assert!(!self.startup_complete.is_ready()); let all_nodes = { @@ -756,14 +809,21 @@ impl Service { } } + let all_sks = { + let locked = self.inner.read().unwrap(); + locked.safekeepers.clone() + }; + tracing::info!("Sending initial heartbeats..."); - let res = self - .heartbeater - .heartbeat(Arc::new(nodes_to_heartbeat)) - .await; + // Put a small, but reasonable timeout to get the initial heartbeats of the safekeepers to avoid a storage controller downtime + const SK_TIMEOUT: Duration = Duration::from_secs(5); + let (res_ps, res_sk) = tokio::join!( + self.heartbeater_ps.heartbeat(Arc::new(nodes_to_heartbeat)), + tokio::time::timeout(SK_TIMEOUT, self.heartbeater_sk.heartbeat(all_sks)) + ); let mut online_nodes = HashMap::new(); - if let Ok(deltas) = res { + if let Ok(deltas) = res_ps { for (node_id, status) in deltas.0 { match status { PageserverState::Available { utilization, .. 
} => { @@ -777,7 +837,22 @@ impl Service { } } - online_nodes + let mut online_sks = HashMap::new(); + if let Ok(Ok(deltas)) = res_sk { + for (node_id, status) in deltas.0 { + match status { + SafekeeperState::Available { + utilization, + last_seen_at, + } => { + online_sks.insert(node_id, (utilization, last_seen_at)); + } + SafekeeperState::Offline => {} + } + } + } + + (online_nodes, online_sks) } /// Used during [`Self::startup_reconcile`]: issue GETs to all nodes concurrently, with a deadline. @@ -957,12 +1032,11 @@ impl Service { let reconciles_spawned = self.reconcile_all(); if reconciles_spawned == 0 { // Run optimizer only when we didn't find any other work to do - let optimizations = self.optimize_all().await; - if optimizations == 0 { - // Run new splits only when no optimizations are pending - self.autosplit_tenants().await; - } + self.optimize_all().await; } + // Always attempt autosplits. Sharding is crucial for bulk ingest performance, so we + // must be responsive when new projects begin ingesting and reach the threshold. 
+ self.autosplit_tenants().await; } _ = self.reconcilers_cancel.cancelled() => return } @@ -984,8 +1058,18 @@ impl Service { locked.nodes.clone() }; - let res = self.heartbeater.heartbeat(nodes).await; - if let Ok(deltas) = res { + let safekeepers = { + let locked = self.inner.read().unwrap(); + locked.safekeepers.clone() + }; + + const SK_TIMEOUT: Duration = Duration::from_secs(3); + let (res_ps, res_sk) = tokio::join!( + self.heartbeater_ps.heartbeat(nodes), + tokio::time::timeout(SK_TIMEOUT, self.heartbeater_sk.heartbeat(safekeepers)) + ); + + if let Ok(deltas) = res_ps { let mut to_handle = Vec::default(); for (node_id, state) in deltas.0 { @@ -1086,6 +1170,18 @@ impl Service { } } } + if let Ok(Ok(deltas)) = res_sk { + let mut locked = self.inner.write().unwrap(); + let mut safekeepers = (*locked.safekeepers).clone(); + for (id, state) in deltas.0 { + let Some(sk) = safekeepers.get_mut(&id) else { + tracing::info!("Couldn't update safekeeper safekeeper state for id {id} from heartbeat={state:?}"); + continue; + }; + sk.set_availability(state); + } + locked.safekeepers = Arc::new(safekeepers); + } } } @@ -1184,12 +1280,15 @@ impl Service { } // Maybe some other work can proceed now that this job finished. + // + // Only bother with this if we have some semaphore units available in the normal-priority semaphore (these + // reconciles are scheduled at `[ReconcilerPriority::Normal]`). 
if self.reconciler_concurrency.available_permits() > 0 { while let Ok(tenant_shard_id) = locked.delayed_reconcile_rx.try_recv() { let (nodes, tenants, _scheduler) = locked.parts_mut(); if let Some(shard) = tenants.get_mut(&tenant_shard_id) { shard.delayed_reconcile = false; - self.maybe_reconcile_shard(shard, nodes); + self.maybe_reconcile_shard(shard, nodes, ReconcilerPriority::Normal); } if self.reconciler_concurrency.available_permits() == 0 { @@ -1311,6 +1410,17 @@ impl Service { .storage_controller_pageserver_nodes .set(nodes.len() as i64); + tracing::info!("Loading safekeepers from database..."); + let safekeepers = persistence + .list_safekeepers() + .await? + .into_iter() + .map(|skp| Safekeeper::from_persistence(skp, CancellationToken::new())) + .collect::>(); + let safekeepers: HashMap = + safekeepers.into_iter().map(|n| (n.get_id(), n)).collect(); + tracing::info!("Loaded {} safekeepers from database.", safekeepers.len()); + tracing::info!("Loading shards from database..."); let mut tenant_shard_persistence = persistence.load_active_tenant_shards().await?; tracing::info!( @@ -1437,7 +1547,14 @@ impl Service { let cancel = CancellationToken::new(); let reconcilers_cancel = cancel.child_token(); - let heartbeater = Heartbeater::new( + let heartbeater_ps = Heartbeater::new( + config.jwt_token.clone(), + config.max_offline_interval, + config.max_warming_up_interval, + cancel.clone(), + ); + + let heartbeater_sk = Heartbeater::new( config.jwt_token.clone(), config.max_offline_interval, config.max_warming_up_interval, @@ -1453,6 +1570,7 @@ impl Service { let this = Arc::new(Self { inner: Arc::new(std::sync::RwLock::new(ServiceState::new( nodes, + safekeepers, tenants, scheduler, delayed_reconcile_rx, @@ -1462,10 +1580,14 @@ impl Service { persistence, compute_hook: Arc::new(ComputeHook::new(config.clone())), result_tx, - heartbeater, + heartbeater_ps, + heartbeater_sk, reconciler_concurrency: Arc::new(tokio::sync::Semaphore::new( config.reconciler_concurrency, 
)), + priority_reconciler_concurrency: Arc::new(tokio::sync::Semaphore::new( + config.priority_reconciler_concurrency, + )), delayed_reconcile_tx, abort_tx, startup_complete: startup_complete.clone(), @@ -2238,7 +2360,7 @@ impl Service { let waiters = { let mut locked = self.inner.write().unwrap(); let (nodes, tenants, _scheduler) = locked.parts_mut(); - let config = ReconcilerConfigBuilder::new() + let config = ReconcilerConfigBuilder::new(ReconcilerPriority::High) .tenant_creation_hint(true) .build(); tenants @@ -2713,7 +2835,8 @@ impl Service { shard.schedule(scheduler, &mut schedule_context)?; - let maybe_waiter = self.maybe_reconcile_shard(shard, nodes); + let maybe_waiter = + self.maybe_reconcile_shard(shard, nodes, ReconcilerPriority::High); if let Some(waiter) = maybe_waiter { waiters.push(waiter); } @@ -2834,7 +2957,9 @@ impl Service { let (nodes, tenants, _scheduler) = locked.parts_mut(); for (_shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) { shard.config = config.clone(); - if let Some(waiter) = self.maybe_reconcile_shard(shard, nodes) { + if let Some(waiter) = + self.maybe_reconcile_shard(shard, nodes, ReconcilerPriority::High) + { waiters.push(waiter); } } @@ -3116,7 +3241,9 @@ impl Service { debug_assert!(shard.intent.get_attached().is_none()); debug_assert!(shard.intent.get_secondary().is_empty()); - if let Some(waiter) = self.maybe_reconcile_shard(shard, nodes) { + if let Some(waiter) = + self.maybe_reconcile_shard(shard, nodes, ReconcilerPriority::High) + { detach_waiters.push(waiter); } } @@ -3268,7 +3395,7 @@ impl Service { // In case scheduling is being switched back on, try it now. 
shard.schedule(scheduler, &mut schedule_context).ok(); - self.maybe_reconcile_shard(shard, nodes); + self.maybe_reconcile_shard(shard, nodes, ReconcilerPriority::High); } Ok(()) @@ -3635,6 +3762,61 @@ impl Service { Ok(()) } + pub(crate) async fn tenant_timeline_download_heatmap_layers( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + concurrency: Option, + ) -> Result<(), ApiError> { + let _tenant_lock = trace_shared_lock( + &self.tenant_op_locks, + tenant_shard_id.tenant_id, + TenantOperations::DownloadHeatmapLayers, + ) + .await; + + let targets = { + let locked = self.inner.read().unwrap(); + let mut targets = Vec::new(); + + // If the request got an unsharded tenant id, then apply + // the operation to all shards. Otherwise, apply it to a specific shard. + let shards_range = if tenant_shard_id.is_unsharded() { + TenantShardId::tenant_range(tenant_shard_id.tenant_id) + } else { + tenant_shard_id.range() + }; + + for (tenant_shard_id, shard) in locked.tenants.range(shards_range) { + if let Some(node_id) = shard.intent.get_attached() { + let node = locked + .nodes + .get(node_id) + .expect("Pageservers may not be deleted while referenced"); + + targets.push((*tenant_shard_id, node.clone())); + } + } + targets + }; + + self.tenant_for_shards_api( + targets, + |tenant_shard_id, client| async move { + client + .timeline_download_heatmap_layers(tenant_shard_id, timeline_id, concurrency) + .await + }, + 1, + 1, + SHORT_RECONCILE_TIMEOUT, + &self.cancel, + ) + .await; + + Ok(()) + } + /// Helper for concurrently calling a pageserver API on a number of shards, such as timeline creation. /// /// On success, the returned vector contains exactly the same number of elements as the input `locations`. 
@@ -4317,7 +4499,7 @@ impl Service { tracing::warn!("Failed to schedule {tenant_shard_id} during shard abort: {e}") } - self.maybe_reconcile_shard(shard, nodes); + self.maybe_reconcile_shard(shard, nodes, ReconcilerPriority::High); } // We don't expect any new_shard_count shards to exist here, but drop them just in case @@ -4483,7 +4665,11 @@ impl Service { tracing::warn!("Failed to schedule child shard {child}: {e}"); } // In the background, attach secondary locations for the new shards - if let Some(waiter) = self.maybe_reconcile_shard(&mut child_state, nodes) { + if let Some(waiter) = self.maybe_reconcile_shard( + &mut child_state, + nodes, + ReconcilerPriority::High, + ) { waiters.push(waiter); } @@ -4848,7 +5034,9 @@ impl Service { shard.intent.clear_secondary(scheduler); // Run Reconciler to execute detach fo secondary locations. - if let Some(waiter) = self.maybe_reconcile_shard(shard, nodes) { + if let Some(waiter) = + self.maybe_reconcile_shard(shard, nodes, ReconcilerPriority::High) + { waiters.push(waiter); } } @@ -5114,7 +5302,12 @@ impl Service { shard.sequence = shard.sequence.next(); } - self.maybe_reconcile_shard(shard, nodes) + let reconciler_config = match migrate_req.migration_config { + Some(cfg) => (&cfg).into(), + None => ReconcilerConfig::new(ReconcilerPriority::High), + }; + + self.maybe_configured_reconcile_shard(shard, nodes, reconciler_config) }; if let Some(waiter) = waiter { @@ -5177,7 +5370,7 @@ impl Service { ); } - self.maybe_reconcile_shard(shard, nodes) + self.maybe_reconcile_shard(shard, nodes, ReconcilerPriority::High) }; if let Some(waiter) = waiter { @@ -5589,7 +5782,7 @@ impl Service { ) } - self.maybe_reconcile_shard(shard, nodes); + self.maybe_reconcile_shard(shard, nodes, ReconcilerPriority::Normal); } // Here we remove an existing observed location for the node we're removing, and it will @@ -5958,7 +6151,14 @@ impl Service { tracing::warn!(%tenant_shard_id, "Scheduling error when marking pageserver {} offline: {e}", 
node_id); } Ok(()) => { - if self.maybe_reconcile_shard(tenant_shard, nodes).is_some() { + if self + .maybe_reconcile_shard( + tenant_shard, + nodes, + ReconcilerPriority::Normal, + ) + .is_some() + { tenants_affected += 1; }; } @@ -5989,7 +6189,11 @@ impl Service { if let Some(observed_loc) = tenant_shard.observed.locations.get_mut(&node_id) { if observed_loc.conf.is_none() { - self.maybe_reconcile_shard(tenant_shard, nodes); + self.maybe_reconcile_shard( + tenant_shard, + nodes, + ReconcilerPriority::Normal, + ); } } } @@ -6353,8 +6557,36 @@ impl Service { &self, shard: &mut TenantShard, nodes: &Arc>, + priority: ReconcilerPriority, ) -> Option { - self.maybe_configured_reconcile_shard(shard, nodes, ReconcilerConfig::default()) + self.maybe_configured_reconcile_shard(shard, nodes, ReconcilerConfig::new(priority)) + } + + /// Before constructing a Reconciler, acquire semaphore units from the appropriate concurrency limit (depends on priority) + fn get_reconciler_units( + &self, + priority: ReconcilerPriority, + ) -> Result { + let units = match priority { + ReconcilerPriority::Normal => self.reconciler_concurrency.clone().try_acquire_owned(), + ReconcilerPriority::High => { + match self + .priority_reconciler_concurrency + .clone() + .try_acquire_owned() + { + Ok(u) => Ok(u), + Err(TryAcquireError::NoPermits) => { + // If the high priority semaphore is exhausted, then high priority tasks may steal units from + // the normal priority semaphore. 
+ self.reconciler_concurrency.clone().try_acquire_owned() + } + Err(e) => Err(e), + } + } + }; + + units.map(ReconcileUnits::new) } /// Wrap [`TenantShard`] reconciliation methods with acquisition of [`Gate`] and [`ReconcileUnits`], @@ -6374,8 +6606,8 @@ impl Service { } }; - let units = match self.reconciler_concurrency.clone().try_acquire_owned() { - Ok(u) => ReconcileUnits::new(u), + let units = match self.get_reconciler_units(reconciler_config.priority) { + Ok(u) => u, Err(_) => { tracing::info!(tenant_id=%shard.tenant_shard_id.tenant_id, shard_id=%shard.tenant_shard_id.shard_slug(), "Concurrency limited: enqueued for reconcile later"); @@ -6468,7 +6700,10 @@ impl Service { // Eventual consistency: if an earlier reconcile job failed, and the shard is still // dirty, spawn another rone - if self.maybe_reconcile_shard(shard, &pageservers).is_some() { + if self + .maybe_reconcile_shard(shard, &pageservers, ReconcilerPriority::Normal) + .is_some() + { reconciles_spawned += 1; } else if shard.delayed_reconcile { // Shard wanted to reconcile but for some reason couldn't. @@ -6554,7 +6789,10 @@ impl Service { tracing::info!(tenant_shard_id=%tenant_shard_id, "Applying optimization: {optimization:?}"); if shard.apply_optimization(scheduler, optimization) { optimizations_applied += 1; - if self.maybe_reconcile_shard(shard, nodes).is_some() { + if self + .maybe_reconcile_shard(shard, nodes, ReconcilerPriority::Normal) + .is_some() + { reconciles_spawned += 1; } } @@ -7104,7 +7342,7 @@ impl Service { // to not stall the operation when a cold secondary is encountered. 
const SECONDARY_WARMUP_TIMEOUT: Duration = Duration::from_secs(20); const SECONDARY_DOWNLOAD_REQUEST_TIMEOUT: Duration = Duration::from_secs(5); - let reconciler_config = ReconcilerConfigBuilder::new() + let reconciler_config = ReconcilerConfigBuilder::new(ReconcilerPriority::Normal) .secondary_warmup_timeout(SECONDARY_WARMUP_TIMEOUT) .secondary_download_request_timeout(SECONDARY_DOWNLOAD_REQUEST_TIMEOUT) .build(); @@ -7437,7 +7675,7 @@ impl Service { ) -> Result<(), OperationError> { const SECONDARY_WARMUP_TIMEOUT: Duration = Duration::from_secs(20); const SECONDARY_DOWNLOAD_REQUEST_TIMEOUT: Duration = Duration::from_secs(5); - let reconciler_config = ReconcilerConfigBuilder::new() + let reconciler_config = ReconcilerConfigBuilder::new(ReconcilerPriority::Normal) .secondary_warmup_timeout(SECONDARY_WARMUP_TIMEOUT) .secondary_download_request_timeout(SECONDARY_DOWNLOAD_REQUEST_TIMEOUT) .build(); @@ -7661,29 +7899,54 @@ impl Service { pub(crate) async fn safekeepers_list( &self, ) -> Result, DatabaseError> { - self.persistence - .list_safekeepers() - .await? 
- .into_iter() - .map(|v| v.as_describe_response()) - .collect::, _>>() + let locked = self.inner.read().unwrap(); + let mut list = locked + .safekeepers + .iter() + .map(|sk| sk.1.describe_response()) + .collect::, _>>()?; + list.sort_by_key(|v| v.id); + Ok(list) } pub(crate) async fn get_safekeeper( &self, id: i64, ) -> Result { - self.persistence - .safekeeper_get(id) - .await - .and_then(|v| v.as_describe_response()) + let locked = self.inner.read().unwrap(); + let sk = locked + .safekeepers + .get(&NodeId(id as u64)) + .ok_or(diesel::result::Error::NotFound)?; + sk.describe_response() } pub(crate) async fn upsert_safekeeper( &self, record: crate::persistence::SafekeeperUpsert, ) -> Result<(), DatabaseError> { - self.persistence.safekeeper_upsert(record).await + let node_id = NodeId(record.id as u64); + self.persistence.safekeeper_upsert(record.clone()).await?; + { + let mut locked = self.inner.write().unwrap(); + let mut safekeepers = (*locked.safekeepers).clone(); + match safekeepers.entry(node_id) { + std::collections::hash_map::Entry::Occupied(mut entry) => { + entry.get_mut().update_from_record(record); + } + std::collections::hash_map::Entry::Vacant(entry) => { + entry.insert(Safekeeper::from_persistence( + crate::persistence::SafekeeperPersistence::from_upsert( + record, + SkSchedulingPolicy::Pause, + ), + CancellationToken::new(), + )); + } + } + locked.safekeepers = Arc::new(safekeepers); + } + Ok(()) } pub(crate) async fn set_safekeeper_scheduling_policy( @@ -7693,7 +7956,20 @@ impl Service { ) -> Result<(), DatabaseError> { self.persistence .set_safekeeper_scheduling_policy(id, scheduling_policy) - .await + .await?; + let node_id = NodeId(id as u64); + // After the change has been persisted successfully, update the in-memory state + { + let mut locked = self.inner.write().unwrap(); + let mut safekeepers = (*locked.safekeepers).clone(); + let sk = safekeepers + .get_mut(&node_id) + .ok_or(DatabaseError::Logical("Not found".to_string()))?; + 
sk.set_scheduling_policy(scheduling_policy); + + locked.safekeepers = Arc::new(safekeepers); + } + Ok(()) } pub(crate) async fn update_shards_preferred_azs( diff --git a/storage_controller/src/service/chaos_injector.rs b/storage_controller/src/service/chaos_injector.rs index 91d7183fde..aa0ee0df5a 100644 --- a/storage_controller/src/service/chaos_injector.rs +++ b/storage_controller/src/service/chaos_injector.rs @@ -88,7 +88,11 @@ impl ChaosInjector { shard.intent.demote_attached(scheduler, old_location); shard.intent.promote_attached(scheduler, new_location); - self.service.maybe_reconcile_shard(shard, nodes); + self.service.maybe_reconcile_shard( + shard, + nodes, + crate::reconciler::ReconcilerPriority::Normal, + ); } async fn inject_chaos(&mut self) { diff --git a/test_runner/fixtures/fast_import.py b/test_runner/fixtures/fast_import.py index 33248132ab..d674be99de 100644 --- a/test_runner/fixtures/fast_import.py +++ b/test_runner/fixtures/fast_import.py @@ -4,8 +4,10 @@ import subprocess import tempfile from collections.abc import Iterator from pathlib import Path +from typing import cast import pytest +from _pytest.config import Config from fixtures.log_helper import log from fixtures.neon_cli import AbstractNeonCli @@ -23,6 +25,7 @@ class FastImport(AbstractNeonCli): pg_distrib_dir: Path, pg_version: PgVersion, workdir: Path, + cleanup: bool = True, ): if extra_env is None: env_vars = {} @@ -47,12 +50,43 @@ class FastImport(AbstractNeonCli): if not workdir.exists(): raise Exception(f"Working directory '{workdir}' does not exist") self.workdir = workdir + self.cleanup = cleanup + + def run_pgdata( + self, + s3prefix: str | None = None, + pg_port: int | None = None, + source_connection_string: str | None = None, + interactive: bool = False, + ): + return self.run( + "pgdata", + s3prefix=s3prefix, + pg_port=pg_port, + source_connection_string=source_connection_string, + interactive=interactive, + ) + + def run_dump_restore( + self, + s3prefix: str | None = 
None, + source_connection_string: str | None = None, + destination_connection_string: str | None = None, + ): + return self.run( + "dump-restore", + s3prefix=s3prefix, + source_connection_string=source_connection_string, + destination_connection_string=destination_connection_string, + ) def run( self, - pg_port: int, - source_connection_string: str | None = None, + command: str, s3prefix: str | None = None, + pg_port: int | None = None, + source_connection_string: str | None = None, + destination_connection_string: str | None = None, interactive: bool = False, ) -> subprocess.CompletedProcess[str]: if self.cmd is not None: @@ -60,13 +94,17 @@ class FastImport(AbstractNeonCli): args = [ f"--pg-bin-dir={self.pg_bin}", f"--pg-lib-dir={self.pg_lib}", - f"--pg-port={pg_port}", f"--working-directory={self.workdir}", ] - if source_connection_string is not None: - args.append(f"--source-connection-string={source_connection_string}") if s3prefix is not None: args.append(f"--s3-prefix={s3prefix}") + args.append(command) + if pg_port is not None: + args.append(f"--pg-port={pg_port}") + if source_connection_string is not None: + args.append(f"--source-connection-string={source_connection_string}") + if destination_connection_string is not None: + args.append(f"--destination-connection-string={destination_connection_string}") if interactive: args.append("--interactive") @@ -77,7 +115,7 @@ class FastImport(AbstractNeonCli): return self def __exit__(self, *args): - if self.workdir.exists(): + if self.workdir.exists() and self.cleanup: shutil.rmtree(self.workdir) @@ -87,9 +125,17 @@ def fast_import( test_output_dir: Path, neon_binpath: Path, pg_distrib_dir: Path, + pytestconfig: Config, ) -> Iterator[FastImport]: - workdir = Path(tempfile.mkdtemp()) - with FastImport(None, neon_binpath, pg_distrib_dir, pg_version, workdir) as fi: + workdir = Path(tempfile.mkdtemp(dir=test_output_dir, prefix="fast_import_")) + with FastImport( + None, + neon_binpath, + pg_distrib_dir, + pg_version, 
+ workdir, + cleanup=not cast(bool, pytestconfig.getoption("--preserve-database-files")), + ) as fi: yield fi if fi.cmd is None: diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 2fa82754ef..58c5dbfd29 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -3,6 +3,7 @@ from __future__ import annotations import abc import asyncio import concurrent.futures +import dataclasses import filecmp import json import os @@ -26,6 +27,7 @@ from urllib.parse import quote, urlparse import asyncpg import backoff +import boto3 import httpx import psycopg2 import psycopg2.sql @@ -36,6 +38,8 @@ from _pytest.config import Config from _pytest.config.argparsing import Parser from _pytest.fixtures import FixtureRequest from jwcrypto import jwk +from mypy_boto3_kms import KMSClient +from mypy_boto3_s3 import S3Client # Type-related stuff from psycopg2.extensions import connection as PgConnection @@ -92,7 +96,7 @@ from fixtures.utils import ( ATTACHMENT_NAME_REGEX, COMPONENT_BINARIES, USE_LFC, - allure_add_grafana_links, + allure_add_grafana_link, assert_no_errors, get_dir_size, print_gc_result, @@ -198,6 +202,30 @@ def mock_s3_server(port_distributor: PortDistributor) -> Iterator[MockS3Server]: mock_s3_server.kill() +@pytest.fixture(scope="session") +def mock_kms(mock_s3_server: MockS3Server) -> Iterator[KMSClient]: + yield boto3.client( + "kms", + endpoint_url=mock_s3_server.endpoint(), + region_name=mock_s3_server.region(), + aws_access_key_id=mock_s3_server.access_key(), + aws_secret_access_key=mock_s3_server.secret_key(), + aws_session_token=mock_s3_server.session_token(), + ) + + +@pytest.fixture(scope="session") +def mock_s3_client(mock_s3_server: MockS3Server) -> Iterator[S3Client]: + yield boto3.client( + "s3", + endpoint_url=mock_s3_server.endpoint(), + region_name=mock_s3_server.region(), + aws_access_key_id=mock_s3_server.access_key(), + aws_secret_access_key=mock_s3_server.secret_key(), 
+ aws_session_token=mock_s3_server.session_token(), + ) + + class PgProtocol: """Reusable connection logic""" @@ -463,6 +491,7 @@ class NeonEnvBuilder: self.test_may_use_compatibility_snapshot_binaries = False self.version_combination = combination self.mixdir = self.test_output_dir / "mixdir_neon" + if self.version_combination is not None: assert ( self.compatibility_neon_binpath is not None @@ -674,6 +703,11 @@ class NeonEnvBuilder: def _mix_versions(self): assert self.version_combination is not None, "version combination must be set" + + # Always use a newer version of `neon_local` + (self.mixdir / "neon_local").hardlink_to(self.neon_binpath / "neon_local") + self.neon_local_binpath = self.mixdir + for component, paths in COMPONENT_BINARIES.items(): directory = ( self.neon_binpath @@ -682,10 +716,11 @@ class NeonEnvBuilder: ) for filename in paths: destination = self.mixdir / filename - destination.symlink_to(directory / filename) + destination.hardlink_to(directory / filename) + self.neon_binpath = self.mixdir + if self.version_combination["compute"] == "old": self.pg_distrib_dir = self.compatibility_pg_distrib_dir - self.neon_binpath = self.mixdir def overlay_mount(self, ident: str, srcdir: Path, dstdir: Path): """ @@ -1675,6 +1710,12 @@ class StorageControllerLeadershipStatus(StrEnum): CANDIDATE = "candidate" +@dataclass +class StorageControllerMigrationConfig: + secondary_warmup_timeout: str | None + secondary_download_request_timeout: str | None + + class NeonStorageController(MetricsGetter, LogUtils): def __init__(self, env: NeonEnv, port: int, auth_enabled: bool): self.env = env @@ -2068,11 +2109,20 @@ class NeonStorageController(MetricsGetter, LogUtils): shards: list[TenantShardId] = body["new_shards"] return shards - def tenant_shard_migrate(self, tenant_shard_id: TenantShardId, dest_ps_id: int): + def tenant_shard_migrate( + self, + tenant_shard_id: TenantShardId, + dest_ps_id: int, + config: StorageControllerMigrationConfig | None = None, + ): + 
payload = {"tenant_shard_id": str(tenant_shard_id), "node_id": dest_ps_id} + if config is not None: + payload["migration_config"] = dataclasses.asdict(config) + self.request( "PUT", f"{self.api}/control/v1/tenant/{tenant_shard_id}/migrate", - json={"tenant_shard_id": str(tenant_shard_id), "node_id": dest_ps_id}, + json=payload, headers=self.headers(TokenScope.ADMIN), ) log.info(f"Migrated tenant {tenant_shard_id} to pageserver {dest_ps_id}") @@ -2417,6 +2467,14 @@ class NeonStorageController(MetricsGetter, LogUtils): response.raise_for_status() return [TenantShardId.parse(tid) for tid in response.json()["updated"]] + def download_heatmap_layers(self, tenant_shard_id: TenantShardId, timeline_id: TimelineId): + response = self.request( + "POST", + f"{self.api}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/download_heatmap_layers", + headers=self.headers(TokenScope.ADMIN), + ) + response.raise_for_status() + def __enter__(self) -> Self: return self @@ -3197,7 +3255,7 @@ def remote_pg( end_ms = int(datetime.utcnow().timestamp() * 1000) if is_neon: # Add 10s margin to the start and end times - allure_add_grafana_links( + allure_add_grafana_link( host, timeline_id, start_ms - 10_000, @@ -4972,8 +5030,13 @@ def check_restored_datadir_content( restored_files = list_files_to_compare(restored_dir_path) + # pg_notify files are always ignored + pgdata_files = [f for f in pgdata_files if not f.startswith("pg_notify")] + restored_files = [f for f in restored_files if not f.startswith("pg_notify")] + + # pg_xact and pg_multixact files are optional in basebackup: depending on our configuration they + # may be omitted and loaded on demand. 
if pgdata_files != restored_files: - # filter pg_xact and multixact files which are downloaded on demand pgdata_files = [ f for f in pgdata_files diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index e160c617cd..84d62fb877 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -52,11 +52,11 @@ COMPONENT_BINARIES = { # Disable auto-formatting for better readability # fmt: off VERSIONS_COMBINATIONS = ( - {"storage_controller": "new", "storage_broker": "new", "compute": "new", "safekeeper": "new", "pageserver": "new"}, - {"storage_controller": "new", "storage_broker": "new", "compute": "old", "safekeeper": "old", "pageserver": "old"}, - {"storage_controller": "new", "storage_broker": "new", "compute": "old", "safekeeper": "old", "pageserver": "new"}, - {"storage_controller": "new", "storage_broker": "new", "compute": "old", "safekeeper": "new", "pageserver": "new"}, - {"storage_controller": "old", "storage_broker": "old", "compute": "new", "safekeeper": "new", "pageserver": "new"}, + {"storage_controller": "new", "storage_broker": "new", "compute": "new", "safekeeper": "new", "pageserver": "new"}, # combination: nnnnn + {"storage_controller": "new", "storage_broker": "new", "compute": "old", "safekeeper": "old", "pageserver": "old"}, # combination: ooonn + {"storage_controller": "new", "storage_broker": "new", "compute": "old", "safekeeper": "old", "pageserver": "new"}, # combination: ononn + {"storage_controller": "new", "storage_broker": "new", "compute": "old", "safekeeper": "new", "pageserver": "new"}, # combination: onnnn + {"storage_controller": "old", "storage_broker": "old", "compute": "new", "safekeeper": "new", "pageserver": "new"}, # combination: nnnoo ) # fmt: on @@ -64,6 +64,8 @@ VERSIONS_COMBINATIONS = ( # If it is not set or set to a value not equal to "false", LFC is enabled by default. 
USE_LFC = os.environ.get("USE_LFC") != "false" +WITH_SANITIZERS = os.environ.get("SANITIZERS") == "enabled" + def subprocess_capture( capture_dir: Path, @@ -310,62 +312,46 @@ def allure_attach_from_dir(dir: Path, preserve_database_files: bool = False): GRAFANA_URL = "https://neonprod.grafana.net" -GRAFANA_EXPLORE_URL = f"{GRAFANA_URL}/explore" -GRAFANA_TIMELINE_INSPECTOR_DASHBOARD_URL = f"{GRAFANA_URL}/d/8G011dlnk/timeline-inspector" -LOGS_STAGING_DATASOURCE_ID = "xHHYY0dVz" +GRAFANA_DASHBOARD_URL = f"{GRAFANA_URL}/d/cdya0okb81zwga/cross-service-endpoint-debugging" -def allure_add_grafana_links(host: str, timeline_id: TimelineId, start_ms: int, end_ms: int): - """Add links to server logs in Grafana to Allure report""" - links: dict[str, str] = {} - # We expect host to be in format like ep-divine-night-159320.us-east-2.aws.neon.build +def allure_add_grafana_link(host: str, timeline_id: TimelineId, start_ms: int, end_ms: int): + """ + Add a link to the cross-service endpoint debugging dashboard in Grafana to Allure report. + + Args: + host (str): The host string in the format 'ep-..'. + timeline_id (TimelineId): The timeline identifier for the Grafana dashboard. + (currently ignored but may be needed in future versions of the dashboard) + start_ms (int): The start time in milliseconds for the Grafana dashboard. + end_ms (int): The end time in milliseconds for the Grafana dashboard. 
+ + Example: + Given + host = '' + timeline_id = '996926d1f5ddbe7381b8840083f8fc9a' + + The generated link would be something like: + https://neonprod.grafana.net/d/cdya0okb81zwga/cross-service-endpoint-debugging?orgId=1&from=2025-02-17T21:10:00.000Z&to=2025-02-17T21:20:00.000Z&timezone=utc&var-env=dev%7Cstaging&var-input_endpoint_id=ep-holy-mouse-w2u462gi + + """ + # We expect host to be in format like ep-holy-mouse-w2u462gi.us-east-2.aws.neon.build endpoint_id, region_id, _ = host.split(".", 2) - expressions = { - "compute logs": f'{{app="compute-node-{endpoint_id}", neon_region="{region_id}"}}', - "k8s events": f'{{job="integrations/kubernetes/eventhandler"}} |~ "name=compute-node-{endpoint_id}-"', - "console logs": f'{{neon_service="console", neon_region="{region_id}"}} | json | endpoint_id = "{endpoint_id}"', - "proxy logs": f'{{neon_service="proxy-scram", neon_region="{region_id}"}}', + params = { + "orgId": 1, + "from": start_ms, + "to": end_ms, + "timezone": "utc", + "var-env": "dev|staging", + "var-input_endpoint_id": endpoint_id, } - params: dict[str, Any] = { - "datasource": LOGS_STAGING_DATASOURCE_ID, - "queries": [ - { - "expr": "", - "refId": "A", - "datasource": {"type": "loki", "uid": LOGS_STAGING_DATASOURCE_ID}, - "editorMode": "code", - "queryType": "range", - } - ], - "range": { - "from": str(start_ms), - "to": str(end_ms), - }, - } - for name, expr in expressions.items(): - params["queries"][0]["expr"] = expr - query_string = urlencode({"orgId": 1, "left": json.dumps(params)}) - links[name] = f"{GRAFANA_EXPLORE_URL}?{query_string}" + query_string = urlencode(params) + link = f"{GRAFANA_DASHBOARD_URL}?{query_string}" - timeline_qs = urlencode( - { - "orgId": 1, - "var-environment": "victoria-metrics-aws-dev", - "var-timeline_id": timeline_id, - "var-endpoint_id": endpoint_id, - "var-log_datasource": "grafanacloud-neonstaging-logs", - "from": start_ms, - "to": end_ms, - } - ) - link = f"{GRAFANA_TIMELINE_INSPECTOR_DASHBOARD_URL}?{timeline_qs}" - 
links["Timeline Inspector"] = link - - for name, link in links.items(): - allure.dynamic.link(link, name=name) - log.info(f"{name}: {link}") + allure.dynamic.link(link, name="Cross-Service Endpoint Debugging") + log.info(f"Cross-Service Endpoint Debugging: {link}") def start_in_background( diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index f10872590c..c091cd0869 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -236,9 +236,7 @@ def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder, with_b wait_until(compaction_finished, timeout=60) # ensure gc_compaction is scheduled and it's actually running (instead of skipping due to no layers picked) - env.pageserver.assert_log_contains( - "scheduled_compact_timeline.*picked .* layers for compaction" - ) + env.pageserver.assert_log_contains("gc_compact_timeline.*picked .* layers for compaction") log.info("Validating at workload end ...") workload.validate(env.pageserver.id) @@ -300,6 +298,8 @@ def test_pageserver_gc_compaction_idempotent( workload.churn_rows(row_count, env.pageserver.id) env.create_branch("child_branch") # so that we have a retain_lsn workload.churn_rows(row_count, env.pageserver.id) + env.create_branch("child_branch_2") # so that we have another retain_lsn + workload.churn_rows(row_count, env.pageserver.id) # compact 3 times if mode is before_restart n_compactions = 3 if compaction_mode == "before_restart" else 1 ps_http.timeline_compact( @@ -315,10 +315,6 @@ def test_pageserver_gc_compaction_idempotent( body={ "scheduled": True, "sub_compaction": True, - "compact_key_range": { - "start": "000000000000000000000000000000000000", - "end": "030000000000000000000000000000000000", - }, "sub_compaction_max_job_size_mb": 16, }, ) @@ -336,19 +332,13 @@ def test_pageserver_gc_compaction_idempotent( body={ "scheduled": True, "sub_compaction": True, - "compact_key_range": { - "start": 
"000000000000000000000000000000000000", - "end": "030000000000000000000000000000000000", - }, "sub_compaction_max_job_size_mb": 16, }, ) wait_until(compaction_finished, timeout=60) # ensure gc_compaction is scheduled and it's actually running (instead of skipping due to no layers picked) - env.pageserver.assert_log_contains( - "scheduled_compact_timeline.*picked .* layers for compaction" - ) + env.pageserver.assert_log_contains("gc_compact_timeline.*picked .* layers for compaction") # ensure we hit the duplicated layer key warning at least once: we did two compactions consecutively, # and the second one should have hit the duplicated layer key warning. @@ -466,9 +456,7 @@ def test_pageserver_gc_compaction_interrupt(neon_env_builder: NeonEnvBuilder): wait_until(compaction_finished, timeout=60) # ensure gc_compaction is scheduled and it's actually running (instead of skipping due to no layers picked) - env.pageserver.assert_log_contains( - "scheduled_compact_timeline.*picked .* layers for compaction" - ) + env.pageserver.assert_log_contains("gc_compact_timeline.*picked .* layers for compaction") log.info("Validating at workload end ...") workload.validate(env.pageserver.id) diff --git a/test_runner/regress/test_endpoint_crash.py b/test_runner/regress/test_endpoint_crash.py index 0217cd0d03..03bfd1cb8d 100644 --- a/test_runner/regress/test_endpoint_crash.py +++ b/test_runner/regress/test_endpoint_crash.py @@ -2,6 +2,8 @@ from __future__ import annotations import pytest from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.pg_version import PgVersion +from fixtures.utils import WITH_SANITIZERS, run_only_on_postgres @pytest.mark.parametrize( @@ -23,3 +25,20 @@ def test_endpoint_crash(neon_env_builder: NeonEnvBuilder, sql_func: str): endpoint.safe_psql("CREATE EXTENSION neon_test_utils;") with pytest.raises(Exception, match="This probably means the server terminated abnormally"): endpoint.safe_psql(f"SELECT {sql_func}();") + + 
+@run_only_on_postgres([PgVersion.V17], "Currently, build vith sanitizers is possible with v17 only") +def test_sanitizers(neon_env_builder: NeonEnvBuilder): + """ + Test that undefined behavior leads to endpoint abort with sanitizers enabled + """ + env = neon_env_builder.init_start() + env.create_branch("test_ubsan") + endpoint = env.endpoints.create_start("test_ubsan") + + # Test case based on https://www.postgresql.org/message-id/17167-028026e4ca333817@postgresql.org + if not WITH_SANITIZERS: + endpoint.safe_psql("SELECT 1::int4 << 128") + else: + with pytest.raises(Exception, match="This probably means the server terminated abnormally"): + endpoint.safe_psql("SELECT 1::int4 << 128") diff --git a/test_runner/regress/test_import_pgdata.py b/test_runner/regress/test_import_pgdata.py index 6b35f3c6bb..71e0d16edd 100644 --- a/test_runner/regress/test_import_pgdata.py +++ b/test_runner/regress/test_import_pgdata.py @@ -1,7 +1,9 @@ +import base64 import json import re import time from enum import Enum +from pathlib import Path import psycopg2 import psycopg2.errors @@ -14,8 +16,12 @@ from fixtures.pageserver.http import ( ImportPgdataIdemptencyKey, PageserverApiException, ) +from fixtures.pg_version import PgVersion from fixtures.port_distributor import PortDistributor -from fixtures.remote_storage import RemoteStorageKind +from fixtures.remote_storage import MockS3Server, RemoteStorageKind +from mypy_boto3_kms import KMSClient +from mypy_boto3_kms.type_defs import EncryptResponseTypeDef +from mypy_boto3_s3 import S3Client from pytest_httpserver import HTTPServer from werkzeug.wrappers.request import Request from werkzeug.wrappers.response import Response @@ -103,13 +109,15 @@ def test_pgdata_import_smoke( while True: relblock_size = vanilla_pg.safe_psql_scalar("select pg_relation_size('t')") log.info( - f"relblock size: {relblock_size/8192} pages (target: {target_relblock_size//8192}) pages" + f"relblock size: {relblock_size / 8192} pages (target: 
{target_relblock_size // 8192}) pages" ) if relblock_size >= target_relblock_size: break addrows = int((target_relblock_size - relblock_size) // 8192) assert addrows >= 1, "forward progress" - vanilla_pg.safe_psql(f"insert into t select generate_series({nrows+1}, {nrows + addrows})") + vanilla_pg.safe_psql( + f"insert into t select generate_series({nrows + 1}, {nrows + addrows})" + ) nrows += addrows expect_nrows = nrows expect_sum = ( @@ -231,14 +239,14 @@ def test_pgdata_import_smoke( shard_zero_http = shard_zero_ps.http_client() shard_zero_timeline_info = shard_zero_http.timeline_detail(shard_zero["shard_id"], timeline_id) initdb_lsn = Lsn(shard_zero_timeline_info["initdb_lsn"]) - latest_gc_cutoff_lsn = Lsn(shard_zero_timeline_info["latest_gc_cutoff_lsn"]) + min_readable_lsn = Lsn(shard_zero_timeline_info["min_readable_lsn"]) last_record_lsn = Lsn(shard_zero_timeline_info["last_record_lsn"]) disk_consistent_lsn = Lsn(shard_zero_timeline_info["disk_consistent_lsn"]) _remote_consistent_lsn = Lsn(shard_zero_timeline_info["remote_consistent_lsn"]) remote_consistent_lsn_visible = Lsn(shard_zero_timeline_info["remote_consistent_lsn_visible"]) # assert remote_consistent_lsn_visible == remote_consistent_lsn TODO: this fails initially and after restart, presumably because `UploadQueue::clean.1` is still `None` assert remote_consistent_lsn_visible == disk_consistent_lsn - assert initdb_lsn == latest_gc_cutoff_lsn + assert initdb_lsn == min_readable_lsn assert disk_consistent_lsn == initdb_lsn + 8 assert last_record_lsn == disk_consistent_lsn # TODO: assert these values are the same everywhere @@ -332,6 +340,224 @@ def test_pgdata_import_smoke( br_initdb_endpoint.safe_psql("select * from othertable") +def test_fast_import_with_pageserver_ingest( + test_output_dir, + vanilla_pg: VanillaPostgres, + port_distributor: PortDistributor, + fast_import: FastImport, + pg_distrib_dir: Path, + pg_version: PgVersion, + mock_s3_server: MockS3Server, + mock_kms: KMSClient, + 
mock_s3_client: S3Client, + neon_env_builder: NeonEnvBuilder, + make_httpserver: HTTPServer, +): + # Prepare KMS and S3 + key_response = mock_kms.create_key( + Description="Test key", + KeyUsage="ENCRYPT_DECRYPT", + Origin="AWS_KMS", + ) + key_id = key_response["KeyMetadata"]["KeyId"] + + def encrypt(x: str) -> EncryptResponseTypeDef: + return mock_kms.encrypt(KeyId=key_id, Plaintext=x) + + # Start source postgres and ingest data + vanilla_pg.start() + vanilla_pg.safe_psql("CREATE TABLE foo (a int); INSERT INTO foo SELECT generate_series(1, 10);") + + # Setup pageserver and fake cplane for import progress + def handler(request: Request) -> Response: + log.info(f"control plane request: {request.json}") + return Response(json.dumps({}), status=200) + + cplane_mgmt_api_server = make_httpserver + cplane_mgmt_api_server.expect_request(re.compile(".*")).respond_with_handler(handler) + + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3) + env = neon_env_builder.init_start() + + env.pageserver.patch_config_toml_nonrecursive( + { + "import_pgdata_upcall_api": f"http://{cplane_mgmt_api_server.host}:{cplane_mgmt_api_server.port}/path/to/mgmt/api", + # because import_pgdata code uses this endpoint, not the one in common remote storage config + # TODO: maybe use common remote_storage config in pageserver? 
+ "import_pgdata_aws_endpoint_url": env.s3_mock_server.endpoint(), + } + ) + env.pageserver.stop() + env.pageserver.start() + + # Encrypt connstrings and put spec into S3 + source_connstring_encrypted = encrypt(vanilla_pg.connstr()) + spec = { + "encryption_secret": {"KMS": {"key_id": key_id}}, + "source_connstring_ciphertext_base64": base64.b64encode( + source_connstring_encrypted["CiphertextBlob"] + ).decode("utf-8"), + "project_id": "someproject", + "branch_id": "somebranch", + } + + bucket = "test-bucket" + key_prefix = "test-prefix" + mock_s3_client.create_bucket(Bucket=bucket) + mock_s3_client.put_object(Bucket=bucket, Key=f"{key_prefix}/spec.json", Body=json.dumps(spec)) + + # Create timeline with import_pgdata + tenant_id = TenantId.generate() + env.storage_controller.tenant_create(tenant_id) + + timeline_id = TimelineId.generate() + log.info("starting import") + start = time.monotonic() + + idempotency = ImportPgdataIdemptencyKey.random() + log.info(f"idempotency key {idempotency}") + # TODO: teach neon_local CLI about the idempotency & 429 error so we can run inside the loop + # and check for 429 + + import_branch_name = "imported" + env.storage_controller.timeline_create( + tenant_id, + { + "new_timeline_id": str(timeline_id), + "import_pgdata": { + "idempotency_key": str(idempotency), + "location": { + "AwsS3": { + "region": env.s3_mock_server.region(), + "bucket": bucket, + "key": key_prefix, + } + }, + }, + }, + ) + env.neon_cli.mappings_map_branch(import_branch_name, tenant_id, timeline_id) + + # Run fast_import + if fast_import.extra_env is None: + fast_import.extra_env = {} + fast_import.extra_env["AWS_ACCESS_KEY_ID"] = mock_s3_server.access_key() + fast_import.extra_env["AWS_SECRET_ACCESS_KEY"] = mock_s3_server.secret_key() + fast_import.extra_env["AWS_SESSION_TOKEN"] = mock_s3_server.session_token() + fast_import.extra_env["AWS_REGION"] = mock_s3_server.region() + fast_import.extra_env["AWS_ENDPOINT_URL"] = mock_s3_server.endpoint() + 
fast_import.extra_env["RUST_LOG"] = "aws_config=debug,aws_sdk_kms=debug" + pg_port = port_distributor.get_port() + fast_import.run_pgdata(pg_port=pg_port, s3prefix=f"s3://{bucket}/{key_prefix}") + vanilla_pg.stop() + + def validate_vanilla_equivalence(ep): + res = ep.safe_psql("SELECT count(*), sum(a) FROM foo;", dbname="neondb") + assert res[0] == (10, 55), f"got result: {res}" + + # Sanity check that data in pgdata is expected: + pgbin = PgBin(test_output_dir, fast_import.pg_distrib_dir, fast_import.pg_version) + with VanillaPostgres( + fast_import.workdir / "pgdata", pgbin, pg_port, False + ) as new_pgdata_vanilla_pg: + new_pgdata_vanilla_pg.start() + + # database name and user are hardcoded in fast_import binary, and they are different from normal vanilla postgres + conn = PgProtocol(dsn=f"postgresql://cloud_admin@localhost:{pg_port}/neondb") + validate_vanilla_equivalence(conn) + + # Poll pageserver statuses in s3 + while True: + locations = env.storage_controller.locate(tenant_id) + active_count = 0 + for location in locations: + shard_id = TenantShardId.parse(location["shard_id"]) + ps = env.get_pageserver(location["node_id"]) + try: + detail = ps.http_client().timeline_detail(shard_id, timeline_id) + log.info(f"timeline {tenant_id}/{timeline_id} detail: {detail}") + state = detail["state"] + log.info(f"shard {shard_id} state: {state}") + if state == "Active": + active_count += 1 + except PageserverApiException as e: + if e.status_code == 404: + log.info("not found, import is in progress") + continue + elif e.status_code == 429: + log.info("import is in progress") + continue + else: + raise + + if state == "Active": + key = f"{key_prefix}/status/shard-{shard_id.shard_index}" + shard_status_file_contents = ( + mock_s3_client.get_object(Bucket=bucket, Key=key)["Body"].read().decode("utf-8") + ) + shard_status = json.loads(shard_status_file_contents) + assert shard_status["done"] is True + + if active_count == len(locations): + log.info("all shards are active") 
+ break + time.sleep(0.5) + + import_duration = time.monotonic() - start + log.info(f"import complete; duration={import_duration:.2f}s") + + ep = env.endpoints.create_start(branch_name=import_branch_name, tenant_id=tenant_id) + + # check that data is there + validate_vanilla_equivalence(ep) + + # check that we can do basic ops + + ep.safe_psql("create table othertable(values text)", dbname="neondb") + rw_lsn = Lsn(ep.safe_psql_scalar("select pg_current_wal_flush_lsn()")) + ep.stop() + + # ... at the tip + _ = env.create_branch( + new_branch_name="br-tip", + ancestor_branch_name=import_branch_name, + tenant_id=tenant_id, + ancestor_start_lsn=rw_lsn, + ) + br_tip_endpoint = env.endpoints.create_start( + branch_name="br-tip", endpoint_id="br-tip-ro", tenant_id=tenant_id + ) + validate_vanilla_equivalence(br_tip_endpoint) + br_tip_endpoint.safe_psql("select * from othertable", dbname="neondb") + br_tip_endpoint.stop() + + # ... at the initdb lsn + locations = env.storage_controller.locate(tenant_id) + [shard_zero] = [ + loc for loc in locations if TenantShardId.parse(loc["shard_id"]).shard_number == 0 + ] + shard_zero_ps = env.get_pageserver(shard_zero["node_id"]) + shard_zero_timeline_info = shard_zero_ps.http_client().timeline_detail( + shard_zero["shard_id"], timeline_id + ) + initdb_lsn = Lsn(shard_zero_timeline_info["initdb_lsn"]) + _ = env.create_branch( + new_branch_name="br-initdb", + ancestor_branch_name=import_branch_name, + tenant_id=tenant_id, + ancestor_start_lsn=initdb_lsn, + ) + br_initdb_endpoint = env.endpoints.create_start( + branch_name="br-initdb", endpoint_id="br-initdb-ro", tenant_id=tenant_id + ) + validate_vanilla_equivalence(br_initdb_endpoint) + with pytest.raises(psycopg2.errors.UndefinedTable): + br_initdb_endpoint.safe_psql("select * from othertable", dbname="neondb") + br_initdb_endpoint.stop() + + env.pageserver.stop(immediate=True) + + def test_fast_import_binary( test_output_dir, vanilla_pg: VanillaPostgres, @@ -342,7 +568,7 @@ def 
test_fast_import_binary( vanilla_pg.safe_psql("CREATE TABLE foo (a int); INSERT INTO foo SELECT generate_series(1, 10);") pg_port = port_distributor.get_port() - fast_import.run(pg_port, vanilla_pg.connstr()) + fast_import.run_pgdata(pg_port=pg_port, source_connection_string=vanilla_pg.connstr()) vanilla_pg.stop() pgbin = PgBin(test_output_dir, fast_import.pg_distrib_dir, fast_import.pg_version) @@ -358,6 +584,118 @@ def test_fast_import_binary( assert res[0][0] == 10 +def test_fast_import_restore_to_connstring( + test_output_dir, + vanilla_pg: VanillaPostgres, + port_distributor: PortDistributor, + fast_import: FastImport, + pg_distrib_dir: Path, + pg_version: PgVersion, +): + vanilla_pg.start() + vanilla_pg.safe_psql("CREATE TABLE foo (a int); INSERT INTO foo SELECT generate_series(1, 10);") + + pgdatadir = test_output_dir / "destination-pgdata" + pg_bin = PgBin(test_output_dir, pg_distrib_dir, pg_version) + port = port_distributor.get_port() + with VanillaPostgres(pgdatadir, pg_bin, port) as destination_vanilla_pg: + destination_vanilla_pg.configure(["shared_preload_libraries='neon_rmgr'"]) + destination_vanilla_pg.start() + + # create another database & role and try to restore there + destination_vanilla_pg.safe_psql(""" + CREATE ROLE testrole WITH + LOGIN + PASSWORD 'testpassword' + NOSUPERUSER + NOCREATEDB + NOCREATEROLE; + """) + destination_vanilla_pg.safe_psql("CREATE DATABASE testdb OWNER testrole;") + + destination_connstring = destination_vanilla_pg.connstr( + dbname="testdb", user="testrole", password="testpassword" + ) + fast_import.run_dump_restore( + source_connection_string=vanilla_pg.connstr(), + destination_connection_string=destination_connstring, + ) + vanilla_pg.stop() + conn = PgProtocol(dsn=destination_connstring) + res = conn.safe_psql("SELECT count(*) FROM foo;") + log.info(f"Result: {res}") + assert res[0][0] == 10 + + +def test_fast_import_restore_to_connstring_from_s3_spec( + test_output_dir, + vanilla_pg: VanillaPostgres, + 
port_distributor: PortDistributor, + fast_import: FastImport, + pg_distrib_dir: Path, + pg_version: PgVersion, + mock_s3_server: MockS3Server, + mock_kms: KMSClient, + mock_s3_client: S3Client, +): + # Prepare KMS and S3 + key_response = mock_kms.create_key( + Description="Test key", + KeyUsage="ENCRYPT_DECRYPT", + Origin="AWS_KMS", + ) + key_id = key_response["KeyMetadata"]["KeyId"] + + def encrypt(x: str) -> EncryptResponseTypeDef: + return mock_kms.encrypt(KeyId=key_id, Plaintext=x) + + # Start source postgres and ingest data + vanilla_pg.start() + vanilla_pg.safe_psql("CREATE TABLE foo (a int); INSERT INTO foo SELECT generate_series(1, 10);") + + # Start target postgres + pgdatadir = test_output_dir / "destination-pgdata" + pg_bin = PgBin(test_output_dir, pg_distrib_dir, pg_version) + port = port_distributor.get_port() + with VanillaPostgres(pgdatadir, pg_bin, port) as destination_vanilla_pg: + destination_vanilla_pg.configure(["shared_preload_libraries='neon_rmgr'"]) + destination_vanilla_pg.start() + + # Encrypt connstrings and put spec into S3 + source_connstring_encrypted = encrypt(vanilla_pg.connstr()) + destination_connstring_encrypted = encrypt(destination_vanilla_pg.connstr()) + spec = { + "encryption_secret": {"KMS": {"key_id": key_id}}, + "source_connstring_ciphertext_base64": base64.b64encode( + source_connstring_encrypted["CiphertextBlob"] + ).decode("utf-8"), + "destination_connstring_ciphertext_base64": base64.b64encode( + destination_connstring_encrypted["CiphertextBlob"] + ).decode("utf-8"), + } + + mock_s3_client.create_bucket(Bucket="test-bucket") + mock_s3_client.put_object( + Bucket="test-bucket", Key="test-prefix/spec.json", Body=json.dumps(spec) + ) + + # Run fast_import + if fast_import.extra_env is None: + fast_import.extra_env = {} + fast_import.extra_env["AWS_ACCESS_KEY_ID"] = mock_s3_server.access_key() + fast_import.extra_env["AWS_SECRET_ACCESS_KEY"] = mock_s3_server.secret_key() + fast_import.extra_env["AWS_SESSION_TOKEN"] = 
mock_s3_server.session_token() + fast_import.extra_env["AWS_REGION"] = mock_s3_server.region() + fast_import.extra_env["AWS_ENDPOINT_URL"] = mock_s3_server.endpoint() + fast_import.extra_env["RUST_LOG"] = "aws_config=debug,aws_sdk_kms=debug" + fast_import.run_dump_restore(s3prefix="s3://test-bucket/test-prefix") + vanilla_pg.stop() + + res = destination_vanilla_pg.safe_psql("SELECT count(*) FROM foo;") + log.info(f"Result: {res}") + assert res[0][0] == 10 + + # TODO: Maybe test with pageserver? # 1. run whole neon env # 2. create timeline with some s3 path??? diff --git a/test_runner/regress/test_layers_from_future.py b/test_runner/regress/test_layers_from_future.py index 3ac4ed1a3e..872d3dc4cf 100644 --- a/test_runner/regress/test_layers_from_future.py +++ b/test_runner/regress/test_layers_from_future.py @@ -20,9 +20,6 @@ from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind from fixtures.utils import query_scalar, wait_until -@pytest.mark.skip( - reason="We won't create future layers any more after https://github.com/neondatabase/neon/pull/10548" -) @pytest.mark.parametrize( "attach_mode", ["default_generation", "same_generation"], diff --git a/test_runner/regress/test_lfc_resize.py b/test_runner/regress/test_lfc_resize.py index 8762e6525b..ea7d38a3d9 100644 --- a/test_runner/regress/test_lfc_resize.py +++ b/test_runner/regress/test_lfc_resize.py @@ -72,6 +72,11 @@ def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin): thread.join() + # Fill LFC: seqscan should fetch the whole table in cache. + # It is needed for further correct evaluation of LFC file size + # (a sparse chunk of LFC takes less than 1 MB on disk). 
+ cur.execute("select sum(abalance) from pgbench_accounts") + # Before shrinking the cache, check that it really is large now (lfc_file_size, lfc_file_blocks) = get_lfc_size() assert int(lfc_file_blocks) > 128 * 1024 diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 590093d23c..602d493ae6 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -10,14 +10,18 @@ from typing import TYPE_CHECKING import pytest from fixtures.common_types import TenantId, TenantShardId, TimelineId from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + NeonPageserver, + StorageControllerMigrationConfig, +) from fixtures.pageserver.common_types import parse_layer_file_name from fixtures.pageserver.utils import ( assert_prefix_empty, wait_for_upload_queue_empty, ) from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind, S3Storage, s3_storage -from fixtures.utils import skip_in_debug_build, wait_until +from fixtures.utils import run_only_on_default_postgres, skip_in_debug_build, wait_until from fixtures.workload import Workload from werkzeug.wrappers.request import Request from werkzeug.wrappers.response import Response @@ -889,3 +893,106 @@ def test_slow_secondary_downloads(neon_env_builder: NeonEnvBuilder, via_controll assert progress_3["heatmap_mtime"] is not None assert progress_3["layers_total"] == progress_3["layers_downloaded"] assert progress_3["bytes_total"] == progress_3["bytes_downloaded"] + + +@skip_in_debug_build("only run with release build") +@run_only_on_default_postgres("PG version is not interesting here") +def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_pageservers = 2 + neon_env_builder.enable_pageserver_remote_storage( + remote_storage_kind=RemoteStorageKind.MOCK_S3, + ) + + 
tenant_conf = TENANT_CONF.copy() + tenant_conf["heatmap_period"] = "0s" + + env = neon_env_builder.init_configs() + env.start() + + assert isinstance(env.pageserver_remote_storage, S3Storage) # Satisfy linter + + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + env.create_tenant(tenant_id, timeline_id, conf=tenant_conf, placement_policy='{"Attached":1}') + + env.storage_controller.reconcile_until_idle() + + attached_to_id = env.storage_controller.locate(tenant_id)[0]["node_id"] + ps_attached = env.get_pageserver(attached_to_id) + ps_secondary = next(p for p in env.pageservers if p != ps_attached) + + # Generate a bunch of small layers (we will apply a slowdown failpoint that works on a per-layer basis) + workload = Workload(env, tenant_id, timeline_id) + workload.init() + workload.write_rows(128, upload=True) + workload.write_rows(128, upload=True) + workload.write_rows(128, upload=True) + workload.write_rows(128, upload=True) + workload.stop() + + # Expect lots of layers + assert len(ps_attached.list_layers(tenant_id, timeline_id)) > 10 + + # Simulate large data by making layer downloads artificially slow + for ps in env.pageservers: + ps.http_client().configure_failpoints([("secondary-layer-download-sleep", "return(1000)")]) + + # Upload a heatmap, so that secondaries have something to download + ps_attached.http_client().tenant_heatmap_upload(tenant_id) + heatmap_before_migration = env.pageserver_remote_storage.heatmap_content(tenant_id) + + # This has no chance to succeed: we have lots of layers and each one takes at least 1000ms. + # However, it pulls the heatmap, which will be important later. 
+ http_client = env.storage_controller.pageserver_api() + (status, progress) = http_client.tenant_secondary_download(tenant_id, wait_ms=4000) + assert status == 202 + assert progress["heatmap_mtime"] is not None + assert progress["layers_downloaded"] > 0 + assert progress["bytes_downloaded"] > 0 + assert progress["layers_total"] > progress["layers_downloaded"] + assert progress["bytes_total"] > progress["bytes_downloaded"] + + env.storage_controller.allowed_errors.extend( + [ + ".*Timed out.*downloading layers.*", + ] + ) + + # Use a custom configuration that gives up earlier than usual. + # We can't hydrate everything anyway because of the failpoints. + config = StorageControllerMigrationConfig( + secondary_warmup_timeout="5s", secondary_download_request_timeout="2s" + ) + env.storage_controller.tenant_shard_migrate( + TenantShardId(tenant_id, shard_number=0, shard_count=0), ps_secondary.id, config + ) + + env.storage_controller.reconcile_until_idle() + assert env.storage_controller.locate(tenant_id)[0]["node_id"] == ps_secondary.id + + ps_secondary.http_client().tenant_heatmap_upload(tenant_id) + heatmap_after_migration = env.pageserver_remote_storage.heatmap_content(tenant_id) + + assert len(heatmap_before_migration["timelines"][0]["layers"]) > 0 + + # The new layer map should contain all the layers in the pre-migration one + # and a new in memory layer + after_migration_heatmap_layers_count = len(heatmap_after_migration["timelines"][0]["layers"]) + assert ( + len(heatmap_before_migration["timelines"][0]["layers"]) + 1 + == after_migration_heatmap_layers_count + ) + + log.info(f"Heatmap size after cold migration is {after_migration_heatmap_layers_count}") + + env.storage_controller.download_heatmap_layers( + TenantShardId(tenant_id, shard_number=0, shard_count=0), timeline_id + ) + + def all_layers_downloaded(): + local_layers_count = len(ps_secondary.list_layers(tenant_id, timeline_id)) + + log.info(f"{local_layers_count=} 
{after_migration_heatmap_layers_count=}") + assert local_layers_count == after_migration_heatmap_layers_count + + wait_until(all_layers_downloaded) diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index c5ae669dce..411888efbc 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -261,7 +261,7 @@ def test_isolation( pg_bin.run(pg_isolation_regress_command, env=env_vars, cwd=runpath) # This fails with a mismatch on `pg_multixact/offsets/0000` - # post_checks(env, test_output_dir, DBNAME, endpoint) + post_checks(env, test_output_dir, DBNAME, endpoint) # Run extra Neon-specific pg_regress-based tests. The tests and their diff --git a/test_runner/regress/test_readonly_node.py b/test_runner/regress/test_readonly_node.py index c13bea7ee1..fe970a868c 100644 --- a/test_runner/regress/test_readonly_node.py +++ b/test_runner/regress/test_readonly_node.py @@ -287,7 +287,7 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder): offset=offset, ) - # Do some update so we can increment latest_gc_cutoff + # Do some update so we can increment gc_cutoff generate_updates_on_main(env, ep_main, i, end=100) # Wait for the existing lease to expire. 
diff --git a/test_runner/regress/test_relations.py b/test_runner/regress/test_relations.py new file mode 100644 index 0000000000..3e29c92a96 --- /dev/null +++ b/test_runner/regress/test_relations.py @@ -0,0 +1,68 @@ +from __future__ import annotations + +from fixtures.neon_fixtures import ( + NeonEnvBuilder, +) + + +def test_pageserver_reldir_v2( + neon_env_builder: NeonEnvBuilder, +): + env = neon_env_builder.init_start( + initial_tenant_conf={ + "rel_size_v2_enabled": "false", + } + ) + + endpoint = env.endpoints.create_start("main") + # Create a relation in v1 + endpoint.safe_psql("CREATE TABLE foo1 (id INTEGER PRIMARY KEY, val text)") + endpoint.safe_psql("CREATE TABLE foo2 (id INTEGER PRIMARY KEY, val text)") + + # Switch to v2 + env.pageserver.http_client().update_tenant_config( + env.initial_tenant, + { + "rel_size_v2_enabled": True, + }, + ) + + # Check if both relations are still accessible + endpoint.safe_psql("SELECT * FROM foo1") + endpoint.safe_psql("SELECT * FROM foo2") + + # Restart the endpoint + endpoint.stop() + endpoint.start() + + # Check if both relations are still accessible again after restart + endpoint.safe_psql("SELECT * FROM foo1") + endpoint.safe_psql("SELECT * FROM foo2") + + # Create a relation in v2 + endpoint.safe_psql("CREATE TABLE foo3 (id INTEGER PRIMARY KEY, val text)") + # Delete a relation in v1 + endpoint.safe_psql("DROP TABLE foo1") + + # Check if both relations are still accessible + endpoint.safe_psql("SELECT * FROM foo2") + endpoint.safe_psql("SELECT * FROM foo3") + + # Restart the endpoint + endpoint.stop() + # This will acquire a basebackup, which lists all relations. 
+ endpoint.start() + + # Check if both relations are still accessible + endpoint.safe_psql("DROP TABLE IF EXISTS foo1") + endpoint.safe_psql("SELECT * FROM foo2") + endpoint.safe_psql("SELECT * FROM foo3") + + endpoint.safe_psql("DROP TABLE foo3") + endpoint.stop() + endpoint.start() + + # Check if relations are still accessible + endpoint.safe_psql("DROP TABLE IF EXISTS foo1") + endpoint.safe_psql("SELECT * FROM foo2") + endpoint.safe_psql("DROP TABLE IF EXISTS foo3") diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 8910873690..f58bbcd3c0 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -1821,7 +1821,7 @@ def test_sharding_gc( # TODO: remove when https://github.com/neondatabase/neon/issues/10720 is fixed ps.allowed_errors.extend( [ - ".*could not find data for key 020000000000000000000000000000000000.*", + ".*could not find data for key.*", ".*could not ingest record.*", ] ) diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 2750826aec..1d95312140 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -3189,15 +3189,17 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder): assert len(target.get_safekeepers()) == 0 + sk_0 = env.safekeepers[0] + body = { "active": True, "id": fake_id, "created_at": "2023-10-25T09:11:25Z", "updated_at": "2024-08-28T11:32:43Z", "region_id": "aws-us-east-2", - "host": "safekeeper-333.us-east-2.aws.neon.build", - "port": 6401, - "http_port": 7676, + "host": "localhost", + "port": sk_0.port.pg, + "http_port": sk_0.port.http, "version": 5957, "availability_zone_id": "us-east-2b", } @@ -3236,11 +3238,26 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder): newest_info = target.get_safekeeper(inserted["id"]) assert newest_info assert newest_info["scheduling_policy"] == 
"Pause" - target.safekeeper_scheduling_policy(inserted["id"], "Decomissioned") + target.safekeeper_scheduling_policy(inserted["id"], "Active") newest_info = target.get_safekeeper(inserted["id"]) assert newest_info - assert newest_info["scheduling_policy"] == "Decomissioned" + assert newest_info["scheduling_policy"] == "Active" # Ensure idempotency + target.safekeeper_scheduling_policy(inserted["id"], "Active") + newest_info = target.get_safekeeper(inserted["id"]) + assert newest_info + assert newest_info["scheduling_policy"] == "Active" + # change back to paused again + target.safekeeper_scheduling_policy(inserted["id"], "Pause") + + def storcon_heartbeat(): + assert env.storage_controller.log_contains( + "Heartbeat round complete for 1 safekeepers, 0 offline" + ) + + wait_until(storcon_heartbeat) + + # Now decomission it target.safekeeper_scheduling_policy(inserted["id"], "Decomissioned") diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py index b8253fb125..d44c176b35 100644 --- a/test_runner/regress/test_storage_scrubber.py +++ b/test_runner/regress/test_storage_scrubber.py @@ -318,7 +318,7 @@ def test_scrubber_physical_gc_ancestors(neon_env_builder: NeonEnvBuilder, shard_ # TODO: remove when https://github.com/neondatabase/neon/issues/10720 is fixed ps.allowed_errors.extend( [ - ".*could not find data for key 020000000000000000000000000000000000.*", + ".*could not find data for key.*", ".*could not ingest record.*", ] ) diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index b4c968b217..afe444f227 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -481,7 +481,8 @@ def test_pageserver_metrics_many_relations(neon_env_builder: NeonEnvBuilder): counts = timeline_detail["directory_entries_counts"] assert counts log.info(f"directory counts: {counts}") - assert counts[2] > COUNT_AT_LEAST_EXPECTED + # We need to add up reldir v1 + v2 
counts + assert counts[2] + counts[7] > COUNT_AT_LEAST_EXPECTED def test_timelines_parallel_endpoints(neon_simple_env: NeonEnv): diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 2b6a267bdf..c5045fe4a4 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -566,10 +566,14 @@ def test_wal_backup(neon_env_builder: NeonEnvBuilder): assert_prefix_empty(neon_env_builder.safekeepers_remote_storage, prefix) -def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder): +# This test is flaky, probably because PUTs of local fs storage are not atomic. +# Let's keep both remote storage kinds for a while to see if this is the case. +# https://github.com/neondatabase/neon/issues/10761 +@pytest.mark.parametrize("remote_storage_kind", [s3_storage(), RemoteStorageKind.LOCAL_FS]) +def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind): neon_env_builder.num_safekeepers = 3 - neon_env_builder.enable_safekeeper_remote_storage(default_remote_storage()) + neon_env_builder.enable_safekeeper_remote_storage(remote_storage_kind) env = neon_env_builder.init_start() tenant_id = env.initial_tenant @@ -1441,6 +1445,7 @@ def test_peer_recovery(neon_env_builder: NeonEnvBuilder): # roughly fills one segment endpoint.safe_psql("insert into t select generate_series(1,250000), 'payload'") + lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) endpoint.stop() # stop compute @@ -1469,7 +1474,15 @@ def test_peer_recovery(neon_env_builder: NeonEnvBuilder): "flush_lsn to get aligned", ) - cmp_sk_wal([sk1, sk2], tenant_id, timeline_id) + sk1_digest = sk1.http_client().timeline_digest( + tenant_id, timeline_id, sk1.get_timeline_start_lsn(tenant_id, timeline_id), lsn + ) + + sk2_digest = sk2.http_client().timeline_digest( + tenant_id, timeline_id, sk2.get_timeline_start_lsn(tenant_id, timeline_id), lsn + ) + + assert sk1_digest == sk2_digest # stop
one of safekeepers which weren't recovering and insert a bit more to check we can commit env.safekeepers[2].stop() diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index c0aedfd3ca..6254ab9b44 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit c0aedfd3cac447510a2db843b561f0c52901b679 +Subproject commit 6254ab9b4496c3e481bc037ae69d859bbc2bdd7d diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 355a7c69d3..81e2eef061 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 355a7c69d3f907f3612eb406cc7b9c2f55d59b59 +Subproject commit 81e2eef0616c65c2233c75b06f25766ae4c080c4 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 13cf5d06c9..9422247c58 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 13cf5d06c98a8e9b0590ce6cdfd193a08d0a7792 +Subproject commit 9422247c582e7c1a08a4855d04af0874f8df2f34 diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index 4c45d78ad5..a8fea8b4be 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit 4c45d78ad587e4bcb4a5a7ef6931b88c6a3d575d +Subproject commit a8fea8b4be43039f0782347c88a9b9b25f50c9d8 diff --git a/vendor/revisions.json b/vendor/revisions.json index 5f60e1d690..72d97d7f6a 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,18 +1,18 @@ { "v17": [ - "17.2", - "4c45d78ad587e4bcb4a5a7ef6931b88c6a3d575d" + "17.4", + "a8fea8b4be43039f0782347c88a9b9b25f50c9d8" ], "v16": [ - "16.6", - "13cf5d06c98a8e9b0590ce6cdfd193a08d0a7792" + "16.8", + "9422247c582e7c1a08a4855d04af0874f8df2f34" ], "v15": [ - "15.10", - "355a7c69d3f907f3612eb406cc7b9c2f55d59b59" + "15.12", + "81e2eef0616c65c2233c75b06f25766ae4c080c4" ], "v14": [ - "14.15", - "c0aedfd3cac447510a2db843b561f0c52901b679" + "14.17", + "6254ab9b4496c3e481bc037ae69d859bbc2bdd7d" ] }