diff --git a/.github/actionlint.yml b/.github/actionlint.yml
index 2b96ce95da..1e6c2d0aa2 100644
--- a/.github/actionlint.yml
+++ b/.github/actionlint.yml
@@ -28,3 +28,7 @@ config-variables:
   - DEV_AWS_OIDC_ROLE_MANAGE_BENCHMARK_EC2_VMS_ARN
   - SLACK_ON_CALL_STORAGE_STAGING_STREAM
   - SLACK_CICD_CHANNEL_ID
+  - SLACK_STORAGE_CHANNEL_ID
+  - NEON_DEV_AWS_ACCOUNT_ID
+  - NEON_PROD_AWS_ACCOUNT_ID
+  - AWS_ECR_REGION
diff --git a/.github/actions/neon-project-create/action.yml b/.github/actions/neon-project-create/action.yml
index c9f6b0832e..a393aa6106 100644
--- a/.github/actions/neon-project-create/action.yml
+++ b/.github/actions/neon-project-create/action.yml
@@ -19,7 +19,11 @@ inputs:
     default: '[1, 1]'
   # settings below only needed if you want the project to be sharded from the beginning
   shard_split_project:
-    description: 'by default new projects are not shard-split, specify true to shard-split'
+    description: 'by default new projects are not shard-split initially; they are split only once the shard-split threshold is reached. Specify true to shard-split explicitly at creation time'
+    required: false
+    default: 'false'
+  disable_sharding:
+    description: 'by default new projects use the storage controller default policy, which shard-splits once the shard-split threshold is reached. Specify true to explicitly disable sharding'
     required: false
     default: 'false'
   admin_api_key:
@@ -107,6 +111,21 @@ runs:
           -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer ${ADMIN_API_KEY}" \
           -d "{\"new_shard_count\": $SHARD_COUNT, \"new_stripe_size\": $STRIPE_SIZE}"
         fi
+        if [ "${DISABLE_SHARDING}" = "true" ]; then
+          # determine tenant ID
+          TENANT_ID=`${PSQL} ${dsn} -t -A -c "SHOW neon.tenant_id"`
+
+          echo "Explicitly disabling shard-splitting for project ${project_id} with tenant_id ${TENANT_ID}"
+
+          echo "Sending PUT request to https://${API_HOST}/regions/${REGION_ID}/api/v1/admin/storage/proxy/control/v1/tenant/${TENANT_ID}/policy"
+          echo "with body {\"scheduling\": \"Essential\"}"
+
+          # we need an ADMIN API KEY to invoke the storage controller API (bash -u above checks that the variable is set)
+          curl -X PUT \
+            "https://${API_HOST}/regions/${REGION_ID}/api/v1/admin/storage/proxy/control/v1/tenant/${TENANT_ID}/policy" \
+            -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer ${ADMIN_API_KEY}" \
+            -d "{\"scheduling\": \"Essential\"}"
+        fi

     env:
       API_HOST: ${{ inputs.api_host }}
@@ -116,6 +135,7 @@ runs:
       MIN_CU: ${{ fromJSON(inputs.compute_units)[0] }}
       MAX_CU: ${{ fromJSON(inputs.compute_units)[1] }}
       SHARD_SPLIT_PROJECT: ${{ inputs.shard_split_project }}
+      DISABLE_SHARDING: ${{ inputs.disable_sharding }}
       ADMIN_API_KEY: ${{ inputs.admin_api_key }}
       SHARD_COUNT: ${{ inputs.shard_count }}
       STRIPE_SIZE: ${{ inputs.stripe_size }}
diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml
index 86a791497c..3740e6dc9c 100644
--- a/.github/workflows/_build-and-test-locally.yml
+++ b/.github/workflows/_build-and-test-locally.yml
@@ -348,6 +348,10 @@ jobs:
           rerun_failed: true
           pg_version: ${{ matrix.pg_version }}
           aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+          # `--session-timeout` is equal to (timeout-minutes - 10 minutes) * 60 seconds.
+          # Attempt to stop tests gracefully so that test reports are generated,
+          # before they are forcibly stopped by the stricter `timeout-minutes` limit.
+ extra_params: --session-timeout=${{ inputs.sanitizers != 'enabled' && 3000 || 10200 }} env: TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} CHECK_ONDISK_DATA_COMPATIBILITY: nonempty diff --git a/.github/workflows/_push-to-container-registry.yml b/.github/workflows/_push-to-container-registry.yml index 3c97c8a67a..c938f62ad5 100644 --- a/.github/workflows/_push-to-container-registry.yml +++ b/.github/workflows/_push-to-container-registry.yml @@ -2,7 +2,7 @@ name: Push images to Container Registry on: workflow_call: inputs: - # Example: {"docker.io/neondatabase/neon:13196061314":["369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:13196061314","neoneastus2.azurecr.io/neondatabase/neon:13196061314"]} + # Example: {"docker.io/neondatabase/neon:13196061314":["${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/neon:13196061314","neoneastus2.azurecr.io/neondatabase/neon:13196061314"]} image-map: description: JSON map of images, mapping from a source image to an array of target images that should be pushed. required: true diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index bc773600ea..8f3392ceea 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -68,7 +68,7 @@ jobs: tag: needs: [ check-permissions ] runs-on: [ self-hosted, small ] - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned + container: ${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/base:pinned outputs: build-tag: ${{steps.build-tag.outputs.tag}} @@ -859,14 +859,17 @@ jobs: BRANCH: "${{ github.ref_name }}" DEV_ACR: "${{ vars.AZURE_DEV_REGISTRY_NAME }}" PROD_ACR: "${{ vars.AZURE_PROD_REGISTRY_NAME }}" + DEV_AWS: "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}" + PROD_AWS: "${{ vars.NEON_PROD_AWS_ACCOUNT_ID }}" + AWS_REGION: "${{ vars.AWS_ECR_REGION }}" push-neon-image-dev: needs: [ generate-image-maps, neon-image ] uses: ./.github/workflows/_push-to-container-registry.yml with: image-map: '${{ needs.generate-image-maps.outputs.neon-dev }}' - aws-region: eu-central-1 - aws-account-ids: "369495373322" + aws-region: ${{ vars.AWS_ECR_REGION }} + aws-account-ids: "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}" azure-client-id: ${{ vars.AZURE_DEV_CLIENT_ID }} azure-subscription-id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }} azure-tenant-id: ${{ vars.AZURE_TENANT_ID }} @@ -881,8 +884,8 @@ jobs: uses: ./.github/workflows/_push-to-container-registry.yml with: image-map: '${{ needs.generate-image-maps.outputs.compute-dev }}' - aws-region: eu-central-1 - aws-account-ids: "369495373322" + aws-region: ${{ vars.AWS_ECR_REGION }} + aws-account-ids: "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}" azure-client-id: ${{ vars.AZURE_DEV_CLIENT_ID }} azure-subscription-id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }} azure-tenant-id: ${{ vars.AZURE_TENANT_ID }} @@ -898,8 +901,8 @@ jobs: uses: ./.github/workflows/_push-to-container-registry.yml with: image-map: '${{ needs.generate-image-maps.outputs.neon-prod }}' - aws-region: eu-central-1 - aws-account-ids: "093970136003" + aws-region: ${{ vars.AWS_ECR_REGION }} + aws-account-ids: "${{ vars.NEON_PROD_AWS_ACCOUNT_ID }}" azure-client-id: ${{ vars.AZURE_PROD_CLIENT_ID }} azure-subscription-id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }} azure-tenant-id: ${{ vars.AZURE_TENANT_ID }} @@ -915,8 +918,8 @@ jobs: uses: ./.github/workflows/_push-to-container-registry.yml with: image-map: '${{ needs.generate-image-maps.outputs.compute-prod }}' - aws-region: eu-central-1 
- aws-account-ids: "093970136003" + aws-region: ${{ vars.AWS_ECR_REGION }} + aws-account-ids: "${{ vars.NEON_PROD_AWS_ACCOUNT_ID }}" azure-client-id: ${{ vars.AZURE_PROD_CLIENT_ID }} azure-subscription-id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }} azure-tenant-id: ${{ vars.AZURE_TENANT_ID }} @@ -1029,7 +1032,7 @@ jobs: statuses: write contents: write runs-on: [ self-hosted, small ] - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest + container: ${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/ansible:latest steps: - uses: actions/checkout@v4 @@ -1178,6 +1181,22 @@ jobs: exit 1 fi + notify-storage-release-deploy-failure: + needs: [ deploy ] + # We want this to run even if (transitive) dependencies are skipped, because deploy should really be successful on release branch workflow runs. + if: github.ref_name == 'release' && needs.deploy.result != 'success' && always() + runs-on: ubuntu-22.04 + steps: + - name: Post release-deploy failure to team-storage slack channel + uses: slackapi/slack-github-action@v2 + with: + method: chat.postMessage + token: ${{ secrets.SLACK_BOT_TOKEN }} + payload: | + channel: ${{ vars.SLACK_STORAGE_CHANNEL_ID }} + text: | + 🔴 @oncall-storage: deploy job on release branch had unexpected status "${{ needs.deploy.result }}" <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>. + # The job runs on `release` branch and copies compatibility data and Neon artifact from the last *release PR* to the latest directory promote-compatibility-data: needs: [ deploy ] @@ -1274,7 +1293,7 @@ jobs: done pin-build-tools-image: - needs: [ build-build-tools-image, push-compute-image-prod, push-neon-image-prod, build-and-test-locally ] + needs: [ build-build-tools-image, test-images, build-and-test-locally ] if: github.ref_name == 'main' uses: ./.github/workflows/pin-build-tools-image.yml with: diff --git a/.github/workflows/build_and_test_with_sanitizers.yml b/.github/workflows/build_and_test_with_sanitizers.yml index 2bc938509f..e40b02b5d2 100644 --- a/.github/workflows/build_and_test_with_sanitizers.yml +++ b/.github/workflows/build_and_test_with_sanitizers.yml @@ -27,7 +27,7 @@ env: jobs: tag: runs-on: [ self-hosted, small ] - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned + container: ${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/base:pinned outputs: build-tag: ${{steps.build-tag.outputs.tag}} diff --git a/.github/workflows/force-test-extensions-upgrade.yml b/.github/workflows/force-test-extensions-upgrade.yml new file mode 100644 index 0000000000..71c5158ef6 --- /dev/null +++ b/.github/workflows/force-test-extensions-upgrade.yml @@ -0,0 +1,76 @@ +name: Force Test Upgrading of Extension +on: + schedule: + # * is a special character in YAML so you have to quote this string + # ┌───────────── minute (0 - 59) + # │ ┌───────────── hour (0 - 23) + # │ │ ┌───────────── day of the month (1 - 31) + # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) + # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) + - cron: '45 2 * * *' # run once a day, timezone is utc + workflow_dispatch: # adds ability to run this manually + +defaults: + run: + shell: bash -euxo pipefail {0} + +concurrency: + # Allow only one workflow + group: ${{ github.workflow }} + cancel-in-progress: true + +permissions: + id-token: write # aws-actions/configure-aws-credentials + statuses: write + contents: read + +jobs: + regress: + strategy: + fail-fast: false 
+ matrix: + pg-version: [16, 17] + + runs-on: small + + steps: + - uses: actions/checkout@v4 + with: + submodules: false + + - name: Get the last compute release tag + id: get-last-compute-release-tag + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + tag=$(gh api -q '[.[].tag_name | select(startswith("release-compute"))][0]'\ + -H "Accept: application/vnd.github+json" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + "/repos/${GITHUB_REPOSITORY}/releases") + echo tag=${tag} >> ${GITHUB_OUTPUT} + + - name: Test extension upgrade + timeout-minutes: 20 + env: + NEWTAG: latest + OLDTAG: ${{ steps.get-last-compute-release-tag.outputs.tag }} + PG_VERSION: ${{ matrix.pg-version }} + FORCE_ALL_UPGRADE_TESTS: true + run: ./docker-compose/test_extensions_upgrade.sh + + - name: Print logs and clean up + if: always() + run: | + docker compose --profile test-extensions -f ./docker-compose/docker-compose.yml logs || true + docker compose --profile test-extensions -f ./docker-compose/docker-compose.yml down + + - name: Post to the Slack channel + if: ${{ github.event.schedule && failure() }} + uses: slackapi/slack-github-action@v1 + with: + channel-id: ${{ vars.SLACK_ON_CALL_QA_STAGING_STREAM }} + slack-message: | + Test upgrading of extensions: ${{ job.status }} + <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} diff --git a/.github/workflows/ingest_benchmark.yml b/.github/workflows/ingest_benchmark.yml index 7b303fa37a..c20c5890f9 100644 --- a/.github/workflows/ingest_benchmark.yml +++ b/.github/workflows/ingest_benchmark.yml @@ -32,18 +32,27 @@ jobs: - target_project: new_empty_project_stripe_size_2048 stripe_size: 2048 # 16 MiB postgres_version: 16 + disable_sharding: false - target_project: new_empty_project_stripe_size_32768 stripe_size: 32768 # 256 MiB # note that this is different from null because using null will shard_split the project only if it reaches the threshold # while here it is sharded from the beginning with a shard size of 256 MiB + disable_sharding: false postgres_version: 16 - target_project: new_empty_project stripe_size: null # run with neon defaults which will shard split only when reaching the threshold + disable_sharding: false postgres_version: 16 - target_project: new_empty_project stripe_size: null # run with neon defaults which will shard split only when reaching the threshold + disable_sharding: false postgres_version: 17 - target_project: large_existing_project stripe_size: null # cannot re-shared or choose different stripe size for existing, already sharded project + disable_sharding: false + postgres_version: 16 + - target_project: new_empty_project_unsharded + stripe_size: null # run with neon defaults which will shard split only when reaching the threshold + disable_sharding: true postgres_version: 16 max-parallel: 1 # we want to run each stripe size sequentially to be able to compare the results permissions: @@ -96,6 +105,7 @@ jobs: admin_api_key: ${{ secrets.NEON_STAGING_ADMIN_API_KEY }} shard_count: 8 stripe_size: ${{ matrix.stripe_size }} + disable_sharding: ${{ matrix.disable_sharding }} - name: Initialize Neon project if: ${{ startsWith(matrix.target_project, 'new_empty_project') }} diff --git a/.github/workflows/pin-build-tools-image.yml b/.github/workflows/pin-build-tools-image.yml index 626de2b0e0..b305b662ee 100644 --- a/.github/workflows/pin-build-tools-image.yml +++ b/.github/workflows/pin-build-tools-image.yml @@ -33,10 +33,6 @@ concurrency: # No 
permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. permissions: {} -env: - FROM_TAG: ${{ inputs.from-tag }} - TO_TAG: pinned - jobs: check-manifests: runs-on: ubuntu-22.04 @@ -46,11 +42,14 @@ jobs: steps: - name: Check if we really need to pin the image id: check-manifests + env: + FROM_TAG: ${{ inputs.from-tag }} + TO_TAG: pinned run: | - docker manifest inspect neondatabase/build-tools:${FROM_TAG} > ${FROM_TAG}.json - docker manifest inspect neondatabase/build-tools:${TO_TAG} > ${TO_TAG}.json + docker manifest inspect "docker.io/neondatabase/build-tools:${FROM_TAG}" > "${FROM_TAG}.json" + docker manifest inspect "docker.io/neondatabase/build-tools:${TO_TAG}" > "${TO_TAG}.json" - if diff ${FROM_TAG}.json ${TO_TAG}.json; then + if diff "${FROM_TAG}.json" "${TO_TAG}.json"; then skip=true else skip=false @@ -64,55 +63,34 @@ jobs: # use format(..) to catch both inputs.force = true AND inputs.force = 'true' if: needs.check-manifests.outputs.skip == 'false' || format('{0}', inputs.force) == 'true' - runs-on: ubuntu-22.04 - permissions: - id-token: write # for `azure/login` and aws auth + id-token: write # Required for aws/azure login - steps: - - uses: docker/login-action@v3 - with: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - - - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@v4 - with: - aws-region: eu-central-1 - role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - role-duration-seconds: 3600 - - - name: Login to Amazon Dev ECR - uses: aws-actions/amazon-ecr-login@v2 - - - name: Azure login - uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # @v2.1.1 - with: - client-id: ${{ secrets.AZURE_DEV_CLIENT_ID }} - tenant-id: ${{ secrets.AZURE_TENANT_ID }} - subscription-id: ${{ secrets.AZURE_DEV_SUBSCRIPTION_ID }} - - - name: Login to ACR - run: | - az acr login --name=neoneastus2 - - - name: Tag build-tools with `${{ env.TO_TAG }}` in Docker Hub, ECR, and ACR - env: - DEFAULT_DEBIAN_VERSION: bookworm - run: | - for debian_version in bullseye bookworm; do - tags=() - - tags+=("-t" "neondatabase/build-tools:${TO_TAG}-${debian_version}") - tags+=("-t" "369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${TO_TAG}-${debian_version}") - tags+=("-t" "neoneastus2.azurecr.io/neondatabase/build-tools:${TO_TAG}-${debian_version}") - - if [ "${debian_version}" == "${DEFAULT_DEBIAN_VERSION}" ]; then - tags+=("-t" "neondatabase/build-tools:${TO_TAG}") - tags+=("-t" "369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${TO_TAG}") - tags+=("-t" "neoneastus2.azurecr.io/neondatabase/build-tools:${TO_TAG}") - fi - - docker buildx imagetools create "${tags[@]}" \ - neondatabase/build-tools:${FROM_TAG}-${debian_version} - done + uses: ./.github/workflows/_push-to-container-registry.yml + with: + image-map: | + { + "docker.io/neondatabase/build-tools:${{ inputs.from-tag }}-bullseye": [ + "docker.io/neondatabase/build-tools:pinned-bullseye", + "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/build-tools:pinned-bullseye", + "${{ vars.AZURE_DEV_REGISTRY_NAME }}.azurecr.io/neondatabase/build-tools:pinned-bullseye" + ], + "docker.io/neondatabase/build-tools:${{ inputs.from-tag }}-bookworm": [ + "docker.io/neondatabase/build-tools:pinned-bookworm", + "docker.io/neondatabase/build-tools:pinned", + "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/build-tools:pinned-bookworm", + 
"${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/build-tools:pinned", + "${{ vars.AZURE_DEV_REGISTRY_NAME }}.azurecr.io/neondatabase/build-tools:pinned-bookworm", + "${{ vars.AZURE_DEV_REGISTRY_NAME }}.azurecr.io/neondatabase/build-tools:pinned" + ] + } + aws-region: ${{ vars.AWS_ECR_REGION }} + aws-account-ids: "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}" + azure-client-id: ${{ vars.AZURE_DEV_CLIENT_ID }} + azure-subscription-id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }} + azure-tenant-id: ${{ vars.AZURE_TENANT_ID }} + acr-registry-name: ${{ vars.AZURE_DEV_REGISTRY_NAME }} + secrets: + aws-role-to-assume: "${{ vars.DEV_AWS_OIDC_ROLE_ARN }}" + docker-hub-username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} diff --git a/.github/workflows/regenerate-pg-setting.yml b/.github/workflows/regenerate-pg-setting.yml new file mode 100644 index 0000000000..1e9d2ec5e2 --- /dev/null +++ b/.github/workflows/regenerate-pg-setting.yml @@ -0,0 +1,41 @@ +name: Regenerate Postgres Settings + +on: + pull_request: + types: + - opened + - synchronize + - reopened + paths: + - pgxn/neon/**.c + - vendor/postgres-v* + - vendor/revisions.json + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref }} + cancel-in-progress: true + +permissions: + pull-requests: write + +jobs: + regenerate-pg-settings: + runs-on: ubuntu-22.04 + + steps: + - name: Add comment + uses: thollander/actions-comment-pull-request@v3 + with: + comment-tag: ${{ github.job }} + pr-number: ${{ github.event.number }} + message: | + If this PR added a GUC in the Postgres fork or `neon` extension, + please regenerate the Postgres settings in the `cloud` repo: + + ``` + make NEON_WORKDIR=path/to/neon/checkout \ + -C goapp/internal/shareddomain/postgres generate + ``` + + If you're an external contributor, a Neon employee will assist in + making sure this step is done. 
diff --git a/Cargo.lock b/Cargo.lock index 74922d71c9..f0dbdff3ec 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -786,7 +786,7 @@ dependencies = [ [[package]] name = "azure_core" version = "0.21.0" -source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#c36ed4c039bb3d59b5a1705f2cc337636c73b541" +source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#f64bd57262ced51afce5d8909c06dcb11a6dd85a" dependencies = [ "async-trait", "base64 0.22.1", @@ -815,7 +815,7 @@ dependencies = [ [[package]] name = "azure_identity" version = "0.21.0" -source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#c36ed4c039bb3d59b5a1705f2cc337636c73b541" +source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#f64bd57262ced51afce5d8909c06dcb11a6dd85a" dependencies = [ "async-lock", "async-trait", @@ -834,7 +834,7 @@ dependencies = [ [[package]] name = "azure_storage" version = "0.21.0" -source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#c36ed4c039bb3d59b5a1705f2cc337636c73b541" +source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#f64bd57262ced51afce5d8909c06dcb11a6dd85a" dependencies = [ "RustyXML", "async-lock", @@ -852,7 +852,7 @@ dependencies = [ [[package]] name = "azure_storage_blobs" version = "0.21.0" -source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#c36ed4c039bb3d59b5a1705f2cc337636c73b541" +source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#f64bd57262ced51afce5d8909c06dcb11a6dd85a" dependencies = [ "RustyXML", "azure_core", @@ -872,7 +872,7 @@ dependencies = [ [[package]] name = "azure_svc_blobstorage" version = "0.21.0" -source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#c36ed4c039bb3d59b5a1705f2cc337636c73b541" +source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#f64bd57262ced51afce5d8909c06dcb11a6dd85a" dependencies = [ "azure_core", "bytes", @@ -1029,12 +1029,6 @@ dependencies = [ "generic-array", ] -[[package]] -name = "boxcar" -version = "0.2.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2721c3c5a6f0e7f7e607125d963fedeb765f545f67adc9d71ed934693881eb42" - [[package]] name = "bstr" version = "1.5.0" @@ -1309,6 +1303,7 @@ dependencies = [ "aws-config", "aws-sdk-kms", "aws-sdk-s3", + "aws-smithy-types", "axum", "base64 0.13.1", "bytes", @@ -1321,7 +1316,6 @@ dependencies = [ "flate2", "futures", "http 1.1.0", - "jsonwebtoken", "metrics", "nix 0.27.1", "notify", @@ -1331,7 +1325,6 @@ dependencies = [ "opentelemetry_sdk", "postgres", "postgres_initdb", - "prometheus", "regex", "remote_storage", "reqwest", @@ -1350,13 +1343,13 @@ dependencies = [ "tower 0.5.2", "tower-http", "tracing", - "tracing-opentelemetry", "tracing-subscriber", "tracing-utils", "url", "utils", "uuid", "vm_monitor", + "walkdir", "workspace_hack", "zstd", ] @@ -1881,6 +1874,12 @@ dependencies = [ "syn 2.0.90", ] +[[package]] +name = "difflib" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6184e33543162437515c2e2b48714794e37845ec9851711914eec9d308f6ebe8" + [[package]] name = "digest" version = "0.10.7" @@ -3338,6 +3337,17 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "json-structural-diff" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e878e36a8a44c158505c2c818abdc1350413ad83dcb774a0459f6a7ef2b65cbf" +dependencies = [ 
+ "difflib", + "regex", + "serde_json", +] + [[package]] name = "jsonwebtoken" version = "9.2.0" @@ -4929,7 +4939,6 @@ dependencies = [ "aws-sdk-iam", "aws-sigv4", "base64 0.13.1", - "boxcar", "bstr", "bytes", "camino", @@ -4981,7 +4990,6 @@ dependencies = [ "postgres-protocol2", "postgres_backend", "pq_proto", - "prometheus", "rand 0.8.5", "rand_distr", "rcgen", @@ -5006,7 +5014,6 @@ dependencies = [ "smallvec", "smol_str", "socket2", - "strum", "strum_macros", "subtle", "thiserror 1.0.69", @@ -5021,7 +5028,6 @@ dependencies = [ "tracing", "tracing-log", "tracing-opentelemetry", - "tracing-serde", "tracing-subscriber", "tracing-utils", "try-lock", @@ -6454,6 +6460,7 @@ dependencies = [ "humantime", "hyper 0.14.30", "itertools 0.10.5", + "json-structural-diff", "lasso", "measured", "metrics", @@ -6476,6 +6483,7 @@ dependencies = [ "strum", "strum_macros", "thiserror 1.0.69", + "tikv-jemallocator", "tokio", "tokio-postgres", "tokio-postgres-rustls", @@ -7029,14 +7037,11 @@ dependencies = [ name = "tokio-postgres2" version = "0.1.0" dependencies = [ - "async-trait", - "byteorder", "bytes", "fallible-iterator", "futures-util", "log", "parking_lot 0.12.1", - "percent-encoding", "phf", "pin-project-lite", "postgres-protocol2", @@ -7623,13 +7628,13 @@ dependencies = [ "hex", "hex-literal", "humantime", - "inferno 0.12.0", "jsonwebtoken", "metrics", "nix 0.27.1", "once_cell", "pin-project-lite", "postgres_connection", + "pprof", "pq_proto", "rand 0.8.5", "regex", diff --git a/Cargo.toml b/Cargo.toml index 7228623c6b..21310ce6ec 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -210,6 +210,7 @@ rustls-native-certs = "0.8" x509-parser = "0.16" whoami = "1.5.1" zerocopy = { version = "0.7", features = ["derive"] } +json-structural-diff = { version = "0.2.0" } ## TODO replace this with tracing env_logger = "0.10" diff --git a/build-tools.Dockerfile b/build-tools.Dockerfile index 317eded26e..c103ceaea5 100644 --- a/build-tools.Dockerfile +++ b/build-tools.Dockerfile @@ -292,7 +292,7 @@ WORKDIR /home/nonroot # Rust # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`) -ENV RUSTC_VERSION=1.84.1 +ENV RUSTC_VERSION=1.85.0 ENV RUSTUP_HOME="/home/nonroot/.rustup" ENV PATH="/home/nonroot/.cargo/bin:${PATH}" ARG RUSTFILT_VERSION=0.2.1 diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 30348c2b90..7169cbc41d 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -148,7 +148,7 @@ RUN case $DEBIAN_VERSION in \ apt install --no-install-recommends --no-install-suggests -y \ ninja-build git autoconf automake libtool build-essential bison flex libreadline-dev \ zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget ca-certificates pkg-config libssl-dev \ - libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd curl unzip \ + libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd curl unzip g++ \ $VERSION_INSTALLS \ && apt clean && rm -rf /var/lib/apt/lists/* @@ -1464,6 +1464,31 @@ RUN make release -j $(getconf _NPROCESSORS_ONLN) && \ make install -j $(getconf _NPROCESSORS_ONLN) && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_mooncake.control +######################################################################################### +# +# Layer "pg-duckdb-pg-build" +# compile pg_duckdb extension +# +######################################################################################### +FROM build-deps AS pg_duckdb-src +WORKDIR /ext-src +COPY 
compute/patches/pg_duckdb_v031.patch . +# pg_duckdb build requires source dir to be a git repo to get submodules +# allow neon_superuser to execute some functions that in pg_duckdb are available to superuser only: +# - extension management function duckdb.install_extension() +# - access to duckdb.extensions table and its sequence +RUN git clone --depth 1 --branch v0.3.1 https://github.com/duckdb/pg_duckdb.git pg_duckdb-src && \ + cd pg_duckdb-src && \ + git submodule update --init --recursive && \ + patch -p1 < /ext-src/pg_duckdb_v031.patch + +FROM pg-build AS pg_duckdb-build +ARG PG_VERSION +COPY --from=pg_duckdb-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_duckdb-src +RUN make install -j $(getconf _NPROCESSORS_ONLN) && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_duckdb.control + ######################################################################################### # # Layer "pg_repack" @@ -1484,6 +1509,73 @@ WORKDIR /ext-src/pg_repack-src RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install + +######################################################################################### +# +# Layer "pgaudit" +# compile pgaudit extension +# +######################################################################################### + +FROM build-deps AS pgaudit-src +ARG PG_VERSION +WORKDIR /ext-src +RUN case "${PG_VERSION}" in \ + "v14") \ + export PGAUDIT_VERSION=1.6.2 \ + export PGAUDIT_CHECKSUM=1f350d70a0cbf488c0f2b485e3a5c9b11f78ad9e3cbb95ef6904afa1eb3187eb \ + ;; \ + "v15") \ + export PGAUDIT_VERSION=1.7.0 \ + export PGAUDIT_CHECKSUM=8f4a73e451c88c567e516e6cba7dc1e23bc91686bb6f1f77f8f3126d428a8bd8 \ + ;; \ + "v16") \ + export PGAUDIT_VERSION=16.0 \ + export PGAUDIT_CHECKSUM=d53ef985f2d0b15ba25c512c4ce967dce07b94fd4422c95bd04c4c1a055fe738 \ + ;; \ + "v17") \ + export PGAUDIT_VERSION=17.0 \ + export PGAUDIT_CHECKSUM=7d0d08d030275d525f36cd48b38c6455f1023da863385badff0cec44965bfd8c \ + ;; \ + *) \ + echo "pgaudit is not supported on this PostgreSQL version" && exit 1;; \ + esac && \ + wget https://github.com/pgaudit/pgaudit/archive/refs/tags/${PGAUDIT_VERSION}.tar.gz -O pgaudit.tar.gz && \ + echo "${PGAUDIT_CHECKSUM} pgaudit.tar.gz" | sha256sum --check && \ + mkdir pgaudit-src && cd pgaudit-src && tar xzf ../pgaudit.tar.gz --strip-components=1 -C . + +FROM pg-build AS pgaudit-build +COPY --from=pgaudit-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pgaudit-src +RUN make install USE_PGXS=1 -j $(getconf _NPROCESSORS_ONLN) + +######################################################################################### +# +# Layer "pgauditlogtofile" +# compile pgauditlogtofile extension +# +######################################################################################### + +FROM build-deps AS pgauditlogtofile-src +ARG PG_VERSION +WORKDIR /ext-src +RUN case "${PG_VERSION}" in \ + "v14" | "v15" | "v16" | "v17") \ + export PGAUDITLOGTOFILE_VERSION=v1.6.4 \ + export PGAUDITLOGTOFILE_CHECKSUM=ef801eb09c26aaa935c0dabd92c81eb9ebe338930daa9674d420a280c6bc2d70 \ + ;; \ + *) \ + echo "pgauditlogtofile is not supported on this PostgreSQL version" && exit 1;; \ + esac && \ + wget https://github.com/fmbiete/pgauditlogtofile/archive/refs/tags/${PGAUDITLOGTOFILE_VERSION}.tar.gz -O pgauditlogtofile.tar.gz && \ + echo "${PGAUDITLOGTOFILE_CHECKSUM} pgauditlogtofile.tar.gz" | sha256sum --check && \ + mkdir pgauditlogtofile-src && cd pgauditlogtofile-src && tar xzf ../pgauditlogtofile.tar.gz --strip-components=1 -C . 
+ +FROM pg-build AS pgauditlogtofile-build +COPY --from=pgauditlogtofile-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pgauditlogtofile-src +RUN make install USE_PGXS=1 -j $(getconf _NPROCESSORS_ONLN) + ######################################################################################### # # Layer "neon-ext-build" @@ -1577,7 +1669,14 @@ COPY --from=pg_anon-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_ivm-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_partman-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_mooncake-build /usr/local/pgsql/ /usr/local/pgsql/ + +# Disabled temporarily, because it clashed with pg_mooncake. pg_mooncake +# also depends on libduckdb, but a different version. +#COPY --from=pg_duckdb-build /usr/local/pgsql/ /usr/local/pgsql/ + COPY --from=pg_repack-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pgaudit-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pgauditlogtofile-build /usr/local/pgsql/ /usr/local/pgsql/ ######################################################################################### # @@ -1669,29 +1768,6 @@ RUN if [ "$TARGETARCH" = "amd64" ]; then\ && echo "${pgbouncer_exporter_sha256} pgbouncer_exporter" | sha256sum -c -\ && echo "${sql_exporter_sha256} sql_exporter" | sha256sum -c - -######################################################################################### -# -# Layer "awscli" -# -######################################################################################### -FROM build-deps AS awscli -ARG TARGETARCH -RUN set -ex; \ - if [ "${TARGETARCH}" = "amd64" ]; then \ - TARGETARCH_ALT="x86_64"; \ - CHECKSUM="c9a9df3770a3ff9259cb469b6179e02829687a464e0824d5c32d378820b53a00"; \ - elif [ "${TARGETARCH}" = "arm64" ]; then \ - TARGETARCH_ALT="aarch64"; \ - CHECKSUM="8181730be7891582b38b028112e81b4899ca817e8c616aad807c9e9d1289223a"; \ - else \ - echo "Unsupported architecture: ${TARGETARCH}"; exit 1; \ - fi; \ - curl --retry 5 -L "https://awscli.amazonaws.com/awscli-exe-linux-${TARGETARCH_ALT}-2.17.5.zip" -o /tmp/awscliv2.zip; \ - echo "${CHECKSUM} /tmp/awscliv2.zip" | sha256sum -c -; \ - unzip /tmp/awscliv2.zip -d /tmp/awscliv2; \ - /tmp/awscliv2/aws/install; \ - rm -rf /tmp/awscliv2.zip /tmp/awscliv2 - ######################################################################################### # # Clean up postgres folder before inclusion @@ -1772,14 +1848,20 @@ COPY --from=pg_semver-src /ext-src/ /ext-src/ COPY --from=pg_ivm-src /ext-src/ /ext-src/ COPY --from=pg_partman-src /ext-src/ /ext-src/ #COPY --from=pg_mooncake-src /ext-src/ /ext-src/ -#COPY --from=pg_repack-src /ext-src/ /ext-src/ +COPY --from=pg_repack-src /ext-src/ /ext-src/ +COPY --from=pg_repack-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY compute/patches/pg_repack.patch /ext-src +RUN cd /ext-src/pg_repack-src && patch -p1 OK + \! pg_repack --dbname=contrib_regression --table=tbl_cluster --no-superuser-check + INFO: repacking table "public.tbl_cluster" + -- => ERROR +-\! pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper ++\! PGPASSWORD=NoSuPeRpAsSwOrD pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper + ERROR: pg_repack failed with error: You must be a superuser to use pg_repack + -- => ERROR +-\! pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper --no-superuser-check ++\! 
PGPASSWORD=NoSuPeRpAsSwOrD pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper --no-superuser-check + ERROR: pg_repack failed with error: ERROR: permission denied for schema repack + LINE 1: select repack.version(), repack.version_sql() + ^ + GRANT ALL ON ALL TABLES IN SCHEMA repack TO nosuper; + GRANT USAGE ON SCHEMA repack TO nosuper; + -- => ERROR +-\! pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper --no-superuser-check ++\! PGPASSWORD=NoSuPeRpAsSwOrD pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper --no-superuser-check + INFO: repacking table "public.tbl_cluster" + ERROR: query failed: ERROR: current transaction is aborted, commands ignored until end of transaction block + DETAIL: query was: RESET lock_timeout +diff --git a/regress/sql/nosuper.sql b/regress/sql/nosuper.sql +index 072f0fa..dbe60f8 100644 +--- a/regress/sql/nosuper.sql ++++ b/regress/sql/nosuper.sql +@@ -4,19 +4,19 @@ + SET client_min_messages = error; + DROP ROLE IF EXISTS nosuper; + SET client_min_messages = warning; +-CREATE ROLE nosuper WITH LOGIN; ++CREATE ROLE nosuper WITH LOGIN PASSWORD 'NoSuPeRpAsSwOrD'; + -- => OK + \! pg_repack --dbname=contrib_regression --table=tbl_cluster --no-superuser-check + -- => ERROR +-\! pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper ++\! PGPASSWORD=NoSuPeRpAsSwOrD pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper + -- => ERROR +-\! pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper --no-superuser-check ++\! PGPASSWORD=NoSuPeRpAsSwOrD pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper --no-superuser-check + + GRANT ALL ON ALL TABLES IN SCHEMA repack TO nosuper; + GRANT USAGE ON SCHEMA repack TO nosuper; + + -- => ERROR +-\! pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper --no-superuser-check ++\! PGPASSWORD=NoSuPeRpAsSwOrD pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper --no-superuser-check + + REVOKE ALL ON ALL TABLES IN SCHEMA repack FROM nosuper; + REVOKE USAGE ON SCHEMA repack FROM nosuper; diff --git a/compute/vm-image-spec-bullseye.yaml b/compute/vm-image-spec-bullseye.yaml index 124c40cf5d..6617c98599 100644 --- a/compute/vm-image-spec-bullseye.yaml +++ b/compute/vm-image-spec-bullseye.yaml @@ -47,7 +47,9 @@ files: # Allow postgres user (which is what compute_ctl runs as) to run /neonvm/bin/resize-swap # and /neonvm/bin/set-disk-quota as root without requiring entering a password (NOPASSWD), # regardless of hostname (ALL) - postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota + # + # Also allow it to shut down the VM. The fast_import job does that when it's finished. 
+ postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota, /neonvm/bin/poweroff - filename: cgconfig.conf content: | # Configuration for cgroups in VM compute nodes diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index b8828fa49f..c276996df5 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -14,6 +14,7 @@ base64.workspace = true aws-config.workspace = true aws-sdk-s3.workspace = true aws-sdk-kms.workspace = true +aws-smithy-types.workspace = true anyhow.workspace = true axum = { workspace = true, features = [] } camino.workspace = true @@ -24,7 +25,6 @@ fail.workspace = true flate2.workspace = true futures.workspace = true http.workspace = true -jsonwebtoken.workspace = true metrics.workspace = true nix.workspace = true notify.workspace = true @@ -47,13 +47,12 @@ tokio-postgres.workspace = true tokio-util.workspace = true tokio-stream.workspace = true tracing.workspace = true -tracing-opentelemetry.workspace = true tracing-subscriber.workspace = true tracing-utils.workspace = true thiserror.workspace = true url.workspace = true uuid.workspace = true -prometheus.workspace = true +walkdir.workspace = true postgres_initdb.workspace = true compute_api.workspace = true diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index a8803ec793..1cdae718fe 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -41,7 +41,6 @@ use std::process::exit; use std::str::FromStr; use std::sync::atomic::Ordering; use std::sync::{mpsc, Arc, Condvar, Mutex, RwLock}; -use std::time::SystemTime; use std::{thread, time::Duration}; use anyhow::{Context, Result}; @@ -86,19 +85,6 @@ fn parse_remote_ext_config(arg: &str) -> Result { } } -/// Generate a compute ID if one is not supplied. This exists to keep forward -/// compatibility tests working, but will be removed in a future iteration. -fn generate_compute_id() -> String { - let now = SystemTime::now(); - - format!( - "compute-{}", - now.duration_since(SystemTime::UNIX_EPOCH) - .unwrap() - .as_secs() - ) -} - #[derive(Parser)] #[command(rename_all = "kebab-case")] struct Cli { @@ -112,16 +98,13 @@ struct Cli { /// outside the compute will talk to the compute through this port. Keep /// the previous name for this argument around for a smoother release /// with the control plane. - /// - /// TODO: Remove the alias after the control plane release which teaches the - /// control plane about the renamed argument. - #[arg(long, alias = "http-port", default_value_t = 3080)] + #[arg(long, default_value_t = 3080)] pub external_http_port: u16, - /// The port to bind the internal listening HTTP server to. Clients like + /// The port to bind the internal listening HTTP server to. Clients include /// the neon extension (for installing remote extensions) and local_proxy. 
-    #[arg(long)]
-    pub internal_http_port: Option<u16>,
+    #[arg(long, default_value_t = 3081)]
+    pub internal_http_port: u16,

     #[arg(short = 'D', long, value_name = "DATADIR")]
     pub pgdata: String,
@@ -156,7 +139,7 @@ struct Cli {
     #[arg(short = 'S', long, group = "spec-path")]
     pub spec_path: Option<OsString>,

-    #[arg(short = 'i', long, group = "compute-id", default_value = generate_compute_id())]
+    #[arg(short = 'i', long, group = "compute-id")]
     pub compute_id: String,

     #[arg(short = 'p', long, conflicts_with_all = ["spec", "spec-path"], value_name = "CONTROL_PLANE_API_BASE_URL")]
@@ -359,7 +342,7 @@ fn wait_spec(
         pgbin: cli.pgbin.clone(),
         pgversion: get_pg_version_string(&cli.pgbin),
         external_http_port: cli.external_http_port,
-        internal_http_port: cli.internal_http_port.unwrap_or(cli.external_http_port + 1),
+        internal_http_port: cli.internal_http_port,
         live_config_allowed,
         state: Mutex::new(new_state),
         state_changed: Condvar::new(),
@@ -383,7 +366,7 @@ fn wait_spec(

     // The internal HTTP server could be launched later, but there isn't much
     // sense in waiting.
-    Server::Internal(cli.internal_http_port.unwrap_or(cli.external_http_port + 1)).launch(&compute);
+    Server::Internal(cli.internal_http_port).launch(&compute);

     if !spec_set {
         // No spec provided, hang waiting for it.
diff --git a/compute_tools/src/bin/fast_import.rs b/compute_tools/src/bin/fast_import.rs
index 27cf1c2317..585f3e4e1d 100644
--- a/compute_tools/src/bin/fast_import.rs
+++ b/compute_tools/src/bin/fast_import.rs
@@ -25,10 +25,10 @@
 //! docker push localhost:3030/localregistry/compute-node-v14:latest
 //! ```

-use anyhow::Context;
+use anyhow::{bail, Context};
 use aws_config::BehaviorVersion;
 use camino::{Utf8Path, Utf8PathBuf};
-use clap::Parser;
+use clap::{Parser, Subcommand};
 use compute_tools::extension_server::{get_pg_version, PostgresMajorVersion};
 use nix::unistd::Pid;
 use tracing::{error, info, info_span, warn, Instrument};
@@ -44,32 +44,59 @@ mod s3_uri;
 const PG_WAIT_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(600);
 const PG_WAIT_RETRY_INTERVAL: std::time::Duration = std::time::Duration::from_millis(300);

+#[derive(Subcommand, Debug)]
+enum Command {
+    /// Runs local postgres (neon binary), restores into it,
+    /// uploads pgdata to s3 to be consumed by pageservers
+    Pgdata {
+        /// Raw connection string to the source database. Used only in tests,
+        /// real scenario uses encrypted connection string in spec.json from s3.
+        #[clap(long)]
+        source_connection_string: Option<String>,
+        /// If specified, will not shut down the local postgres after the import. Used in local testing
+        #[clap(short, long)]
+        interactive: bool,
+        /// Port to run postgres on. Default is 5432.
+        #[clap(long, default_value_t = 5432)]
+        pg_port: u16, // port to run postgres on, 5432 is default
+
+        /// Number of CPUs in the system. This is used to configure # of
+        /// parallel worker processes, for index creation.
+        #[clap(long, env = "NEON_IMPORTER_NUM_CPUS")]
+        num_cpus: Option<usize>,
+
+        /// Amount of RAM in the system. This is used to configure shared_buffers
+        /// and maintenance_work_mem.
+        #[clap(long, env = "NEON_IMPORTER_MEMORY_MB")]
+        memory_mb: Option<usize>,
+    },
+
+    /// Runs pg_dump-pg_restore from source to destination without running local postgres.
+    DumpRestore {
+        /// Raw connection string to the source database. Used only in tests,
+        /// real scenario uses encrypted connection string in spec.json from s3.
+        #[clap(long)]
+        source_connection_string: Option<String>,
+        /// Raw connection string to the destination database.
Used only in tests, + /// real scenario uses encrypted connection string in spec.json from s3. + #[clap(long)] + destination_connection_string: Option, + }, +} + #[derive(clap::Parser)] struct Args { - #[clap(long)] + #[clap(long, env = "NEON_IMPORTER_WORKDIR")] working_directory: Utf8PathBuf, #[clap(long, env = "NEON_IMPORTER_S3_PREFIX")] s3_prefix: Option, - #[clap(long)] - source_connection_string: Option, - #[clap(short, long)] - interactive: bool, - #[clap(long)] + #[clap(long, env = "NEON_IMPORTER_PG_BIN_DIR")] pg_bin_dir: Utf8PathBuf, - #[clap(long)] + #[clap(long, env = "NEON_IMPORTER_PG_LIB_DIR")] pg_lib_dir: Utf8PathBuf, - #[clap(long)] - pg_port: Option, // port to run postgres on, 5432 is default - /// Number of CPUs in the system. This is used to configure # of - /// parallel worker processes, for index creation. - #[clap(long, env = "NEON_IMPORTER_NUM_CPUS")] - num_cpus: Option, - - /// Amount of RAM in the system. This is used to configure shared_buffers - /// and maintenance_work_mem. - #[clap(long, env = "NEON_IMPORTER_MEMORY_MB")] - memory_mb: Option, + #[clap(subcommand)] + command: Command, } #[serde_with::serde_as] @@ -78,6 +105,8 @@ struct Spec { encryption_secret: EncryptionSecret, #[serde_as(as = "serde_with::base64::Base64")] source_connstring_ciphertext_base64: Vec, + #[serde_as(as = "Option")] + destination_connstring_ciphertext_base64: Option>, } #[derive(serde::Deserialize)] @@ -93,192 +122,150 @@ const DEFAULT_LOCALE: &str = if cfg!(target_os = "macos") { "C.UTF-8" }; -#[tokio::main] -pub(crate) async fn main() -> anyhow::Result<()> { - utils::logging::init( - utils::logging::LogFormat::Plain, - utils::logging::TracingErrorLayerEnablement::EnableWithRustLogFilter, - utils::logging::Output::Stdout, - )?; - - info!("starting"); - - let args = Args::parse(); - - // Validate arguments - if args.s3_prefix.is_none() && args.source_connection_string.is_none() { - anyhow::bail!("either s3_prefix or source_connection_string must be specified"); - } - if args.s3_prefix.is_some() && args.source_connection_string.is_some() { - anyhow::bail!("only one of s3_prefix or source_connection_string can be specified"); - } - - let working_directory = args.working_directory; - let pg_bin_dir = args.pg_bin_dir; - let pg_lib_dir = args.pg_lib_dir; - let pg_port = args.pg_port.unwrap_or_else(|| { - info!("pg_port not specified, using default 5432"); - 5432 - }); - - // Initialize AWS clients only if s3_prefix is specified - let (aws_config, kms_client) = if args.s3_prefix.is_some() { - let config = aws_config::load_defaults(BehaviorVersion::v2024_03_28()).await; - let kms = aws_sdk_kms::Client::new(&config); - (Some(config), Some(kms)) - } else { - (None, None) - }; - - // Get source connection string either from S3 spec or direct argument - let source_connection_string = if let Some(s3_prefix) = &args.s3_prefix { - let spec: Spec = { - let spec_key = s3_prefix.append("/spec.json"); - let s3_client = aws_sdk_s3::Client::new(aws_config.as_ref().unwrap()); - let object = s3_client - .get_object() - .bucket(&spec_key.bucket) - .key(spec_key.key) - .send() - .await - .context("get spec from s3")? - .body - .collect() - .await - .context("download spec body")?; - serde_json::from_slice(&object.into_bytes()).context("parse spec as json")? 
- }; - - match spec.encryption_secret { - EncryptionSecret::KMS { key_id } => { - let mut output = kms_client - .unwrap() - .decrypt() - .key_id(key_id) - .ciphertext_blob(aws_sdk_s3::primitives::Blob::new( - spec.source_connstring_ciphertext_base64, - )) - .send() - .await - .context("decrypt source connection string")?; - let plaintext = output - .plaintext - .take() - .context("get plaintext source connection string")?; - String::from_utf8(plaintext.into_inner()) - .context("parse source connection string as utf8")? - } - } - } else { - args.source_connection_string.unwrap() - }; - - match tokio::fs::create_dir(&working_directory).await { - Ok(()) => {} - Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => { - if !is_directory_empty(&working_directory) - .await - .context("check if working directory is empty")? - { - anyhow::bail!("working directory is not empty"); - } else { - // ok - } - } - Err(e) => return Err(anyhow::Error::new(e).context("create working directory")), - } - - let pgdata_dir = working_directory.join("pgdata"); - tokio::fs::create_dir(&pgdata_dir) +async fn decode_connstring( + kms_client: &aws_sdk_kms::Client, + key_id: &String, + connstring_ciphertext_base64: Vec, +) -> Result { + let mut output = kms_client + .decrypt() + .key_id(key_id) + .ciphertext_blob(aws_sdk_s3::primitives::Blob::new( + connstring_ciphertext_base64, + )) + .send() .await - .context("create pgdata directory")?; + .context("decrypt connection string")?; - let pgbin = pg_bin_dir.join("postgres"); - let pg_version = match get_pg_version(pgbin.as_ref()) { - PostgresMajorVersion::V14 => 14, - PostgresMajorVersion::V15 => 15, - PostgresMajorVersion::V16 => 16, - PostgresMajorVersion::V17 => 17, - }; - let superuser = "cloud_admin"; // XXX: this shouldn't be hard-coded - postgres_initdb::do_run_initdb(postgres_initdb::RunInitdbArgs { - superuser, - locale: DEFAULT_LOCALE, // XXX: this shouldn't be hard-coded, - pg_version, - initdb_bin: pg_bin_dir.join("initdb").as_ref(), - library_search_path: &pg_lib_dir, // TODO: is this right? Prob works in compute image, not sure about neon_local. - pgdata: &pgdata_dir, - }) - .await - .context("initdb")?; + let plaintext = output + .plaintext + .take() + .context("get plaintext connection string")?; - // If the caller didn't specify CPU / RAM to use for sizing, default to - // number of CPUs in the system, and pretty arbitrarily, 256 MB of RAM. - let nproc = args.num_cpus.unwrap_or_else(num_cpus::get); - let memory_mb = args.memory_mb.unwrap_or(256); + String::from_utf8(plaintext.into_inner()).context("parse connection string as utf8") +} - // Somewhat arbitrarily, use 10 % of memory for shared buffer cache, 70% for - // maintenance_work_mem (i.e. for sorting during index creation), and leave the rest - // available for misc other stuff that PostgreSQL uses memory for. 
- let shared_buffers_mb = ((memory_mb as f32) * 0.10) as usize; - let maintenance_work_mem_mb = ((memory_mb as f32) * 0.70) as usize; +struct PostgresProcess { + pgdata_dir: Utf8PathBuf, + pg_bin_dir: Utf8PathBuf, + pgbin: Utf8PathBuf, + pg_lib_dir: Utf8PathBuf, + postgres_proc: Option, +} - // - // Launch postgres process - // - let mut postgres_proc = tokio::process::Command::new(pgbin) - .arg("-D") - .arg(&pgdata_dir) - .args(["-p", &format!("{pg_port}")]) - .args(["-c", "wal_level=minimal"]) - .args(["-c", &format!("shared_buffers={shared_buffers_mb}MB")]) - .args(["-c", "max_wal_senders=0"]) - .args(["-c", "fsync=off"]) - .args(["-c", "full_page_writes=off"]) - .args(["-c", "synchronous_commit=off"]) - .args([ - "-c", - &format!("maintenance_work_mem={maintenance_work_mem_mb}MB"), - ]) - .args(["-c", &format!("max_parallel_maintenance_workers={nproc}")]) - .args(["-c", &format!("max_parallel_workers={nproc}")]) - .args(["-c", &format!("max_parallel_workers_per_gather={nproc}")]) - .args(["-c", &format!("max_worker_processes={nproc}")]) - .args([ - "-c", - &format!( - "effective_io_concurrency={}", - if cfg!(target_os = "macos") { 0 } else { 100 } - ), - ]) - .env_clear() - .env("LD_LIBRARY_PATH", &pg_lib_dir) - .env( - "ASAN_OPTIONS", - std::env::var("ASAN_OPTIONS").unwrap_or_default(), +impl PostgresProcess { + fn new(pgdata_dir: Utf8PathBuf, pg_bin_dir: Utf8PathBuf, pg_lib_dir: Utf8PathBuf) -> Self { + Self { + pgdata_dir, + pgbin: pg_bin_dir.join("postgres"), + pg_bin_dir, + pg_lib_dir, + postgres_proc: None, + } + } + + async fn prepare(&self, initdb_user: &str) -> Result<(), anyhow::Error> { + tokio::fs::create_dir(&self.pgdata_dir) + .await + .context("create pgdata directory")?; + + let pg_version = match get_pg_version(self.pgbin.as_ref()) { + PostgresMajorVersion::V14 => 14, + PostgresMajorVersion::V15 => 15, + PostgresMajorVersion::V16 => 16, + PostgresMajorVersion::V17 => 17, + }; + postgres_initdb::do_run_initdb(postgres_initdb::RunInitdbArgs { + superuser: initdb_user, + locale: DEFAULT_LOCALE, // XXX: this shouldn't be hard-coded, + pg_version, + initdb_bin: self.pg_bin_dir.join("initdb").as_ref(), + library_search_path: &self.pg_lib_dir, // TODO: is this right? Prob works in compute image, not sure about neon_local. + pgdata: &self.pgdata_dir, + }) + .await + .context("initdb") + } + + async fn start( + &mut self, + initdb_user: &str, + port: u16, + nproc: usize, + memory_mb: usize, + ) -> Result<&tokio::process::Child, anyhow::Error> { + self.prepare(initdb_user).await?; + + // Somewhat arbitrarily, use 10 % of memory for shared buffer cache, 70% for + // maintenance_work_mem (i.e. for sorting during index creation), and leave the rest + // available for misc other stuff that PostgreSQL uses memory for. 
+ let shared_buffers_mb = ((memory_mb as f32) * 0.10) as usize; + let maintenance_work_mem_mb = ((memory_mb as f32) * 0.70) as usize; + + // + // Launch postgres process + // + let mut proc = tokio::process::Command::new(&self.pgbin) + .arg("-D") + .arg(&self.pgdata_dir) + .args(["-p", &format!("{port}")]) + .args(["-c", "wal_level=minimal"]) + .args(["-c", &format!("shared_buffers={shared_buffers_mb}MB")]) + .args(["-c", "max_wal_senders=0"]) + .args(["-c", "fsync=off"]) + .args(["-c", "full_page_writes=off"]) + .args(["-c", "synchronous_commit=off"]) + .args([ + "-c", + &format!("maintenance_work_mem={maintenance_work_mem_mb}MB"), + ]) + .args(["-c", &format!("max_parallel_maintenance_workers={nproc}")]) + .args(["-c", &format!("max_parallel_workers={nproc}")]) + .args(["-c", &format!("max_parallel_workers_per_gather={nproc}")]) + .args(["-c", &format!("max_worker_processes={nproc}")]) + .args(["-c", "effective_io_concurrency=100"]) + .env_clear() + .env("LD_LIBRARY_PATH", &self.pg_lib_dir) + .env( + "ASAN_OPTIONS", + std::env::var("ASAN_OPTIONS").unwrap_or_default(), + ) + .env( + "UBSAN_OPTIONS", + std::env::var("UBSAN_OPTIONS").unwrap_or_default(), + ) + .stdout(std::process::Stdio::piped()) + .stderr(std::process::Stdio::piped()) + .spawn() + .context("spawn postgres")?; + + info!("spawned postgres, waiting for it to become ready"); + tokio::spawn( + child_stdio_to_log::relay_process_output(proc.stdout.take(), proc.stderr.take()) + .instrument(info_span!("postgres")), + ); + + self.postgres_proc = Some(proc); + Ok(self.postgres_proc.as_ref().unwrap()) + } + + async fn shutdown(&mut self) -> Result<(), anyhow::Error> { + let proc: &mut tokio::process::Child = self.postgres_proc.as_mut().unwrap(); + info!("shutdown postgres"); + nix::sys::signal::kill( + Pid::from_raw(i32::try_from(proc.id().unwrap()).expect("convert child pid to i32")), + nix::sys::signal::SIGTERM, ) - .env( - "UBSAN_OPTIONS", - std::env::var("UBSAN_OPTIONS").unwrap_or_default(), - ) - .stdout(std::process::Stdio::piped()) - .stderr(std::process::Stdio::piped()) - .spawn() - .context("spawn postgres")?; - - info!("spawned postgres, waiting for it to become ready"); - tokio::spawn( - child_stdio_to_log::relay_process_output( - postgres_proc.stdout.take(), - postgres_proc.stderr.take(), - ) - .instrument(info_span!("postgres")), - ); + .context("signal postgres to shut down")?; + proc.wait() + .await + .context("wait for postgres to shut down") + .map(|_| ()) + } +} +async fn wait_until_ready(connstring: String, create_dbname: String) { // Create neondb database in the running postgres - let restore_pg_connstring = - format!("host=localhost port={pg_port} user={superuser} dbname=postgres"); - let start_time = std::time::Instant::now(); loop { @@ -289,7 +276,12 @@ pub(crate) async fn main() -> anyhow::Result<()> { std::process::exit(1); } - match tokio_postgres::connect(&restore_pg_connstring, tokio_postgres::NoTls).await { + match tokio_postgres::connect( + &connstring.replace("dbname=neondb", "dbname=postgres"), + tokio_postgres::NoTls, + ) + .await + { Ok((client, connection)) => { // Spawn the connection handling task to maintain the connection tokio::spawn(async move { @@ -298,9 +290,12 @@ pub(crate) async fn main() -> anyhow::Result<()> { } }); - match client.simple_query("CREATE DATABASE neondb;").await { + match client + .simple_query(format!("CREATE DATABASE {create_dbname};").as_str()) + .await + { Ok(_) => { - info!("created neondb database"); + info!("created {} database", create_dbname); break; } Err(e) => { 
@@ -324,10 +319,16 @@ pub(crate) async fn main() -> anyhow::Result<()> { } } } +} - let restore_pg_connstring = restore_pg_connstring.replace("dbname=postgres", "dbname=neondb"); - - let dumpdir = working_directory.join("dumpdir"); +async fn run_dump_restore( + workdir: Utf8PathBuf, + pg_bin_dir: Utf8PathBuf, + pg_lib_dir: Utf8PathBuf, + source_connstring: String, + destination_connstring: String, +) -> Result<(), anyhow::Error> { + let dumpdir = workdir.join("dumpdir"); let common_args = [ // schema mapping (prob suffices to specify them on one side) @@ -356,10 +357,18 @@ pub(crate) async fn main() -> anyhow::Result<()> { .arg("--no-sync") // POSITIONAL args // source db (db name included in connection string) - .arg(&source_connection_string) + .arg(&source_connstring) // how we run it .env_clear() .env("LD_LIBRARY_PATH", &pg_lib_dir) + .env( + "ASAN_OPTIONS", + std::env::var("ASAN_OPTIONS").unwrap_or_default(), + ) + .env( + "UBSAN_OPTIONS", + std::env::var("UBSAN_OPTIONS").unwrap_or_default(), + ) .kill_on_drop(true) .stdout(std::process::Stdio::piped()) .stderr(std::process::Stdio::piped()) @@ -376,24 +385,31 @@ pub(crate) async fn main() -> anyhow::Result<()> { let st = pg_dump.wait().await.context("wait for pg_dump")?; info!(status=?st, "pg_dump exited"); if !st.success() { - warn!(status=%st, "pg_dump failed, restore will likely fail as well"); + error!(status=%st, "pg_dump failed, restore will likely fail as well"); + bail!("pg_dump failed"); } } - // TODO: do it in a streaming way, plenty of internal research done on this already + // TODO: maybe do it in a streaming way, plenty of internal research done on this already // TODO: do the unlogged table trick - - info!("restore from working directory into vanilla postgres"); { let mut pg_restore = tokio::process::Command::new(pg_bin_dir.join("pg_restore")) .args(&common_args) .arg("-d") - .arg(&restore_pg_connstring) + .arg(&destination_connstring) // POSITIONAL args .arg(&dumpdir) // how we run it .env_clear() .env("LD_LIBRARY_PATH", &pg_lib_dir) + .env( + "ASAN_OPTIONS", + std::env::var("ASAN_OPTIONS").unwrap_or_default(), + ) + .env( + "UBSAN_OPTIONS", + std::env::var("UBSAN_OPTIONS").unwrap_or_default(), + ) .kill_on_drop(true) .stdout(std::process::Stdio::piped()) .stderr(std::process::Stdio::piped()) @@ -411,48 +427,259 @@ pub(crate) async fn main() -> anyhow::Result<()> { let st = pg_restore.wait().await.context("wait for pg_restore")?; info!(status=?st, "pg_restore exited"); if !st.success() { - warn!(status=%st, "pg_restore failed, restore will likely fail as well"); - } - } - - // If interactive mode, wait for Ctrl+C - if args.interactive { - info!("Running in interactive mode. 
Press Ctrl+C to shut down."); - tokio::signal::ctrl_c().await.context("wait for ctrl-c")?; - } - - info!("shutdown postgres"); - { - nix::sys::signal::kill( - Pid::from_raw( - i32::try_from(postgres_proc.id().unwrap()).expect("convert child pid to i32"), - ), - nix::sys::signal::SIGTERM, - ) - .context("signal postgres to shut down")?; - postgres_proc - .wait() - .await - .context("wait for postgres to shut down")?; - } - - // Only sync if s3_prefix was specified - if let Some(s3_prefix) = args.s3_prefix { - info!("upload pgdata"); - aws_s3_sync::sync(Utf8Path::new(&pgdata_dir), &s3_prefix.append("/pgdata/")) - .await - .context("sync dump directory to destination")?; - - info!("write status"); - { - let status_dir = working_directory.join("status"); - std::fs::create_dir(&status_dir).context("create status directory")?; - let status_file = status_dir.join("pgdata"); - std::fs::write(&status_file, serde_json::json!({"done": true}).to_string()) - .context("write status file")?; - aws_s3_sync::sync(&status_dir, &s3_prefix.append("/status/")) - .await - .context("sync status directory to destination")?; + error!(status=%st, "pg_restore failed, restore will likely fail as well"); + bail!("pg_restore failed"); + } + } + + Ok(()) +} + +#[allow(clippy::too_many_arguments)] +async fn cmd_pgdata( + s3_client: Option, + kms_client: Option, + maybe_s3_prefix: Option, + maybe_spec: Option, + source_connection_string: Option, + interactive: bool, + pg_port: u16, + workdir: Utf8PathBuf, + pg_bin_dir: Utf8PathBuf, + pg_lib_dir: Utf8PathBuf, + num_cpus: Option, + memory_mb: Option, +) -> Result<(), anyhow::Error> { + if maybe_spec.is_none() && source_connection_string.is_none() { + bail!("spec must be provided for pgdata command"); + } + if maybe_spec.is_some() && source_connection_string.is_some() { + bail!("only one of spec or source_connection_string can be provided"); + } + + let source_connection_string = if let Some(spec) = maybe_spec { + match spec.encryption_secret { + EncryptionSecret::KMS { key_id } => { + decode_connstring( + kms_client.as_ref().unwrap(), + &key_id, + spec.source_connstring_ciphertext_base64, + ) + .await? + } + } + } else { + source_connection_string.unwrap() + }; + + let superuser = "cloud_admin"; + let destination_connstring = format!( + "host=localhost port={} user={} dbname=neondb", + pg_port, superuser + ); + + let pgdata_dir = workdir.join("pgdata"); + let mut proc = PostgresProcess::new(pgdata_dir.clone(), pg_bin_dir.clone(), pg_lib_dir.clone()); + let nproc = num_cpus.unwrap_or_else(num_cpus::get); + let memory_mb = memory_mb.unwrap_or(256); + proc.start(superuser, pg_port, nproc, memory_mb).await?; + wait_until_ready(destination_connstring.clone(), "neondb".to_string()).await; + + run_dump_restore( + workdir.clone(), + pg_bin_dir, + pg_lib_dir, + source_connection_string, + destination_connstring, + ) + .await?; + + // If interactive mode, wait for Ctrl+C + if interactive { + info!("Running in interactive mode. 
Press Ctrl+C to shut down."); + tokio::signal::ctrl_c().await.context("wait for ctrl-c")?; + } + + proc.shutdown().await?; + + // Only sync if s3_prefix was specified + if let Some(s3_prefix) = maybe_s3_prefix { + info!("upload pgdata"); + aws_s3_sync::upload_dir_recursive( + s3_client.as_ref().unwrap(), + Utf8Path::new(&pgdata_dir), + &s3_prefix.append("/pgdata/"), + ) + .await + .context("sync dump directory to destination")?; + + info!("write status"); + { + let status_dir = workdir.join("status"); + std::fs::create_dir(&status_dir).context("create status directory")?; + let status_file = status_dir.join("pgdata"); + std::fs::write(&status_file, serde_json::json!({"done": true}).to_string()) + .context("write status file")?; + aws_s3_sync::upload_dir_recursive( + s3_client.as_ref().unwrap(), + &status_dir, + &s3_prefix.append("/status/"), + ) + .await + .context("sync status directory to destination")?; + } + } + + Ok(()) +} + +async fn cmd_dumprestore( + kms_client: Option, + maybe_spec: Option, + source_connection_string: Option, + destination_connection_string: Option, + workdir: Utf8PathBuf, + pg_bin_dir: Utf8PathBuf, + pg_lib_dir: Utf8PathBuf, +) -> Result<(), anyhow::Error> { + let (source_connstring, destination_connstring) = if let Some(spec) = maybe_spec { + match spec.encryption_secret { + EncryptionSecret::KMS { key_id } => { + let source = decode_connstring( + kms_client.as_ref().unwrap(), + &key_id, + spec.source_connstring_ciphertext_base64, + ) + .await?; + + let dest = if let Some(dest_ciphertext) = + spec.destination_connstring_ciphertext_base64 + { + decode_connstring(kms_client.as_ref().unwrap(), &key_id, dest_ciphertext) + .await? + } else { + bail!("destination connection string must be provided in spec for dump_restore command"); + }; + + (source, dest) + } + } + } else { + ( + source_connection_string.unwrap(), + if let Some(val) = destination_connection_string { + val + } else { + bail!("destination connection string must be provided for dump_restore command"); + }, + ) + }; + + run_dump_restore( + workdir, + pg_bin_dir, + pg_lib_dir, + source_connstring, + destination_connstring, + ) + .await +} + +#[tokio::main] +pub(crate) async fn main() -> anyhow::Result<()> { + utils::logging::init( + utils::logging::LogFormat::Json, + utils::logging::TracingErrorLayerEnablement::EnableWithRustLogFilter, + utils::logging::Output::Stdout, + )?; + + info!("starting"); + + let args = Args::parse(); + + // Initialize AWS clients only if s3_prefix is specified + let (s3_client, kms_client) = if args.s3_prefix.is_some() { + let config = aws_config::load_defaults(BehaviorVersion::v2024_03_28()).await; + let s3_client = aws_sdk_s3::Client::new(&config); + let kms = aws_sdk_kms::Client::new(&config); + (Some(s3_client), Some(kms)) + } else { + (None, None) + }; + + let spec: Option = if let Some(s3_prefix) = &args.s3_prefix { + let spec_key = s3_prefix.append("/spec.json"); + let object = s3_client + .as_ref() + .unwrap() + .get_object() + .bucket(&spec_key.bucket) + .key(spec_key.key) + .send() + .await + .context("get spec from s3")? + .body + .collect() + .await + .context("download spec body")?; + serde_json::from_slice(&object.into_bytes()).context("parse spec as json")? + } else { + None + }; + + match tokio::fs::create_dir(&args.working_directory).await { + Ok(()) => {} + Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => { + if !is_directory_empty(&args.working_directory) + .await + .context("check if working directory is empty")? 
+ { + bail!("working directory is not empty"); + } else { + // ok + } + } + Err(e) => return Err(anyhow::Error::new(e).context("create working directory")), + } + + match args.command { + Command::Pgdata { + source_connection_string, + interactive, + pg_port, + num_cpus, + memory_mb, + } => { + cmd_pgdata( + s3_client, + kms_client, + args.s3_prefix, + spec, + source_connection_string, + interactive, + pg_port, + args.working_directory, + args.pg_bin_dir, + args.pg_lib_dir, + num_cpus, + memory_mb, + ) + .await?; + } + Command::DumpRestore { + source_connection_string, + destination_connection_string, + } => { + cmd_dumprestore( + kms_client, + spec, + source_connection_string, + destination_connection_string, + args.working_directory, + args.pg_bin_dir, + args.pg_lib_dir, + ) + .await?; } } diff --git a/compute_tools/src/bin/fast_import/aws_s3_sync.rs b/compute_tools/src/bin/fast_import/aws_s3_sync.rs index 5fa58c8f87..1be10b36d6 100644 --- a/compute_tools/src/bin/fast_import/aws_s3_sync.rs +++ b/compute_tools/src/bin/fast_import/aws_s3_sync.rs @@ -1,24 +1,102 @@ -use anyhow::Context; -use camino::Utf8Path; +use camino::{Utf8Path, Utf8PathBuf}; +use tokio::task::JoinSet; +use walkdir::WalkDir; use super::s3_uri::S3Uri; -pub(crate) async fn sync(local: &Utf8Path, remote: &S3Uri) -> anyhow::Result<()> { - let mut builder = tokio::process::Command::new("aws"); - builder - .arg("s3") - .arg("sync") - .arg(local.as_str()) - .arg(remote.to_string()); - let st = builder - .spawn() - .context("spawn aws s3 sync")? - .wait() - .await - .context("wait for aws s3 sync")?; - if st.success() { - Ok(()) - } else { - Err(anyhow::anyhow!("aws s3 sync failed")) +use tracing::{info, warn}; + +const MAX_PARALLEL_UPLOADS: usize = 10; + +/// Upload all files from 'local' to 'remote' +pub(crate) async fn upload_dir_recursive( + s3_client: &aws_sdk_s3::Client, + local: &Utf8Path, + remote: &S3Uri, +) -> anyhow::Result<()> { + // Recursively scan directory + let mut dirwalker = WalkDir::new(local) + .into_iter() + .map(|entry| { + let entry = entry?; + let file_type = entry.file_type(); + let path = <&Utf8Path>::try_from(entry.path())?.to_path_buf(); + Ok((file_type, path)) + }) + .filter_map(|e: anyhow::Result<(std::fs::FileType, Utf8PathBuf)>| { + match e { + Ok((file_type, path)) if file_type.is_file() => Some(Ok(path)), + Ok((file_type, _path)) if file_type.is_dir() => { + // The WalkDir iterator will recurse into directories, but we don't want + // to do anything with directories as such. There's no concept of uploading + // an empty directory to S3. + None + } + Ok((file_type, path)) if file_type.is_symlink() => { + // huh, didn't expect a symlink. Can't upload that to S3. Warn and skip. + warn!("cannot upload symlink ({})", path); + None + } + Ok((_file_type, path)) => { + // should not happen + warn!("directory entry has unexpected type ({})", path); + None + } + Err(e) => Some(Err(e)), + } + }); + + // Spawn upload tasks for each file, keeping MAX_PARALLEL_UPLOADS active in + // parallel. + let mut joinset = JoinSet::new(); + loop { + // Could we upload more? 
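+ // Keep at most MAX_PARALLEL_UPLOADS uploads in flight: top the JoinSet up + // from the directory walker, then fall through to join_next() below, which + // waits for one task to finish before the set is refilled; a panicked or + // cancelled upload task aborts the whole sync via '?'.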
+ while joinset.len() < MAX_PARALLEL_UPLOADS { + if let Some(full_local_path) = dirwalker.next() { + let full_local_path = full_local_path?; + let relative_local_path = full_local_path + .strip_prefix(local) + .expect("all paths start from the walkdir root"); + let remote_path = remote.append(relative_local_path.as_str()); + info!( + "starting upload of {} to {}", + &full_local_path, &remote_path + ); + let upload_task = upload_file(s3_client.clone(), full_local_path, remote_path); + joinset.spawn(upload_task); + } else { + info!("draining upload tasks"); + break; + } + } + + // Wait for an upload to complete + if let Some(res) = joinset.join_next().await { + let _ = res?; + } else { + // all done! + break; + } } + Ok(()) +} + +pub(crate) async fn upload_file( + s3_client: aws_sdk_s3::Client, + local_path: Utf8PathBuf, + remote: S3Uri, +) -> anyhow::Result<()> { + use aws_smithy_types::byte_stream::ByteStream; + let stream = ByteStream::from_path(&local_path).await?; + + let _result = s3_client + .put_object() + .bucket(remote.bucket) + .key(&remote.key) + .body(stream) + .send() + .await?; + info!("upload of {} to {} finished", &local_path, &remote.key); + + Ok(()) } diff --git a/compute_tools/src/sql/drop_subscriptions.sql b/compute_tools/src/sql/drop_subscriptions.sql index dfb925e48e..03e8e158fa 100644 --- a/compute_tools/src/sql/drop_subscriptions.sql +++ b/compute_tools/src/sql/drop_subscriptions.sql @@ -2,6 +2,7 @@ DO $$ DECLARE subname TEXT; BEGIN + LOCK TABLE pg_subscription IN ACCESS EXCLUSIVE MODE; FOR subname IN SELECT pg_subscription.subname FROM pg_subscription WHERE subdbid = (SELECT oid FROM pg_database WHERE datname = {datname_str}) LOOP EXECUTE format('ALTER SUBSCRIPTION %I DISABLE;', subname); EXECUTE format('ALTER SUBSCRIPTION %I SET (slot_name = NONE);', subname); diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index c3c8229c38..407578abb8 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -46,6 +46,8 @@ use std::process::Command; use std::str::FromStr; use std::sync::Arc; use std::time::Duration; +use std::time::SystemTime; +use std::time::UNIX_EPOCH; use anyhow::{anyhow, bail, Context, Result}; use compute_api::requests::ConfigurationRequest; @@ -59,6 +61,7 @@ use nix::sys::signal::Signal; use pageserver_api::shard::ShardStripeSize; use reqwest::header::CONTENT_TYPE; use serde::{Deserialize, Serialize}; +use tracing::debug; use url::Host; use utils::id::{NodeId, TenantId, TimelineId}; @@ -81,8 +84,10 @@ pub struct EndpointConf { internal_http_port: u16, pg_version: u32, skip_pg_catalog_updates: bool, + reconfigure_concurrency: usize, drop_subscriptions_before_start: bool, features: Vec, + cluster: Option, } // @@ -179,7 +184,9 @@ impl ComputeControlPlane { // we also skip catalog updates in the cloud. 
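+ // reconfigure_concurrency defaults to 1 (the previously hard-coded value) and + // 'cluster' to None; a cluster only appears when a test respec()s the endpoint.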
skip_pg_catalog_updates, drop_subscriptions_before_start, + reconfigure_concurrency: 1, features: vec![], + cluster: None, }); ep.create_endpoint_dir()?; @@ -196,7 +203,9 @@ impl ComputeControlPlane { pg_version, skip_pg_catalog_updates, drop_subscriptions_before_start, + reconfigure_concurrency: 1, features: vec![], + cluster: None, })?, )?; std::fs::write( @@ -261,8 +270,11 @@ pub struct Endpoint { skip_pg_catalog_updates: bool, drop_subscriptions_before_start: bool, + reconfigure_concurrency: usize, // Feature flags features: Vec, + // Cluster settings + cluster: Option, } #[derive(PartialEq, Eq)] @@ -302,6 +314,8 @@ impl Endpoint { let conf: EndpointConf = serde_json::from_slice(&std::fs::read(entry.path().join("endpoint.json"))?)?; + debug!("serialized endpoint conf: {:?}", conf); + Ok(Endpoint { pg_address: SocketAddr::new(IpAddr::from(Ipv4Addr::LOCALHOST), conf.pg_port), external_http_address: SocketAddr::new( @@ -319,8 +333,10 @@ impl Endpoint { tenant_id: conf.tenant_id, pg_version: conf.pg_version, skip_pg_catalog_updates: conf.skip_pg_catalog_updates, + reconfigure_concurrency: conf.reconfigure_concurrency, drop_subscriptions_before_start: conf.drop_subscriptions_before_start, features: conf.features, + cluster: conf.cluster, }) } @@ -607,7 +623,7 @@ impl Endpoint { }; // Create spec file - let spec = ComputeSpec { + let mut spec = ComputeSpec { skip_pg_catalog_updates: self.skip_pg_catalog_updates, format_version: 1.0, operation_uuid: None, @@ -640,7 +656,7 @@ impl Endpoint { Vec::new() }, settings: None, - postgresql_conf: Some(postgresql_conf), + postgresql_conf: Some(postgresql_conf.clone()), }, delta_operations: None, tenant_id: Some(self.tenant_id), @@ -653,9 +669,35 @@ impl Endpoint { pgbouncer_settings: None, shard_stripe_size: Some(shard_stripe_size), local_proxy_config: None, - reconfigure_concurrency: 1, + reconfigure_concurrency: self.reconfigure_concurrency, drop_subscriptions_before_start: self.drop_subscriptions_before_start, }; + + // this strange code is needed to support respec() in tests + if self.cluster.is_some() { + debug!("Cluster is already set in the endpoint spec, using it"); + spec.cluster = self.cluster.clone().unwrap(); + + debug!("spec.cluster {:?}", spec.cluster); + + // fill missing fields again + if create_test_user { + spec.cluster.roles.push(Role { + name: PgIdent::from_str("test").unwrap(), + encrypted_password: None, + options: None, + }); + spec.cluster.databases.push(Database { + name: PgIdent::from_str("neondb").unwrap(), + owner: PgIdent::from_str("test").unwrap(), + options: None, + restrict_conn: false, + invalid: false, + }); + } + spec.cluster.postgresql_conf = Some(postgresql_conf); + } + let spec_path = self.endpoint_path().join("spec.json"); std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?; @@ -673,18 +715,14 @@ impl Endpoint { println!("Also at '{}'", conn_str); } let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl")); - //cmd.args([ - // "--external-http-port", - // &self.external_http_address.port().to_string(), - //]) - //.args([ - // "--internal-http-port", - // &self.internal_http_address.port().to_string(), - //]) cmd.args([ - "--http-port", + "--external-http-port", &self.external_http_address.port().to_string(), ]) + .args([ + "--internal-http-port", + &self.internal_http_address.port().to_string(), + ]) .args(["--pgdata", self.pgdata().to_str().unwrap()]) .args(["--connstr", &conn_str]) .args([ @@ -701,20 +739,16 @@ impl Endpoint { ]) // TODO: It would be nice if we generated compute IDs 
with the same // algorithm as the real control plane. - // - // TODO: Add this back when - // https://github.com/neondatabase/neon/pull/10747 is merged. - // - //.args([ - // "--compute-id", - // &format!( - // "compute-{}", - // SystemTime::now() - // .duration_since(UNIX_EPOCH) - // .unwrap() - // .as_secs() - // ), - //]) + .args([ + "--compute-id", + &format!( + "compute-{}", + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_secs() + ), + ]) .stdin(std::process::Stdio::null()) .stderr(logfile.try_clone()?) .stdout(logfile); diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 28d130d9e0..2bf89b7bfa 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -335,13 +335,21 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'checkpoint_distance' as an integer")?, - checkpoint_timeout: settings.remove("checkpoint_timeout").map(|x| x.to_string()), + checkpoint_timeout: settings + .remove("checkpoint_timeout") + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'checkpoint_timeout' as duration")?, compaction_target_size: settings .remove("compaction_target_size") .map(|x| x.parse::()) .transpose() .context("Failed to parse 'compaction_target_size' as an integer")?, - compaction_period: settings.remove("compaction_period").map(|x| x.to_string()), + compaction_period: settings + .remove("compaction_period") + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'compaction_period' as duration")?, compaction_threshold: settings .remove("compaction_threshold") .map(|x| x.parse::()) @@ -387,7 +395,10 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'gc_horizon' as an integer")?, - gc_period: settings.remove("gc_period").map(|x| x.to_string()), + gc_period: settings.remove("gc_period") + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'gc_period' as duration")?, image_creation_threshold: settings .remove("image_creation_threshold") .map(|x| x.parse::()) @@ -403,13 +414,20 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'image_creation_preempt_threshold' as integer")?, - pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()), + pitr_interval: settings.remove("pitr_interval") + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'pitr_interval' as duration")?, walreceiver_connect_timeout: settings .remove("walreceiver_connect_timeout") - .map(|x| x.to_string()), + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'walreceiver_connect_timeout' as duration")?, lagging_wal_timeout: settings .remove("lagging_wal_timeout") - .map(|x| x.to_string()), + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'lagging_wal_timeout' as duration")?, max_lsn_wal_lag: settings .remove("max_lsn_wal_lag") .map(|x| x.parse::()) @@ -427,8 +445,14 @@ impl PageServerNode { .context("Failed to parse 'min_resident_size_override' as integer")?, evictions_low_residence_duration_metric_threshold: settings .remove("evictions_low_residence_duration_metric_threshold") - .map(|x| x.to_string()), - heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()), + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'evictions_low_residence_duration_metric_threshold' as duration")?, + heatmap_period: settings + .remove("heatmap_period") + 
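// humantime accepts values like "300s", "20 s" or "7 days" and yields a + // std::time::Duration; transpose() turns Option<Result<_, _>> into + // Result<Option<_>, _> so a malformed setting becomes an error with context. +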
.map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'heatmap_period' as duration")?, lazy_slru_download: settings .remove("lazy_slru_download") .map(|x| x.parse::()) @@ -439,10 +463,15 @@ impl PageServerNode { .map(serde_json::from_str) .transpose() .context("parse `timeline_get_throttle` from json")?, - lsn_lease_length: settings.remove("lsn_lease_length").map(|x| x.to_string()), + lsn_lease_length: settings.remove("lsn_lease_length") + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'lsn_lease_length' as duration")?, lsn_lease_length_for_ts: settings .remove("lsn_lease_length_for_ts") - .map(|x| x.to_string()), + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'lsn_lease_length_for_ts' as duration")?, timeline_offloading: settings .remove("timeline_offloading") .map(|x| x.parse::()) diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index 83faf6b4af..40b86e4110 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -22,7 +22,7 @@ use pageserver_api::{ }; use pageserver_client::mgmt_api::{self}; use reqwest::{Method, StatusCode, Url}; -use utils::id::{NodeId, TenantId}; +use utils::id::{NodeId, TenantId, TimelineId}; use pageserver_api::controller_api::{ NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy, @@ -47,6 +47,9 @@ enum Command { listen_http_addr: String, #[arg(long)] listen_http_port: u16, + #[arg(long)] + listen_https_port: Option, + #[arg(long)] availability_zone_id: String, }, @@ -239,6 +242,19 @@ enum Command { #[arg(long)] scheduling_policy: SkSchedulingPolicyArg, }, + /// Downloads any missing heatmap layers for all shard for a given timeline + DownloadHeatmapLayers { + /// Tenant ID or tenant shard ID. When an unsharded tenant ID is specified, + /// the operation is performed on all shards. When a sharded tenant ID is + /// specified, the operation is only performed on the specified shard. 
+ #[arg(long)] + tenant_shard_id: TenantShardId, + #[arg(long)] + timeline_id: TimelineId, + /// Optional: Maximum download concurrency (default is 16) + #[arg(long)] + concurrency: Option, + }, } #[derive(Parser)] @@ -381,6 +397,7 @@ async fn main() -> anyhow::Result<()> { listen_pg_port, listen_http_addr, listen_http_port, + listen_https_port, availability_zone_id, } => { storcon_client @@ -393,6 +410,7 @@ async fn main() -> anyhow::Result<()> { listen_pg_port, listen_http_addr, listen_http_port, + listen_https_port, availability_zone_id: AvailabilityZone(availability_zone_id), }), ) @@ -941,7 +959,7 @@ async fn main() -> anyhow::Result<()> { threshold: threshold.into(), }, )), - heatmap_period: Some("300s".to_string()), + heatmap_period: Some(Duration::from_secs(300)), ..Default::default() }, }) @@ -1247,6 +1265,24 @@ async fn main() -> anyhow::Result<()> { String::from(scheduling_policy) ); } + Command::DownloadHeatmapLayers { + tenant_shard_id, + timeline_id, + concurrency, + } => { + let mut path = format!( + "/v1/tenant/{}/timeline/{}/download_heatmap_layers", + tenant_shard_id, timeline_id, + ); + + if let Some(c) = concurrency { + path = format!("{path}?concurrency={c}"); + } + + storcon_client + .dispatch::<(), ()>(Method::POST, path, None) + .await?; + } } Ok(()) diff --git a/docker-compose/compute_wrapper/shell/compute.sh b/docker-compose/compute_wrapper/shell/compute.sh index b4f8d3d66a..9dbdcce69f 100755 --- a/docker-compose/compute_wrapper/shell/compute.sh +++ b/docker-compose/compute_wrapper/shell/compute.sh @@ -77,4 +77,5 @@ echo "Start compute node" /usr/local/bin/compute_ctl --pgdata /var/db/postgres/compute \ -C "postgresql://cloud_admin@localhost:55433/postgres" \ -b /usr/local/bin/postgres \ + --compute-id "compute-$RANDOM" \ -S ${SPEC_FILE} diff --git a/docker-compose/docker_compose_test.sh b/docker-compose/docker_compose_test.sh index dd520d4986..5b3cfc74eb 100755 --- a/docker-compose/docker_compose_test.sh +++ b/docker-compose/docker_compose_test.sh @@ -81,15 +81,8 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do [ $EXT_SUCCESS -eq 0 ] && FAILED=$(tail -1 testout.txt | awk '{for(i=1;i<=NF;i++){print "/ext-src/"$i;}}') [ $CONTRIB_SUCCESS -eq 0 ] && CONTRIB_FAILED=$(tail -1 testout_contrib.txt | awk '{for(i=0;i<=NF;i++){print "/postgres/contrib/"$i;}}') for d in $FAILED $CONTRIB_FAILED; do - dn="$(basename $d)" - rm -rf $dn - mkdir $dn - docker cp $TEST_CONTAINER_NAME:$d/regression.diffs $dn || [ $? -eq 1 ] - docker cp $TEST_CONTAINER_NAME:$d/regression.out $dn || [ $? -eq 1 ] - cat $dn/regression.out $dn/regression.diffs || true - rm -rf $dn + docker exec $TEST_CONTAINER_NAME bash -c 'for file in $(find '"$d"' -name regression.diffs -o -name regression.out); do cat $file; done' || [ $? 
-eq 1 ] done - rm -rf $FAILED exit 1 fi fi diff --git a/docker-compose/ext-src/pg_repack-src/test-upgrade.sh b/docker-compose/ext-src/pg_repack-src/test-upgrade.sh new file mode 100755 index 0000000000..5021eb4027 --- /dev/null +++ b/docker-compose/ext-src/pg_repack-src/test-upgrade.sh @@ -0,0 +1,5 @@ +#!/bin/sh +set -ex +cd "$(dirname ${0})" +PG_REGRESS=$(dirname "$(pg_config --pgxs)")/../test/regress/pg_regress +${PG_REGRESS} --use-existing --inputdir=./regress --bindir='/usr/local/pgsql/bin' --dbname=contrib_regression repack-setup repack-run error-on-invalid-idx no-error-on-invalid-idx after-schema repack-check nosuper get_order_by trigger diff --git a/docker-compose/ext-src/pg_semver-src/test-upgrade.patch b/docker-compose/ext-src/pg_semver-src/test-upgrade-16.patch similarity index 100% rename from docker-compose/ext-src/pg_semver-src/test-upgrade.patch rename to docker-compose/ext-src/pg_semver-src/test-upgrade-16.patch diff --git a/docker-compose/ext-src/pg_semver-src/test-upgrade-17.patch b/docker-compose/ext-src/pg_semver-src/test-upgrade-17.patch new file mode 100644 index 0000000000..2d0bf280db --- /dev/null +++ b/docker-compose/ext-src/pg_semver-src/test-upgrade-17.patch @@ -0,0 +1,24 @@ +diff --git a/test/sql/base.sql b/test/sql/base.sql +index 53adb30..2eed91b 100644 +--- a/test/sql/base.sql ++++ b/test/sql/base.sql +@@ -2,7 +2,6 @@ + BEGIN; + + \i test/pgtap-core.sql +-CREATE EXTENSION semver; + + SELECT plan(334); + --SELECT * FROM no_plan(); +diff --git a/test/sql/corpus.sql b/test/sql/corpus.sql +index c0fe98e..39cdd2e 100644 +--- a/test/sql/corpus.sql ++++ b/test/sql/corpus.sql +@@ -4,7 +4,6 @@ BEGIN; + -- Test the SemVer corpus from https://regex101.com/r/Ly7O1x/3/. + + \i test/pgtap-core.sql +-CREATE EXTENSION semver; + + SELECT plan(76); + --SELECT * FROM no_plan(); diff --git a/docker-compose/ext-src/pg_semver-src/test-upgrade.sh b/docker-compose/ext-src/pg_semver-src/test-upgrade.sh index e1541f272a..18b2848fd1 100755 --- a/docker-compose/ext-src/pg_semver-src/test-upgrade.sh +++ b/docker-compose/ext-src/pg_semver-src/test-upgrade.sh @@ -1,6 +1,7 @@ #!/bin/sh set -ex cd "$(dirname ${0})" -patch -p1 , pub name: Option, @@ -283,7 +283,7 @@ pub struct DeltaOp { /// Rust representation of Postgres role info with only those fields /// that matter for us. -#[derive(Clone, Debug, Deserialize, Serialize)] +#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)] pub struct Role { pub name: PgIdent, pub encrypted_password: Option, @@ -292,7 +292,7 @@ pub struct Role { /// Rust representation of Postgres database info with only those fields /// that matter for us. -#[derive(Clone, Debug, Deserialize, Serialize)] +#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)] pub struct Database { pub name: PgIdent, pub owner: PgIdent, @@ -308,7 +308,7 @@ pub struct Database { /// Common type representing both SQL statement params with or without value, /// like `LOGIN` or `OWNER username` in the `CREATE/ALTER ROLE`, and config /// options like `wal_level = logical`. 
-#[derive(Clone, Debug, Deserialize, Serialize)] +#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)] pub struct GenericOption { pub name: String, pub value: Option, diff --git a/libs/http-utils/src/pprof.rs b/libs/http-utils/src/pprof.rs index dd57f9ed4b..fe1cc10838 100644 --- a/libs/http-utils/src/pprof.rs +++ b/libs/http-utils/src/pprof.rs @@ -2,7 +2,6 @@ use anyhow::bail; use flate2::write::{GzDecoder, GzEncoder}; use flate2::Compression; use itertools::Itertools as _; -use once_cell::sync::Lazy; use pprof::protos::{Function, Line, Location, Message as _, Profile}; use regex::Regex; @@ -58,38 +57,30 @@ pub fn symbolize(mut profile: Profile) -> anyhow::Result { // Resolve the line and function for each location. backtrace::resolve(loc.address as *mut c_void, |symbol| { - let Some(symname) = symbol.name() else { + let Some(symbol_name) = symbol.name() else { return; }; - let mut name = symname.to_string(); - // Strip the Rust monomorphization suffix from the symbol name. - static SUFFIX_REGEX: Lazy = - Lazy::new(|| Regex::new("::h[0-9a-f]{16}$").expect("invalid regex")); - if let Some(m) = SUFFIX_REGEX.find(&name) { - name.truncate(m.start()); - } - - let function_id = match functions.get(&name) { - Some(function) => function.id, - None => { - let id = functions.len() as u64 + 1; - let system_name = String::from_utf8_lossy(symname.as_bytes()); + let function_name = format!("{symbol_name:#}"); + let functions_len = functions.len(); + let function_id = functions + .entry(function_name) + .or_insert_with_key(|function_name| { + let function_id = functions_len as u64 + 1; + let system_name = String::from_utf8_lossy(symbol_name.as_bytes()); let filename = symbol .filename() .map(|path| path.to_string_lossy()) .unwrap_or(Cow::Borrowed("")); - let function = Function { - id, - name: string_id(&name), + Function { + id: function_id, + name: string_id(function_name), system_name: string_id(&system_name), filename: string_id(&filename), ..Default::default() - }; - functions.insert(name, function); - id - } - }; + } + }) + .id; loc.line.push(Line { function_id, line: symbol.lineno().unwrap_or(0) as i64, diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 79f068a47b..1aff5a7012 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -122,6 +122,8 @@ pub struct ConfigToml { pub page_service_pipelining: PageServicePipeliningConfig, pub get_vectored_concurrent_io: GetVectoredConcurrentIo, pub enable_read_path_debugging: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub validate_wal_contiguity: Option, } #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] @@ -351,7 +353,7 @@ pub struct TenantConfigToml { /// Enable rel_size_v2 for this tenant. Once enabled, the tenant will persist this information into /// `index_part.json`, and it cannot be reversed. - pub rel_size_v2_enabled: Option, + pub rel_size_v2_enabled: bool, // gc-compaction related configs /// Enable automatic gc-compaction trigger on this tenant. @@ -521,6 +523,7 @@ impl Default for ConfigToml { } else { None }, + validate_wal_contiguity: None, } } } @@ -544,10 +547,11 @@ pub mod tenant_conf_defaults { pub const DEFAULT_COMPACTION_PERIOD: &str = "20 s"; pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10; - // This value needs to be tuned to avoid OOM. We have 3/4 of the total CPU threads to do background works, that's 16*3/4=9 on - // most of our pageservers. 
Compaction ~50 layers requires about 2GB memory (could be reduced later by optimizing L0 hole - // calculation to avoid loading all keys into the memory). So with this config, we can get a maximum peak compaction usage of 18GB. - pub const DEFAULT_COMPACTION_UPPER_LIMIT: usize = 50; + // This value needs to be tuned to avoid OOM. We have 3/4*CPUs threads for L0 compaction, that's + // 3/4*16=9 on most of our pageservers. Compacting 20 layers requires about 1 GB memory (could + // be reduced later by optimizing L0 hole calculation to avoid loading all keys into memory). So + // with this config, we can get a maximum peak compaction usage of 9 GB. + pub const DEFAULT_COMPACTION_UPPER_LIMIT: usize = 20; pub const DEFAULT_COMPACTION_L0_FIRST: bool = false; pub const DEFAULT_COMPACTION_L0_SEMAPHORE: bool = true; @@ -633,7 +637,7 @@ impl Default for TenantConfigToml { lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS, timeline_offloading: true, wal_receiver_protocol_override: None, - rel_size_v2_enabled: None, + rel_size_v2_enabled: false, gc_compaction_enabled: DEFAULT_GC_COMPACTION_ENABLED, gc_compaction_initial_threshold_kb: DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB, gc_compaction_ratio_percent: DEFAULT_GC_COMPACTION_RATIO_PERCENT, diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index 42f6e47e63..f94bfab581 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -57,6 +57,7 @@ pub struct NodeRegisterRequest { pub listen_http_addr: String, pub listen_http_port: u16, + pub listen_https_port: Option, pub availability_zone_id: AvailabilityZone, } @@ -105,6 +106,7 @@ pub struct TenantLocateResponseShard { pub listen_http_addr: String, pub listen_http_port: u16, + pub listen_https_port: Option, } #[derive(Serialize, Deserialize)] @@ -148,6 +150,7 @@ pub struct NodeDescribeResponse { pub listen_http_addr: String, pub listen_http_port: u16, + pub listen_https_port: Option, pub listen_pg_addr: String, pub listen_pg_port: u16, diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index dbd45da314..b88a2e46a1 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -1,10 +1,12 @@ use anyhow::{bail, Result}; use byteorder::{ByteOrder, BE}; +use bytes::Bytes; use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::Oid; use postgres_ffi::RepOriginId; use serde::{Deserialize, Serialize}; use std::{fmt, ops::Range}; +use utils::const_assert; use crate::reltag::{BlockNumber, RelTag, SlruKind}; @@ -49,6 +51,64 @@ pub const AUX_KEY_PREFIX: u8 = 0x62; /// The key prefix of ReplOrigin keys. pub const REPL_ORIGIN_KEY_PREFIX: u8 = 0x63; +/// The key prefix of db directory keys. +pub const DB_DIR_KEY_PREFIX: u8 = 0x64; + +/// The key prefix of rel directory keys. +pub const REL_DIR_KEY_PREFIX: u8 = 0x65; + +#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)] +pub enum RelDirExists { + Exists, + Removed, +} + +#[derive(Debug)] +pub struct DecodeError; + +impl fmt::Display for DecodeError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "invalid marker") + } +} + +impl std::error::Error for DecodeError {} + +impl RelDirExists { + /// The value of the rel directory keys that indicates the existence of a relation. 
+ const REL_EXISTS_MARKER: Bytes = Bytes::from_static(b"r"); + + pub fn encode(&self) -> Bytes { + match self { + Self::Exists => Self::REL_EXISTS_MARKER.clone(), + Self::Removed => SPARSE_TOMBSTONE_MARKER.clone(), + } + } + + pub fn decode_option(data: Option>) -> Result { + match data { + Some(marker) if marker.as_ref() == Self::REL_EXISTS_MARKER => Ok(Self::Exists), + // Any other marker is invalid + Some(_) => Err(DecodeError), + None => Ok(Self::Removed), + } + } + + pub fn decode(data: impl AsRef<[u8]>) -> Result { + let data = data.as_ref(); + if data == Self::REL_EXISTS_MARKER { + Ok(Self::Exists) + } else if data == SPARSE_TOMBSTONE_MARKER { + Ok(Self::Removed) + } else { + Err(DecodeError) + } + } +} + +/// A tombstone in the sparse keyspace, which is an empty buffer. +pub const SPARSE_TOMBSTONE_MARKER: Bytes = Bytes::from_static(b""); + /// Check if the key falls in the range of metadata keys. pub const fn is_metadata_key_slice(key: &[u8]) -> bool { key[0] >= METADATA_KEY_BEGIN_PREFIX && key[0] < METADATA_KEY_END_PREFIX @@ -110,6 +170,24 @@ impl Key { } } + pub fn rel_dir_sparse_key_range() -> Range { + Key { + field1: REL_DIR_KEY_PREFIX, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, + }..Key { + field1: REL_DIR_KEY_PREFIX + 1, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, + } + } + /// This function checks more extensively what keys we can take on the write path. /// If a key beginning with 00 does not have a global/default tablespace OID, it /// will be rejected on the write path. @@ -440,6 +518,36 @@ pub fn rel_dir_to_key(spcnode: Oid, dbnode: Oid) -> Key { } } +#[inline(always)] +pub fn rel_tag_sparse_key(spcnode: Oid, dbnode: Oid, relnode: Oid, forknum: u8) -> Key { + Key { + field1: REL_DIR_KEY_PREFIX, + field2: spcnode, + field3: dbnode, + field4: relnode, + field5: forknum, + field6: 1, + } +} + +pub fn rel_tag_sparse_key_range(spcnode: Oid, dbnode: Oid) -> Range { + Key { + field1: REL_DIR_KEY_PREFIX, + field2: spcnode, + field3: dbnode, + field4: 0, + field5: 0, + field6: 0, + }..Key { + field1: REL_DIR_KEY_PREFIX, + field2: spcnode, + field3: dbnode, + field4: u32::MAX, + field5: u8::MAX, + field6: u32::MAX, + } // it's fine to exclude the last key b/c we only use field6 == 1 +} + #[inline(always)] pub fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key { Key { @@ -734,9 +842,9 @@ impl Key { self.field1 == RELATION_SIZE_PREFIX } - pub fn sparse_non_inherited_keyspace() -> Range { + pub const fn sparse_non_inherited_keyspace() -> Range { // The two keys are adjacent; if we will have non-adjancent keys in the future, we should return a keyspace - debug_assert_eq!(AUX_KEY_PREFIX + 1, REPL_ORIGIN_KEY_PREFIX); + const_assert!(AUX_KEY_PREFIX + 1 == REPL_ORIGIN_KEY_PREFIX); Key { field1: AUX_KEY_PREFIX, field2: 0, diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 3d40cfe121..1164048229 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -526,9 +526,13 @@ pub struct TenantConfigPatch { #[derive(Serialize, Deserialize, Debug, Default, Clone, Eq, PartialEq)] pub struct TenantConfig { pub checkpoint_distance: Option, - pub checkpoint_timeout: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub checkpoint_timeout: Option, pub compaction_target_size: Option, - pub compaction_period: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub compaction_period: Option, pub compaction_threshold: Option, pub compaction_upper_limit: 
Option, // defer parsing compaction_algorithm, like eviction_policy @@ -539,22 +543,38 @@ pub struct TenantConfig { pub l0_flush_stall_threshold: Option, pub l0_flush_wait_upload: Option, pub gc_horizon: Option, - pub gc_period: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub gc_period: Option, pub image_creation_threshold: Option, - pub pitr_interval: Option, - pub walreceiver_connect_timeout: Option, - pub lagging_wal_timeout: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub pitr_interval: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub walreceiver_connect_timeout: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub lagging_wal_timeout: Option, pub max_lsn_wal_lag: Option, pub eviction_policy: Option, pub min_resident_size_override: Option, - pub evictions_low_residence_duration_metric_threshold: Option, - pub heatmap_period: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub evictions_low_residence_duration_metric_threshold: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub heatmap_period: Option, pub lazy_slru_download: Option, pub timeline_get_throttle: Option, pub image_layer_creation_check_threshold: Option, pub image_creation_preempt_threshold: Option, - pub lsn_lease_length: Option, - pub lsn_lease_length_for_ts: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub lsn_lease_length: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub lsn_lease_length_for_ts: Option, pub timeline_offloading: Option, pub wal_receiver_protocol_override: Option, pub rel_size_v2_enabled: Option, @@ -564,7 +584,10 @@ pub struct TenantConfig { } impl TenantConfig { - pub fn apply_patch(self, patch: TenantConfigPatch) -> TenantConfig { + pub fn apply_patch( + self, + patch: TenantConfigPatch, + ) -> Result { let Self { mut checkpoint_distance, mut checkpoint_timeout, @@ -604,11 +627,17 @@ impl TenantConfig { } = self; patch.checkpoint_distance.apply(&mut checkpoint_distance); - patch.checkpoint_timeout.apply(&mut checkpoint_timeout); + patch + .checkpoint_timeout + .map(|v| humantime::parse_duration(&v))? + .apply(&mut checkpoint_timeout); patch .compaction_target_size .apply(&mut compaction_target_size); - patch.compaction_period.apply(&mut compaction_period); + patch + .compaction_period + .map(|v| humantime::parse_duration(&v))? + .apply(&mut compaction_period); patch.compaction_threshold.apply(&mut compaction_threshold); patch .compaction_upper_limit @@ -626,15 +655,25 @@ impl TenantConfig { .apply(&mut l0_flush_stall_threshold); patch.l0_flush_wait_upload.apply(&mut l0_flush_wait_upload); patch.gc_horizon.apply(&mut gc_horizon); - patch.gc_period.apply(&mut gc_period); + patch + .gc_period + .map(|v| humantime::parse_duration(&v))? + .apply(&mut gc_period); patch .image_creation_threshold .apply(&mut image_creation_threshold); - patch.pitr_interval.apply(&mut pitr_interval); + patch + .pitr_interval + .map(|v| humantime::parse_duration(&v))? + .apply(&mut pitr_interval); patch .walreceiver_connect_timeout + .map(|v| humantime::parse_duration(&v))? .apply(&mut walreceiver_connect_timeout); - patch.lagging_wal_timeout.apply(&mut lagging_wal_timeout); + patch + .lagging_wal_timeout + .map(|v| humantime::parse_duration(&v))? 
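+ // A malformed duration string now fails the whole patch via '?' instead of + // being stored verbatim, matching the typed Duration fields this struct now uses.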
+ .apply(&mut lagging_wal_timeout); patch.max_lsn_wal_lag.apply(&mut max_lsn_wal_lag); patch.eviction_policy.apply(&mut eviction_policy); patch @@ -642,8 +681,12 @@ impl TenantConfig { .apply(&mut min_resident_size_override); patch .evictions_low_residence_duration_metric_threshold + .map(|v| humantime::parse_duration(&v))? .apply(&mut evictions_low_residence_duration_metric_threshold); - patch.heatmap_period.apply(&mut heatmap_period); + patch + .heatmap_period + .map(|v| humantime::parse_duration(&v))? + .apply(&mut heatmap_period); patch.lazy_slru_download.apply(&mut lazy_slru_download); patch .timeline_get_throttle @@ -654,9 +697,13 @@ impl TenantConfig { patch .image_creation_preempt_threshold .apply(&mut image_creation_preempt_threshold); - patch.lsn_lease_length.apply(&mut lsn_lease_length); + patch + .lsn_lease_length + .map(|v| humantime::parse_duration(&v))? + .apply(&mut lsn_lease_length); patch .lsn_lease_length_for_ts + .map(|v| humantime::parse_duration(&v))? .apply(&mut lsn_lease_length_for_ts); patch.timeline_offloading.apply(&mut timeline_offloading); patch @@ -673,7 +720,7 @@ impl TenantConfig { .gc_compaction_ratio_percent .apply(&mut gc_compaction_ratio_percent); - Self { + Ok(Self { checkpoint_distance, checkpoint_timeout, compaction_target_size, @@ -709,7 +756,7 @@ impl TenantConfig { gc_compaction_enabled, gc_compaction_initial_threshold_kb, gc_compaction_ratio_percent, - } + }) } } @@ -1080,8 +1127,7 @@ pub struct TenantInfo { /// Opaque explanation if gc is being blocked. /// - /// Only looked up for the individual tenant detail, not the listing. This is purely for - /// debugging, not included in openapi. + /// Only looked up for the individual tenant detail, not the listing. #[serde(skip_serializing_if = "Option::is_none")] pub gc_blocking: Option, } @@ -2504,7 +2550,7 @@ mod tests { ..base.clone() }; - let patched = base.apply_patch(decoded.config); + let patched = base.apply_patch(decoded.config).unwrap(); assert_eq!(patched, expected); } diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index 8c024375c1..f74b229ac4 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -9,6 +9,8 @@ use bytes::Bytes; use serde::{Deserialize, Serialize}; use std::io::ErrorKind; use std::net::SocketAddr; +use std::os::fd::AsRawFd; +use std::os::fd::RawFd; use std::pin::Pin; use std::sync::Arc; use std::task::{ready, Poll}; @@ -268,6 +270,7 @@ impl MaybeWriteOnly { } pub struct PostgresBackend { + pub socket_fd: RawFd, framed: MaybeWriteOnly, pub state: ProtoState, @@ -293,9 +296,11 @@ impl PostgresBackend { tls_config: Option>, ) -> io::Result { let peer_addr = socket.peer_addr()?; + let socket_fd = socket.as_raw_fd(); let stream = MaybeTlsStream::Unencrypted(socket); Ok(Self { + socket_fd, framed: MaybeWriteOnly::Full(Framed::new(stream)), state: ProtoState::Initialization, auth_type, @@ -307,6 +312,7 @@ impl PostgresBackend { impl PostgresBackend { pub fn new_from_io( + socket_fd: RawFd, socket: IO, peer_addr: SocketAddr, auth_type: AuthType, @@ -315,6 +321,7 @@ impl PostgresBackend { let stream = MaybeTlsStream::Unencrypted(socket); Ok(Self { + socket_fd, framed: MaybeWriteOnly::Full(Framed::new(stream)), state: ProtoState::Initialization, auth_type, diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs index 0239b56d9c..301bc2f16e 100644 --- a/libs/postgres_ffi/src/lib.rs +++ b/libs/postgres_ffi/src/lib.rs @@ -278,7 +278,7 @@ pub fn generate_pg_control( checkpoint_bytes: &[u8], lsn: Lsn, 
pg_version: u32, -) -> anyhow::Result<(Bytes, u64)> { +) -> anyhow::Result<(Bytes, u64, bool)> { dispatch_pgversion!( pg_version, pgv::xlog_utils::generate_pg_control(pg_control_bytes, checkpoint_bytes, lsn), diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index 852b20eace..14fb1f2a1f 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -124,23 +124,59 @@ pub fn normalize_lsn(lsn: Lsn, seg_sz: usize) -> Lsn { } } +/// Generate a pg_control file for a basebackup, for starting up Postgres at the given LSN +/// +/// 'pg_control_bytes' and 'checkpoint_bytes' are the contents of those keys persisted in +/// the pageserver. They use the same format as the PostgreSQL control file and the +/// checkpoint record, but see walingest.rs for how exactly they are kept up to date. +/// 'lsn' is the LSN at which we're starting up. +/// +/// Returns: +/// - pg_control file contents +/// - system_identifier, extracted from the persisted information +/// - true if we're starting up from a "clean shutdown", i.e. if there was a shutdown +/// checkpoint at the given LSN pub fn generate_pg_control( pg_control_bytes: &[u8], checkpoint_bytes: &[u8], lsn: Lsn, -) -> anyhow::Result<(Bytes, u64)> { +) -> anyhow::Result<(Bytes, u64, bool)> { let mut pg_control = ControlFileData::decode(pg_control_bytes)?; let mut checkpoint = CheckPoint::decode(checkpoint_bytes)?; // Generate new pg_control needed for bootstrap + // + // NB: In the checkpoint struct that we persist in the pageserver, we have a different + // convention for the 'redo' field than in PostgreSQL: On a shutdown checkpoint, + // 'redo' points to the *end* of the checkpoint WAL record. In PostgreSQL, it points to + // the beginning. Furthermore, on an online checkpoint, 'redo' is set to 0. + // + // We didn't always have this convention, however, and old persisted records will have + // old REDO values that point to some old LSN. + // + // The upshot is that if 'redo' is equal to the "current" LSN, there was a shutdown + // checkpoint record at that point in WAL, with no new WAL records after it. That case + // can be treated as starting from a clean shutdown. All other cases are treated as + // non-clean shutdown. In Neon, we don't do WAL replay at startup in either case, so + // that distinction doesn't matter very much. As of this writing, it only affects + // whether the persisted pg_stats information can be used or not. + // + // In the Checkpoint struct in the returned pg_control file, the redo pointer is + // always set to the LSN we're starting at, to hint that no WAL replay is required. + // (There's some neon-specific code in Postgres startup to make that work, though. + // Just setting the redo pointer is not sufficient.) + let was_shutdown = Lsn(checkpoint.redo) == lsn; checkpoint.redo = normalize_lsn(lsn, WAL_SEGMENT_SIZE).0; - //save new values in pg_control + // We use DBState_DB_SHUTDOWNED even if it was not a clean shutdown. The + // neon-specific code at postgres startup ignores the state stored in the control + // file, similar to archive recovery in standalone PostgreSQL. Similarly, the + // checkPoint pointer is ignored, so just set it to 0.
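+ // (Worked example, hypothetical LSNs: a shutdown checkpoint record ending + // exactly at lsn 0/169AD58 yields redo == lsn and was_shutdown == true; any + // WAL written after that checkpoint makes redo != lsn, i.e. not clean.)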
pg_control.checkPoint = 0; pg_control.checkPointCopy = checkpoint; pg_control.state = DBState_DB_SHUTDOWNED; - Ok((pg_control.encode(), pg_control.system_identifier)) + Ok((pg_control.encode(), pg_control.system_identifier, was_shutdown)) } pub fn get_current_timestamp() -> TimestampTz { diff --git a/libs/proxy/tokio-postgres2/Cargo.toml b/libs/proxy/tokio-postgres2/Cargo.toml index ade0ffc9f6..161c6b8309 100644 --- a/libs/proxy/tokio-postgres2/Cargo.toml +++ b/libs/proxy/tokio-postgres2/Cargo.toml @@ -5,18 +5,15 @@ edition = "2021" license = "MIT/Apache-2.0" [dependencies] -async-trait.workspace = true bytes.workspace = true -byteorder.workspace = true fallible-iterator.workspace = true futures-util = { workspace = true, features = ["sink"] } log = "0.4" parking_lot.workspace = true -percent-encoding = "2.0" pin-project-lite.workspace = true phf = "0.11" postgres-protocol2 = { path = "../postgres-protocol2" } postgres-types2 = { path = "../postgres-types2" } tokio = { workspace = true, features = ["io-util", "time", "net"] } tokio-util = { workspace = true, features = ["codec"] } -serde = { workspace = true, features = ["derive"] } \ No newline at end of file +serde = { workspace = true, features = ["derive"] } diff --git a/libs/proxy/tokio-postgres2/src/client.rs b/libs/proxy/tokio-postgres2/src/client.rs index 9bbbd4c260..46151ab924 100644 --- a/libs/proxy/tokio-postgres2/src/client.rs +++ b/libs/proxy/tokio-postgres2/src/client.rs @@ -10,8 +10,8 @@ use crate::simple_query::SimpleQueryStream; use crate::types::{Oid, ToSql, Type}; use crate::{ - prepare, query, simple_query, slice_iter, CancelToken, Error, ReadyForQueryStatus, Row, - SimpleQueryMessage, Statement, ToStatement, Transaction, TransactionBuilder, + query, simple_query, slice_iter, CancelToken, Error, ReadyForQueryStatus, Row, + SimpleQueryMessage, Statement, Transaction, TransactionBuilder, }; use bytes::BytesMut; use fallible_iterator::FallibleIterator; @@ -54,18 +54,18 @@ impl Responses { } /// A cache of type info and prepared statements for fetching type info -/// (corresponding to the queries in the [prepare] module). +/// (corresponding to the queries in the [crate::prepare] module). #[derive(Default)] struct CachedTypeInfo { /// A statement for basic information for a type from its - /// OID. Corresponds to [TYPEINFO_QUERY](prepare::TYPEINFO_QUERY) (or its + /// OID. Corresponds to [TYPEINFO_QUERY](crate::prepare::TYPEINFO_QUERY) (or its /// fallback). typeinfo: Option, /// A statement for getting information for a composite type from its OID. - /// Corresponds to [TYPEINFO_QUERY](prepare::TYPEINFO_COMPOSITE_QUERY). + /// Corresponds to [TYPEINFO_QUERY](crate::prepare::TYPEINFO_COMPOSITE_QUERY). typeinfo_composite: Option, /// A statement for getting information for a composite type from its OID. - /// Corresponds to [TYPEINFO_QUERY](prepare::TYPEINFO_COMPOSITE_QUERY) (or + /// Corresponds to [TYPEINFO_QUERY](crate::prepare::TYPEINFO_COMPOSITE_QUERY) (or /// its fallback). typeinfo_enum: Option, @@ -190,26 +190,6 @@ impl Client { &self.inner } - /// Creates a new prepared statement. - /// - /// Prepared statements can be executed repeatedly, and may contain query parameters (indicated by `$1`, `$2`, etc), - /// which are set when executed. Prepared statements can only be used with the connection that created them. - pub async fn prepare(&self, query: &str) -> Result { - self.prepare_typed(query, &[]).await - } - - /// Like `prepare`, but allows the types of query parameters to be explicitly specified. 
- /// - /// The list of types may be smaller than the number of parameters - the types of the remaining parameters will be - /// inferred. For example, `client.prepare_typed(query, &[])` is equivalent to `client.prepare(query)`. - pub async fn prepare_typed( - &self, - query: &str, - parameter_types: &[Type], - ) -> Result { - prepare::prepare(&self.inner, query, parameter_types).await - } - /// Executes a statement, returning a vector of the resulting rows. /// /// A statement may contain parameters, specified by `$n`, where `n` is the index of the parameter of the list @@ -222,14 +202,11 @@ impl Client { /// # Panics /// /// Panics if the number of parameters provided does not match the number expected. - pub async fn query( + pub async fn query( &self, - statement: &T, + statement: Statement, params: &[&(dyn ToSql + Sync)], - ) -> Result, Error> - where - T: ?Sized + ToStatement, - { + ) -> Result, Error> { self.query_raw(statement, slice_iter(params)) .await? .try_collect() @@ -250,13 +227,15 @@ impl Client { /// Panics if the number of parameters provided does not match the number expected. /// /// [`query`]: #method.query - pub async fn query_raw<'a, T, I>(&self, statement: &T, params: I) -> Result + pub async fn query_raw<'a, I>( + &self, + statement: Statement, + params: I, + ) -> Result where - T: ?Sized + ToStatement, I: IntoIterator, I::IntoIter: ExactSizeIterator, { - let statement = statement.__convert().into_statement(self).await?; query::query(&self.inner, statement, params).await } @@ -271,55 +250,6 @@ impl Client { query::query_txt(&self.inner, statement, params).await } - /// Executes a statement, returning the number of rows modified. - /// - /// A statement may contain parameters, specified by `$n`, where `n` is the index of the parameter of the list - /// provided, 1-indexed. - /// - /// The `statement` argument can either be a `Statement`, or a raw query string. If the same statement will be - /// repeatedly executed (perhaps with different query parameters), consider preparing the statement up front - /// with the `prepare` method. - /// - /// If the statement does not modify any rows (e.g. `SELECT`), 0 is returned. - /// - /// # Panics - /// - /// Panics if the number of parameters provided does not match the number expected. - pub async fn execute( - &self, - statement: &T, - params: &[&(dyn ToSql + Sync)], - ) -> Result - where - T: ?Sized + ToStatement, - { - self.execute_raw(statement, slice_iter(params)).await - } - - /// The maximally flexible version of [`execute`]. - /// - /// A statement may contain parameters, specified by `$n`, where `n` is the index of the parameter of the list - /// provided, 1-indexed. - /// - /// The `statement` argument can either be a `Statement`, or a raw query string. If the same statement will be - /// repeatedly executed (perhaps with different query parameters), consider preparing the statement up front - /// with the `prepare` method. - /// - /// # Panics - /// - /// Panics if the number of parameters provided does not match the number expected. - /// - /// [`execute`]: #method.execute - pub async fn execute_raw<'a, T, I>(&self, statement: &T, params: I) -> Result - where - T: ?Sized + ToStatement, - I: IntoIterator, - I::IntoIter: ExactSizeIterator, - { - let statement = statement.__convert().into_statement(self).await?; - query::execute(self.inner(), statement, params).await - } - /// Executes a sequence of SQL statements using the simple query protocol, returning the resulting rows. 
/// /// Statements should be separated by semicolons. If an error occurs, execution of the sequence will stop at that diff --git a/libs/proxy/tokio-postgres2/src/generic_client.rs b/libs/proxy/tokio-postgres2/src/generic_client.rs index 768213f8ed..042b5a675e 100644 --- a/libs/proxy/tokio-postgres2/src/generic_client.rs +++ b/libs/proxy/tokio-postgres2/src/generic_client.rs @@ -1,7 +1,8 @@ +#![allow(async_fn_in_trait)] + use crate::query::RowStream; use crate::types::Type; use crate::{Client, Error, Transaction}; -use async_trait::async_trait; use postgres_protocol2::Oid; mod private { @@ -11,7 +12,6 @@ mod private { /// A trait allowing abstraction over connections and transactions. /// /// This trait is "sealed", and cannot be implemented outside of this crate. -#[async_trait] pub trait GenericClient: private::Sealed { /// Like `Client::query_raw_txt`. async fn query_raw_txt(&self, statement: &str, params: I) -> Result @@ -26,7 +26,6 @@ pub trait GenericClient: private::Sealed { impl private::Sealed for Client {} -#[async_trait] impl GenericClient for Client { async fn query_raw_txt(&self, statement: &str, params: I) -> Result where @@ -39,14 +38,12 @@ impl GenericClient for Client { /// Query for type information async fn get_type(&self, oid: Oid) -> Result { - self.get_type(oid).await + crate::prepare::get_type(self.inner(), oid).await } } impl private::Sealed for Transaction<'_> {} -#[async_trait] -#[allow(clippy::needless_lifetimes)] impl GenericClient for Transaction<'_> { async fn query_raw_txt(&self, statement: &str, params: I) -> Result where diff --git a/libs/proxy/tokio-postgres2/src/lib.rs b/libs/proxy/tokio-postgres2/src/lib.rs index 9155dd8279..7426279167 100644 --- a/libs/proxy/tokio-postgres2/src/lib.rs +++ b/libs/proxy/tokio-postgres2/src/lib.rs @@ -14,7 +14,6 @@ pub use crate::row::{Row, SimpleQueryRow}; pub use crate::simple_query::SimpleQueryStream; pub use crate::statement::{Column, Statement}; pub use crate::tls::NoTls; -pub use crate::to_statement::ToStatement; pub use crate::transaction::Transaction; pub use crate::transaction_builder::{IsolationLevel, TransactionBuilder}; use crate::types::ToSql; @@ -65,7 +64,6 @@ pub mod row; mod simple_query; mod statement; pub mod tls; -mod to_statement; mod transaction; mod transaction_builder; pub mod types; diff --git a/libs/proxy/tokio-postgres2/src/prepare.rs b/libs/proxy/tokio-postgres2/src/prepare.rs index da0c755c5b..58bbb26cbc 100644 --- a/libs/proxy/tokio-postgres2/src/prepare.rs +++ b/libs/proxy/tokio-postgres2/src/prepare.rs @@ -1,7 +1,6 @@ use crate::client::InnerClient; use crate::codec::FrontendMessage; use crate::connection::RequestMessages; -use crate::error::SqlState; use crate::types::{Field, Kind, Oid, Type}; use crate::{query, slice_iter}; use crate::{Column, Error, Statement}; @@ -13,7 +12,6 @@ use postgres_protocol2::message::backend::Message; use postgres_protocol2::message::frontend; use std::future::Future; use std::pin::Pin; -use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Arc; pub(crate) const TYPEINFO_QUERY: &str = "\ @@ -24,14 +22,6 @@ INNER JOIN pg_catalog.pg_namespace n ON t.typnamespace = n.oid WHERE t.oid = $1 "; -// Range types weren't added until Postgres 9.2, so pg_range may not exist -const TYPEINFO_FALLBACK_QUERY: &str = "\ -SELECT t.typname, t.typtype, t.typelem, NULL::OID, t.typbasetype, n.nspname, t.typrelid -FROM pg_catalog.pg_type t -INNER JOIN pg_catalog.pg_namespace n ON t.typnamespace = n.oid -WHERE t.oid = $1 -"; - const TYPEINFO_ENUM_QUERY: &str = "\ SELECT 
enumlabel FROM pg_catalog.pg_enum @@ -39,14 +29,6 @@ WHERE enumtypid = $1 ORDER BY enumsortorder "; -// Postgres 9.0 didn't have enumsortorder -const TYPEINFO_ENUM_FALLBACK_QUERY: &str = "\ -SELECT enumlabel -FROM pg_catalog.pg_enum -WHERE enumtypid = $1 -ORDER BY oid -"; - pub(crate) const TYPEINFO_COMPOSITE_QUERY: &str = "\ SELECT attname, atttypid FROM pg_catalog.pg_attribute @@ -56,15 +38,13 @@ AND attnum > 0 ORDER BY attnum "; -static NEXT_ID: AtomicUsize = AtomicUsize::new(0); - pub async fn prepare( client: &Arc, + name: &'static str, query: &str, types: &[Type], ) -> Result { - let name = format!("s{}", NEXT_ID.fetch_add(1, Ordering::SeqCst)); - let buf = encode(client, &name, query, types)?; + let buf = encode(client, name, query, types)?; let mut responses = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)))?; match responses.next().await? { @@ -105,10 +85,11 @@ pub async fn prepare( fn prepare_rec<'a>( client: &'a Arc, + name: &'static str, query: &'a str, types: &'a [Type], ) -> Pin> + 'a + Send>> { - Box::pin(prepare(client, query, types)) + Box::pin(prepare(client, name, query, types)) } fn encode(client: &InnerClient, name: &str, query: &str, types: &[Type]) -> Result { @@ -192,13 +173,8 @@ async fn typeinfo_statement(client: &Arc) -> Result stmt, - Err(ref e) if e.code() == Some(&SqlState::UNDEFINED_TABLE) => { - prepare_rec(client, TYPEINFO_FALLBACK_QUERY, &[]).await? - } - Err(e) => return Err(e), - }; + let typeinfo = "neon_proxy_typeinfo"; + let stmt = prepare_rec(client, typeinfo, TYPEINFO_QUERY, &[]).await?; client.set_typeinfo(&stmt); Ok(stmt) @@ -219,13 +195,8 @@ async fn typeinfo_enum_statement(client: &Arc) -> Result stmt, - Err(ref e) if e.code() == Some(&SqlState::UNDEFINED_COLUMN) => { - prepare_rec(client, TYPEINFO_ENUM_FALLBACK_QUERY, &[]).await? - } - Err(e) => return Err(e), - }; + let typeinfo = "neon_proxy_typeinfo_enum"; + let stmt = prepare_rec(client, typeinfo, TYPEINFO_ENUM_QUERY, &[]).await?; client.set_typeinfo_enum(&stmt); Ok(stmt) @@ -255,7 +226,8 @@ async fn typeinfo_composite_statement(client: &Arc) -> Result( - client: &InnerClient, - statement: Statement, - params: I, -) -> Result -where - I: IntoIterator, - I::IntoIter: ExactSizeIterator, -{ - let buf = if log_enabled!(Level::Debug) { - let params = params.into_iter().collect::>(); - debug!( - "executing statement {} with parameters: {:?}", - statement.name(), - BorrowToSqlParamsDebug(params.as_slice()), - ); - encode(client, &statement, params)? - } else { - encode(client, &statement, params)? - }; - let mut responses = start(client, buf).await?; - - let mut rows = 0; - loop { - match responses.next().await? { - Message::DataRow(_) => {} - Message::CommandComplete(body) => { - rows = body - .tag() - .map_err(Error::parse)? 
- .rsplit(' ') - .next() - .unwrap() - .parse() - .unwrap_or(0); - } - Message::EmptyQueryResponse => rows = 0, - Message::ReadyForQuery(_) => return Ok(rows), - _ => return Err(Error::unexpected_message()), - } - } -} - async fn start(client: &InnerClient, buf: Bytes) -> Result { let mut responses = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)))?; diff --git a/libs/proxy/tokio-postgres2/src/statement.rs b/libs/proxy/tokio-postgres2/src/statement.rs index 22e160fc05..591872fbc5 100644 --- a/libs/proxy/tokio-postgres2/src/statement.rs +++ b/libs/proxy/tokio-postgres2/src/statement.rs @@ -13,7 +13,7 @@ use std::{ struct StatementInner { client: Weak, - name: String, + name: &'static str, params: Vec, columns: Vec, } @@ -22,7 +22,7 @@ impl Drop for StatementInner { fn drop(&mut self) { if let Some(client) = self.client.upgrade() { let buf = client.with_buf(|buf| { - frontend::close(b'S', &self.name, buf).unwrap(); + frontend::close(b'S', self.name, buf).unwrap(); frontend::sync(buf); buf.split().freeze() }); @@ -40,7 +40,7 @@ pub struct Statement(Arc); impl Statement { pub(crate) fn new( inner: &Arc, - name: String, + name: &'static str, params: Vec, columns: Vec, ) -> Statement { @@ -55,14 +55,14 @@ impl Statement { pub(crate) fn new_anonymous(params: Vec, columns: Vec) -> Statement { Statement(Arc::new(StatementInner { client: Weak::new(), - name: String::new(), + name: "", params, columns, })) } pub(crate) fn name(&self) -> &str { - &self.0.name + self.0.name } /// Returns the expected types of the statement's parameters. diff --git a/libs/proxy/tokio-postgres2/src/to_statement.rs b/libs/proxy/tokio-postgres2/src/to_statement.rs deleted file mode 100644 index 7e12992728..0000000000 --- a/libs/proxy/tokio-postgres2/src/to_statement.rs +++ /dev/null @@ -1,57 +0,0 @@ -use crate::to_statement::private::{Sealed, ToStatementType}; -use crate::Statement; - -mod private { - use crate::{Client, Error, Statement}; - - pub trait Sealed {} - - pub enum ToStatementType<'a> { - Statement(&'a Statement), - Query(&'a str), - } - - impl ToStatementType<'_> { - pub async fn into_statement(self, client: &Client) -> Result { - match self { - ToStatementType::Statement(s) => Ok(s.clone()), - ToStatementType::Query(s) => client.prepare(s).await, - } - } - } -} - -/// A trait abstracting over prepared and unprepared statements. -/// -/// Many methods are generic over this bound, so that they support both a raw query string as well as a statement which -/// was prepared previously. -/// -/// This trait is "sealed" and cannot be implemented by anything outside this crate. -pub trait ToStatement: Sealed { - #[doc(hidden)] - fn __convert(&self) -> ToStatementType<'_>; -} - -impl ToStatement for Statement { - fn __convert(&self) -> ToStatementType<'_> { - ToStatementType::Statement(self) - } -} - -impl Sealed for Statement {} - -impl ToStatement for str { - fn __convert(&self) -> ToStatementType<'_> { - ToStatementType::Query(self) - } -} - -impl Sealed for str {} - -impl ToStatement for String { - fn __convert(&self) -> ToStatementType<'_> { - ToStatementType::Query(self) - } -} - -impl Sealed for String {} diff --git a/libs/safekeeper_api/src/membership.rs b/libs/safekeeper_api/src/membership.rs index a39fda526f..8b14a4f290 100644 --- a/libs/safekeeper_api/src/membership.rs +++ b/libs/safekeeper_api/src/membership.rs @@ -9,13 +9,43 @@ use anyhow::bail; use serde::{Deserialize, Serialize}; use utils::id::NodeId; -/// Number uniquely identifying safekeeper configuration. 
-/// Note: it is a part of sk control file. -pub type Generation = u32; /// 1 is the first valid generation, 0 is used as /// a placeholder before we fully migrate to generations. -pub const INVALID_GENERATION: Generation = 0; -pub const INITIAL_GENERATION: Generation = 1; +pub const INVALID_GENERATION: SafekeeperGeneration = SafekeeperGeneration::new(0); +pub const INITIAL_GENERATION: SafekeeperGeneration = SafekeeperGeneration::new(1); + +/// Number uniquely identifying safekeeper configuration. +/// Note: it is a part of sk control file. +/// +/// Like tenant generations, but for safekeepers. +#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)] +pub struct SafekeeperGeneration(u32); + +impl SafekeeperGeneration { + pub const fn new(v: u32) -> Self { + Self(v) + } + + #[track_caller] + pub fn previous(&self) -> Option { + Some(Self(self.0.checked_sub(1)?)) + } + + #[track_caller] + pub fn next(&self) -> Self { + Self(self.0 + 1) + } + + pub fn into_inner(self) -> u32 { + self.0 + } +} + +impl Display for SafekeeperGeneration { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.0) + } +} /// Membership is defined by ids so e.g. walproposer uses them to figure out /// quorums, but we also carry host and port to give wp idea where to connect. @@ -89,7 +119,7 @@ impl Display for MemberSet { #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] pub struct Configuration { /// Unique id. - pub generation: Generation, + pub generation: SafekeeperGeneration, /// Current members of the configuration. pub members: MemberSet, /// Some means it is a joint conf. diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs index 30418b0efd..41ccdaa428 100644 --- a/libs/safekeeper_api/src/models.rs +++ b/libs/safekeeper_api/src/models.rs @@ -282,3 +282,18 @@ pub struct TimelineTermBumpResponse { pub struct SafekeeperUtilization { pub timeline_count: u64, } + +/// pull_timeline request body. +#[derive(Debug, Deserialize, Serialize)] +pub struct PullTimelineRequest { + pub tenant_id: TenantId, + pub timeline_id: TimelineId, + pub http_hosts: Vec, +} + +#[derive(Debug, Serialize, Deserialize)] +pub struct PullTimelineResponse { + // Donor safekeeper host + pub safekeeper_host: String, + // TODO: add more fields? +} diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 0f10300959..5020d82adf 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -24,11 +24,10 @@ diatomic-waker.workspace = true git-version.workspace = true hex = { workspace = true, features = ["serde"] } humantime.workspace = true -inferno.workspace = true fail.workspace = true futures = { workspace = true } jsonwebtoken.workspace = true -nix.workspace = true +nix = { workspace = true, features = ["ioctl"] } once_cell.workspace = true pin-project-lite.workspace = true regex.workspace = true @@ -62,6 +61,7 @@ bytes.workspace = true criterion.workspace = true hex-literal.workspace = true camino-tempfile.workspace = true +pprof.workspace = true serde_assert.workspace = true tokio = { workspace = true, features = ["test-util"] } diff --git a/libs/utils/benches/README.md b/libs/utils/benches/README.md new file mode 100644 index 0000000000..e23ec268c2 --- /dev/null +++ b/libs/utils/benches/README.md @@ -0,0 +1,26 @@ +## Utils Benchmarks + +To run benchmarks: + +```sh +# All benchmarks. +cargo bench --package utils + +# Specific file. +cargo bench --package utils --bench benchmarks + +# Specific benchmark. 
+cargo bench --package utils --bench benchmarks warn_slow/enabled=true + +# List available benchmarks. +cargo bench --package utils --benches -- --list + +# Generate flamegraph profiles using pprof-rs, profiling for 10 seconds. +# Output in target/criterion/*/profile/flamegraph.svg. +cargo bench --package utils --bench benchmarks warn_slow/enabled=true --profile-time 10 +``` + +Additional charts and statistics are available in `target/criterion/report/index.html`. + +Benchmarks are automatically compared against the previous run. To compare against other runs, see +`--baseline` and `--save-baseline`. \ No newline at end of file diff --git a/libs/utils/benches/benchmarks.rs b/libs/utils/benches/benchmarks.rs index 44eb36387c..cff3792f3a 100644 --- a/libs/utils/benches/benchmarks.rs +++ b/libs/utils/benches/benchmarks.rs @@ -1,5 +1,18 @@ -use criterion::{criterion_group, criterion_main, Criterion}; +use std::time::Duration; + +use criterion::{criterion_group, criterion_main, Bencher, Criterion}; +use pprof::criterion::{Output, PProfProfiler}; use utils::id; +use utils::logging::warn_slow; + +// Register benchmarks with Criterion. +criterion_group!( + name = benches; + config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None))); + targets = bench_id_stringify, + bench_warn_slow, +); +criterion_main!(benches); pub fn bench_id_stringify(c: &mut Criterion) { // Can only use public methods. @@ -16,5 +29,31 @@ pub fn bench_id_stringify(c: &mut Criterion) { }); } -criterion_group!(benches, bench_id_stringify); -criterion_main!(benches); +pub fn bench_warn_slow(c: &mut Criterion) { + for enabled in [false, true] { + c.bench_function(&format!("warn_slow/enabled={enabled}"), |b| { + run_bench(b, enabled).unwrap() + }); + } + + // The actual benchmark. + fn run_bench(b: &mut Bencher, enabled: bool) -> anyhow::Result<()> { + const THRESHOLD: Duration = Duration::from_secs(1); + + // Use a multi-threaded runtime to avoid thread parking overhead when yielding. + let runtime = tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build()?; + + // Test both with and without warn_slow, since we're essentially measuring Tokio scheduling + // performance too. Use a simple noop future that yields once, to avoid any scheduler fast + // paths for a ready future. 
+ if enabled { + b.iter(|| runtime.block_on(warn_slow("ready", THRESHOLD, tokio::task::yield_now()))); + } else { + b.iter(|| runtime.block_on(tokio::task::yield_now())); + } + + Ok(()) + } +} diff --git a/libs/utils/src/bin_ser.rs b/libs/utils/src/bin_ser.rs index 42b45eeea0..4d173d0726 100644 --- a/libs/utils/src/bin_ser.rs +++ b/libs/utils/src/bin_ser.rs @@ -286,6 +286,11 @@ mod tests { const SHORT2_ENC_LE: &[u8] = &[8, 0, 0, 3, 7]; const SHORT2_ENC_LE_TRAILING: &[u8] = &[8, 0, 0, 3, 7, 0xff, 0xff, 0xff]; + #[derive(Debug, PartialEq, Eq, Serialize, Deserialize)] + struct NewTypeStruct(u32); + const NT1: NewTypeStruct = NewTypeStruct(414243); + const NT1_INNER: u32 = 414243; + #[derive(Debug, PartialEq, Eq, Serialize, Deserialize)] pub struct LongMsg { pub tag: u8, @@ -408,4 +413,42 @@ mod tests { let msg2 = LongMsg::des(&encoded).unwrap(); assert_eq!(msg, msg2); } + + #[test] + /// Ensure that newtype wrappers around u32 don't change the serialization format + fn be_nt() { + use super::BeSer; + + assert_eq!(NT1.serialized_size().unwrap(), 4); + + let msg = NT1; + + let encoded = msg.ser().unwrap(); + let expected = hex_literal::hex!("0006 5223"); + assert_eq!(encoded, expected); + + assert_eq!(encoded, NT1_INNER.ser().unwrap()); + + let msg2 = NewTypeStruct::des(&encoded).unwrap(); + assert_eq!(msg, msg2); + } + + #[test] + /// Ensure that newtype wrappers around u32 don't change the serialization format + fn le_nt() { + use super::LeSer; + + assert_eq!(NT1.serialized_size().unwrap(), 4); + + let msg = NT1; + + let encoded = msg.ser().unwrap(); + let expected = hex_literal::hex!("2352 0600"); + assert_eq!(encoded, expected); + + assert_eq!(encoded, NT1_INNER.ser().unwrap()); + + let msg2 = NewTypeStruct::des(&encoded).unwrap(); + assert_eq!(msg, msg2); + } } diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 820ff2d5ea..9389a27bf3 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -93,6 +93,9 @@ pub mod try_rcu; pub mod guard_arc_swap; +#[cfg(target_os = "linux")] +pub mod linux_socket_ioctl; + // Re-export used in macro. Avoids adding git-version as dep in target crates. #[doc(hidden)] pub use git_version; diff --git a/libs/utils/src/linux_socket_ioctl.rs b/libs/utils/src/linux_socket_ioctl.rs new file mode 100644 index 0000000000..5ae0e86af8 --- /dev/null +++ b/libs/utils/src/linux_socket_ioctl.rs @@ -0,0 +1,35 @@ +//! Linux-specific socket ioctls. +//! +//! + +use std::{ + io, + mem::MaybeUninit, + os::{fd::RawFd, raw::c_int}, +}; + +use nix::libc::{FIONREAD, TIOCOUTQ}; + +unsafe fn do_ioctl(socket_fd: RawFd, cmd: nix::libc::Ioctl) -> io::Result<c_int> { + let mut inq: MaybeUninit<c_int> = MaybeUninit::uninit(); + let err = nix::libc::ioctl(socket_fd, cmd, inq.as_mut_ptr()); + if err == 0 { + Ok(inq.assume_init()) + } else { + Err(io::Error::last_os_error()) + } +} + +/// # Safety +/// +/// Caller must ensure that `socket_fd` is a valid TCP socket file descriptor. +pub unsafe fn inq(socket_fd: RawFd) -> io::Result<c_int> { + do_ioctl(socket_fd, FIONREAD) +} + +/// # Safety +/// +/// Caller must ensure that `socket_fd` is a valid TCP socket file descriptor.
+pub unsafe fn outq(socket_fd: RawFd) -> io::Result<c_int> { + do_ioctl(socket_fd, TIOCOUTQ) +} diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs index 4a6069294d..95c69ac8ba 100644 --- a/libs/utils/src/logging.rs +++ b/libs/utils/src/logging.rs @@ -1,9 +1,13 @@ +use std::future::Future; use std::str::FromStr; +use std::time::Duration; use anyhow::Context; use metrics::{IntCounter, IntCounterVec}; use once_cell::sync::Lazy; use strum_macros::{EnumString, VariantNames}; +use tokio::time::Instant; +use tracing::warn; /// Logs a critical error, similarly to `tracing::error!`. This will: /// @@ -318,6 +322,41 @@ impl std::fmt::Debug for SecretString { } } +/// Logs a periodic warning if a future is slow to complete. +/// +/// This is performance-sensitive as it's used on the GetPage read path. +#[inline] +pub async fn warn_slow<O>(name: &str, threshold: Duration, f: impl Future<Output = O>) -> O { + // TODO: we unfortunately have to pin the future on the heap, since GetPage futures are huge and + // won't fit on the stack. + let mut f = Box::pin(f); + + let started = Instant::now(); + let mut attempt = 1; + + loop { + // NB: use timeout_at() instead of timeout() to avoid an extra clock reading in the common + // case where the timeout doesn't fire. + let deadline = started + attempt * threshold; + if let Ok(output) = tokio::time::timeout_at(deadline, &mut f).await { + // NB: we check if we exceeded the threshold even if the timeout never fired, because + // scheduling or execution delays may cause the future to succeed even if it exceeds the + // timeout. This costs an extra unconditional clock reading, but seems worth it to avoid + // false negatives. + let elapsed = started.elapsed(); + if elapsed >= threshold { + warn!("slow {name} completed after {:.3}s", elapsed.as_secs_f64()); + } + return output; + } + + let elapsed = started.elapsed().as_secs_f64(); + warn!("slow {name} still running after {elapsed:.3}s",); + + attempt += 1; + } +} + #[cfg(test)] mod tests { use metrics::{core::Opts, IntCounterVec}; diff --git a/libs/utils/src/shard.rs b/libs/utils/src/shard.rs index 6352ea9f92..d98284f969 100644 --- a/libs/utils/src/shard.rs +++ b/libs/utils/src/shard.rs @@ -117,6 +117,10 @@ impl TenantShardId { ) } + pub fn range(&self) -> RangeInclusive<TenantShardId> { + RangeInclusive::new(*self, *self) + } + pub fn shard_slug(&self) -> impl std::fmt::Display + '_ { ShardSlug(self) } diff --git a/libs/wal_decoder/proto/interpreted_wal.proto b/libs/wal_decoder/proto/interpreted_wal.proto index d68484d30f..7b40201a75 100644 --- a/libs/wal_decoder/proto/interpreted_wal.proto +++ b/libs/wal_decoder/proto/interpreted_wal.proto @@ -5,6 +5,7 @@ package interpreted_wal; message InterpretedWalRecords { repeated InterpretedWalRecord records = 1; optional uint64 next_record_lsn = 2; + optional uint64 raw_wal_start_lsn = 3; } message InterpretedWalRecord { diff --git a/libs/wal_decoder/src/models.rs b/libs/wal_decoder/src/models.rs index 51bf7e44ab..7e1934c6c3 100644 --- a/libs/wal_decoder/src/models.rs +++ b/libs/wal_decoder/src/models.rs @@ -60,7 +60,11 @@ pub struct InterpretedWalRecords { pub records: Vec<InterpretedWalRecord>, // Start LSN of the next record after the batch. // Note that said record may not belong to the current shard. - pub next_record_lsn: Option<Lsn>, + pub next_record_lsn: Lsn, + // Inclusive start LSN of the PG WAL from which the interpreted + // WAL records were extracted. Note that this is not necessarily the + // start LSN of the first interpreted record in the batch.
+ pub raw_wal_start_lsn: Option, } /// An interpreted Postgres WAL record, ready to be handled by the pageserver diff --git a/libs/wal_decoder/src/wire_format.rs b/libs/wal_decoder/src/wire_format.rs index 944ee5c919..52ed5c70b5 100644 --- a/libs/wal_decoder/src/wire_format.rs +++ b/libs/wal_decoder/src/wire_format.rs @@ -167,7 +167,8 @@ impl TryFrom for proto::InterpretedWalRecords { .collect::, _>>()?; Ok(proto::InterpretedWalRecords { records, - next_record_lsn: value.next_record_lsn.map(|l| l.0), + next_record_lsn: Some(value.next_record_lsn.0), + raw_wal_start_lsn: value.raw_wal_start_lsn.map(|l| l.0), }) } } @@ -254,7 +255,11 @@ impl TryFrom for InterpretedWalRecords { Ok(InterpretedWalRecords { records, - next_record_lsn: value.next_record_lsn.map(Lsn::from), + next_record_lsn: value + .next_record_lsn + .map(Lsn::from) + .expect("Always provided"), + raw_wal_start_lsn: value.raw_wal_start_lsn.map(Lsn::from), }) } } diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index da7ec5abce..bb0f64ca32 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -477,6 +477,26 @@ impl Client { self.request(Method::POST, &uri, ()).await.map(|_| ()) } + pub async fn timeline_download_heatmap_layers( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + concurrency: Option, + ) -> Result<()> { + let mut path = reqwest::Url::parse(&format!( + "{}/v1/tenant/{}/timeline/{}/download_heatmap_layers", + self.mgmt_api_endpoint, tenant_shard_id, timeline_id + )) + .expect("Cannot build URL"); + + if let Some(concurrency) = concurrency { + path.query_pairs_mut() + .append_pair("concurrency", &format!("{}", concurrency)); + } + + self.request(Method::POST, path, ()).await.map(|_| ()) + } + pub async fn tenant_reset(&self, tenant_shard_id: TenantShardId) -> Result<()> { let uri = format!( "{}/v1/tenant/{}/reset", diff --git a/pageserver/ctl/src/key.rs b/pageserver/ctl/src/key.rs index af4b5a21ab..c7f0719c41 100644 --- a/pageserver/ctl/src/key.rs +++ b/pageserver/ctl/src/key.rs @@ -345,6 +345,7 @@ impl AuxFileV2 { AuxFileV2::Recognized("pg_logical/replorigin_checkpoint", hash) } (2, 1) => AuxFileV2::Recognized("pg_replslot/", hash), + (3, 1) => AuxFileV2::Recognized("pg_stat/pgstat.stat", hash), (1, 0xff) => AuxFileV2::OtherWithPrefix("pg_logical/", hash), (0xff, 0xff) => AuxFileV2::Other(hash), _ => return None, diff --git a/pageserver/src/aux_file.rs b/pageserver/src/aux_file.rs index 5e527b7d61..5cc20a70b2 100644 --- a/pageserver/src/aux_file.rs +++ b/pageserver/src/aux_file.rs @@ -39,6 +39,7 @@ fn aux_hash_to_metadata_key(dir_level1: u8, dir_level2: u8, data: &[u8]) -> Key const AUX_DIR_PG_LOGICAL: u8 = 0x01; const AUX_DIR_PG_REPLSLOT: u8 = 0x02; +const AUX_DIR_PG_STAT: u8 = 0x03; const AUX_DIR_PG_UNKNOWN: u8 = 0xFF; /// Encode the aux file into a fixed-size key. @@ -53,6 +54,7 @@ const AUX_DIR_PG_UNKNOWN: u8 = 0xFF; /// * pg_logical/replorigin_checkpoint -> 0x0103 /// * pg_logical/others -> 0x01FF /// * pg_replslot/ -> 0x0201 +/// * pg_stat/pgstat.stat -> 0x0301 /// * others -> 0xFFFF /// /// If you add new AUX files to this function, please also add a test case to `test_encoding_portable`. 
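The next hunk wires `pg_stat/` into `encode_aux_file_key` itself. Below is a minimal sketch of the prefix scheme the doc comment above describes; `resolve_prefix` is a hypothetical helper, not pageserver code, the real function additionally hashes the remaining file name into the fixed-size key, and the `0x0101`/`0x0102` pairs for mappings/snapshots are assumed from the truncated list:

```rust
/// Hypothetical stand-in for the prefix dispatch inside encode_aux_file_key:
/// each recognized directory maps to a fixed (dir_level1, dir_level2) pair.
fn resolve_prefix(path: &str) -> (u8, u8) {
    match path {
        p if p.starts_with("pg_logical/mappings/") => (0x01, 0x01),
        p if p.starts_with("pg_logical/snapshots/") => (0x01, 0x02),
        "pg_logical/replorigin_checkpoint" => (0x01, 0x03),
        p if p.starts_with("pg_logical/") => (0x01, 0xFF),
        p if p.starts_with("pg_replslot/") => (0x02, 0x01),
        // New in this diff: pgstat.stat gets its own prefix, 0x0301.
        p if p.starts_with("pg_stat/") => (0x03, 0x01),
        _ => (0xFF, 0xFF),
    }
}

fn main() {
    assert_eq!(resolve_prefix("pg_stat/pgstat.stat"), (0x03, 0x01));
    assert_eq!(resolve_prefix("some/other/file"), (0xFF, 0xFF));
}
```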
@@ -75,6 +77,8 @@ pub fn encode_aux_file_key(path: &str) -> Key { aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0xFF, fname.as_bytes()) } else if let Some(fname) = path.strip_prefix("pg_replslot/") { aux_hash_to_metadata_key(AUX_DIR_PG_REPLSLOT, 0x01, fname.as_bytes()) + } else if let Some(fname) = path.strip_prefix("pg_stat/") { + aux_hash_to_metadata_key(AUX_DIR_PG_STAT, 0x01, fname.as_bytes()) } else { if cfg!(debug_assertions) { warn!( diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 25078b57c8..99b0775316 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -13,7 +13,7 @@ use anyhow::{anyhow, Context}; use bytes::{BufMut, Bytes, BytesMut}; use fail::fail_point; -use pageserver_api::key::Key; +use pageserver_api::key::{rel_block_to_key, Key}; use postgres_ffi::pg_constants; use std::fmt::Write as FmtWrite; use std::time::{Instant, SystemTime}; @@ -264,6 +264,31 @@ where async fn send_tarball(mut self) -> Result<(), BasebackupError> { // TODO include checksum + // Construct the pg_control file from the persisted checkpoint and pg_control + // information. But we only add this to the tarball at the end, so that if the + // writing is interrupted half-way through, the resulting incomplete tarball will + // be missing the pg_control file, which prevents PostgreSQL from starting up on + // it. With proper error handling, you should never try to start up from an + // incomplete basebackup in the first place, of course, but this is a nice little + // extra safety measure. + let checkpoint_bytes = self .timeline .get_checkpoint(self.lsn, self.ctx) .await .context("failed to get checkpoint bytes")?; + let pg_control_bytes = self .timeline .get_control_file(self.lsn, self.ctx) .await .context("failed to get control bytes")?; + let (pg_control_bytes, system_identifier, was_shutdown) = + postgres_ffi::generate_pg_control( + &pg_control_bytes, + &checkpoint_bytes, + self.lsn, + self.timeline.pg_version, + )?; + let lazy_slru_download = self.timeline.get_lazy_slru_download() && !self.full_backup; let pgversion = self.timeline.pg_version; @@ -401,6 +426,10 @@ where // In future we will not generate AUX record for "pg_logical/replorigin_checkpoint" at all, // but now we should handle (skip) it for backward compatibility. continue; + } else if path == "pg_stat/pgstat.stat" && !was_shutdown { + // Drop statistics in case of abnormal termination, i.e. if we're not starting from the exact LSN + // of a shutdown checkpoint. + continue; } let header = new_tar_header(&path, content.len() as u64)?; self.ar @@ -462,8 +491,9 @@ where ))) }); - // Generate pg_control and bootstrap WAL segment. - self.add_pgcontrol_file().await?; + // Last, add the pg_control file and bootstrap WAL segment. + self.add_pgcontrol_file(pg_control_bytes, system_identifier) + .await?; self.ar .finish() .await @@ -501,13 +531,9 @@ where for blknum in startblk..endblk { let img = self .timeline - .get_rel_page_at_lsn( - src, - blknum, - Version::Lsn(self.lsn), - self.ctx, - self.io_concurrency.clone(), - ) + // TODO: investigate using get_vectored for the entire startblk..endblk range. + // But this code path is not on the critical path for most basebackups (?). + .get(rel_block_to_key(src, blknum), self.lsn, self.ctx) .await .map_err(|e| BasebackupError::Server(e.into()))?; segment_data.extend_from_slice(&img[..]); @@ -675,7 +701,11 @@ where // Add generated pg_control file and bootstrap WAL segment. // Also send zenith.signal file with extra bootstrap data.
// - async fn add_pgcontrol_file(&mut self) -> Result<(), BasebackupError> { + async fn add_pgcontrol_file( + &mut self, + pg_control_bytes: Bytes, + system_identifier: u64, + ) -> Result<(), BasebackupError> { // add zenith.signal file let mut zenith_signal = String::new(); if self.prev_record_lsn == Lsn(0) { @@ -698,24 +728,6 @@ where .await .map_err(|e| BasebackupError::Client(e, "add_pgcontrol_file,zenith.signal"))?; - let checkpoint_bytes = self - .timeline - .get_checkpoint(self.lsn, self.ctx) - .await - .context("failed to get checkpoint bytes")?; - let pg_control_bytes = self - .timeline - .get_control_file(self.lsn, self.ctx) - .await - .context("failed get control bytes")?; - - let (pg_control_bytes, system_identifier) = postgres_ffi::generate_pg_control( - &pg_control_bytes, - &checkpoint_bytes, - self.lsn, - self.timeline.pg_version, - )?; - //send pg_control let header = new_tar_header("global/pg_control", pg_control_bytes.len() as u64)?; self.ar diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index fa098e9364..e2b9a7f073 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -134,6 +134,7 @@ fn main() -> anyhow::Result<()> { info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine"); info!(?conf.virtual_file_io_mode, "starting with virtual_file IO mode"); info!(?conf.wal_receiver_protocol, "starting with WAL receiver protocol"); + info!(?conf.validate_wal_contiguity, "starting with WAL contiguity validation"); info!(?conf.page_service_pipelining, "starting with page service pipelining config"); info!(?conf.get_vectored_concurrent_io, "starting with get_vectored IO concurrency config"); diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index c5368f6806..09d9444dd5 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -197,6 +197,10 @@ pub struct PageServerConf { /// Enable read path debugging. If enabled, read key errors will print a backtrace of the layer /// files read. pub enable_read_path_debugging: bool, + + /// Interpreted protocol feature: if enabled, validate that the logical WAL received from + /// safekeepers does not have gaps. + pub validate_wal_contiguity: bool, } /// Token for authentication to safekeepers @@ -360,6 +364,7 @@ impl PageServerConf { page_service_pipelining, get_vectored_concurrent_io, enable_read_path_debugging, + validate_wal_contiguity, } = config_toml; let mut conf = PageServerConf { @@ -446,6 +451,7 @@ impl PageServerConf { virtual_file_io_mode: virtual_file_io_mode.unwrap_or(virtual_file::IoMode::preferred()), no_sync: no_sync.unwrap_or(false), enable_read_path_debugging: enable_read_path_debugging.unwrap_or(false), + validate_wal_contiguity: validate_wal_contiguity.unwrap_or(false), }; // ------------------------------------------------------------ diff --git a/pageserver/src/context.rs b/pageserver/src/context.rs index 8f2177fe5b..da9c095a15 100644 --- a/pageserver/src/context.rs +++ b/pageserver/src/context.rs @@ -98,6 +98,7 @@ pub struct RequestContext { download_behavior: DownloadBehavior, access_stats_behavior: AccessStatsBehavior, page_content_kind: PageContentKind, + read_path_debug: bool, } /// The kind of access to the page cache. 
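The two context.rs hunks around this point add the `read_path_debug` flag and thread it through the builder. A toy sketch of that pattern, under stated assumptions: `Ctx`/`CtxBuilder` are illustrative stand-ins, not the real `RequestContext`, which carries many more fields:

```rust
// Illustrative stand-in for the builder-flag pattern these hunks extend.
struct Ctx {
    read_path_debug: bool,
}

struct CtxBuilder {
    inner: Ctx,
}

impl CtxBuilder {
    fn new() -> Self {
        // Mirrors the default: read-path debugging is off unless a handler opts in.
        Self { inner: Ctx { read_path_debug: false } }
    }

    fn read_path_debug(mut self, b: bool) -> Self {
        self.inner.read_path_debug = b;
        self
    }

    fn build(self) -> Ctx {
        self.inner
    }
}

fn main() {
    let ctx = CtxBuilder::new().read_path_debug(true).build();
    assert!(ctx.read_path_debug);
}
```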
@@ -155,6 +156,7 @@ impl RequestContextBuilder { download_behavior: DownloadBehavior::Download, access_stats_behavior: AccessStatsBehavior::Update, page_content_kind: PageContentKind::Unknown, + read_path_debug: false, }, } } @@ -168,6 +170,7 @@ impl RequestContextBuilder { download_behavior: original.download_behavior, access_stats_behavior: original.access_stats_behavior, page_content_kind: original.page_content_kind, + read_path_debug: original.read_path_debug, }, } } @@ -191,6 +194,11 @@ impl RequestContextBuilder { self } + pub(crate) fn read_path_debug(mut self, b: bool) -> Self { + self.inner.read_path_debug = b; + self + } + pub fn build(self) -> RequestContext { self.inner } @@ -291,4 +299,8 @@ impl RequestContext { pub(crate) fn page_content_kind(&self) -> PageContentKind { self.page_content_kind } + + pub(crate) fn read_path_debug(&self) -> bool { + self.read_path_debug + } } diff --git a/pageserver/src/controller_upcall_client.rs b/pageserver/src/controller_upcall_client.rs index d41bfd9021..4990f17b40 100644 --- a/pageserver/src/controller_upcall_client.rs +++ b/pageserver/src/controller_upcall_client.rs @@ -173,6 +173,7 @@ impl ControlPlaneGenerationsApi for ControllerUpcallClient { listen_pg_port: m.postgres_port, listen_http_addr: m.http_host, listen_http_port: m.http_port, + listen_https_port: None, // TODO: Support https. availability_zone_id: az_id.expect("Checked above"), }) } diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index b8ed7aaf26..12252739fd 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -824,6 +824,38 @@ paths: schema: $ref: "#/components/schemas/TenantConfigResponse" + /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/download_heatmap_layers: + parameters: + - name: tenant_shard_id + in: path + required: true + schema: + type: string + - name: timeline_id + in: path + required: true + schema: + type: string + - name: concurrency + description: Maximum number of concurrent downloads (capped at remote storage concurrency) + in: query + required: false + schema: + type: integer + post: + description: | + Download all layers in the specified timeline's heatmap. The `tenant_shard_id` parameter + may be used to target all shards of a tenant when the unsharded form is used, or a specific + tenant shard with the sharded form. + responses: + "200": + description: Success + delete: + description: Stop any on-going background downloads of heatmap layers for the specified timeline. 
+ responses: + "200": + description: Success + /v1/utilization: get: description: | @@ -882,6 +914,8 @@ components: properties: reason: type: string + gc_blocking: + type: string TenantCreateRequest: allOf: @@ -1083,6 +1117,9 @@ components: min_readable_lsn: type: string format: hex + latest_gc_cutoff_lsn: + type: string + format: hex applied_gc_cutoff_lsn: type: string format: hex diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index f6d4f3828c..56a84a98a8 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -68,6 +68,7 @@ use tokio_util::sync::CancellationToken; use tracing::*; use crate::config::PageServerConf; +use crate::context::RequestContextBuilder; use crate::context::{DownloadBehavior, RequestContext}; use crate::deletion_queue::DeletionQueueClient; use crate::pgdatadir_mapping::LsnForTimestamp; @@ -1463,6 +1464,59 @@ async fn timeline_layer_scan_disposable_keys( ) } +async fn timeline_download_heatmap_layers_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + // Only used in the case where remote storage is not configured. + const DEFAULT_MAX_CONCURRENCY: usize = 100; + // A conservative default. + const DEFAULT_CONCURRENCY: usize = 16; + + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + + let desired_concurrency = + parse_query_param(&request, "concurrency")?.unwrap_or(DEFAULT_CONCURRENCY); + + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + + let state = get_state(&request); + let timeline = + active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; + + let max_concurrency = get_config(&request) + .remote_storage_config + .as_ref() + .map(|c| c.concurrency_limit()) + .unwrap_or(DEFAULT_MAX_CONCURRENCY); + let concurrency = std::cmp::min(max_concurrency, desired_concurrency); + + timeline.start_heatmap_layers_download(concurrency).await?; + + json_response(StatusCode::ACCEPTED, ()) +} + +async fn timeline_shutdown_download_heatmap_layers_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + + let state = get_state(&request); + let timeline = + active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; + + timeline.stop_and_drain_heatmap_layers_download().await; + + json_response(StatusCode::OK, ()) +} + async fn layer_download_handler( request: Request, _cancel: CancellationToken, @@ -2519,14 +2573,30 @@ async fn deletion_queue_flush( } } -/// Try if `GetPage@Lsn` is successful, useful for manual debugging. async fn getpage_at_lsn_handler( + request: Request, + cancel: CancellationToken, +) -> Result, ApiError> { + getpage_at_lsn_handler_inner(false, request, cancel).await +} + +async fn touchpage_at_lsn_handler( + request: Request, + cancel: CancellationToken, +) -> Result, ApiError> { + getpage_at_lsn_handler_inner(true, request, cancel).await +} + +/// Try if `GetPage@Lsn` is successful, useful for manual debugging. 
+async fn getpage_at_lsn_handler_inner( + touch: bool, request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; - check_permission(&request, Some(tenant_shard_id.tenant_id))?; + // Require pageserver admin permission for this API instead of only tenant-level token. + check_permission(&request, None)?; let state = get_state(&request); struct Key(pageserver_api::key::Key); @@ -2541,22 +2611,29 @@ async fn getpage_at_lsn_handler( let key: Key = parse_query_param(&request, "key")? .ok_or_else(|| ApiError::BadRequest(anyhow!("missing 'key' query parameter")))?; - let lsn: Lsn = parse_query_param(&request, "lsn")? - .ok_or_else(|| ApiError::BadRequest(anyhow!("missing 'lsn' query parameter")))?; + let lsn: Option = parse_query_param(&request, "lsn")?; async { let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); + // Enable read path debugging + let ctx = RequestContextBuilder::extend(&ctx).read_path_debug(true).build(); let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?; + // Use last_record_lsn if no lsn is provided + let lsn = lsn.unwrap_or_else(|| timeline.get_last_record_lsn()); let page = timeline.get(key.0, lsn, &ctx).await?; - Result::<_, ApiError>::Ok( - Response::builder() - .status(StatusCode::OK) - .header(header::CONTENT_TYPE, "application/octet-stream") - .body(hyper::Body::from(page)) - .unwrap(), - ) + if touch { + json_response(StatusCode::OK, ()) + } else { + Result::<_, ApiError>::Ok( + Response::builder() + .status(StatusCode::OK) + .header(header::CONTENT_TYPE, "application/octet-stream") + .body(hyper::Body::from(page)) + .unwrap(), + ) + } } .instrument(info_span!("timeline_get", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id)) .await @@ -3627,6 +3704,14 @@ pub fn make_router( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer", |r| api_handler(r, layer_map_info_handler), ) + .post( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_heatmap_layers", + |r| api_handler(r, timeline_download_heatmap_layers_handler), + ) + .delete( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_heatmap_layers", + |r| api_handler(r, timeline_shutdown_download_heatmap_layers_handler), + ) .get( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer/:layer_file_name", |r| api_handler(r, layer_download_handler), @@ -3683,6 +3768,10 @@ pub fn make_router( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/getpage", |r| testing_api_handler("getpage@lsn", r, getpage_at_lsn_handler), ) + .get( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/touchpage", + |r| api_handler(r, touchpage_at_lsn_handler), + ) .get( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/keyspace", |r| api_handler(r, timeline_collect_keyspace), diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 983a3079e4..e1c26b0684 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1,5 +1,6 @@ use std::collections::HashMap; use std::num::NonZeroUsize; +use std::os::fd::RawFd; use std::pin::Pin; use std::sync::atomic::AtomicU64; use std::sync::{Arc, Mutex}; @@ -129,7 +130,7 @@ pub(crate) static LAYERS_PER_READ: Lazy = Lazy::new(|| { "Layers visited to serve a single read (read amplification). 
In a batch, all visited layers count towards every read.", &["tenant_id", "shard_id", "timeline_id"], // Low resolution to reduce cardinality. - vec![1.0, 5.0, 10.0, 25.0, 50.0, 100.0], + vec![4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0], ) .expect("failed to define a metric") }); @@ -1439,27 +1440,66 @@ impl Drop for SmgrOpTimer { } impl SmgrOpFlushInProgress { - pub(crate) async fn measure<Fut, O>(self, mut started_at: Instant, mut fut: Fut) -> O + /// The caller must guarantee that `socket_fd` outlives this function. + pub(crate) async fn measure<Fut, O>( + self, + started_at: Instant, + mut fut: Fut, + socket_fd: RawFd, + ) -> O where Fut: std::future::Future<Output = O>, { let mut fut = std::pin::pin!(fut); - // Whenever observe_guard gets called, or dropped, - // it adds the time elapsed since its last call to metrics. - // Last call is tracked in `now`. + let mut logged = false; + let mut last_counter_increment_at = started_at; let mut observe_guard = scopeguard::guard( - || { + |is_timeout| { let now = Instant::now(); - let elapsed = now - started_at; - self.global_micros - .inc_by(u64::try_from(elapsed.as_micros()).unwrap()); - self.per_timeline_micros - .inc_by(u64::try_from(elapsed.as_micros()).unwrap()); - started_at = now; + + // Increment counter + { + let elapsed_since_last_observe = now - last_counter_increment_at; + self.global_micros + .inc_by(u64::try_from(elapsed_since_last_observe.as_micros()).unwrap()); + self.per_timeline_micros + .inc_by(u64::try_from(elapsed_since_last_observe.as_micros()).unwrap()); + last_counter_increment_at = now; + } + + // Log on every timeout, and again on completion if we hit at least one timeout. + if is_timeout || logged { + logged = true; + let elapsed_total = now - started_at; + let msg = if is_timeout { + "slow flush ongoing" + } else { + "slow flush completed or cancelled" + }; + + let (inq, outq) = { + // SAFETY: caller guarantees that `socket_fd` outlives this function.
+ #[cfg(target_os = "linux")] + unsafe { + ( + utils::linux_socket_ioctl::inq(socket_fd).unwrap_or(-2), + utils::linux_socket_ioctl::outq(socket_fd).unwrap_or(-2), + ) + } + #[cfg(not(target_os = "linux"))] + { + _ = socket_fd; // appease unused lint on macOS + (-1, -1) + } + }; + + let elapsed_total_secs = format!("{:.6}", elapsed_total.as_secs_f64()); + tracing::info!(elapsed_total_secs, inq, outq, msg); + } }, |mut observe| { - observe(); + observe(false); }, ); @@ -1467,7 +1507,7 @@ impl SmgrOpFlushInProgress { match tokio::time::timeout(Duration::from_secs(10), &mut fut).await { Ok(v) => return v, Err(_timeout) => { - (*observe_guard)(); + (*observe_guard)(true); } } } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index bc0ed4198b..7285697040 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -34,11 +34,13 @@ use std::str::FromStr; use std::sync::Arc; use std::time::SystemTime; use std::time::{Duration, Instant}; +use strum_macros::IntoStaticStr; use tokio::io::{AsyncRead, AsyncWrite}; use tokio::io::{AsyncWriteExt, BufWriter}; use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; use tracing::*; +use utils::logging::warn_slow; use utils::sync::gate::{Gate, GateGuard}; use utils::sync::spsc_fold; use utils::{ @@ -73,6 +75,7 @@ use pageserver_api::models::PageTraceEvent; use pageserver_api::reltag::SlruKind; use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID; use postgres_ffi::BLCKSZ; +use std::os::fd::AsRawFd; /// How long we may wait for a [`crate::tenant::mgr::TenantSlot::InProgress`]` and/or a [`crate::tenant::Tenant`] which /// is not yet in state [`TenantState::Active`]. @@ -80,6 +83,9 @@ use postgres_ffi::BLCKSZ; /// NB: this is a different value than [`crate::http::routes::ACTIVE_TENANT_TIMEOUT`]. const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000); +/// Threshold at which to log a warning about slow GetPage requests. +const WARN_SLOW_GETPAGE_THRESHOLD: Duration = Duration::from_secs(30); + /////////////////////////////////////////////////////////////////////////////// pub struct Listener { @@ -236,7 +242,7 @@ pub async fn libpq_listener_main( type ConnectionHandlerResult = anyhow::Result<()>; -#[instrument(skip_all, fields(peer_addr))] +#[instrument(skip_all, fields(peer_addr, application_name))] #[allow(clippy::too_many_arguments)] async fn page_service_conn_main( conf: &'static PageServerConf, @@ -257,6 +263,8 @@ async fn page_service_conn_main( .set_nodelay(true) .context("could not set TCP_NODELAY")?; + let socket_fd = socket.as_raw_fd(); + let peer_addr = socket.peer_addr().context("get peer address")?; tracing::Span::current().record("peer_addr", field::display(peer_addr)); @@ -305,7 +313,7 @@ async fn page_service_conn_main( cancel.clone(), gate_guard, ); - let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?; + let pgbackend = PostgresBackend::new_from_io(socket_fd, socket, peer_addr, auth_type, None)?; match pgbackend.run(&mut conn_handler, &cancel).await { Ok(()) => { @@ -591,6 +599,7 @@ struct BatchedTestRequest { /// NB: we only hold [`timeline::handle::WeakHandle`] inside this enum, /// so that we don't keep the [`Timeline::gate`] open while the batch /// is being built up inside the [`spsc_fold`] (pagestream pipelining). 
+#[derive(IntoStaticStr)] enum BatchedFeMessage { Exists { span: Span, @@ -635,6 +644,10 @@ enum BatchedFeMessage { } impl BatchedFeMessage { + fn as_static_str(&self) -> &'static str { + self.into() + } + fn observe_execution_start(&mut self, at: Instant) { match self { BatchedFeMessage::Exists { timer, .. } @@ -1286,12 +1299,15 @@ impl PageServerHandler { ))?; // what we want to do + let socket_fd = pgb_writer.socket_fd; let flush_fut = pgb_writer.flush(); // metric for how long flushing takes let flush_fut = match flushing_timer { - Some(flushing_timer) => { - futures::future::Either::Left(flushing_timer.measure(Instant::now(), flush_fut)) - } + Some(flushing_timer) => futures::future::Either::Left(flushing_timer.measure( + Instant::now(), + flush_fut, + socket_fd, + )), None => futures::future::Either::Right(flush_fut), }; // do it while respecting cancellation @@ -1457,17 +1473,20 @@ impl PageServerHandler { } }; - let err = self - .pagesteam_handle_batched_message( + let result = warn_slow( + msg.as_static_str(), + WARN_SLOW_GETPAGE_THRESHOLD, + self.pagesteam_handle_batched_message( pgb_writer, msg, io_concurrency.clone(), &cancel, protocol_version, ctx, - ) - .await; - match err { + ), + ) + .await; + match result { Ok(()) => {} Err(e) => break e, } @@ -1630,13 +1649,17 @@ impl PageServerHandler { return Err(e); } }; - self.pagesteam_handle_batched_message( - pgb_writer, - batch, - io_concurrency.clone(), - &cancel, - protocol_version, - &ctx, + warn_slow( + batch.as_static_str(), + WARN_SLOW_GETPAGE_THRESHOLD, + self.pagesteam_handle_batched_message( + pgb_writer, + batch, + io_concurrency.clone(), + &cancel, + protocol_version, + &ctx, + ), ) .await?; } @@ -1793,6 +1816,13 @@ impl PageServerHandler { .as_millis() .to_string() }); + + info!( + "acquired lease for {} until {}", + lsn, + valid_until_str.as_deref().unwrap_or("") + ); + let bytes = valid_until_str.as_ref().map(|x| x.as_bytes()); pgb.write_message_noflush(&BeMessage::RowDescription(&[RowDescriptor::text_col( @@ -2457,9 +2487,16 @@ where fn startup( &mut self, _pgb: &mut PostgresBackend, - _sm: &FeStartupPacket, + sm: &FeStartupPacket, ) -> Result<(), QueryError> { fail::fail_point!("ps::connection-start::startup-packet"); + + if let FeStartupPacket::StartupMessage { params, .. 
} = sm { + if let Some(app_name) = params.get("application_name") { + Span::current().record("application_name", field::display(app_name)); + } + }; + Ok(()) } diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index f2dca8befa..d0e2dab042 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -23,13 +23,14 @@ use anyhow::{ensure, Context}; use bytes::{Buf, Bytes, BytesMut}; use enum_map::Enum; use itertools::Itertools; -use pageserver_api::key::Key; use pageserver_api::key::{ dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key, - relmap_file_key, repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key, - slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range, - CompactKey, AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY, + rel_tag_sparse_key_range, relmap_file_key, repl_origin_key, repl_origin_key_range, + slru_block_to_key, slru_dir_to_key, slru_segment_key_range, slru_segment_size_to_key, + twophase_file_key, twophase_key_range, CompactKey, RelDirExists, AUX_FILES_KEY, CHECKPOINT_KEY, + CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY, }; +use pageserver_api::key::{rel_tag_sparse_key, Key}; use pageserver_api::keyspace::SparseKeySpace; use pageserver_api::record::NeonWalRecord; use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; @@ -44,7 +45,7 @@ use std::ops::ControlFlow; use std::ops::Range; use strum::IntoEnumIterator; use tokio_util::sync::CancellationToken; -use tracing::{debug, trace, warn}; +use tracing::{debug, info, trace, warn}; use utils::bin_ser::DeserializeError; use utils::pausable_failpoint; use utils::{bin_ser::BeSer, lsn::Lsn}; @@ -490,12 +491,33 @@ impl Timeline { if !dbdirs.contains_key(&(tag.spcnode, tag.dbnode)) { return Ok(false); } - // fetch directory listing + + // Read path: first read the new reldir keyspace. Early return if the relation exists. + // Otherwise, read the old reldir keyspace. + // TODO: if IndexPart::rel_size_migration is `Migrated`, we only need to read from v2. + + if self.get_rel_size_v2_enabled() { + // fetch directory listing (new) + let key = rel_tag_sparse_key(tag.spcnode, tag.dbnode, tag.relnode, tag.forknum); + let buf = RelDirExists::decode_option(version.sparse_get(self, key, ctx).await?) + .map_err(|_| PageReconstructError::Other(anyhow::anyhow!("invalid reldir key")))?; + let exists_v2 = buf == RelDirExists::Exists; + // Fast path: if the relation exists in the new format, return true. + // TODO: we should have a verification mode that checks both keyspaces + // to ensure the relation only exists in one of them. + if exists_v2 { + return Ok(true); + } + } + + // fetch directory listing (old) + let key = rel_dir_to_key(tag.spcnode, tag.dbnode); let buf = version.get(self, key, ctx).await?; let dir = RelDirectory::des(&buf)?; - Ok(dir.rels.contains(&(tag.relnode, tag.forknum))) + let exists_v1 = dir.rels.contains(&(tag.relnode, tag.forknum)); + Ok(exists_v1) } /// Get a list of all existing relations in given tablespace and database. 
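The read path above consults the new sparse reldir keyspace before the old `RelDirectory` blob. Below is a self-contained sketch of that lookup order, with toy in-memory maps standing in for the two keyspaces; `Store` and its fields are hypothetical, not pageserver types:

```rust
use std::collections::{HashMap, HashSet};

#[derive(Clone, Copy, PartialEq, Eq, Hash)]
struct RelTag { spc: u32, db: u32, rel: u32, fork: u8 }

#[derive(Clone, Copy, PartialEq, Eq)]
enum RelDirExists { Exists, Removed }

/// Toy stand-ins: v2 keeps one sparse entry (or tombstone) per relation,
/// v1 keeps a directory blob per database, modeled here as a HashSet.
struct Store {
    v2_enabled: bool,
    v2: HashMap<RelTag, RelDirExists>,
    v1: HashSet<RelTag>,
}

impl Store {
    /// Mirrors the lookup order in the hunk above: a v2 hit returns early
    /// without deserializing a whole directory listing; a miss or a
    /// tombstone falls through to the old keyspace.
    fn rel_exists(&self, tag: RelTag) -> bool {
        if self.v2_enabled && self.v2.get(&tag) == Some(&RelDirExists::Exists) {
            return true;
        }
        self.v1.contains(&tag)
    }
}

fn main() {
    let live = RelTag { spc: 1663, db: 5, rel: 16384, fork: 0 };
    let dropped = RelTag { spc: 1663, db: 5, rel: 16385, fork: 0 };
    let store = Store {
        v2_enabled: true,
        v2: HashMap::from([(live, RelDirExists::Exists), (dropped, RelDirExists::Removed)]),
        v1: HashSet::new(),
    };
    assert!(store.rel_exists(live));
    // Tombstoned in v2 and absent from v1, per the one-keyspace invariant.
    assert!(!store.rel_exists(dropped));
}
```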
@@ -513,12 +535,12 @@ impl Timeline { version: Version<'_>, ctx: &RequestContext, ) -> Result, PageReconstructError> { - // fetch directory listing + // fetch directory listing (old) let key = rel_dir_to_key(spcnode, dbnode); let buf = version.get(self, key, ctx).await?; let dir = RelDirectory::des(&buf)?; - let rels: HashSet = + let rels_v1: HashSet = HashSet::from_iter(dir.rels.iter().map(|(relnode, forknum)| RelTag { spcnode, dbnode, @@ -526,6 +548,46 @@ impl Timeline { forknum: *forknum, })); + if !self.get_rel_size_v2_enabled() { + return Ok(rels_v1); + } + + // scan directory listing (new), merge with the old results + let key_range = rel_tag_sparse_key_range(spcnode, dbnode); + let io_concurrency = IoConcurrency::spawn_from_conf( + self.conf, + self.gate + .enter() + .map_err(|_| PageReconstructError::Cancelled)?, + ); + let results = self + .scan( + KeySpace::single(key_range), + version.get_lsn(), + ctx, + io_concurrency, + ) + .await?; + let mut rels = rels_v1; + for (key, val) in results { + let val = RelDirExists::decode(&val?) + .map_err(|_| PageReconstructError::Other(anyhow::anyhow!("invalid reldir key")))?; + assert_eq!(key.field6, 1); + assert_eq!(key.field2, spcnode); + assert_eq!(key.field3, dbnode); + let tag = RelTag { + spcnode, + dbnode, + relnode: key.field4, + forknum: key.field5, + }; + if val == RelDirExists::Removed { + debug_assert!(!rels.contains(&tag), "removed reltag in v2"); + continue; + } + let did_not_contain = rels.insert(tag); + debug_assert!(did_not_contain, "duplicate reltag in v2"); + } Ok(rels) } @@ -1144,7 +1206,11 @@ impl Timeline { let dense_keyspace = result.to_keyspace(); let sparse_keyspace = SparseKeySpace(KeySpace { - ranges: vec![Key::metadata_aux_key_range(), repl_origin_key_range()], + ranges: vec![ + Key::metadata_aux_key_range(), + repl_origin_key_range(), + Key::rel_dir_sparse_key_range(), + ], }); if cfg!(debug_assertions) { @@ -1274,12 +1340,22 @@ pub struct DatadirModification<'a> { /// For special "directory" keys that store key-value maps, track the size of the map /// if it was updated in this modification. - pending_directory_entries: Vec<(DirectoryKind, usize)>, + pending_directory_entries: Vec<(DirectoryKind, MetricsUpdate)>, /// An **approximation** of how many metadata bytes will be written to the EphemeralFile. pending_metadata_bytes: usize, } +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum MetricsUpdate { + /// Set the metrics to this value + Set(u64), + /// Increment the metrics by this value + Add(u64), + /// Decrement the metrics by this value + Sub(u64), +} + impl DatadirModification<'_> { // When a DatadirModification is committed, we do a monolithic serialization of all its contents. 
WAL records can // contain multiple pages, so the pageserver's record-based batch size isn't sufficient to bound this allocation: we @@ -1359,7 +1435,8 @@ impl DatadirModification<'_> { let buf = DbDirectory::ser(&DbDirectory { dbdirs: HashMap::new(), })?; - self.pending_directory_entries.push((DirectoryKind::Db, 0)); + self.pending_directory_entries + .push((DirectoryKind::Db, MetricsUpdate::Set(0))); self.put(DBDIR_KEY, Value::Image(buf.into())); let buf = if self.tline.pg_version >= 17 { @@ -1372,7 +1449,7 @@ impl DatadirModification<'_> { }) }?; self.pending_directory_entries - .push((DirectoryKind::TwoPhase, 0)); + .push((DirectoryKind::TwoPhase, MetricsUpdate::Set(0))); self.put(TWOPHASEDIR_KEY, Value::Image(buf.into())); let buf: Bytes = SlruSegmentDirectory::ser(&SlruSegmentDirectory::default())?.into(); @@ -1382,17 +1459,23 @@ impl DatadirModification<'_> { // harmless but they'd just be dropped on later compaction. if self.tline.tenant_shard_id.is_shard_zero() { self.put(slru_dir_to_key(SlruKind::Clog), empty_dir.clone()); - self.pending_directory_entries - .push((DirectoryKind::SlruSegment(SlruKind::Clog), 0)); + self.pending_directory_entries.push(( + DirectoryKind::SlruSegment(SlruKind::Clog), + MetricsUpdate::Set(0), + )); self.put( slru_dir_to_key(SlruKind::MultiXactMembers), empty_dir.clone(), ); - self.pending_directory_entries - .push((DirectoryKind::SlruSegment(SlruKind::Clog), 0)); + self.pending_directory_entries.push(( + DirectoryKind::SlruSegment(SlruKind::Clog), + MetricsUpdate::Set(0), + )); self.put(slru_dir_to_key(SlruKind::MultiXactOffsets), empty_dir); - self.pending_directory_entries - .push((DirectoryKind::SlruSegment(SlruKind::MultiXactOffsets), 0)); + self.pending_directory_entries.push(( + DirectoryKind::SlruSegment(SlruKind::MultiXactOffsets), + MetricsUpdate::Set(0), + )); } Ok(()) @@ -1658,10 +1741,16 @@ impl DatadirModification<'_> { } if r.is_none() { // Create RelDirectory + // TODO: if we have fully migrated to v2, no need to create this directory let buf = RelDirectory::ser(&RelDirectory { rels: HashSet::new(), })?; - self.pending_directory_entries.push((DirectoryKind::Rel, 0)); + self.pending_directory_entries + .push((DirectoryKind::Rel, MetricsUpdate::Set(0))); + if self.tline.get_rel_size_v2_enabled() { + self.pending_directory_entries + .push((DirectoryKind::RelV2, MetricsUpdate::Set(0))); + } self.put( rel_dir_to_key(spcnode, dbnode), Value::Image(Bytes::from(buf)), @@ -1685,8 +1774,10 @@ impl DatadirModification<'_> { if !dir.xids.insert(xid) { anyhow::bail!("twophase file for xid {} already exists", xid); } - self.pending_directory_entries - .push((DirectoryKind::TwoPhase, dir.xids.len())); + self.pending_directory_entries.push(( + DirectoryKind::TwoPhase, + MetricsUpdate::Set(dir.xids.len() as u64), + )); Bytes::from(TwoPhaseDirectoryV17::ser(&dir)?) } else { let xid = xid as u32; @@ -1694,8 +1785,10 @@ impl DatadirModification<'_> { if !dir.xids.insert(xid) { anyhow::bail!("twophase file for xid {} already exists", xid); } - self.pending_directory_entries - .push((DirectoryKind::TwoPhase, dir.xids.len())); + self.pending_directory_entries.push(( + DirectoryKind::TwoPhase, + MetricsUpdate::Set(dir.xids.len() as u64), + )); Bytes::from(TwoPhaseDirectory::ser(&dir)?) 
}; self.put(TWOPHASEDIR_KEY, Value::Image(newdirbuf)); @@ -1744,8 +1837,10 @@ impl DatadirModification<'_> { let mut dir = DbDirectory::des(&buf)?; if dir.dbdirs.remove(&(spcnode, dbnode)).is_some() { let buf = DbDirectory::ser(&dir)?; - self.pending_directory_entries - .push((DirectoryKind::Db, dir.dbdirs.len())); + self.pending_directory_entries.push(( + DirectoryKind::Db, + MetricsUpdate::Set(dir.dbdirs.len() as u64), + )); self.put(DBDIR_KEY, Value::Image(buf.into())); } else { warn!( @@ -1778,39 +1873,85 @@ impl DatadirModification<'_> { // tablespace. Create the reldir entry for it if so. let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await.context("read db")?) .context("deserialize db")?; - let rel_dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode); - let mut rel_dir = + + let dbdir_exists = if let hash_map::Entry::Vacant(e) = dbdir.dbdirs.entry((rel.spcnode, rel.dbnode)) { // Didn't exist. Update dbdir e.insert(false); let buf = DbDirectory::ser(&dbdir).context("serialize db")?; - self.pending_directory_entries - .push((DirectoryKind::Db, dbdir.dbdirs.len())); + self.pending_directory_entries.push(( + DirectoryKind::Db, + MetricsUpdate::Set(dbdir.dbdirs.len() as u64), + )); self.put(DBDIR_KEY, Value::Image(buf.into())); - - // and create the RelDirectory - RelDirectory::default() + false } else { - // reldir already exists, fetch it - RelDirectory::des(&self.get(rel_dir_key, ctx).await.context("read db")?) - .context("deserialize db")? + true }; + let rel_dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode); + let mut rel_dir = if !dbdir_exists { + // Create the RelDirectory + RelDirectory::default() + } else { + // reldir already exists, fetch it + RelDirectory::des(&self.get(rel_dir_key, ctx).await.context("read db")?) + .context("deserialize db")? + }; + // Add the new relation to the rel directory entry, and write it back if !rel_dir.rels.insert((rel.relnode, rel.forknum)) { return Err(RelationError::AlreadyExists); } - self.pending_directory_entries - .push((DirectoryKind::Rel, rel_dir.rels.len())); - - self.put( - rel_dir_key, - Value::Image(Bytes::from( - RelDirectory::ser(&rel_dir).context("serialize")?, - )), - ); - + if self.tline.get_rel_size_v2_enabled() { + let sparse_rel_dir_key = + rel_tag_sparse_key(rel.spcnode, rel.dbnode, rel.relnode, rel.forknum); + // check if the rel_dir_key exists in v2 + let val = self + .sparse_get(sparse_rel_dir_key, ctx) + .await + .map_err(|e| RelationError::Other(e.into()))?; + let val = RelDirExists::decode_option(val) + .map_err(|_| RelationError::Other(anyhow::anyhow!("invalid reldir key")))?; + if val == RelDirExists::Exists { + return Err(RelationError::AlreadyExists); + } + self.put( + sparse_rel_dir_key, + Value::Image(RelDirExists::Exists.encode()), + ); + if !dbdir_exists { + self.pending_directory_entries + .push((DirectoryKind::Rel, MetricsUpdate::Set(0))); + self.pending_directory_entries + .push((DirectoryKind::RelV2, MetricsUpdate::Set(0))); + // We don't write `rel_dir_key -> rel_dir.rels` back to the storage in the v2 path unless it's the initial creation. + // TODO: if we have fully migrated to v2, no need to create this directory. Otherwise, there + // will be key not found errors if we don't create an empty one for rel_size_v2. 
+ self.put( + rel_dir_key, + Value::Image(Bytes::from( + RelDirectory::ser(&RelDirectory::default()).context("serialize")?, + )), + ); + } + self.pending_directory_entries + .push((DirectoryKind::RelV2, MetricsUpdate::Add(1))); + } else { + if !dbdir_exists { + self.pending_directory_entries + .push((DirectoryKind::Rel, MetricsUpdate::Set(0))) + } + self.pending_directory_entries + .push((DirectoryKind::Rel, MetricsUpdate::Add(1))); + self.put( + rel_dir_key, + Value::Image(Bytes::from( + RelDirectory::ser(&rel_dir).context("serialize")?, + )), + ); + } // Put size let size_key = rel_size_to_key(rel); let buf = nblocks.to_le_bytes(); @@ -1896,9 +2037,34 @@ impl DatadirModification<'_> { let mut dirty = false; for rel_tag in rel_tags { - if dir.rels.remove(&(rel_tag.relnode, rel_tag.forknum)) { + let found = if dir.rels.remove(&(rel_tag.relnode, rel_tag.forknum)) { + self.pending_directory_entries + .push((DirectoryKind::Rel, MetricsUpdate::Sub(1))); dirty = true; + true + } else if self.tline.get_rel_size_v2_enabled() { + // The rel is not found in the old reldir key, so we need to check the new sparse keyspace. + // Note that a relation can only exist in one of the two keyspaces (guaranteed by the ingestion + // logic). + let key = + rel_tag_sparse_key(spc_node, db_node, rel_tag.relnode, rel_tag.forknum); + let val = RelDirExists::decode_option(self.sparse_get(key, ctx).await?) + .map_err(|_| RelationError::Other(anyhow::anyhow!("invalid reldir key")))?; + if val == RelDirExists::Exists { + self.pending_directory_entries + .push((DirectoryKind::RelV2, MetricsUpdate::Sub(1))); + // put tombstone + self.put(key, Value::Image(RelDirExists::Removed.encode())); + // no need to set dirty to true + true + } else { + false + } + } else { + false + }; + if found { // update logical size let size_key = rel_size_to_key(rel_tag); let old_size = self.get(size_key, ctx).await?.get_u32_le(); @@ -1914,8 +2080,6 @@ impl DatadirModification<'_> { if dirty { self.put(dir_key, Value::Image(Bytes::from(RelDirectory::ser(&dir)?))); - self.pending_directory_entries - .push((DirectoryKind::Rel, dir.rels.len())); } } @@ -1939,8 +2103,10 @@ impl DatadirModification<'_> { if !dir.segments.insert(segno) { anyhow::bail!("slru segment {kind:?}/{segno} already exists"); } - self.pending_directory_entries - .push((DirectoryKind::SlruSegment(kind), dir.segments.len())); + self.pending_directory_entries.push(( + DirectoryKind::SlruSegment(kind), + MetricsUpdate::Set(dir.segments.len() as u64), + )); self.put( dir_key, Value::Image(Bytes::from(SlruSegmentDirectory::ser(&dir)?)), @@ -1987,8 +2153,10 @@ impl DatadirModification<'_> { if !dir.segments.remove(&segno) { warn!("slru segment {:?}/{} does not exist", kind, segno); } - self.pending_directory_entries - .push((DirectoryKind::SlruSegment(kind), dir.segments.len())); + self.pending_directory_entries.push(( + DirectoryKind::SlruSegment(kind), + MetricsUpdate::Set(dir.segments.len() as u64), + )); self.put( dir_key, Value::Image(Bytes::from(SlruSegmentDirectory::ser(&dir)?)), @@ -2020,8 +2188,10 @@ impl DatadirModification<'_> { if !dir.xids.remove(&xid) { warn!("twophase file for xid {} does not exist", xid); } - self.pending_directory_entries - .push((DirectoryKind::TwoPhase, dir.xids.len())); + self.pending_directory_entries.push(( + DirectoryKind::TwoPhase, + MetricsUpdate::Set(dir.xids.len() as u64), + )); Bytes::from(TwoPhaseDirectoryV17::ser(&dir)?) 
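The v2 path above reads and writes per-relation existence markers through `RelDirExists::decode_option` and `RelDirExists::encode`, neither of which appears in this diff. A minimal sketch of how such a marker could work, assuming a one-byte presence value and an empty tombstone that `sparse_get` folds back into `None` (the byte values are illustrative, not the patch's actual encoding):

```rust
use bytes::Bytes;

// Sketch only: the patch's real encoding is not shown in this diff.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RelDirExists {
    Exists,
    Removed,
}

impl RelDirExists {
    pub fn encode(self) -> Bytes {
        match self {
            // Assumed one-byte presence marker.
            Self::Exists => Bytes::from_static(&[1]),
            // Assumed empty tombstone: `sparse_get` folds empty values into
            // `None`, so a dropped relation reads the same as a missing one.
            Self::Removed => Bytes::new(),
        }
    }

    pub fn decode_option(val: Option<Bytes>) -> Result<Self, &'static str> {
        match val.as_deref() {
            None | Some([]) => Ok(Self::Removed),
            Some([1]) => Ok(Self::Exists),
            Some(_) => Err("invalid reldir key"),
        }
    }
}
```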
} else { let xid: u32 = u32::try_from(xid)?; @@ -2030,8 +2200,10 @@ impl DatadirModification<'_> { if !dir.xids.remove(&xid) { warn!("twophase file for xid {} does not exist", xid); } - self.pending_directory_entries - .push((DirectoryKind::TwoPhase, dir.xids.len())); + self.pending_directory_entries.push(( + DirectoryKind::TwoPhase, + MetricsUpdate::Set(dir.xids.len() as u64), + )); Bytes::from(TwoPhaseDirectory::ser(&dir)?) }; self.put(TWOPHASEDIR_KEY, Value::Image(newdirbuf)); @@ -2092,6 +2264,13 @@ impl DatadirModification<'_> { self.tline.aux_file_size_estimator.on_add(content.len()); new_files.push((path, content)); } + // Compute may request deletion of the old version of a pgstat AUX file if the new one exceeds the size limit. + // Compute doesn't know whether the previous version of this file exists, so an + // attempt to delete a non-existing file can cause this message. + // To avoid false alarms, log it as info rather than as a warning. + (None, true) if path.starts_with("pg_stat/") => { + info!("removing non-existing pg_stat file: {}", path) + } (None, true) => warn!("removing non-existing aux file: {}", path), } let new_val = aux_file::encode_file_value(&new_files)?; @@ -2147,7 +2326,7 @@ impl DatadirModification<'_> { } for (kind, count) in std::mem::take(&mut self.pending_directory_entries) { - writer.update_directory_entries_count(kind, count as u64); + writer.update_directory_entries_count(kind, count); } Ok(()) @@ -2233,7 +2412,7 @@ impl DatadirModification<'_> { } for (kind, count) in std::mem::take(&mut self.pending_directory_entries) { - writer.update_directory_entries_count(kind, count as u64); + writer.update_directory_entries_count(kind, count); } self.pending_metadata_bytes = 0; @@ -2297,6 +2476,22 @@ impl DatadirModification<'_> { self.tline.get(key, lsn, ctx).await } + /// Get a key from the sparse keyspace. Automatically converts the missing key error + /// and the empty value into None. + async fn sparse_get( + &self, + key: Key, + ctx: &RequestContext, + ) -> Result<Option<Bytes>, PageReconstructError> { + let val = self.get(key, ctx).await; + match val { + Ok(val) if val.is_empty() => Ok(None), + Ok(val) => Ok(Some(val)), + Err(PageReconstructError::MissingKey(_)) => Ok(None), + Err(e) => Err(e), + } + } + fn put(&mut self, key: Key, val: Value) { if Self::is_data_key(&key) { self.put_data(key.to_compact(), val) @@ -2379,6 +2574,23 @@ impl Version<'_> { } } + /// Get a key from the sparse keyspace. Automatically converts the missing key error + /// and the empty value into None.
+ async fn sparse_get( + &self, + timeline: &Timeline, + key: Key, + ctx: &RequestContext, + ) -> Result<Option<Bytes>, PageReconstructError> { + let val = self.get(timeline, key, ctx).await; + match val { + Ok(val) if val.is_empty() => Ok(None), + Ok(val) => Ok(Some(val)), + Err(PageReconstructError::MissingKey(_)) => Ok(None), + Err(e) => Err(e), + } + } + fn get_lsn(&self) -> Lsn { match self { Version::Lsn(lsn) => *lsn, @@ -2438,6 +2650,7 @@ pub(crate) enum DirectoryKind { Rel, AuxFiles, SlruSegment(SlruKind), + RelV2, } impl DirectoryKind { diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 2b6ed0e9a2..efb35625f2 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3101,6 +3101,9 @@ impl Tenant { if let Some(queue) = queue { outcome = queue .iteration(cancel, ctx, &self.gc_block, &timeline) + .instrument( + info_span!("gc_compact_timeline", timeline_id = %timeline.timeline_id), + ) .await?; } } @@ -3930,6 +3933,13 @@ impl Tenant { .unwrap_or(self.conf.default_tenant_conf.compaction_threshold) } + pub fn get_rel_size_v2_enabled(&self) -> bool { + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); + tenant_conf + .rel_size_v2_enabled + .unwrap_or(self.conf.default_tenant_conf.rel_size_v2_enabled) + } + pub fn get_compaction_upper_limit(&self) -> usize { let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf @@ -5646,7 +5656,7 @@ pub(crate) mod harness { lsn_lease_length_for_ts: Some(tenant_conf.lsn_lease_length_for_ts), timeline_offloading: Some(tenant_conf.timeline_offloading), wal_receiver_protocol_override: tenant_conf.wal_receiver_protocol_override, - rel_size_v2_enabled: tenant_conf.rel_size_v2_enabled, + rel_size_v2_enabled: Some(tenant_conf.rel_size_v2_enabled), gc_compaction_enabled: Some(tenant_conf.gc_compaction_enabled), gc_compaction_initial_threshold_kb: Some( tenant_conf.gc_compaction_initial_threshold_kb, diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 7fdfd736ad..ab4c4c935d 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -485,7 +485,9 @@ impl TenantConfOpt { wal_receiver_protocol_override: self .wal_receiver_protocol_override .or(global_conf.wal_receiver_protocol_override), - rel_size_v2_enabled: self.rel_size_v2_enabled.or(global_conf.rel_size_v2_enabled), + rel_size_v2_enabled: self + .rel_size_v2_enabled + .unwrap_or(global_conf.rel_size_v2_enabled), gc_compaction_enabled: self .gc_compaction_enabled .unwrap_or(global_conf.gc_compaction_enabled), @@ -691,16 +693,15 @@ impl TryFrom<&'_ models::TenantConfig> for TenantConfOpt { /// This is a conversion from our internal tenant config object to the one used /// in external APIs. impl From<TenantConfOpt> for models::TenantConfig { + // TODO(vlad): These are now the same, but they have different serialization logic. + // Can we merge them?
fn from(value: TenantConfOpt) -> Self { - fn humantime(d: Duration) -> String { - format!("{}s", d.as_secs()) - } Self { checkpoint_distance: value.checkpoint_distance, - checkpoint_timeout: value.checkpoint_timeout.map(humantime), + checkpoint_timeout: value.checkpoint_timeout, compaction_algorithm: value.compaction_algorithm, compaction_target_size: value.compaction_target_size, - compaction_period: value.compaction_period.map(humantime), + compaction_period: value.compaction_period, compaction_threshold: value.compaction_threshold, compaction_upper_limit: value.compaction_upper_limit, compaction_l0_first: value.compaction_l0_first, @@ -709,24 +710,23 @@ impl From<TenantConfOpt> for models::TenantConfig { l0_flush_stall_threshold: value.l0_flush_stall_threshold, l0_flush_wait_upload: value.l0_flush_wait_upload, gc_horizon: value.gc_horizon, - gc_period: value.gc_period.map(humantime), + gc_period: value.gc_period, image_creation_threshold: value.image_creation_threshold, - pitr_interval: value.pitr_interval.map(humantime), - walreceiver_connect_timeout: value.walreceiver_connect_timeout.map(humantime), - lagging_wal_timeout: value.lagging_wal_timeout.map(humantime), + pitr_interval: value.pitr_interval, + walreceiver_connect_timeout: value.walreceiver_connect_timeout, + lagging_wal_timeout: value.lagging_wal_timeout, max_lsn_wal_lag: value.max_lsn_wal_lag, eviction_policy: value.eviction_policy, min_resident_size_override: value.min_resident_size_override, evictions_low_residence_duration_metric_threshold: value - .evictions_low_residence_duration_metric_threshold - .map(humantime), - heatmap_period: value.heatmap_period.map(humantime), + .evictions_low_residence_duration_metric_threshold, + heatmap_period: value.heatmap_period, lazy_slru_download: value.lazy_slru_download, timeline_get_throttle: value.timeline_get_throttle, image_layer_creation_check_threshold: value.image_layer_creation_check_threshold, image_creation_preempt_threshold: value.image_creation_preempt_threshold, - lsn_lease_length: value.lsn_lease_length.map(humantime), - lsn_lease_length_for_ts: value.lsn_lease_length_for_ts.map(humantime), + lsn_lease_length: value.lsn_lease_length, + lsn_lease_length_for_ts: value.lsn_lease_length_for_ts, timeline_offloading: value.timeline_offloading, wal_receiver_protocol_override: value.wal_receiver_protocol_override, rel_size_v2_enabled: value.rel_size_v2_enabled, @@ -758,29 +758,10 @@ mod tests { assert_eq!(small_conf, serde_json::from_str(&json_form).unwrap()); } - #[test] - fn test_try_from_models_tenant_config_err() { - let tenant_config = models::TenantConfig { - lagging_wal_timeout: Some("5a".to_string()), - ..TenantConfig::default() - }; - - let tenant_conf_opt = TenantConfOpt::try_from(&tenant_config); - - assert!( - tenant_conf_opt.is_err(), - "Suceeded to convert TenantConfig to TenantConfOpt" - ); - - let expected_error_str = - "lagging_wal_timeout: invalid value: string \"5a\", expected a duration"; - assert_eq!(tenant_conf_opt.unwrap_err().to_string(), expected_error_str); - } - #[test] fn test_try_from_models_tenant_config_success() { let tenant_config = models::TenantConfig { - lagging_wal_timeout: Some("5s".to_string()), + lagging_wal_timeout: Some(Duration::from_secs(5)), ..TenantConfig::default() }; diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 885c50425f..7ba0e3679f 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -51,8 +51,7 @@
use camino::{Utf8Path, Utf8PathBuf}; use futures::StreamExt; use itertools::Itertools; use pageserver_api::config::MaxVectoredReadBytes; -use pageserver_api::key::DBDIR_KEY; -use pageserver_api::key::{Key, KEY_SIZE}; +use pageserver_api::key::{Key, DBDIR_KEY, KEY_SIZE}; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::ImageCompressionAlgorithm; use pageserver_api::shard::TenantShardId; @@ -967,7 +966,10 @@ impl DeltaLayerInner { .as_slice() .iter() .filter_map(|(_, blob_meta)| { - if blob_meta.key.is_rel_dir_key() || blob_meta.key == DBDIR_KEY { + if blob_meta.key.is_rel_dir_key() + || blob_meta.key == DBDIR_KEY + || blob_meta.key.is_aux_file_key() + { // The size of values for these keys is unbounded and can // grow very large in pathological cases. None diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index c49281dc45..dc611bd6e1 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -48,8 +48,7 @@ use camino::{Utf8Path, Utf8PathBuf}; use hex; use itertools::Itertools; use pageserver_api::config::MaxVectoredReadBytes; -use pageserver_api::key::DBDIR_KEY; -use pageserver_api::key::{Key, KEY_SIZE}; +use pageserver_api::key::{Key, DBDIR_KEY, KEY_SIZE}; use pageserver_api::keyspace::KeySpace; use pageserver_api::shard::{ShardIdentity, TenantShardId}; use pageserver_api::value::Value; @@ -603,7 +602,10 @@ impl ImageLayerInner { .as_slice() .iter() .filter_map(|(_, blob_meta)| { - if blob_meta.key.is_rel_dir_key() || blob_meta.key == DBDIR_KEY { + if blob_meta.key.is_rel_dir_key() + || blob_meta.key == DBDIR_KEY + || blob_meta.key.is_aux_file_key() + { // The size of values for these keys is unbounded and can // grow very large in pathological cases. None diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 2ed4bcb03e..30de4d90dc 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -4,6 +4,7 @@ pub mod delete; pub(crate) mod detach_ancestor; mod eviction_task; pub(crate) mod handle; +mod heatmap_layers_downloader; pub(crate) mod import_pgdata; mod init; pub mod layer_manager; @@ -21,6 +22,7 @@ use chrono::{DateTime, Utc}; use compaction::CompactionOutcome; use enumset::EnumSet; use fail::fail_point; +use futures::FutureExt; use futures::{stream::FuturesUnordered, StreamExt}; use handle::ShardTimelineId; use layer_manager::Shutdown; @@ -117,7 +119,7 @@ use pageserver_api::config::tenant_conf_defaults::DEFAULT_PITR_INTERVAL; use crate::config::PageServerConf; use crate::keyspace::{KeyPartitioning, KeySpace}; use crate::metrics::{TimelineMetrics, DELTAS_PER_READ_GLOBAL, LAYERS_PER_READ_GLOBAL}; -use crate::pgdatadir_mapping::CalculateLogicalSizeError; +use crate::pgdatadir_mapping::{CalculateLogicalSizeError, MetricsUpdate}; use crate::tenant::config::TenantConfOpt; use pageserver_api::reltag::RelTag; use pageserver_api::shard::ShardIndex; @@ -327,6 +329,7 @@ pub struct Timeline { // in `crate::page_service` writes these metrics. 
pub(crate) query_metrics: crate::metrics::SmgrQueryTimePerTimeline, + directory_metrics_inited: [AtomicBool; DirectoryKind::KINDS_NUM], directory_metrics: [AtomicU64; DirectoryKind::KINDS_NUM], /// Ensures layers aren't frozen by checkpointer between @@ -466,6 +469,10 @@ pub struct Timeline { pub(crate) page_trace: ArcSwapOption>, previous_heatmap: ArcSwapOption<PreviousHeatmap>, + + /// May host a background Tokio task which downloads all the layers from the current + /// heatmap on demand. + heatmap_layers_downloader: Mutex<Option<heatmap_layers_downloader::HeatmapLayersDownloader>>, } pub(crate) enum PreviousHeatmap { @@ -1292,7 +1299,7 @@ impl Timeline { reconstruct_state: &mut ValuesReconstructState, ctx: &RequestContext, ) -> Result>, GetVectoredError> { - let read_path = if self.conf.enable_read_path_debugging { + let read_path = if self.conf.enable_read_path_debugging || ctx.read_path_debug() { Some(ReadPath::new(keyspace.clone(), lsn)) } else { None @@ -2038,6 +2045,11 @@ impl Timeline { tracing::debug!("Cancelling CancellationToken"); self.cancel.cancel(); + // If we have a background task downloading heatmap layers, stop it. + // The background downloads are sensitive to timeline cancellation (done above), + // so the drain will be immediate. + self.stop_and_drain_heatmap_layers_download().await; + // Prevent new page service requests from starting. self.handles.shutdown(); @@ -2355,6 +2367,14 @@ impl Timeline { .unwrap_or(self.conf.default_tenant_conf.compaction_threshold) } + pub(crate) fn get_rel_size_v2_enabled(&self) -> bool { + let tenant_conf = self.tenant_conf.load(); + tenant_conf + .tenant_conf + .rel_size_v2_enabled + .unwrap_or(self.conf.default_tenant_conf.rel_size_v2_enabled) + } + fn get_compaction_upper_limit(&self) -> usize { let tenant_conf = self.tenant_conf.load(); tenant_conf @@ -2664,6 +2684,7 @@ impl Timeline { ), directory_metrics: array::from_fn(|_| AtomicU64::new(0)), + directory_metrics_inited: array::from_fn(|_| AtomicBool::new(false)), flush_loop_state: Mutex::new(FlushLoopState::NotStarted), @@ -2742,6 +2763,8 @@ impl Timeline { page_trace: Default::default(), previous_heatmap: ArcSwapOption::from_pointee(previous_heatmap), + + heatmap_layers_downloader: Mutex::new(None), }; result.repartition_threshold = @@ -2851,6 +2874,7 @@ impl Timeline { auth_token: crate::config::SAFEKEEPER_AUTH_TOKEN.get().cloned(), availability_zone: self.conf.availability_zone.clone(), ingest_batch_size: self.conf.ingest_batch_size, + validate_wal_contiguity: self.conf.validate_wal_contiguity, }, broker_client, ctx, @@ -3430,8 +3454,42 @@ impl Timeline { } } - pub(crate) fn update_directory_entries_count(&self, kind: DirectoryKind, count: u64) { - self.directory_metrics[kind.offset()].store(count, AtomicOrdering::Relaxed); + pub(crate) fn update_directory_entries_count(&self, kind: DirectoryKind, count: MetricsUpdate) { + // TODO: this directory metric is not correct -- we could have multiple reldirs in the system, + // one for each database, but we only store one value, and therefore each DatadirModification + // would overwrite the previous value if they modify different databases. + + match count { + MetricsUpdate::Set(count) => { + self.directory_metrics[kind.offset()].store(count, AtomicOrdering::Relaxed); + self.directory_metrics_inited[kind.offset()].store(true, AtomicOrdering::Relaxed); + } + MetricsUpdate::Add(count) => { + // TODO: these operations are not atomic, but we only have one writer to the metrics, so + // it's fine.
+ if self.directory_metrics_inited[kind.offset()].load(AtomicOrdering::Relaxed) { + // The metric has been initialized with `MetricsUpdate::Set` before, so we can add/sub + // the value reliably. + self.directory_metrics[kind.offset()].fetch_add(count, AtomicOrdering::Relaxed); + } + // Otherwise, ignore this update + } + MetricsUpdate::Sub(count) => { + // TODO: these operations are not atomic, but we only have one writer to the metrics, so + // it's fine. + if self.directory_metrics_inited[kind.offset()].load(AtomicOrdering::Relaxed) { + // The metric has been initialized with `MetricsUpdate::Set` before. + // The operation could overflow so we need to normalize the value. + let prev_val = + self.directory_metrics[kind.offset()].load(AtomicOrdering::Relaxed); + let res = prev_val.saturating_sub(count); + self.directory_metrics[kind.offset()].store(res, AtomicOrdering::Relaxed); + } + // Otherwise, ignore this update + } + }; + + // TODO: remove this, there's no place in the code that updates this aux metric. let aux_metric = self.directory_metrics[DirectoryKind::AuxFiles.offset()].load(AtomicOrdering::Relaxed); @@ -3649,7 +3707,9 @@ impl Timeline { // space. If that's not the case, we had at least one key encounter a gap in the image layer // and stop the search as a result of that. let mut removed = keyspace.remove_overlapping_with(&image_covered_keyspace); - // Do not fire missing key error for sparse keys. + // Do not fire missing key error and end early for sparse keys. Note that we have already removed + // non-inherited keyspaces before, so we can safely do a full `SPARSE_RANGE` remove instead of + // figuring out the inherited key range and doing fine-grained pruning. removed.remove_overlapping_with(&KeySpace { ranges: vec![SPARSE_RANGE], }); @@ -5070,20 +5130,26 @@ impl Timeline { // image layer generation taking too long and blocking L0 compaction. So in this // mode, we also inspect the current number of L0 layers and skip image layer generation // if there are too many of them. - let num_of_l0_layers = { - let layers = self.layers.read().await; - layers.layer_map()?.level0_deltas().len() - }; let image_preempt_threshold = self.get_image_creation_preempt_threshold() * self.get_compaction_threshold(); - if image_preempt_threshold != 0 && num_of_l0_layers >= image_preempt_threshold { - tracing::info!( - "preempt image layer generation at {lsn} when processing partition {}..{}: too many L0 layers {}", - partition.start().unwrap(), partition.end().unwrap(), num_of_l0_layers - ); - last_partition_processed = Some(partition.clone()); - all_generated = false; - break; + // TODO: currently we do not respect `get_image_creation_preempt_threshold` and always yield + // when there is a single timeline with more L0 layers than the L0 threshold. As long as the + // `get_image_creation_preempt_threshold` is set to a value greater than 0, we will yield for L0 compaction.
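The preemption check introduced just below polls `l0_compaction_trigger` without blocking. Assuming the trigger is a `tokio::sync::Notify` (which the `notified()` call suggests), the pattern in isolation:

```rust
use futures::FutureExt; // now_or_never()
use tokio::sync::Notify;

/// Returns true iff `notify_one` has stored a permit since the last check.
/// Polling the `Notified` future exactly once consumes the permit without
/// ever waiting, which makes it safe to call inside a hot loop.
fn pending_signal(trigger: &Notify) -> bool {
    trigger.notified().now_or_never().is_some()
}

#[tokio::main]
async fn main() {
    let trigger = Notify::new();
    assert!(!pending_signal(&trigger));
    trigger.notify_one();
    assert!(pending_signal(&trigger)); // permit consumed here
    assert!(!pending_signal(&trigger));
}
```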
+ if image_preempt_threshold != 0 { + let should_yield = self + .l0_compaction_trigger + .notified() + .now_or_never() + .is_some(); + if should_yield { + tracing::info!( + "preempt image layer generation at {lsn} when processing partition {}..{}: too many L0 layers", + partition.start().unwrap(), partition.end().unwrap() + ); + last_partition_processed = Some(partition.clone()); + all_generated = false; + break; + } } } } @@ -5112,14 +5178,16 @@ impl Timeline { .map(|l| l.metadata().file_size) .sum::<u64>(); - info!( - "created {} image layers ({} bytes) in {}s, processed {} out of {} partitions", - image_layers.len(), - total_layer_size, - duration.as_secs_f64(), - partition_processed, - total_partitions - ); + if !image_layers.is_empty() { + info!( + "created {} image layers ({} bytes) in {}s, processed {} out of {} partitions", + image_layers.len(), + total_layer_size, + duration.as_secs_f64(), + partition_processed, + total_partitions + ); + } Ok(( image_layers, @@ -5277,12 +5345,6 @@ impl Timeline { } } -impl CompactionError { - pub fn is_cancelled(&self) -> bool { - matches!(self, CompactionError::ShuttingDown) - } -} - impl From<CollectKeySpaceError> for CompactionError { fn from(err: CollectKeySpaceError) -> Self { match err { @@ -6547,7 +6609,7 @@ impl TimelineWriter<'_> { if let Some(wait_threshold) = wait_threshold { if l0_count >= wait_threshold { - info!("layer roll waiting for flush due to compaction backpressure at {l0_count} L0 layers"); + debug!("layer roll waiting for flush due to compaction backpressure at {l0_count} L0 layers"); self.tl.wait_flush_completion(flush_id).await?; } } diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 1b25332212..0361ce8cd1 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -26,7 +26,7 @@ use pageserver_api::models::CompactInfoResponse; use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId}; use serde::Serialize; use tokio_util::sync::CancellationToken; -use tracing::{debug, info, info_span, trace, warn, Instrument}; +use tracing::{debug, error, info, info_span, trace, warn, Instrument}; use utils::critical; use utils::id::TimelineId; @@ -303,18 +303,12 @@ impl GcCompactionQueue { let mut guard = self.inner.lock().unwrap(); guard.gc_guards.insert(id, gc_guard); } - let _ = timeline - .compact_with_options(cancel, options, ctx) - .instrument(info_span!("scheduled_compact_timeline", %timeline.timeline_id)) - .await?; + let _ = timeline.compact_with_options(cancel, options, ctx).await?; self.notify_and_unblock(id); } } GcCompactionQueueItem::SubCompactionJob(options) => { - let _ = timeline - .compact_with_options(cancel, options, ctx) - .instrument(info_span!("scheduled_compact_timeline", %timeline.timeline_id)) - .await?; + let _ = timeline.compact_with_options(cancel, options, ctx).await?; } GcCompactionQueueItem::Notify(id) => { self.notify_and_unblock(id); } @@ -781,27 +775,25 @@ impl Timeline { return Ok(CompactionOutcome::YieldForL0); } } - Err(err) => { - // no partitioning? This is normal, if the timeline was just created - // as an empty timeline. Also in unit tests, when we use the timeline - // as a simple key-value store, ignoring the datadir layout. Log the - // error but continue.
- // - // Suppress error when it's due to cancellation - if !self.cancel.is_cancelled() && !err.is_cancelled() { - if let CompactionError::CollectKeySpaceError( - CollectKeySpaceError::Decode(_) - | CollectKeySpaceError::PageRead(PageReconstructError::MissingKey(_)), - ) = err - { - critical!("could not compact, repartitioning keyspace failed: {err:?}"); - } else { - tracing::error!( - "could not compact, repartitioning keyspace failed: {err:?}" - ); - } - } - } + + // Suppress errors when cancelled. + Err(_) if self.cancel.is_cancelled() => {} + Err(CompactionError::ShuttingDown) => {} + + // Alert on critical errors that indicate data corruption. + Err( + err @ CompactionError::CollectKeySpaceError( + CollectKeySpaceError::Decode(_) + | CollectKeySpaceError::PageRead( + PageReconstructError::MissingKey(_) | PageReconstructError::WalRedo(_), + ), + ), + ) => critical!("could not compact, repartitioning keyspace failed: {err:?}"), + + // Log other errors. No partitioning? This is normal if the timeline was just created + // as an empty timeline. Also in unit tests, when we use the timeline as a simple + // key-value store, ignoring the datadir layout. Log the error but continue. + Err(err) => error!("could not compact, repartitioning keyspace failed: {err:?}"), }; let partition_count = self.partitioning.read().0 .0.parts.len(); @@ -2220,7 +2212,7 @@ impl Timeline { let sub_compaction_max_job_size_mb = sub_compaction_max_job_size_mb.unwrap_or(GC_COMPACT_MAX_SIZE_MB); - let mut compact_jobs = Vec::new(); + let mut compact_jobs = Vec::<GcCompactJob>::new(); // For now, we simply use the key partitioning information; we should do a more fine-grained partitioning // by estimating the amount of files read for a compaction job. We should also partition on LSN. let ((dense_ks, sparse_ks), _) = self.partitioning.read().as_ref().clone(); @@ -2307,16 +2299,25 @@ impl Timeline { } else { end }; - info!( - "splitting compaction job: {}..{}, estimated_size={}", - start, end, total_size - ); - compact_jobs.push(GcCompactJob { - dry_run: job.dry_run, - compact_key_range: start..end, - compact_lsn_range: job.compact_lsn_range.start..compact_below_lsn, - }); - current_start = Some(end); + if total_size == 0 && !compact_jobs.is_empty() { + info!( + "splitting compaction job: {}..{}, estimated_size={}, extending the previous job", + start, end, total_size + ); + compact_jobs.last_mut().unwrap().compact_key_range.end = end; + current_start = Some(end); + } else { + info!( + "splitting compaction job: {}..{}, estimated_size={}", + start, end, total_size + ); + compact_jobs.push(GcCompactJob { + dry_run: job.dry_run, + compact_key_range: start..end, + compact_lsn_range: job.compact_lsn_range.start..compact_below_lsn, + }); + current_start = Some(end); + } } } Ok(compact_jobs) diff --git a/pageserver/src/tenant/timeline/heatmap_layers_downloader.rs b/pageserver/src/tenant/timeline/heatmap_layers_downloader.rs new file mode 100644 index 0000000000..0ba9753e85 --- /dev/null +++ b/pageserver/src/tenant/timeline/heatmap_layers_downloader.rs @@ -0,0 +1,162 @@ +//! Timeline utility module to hydrate everything from the current heatmap. +//! +//! Provides utilities to spawn and abort a background task where the downloads happen. +//! See /v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_heatmap_layers.
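Before the module body, the concurrency pattern it is built around, shown in isolation: a stream of download futures driven at most `concurrency` at a time. The names and the fake download below are illustrative; only the `stream::iter(..).buffered(n)` plus `collect::<()>()` shape is taken from the module.

```rust
use futures::{stream, StreamExt};

/// Run `jobs` with at most `concurrency` in flight at once, ignoring results.
async fn run_buffered(jobs: Vec<String>, concurrency: usize) {
    stream::iter(jobs.into_iter().map(|name| async move {
        // A real caller would download a layer here; we just log.
        println!("downloading {name}");
    }))
    .buffered(concurrency)
    .collect::<()>()
    .await;
}

#[tokio::main]
async fn main() {
    run_buffered((0..8).map(|i| format!("layer-{i}")).collect(), 3).await;
}
```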
+ +use futures::StreamExt; +use http_utils::error::ApiError; +use std::sync::{Arc, Mutex}; +use tokio_util::sync::CancellationToken; +use utils::sync::gate::Gate; + +use super::Timeline; + +// This status is not strictly necessary now, but gives us a nice place +// to store progress information if we ever wish to expose it. +pub(super) enum HeatmapLayersDownloadStatus { + InProgress, + Complete, +} + +pub(super) struct HeatmapLayersDownloader { + handle: tokio::task::JoinHandle<()>, + status: Arc<Mutex<HeatmapLayersDownloadStatus>>, + cancel: CancellationToken, + downloads_guard: Arc<Gate>, +} + +impl HeatmapLayersDownloader { + fn new( + timeline: Arc<Timeline>, + concurrency: usize, + ) -> Result<Self, ApiError> { + let tl_guard = timeline.gate.enter().map_err(|_| ApiError::Cancelled)?; + + let cancel = timeline.cancel.child_token(); + let downloads_guard = Arc::new(Gate::default()); + + let status = Arc::new(Mutex::new(HeatmapLayersDownloadStatus::InProgress)); + + let handle = tokio::task::spawn({ + let status = status.clone(); + let downloads_guard = downloads_guard.clone(); + let cancel = cancel.clone(); + + async move { + let _guard = tl_guard; + + scopeguard::defer! { + *status.lock().unwrap() = HeatmapLayersDownloadStatus::Complete; + } + + let Some(heatmap) = timeline.generate_heatmap().await else { + tracing::info!("Heatmap layers download failed to generate heatmap"); + return; + }; + + tracing::info!( + resident_size=%timeline.resident_physical_size(), + heatmap_layers=%heatmap.layers.len(), + "Starting heatmap layers download" + ); + + let stream = futures::stream::iter(heatmap.layers.into_iter().filter_map( + |layer| { + let tl = timeline.clone(); + let dl_guard = match downloads_guard.enter() { + Ok(g) => g, + Err(_) => { + // [`Self::shutdown`] was called. Don't spawn any more downloads. + return None; + } + }; + + Some(async move { + let _dl_guard = dl_guard; + + let res = tl.download_layer(&layer.name).await; + if let Err(err) = res { + if !err.is_cancelled() { + tracing::warn!(layer=%layer.name,"Failed to download heatmap layer: {err}") + } + } + }) + } + )).buffered(concurrency); + + tokio::select! { + _ = stream.collect::<()>() => { + tracing::info!( + resident_size=%timeline.resident_physical_size(), + "Heatmap layers download completed" + ); + }, + _ = cancel.cancelled() => { + tracing::info!("Heatmap layers download cancelled"); + } + } + } + }); + + Ok(Self { + status, + handle, + cancel, + downloads_guard, + }) + } + + fn is_complete(&self) -> bool { + matches!( + *self.status.lock().unwrap(), + HeatmapLayersDownloadStatus::Complete + ) + } + + /// Drive any in-progress downloads to completion and stop spawning any new ones. + /// + /// This has two callers and they behave differently: + /// 1. [`Timeline::shutdown`]: the drain will be immediate since downloads themselves + /// are sensitive to timeline cancellation. + /// + /// 2. Endpoint handler in [`crate::http::routes`]: the drain will wait for any in-progress + /// downloads to complete. + async fn stop_and_drain(self) { + // Counterintuitive: close the guard before cancelling. + // Something needs to poll the already created download futures to completion. + // If we cancel first, then the underlying task exits and we lose + // the poller.
+ self.downloads_guard.close().await; + self.cancel.cancel(); + if let Err(err) = self.handle.await { + tracing::warn!("Failed to join heatmap layer downloader task: {err}"); + } + } +} + +impl Timeline { + pub(crate) async fn start_heatmap_layers_download( + self: &Arc<Self>, + concurrency: usize, + ) -> Result<(), ApiError> { + let mut locked = self.heatmap_layers_downloader.lock().unwrap(); + if locked.as_ref().map(|dl| dl.is_complete()).unwrap_or(true) { + let dl = HeatmapLayersDownloader::new(self.clone(), concurrency)?; + *locked = Some(dl); + Ok(()) + } else { + Err(ApiError::Conflict("Already running".to_string())) + } + } + + pub(crate) async fn stop_and_drain_heatmap_layers_download(&self) { + // This can race with the start of a new downloader and lead to a situation + // where one downloader is shutting down and another one is in-flight. + // The only impact is that we'd end up using more remote storage semaphore + // units than expected. + let downloader = self.heatmap_layers_downloader.lock().unwrap().take(); + if let Some(dl) = downloader { + dl.stop_and_drain().await; + } + } +} diff --git a/pageserver/src/tenant/timeline/offload.rs b/pageserver/src/tenant/timeline/offload.rs index 3b5bf8290c..93e5a1100d 100644 --- a/pageserver/src/tenant/timeline/offload.rs +++ b/pageserver/src/tenant/timeline/offload.rs @@ -7,7 +7,9 @@ use super::Timeline; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::remote_timeline_client::ShutdownIfArchivedError; use crate::tenant::timeline::delete::{make_timeline_delete_guard, TimelineDeleteGuardKind}; -use crate::tenant::{OffloadedTimeline, Tenant, TenantManifestError, TimelineOrOffloaded}; +use crate::tenant::{ + DeleteTimelineError, OffloadedTimeline, Tenant, TenantManifestError, TimelineOrOffloaded, +}; #[derive(thiserror::Error, Debug)] pub(crate) enum OffloadError { @@ -37,12 +39,25 @@ pub(crate) async fn offload_timeline( debug_assert_current_span_has_tenant_and_timeline_id(); tracing::info!("offloading archived timeline"); - let (timeline, guard) = make_timeline_delete_guard( + let delete_guard_res = make_timeline_delete_guard( tenant, timeline.timeline_id, TimelineDeleteGuardKind::Offload, - ) - .map_err(|e| OffloadError::Other(anyhow::anyhow!(e)))?; + ); + if let Err(DeleteTimelineError::HasChildren(children)) = delete_guard_res { + let is_archived = timeline.is_archived(); + if is_archived == Some(true) { + tracing::error!("timeline is archived but has non-archived children: {children:?}"); + return Err(OffloadError::NotArchived); + } + tracing::info!( + ?is_archived, + "timeline is not archived and has unarchived children" + ); + return Err(OffloadError::NotArchived); + }; + let (timeline, guard) = + delete_guard_res.map_err(|e| OffloadError::Other(anyhow::anyhow!(e)))?; let TimelineOrOffloaded::Timeline(timeline) = timeline else { tracing::error!("timeline already offloaded, but given timeline object"); diff --git a/pageserver/src/tenant/timeline/walreceiver.rs b/pageserver/src/tenant/timeline/walreceiver.rs index f831f5e48a..67429bff98 100644 --- a/pageserver/src/tenant/timeline/walreceiver.rs +++ b/pageserver/src/tenant/timeline/walreceiver.rs @@ -56,6 +56,7 @@ pub struct WalReceiverConf { pub auth_token: Option<Arc<String>>, pub availability_zone: Option<String>, pub ingest_batch_size: u64, + pub validate_wal_contiguity: bool, } pub struct WalReceiver { diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index
65f9d39078..1955345315 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -537,6 +537,7 @@ impl ConnectionManagerState { let connect_timeout = self.conf.wal_connect_timeout; let ingest_batch_size = self.conf.ingest_batch_size; let protocol = self.conf.protocol; + let validate_wal_contiguity = self.conf.validate_wal_contiguity; let timeline = Arc::clone(&self.timeline); let ctx = ctx.detached_child( TaskKind::WalReceiverConnectionHandler, @@ -558,6 +559,7 @@ impl ConnectionManagerState { ctx, node_id, ingest_batch_size, + validate_wal_contiguity, ) .await; @@ -1563,6 +1565,7 @@ mod tests { auth_token: None, availability_zone: None, ingest_batch_size: 1, + validate_wal_contiguity: false, }, wal_connection: None, wal_stream_candidates: HashMap::new(), diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 23db4f88d2..ff05a8f902 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -120,6 +120,7 @@ pub(super) async fn handle_walreceiver_connection( ctx: RequestContext, safekeeper_node: NodeId, ingest_batch_size: u64, + validate_wal_contiguity: bool, ) -> Result<(), WalReceiverError> { debug_assert_current_span_has_tenant_and_timeline_id(); @@ -274,6 +275,7 @@ pub(super) async fn handle_walreceiver_connection( } => Some((format, compression)), }; + let mut expected_wal_start = startpoint; while let Some(replication_message) = { select! { _ = cancellation.cancelled() => { @@ -340,13 +342,49 @@ pub(super) async fn handle_walreceiver_connection( ) })?; + // Guard against WAL gaps. If the start LSN of the PG WAL section + // from which the interpreted records were extracted doesn't match + // the end of the previous batch (or the starting point for the first batch), + // then kill this WAL receiver connection and start a new one. + if validate_wal_contiguity { + if let Some(raw_wal_start_lsn) = batch.raw_wal_start_lsn { + match raw_wal_start_lsn.cmp(&expected_wal_start) { + std::cmp::Ordering::Greater => { + let msg = format!( + "Gap in streamed WAL: [{}, {})", + expected_wal_start, raw_wal_start_lsn + ); + critical!("{msg}"); + return Err(WalReceiverError::Other(anyhow!(msg))); + } + std::cmp::Ordering::Less => { + // Other shards are reading WAL behind us. + // This is valid, but check that we received records + // that we haven't seen before. + if let Some(first_rec) = batch.records.first() { + if first_rec.next_record_lsn < last_rec_lsn { + let msg = format!( + "Received record with next_record_lsn multiple times ({} < {})", + first_rec.next_record_lsn, last_rec_lsn + ); + critical!("{msg}"); + return Err(WalReceiverError::Other(anyhow!(msg))); + } + } } + std::cmp::Ordering::Equal => {} } } } + let InterpretedWalRecords { records, next_record_lsn, + raw_wal_start_lsn: _, } = batch; tracing::debug!( - "Received WAL up to {} with next_record_lsn={:?}", + "Received WAL up to {} with next_record_lsn={}", streaming_lsn, next_record_lsn ); @@ -423,12 +461,11 @@ pub(super) async fn handle_walreceiver_connection( // need to advance last record LSN on all shards. If we've not ingested the latest // record, then set the LSN of the modification past it. This way all shards // advance their last record LSN at the same time.
- let needs_last_record_lsn_advance = match next_record_lsn { - Some(lsn) if lsn > modification.get_lsn() => { - modification.set_lsn(lsn).unwrap(); - true - } - _ => false, + let needs_last_record_lsn_advance = if next_record_lsn > modification.get_lsn() { + modification.set_lsn(next_record_lsn).unwrap(); + true + } else { + false }; if uncommitted_records > 0 || needs_last_record_lsn_advance { @@ -446,9 +483,8 @@ pub(super) async fn handle_walreceiver_connection( timeline.get_last_record_lsn() ); - if let Some(lsn) = next_record_lsn { - last_rec_lsn = lsn; - } + last_rec_lsn = next_record_lsn; + expected_wal_start = streaming_lsn; Some(streaming_lsn) } diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 04edb3e3f4..45c87353a7 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -1180,6 +1180,50 @@ impl WalIngest { } else { cp.oldestActiveXid = xlog_checkpoint.oldestActiveXid; } + // NB: We abuse the Checkpoint.redo field: + // + // - In PostgreSQL, the Checkpoint struct doesn't store the information + // of whether this is an online checkpoint or a shutdown checkpoint. It's + // stored in the XLOG info field of the WAL record, shutdown checkpoints + // use record type XLOG_CHECKPOINT_SHUTDOWN and online checkpoints use + // XLOG_CHECKPOINT_ONLINE. We don't store the original WAL record headers + // in the pageserver, however. + // + // - In PostgreSQL, the Checkpoint.redo field stores the *start* of the + // checkpoint record, if it's a shutdown checkpoint. But when we are + // starting from a shutdown checkpoint, the basebackup LSN is the *end* + // of the shutdown checkpoint WAL record. That makes it difficult to + // correctly detect whether we're starting from a shutdown record or + // not. + // + // To address both of those issues, we store 0 in the redo field if it's + // an online checkpoint record, and the record's *end* LSN if it's a + // shutdown checkpoint. We don't need the original redo pointer in neon, + // because we don't perform WAL replay at startup anyway, so we can get + // away with abusing the redo field like this. + // + // XXX: Ideally, we would persist the extra information in a more + // explicit format, rather than repurpose the fields of the Postgres + // struct like this. However, we already have persisted data like this, + // so we need to maintain backwards compatibility. + // + // NB: We didn't originally have this convention, so there are still old + // persisted records that didn't do this. Before, we didn't update the + // persisted redo field at all. That means that old records have a bogus + // redo pointer that points to some old value, from the checkpoint record + // that was originally imported from the data directory. If it was a + // project created in Neon, that means it points to the first checkpoint + // after initdb. That's OK for our purposes: all such old checkpoints are + // treated as old online checkpoints when the basebackup is created. + cp.redo = if info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN { + // Store the *end* LSN of the checkpoint record. Or to be precise, + // the start LSN of the *next* record, i.e. if the record ends + // exactly at page boundary, the redo LSN points to just after the + // page header on the next page. + lsn.into() + } else { + Lsn::INVALID.into() + }; // Write a new checkpoint key-value pair on every checkpoint record, even // if nothing really changed. 
Not strictly required, but it seems nice to diff --git a/pgxn/neon/extension_server.c b/pgxn/neon/extension_server.c index 6e558c433a..0331f961b4 100644 --- a/pgxn/neon/extension_server.c +++ b/pgxn/neon/extension_server.c @@ -18,6 +18,8 @@ #include "neon_utils.h" static int extension_server_port = 0; +static int extension_server_request_timeout = 60; +static int extension_server_connect_timeout = 60; static download_extension_file_hook_type prev_download_extension_file_hook = NULL; @@ -34,19 +36,18 @@ static download_extension_file_hook_type prev_download_extension_file_hook = NUL static bool neon_download_extension_file_http(const char *filename, bool is_library) { - static CURL *handle = NULL; - CURLcode res; - char *compute_ctl_url; bool ret = false; + CURL *handle = NULL; + CURLcode res; + char *compute_ctl_url; - if (handle == NULL) - { - handle = alloc_curl_handle(); + handle = alloc_curl_handle(); - curl_easy_setopt(handle, CURLOPT_CUSTOMREQUEST, "POST"); - curl_easy_setopt(handle, CURLOPT_TIMEOUT, 60L /* seconds */ ); - } + curl_easy_setopt(handle, CURLOPT_CUSTOMREQUEST, "POST"); + if (extension_server_request_timeout > 0) + curl_easy_setopt(handle, CURLOPT_TIMEOUT, (long)extension_server_request_timeout /* seconds */ ); + if (extension_server_connect_timeout > 0) + curl_easy_setopt(handle, CURLOPT_CONNECTTIMEOUT, (long)extension_server_connect_timeout /* seconds */ ); compute_ctl_url = psprintf("http://localhost:%d/extension_server/%s%s", extension_server_port, filename, is_library ? "?is_library=true" : ""); @@ -57,6 +58,8 @@ neon_download_extension_file_http(const char *filename, bool is_library) /* Perform the request, res will get the return code */ res = curl_easy_perform(handle); + curl_easy_cleanup(handle); + /* Check for errors */ if (res == CURLE_OK) { @@ -88,6 +91,24 @@ pg_init_extension_server() 0, /* no flags required */ NULL, NULL, NULL); + DefineCustomIntVariable("neon.extension_server_request_timeout", + "timeout for fetching extensions in seconds", + NULL, + &extension_server_request_timeout, + 60, 0, INT_MAX, + PGC_SUSET, + GUC_UNIT_S, + NULL, NULL, NULL); + + DefineCustomIntVariable("neon.extension_server_connect_timeout", + "timeout for connecting to the extension server in seconds", + NULL, + &extension_server_connect_timeout, + 60, 0, INT_MAX, + PGC_SUSET, + GUC_UNIT_S, + NULL, NULL, NULL); + /* set download_extension_file_hook */ prev_download_extension_file_hook = download_extension_file_hook; download_extension_file_hook = neon_download_extension_file_http; diff --git a/pgxn/neon/hll.c b/pgxn/neon/hll.c index 1f53c8fd36..bbaad09f5f 100644 --- a/pgxn/neon/hll.c +++ b/pgxn/neon/hll.c @@ -122,8 +122,8 @@ addSHLL(HyperLogLogState *cState, uint32 hash) index = hash >> HLL_C_BITS; /* Compute the rank of the remaining 32 - "k" (registerWidth) bits */ - count = rho(hash << HLL_BIT_WIDTH, HLL_C_BITS); - + count = rho(hash << HLL_BIT_WIDTH, HLL_C_BITS) - 1; + Assert(count <= HLL_C_BITS); cState->regs[index][count] = now; } @@ -136,7 +136,7 @@ getMaximum(const TimestampTz* reg, TimestampTz since) { if (reg[i] >= since) { - max = i; + max = i + 1; } } diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 22aeb2e2d6..fc1aecd340 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -378,8 +378,9 @@ pageserver_connect(shardno_t shard_no, int elevel) { case PS_Disconnected: { - const char *keywords[3]; - const char *values[3]; + const char *keywords[4]; + const char *values[4]; + char pid_str[16]; int n_pgsql_params; TimestampTz now; int64
us_since_last_attempt; @@ -424,14 +425,30 @@ pageserver_connect(shardno_t shard_no, int elevel) * can override the password from the env variable. Seems useful, although * we don't currently use that capability anywhere. */ - keywords[0] = "dbname"; - values[0] = connstr; - n_pgsql_params = 1; + n_pgsql_params = 0; + + /* + * Pageserver logs include this in the connection's tracing span. + * This allows for easier log correlation between compute and pageserver. + */ + keywords[n_pgsql_params] = "application_name"; + { + int ret = snprintf(pid_str, sizeof(pid_str), "%d", MyProcPid); + if (ret < 0 || ret >= (int)(sizeof(pid_str))) + elog(FATAL, "stack-allocated buffer too small to hold pid"); + } + /* lifetime: PQconnectStartParams strdups internally */ + values[n_pgsql_params] = (const char*) pid_str; + n_pgsql_params++; + + keywords[n_pgsql_params] = "dbname"; + values[n_pgsql_params] = connstr; + n_pgsql_params++; if (neon_auth_token) { - keywords[1] = "password"; - values[1] = neon_auth_token; + keywords[n_pgsql_params] = "password"; + values[n_pgsql_params] = neon_auth_token; n_pgsql_params++; } diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index f1087a8ccb..6c812f347f 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -474,8 +474,7 @@ readahead_buffer_resize(int newsize, void *extra) */ if (MyPState->n_requests_inflight > newsize) { - Assert(MyPState->ring_unused >= MyPState->n_requests_inflight - newsize); - prefetch_wait_for(MyPState->ring_unused - (MyPState->n_requests_inflight - newsize)); + prefetch_wait_for(MyPState->ring_unused - newsize - 1); Assert(MyPState->n_requests_inflight <= newsize); } diff --git a/poetry.lock b/poetry.lock index fd200159b9..d66c3aae7a 100644 --- a/poetry.lock +++ b/poetry.lock @@ -412,6 +412,7 @@ files = [ [package.dependencies] botocore-stubs = "*" +mypy-boto3-kms = {version = ">=1.26.0,<1.27.0", optional = true, markers = "extra == \"kms\""} mypy-boto3-s3 = {version = ">=1.26.0,<1.27.0", optional = true, markers = "extra == \"s3\""} types-s3transfer = "*" typing-extensions = ">=4.1.0" @@ -2022,6 +2023,18 @@ install-types = ["pip"] mypyc = ["setuptools (>=50)"] reports = ["lxml"] +[[package]] +name = "mypy-boto3-kms" +version = "1.26.147" +description = "Type annotations for boto3.KMS 1.26.147 service generated with mypy-boto3-builder 7.14.5" +optional = false +python-versions = ">=3.7" +groups = ["main"] +files = [ + {file = "mypy-boto3-kms-1.26.147.tar.gz", hash = "sha256:816a4d1bb0585e1b9620a3f96c1d69a06f53b7b5621858579dd77c60dbb5fa5c"}, + {file = "mypy_boto3_kms-1.26.147-py3-none-any.whl", hash = "sha256:493f0db674a25c88769f5cb8ab8ac00d3dda5dfc903d5cda34c990ee64689f79"}, +] + [[package]] name = "mypy-boto3-s3" version = "1.26.0.post1" @@ -2758,18 +2771,18 @@ pytest = ">=5,<8" [[package]] name = "pytest-timeout" -version = "2.1.0" +version = "2.3.1" description = "pytest plugin to abort hanging tests" optional = false -python-versions = ">=3.6" +python-versions = ">=3.7" groups = ["main"] files = [ - {file = "pytest-timeout-2.1.0.tar.gz", hash = "sha256:c07ca07404c612f8abbe22294b23c368e2e5104b521c1790195561f37e1ac3d9"}, - {file = "pytest_timeout-2.1.0-py3-none-any.whl", hash = "sha256:f6f50101443ce70ad325ceb4473c4255e9d74e3c7cd0ef827309dfa4c0d975c6"}, + {file = "pytest-timeout-2.3.1.tar.gz", hash = "sha256:12397729125c6ecbdaca01035b9e5239d4db97352320af155b3f5de1ba5165d9"}, + {file = "pytest_timeout-2.3.1-py3-none-any.whl", hash =
"sha256:68188cb703edfc6a18fad98dc25a3c61e9f24d644b0b70f33af545219fc7813e"}, ] [package.dependencies] -pytest = ">=5.0.0" +pytest = ">=7.0.0" [[package]] name = "pytest-xdist" @@ -3807,4 +3820,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.1" python-versions = "^3.11" -content-hash = "4dc3165fe22c0e0f7a030ea0f8a680ae2ff74561d8658c393abbe9112caaf5d7" +content-hash = "00ddc42c32e235b6171845fc066dcab078282ed832cd464d5e8a0afa959dd04a" diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 3aa6ac3a76..6a381bf094 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -19,7 +19,6 @@ aws-config.workspace = true aws-sdk-iam.workspace = true aws-sigv4.workspace = true base64.workspace = true -boxcar = "0.2.8" bstr.workspace = true bytes = { workspace = true, features = ["serde"] } camino.workspace = true @@ -63,7 +62,6 @@ postgres_backend.workspace = true postgres-client = { package = "tokio-postgres2", path = "../libs/proxy/tokio-postgres2" } postgres-protocol = { package = "postgres-protocol2", path = "../libs/proxy/postgres-protocol2" } pq_proto.workspace = true -prometheus.workspace = true rand.workspace = true regex.workspace = true remote_storage = { version = "0.1", path = "../libs/remote_storage/" } @@ -81,7 +79,6 @@ sha2 = { workspace = true, features = ["asm", "oid"] } smol_str.workspace = true smallvec.workspace = true socket2.workspace = true -strum.workspace = true strum_macros.workspace = true subtle.workspace = true thiserror.workspace = true @@ -95,7 +92,6 @@ tracing-subscriber.workspace = true tracing-utils.workspace = true tracing.workspace = true tracing-log.workspace = true -tracing-serde.workspace = true tracing-opentelemetry.workspace = true try-lock.workspace = true typed-json.workspace = true diff --git a/proxy/src/auth/backend/console_redirect.rs b/proxy/src/auth/backend/console_redirect.rs index 9be29c38c9..7503b4eac9 100644 --- a/proxy/src/auth/backend/console_redirect.rs +++ b/proxy/src/auth/backend/console_redirect.rs @@ -140,9 +140,8 @@ async fn authenticate( let (psql_session_id, waiter) = loop { let psql_session_id = new_psql_session_id(); - match control_plane::mgmt::get_waiter(&psql_session_id) { - Ok(waiter) => break (psql_session_id, waiter), - Err(_e) => continue, + if let Ok(waiter) = control_plane::mgmt::get_waiter(&psql_session_id) { + break (psql_session_id, waiter); } }; diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs index e05a693cee..5d032c0deb 100644 --- a/proxy/src/auth/backend/jwt.rs +++ b/proxy/src/auth/backend/jwt.rs @@ -220,11 +220,11 @@ async fn fetch_jwks( } impl JwkCacheEntryLock { - async fn acquire_permit<'a>(self: &'a Arc) -> JwkRenewalPermit<'a> { + async fn acquire_permit(self: &Arc) -> JwkRenewalPermit<'_> { JwkRenewalPermit::acquire_permit(self).await } - fn try_acquire_permit<'a>(self: &'a Arc) -> Option> { + fn try_acquire_permit(self: &Arc) -> Option> { JwkRenewalPermit::try_acquire_permit(self) } @@ -393,7 +393,7 @@ impl JwkCacheEntryLock { verify_rsa_signature(header_payload.as_bytes(), &sig, key, &header.algorithm)?; } key => return Err(JwtError::UnsupportedKeyType(key.into())), - }; + } tracing::debug!(?payload, "JWT signature valid with claims"); @@ -510,7 +510,7 @@ fn verify_rsa_signature( key.verify(data, &sig)?; } _ => return Err(JwtError::InvalidRsaSigningAlgorithm), - }; + } Ok(()) } diff --git a/proxy/src/binary/local_proxy.rs b/proxy/src/binary/local_proxy.rs index e0d8515375..4ab11f828c 100644 --- a/proxy/src/binary/local_proxy.rs +++ b/proxy/src/binary/local_proxy.rs @@ -4,6 +4,20 @@ 
use std::str::FromStr; use std::sync::Arc; use std::time::Duration; +use anyhow::{bail, ensure, Context}; +use camino::{Utf8Path, Utf8PathBuf}; +use clap::Parser; +use compute_api::spec::LocalProxySpec; +use futures::future::Either; +use thiserror::Error; +use tokio::net::TcpListener; +use tokio::sync::Notify; +use tokio::task::JoinSet; +use tokio_util::sync::CancellationToken; +use tracing::{debug, error, info, warn}; +use utils::sentry_init::init_sentry; +use utils::{pid_file, project_build_tag, project_git_version}; + use crate::auth::backend::jwt::JwkCache; use crate::auth::backend::local::{LocalBackend, JWKS_ROLE_MAP}; use crate::auth::{self}; @@ -25,24 +39,10 @@ use crate::serverless::{self, GlobalConnPoolOptions}; use crate::tls::client_config::compute_client_config_with_root_certs; use crate::types::RoleName; use crate::url::ApiUrl; -use anyhow::{bail, ensure, Context}; -use camino::{Utf8Path, Utf8PathBuf}; -use compute_api::spec::LocalProxySpec; -use futures::future::Either; project_git_version!(GIT_VERSION); project_build_tag!(BUILD_TAG); -use clap::Parser; -use thiserror::Error; -use tokio::net::TcpListener; -use tokio::sync::Notify; -use tokio::task::JoinSet; -use tokio_util::sync::CancellationToken; -use tracing::{debug, error, info, warn}; -use utils::sentry_init::init_sentry; -use utils::{pid_file, project_build_tag, project_git_version}; - /// Neon proxy/router #[derive(Parser)] #[command(version = GIT_VERSION, about)] diff --git a/proxy/src/binary/pg_sni_router.rs b/proxy/src/binary/pg_sni_router.rs index 235e9674c6..94e771a61c 100644 --- a/proxy/src/binary/pg_sni_router.rs +++ b/proxy/src/binary/pg_sni_router.rs @@ -5,12 +5,6 @@ /// the outside. Similar to an ingress controller for HTTPS. use std::{net::SocketAddr, sync::Arc}; -use crate::context::RequestContext; -use crate::metrics::{Metrics, ThreadPoolMetrics}; -use crate::protocol2::ConnectionInfo; -use crate::proxy::{copy_bidirectional_client_compute, run_until_cancelled, ErrorSource}; -use crate::stream::{PqStream, Stream}; -use crate::tls::TlsServerEndPoint; use anyhow::{anyhow, bail, ensure, Context}; use clap::Arg; use futures::future::Either; @@ -25,6 +19,13 @@ use tracing::{error, info, Instrument}; use utils::project_git_version; use utils::sentry_init::init_sentry; +use crate::context::RequestContext; +use crate::metrics::{Metrics, ThreadPoolMetrics}; +use crate::protocol2::ConnectionInfo; +use crate::proxy::{copy_bidirectional_client_compute, run_until_cancelled, ErrorSource}; +use crate::stream::{PqStream, Stream}; +use crate::tls::TlsServerEndPoint; + project_git_version!(GIT_VERSION); fn cli() -> clap::Command { diff --git a/proxy/src/binary/proxy.rs b/proxy/src/binary/proxy.rs index e38c49ca10..b72799df54 100644 --- a/proxy/src/binary/proxy.rs +++ b/proxy/src/binary/proxy.rs @@ -3,6 +3,16 @@ use std::pin::pin; use std::sync::Arc; use std::time::Duration; +use anyhow::bail; +use futures::future::Either; +use remote_storage::RemoteStorageConfig; +use tokio::net::TcpListener; +use tokio::task::JoinSet; +use tokio_util::sync::CancellationToken; +use tracing::{info, warn, Instrument}; +use utils::sentry_init::init_sentry; +use utils::{project_build_tag, project_git_version}; + use crate::auth::backend::jwt::JwkCache; use crate::auth::backend::{AuthRateLimiter, ConsoleRedirectBackend, MaybeOwned}; use crate::cancellation::{handle_cancel_messages, CancellationHandler}; @@ -24,15 +34,6 @@ use crate::serverless::cancel_set::CancelSet; use crate::serverless::GlobalConnPoolOptions; use 
crate::tls::client_config::compute_client_config_with_root_certs; use crate::{auth, control_plane, http, serverless, usage_metrics}; -use anyhow::bail; -use futures::future::Either; -use remote_storage::RemoteStorageConfig; -use tokio::net::TcpListener; -use tokio::task::JoinSet; -use tokio_util::sync::CancellationToken; -use tracing::{info, warn, Instrument}; -use utils::sentry_init::init_sentry; -use utils::{project_build_tag, project_git_version}; project_git_version!(GIT_VERSION); project_build_tag!(BUILD_TAG); @@ -303,7 +304,7 @@ pub async fn run() -> anyhow::Result<()> { match auth_backend { Either::Left(auth_backend) => info!("Authentication backend: {auth_backend}"), Either::Right(auth_backend) => info!("Authentication backend: {auth_backend:?}"), - }; + } info!("Using region: {}", args.aws_region); // TODO: untangle the config args @@ -803,9 +804,10 @@ fn build_auth_backend( mod tests { use std::time::Duration; - use crate::rate_limiter::RateBucketInfo; use clap::Parser; + use crate::rate_limiter::RateBucketInfo; + #[test] fn parse_endpoint_rps_limit() { let config = super::ProxyCliArgs::parse_from([ diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs index b5c42cd23d..8ec1a4648b 100644 --- a/proxy/src/cache/endpoints.rs +++ b/proxy/src/cache/endpoints.rs @@ -242,7 +242,7 @@ impl EndpointsCache { }); tracing::error!("error parsing value {value:?}: {err:?}"); } - }; + } } if total.is_power_of_two() { tracing::debug!("endpoints read {}", total); diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index d71465765f..5447a4a4c0 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -137,8 +137,8 @@ impl ConnCfg { match k { // Only set `user` if it's not present in the config. // Console redirect auth flow takes username from the console's response. 
- "user" if self.user_is_set() => continue, - "database" if self.db_is_set() => continue, + "user" if self.user_is_set() => {} + "database" if self.db_is_set() => {} "options" => { if let Some(options) = filtered_options(v) { self.set_param(k, &options); diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs index c4548a7ddd..1044f5f8e2 100644 --- a/proxy/src/console_redirect_proxy.rs +++ b/proxy/src/console_redirect_proxy.rs @@ -82,7 +82,7 @@ pub async fn task_main( error!("per-client task finished with an error: failed to set socket option: {e:#}"); return; } - }; + } let ctx = RequestContext::new( session_id, diff --git a/proxy/src/control_plane/mod.rs b/proxy/src/control_plane/mod.rs index f92e4f3f60..89ec4f9b33 100644 --- a/proxy/src/control_plane/mod.rs +++ b/proxy/src/control_plane/mod.rs @@ -19,8 +19,7 @@ use crate::cache::{Cached, TimedLru}; use crate::config::ComputeConfig; use crate::context::RequestContext; use crate::control_plane::messages::{ControlPlaneErrorMessage, MetricsAuxInfo}; -use crate::intern::AccountIdInt; -use crate::intern::ProjectIdInt; +use crate::intern::{AccountIdInt, ProjectIdInt}; use crate::types::{EndpointCacheKey, EndpointId}; use crate::{compute, scram}; diff --git a/proxy/src/logging.rs b/proxy/src/logging.rs index 97c9f5a59c..fbd4811b54 100644 --- a/proxy/src/logging.rs +++ b/proxy/src/logging.rs @@ -7,9 +7,8 @@ use chrono::{DateTime, Utc}; use opentelemetry::trace::TraceContextExt; use scopeguard::defer; use serde::ser::{SerializeMap, Serializer}; -use tracing::span; use tracing::subscriber::Interest; -use tracing::{callsite, Event, Metadata, Span, Subscriber}; +use tracing::{callsite, span, Event, Metadata, Span, Subscriber}; use tracing_opentelemetry::OpenTelemetrySpanExt; use tracing_subscriber::filter::{EnvFilter, LevelFilter}; use tracing_subscriber::fmt::format::{Format, Full}; diff --git a/proxy/src/protocol2.rs b/proxy/src/protocol2.rs index 0dc97b7097..74a15d9bf4 100644 --- a/proxy/src/protocol2.rs +++ b/proxy/src/protocol2.rs @@ -119,7 +119,7 @@ pub(crate) async fn read_proxy_protocol( // if no more bytes available then exit if bytes_read == 0 { return Ok((ChainRW { inner: read, buf }, ConnectHeader::Missing)); - }; + } // check if we have enough bytes to continue if let Some(header) = buf.try_get::() { @@ -169,7 +169,7 @@ fn process_proxy_payload( header.version_and_command ), )), - }; + } let size_err = "invalid proxy protocol length. 
payload not large enough to fit requested IP addresses"; diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index dd145e6bb2..26fb1754bf 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -198,7 +198,7 @@ where warn!(error = ?e, num_retries, retriable = true, COULD_NOT_CONNECT); } - }; + } let wait_duration = retry_after(num_retries, compute.retry); num_retries += 1; diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index 8a407c8119..2a406fcb34 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -118,7 +118,7 @@ pub async fn task_main( error!("per-client task finished with an error: failed to set socket option: {e:#}"); return; } - }; + } let ctx = RequestContext::new( session_id, diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index 1a7024588a..5f9f2509e2 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -169,7 +169,7 @@ impl MessageHandler { }); tracing::error!("broken message: {e}"); } - }; + } return Ok(()); } Ok(msg) => msg, @@ -180,7 +180,7 @@ impl MessageHandler { match serde_json::from_str::(&payload) { Ok(header) => tracing::error!(topic = header.topic, "broken message: {e}"), Err(_) => tracing::error!("broken message: {e}"), - }; + } return Ok(()); } }; diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index edc2935618..f35c375ba2 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -372,7 +372,7 @@ impl PoolingBackend { debug!("setting up backend session state"); // initiates the auth session - if let Err(e) = client.execute("select auth.init()", &[]).await { + if let Err(e) = client.batch_execute("select auth.init();").await { discard.discard(); return Err(e.into()); } @@ -651,7 +651,7 @@ async fn connect_http2( e, ))); } - }; + } }; let (client, connection) = hyper::client::conn::http2::Builder::new(TokioExecutor::new()) diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs index fe33f0ff65..137a2d6377 100644 --- a/proxy/src/serverless/local_conn_pool.rs +++ b/proxy/src/serverless/local_conn_pool.rs @@ -23,7 +23,6 @@ use indexmap::IndexMap; use jose_jwk::jose_b64::base64ct::{Base64UrlUnpadded, Encoding}; use parking_lot::RwLock; use postgres_client::tls::NoTlsStream; -use postgres_client::types::ToSql; use postgres_client::AsyncMessage; use serde_json::value::RawValue; use tokio::net::TcpStream; @@ -280,14 +279,13 @@ impl ClientInnerCommon { local_data.jti += 1; let token = resign_jwt(&local_data.key, payload, local_data.jti)?; - // initiates the auth session + // discard all cannot run in a transaction. must be executed alone. self.inner.batch_execute("discard all").await?; - self.inner - .execute( - "select auth.jwt_session_init($1)", - &[&&*token as &(dyn ToSql + Sync)], - ) - .await?; + + // initiates the auth session + // this is safe from query injections as the jwt format is free of any escape characters.
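The switches from `execute` to `batch_execute` above are not cosmetic: in tokio-postgres-style clients, `execute` uses the extended query protocol (prepare + bind), while `batch_execute` sends the statement text via the simple query protocol, which is what the diff's comment says `discard all` needs. Interpolating the JWT is acceptable only because its dot-separated base64url alphabet contains no quoting or escape characters. A rough sketch of the same flow, assuming `tokio` and the `tokio-postgres` crate; the connection string and the `auth.jwt_session_init` function mirror the diff rather than a confirmed schema:

```rust
use tokio_postgres::{Client, Error, NoTls};

async fn init_session(client: &Client, token: &str) -> Result<(), Error> {
    // Simple query protocol: no prepared statement, the statement runs alone,
    // which is what `DISCARD ALL` requires.
    client.batch_execute("DISCARD ALL").await?;

    // Interpolation is tolerable here only because a JWT is dot-separated
    // base64url: the alphabet has no quotes or backslashes to escape.
    let query = format!("select auth.jwt_session_init('{token}')");
    client.batch_execute(&query).await
}

#[tokio::main]
async fn main() -> Result<(), Error> {
    let (client, conn) = tokio_postgres::connect("host=localhost user=postgres", NoTls).await?;
    tokio::spawn(conn); // drive the connection in the background
    init_session(&client, "eyJhbGciOiJFUzI1NiJ9.e30.sig").await
}
```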
+ let query = format!("select auth.jwt_session_init('{token}')"); + self.inner.batch_execute(&query).await?; let pid = self.inner.get_process_id(); info!(pid, jti = local_data.jti, "user session state init"); diff --git a/pyproject.toml b/pyproject.toml index e299c421e9..92a660c233 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,12 +17,12 @@ Jinja2 = "^3.1.5" types-requests = "^2.31.0.0" types-psycopg2 = "^2.9.21.20241019" boto3 = "^1.34.11" -boto3-stubs = {extras = ["s3"], version = "^1.26.16"} +boto3-stubs = {extras = ["s3", "kms"], version = "^1.26.16"} moto = {extras = ["server"], version = "^5.0.6"} backoff = "^2.2.1" pytest-lazy-fixture = "^0.6.3" prometheus-client = "^0.14.1" -pytest-timeout = "^2.1.0" +pytest-timeout = "^2.3.1" Werkzeug = "^3.0.6" pytest-order = "^1.1.0" allure-pytest = "^2.13.2" diff --git a/pytest.ini b/pytest.ini index 7197b078c6..237066b1f6 100644 --- a/pytest.ini +++ b/pytest.ini @@ -11,7 +11,7 @@ markers = testpaths = test_runner minversion = 6.0 -log_format = %(asctime)s.%(msecs)-3d %(levelname)s [%(filename)s:%(lineno)d] %(message)s +log_format = %(asctime)s.%(msecs)03d %(levelname)s [%(filename)s:%(lineno)d] %(message)s log_date_format = %Y-%m-%d %H:%M:%S log_cli = true timeout = 300 diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 38a7f202ba..591d60ea79 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,5 +1,5 @@ [toolchain] -channel = "1.84.1" +channel = "1.85.0" profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. # https://rust-lang.github.io/rustup/concepts/profiles.html diff --git a/safekeeper/client/src/mgmt_api.rs b/safekeeper/client/src/mgmt_api.rs index d4f47fc96d..5c305769dd 100644 --- a/safekeeper/client/src/mgmt_api.rs +++ b/safekeeper/client/src/mgmt_api.rs @@ -5,7 +5,10 @@ use http_utils::error::HttpErrorBody; use reqwest::{IntoUrl, Method, StatusCode}; -use safekeeper_api::models::{SafekeeperUtilization, TimelineCreateRequest, TimelineStatus}; +use safekeeper_api::models::{ + PullTimelineRequest, PullTimelineResponse, SafekeeperUtilization, TimelineCreateRequest, + TimelineStatus, +}; use std::error::Error as _; use utils::{ id::{NodeId, TenantId, TimelineId}, @@ -88,6 +91,12 @@ impl Client { resp.json().await.map_err(Error::ReceiveBody) } + pub async fn pull_timeline(&self, req: &PullTimelineRequest) -> Result { + let uri = format!("{}/v1/pull_timeline", self.mgmt_api_endpoint); + let resp = self.post(&uri, req).await?; + resp.json().await.map_err(Error::ReceiveBody) + } + pub async fn delete_timeline( &self, tenant_id: TenantId, @@ -128,7 +137,7 @@ impl Client { } pub async fn utilization(&self) -> Result { - let uri = format!("{}/v1/utilization/", self.mgmt_api_endpoint); + let uri = format!("{}/v1/utilization", self.mgmt_api_endpoint); let resp = self.get(&uri).await?; resp.json().await.map_err(Error::ReceiveBody) } diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs index e92ca881e1..35aebfd8ad 100644 --- a/safekeeper/src/control_file.rs +++ b/safekeeper/src/control_file.rs @@ -235,7 +235,7 @@ impl Storage for FileStorage { #[cfg(test)] mod test { use super::*; - use safekeeper_api::membership::{Configuration, MemberSet}; + use safekeeper_api::membership::{Configuration, MemberSet, SafekeeperGeneration}; use tokio::fs; use utils::lsn::Lsn; @@ -246,7 +246,7 @@ mod test { let tempdir = camino_tempfile::tempdir()?; let mut state = TimelinePersistentState::empty(); state.mconf = Configuration { - generation: 42, + generation: 
SafekeeperGeneration::new(42), members: MemberSet::empty(), new_members: None, }; diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 41e30d838a..cd2ac5f44c 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -2,6 +2,7 @@ use http_utils::failpoints::failpoints_handler; use hyper::{Body, Request, Response, StatusCode}; use safekeeper_api::models; use safekeeper_api::models::AcceptorStateStatus; +use safekeeper_api::models::PullTimelineRequest; use safekeeper_api::models::SafekeeperStatus; use safekeeper_api::models::TermSwitchApiEntry; use safekeeper_api::models::TimelineStatus; @@ -230,7 +231,7 @@ async fn timeline_delete_handler(mut request: Request) -> Result) -> Result, ApiError> { check_permission(&request, None)?; - let data: pull_timeline::Request = json_request(&mut request).await?; + let data: PullTimelineRequest = json_request(&mut request).await?; let conf = get_conf(&request); let global_timelines = get_global_timelines(&request); diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs index f2d8e4c85f..4827b73074 100644 --- a/safekeeper/src/pull_timeline.rs +++ b/safekeeper/src/pull_timeline.rs @@ -4,10 +4,13 @@ use camino::Utf8PathBuf; use chrono::{DateTime, Utc}; use futures::{SinkExt, StreamExt, TryStreamExt}; use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI}; -use safekeeper_api::{models::TimelineStatus, Term}; +use safekeeper_api::{ + models::{PullTimelineRequest, PullTimelineResponse, TimelineStatus}, + Term, +}; use safekeeper_client::mgmt_api; use safekeeper_client::mgmt_api::Client; -use serde::{Deserialize, Serialize}; +use serde::Deserialize; use std::{ cmp::min, io::{self, ErrorKind}, @@ -33,7 +36,7 @@ use crate::{ }; use utils::{ crashsafe::fsync_async_opt, - id::{NodeId, TenantId, TenantTimelineId, TimelineId}, + id::{NodeId, TenantTimelineId}, logging::SecretString, lsn::Lsn, pausable_failpoint, @@ -378,21 +381,6 @@ impl WalResidentTimeline { } } -/// pull_timeline request body. -#[derive(Debug, Deserialize)] -pub struct Request { - pub tenant_id: TenantId, - pub timeline_id: TimelineId, - pub http_hosts: Vec, -} - -#[derive(Debug, Serialize)] -pub struct Response { - // Donor safekeeper host - pub safekeeper_host: String, - // TODO: add more fields? -} - /// Response for debug dump request. #[derive(Debug, Deserialize)] pub struct DebugDumpResponse { @@ -405,10 +393,10 @@ pub struct DebugDumpResponse { /// Find the most advanced safekeeper and pull timeline from it. 
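Moving the `pull_timeline` request/response types out of the safekeeper binary into `safekeeper_api::models` lets the HTTP handler and the management client name one shared wire type, so the JSON contract cannot drift between them. A simplified sketch of that pattern, assuming `serde`/`serde_json`, with plain strings standing in for the real `TenantId`/`TimelineId` newtypes (the real models may derive only the serialization direction each side needs):

```rust
use serde::{Deserialize, Serialize};

#[derive(Serialize, Deserialize)]
pub struct PullTimelineRequest {
    pub tenant_id: String,   // `TenantId` in the real API; a string here for brevity
    pub timeline_id: String, // `TimelineId` in the real API
    pub http_hosts: Vec<String>,
}

#[derive(Serialize, Deserialize)]
pub struct PullTimelineResponse {
    pub safekeeper_host: String, // donor safekeeper host
}

fn main() -> Result<(), serde_json::Error> {
    let req = PullTimelineRequest {
        tenant_id: "t1".into(),
        timeline_id: "tl1".into(),
        http_hosts: vec!["http://sk-1:7676".into()],
    };
    // The client serializes the request...
    let body = serde_json::to_string(&req)?;
    // ...and the server deserializes the very same type.
    let parsed: PullTimelineRequest = serde_json::from_str(&body)?;
    assert_eq!(parsed.http_hosts.len(), 1);
    Ok(())
}
```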
pub async fn handle_request( - request: Request, + request: PullTimelineRequest, sk_auth_token: Option, global_timelines: Arc, -) -> Result { +) -> Result { let existing_tli = global_timelines.get(TenantTimelineId::new( request.tenant_id, request.timeline_id, @@ -460,7 +448,7 @@ async fn pull_timeline( host: String, sk_auth_token: Option, global_timelines: Arc, -) -> Result { +) -> Result { let ttid = TenantTimelineId::new(status.tenant_id, status.timeline_id); info!( "pulling timeline {} from safekeeper {}, commit_lsn={}, flush_lsn={}, term={}, epoch={}", @@ -535,7 +523,7 @@ async fn pull_timeline( .load_temp_timeline(ttid, &tli_dir_path, false) .await?; - Ok(Response { + Ok(PullTimelineResponse { safekeeper_host: host, }) } diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 45e19c31b6..f816f8459a 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -1004,7 +1004,7 @@ mod tests { use postgres_ffi::{XLogSegNo, WAL_SEGMENT_SIZE}; use safekeeper_api::{ - membership::{Configuration, MemberSet, SafekeeperId}, + membership::{Configuration, MemberSet, SafekeeperGeneration, SafekeeperId}, ServerInfo, }; @@ -1303,7 +1303,7 @@ mod tests { tenant_id, timeline_id, mconf: Configuration { - generation: 42, + generation: SafekeeperGeneration::new(42), members: MemberSet::new(vec![SafekeeperId { id: NodeId(1), host: "hehe.org".to_owned(), diff --git a/safekeeper/src/send_interpreted_wal.rs b/safekeeper/src/send_interpreted_wal.rs index 5916675c3f..fb06339604 100644 --- a/safekeeper/src/send_interpreted_wal.rs +++ b/safekeeper/src/send_interpreted_wal.rs @@ -295,6 +295,10 @@ impl InterpretedWalReader { let mut wal_decoder = WalStreamDecoder::new(start_pos, self.pg_version); + // Tracks the start of the PG WAL LSN from which the current batch of + // interpreted records originated. + let mut current_batch_wal_start_lsn: Option = None; + loop { tokio::select! { // Main branch for reading WAL and forwarding it @@ -302,7 +306,7 @@ impl InterpretedWalReader { let wal = wal_or_reset.map(|wor| wor.get_wal().expect("reset handled in select branch below")); let WalBytes { wal, - wal_start_lsn: _, + wal_start_lsn, wal_end_lsn, available_wal_end_lsn, } = match wal { @@ -315,6 +319,12 @@ impl InterpretedWalReader { } }; + // We will already have a value if the previous chunks of WAL + // did not decode into anything useful. + if current_batch_wal_start_lsn.is_none() { + current_batch_wal_start_lsn = Some(wal_start_lsn); + } + wal_decoder.feed_bytes(&wal); // Deserialize and interpret WAL records from this batch of WAL. @@ -363,7 +373,9 @@ impl InterpretedWalReader { let max_next_record_lsn = match max_next_record_lsn { Some(lsn) => lsn, - None => { continue; } + None => { + continue; + } }; // Update the current position such that new receivers can decide @@ -377,21 +389,38 @@ impl InterpretedWalReader { } } + let batch_wal_start_lsn = current_batch_wal_start_lsn.take().unwrap(); + // Send interpreted records downstream. Anything that has already been seen // by a shard is filtered out. 
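The new `current_batch_wal_start_lsn` bookkeeping remembers the start LSN of the oldest WAL chunk that has not yet produced an emitted batch, then clears it with `take()` once a batch goes out. A toy version of that carry-over, with plain `u64`s standing in for `Lsn` and hypothetical record names:

```rust
fn main() {
    let chunks: &[(u64, &[&str])] = &[
        (100, &[]),        // decodes into nothing useful yet
        (180, &["rec-a"]), // first records arrive here
        (250, &["rec-b"]),
    ];

    let mut current_batch_wal_start: Option<u64> = None;
    for (wal_start, records) in chunks {
        // Keep the earliest start seen since the last emitted batch.
        if current_batch_wal_start.is_none() {
            current_batch_wal_start = Some(*wal_start);
        }
        if records.is_empty() {
            continue; // nothing decoded yet; the saved start LSN carries over
        }
        // `take()` clears the slot so the next batch records its own start.
        let batch_start = current_batch_wal_start.take().unwrap();
        println!("batch starting at WAL LSN {batch_start}: {records:?}");
    }
    // Prints a batch at LSN 100 (carried over from the empty chunk), then 250.
}
```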
let mut shard_senders_to_remove = Vec::new(); for (shard, states) in &mut self.shard_senders { for state in states { - if max_next_record_lsn <= state.next_record_lsn { - continue; - } - let shard_sender_id = ShardSenderId::new(*shard, state.sender_id); - let records = records_by_sender.remove(&shard_sender_id).unwrap_or_default(); - let batch = InterpretedWalRecords { - records, - next_record_lsn: Some(max_next_record_lsn), + let batch = if max_next_record_lsn > state.next_record_lsn { + // This batch contains at least one record that this shard has not + // seen yet. + let records = records_by_sender.remove(&shard_sender_id).unwrap_or_default(); + + InterpretedWalRecords { + records, + next_record_lsn: max_next_record_lsn, + raw_wal_start_lsn: Some(batch_wal_start_lsn), + } + } else if wal_end_lsn > state.next_record_lsn { + // All the records in this batch were seen by the shard + // However, the batch maps to a chunk of WAL that the + // shard has not yet seen. Notify it of the start LSN + // of the PG WAL chunk such that it doesn't look like a gap. + InterpretedWalRecords { + records: Vec::default(), + next_record_lsn: state.next_record_lsn, + raw_wal_start_lsn: Some(batch_wal_start_lsn), + } + } else { + // The shard has seen this chunk of WAL before. Skip it. + continue; }; let res = state.tx.send(Batch { @@ -403,7 +432,7 @@ impl InterpretedWalReader { if res.is_err() { shard_senders_to_remove.push(shard_sender_id); } else { - state.next_record_lsn = max_next_record_lsn; + state.next_record_lsn = std::cmp::max(state.next_record_lsn, max_next_record_lsn); } } } diff --git a/safekeeper/src/wal_service.rs b/safekeeper/src/wal_service.rs index 1ebcb060e7..e5ccbb3230 100644 --- a/safekeeper/src/wal_service.rs +++ b/safekeeper/src/wal_service.rs @@ -13,6 +13,8 @@ use tokio_util::sync::CancellationToken; use tracing::*; use utils::{auth::Scope, measured_stream::MeasuredStream}; +use std::os::fd::AsRawFd; + use crate::metrics::TrafficMetrics; use crate::SafeKeeperConf; use crate::{handler::SafekeeperPostgresHandler, GlobalTimelines}; @@ -62,6 +64,7 @@ async fn handle_socket( global_timelines: Arc, ) -> Result<(), QueryError> { socket.set_nodelay(true)?; + let socket_fd = socket.as_raw_fd(); let peer_addr = socket.peer_addr()?; // Set timeout on reading from the socket. It prevents hanged up connection @@ -107,7 +110,7 @@ async fn handle_socket( auth_pair, global_timelines, ); - let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?; + let pgbackend = PostgresBackend::new_from_io(socket_fd, socket, peer_addr, auth_type, None)?; // libpq protocol between safekeeper and walproposer / pageserver // We don't use shutdown. 
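The rewritten send loop makes a three-way choice per shard sender: deliver real records, deliver an empty batch that only advances the raw-WAL watermark so the chunk does not look like a gap, or skip entirely; on a successful send `next_record_lsn` is advanced with `max`, so an empty keep-alive can never move a shard backwards. A condensed sketch of that decision with hypothetical LSN values:

```rust
// Plain u64s stand in for Lsn; names mirror the diff, the rest is simplified.
enum Delivery {
    Records { next_record_lsn: u64 },        // shard is behind: send the records
    EmptyAdvance { raw_wal_start_lsn: u64 }, // shard saw the records, not this WAL chunk
    Skip,                                    // shard already saw this chunk
}

fn decide(shard_next: u64, max_next_record_lsn: u64, wal_end_lsn: u64, batch_wal_start: u64) -> Delivery {
    if max_next_record_lsn > shard_next {
        Delivery::Records { next_record_lsn: max_next_record_lsn }
    } else if wal_end_lsn > shard_next {
        Delivery::EmptyAdvance { raw_wal_start_lsn: batch_wal_start }
    } else {
        Delivery::Skip
    }
}

fn main() {
    // Batch: records up to LSN 120, raw WAL spanning 100..130.
    for shard_next in [90, 120, 200] {
        match decide(shard_next, 120, 130, 100) {
            Delivery::Records { next_record_lsn } => println!("{shard_next}: records, advance to {next_record_lsn}"),
            Delivery::EmptyAdvance { raw_wal_start_lsn } => println!("{shard_next}: empty batch, raw WAL starts at {raw_wal_start_lsn}"),
            Delivery::Skip => println!("{shard_next}: skip"),
        }
    }
}
```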
pgbackend diff --git a/scripts/generate_image_maps.py b/scripts/generate_image_maps.py index a2f553d290..915eb33673 100644 --- a/scripts/generate_image_maps.py +++ b/scripts/generate_image_maps.py @@ -6,6 +6,9 @@ build_tag = os.environ["BUILD_TAG"] branch = os.environ["BRANCH"] dev_acr = os.environ["DEV_ACR"] prod_acr = os.environ["PROD_ACR"] +dev_aws = os.environ["DEV_AWS"] +prod_aws = os.environ["PROD_AWS"] +aws_region = os.environ["AWS_REGION"] components = { "neon": ["neon"], @@ -24,11 +27,11 @@ components = { registries = { "dev": [ "docker.io/neondatabase", - "369495373322.dkr.ecr.eu-central-1.amazonaws.com", + f"{dev_aws}.dkr.ecr.{aws_region}.amazonaws.com", f"{dev_acr}.azurecr.io/neondatabase", ], "prod": [ - "093970136003.dkr.ecr.eu-central-1.amazonaws.com", + f"{prod_aws}.dkr.ecr.{aws_region}.amazonaws.com", f"{prod_acr}.azurecr.io/neondatabase", ], } diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml index a93bbdeaaf..08c80bc141 100644 --- a/storage_controller/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -24,6 +24,7 @@ hex.workspace = true hyper0.workspace = true humantime.workspace = true itertools.workspace = true +json-structural-diff.workspace = true lasso.workspace = true once_cell.workspace = true pageserver_api.workspace = true @@ -34,6 +35,7 @@ reqwest = { workspace = true, features = ["stream"] } routerify.workspace = true safekeeper_api.workspace = true safekeeper_client.workspace = true +tikv-jemallocator.workspace = true regex.workspace = true rustls-native-certs.workspace = true serde.workspace = true diff --git a/storage_controller/migrations/2025-02-11-144848_pageserver_use_https/down.sql b/storage_controller/migrations/2025-02-11-144848_pageserver_use_https/down.sql new file mode 100644 index 0000000000..0f051d3ac3 --- /dev/null +++ b/storage_controller/migrations/2025-02-11-144848_pageserver_use_https/down.sql @@ -0,0 +1 @@ +ALTER TABLE nodes DROP listen_https_port; diff --git a/storage_controller/migrations/2025-02-11-144848_pageserver_use_https/up.sql b/storage_controller/migrations/2025-02-11-144848_pageserver_use_https/up.sql new file mode 100644 index 0000000000..172237d477 --- /dev/null +++ b/storage_controller/migrations/2025-02-11-144848_pageserver_use_https/up.sql @@ -0,0 +1 @@ +ALTER TABLE nodes ADD listen_https_port INTEGER; diff --git a/storage_controller/src/heartbeater.rs b/storage_controller/src/heartbeater.rs index 6f110d3294..52b6110667 100644 --- a/storage_controller/src/heartbeater.rs +++ b/storage_controller/src/heartbeater.rs @@ -10,7 +10,10 @@ use std::{ }; use tokio_util::sync::CancellationToken; -use pageserver_api::{controller_api::NodeAvailability, models::PageserverUtilization}; +use pageserver_api::{ + controller_api::{NodeAvailability, SkSchedulingPolicy}, + models::PageserverUtilization, +}; use thiserror::Error; use utils::{id::NodeId, logging::SecretString}; @@ -137,8 +140,13 @@ where request = self.receiver.recv() => { match request { Some(req) => { + if req.reply.is_closed() { + // Prevent a possibly infinite buildup of the receiver channel if requests arrive faster than we can handle them + continue; + } let res = self.heartbeat(req.servers).await; - req.reply.send(res).unwrap(); + // Ignore the return value in order to not panic if the heartbeat function's future was cancelled + _ = req.reply.send(res); }, None => { return; } } @@ -311,6 +319,9 @@ impl HeartBeat for HeartbeaterTask for HeartbeaterTask SafekeeperState::Offline, + Err(e) => { + tracing::info!( + "Marking safekeeper {} as offline:
{e}", + sk.base_url() + ); + SafekeeperState::Offline + } }; Some((*node_id, status)) diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index e3e35a6303..1cc61a12e8 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -9,7 +9,10 @@ use crate::service::{LeadershipStatus, Service, RECONCILE_TIMEOUT, STARTUP_RECON use anyhow::Context; use futures::Future; use http_utils::{ - endpoint::{self, auth_middleware, check_permission_with, request_span}, + endpoint::{ + self, auth_middleware, check_permission_with, profile_cpu_handler, profile_heap_handler, + request_span, + }, error::ApiError, failpoints::failpoints_handler, json::{json_request, json_response}, @@ -54,7 +57,7 @@ pub struct HttpState { service: Arc, auth: Option>, neon_metrics: NeonMetrics, - allowlist_routes: Vec, + allowlist_routes: &'static [&'static str], } impl HttpState { @@ -63,15 +66,17 @@ impl HttpState { auth: Option>, build_info: BuildInfo, ) -> Self { - let allowlist_routes = ["/status", "/ready", "/metrics"] - .iter() - .map(|v| v.parse().unwrap()) - .collect::>(); Self { service, auth, neon_metrics: NeonMetrics::new(build_info), - allowlist_routes, + allowlist_routes: &[ + "/status", + "/ready", + "/metrics", + "/profile/cpu", + "/profile/heap", + ], } } } @@ -516,6 +521,24 @@ async fn handle_tenant_timeline_block_unblock_gc( json_response(StatusCode::OK, ()) } +async fn handle_tenant_timeline_download_heatmap_layers( + service: Arc, + req: Request, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?; + + check_permissions(&req, Scope::PageServerApi)?; + + let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; + let concurrency: Option = parse_query_param(&req, "concurrency")?; + + service + .tenant_timeline_download_heatmap_layers(tenant_shard_id, timeline_id, concurrency) + .await?; + + json_response(StatusCode::OK, ()) +} + // For metric labels where we would like to include the approximate path, but exclude high-cardinality fields like query parameters // and tenant/timeline IDs. Since we are proxying to arbitrary paths, we don't have routing templates to // compare to, so we can just filter out our well known ID format with regexes. @@ -1398,23 +1421,26 @@ pub fn prologue_leadership_status_check_middleware< let state = get_state(&req); let leadership_status = state.service.get_leadership_status(); - enum AllowedRoutes<'a> { + enum AllowedRoutes { All, - Some(Vec<&'a str>), + Some(&'static [&'static str]), } let allowed_routes = match leadership_status { LeadershipStatus::Leader => AllowedRoutes::All, LeadershipStatus::SteppedDown => AllowedRoutes::All, - LeadershipStatus::Candidate => { - AllowedRoutes::Some(["/ready", "/status", "/metrics"].to_vec()) - } + LeadershipStatus::Candidate => AllowedRoutes::Some(&[ + "/ready", + "/status", + "/metrics", + "/profile/cpu", + "/profile/heap", + ]), }; - let uri = req.uri().to_string(); match allowed_routes { AllowedRoutes::All => Ok(req), - AllowedRoutes::Some(allowed) if allowed.contains(&uri.as_str()) => Ok(req), + AllowedRoutes::Some(allowed) if allowed.contains(&req.uri().path()) => Ok(req), _ => { tracing::info!( "Request {} not allowed due to current leadership state", @@ -1523,7 +1549,8 @@ enum ForwardOutcome { /// Potentially forward the request to the current storage controler leader. /// More specifically we forward when: -/// 1. Request is not one of ["/control/v1/step_down", "/status", "/ready", "/metrics"] +/// 1. 
Request is not one of: +/// ["/control/v1/step_down", "/status", "/ready", "/metrics", "/profile/cpu", "/profile/heap"] /// 2. Current instance is in [`LeadershipStatus::SteppedDown`] state /// 3. There is a leader in the database to forward to /// 4. Leader from step (3) is not the current instance @@ -1544,10 +1571,17 @@ enum ForwardOutcome { /// Hence, if we are in the edge case scenario the leader persisted in the database is the /// stepped down instance that received the request. Condition (4) above covers this scenario. async fn maybe_forward(req: Request) -> ForwardOutcome { - const NOT_FOR_FORWARD: [&str; 4] = ["/control/v1/step_down", "/status", "/ready", "/metrics"]; + const NOT_FOR_FORWARD: &[&str] = &[ + "/control/v1/step_down", + "/status", + "/ready", + "/metrics", + "/profile/cpu", + "/profile/heap", + ]; - let uri = req.uri().to_string(); - let uri_for_forward = !NOT_FOR_FORWARD.contains(&uri.as_str()); + let uri = req.uri(); + let uri_for_forward = !NOT_FOR_FORWARD.contains(&uri.path()); // Fast return before trying to take any Service locks, if we will never forward anyway if !uri_for_forward { @@ -1747,7 +1781,7 @@ pub fn make_router( if auth.is_some() { router = router.middleware(auth_middleware(|request| { let state = get_state(request); - if state.allowlist_routes.contains(request.uri()) { + if state.allowlist_routes.contains(&request.uri().path()) { None } else { state.auth.as_deref() @@ -1760,13 +1794,19 @@ pub fn make_router( .get("/metrics", |r| { named_request_span(r, measured_metrics_handler, RequestName("metrics")) }) - // Non-prefixed generic endpoints (status, metrics) + // Non-prefixed generic endpoints (status, metrics, profiling) .get("/status", |r| { named_request_span(r, handle_status, RequestName("status")) }) .get("/ready", |r| { named_request_span(r, handle_ready, RequestName("ready")) }) + .get("/profile/cpu", |r| { + named_request_span(r, profile_cpu_handler, RequestName("profile_cpu")) + }) + .get("/profile/heap", |r| { + named_request_span(r, profile_heap_handler, RequestName("profile_heap")) + }) // Upcalls for the pageserver: point the pageserver's `control_plane_api` config to this prefix .post("/upcall/v1/re-attach", |r| { named_request_span(r, handle_re_attach, RequestName("upcall_v1_reattach")) @@ -2078,6 +2118,16 @@ pub fn make_router( ) }, ) + .post( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_heatmap_layers", + |r| { + tenant_service_handler( + r, + handle_tenant_timeline_download_heatmap_layers, + RequestName("v1_tenant_timeline_download_heatmap_layers"), + ) + }, + ) // Tenant detail GET passthrough to shard zero: .get("/v1/tenant/:tenant_id", |r| { tenant_service_handler( diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index ea6bc38e89..be074d269d 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -27,6 +27,16 @@ use utils::{project_build_tag, project_git_version, tcp_listener}; project_git_version!(GIT_VERSION); project_build_tag!(BUILD_TAG); +#[global_allocator] +static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; + +/// Configure jemalloc to profile heap allocations by sampling stack traces every 2 MB (1 << 21). +/// This adds roughly 3% overhead for allocations on average, which is acceptable considering +/// performance-sensitive code will avoid allocations as far as possible anyway. 
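`lg_prof_sample:21` asks jemalloc to sample one allocation stack per 2^21 bytes (2 MiB) allocated on average, which is consistent with the diff's roughly 3% overhead estimate. The same setup works in any Rust binary; a standalone sketch assuming the `tikv-jemallocator` crate:

```rust
use tikv_jemallocator::Jemalloc;

#[global_allocator]
static GLOBAL: Jemalloc = Jemalloc;

// jemalloc reads this exported symbol at startup; note the trailing NUL.
#[allow(non_upper_case_globals)]
#[export_name = "malloc_conf"]
pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0";

fn main() {
    // Allocate enough to make sampling likely (~16 MiB against 2 MiB samples).
    let v: Vec<u8> = vec![0; 16 << 20];
    println!("allocated {} bytes with heap profiling enabled", v.len());
}
```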
+#[allow(non_upper_case_globals)] +#[export_name = "malloc_conf"] +pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0"; + #[derive(Parser)] #[command(author, version, about, long_about = None)] #[command(arg_required_else_help(true))] @@ -116,6 +126,10 @@ struct Cli { #[arg(long)] long_reconcile_threshold: Option, + + // Flag to use https for requests to pageserver API. + #[arg(long, default_value = "false")] + use_https_pageserver_api: bool, } enum StrictMode { @@ -311,6 +325,7 @@ async fn async_main() -> anyhow::Result<()> { address_for_peers: args.address_for_peers, start_as_candidate: args.start_as_candidate, http_service_port: args.listen.port() as i32, + use_https_pageserver_api: args.use_https_pageserver_api, }; // Validate that we can connect to the database diff --git a/storage_controller/src/node.rs b/storage_controller/src/node.rs index f5c2d329e0..3762d13c10 100644 --- a/storage_controller/src/node.rs +++ b/storage_controller/src/node.rs @@ -1,5 +1,6 @@ use std::{str::FromStr, time::Duration}; +use anyhow::anyhow; use pageserver_api::{ controller_api::{ AvailabilityZone, NodeAvailability, NodeDescribeResponse, NodeRegisterRequest, @@ -32,12 +33,16 @@ pub(crate) struct Node { listen_http_addr: String, listen_http_port: u16, + listen_https_port: Option, listen_pg_addr: String, listen_pg_port: u16, availability_zone_id: AvailabilityZone, + // Flag from storcon's config to use https for pageserver admin API. + // Invariant: if |true|, listen_https_port should contain a value. + use_https: bool, // This cancellation token means "stop any RPCs in flight to this node, and don't start // any more". It is not related to process shutdown. #[serde(skip)] @@ -56,7 +61,16 @@ pub(crate) enum AvailabilityTransition { impl Node { pub(crate) fn base_url(&self) -> String { - format!("http://{}:{}", self.listen_http_addr, self.listen_http_port) + if self.use_https { + format!( + "https://{}:{}", + self.listen_http_addr, + self.listen_https_port + .expect("https port should be specified if use_https is on") + ) + } else { + format!("http://{}:{}", self.listen_http_addr, self.listen_http_port) + } } pub(crate) fn get_id(&self) -> NodeId { @@ -82,11 +96,20 @@ impl Node { self.id == register_req.node_id && self.listen_http_addr == register_req.listen_http_addr && self.listen_http_port == register_req.listen_http_port + // Note: listen_https_port may change. See [`Self::need_update`] for more details. + // && self.listen_https_port == register_req.listen_https_port && self.listen_pg_addr == register_req.listen_pg_addr && self.listen_pg_port == register_req.listen_pg_port && self.availability_zone_id == register_req.availability_zone_id } + // Do we need to update an existing record in DB on this registration request? + pub(crate) fn need_update(&self, register_req: &NodeRegisterRequest) -> bool { + // listen_https_port is checked here because it may change during migration to https. + // After migration, this check may be moved to registration_match. + self.listen_https_port != register_req.listen_https_port + } + /// For a shard located on this node, populate a response object /// with this node's address information.
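`base_url` can only `expect()` the https port because `Node::new` (and, further below, `from_persistent`) rejects the `use_https`-without-port combination up front. That check-once-at-construction shape, reduced to a self-contained sketch with hypothetical field names:

```rust
struct Node {
    host: String,
    http_port: u16,
    https_port: Option<u16>,
    use_https: bool,
}

impl Node {
    fn new(host: String, http_port: u16, https_port: Option<u16>, use_https: bool) -> Result<Self, String> {
        // Enforce the invariant once, at the only place a Node can be built.
        if use_https && https_port.is_none() {
            return Err("https is enabled, but node has no https port".into());
        }
        Ok(Self { host, http_port, https_port, use_https })
    }

    fn base_url(&self) -> String {
        if self.use_https {
            // Safe: new() rejected use_https without a port.
            format!("https://{}:{}", self.host, self.https_port.expect("checked in new()"))
        } else {
            format!("http://{}:{}", self.host, self.http_port)
        }
    }
}

fn main() {
    let n = Node::new("ps-1".into(), 9898, Some(9899), true).unwrap();
    assert_eq!(n.base_url(), "https://ps-1:9899");
    assert!(Node::new("ps-2".into(), 9898, None, true).is_err());
}
```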
pub(crate) fn shard_location(&self, shard_id: TenantShardId) -> TenantLocateResponseShard { @@ -95,6 +118,7 @@ impl Node { node_id: self.id, listen_http_addr: self.listen_http_addr.clone(), listen_http_port: self.listen_http_port, + listen_https_port: self.listen_https_port, listen_pg_addr: self.listen_pg_addr.clone(), listen_pg_port: self.listen_pg_port, } @@ -175,25 +199,34 @@ impl Node { } } + #[allow(clippy::too_many_arguments)] pub(crate) fn new( id: NodeId, listen_http_addr: String, listen_http_port: u16, + listen_https_port: Option, listen_pg_addr: String, listen_pg_port: u16, availability_zone_id: AvailabilityZone, - ) -> Self { - Self { + use_https: bool, + ) -> anyhow::Result { + if use_https && listen_https_port.is_none() { + return Err(anyhow!("https is enabled, but node has no https port")); + } + + Ok(Self { id, listen_http_addr, listen_http_port, + listen_https_port, listen_pg_addr, listen_pg_port, scheduling: NodeSchedulingPolicy::Active, availability: NodeAvailability::Offline, availability_zone_id, + use_https, cancel: CancellationToken::new(), - } + }) } pub(crate) fn to_persistent(&self) -> NodePersistence { @@ -202,14 +235,19 @@ impl Node { scheduling_policy: self.scheduling.into(), listen_http_addr: self.listen_http_addr.clone(), listen_http_port: self.listen_http_port as i32, + listen_https_port: self.listen_https_port.map(|x| x as i32), listen_pg_addr: self.listen_pg_addr.clone(), listen_pg_port: self.listen_pg_port as i32, availability_zone_id: self.availability_zone_id.0.clone(), } } - pub(crate) fn from_persistent(np: NodePersistence) -> Self { - Self { + pub(crate) fn from_persistent(np: NodePersistence, use_https: bool) -> anyhow::Result { + if use_https && np.listen_https_port.is_none() { + return Err(anyhow!("https is enabled, but node has no https port")); + } + + Ok(Self { id: NodeId(np.node_id as u64), // At startup we consider a node offline until proven otherwise. 
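Splitting `need_update` out of `registration_match` distinguishes "this is a different node" (a registration error) from "this is the same node whose https port changed during the https migration" (a database update). A stripped-down sketch of the two predicates, with hypothetical fields:

```rust
#[derive(Clone)]
struct NodeRecord {
    http_addr: String,
    http_port: u16,
    https_port: Option<u16>,
}

impl NodeRecord {
    fn registration_match(&self, req: &NodeRecord) -> bool {
        // https_port intentionally excluded: it may legitimately change
        // while nodes are being migrated to https.
        self.http_addr == req.http_addr && self.http_port == req.http_port
    }

    fn need_update(&self, req: &NodeRecord) -> bool {
        self.https_port != req.https_port
    }
}

fn main() {
    let stored = NodeRecord { http_addr: "ps-1".into(), http_port: 9898, https_port: None };
    let req = NodeRecord { https_port: Some(9899), ..stored.clone() };
    assert!(stored.registration_match(&req)); // same node...
    assert!(stored.need_update(&req)); // ...but its https port changed
}
```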
availability: NodeAvailability::Offline, @@ -217,11 +255,13 @@ impl Node { .expect("Bad scheduling policy in DB"), listen_http_addr: np.listen_http_addr, listen_http_port: np.listen_http_port as u16, + listen_https_port: np.listen_https_port.map(|x| x as u16), listen_pg_addr: np.listen_pg_addr, listen_pg_port: np.listen_pg_port as u16, availability_zone_id: AvailabilityZone(np.availability_zone_id), + use_https, cancel: CancellationToken::new(), - } + }) } /// Wrapper for issuing requests to pageserver management API: takes care of generic @@ -285,8 +325,9 @@ impl Node { warn_threshold, max_retries, &format!( - "Call to node {} ({}:{}) management API", - self.id, self.listen_http_addr, self.listen_http_port + "Call to node {} ({}) management API", + self.id, + self.base_url(), ), cancel, ) @@ -302,6 +343,7 @@ impl Node { availability_zone_id: self.availability_zone_id.0.clone(), listen_http_addr: self.listen_http_addr.clone(), listen_http_port: self.listen_http_port, + listen_https_port: self.listen_https_port, listen_pg_addr: self.listen_pg_addr.clone(), listen_pg_port: self.listen_pg_port, } diff --git a/storage_controller/src/pageserver_client.rs b/storage_controller/src/pageserver_client.rs index 141ff6f720..645cbdfce1 100644 --- a/storage_controller/src/pageserver_client.rs +++ b/storage_controller/src/pageserver_client.rs @@ -280,6 +280,22 @@ impl PageserverClient { ) } + pub(crate) async fn timeline_download_heatmap_layers( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + concurrency: Option, + ) -> Result<()> { + measured_request!( + "download_heatmap_layers", + crate::metrics::Method::Post, + &self.node_id_label, + self.inner + .timeline_download_heatmap_layers(tenant_shard_id, timeline_id, concurrency) + .await + ) + } + pub(crate) async fn get_utilization(&self) -> Result { measured_request!( "utilization", diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 67b60eadf3..459c11add9 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -375,18 +375,23 @@ impl Persistence { Ok(nodes) } - pub(crate) async fn update_node( + pub(crate) async fn update_node( &self, input_node_id: NodeId, - input_scheduling: NodeSchedulingPolicy, - ) -> DatabaseResult<()> { + values: V, + ) -> DatabaseResult<()> + where + V: diesel::AsChangeset + Clone + Send + Sync, + V::Changeset: diesel::query_builder::QueryFragment + Send, // valid Postgres SQL + { use crate::schema::nodes::dsl::*; let updated = self .with_measured_conn(DatabaseOperation::UpdateNode, move |conn| { + let values = values.clone(); Box::pin(async move { let updated = diesel::update(nodes) .filter(node_id.eq(input_node_id.0 as i64)) - .set((scheduling_policy.eq(String::from(input_scheduling)),)) + .set(values) .execute(conn) .await?; Ok(updated) @@ -403,6 +408,32 @@ impl Persistence { } } + pub(crate) async fn update_node_scheduling_policy( + &self, + input_node_id: NodeId, + input_scheduling: NodeSchedulingPolicy, + ) -> DatabaseResult<()> { + use crate::schema::nodes::dsl::*; + self.update_node( + input_node_id, + scheduling_policy.eq(String::from(input_scheduling)), + ) + .await + } + + pub(crate) async fn update_node_on_registration( + &self, + input_node_id: NodeId, + input_https_port: Option, + ) -> DatabaseResult<()> { + use crate::schema::nodes::dsl::*; + self.update_node( + input_node_id, + listen_https_port.eq(input_https_port.map(|x| x as i32)), + ) + .await + } + /// At startup, load the high level state for shards, 
such as their config + policy. This will /// be enriched at runtime with state discovered on pageservers. /// @@ -1452,6 +1483,7 @@ pub(crate) struct NodePersistence { pub(crate) listen_pg_addr: String, pub(crate) listen_pg_port: i32, pub(crate) availability_zone_id: String, + pub(crate) listen_https_port: Option, } /// Tenant metadata health status that are stored durably. diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index 48f0804926..ba3946fa3e 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -1,6 +1,7 @@ use crate::pageserver_client::PageserverClient; use crate::persistence::Persistence; use crate::{compute_hook, service}; +use json_structural_diff::JsonDiff; use pageserver_api::controller_api::{AvailabilityZone, MigrationConfig, PlacementPolicy}; use pageserver_api::models::{ LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig, TenantWaitLsnRequest, @@ -24,7 +25,7 @@ use crate::compute_hook::{ComputeHook, NotifyError}; use crate::node::Node; use crate::tenant_shard::{IntentState, ObservedState, ObservedStateDelta, ObservedStateLocation}; -const DEFAULT_HEATMAP_PERIOD: &str = "60s"; +const DEFAULT_HEATMAP_PERIOD: Duration = Duration::from_secs(60); /// Object with the lifetime of the background reconcile task that is created /// for tenants which have a difference between their intent and observed states. @@ -880,7 +881,27 @@ impl Reconciler { self.generation = Some(generation); wanted_conf.generation = generation.into(); } - tracing::info!(node_id=%node.get_id(), "Observed configuration requires update."); + + let diff = match observed { + Some(ObservedStateLocation { + conf: Some(observed), + }) => { + let diff = JsonDiff::diff( + &serde_json::to_value(observed.clone()).unwrap(), + &serde_json::to_value(wanted_conf.clone()).unwrap(), + false, + ); + + if let Some(json_diff) = diff.diff { + serde_json::to_string(&json_diff).unwrap_or("diff err".to_string()) + } else { + "unknown".to_string() + } + } + _ => "full".to_string(), + }; + + tracing::info!(node_id=%node.get_id(), "Observed configuration requires update: {diff}"); // Because `node` comes from a ref to &self, clone it before calling into a &mut self // function: this could be avoided by refactoring the state mutated by location_config into @@ -1180,7 +1201,7 @@ fn ha_aware_config(config: &TenantConfig, has_secondaries: bool) -> TenantConfig let mut config = config.clone(); if has_secondaries { if config.heatmap_period.is_none() { - config.heatmap_period = Some(DEFAULT_HEATMAP_PERIOD.to_string()); + config.heatmap_period = Some(DEFAULT_HEATMAP_PERIOD); } } else { config.heatmap_period = None; diff --git a/storage_controller/src/safekeeper.rs b/storage_controller/src/safekeeper.rs index be073d0cb9..53cd8a908b 100644 --- a/storage_controller/src/safekeeper.rs +++ b/storage_controller/src/safekeeper.rs @@ -18,12 +18,14 @@ pub struct Safekeeper { cancel: CancellationToken, listen_http_addr: String, listen_http_port: u16, + scheduling_policy: SkSchedulingPolicy, id: NodeId, availability: SafekeeperState, } impl Safekeeper { pub(crate) fn from_persistence(skp: SafekeeperPersistence, cancel: CancellationToken) -> Self { + let scheduling_policy = SkSchedulingPolicy::from_str(&skp.scheduling_policy).unwrap(); Self { cancel, listen_http_addr: skp.host.clone(), @@ -31,6 +33,7 @@ impl Safekeeper { id: NodeId(skp.id as u64), skp, availability: SafekeeperState::Offline, + scheduling_policy, } } pub(crate) fn base_url(&self) -> 
String { @@ -46,6 +49,13 @@ impl Safekeeper { pub(crate) fn set_availability(&mut self, availability: SafekeeperState) { self.availability = availability; } + pub(crate) fn scheduling_policy(&self) -> SkSchedulingPolicy { + self.scheduling_policy + } + pub(crate) fn set_scheduling_policy(&mut self, scheduling_policy: SkSchedulingPolicy) { + self.scheduling_policy = scheduling_policy; + self.skp.scheduling_policy = String::from(scheduling_policy); + } /// Perform an operation (which is given a [`SafekeeperClient`]) with retries pub(crate) async fn with_client_retries( &self, @@ -102,7 +112,7 @@ impl Safekeeper { warn_threshold, max_retries, &format!( - "Call to node {} ({}:{}) management API", + "Call to safekeeper {} ({}:{}) management API", self.id, self.listen_http_addr, self.listen_http_port ), cancel, @@ -129,10 +139,8 @@ impl Safekeeper { self.id.0 ); } - self.skp = crate::persistence::SafekeeperPersistence::from_upsert( - record, - SkSchedulingPolicy::from_str(&self.skp.scheduling_policy).unwrap(), - ); + self.skp = + crate::persistence::SafekeeperPersistence::from_upsert(record, self.scheduling_policy); self.listen_http_port = http_port as u16; self.listen_http_addr = host; } diff --git a/storage_controller/src/safekeeper_client.rs b/storage_controller/src/safekeeper_client.rs index bb494f20fa..f234ab3429 100644 --- a/storage_controller/src/safekeeper_client.rs +++ b/storage_controller/src/safekeeper_client.rs @@ -1,5 +1,8 @@ use crate::metrics::PageserverRequestLabelGroup; -use safekeeper_api::models::{SafekeeperUtilization, TimelineCreateRequest, TimelineStatus}; +use safekeeper_api::models::{ + PullTimelineRequest, PullTimelineResponse, SafekeeperUtilization, TimelineCreateRequest, + TimelineStatus, +}; use safekeeper_client::mgmt_api::{Client, Result}; use utils::{ id::{NodeId, TenantId, TimelineId}, @@ -94,6 +97,19 @@ impl SafekeeperClient { ) } + #[allow(dead_code)] + pub(crate) async fn pull_timeline( + &self, + req: &PullTimelineRequest, + ) -> Result { + measured_request!( + "pull_timeline", + crate::metrics::Method::Post, + &self.node_id_label, + self.inner.pull_timeline(req).await + ) + } + pub(crate) async fn get_utilization(&self) -> Result { measured_request!( "utilization", diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs index 106a7b2699..44936d018a 100644 --- a/storage_controller/src/scheduler.rs +++ b/storage_controller/src/scheduler.rs @@ -930,13 +930,16 @@ pub(crate) mod test_utils { NodeId(i), format!("httphost-{i}"), 80 + i as u16, + None, format!("pghost-{i}"), 5432 + i as u16, az_iter .next() .cloned() .unwrap_or(AvailabilityZone("test-az".to_string())), - ); + false, + ) + .unwrap(); node.set_availability(NodeAvailability::Active(test_utilization::simple(0, 0))); assert!(node.is_available()); node diff --git a/storage_controller/src/schema.rs b/storage_controller/src/schema.rs index 14c30c296d..361253bd19 100644 --- a/storage_controller/src/schema.rs +++ b/storage_controller/src/schema.rs @@ -26,6 +26,7 @@ diesel::table! 
{ listen_pg_addr -> Varchar, listen_pg_port -> Int4, availability_zone_id -> Varchar, + listen_https_port -> Nullable, } } diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index d5713d49ee..01ace40b84 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -162,6 +162,7 @@ enum TenantOperations { TimelineDetachAncestor, TimelineGcBlockUnblock, DropDetached, + DownloadHeatmapLayers, } #[derive(Clone, strum_macros::Display)] @@ -398,6 +399,8 @@ pub struct Config { pub http_service_port: i32, pub long_reconcile_threshold: Duration, + + pub use_https_pageserver_api: bool, } impl From for ApiError { @@ -814,11 +817,12 @@ impl Service { }; tracing::info!("Sending initial heartbeats..."); - let res_ps = self - .heartbeater_ps - .heartbeat(Arc::new(nodes_to_heartbeat)) - .await; - let res_sk = self.heartbeater_sk.heartbeat(all_sks).await; + // Put a small, but reasonable timeout to get the initial heartbeats of the safekeepers to avoid a storage controller downtime + const SK_TIMEOUT: Duration = Duration::from_secs(5); + let (res_ps, res_sk) = tokio::join!( + self.heartbeater_ps.heartbeat(Arc::new(nodes_to_heartbeat)), + tokio::time::timeout(SK_TIMEOUT, self.heartbeater_sk.heartbeat(all_sks)) + ); let mut online_nodes = HashMap::new(); if let Ok(deltas) = res_ps { @@ -836,7 +840,7 @@ impl Service { } let mut online_sks = HashMap::new(); - if let Ok(deltas) = res_sk { + if let Ok(Ok(deltas)) = res_sk { for (node_id, status) in deltas.0 { match status { SafekeeperState::Available { @@ -1030,12 +1034,11 @@ impl Service { let reconciles_spawned = self.reconcile_all(); if reconciles_spawned == 0 { // Run optimizer only when we didn't find any other work to do - let optimizations = self.optimize_all().await; - if optimizations == 0 { - // Run new splits only when no optimizations are pending - self.autosplit_tenants().await; - } + self.optimize_all().await; } + // Always attempt autosplits. Sharding is crucial for bulk ingest performance, so we + // must be responsive when new projects begin ingesting and reach the threshold. + self.autosplit_tenants().await; } _ = self.reconcilers_cancel.cancelled() => return } @@ -1062,8 +1065,12 @@ impl Service { locked.safekeepers.clone() }; - let res_ps = self.heartbeater_ps.heartbeat(nodes).await; - let res_sk = self.heartbeater_sk.heartbeat(safekeepers).await; + const SK_TIMEOUT: Duration = Duration::from_secs(3); + let (res_ps, res_sk) = tokio::join!( + self.heartbeater_ps.heartbeat(nodes), + tokio::time::timeout(SK_TIMEOUT, self.heartbeater_sk.heartbeat(safekeepers)) + ); + if let Ok(deltas) = res_ps { let mut to_handle = Vec::default(); @@ -1165,7 +1172,7 @@ impl Service { } } } - if let Ok(deltas) = res_sk { + if let Ok(Ok(deltas)) = res_sk { let mut locked = self.inner.write().unwrap(); let mut safekeepers = (*locked.safekeepers).clone(); for (id, state) in deltas.0 { @@ -1396,8 +1403,8 @@ impl Service { .list_nodes() .await? 
.into_iter() - .map(Node::from_persistent) - .collect::>(); + .map(|x| Node::from_persistent(x, config.use_https_pageserver_api)) + .collect::>>()?; let nodes: HashMap = nodes.into_iter().map(|n| (n.get_id(), n)).collect(); tracing::info!("Loaded {} nodes from database.", nodes.len()); metrics::METRICS_REGISTRY @@ -1496,10 +1503,13 @@ impl Service { NodeId(node_id as u64), "".to_string(), 123, + None, "".to_string(), 123, AvailabilityZone("test_az".to_string()), - ); + false, + ) + .unwrap(); scheduler.node_upsert(&node); } @@ -2911,7 +2921,9 @@ impl Service { first }; - let updated_config = base.apply_patch(patch); + let updated_config = base + .apply_patch(patch) + .map_err(|err| ApiError::BadRequest(anyhow::anyhow!(err)))?; self.set_tenant_config_and_reconcile(tenant_id, updated_config) .await } @@ -3757,6 +3769,61 @@ impl Service { Ok(()) } + pub(crate) async fn tenant_timeline_download_heatmap_layers( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + concurrency: Option, + ) -> Result<(), ApiError> { + let _tenant_lock = trace_shared_lock( + &self.tenant_op_locks, + tenant_shard_id.tenant_id, + TenantOperations::DownloadHeatmapLayers, + ) + .await; + + let targets = { + let locked = self.inner.read().unwrap(); + let mut targets = Vec::new(); + + // If the request got an unsharded tenant id, then apply + // the operation to all shards. Otherwise, apply it to a specific shard. + let shards_range = if tenant_shard_id.is_unsharded() { + TenantShardId::tenant_range(tenant_shard_id.tenant_id) + } else { + tenant_shard_id.range() + }; + + for (tenant_shard_id, shard) in locked.tenants.range(shards_range) { + if let Some(node_id) = shard.intent.get_attached() { + let node = locked + .nodes + .get(node_id) + .expect("Pageservers may not be deleted while referenced"); + + targets.push((*tenant_shard_id, node.clone())); + } + } + targets + }; + + self.tenant_for_shards_api( + targets, + |tenant_shard_id, client| async move { + client + .timeline_download_heatmap_layers(tenant_shard_id, timeline_id, concurrency) + .await + }, + 1, + 1, + SHORT_RECONCILE_TIMEOUT, + &self.cancel, + ) + .await; + + Ok(()) + } + /// Helper for concurrently calling a pageserver API on a number of shards, such as timeline creation. /// /// On success, the returned vector contains exactly the same number of elements as the input `locations`. 
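Wrapping only the safekeeper heartbeats in `tokio::time::timeout` while joining both fleets, as the service changes above do, means a hung safekeeper delays the controller by at most `SK_TIMEOUT` rather than indefinitely. A runnable sketch of the pattern, assuming the tokio runtime, with stub async functions standing in for the real heartbeaters:

```rust
use std::time::Duration;

async fn heartbeat_pageservers() -> usize {
    3 // pretend three pageservers answered
}

async fn heartbeat_safekeepers() -> usize {
    tokio::time::sleep(Duration::from_secs(60)).await; // simulate a hang
    5
}

#[tokio::main]
async fn main() {
    const SK_TIMEOUT: Duration = Duration::from_secs(5);
    // Both heartbeat rounds run concurrently; only the safekeeper side is bounded.
    let (ps, sk) = tokio::join!(
        heartbeat_pageservers(),
        tokio::time::timeout(SK_TIMEOUT, heartbeat_safekeepers()),
    );
    println!("pageservers online: {ps}");
    match sk {
        Ok(n) => println!("safekeepers online: {n}"),
        Err(_) => println!("safekeeper heartbeats timed out; continuing startup"),
    }
}
```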
@@ -5847,8 +5914,10 @@ impl Service { ) .await; + #[derive(PartialEq)] enum RegistrationStatus { - Matched, + UpToDate, + NeedUpdate, Mismatched, New, } @@ -5857,7 +5926,11 @@ impl Service { let locked = self.inner.read().unwrap(); if let Some(node) = locked.nodes.get(®ister_req.node_id) { if node.registration_match(®ister_req) { - RegistrationStatus::Matched + if node.need_update(®ister_req) { + RegistrationStatus::NeedUpdate + } else { + RegistrationStatus::UpToDate + } } else { RegistrationStatus::Mismatched } @@ -5867,9 +5940,9 @@ impl Service { }; match registration_status { - RegistrationStatus::Matched => { + RegistrationStatus::UpToDate => { tracing::info!( - "Node {} re-registered with matching address", + "Node {} re-registered with matching address and is up to date", register_req.node_id ); @@ -5887,7 +5960,7 @@ impl Service { "Node is already registered with different address".to_string(), )); } - RegistrationStatus::New => { + RegistrationStatus::New | RegistrationStatus::NeedUpdate => { // fallthrough } } @@ -5916,6 +5989,16 @@ impl Service { )); } + if self.config.use_https_pageserver_api && register_req.listen_https_port.is_none() { + return Err(ApiError::PreconditionFailed( + format!( + "Node {} has no https port, but use_https is enabled", + register_req.node_id + ) + .into(), + )); + } + // Ordering: we must persist the new node _before_ adding it to in-memory state. // This ensures that before we use it for anything or expose it via any external // API, it is guaranteed to be available after a restart. @@ -5923,13 +6006,29 @@ impl Service { register_req.node_id, register_req.listen_http_addr, register_req.listen_http_port, + register_req.listen_https_port, register_req.listen_pg_addr, register_req.listen_pg_port, register_req.availability_zone_id.clone(), + self.config.use_https_pageserver_api, ); + let new_node = match new_node { + Ok(new_node) => new_node, + Err(error) => return Err(ApiError::InternalServerError(error)), + }; - // TODO: idempotency if the node already exists in the database - self.persistence.insert_node(&new_node).await?; + match registration_status { + RegistrationStatus::New => self.persistence.insert_node(&new_node).await?, + RegistrationStatus::NeedUpdate => { + self.persistence + .update_node_on_registration( + register_req.node_id, + register_req.listen_https_port, + ) + .await? 
+ } + _ => unreachable!("Other statuses have been processed earlier"), + } let mut locked = self.inner.write().unwrap(); let mut new_nodes = (*locked.nodes).clone(); @@ -5944,12 +6043,24 @@ impl Service { .storage_controller_pageserver_nodes .set(locked.nodes.len() as i64); - tracing::info!( - "Registered pageserver {} ({}), now have {} pageservers", - register_req.node_id, - register_req.availability_zone_id, - locked.nodes.len() - ); + match registration_status { + RegistrationStatus::New => { + tracing::info!( + "Registered pageserver {} ({}), now have {} pageservers", + register_req.node_id, + register_req.availability_zone_id, + locked.nodes.len() + ); + } + RegistrationStatus::NeedUpdate => { + tracing::info!( + "Re-registered and updated node {} ({})", + register_req.node_id, + register_req.availability_zone_id, + ); + } + _ => unreachable!("Other statuses have been processed earlier"), + } Ok(()) } @@ -5967,7 +6078,9 @@ impl Service { if let Some(scheduling) = scheduling { // Scheduling is a persistent part of Node: we must write updates to the database before // applying them in memory - self.persistence.update_node(node_id, scheduling).await?; + self.persistence + .update_node_scheduling_policy(node_id, scheduling) + .await?; } // If we're activating a node, then before setting it active we must reconcile any shard locations @@ -6538,11 +6651,12 @@ impl Service { ) -> Option { let reconcile_needed = shard.get_reconcile_needed(nodes); - match reconcile_needed { + let reconcile_reason = match reconcile_needed { ReconcileNeeded::No => return None, ReconcileNeeded::WaitExisting(waiter) => return Some(waiter), - ReconcileNeeded::Yes => { + ReconcileNeeded::Yes(reason) => { // Fall through to try and acquire units for spawning reconciler + reason } }; @@ -6581,6 +6695,7 @@ impl Service { }; shard.spawn_reconciler( + reconcile_reason, &self.result_tx, nodes, &self.compute_hook, @@ -7905,7 +8020,7 @@ impl Service { let sk = safekeepers .get_mut(&node_id) .ok_or(DatabaseError::Logical("Not found".to_string()))?; - sk.skp.scheduling_policy = String::from(scheduling_policy); + sk.set_scheduling_policy(scheduling_policy); locked.safekeepers = Arc::new(safekeepers); } diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index 219c0dffe7..56a36dc2df 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -481,7 +481,14 @@ pub(crate) enum ReconcileNeeded { /// spawned: wait for the existing reconciler rather than spawning a new one. WaitExisting(ReconcilerWaiter), /// shard needs reconciliation: call into [`TenantShard::spawn_reconciler`] - Yes, + Yes(ReconcileReason), +} + +#[derive(Debug)] +pub(crate) enum ReconcileReason { + ActiveNodesDirty, + UnknownLocation, + PendingComputeNotification, } /// Pending modification to the observed state of a tenant shard. @@ -1341,12 +1348,18 @@ impl TenantShard { let active_nodes_dirty = self.dirty(pageservers); - // Even if there is no pageserver work to be done, if we have a pending notification to computes, - // wake up a reconciler to send it. 
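Replacing the boolean `do_reconcile` with `ReconcileNeeded::Yes(ReconcileReason)` threads the first matching dirty flag through to `spawn_reconciler`'s log line. The priority encoding of the tuple match, reduced to a sketch:

```rust
#[derive(Debug)]
enum ReconcileReason {
    ActiveNodesDirty,
    UnknownLocation,
    PendingComputeNotification,
}

// The first set flag, in priority order, names why a reconciler is spawned.
fn reason(active_nodes_dirty: bool, dirty_observed: bool, pending_compute: bool) -> Option<ReconcileReason> {
    match (active_nodes_dirty, dirty_observed, pending_compute) {
        (true, _, _) => Some(ReconcileReason::ActiveNodesDirty),
        (_, true, _) => Some(ReconcileReason::UnknownLocation),
        (_, _, true) => Some(ReconcileReason::PendingComputeNotification),
        _ => None, // not dirty: no reconciliation needed
    }
}

fn main() {
    assert!(matches!(reason(true, true, false), Some(ReconcileReason::ActiveNodesDirty)));
    assert!(matches!(reason(false, false, true), Some(ReconcileReason::PendingComputeNotification)));
    assert!(reason(false, false, false).is_none());
}
```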
- let do_reconcile = - active_nodes_dirty || dirty_observed || self.pending_compute_notification; + let reconcile_needed = match ( + active_nodes_dirty, + dirty_observed, + self.pending_compute_notification, + ) { + (true, _, _) => ReconcileNeeded::Yes(ReconcileReason::ActiveNodesDirty), + (_, true, _) => ReconcileNeeded::Yes(ReconcileReason::UnknownLocation), + (_, _, true) => ReconcileNeeded::Yes(ReconcileReason::PendingComputeNotification), + _ => ReconcileNeeded::No, + }; - if !do_reconcile { + if matches!(reconcile_needed, ReconcileNeeded::No) { tracing::debug!("Not dirty, no reconciliation needed."); return ReconcileNeeded::No; } @@ -1389,7 +1402,7 @@ impl TenantShard { } } - ReconcileNeeded::Yes + reconcile_needed } /// Ensure the sequence number is set to a value where waiting for this value will make us wait @@ -1479,6 +1492,7 @@ impl TenantShard { #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))] pub(crate) fn spawn_reconciler( &mut self, + reason: ReconcileReason, result_tx: &tokio::sync::mpsc::UnboundedSender, pageservers: &Arc>, compute_hook: &Arc, @@ -1538,7 +1552,7 @@ impl TenantShard { let reconcile_seq = self.sequence; let long_reconcile_threshold = service_config.long_reconcile_threshold; - tracing::info!(seq=%reconcile_seq, "Spawning Reconciler for sequence {}", self.sequence); + tracing::info!(seq=%reconcile_seq, "Spawning Reconciler ({reason:?})"); let must_notify = self.pending_compute_notification; let reconciler_span = tracing::info_span!(parent: None, "reconciler", seq=%reconcile_seq, tenant_id=%reconciler.tenant_shard_id.tenant_id, diff --git a/test_runner/fixtures/fast_import.py b/test_runner/fixtures/fast_import.py index 33248132ab..d674be99de 100644 --- a/test_runner/fixtures/fast_import.py +++ b/test_runner/fixtures/fast_import.py @@ -4,8 +4,10 @@ import subprocess import tempfile from collections.abc import Iterator from pathlib import Path +from typing import cast import pytest +from _pytest.config import Config from fixtures.log_helper import log from fixtures.neon_cli import AbstractNeonCli @@ -23,6 +25,7 @@ class FastImport(AbstractNeonCli): pg_distrib_dir: Path, pg_version: PgVersion, workdir: Path, + cleanup: bool = True, ): if extra_env is None: env_vars = {} @@ -47,12 +50,43 @@ class FastImport(AbstractNeonCli): if not workdir.exists(): raise Exception(f"Working directory '{workdir}' does not exist") self.workdir = workdir + self.cleanup = cleanup + + def run_pgdata( + self, + s3prefix: str | None = None, + pg_port: int | None = None, + source_connection_string: str | None = None, + interactive: bool = False, + ): + return self.run( + "pgdata", + s3prefix=s3prefix, + pg_port=pg_port, + source_connection_string=source_connection_string, + interactive=interactive, + ) + + def run_dump_restore( + self, + s3prefix: str | None = None, + source_connection_string: str | None = None, + destination_connection_string: str | None = None, + ): + return self.run( + "dump-restore", + s3prefix=s3prefix, + source_connection_string=source_connection_string, + destination_connection_string=destination_connection_string, + ) def run( self, - pg_port: int, - source_connection_string: str | None = None, + command: str, s3prefix: str | None = None, + pg_port: int | None = None, + source_connection_string: str | None = None, + destination_connection_string: str | None = None, interactive: bool = False, ) -> subprocess.CompletedProcess[str]: if self.cmd is not None: @@ -60,13 +94,17 @@ class 
FastImport(AbstractNeonCli): args = [ f"--pg-bin-dir={self.pg_bin}", f"--pg-lib-dir={self.pg_lib}", - f"--pg-port={pg_port}", f"--working-directory={self.workdir}", ] - if source_connection_string is not None: - args.append(f"--source-connection-string={source_connection_string}") if s3prefix is not None: args.append(f"--s3-prefix={s3prefix}") + args.append(command) + if pg_port is not None: + args.append(f"--pg-port={pg_port}") + if source_connection_string is not None: + args.append(f"--source-connection-string={source_connection_string}") + if destination_connection_string is not None: + args.append(f"--destination-connection-string={destination_connection_string}") if interactive: args.append("--interactive") @@ -77,7 +115,7 @@ class FastImport(AbstractNeonCli): return self def __exit__(self, *args): - if self.workdir.exists(): + if self.workdir.exists() and self.cleanup: shutil.rmtree(self.workdir) @@ -87,9 +125,17 @@ def fast_import( test_output_dir: Path, neon_binpath: Path, pg_distrib_dir: Path, + pytestconfig: Config, ) -> Iterator[FastImport]: - workdir = Path(tempfile.mkdtemp()) - with FastImport(None, neon_binpath, pg_distrib_dir, pg_version, workdir) as fi: + workdir = Path(tempfile.mkdtemp(dir=test_output_dir, prefix="fast_import_")) + with FastImport( + None, + neon_binpath, + pg_distrib_dir, + pg_version, + workdir, + cleanup=not cast(bool, pytestconfig.getoption("--preserve-database-files")), + ) as fi: yield fi if fi.cmd is None: diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 469bc8a1e5..1d282971b1 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -27,6 +27,7 @@ from urllib.parse import quote, urlparse import asyncpg import backoff +import boto3 import httpx import psycopg2 import psycopg2.sql @@ -37,6 +38,8 @@ from _pytest.config import Config from _pytest.config.argparsing import Parser from _pytest.fixtures import FixtureRequest from jwcrypto import jwk +from mypy_boto3_kms import KMSClient +from mypy_boto3_s3 import S3Client # Type-related stuff from psycopg2.extensions import connection as PgConnection @@ -93,7 +96,7 @@ from fixtures.utils import ( ATTACHMENT_NAME_REGEX, COMPONENT_BINARIES, USE_LFC, - allure_add_grafana_links, + allure_add_grafana_link, assert_no_errors, get_dir_size, print_gc_result, @@ -199,6 +202,30 @@ def mock_s3_server(port_distributor: PortDistributor) -> Iterator[MockS3Server]: mock_s3_server.kill() +@pytest.fixture(scope="session") +def mock_kms(mock_s3_server: MockS3Server) -> Iterator[KMSClient]: + yield boto3.client( + "kms", + endpoint_url=mock_s3_server.endpoint(), + region_name=mock_s3_server.region(), + aws_access_key_id=mock_s3_server.access_key(), + aws_secret_access_key=mock_s3_server.secret_key(), + aws_session_token=mock_s3_server.session_token(), + ) + + +@pytest.fixture(scope="session") +def mock_s3_client(mock_s3_server: MockS3Server) -> Iterator[S3Client]: + yield boto3.client( + "s3", + endpoint_url=mock_s3_server.endpoint(), + region_name=mock_s3_server.region(), + aws_access_key_id=mock_s3_server.access_key(), + aws_secret_access_key=mock_s3_server.secret_key(), + aws_session_token=mock_s3_server.session_token(), + ) + + class PgProtocol: """Reusable connection logic""" @@ -464,6 +491,7 @@ class NeonEnvBuilder: self.test_may_use_compatibility_snapshot_binaries = False self.version_combination = combination self.mixdir = self.test_output_dir / "mixdir_neon" + if self.version_combination is not None: assert ( 
self.compatibility_neon_binpath is not None @@ -675,6 +703,11 @@ class NeonEnvBuilder: def _mix_versions(self): assert self.version_combination is not None, "version combination must be set" + + # Always use a newer version of `neon_local` + (self.mixdir / "neon_local").hardlink_to(self.neon_binpath / "neon_local") + self.neon_local_binpath = self.mixdir + for component, paths in COMPONENT_BINARIES.items(): directory = ( self.neon_binpath @@ -683,10 +716,11 @@ class NeonEnvBuilder: ) for filename in paths: destination = self.mixdir / filename - destination.symlink_to(directory / filename) + destination.hardlink_to(directory / filename) + self.neon_binpath = self.mixdir + if self.version_combination["compute"] == "old": self.pg_distrib_dir = self.compatibility_pg_distrib_dir - self.neon_binpath = self.mixdir def overlay_mount(self, ident: str, srcdir: Path, dstdir: Path): """ @@ -1133,15 +1167,15 @@ class NeonEnv: "max_batch_size": 32, } - # Concurrent IO (https://github.com/neondatabase/neon/issues/9378): - # enable concurrent IO by default in tests and benchmarks. - # Compat tests are exempt because old versions fail to parse the new config. - get_vectored_concurrent_io = self.pageserver_get_vectored_concurrent_io if config.test_may_use_compatibility_snapshot_binaries: log.info( - "Forcing use of binary-built-in default to avoid forward-compatibility related test failures" + "Skipping WAL contiguity validation to avoid forward-compatibility related test failures" ) - get_vectored_concurrent_io = None + else: + # Look for gaps in WAL received from safekeepers + ps_cfg["validate_wal_contiguity"] = True + + get_vectored_concurrent_io = self.pageserver_get_vectored_concurrent_io if get_vectored_concurrent_io is not None: ps_cfg["get_vectored_concurrent_io"] = { "mode": self.pageserver_get_vectored_concurrent_io, @@ -1596,6 +1630,7 @@ def neon_env_builder( class PageserverPort: pg: int http: int + https: int | None = None class LogUtils: @@ -1852,6 +1887,7 @@ class NeonStorageController(MetricsGetter, LogUtils): "node_id": int(node.id), "listen_http_addr": "localhost", "listen_http_port": node.service_port.http, + "listen_https_port": node.service_port.https, "listen_pg_addr": "localhost", "listen_pg_port": node.service_port.pg, "availability_zone_id": node.az_id, @@ -2433,6 +2469,14 @@ class NeonStorageController(MetricsGetter, LogUtils): response.raise_for_status() return [TenantShardId.parse(tid) for tid in response.json()["updated"]] + def download_heatmap_layers(self, tenant_shard_id: TenantShardId, timeline_id: TimelineId): + response = self.request( + "POST", + f"{self.api}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/download_heatmap_layers", + headers=self.headers(TokenScope.ADMIN), + ) + response.raise_for_status() + def __enter__(self) -> Self: return self @@ -3213,7 +3257,7 @@ def remote_pg( end_ms = int(datetime.utcnow().timestamp() * 1000) if is_neon: # Add 10s margin to the start and end times - allure_add_grafana_links( + allure_add_grafana_link( host, timeline_id, start_ms - 10_000, diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index e160c617cd..84d62fb877 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -52,11 +52,11 @@ COMPONENT_BINARIES = { # Disable auto-formatting for better readability # fmt: off VERSIONS_COMBINATIONS = ( - {"storage_controller": "new", "storage_broker": "new", "compute": "new", "safekeeper": "new", "pageserver": "new"}, - {"storage_controller": "new", "storage_broker": "new",
"compute": "old", "safekeeper": "old", "pageserver": "old"}, - {"storage_controller": "new", "storage_broker": "new", "compute": "old", "safekeeper": "old", "pageserver": "new"}, - {"storage_controller": "new", "storage_broker": "new", "compute": "old", "safekeeper": "new", "pageserver": "new"}, - {"storage_controller": "old", "storage_broker": "old", "compute": "new", "safekeeper": "new", "pageserver": "new"}, + {"storage_controller": "new", "storage_broker": "new", "compute": "new", "safekeeper": "new", "pageserver": "new"}, # combination: nnnnn + {"storage_controller": "new", "storage_broker": "new", "compute": "old", "safekeeper": "old", "pageserver": "old"}, # combination: ooonn + {"storage_controller": "new", "storage_broker": "new", "compute": "old", "safekeeper": "old", "pageserver": "new"}, # combination: ononn + {"storage_controller": "new", "storage_broker": "new", "compute": "old", "safekeeper": "new", "pageserver": "new"}, # combination: onnnn + {"storage_controller": "old", "storage_broker": "old", "compute": "new", "safekeeper": "new", "pageserver": "new"}, # combination: nnnoo ) # fmt: on @@ -64,6 +64,8 @@ VERSIONS_COMBINATIONS = ( # If it is not set or set to a value not equal to "false", LFC is enabled by default. USE_LFC = os.environ.get("USE_LFC") != "false" +WITH_SANITIZERS = os.environ.get("SANITIZERS") == "enabled" + def subprocess_capture( capture_dir: Path, @@ -310,62 +312,46 @@ def allure_attach_from_dir(dir: Path, preserve_database_files: bool = False): GRAFANA_URL = "https://neonprod.grafana.net" -GRAFANA_EXPLORE_URL = f"{GRAFANA_URL}/explore" -GRAFANA_TIMELINE_INSPECTOR_DASHBOARD_URL = f"{GRAFANA_URL}/d/8G011dlnk/timeline-inspector" -LOGS_STAGING_DATASOURCE_ID = "xHHYY0dVz" +GRAFANA_DASHBOARD_URL = f"{GRAFANA_URL}/d/cdya0okb81zwga/cross-service-endpoint-debugging" -def allure_add_grafana_links(host: str, timeline_id: TimelineId, start_ms: int, end_ms: int): - """Add links to server logs in Grafana to Allure report""" - links: dict[str, str] = {} - # We expect host to be in format like ep-divine-night-159320.us-east-2.aws.neon.build +def allure_add_grafana_link(host: str, timeline_id: TimelineId, start_ms: int, end_ms: int): + """ + Add a link to the cross-service endpoint debugging dashboard in Grafana to Allure report. + + Args: + host (str): The host string in the format 'ep-..'. + timeline_id (TimelineId): The timeline identifier for the Grafana dashboard. + (currently ignored but may be needed in future verions of the dashboard) + start_ms (int): The start time in milliseconds for the Grafana dashboard. + end_ms (int): The end time in milliseconds for the Grafana dashboard. 
+ + Example: + Given + host = '' + timeline_id = '996926d1f5ddbe7381b8840083f8fc9a' + + The generated link would be something like: + https://neonprod.grafana.net/d/cdya0okb81zwga/cross-service-endpoint-debugging?orgId=1&from=2025-02-17T21:10:00.000Z&to=2025-02-17T21:20:00.000Z&timezone=utc&var-env=dev%7Cstaging&var-input_endpoint_id=ep-holy-mouse-w2u462gi + + """ + # We expect host to be in format like ep-holy-mouse-w2u462gi.us-east-2.aws.neon.build endpoint_id, region_id, _ = host.split(".", 2) - expressions = { - "compute logs": f'{{app="compute-node-{endpoint_id}", neon_region="{region_id}"}}', - "k8s events": f'{{job="integrations/kubernetes/eventhandler"}} |~ "name=compute-node-{endpoint_id}-"', - "console logs": f'{{neon_service="console", neon_region="{region_id}"}} | json | endpoint_id = "{endpoint_id}"', - "proxy logs": f'{{neon_service="proxy-scram", neon_region="{region_id}"}}', + params = { + "orgId": 1, + "from": start_ms, + "to": end_ms, + "timezone": "utc", + "var-env": "dev|staging", + "var-input_endpoint_id": endpoint_id, } - params: dict[str, Any] = { - "datasource": LOGS_STAGING_DATASOURCE_ID, - "queries": [ - { - "expr": "", - "refId": "A", - "datasource": {"type": "loki", "uid": LOGS_STAGING_DATASOURCE_ID}, - "editorMode": "code", - "queryType": "range", - } - ], - "range": { - "from": str(start_ms), - "to": str(end_ms), - }, - } - for name, expr in expressions.items(): - params["queries"][0]["expr"] = expr - query_string = urlencode({"orgId": 1, "left": json.dumps(params)}) - links[name] = f"{GRAFANA_EXPLORE_URL}?{query_string}" + query_string = urlencode(params) + link = f"{GRAFANA_DASHBOARD_URL}?{query_string}" - timeline_qs = urlencode( - { - "orgId": 1, - "var-environment": "victoria-metrics-aws-dev", - "var-timeline_id": timeline_id, - "var-endpoint_id": endpoint_id, - "var-log_datasource": "grafanacloud-neonstaging-logs", - "from": start_ms, - "to": end_ms, - } - ) - link = f"{GRAFANA_TIMELINE_INSPECTOR_DASHBOARD_URL}?{timeline_qs}" - links["Timeline Inspector"] = link - - for name, link in links.items(): - allure.dynamic.link(link, name=name) - log.info(f"{name}: {link}") + allure.dynamic.link(link, name="Cross-Service Endpoint Debugging") + log.info(f"Cross-Service Endpoint Debugging: {link}") def start_in_background( diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index 124e62999a..d49686b57c 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -29,6 +29,8 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder): ".*failed to load metadata.*", ".*load failed.*load local timeline.*", ".*: layer load failed, assuming permanent failure:.*", + ".*failed to get checkpoint bytes.*", + ".*failed to get control bytes.*", ] ) @@ -75,7 +77,7 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder): # (We don't check layer file contents on startup, when loading the timeline) # # This will change when we implement checksums for layers - with pytest.raises(Exception, match="get_values_reconstruct_data for layer ") as err: + with pytest.raises(Exception, match="failed to get checkpoint bytes") as err: pg1.start() log.info( f"As expected, compute startup failed for timeline {tenant1}/{timeline1} with corrupt layers: {err}" diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index f10872590c..c091cd0869 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -236,9 
+236,7 @@ def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder, with_b wait_until(compaction_finished, timeout=60) # ensure gc_compaction is scheduled and it's actually running (instead of skipping due to no layers picked) - env.pageserver.assert_log_contains( - "scheduled_compact_timeline.*picked .* layers for compaction" - ) + env.pageserver.assert_log_contains("gc_compact_timeline.*picked .* layers for compaction") log.info("Validating at workload end ...") workload.validate(env.pageserver.id) @@ -300,6 +298,8 @@ def test_pageserver_gc_compaction_idempotent( workload.churn_rows(row_count, env.pageserver.id) env.create_branch("child_branch") # so that we have a retain_lsn workload.churn_rows(row_count, env.pageserver.id) + env.create_branch("child_branch_2") # so that we have another retain_lsn + workload.churn_rows(row_count, env.pageserver.id) # compact 3 times if mode is before_restart n_compactions = 3 if compaction_mode == "before_restart" else 1 ps_http.timeline_compact( @@ -315,10 +315,6 @@ def test_pageserver_gc_compaction_idempotent( body={ "scheduled": True, "sub_compaction": True, - "compact_key_range": { - "start": "000000000000000000000000000000000000", - "end": "030000000000000000000000000000000000", - }, "sub_compaction_max_job_size_mb": 16, }, ) @@ -336,19 +332,13 @@ def test_pageserver_gc_compaction_idempotent( body={ "scheduled": True, "sub_compaction": True, - "compact_key_range": { - "start": "000000000000000000000000000000000000", - "end": "030000000000000000000000000000000000", - }, "sub_compaction_max_job_size_mb": 16, }, ) wait_until(compaction_finished, timeout=60) # ensure gc_compaction is scheduled and it's actually running (instead of skipping due to no layers picked) - env.pageserver.assert_log_contains( - "scheduled_compact_timeline.*picked .* layers for compaction" - ) + env.pageserver.assert_log_contains("gc_compact_timeline.*picked .* layers for compaction") # ensure we hit the duplicated layer key warning at least once: we did two compactions consecutively, # and the second one should have hit the duplicated layer key warning. 
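Both of these hunks switch the asserted log target from `scheduled_compact_timeline` to `gc_compact_timeline`, and the tests rely on polling (`wait_until`) before checking the log rather than asserting once. A minimal standalone sketch of that retry pattern, using a hypothetical `wait_for_log` helper and an illustrative log path (not the fixtures' actual implementation):

```python
import re
import time
from pathlib import Path


def wait_for_log(log_path: Path, pattern: str, timeout: float = 60.0, interval: float = 0.5) -> str:
    """Poll a log file until some line matches `pattern`; raise after `timeout` seconds."""
    regex = re.compile(pattern)
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if log_path.exists():
            for line in log_path.read_text().splitlines():
                if regex.search(line):
                    return line  # found the expected compaction log line
        time.sleep(interval)
    raise TimeoutError(f"no log line matching {pattern!r} within {timeout}s")


# Mirrors the assertion above (path is illustrative):
# wait_for_log(Path("pageserver.log"), r"gc_compact_timeline.*picked .* layers for compaction")
```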
@@ -466,9 +456,7 @@ def test_pageserver_gc_compaction_interrupt(neon_env_builder: NeonEnvBuilder): wait_until(compaction_finished, timeout=60) # ensure gc_compaction is scheduled and it's actually running (instead of skipping due to no layers picked) - env.pageserver.assert_log_contains( - "scheduled_compact_timeline.*picked .* layers for compaction" - ) + env.pageserver.assert_log_contains("gc_compact_timeline.*picked .* layers for compaction") log.info("Validating at workload end ...") workload.validate(env.pageserver.id) diff --git a/test_runner/regress/test_endpoint_crash.py b/test_runner/regress/test_endpoint_crash.py index 0217cd0d03..03bfd1cb8d 100644 --- a/test_runner/regress/test_endpoint_crash.py +++ b/test_runner/regress/test_endpoint_crash.py @@ -2,6 +2,8 @@ from __future__ import annotations import pytest from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.pg_version import PgVersion +from fixtures.utils import WITH_SANITIZERS, run_only_on_postgres @pytest.mark.parametrize( @@ -23,3 +25,20 @@ def test_endpoint_crash(neon_env_builder: NeonEnvBuilder, sql_func: str): endpoint.safe_psql("CREATE EXTENSION neon_test_utils;") with pytest.raises(Exception, match="This probably means the server terminated abnormally"): endpoint.safe_psql(f"SELECT {sql_func}();") + + +@run_only_on_postgres([PgVersion.V17], "Currently, build with sanitizers is possible with v17 only") +def test_sanitizers(neon_env_builder: NeonEnvBuilder): + """ + Test that undefined behavior leads to endpoint abort with sanitizers enabled + """ + env = neon_env_builder.init_start() + env.create_branch("test_ubsan") + endpoint = env.endpoints.create_start("test_ubsan") + + # Test case based on https://www.postgresql.org/message-id/17167-028026e4ca333817@postgresql.org + if not WITH_SANITIZERS: + endpoint.safe_psql("SELECT 1::int4 << 128") + else: + with pytest.raises(Exception, match="This probably means the server terminated abnormally"): + endpoint.safe_psql("SELECT 1::int4 << 128") diff --git a/test_runner/regress/test_import_pgdata.py b/test_runner/regress/test_import_pgdata.py index ea86eb62eb..71e0d16edd 100644 --- a/test_runner/regress/test_import_pgdata.py +++ b/test_runner/regress/test_import_pgdata.py @@ -1,7 +1,9 @@ +import base64 import json import re import time from enum import Enum +from pathlib import Path import psycopg2 import psycopg2.errors @@ -14,8 +16,12 @@ from fixtures.pageserver.http import ( ImportPgdataIdemptencyKey, PageserverApiException, ) +from fixtures.pg_version import PgVersion from fixtures.port_distributor import PortDistributor -from fixtures.remote_storage import RemoteStorageKind +from fixtures.remote_storage import MockS3Server, RemoteStorageKind +from mypy_boto3_kms import KMSClient +from mypy_boto3_kms.type_defs import EncryptResponseTypeDef +from mypy_boto3_s3 import S3Client from pytest_httpserver import HTTPServer from werkzeug.wrappers.request import Request from werkzeug.wrappers.response import Response @@ -103,13 +109,15 @@ def test_pgdata_import_smoke( while True: relblock_size = vanilla_pg.safe_psql_scalar("select pg_relation_size('t')") log.info( - f"relblock size: {relblock_size/8192} pages (target: {target_relblock_size//8192}) pages" + f"relblock size: {relblock_size / 8192} pages (target: {target_relblock_size // 8192}) pages" ) if relblock_size >= target_relblock_size: break addrows = int((target_relblock_size - relblock_size) // 8192) assert addrows >= 1, "forward progress" - vanilla_pg.safe_psql(f"insert into t select generate_series({nrows+1},
{nrows + addrows})") + vanilla_pg.safe_psql( + f"insert into t select generate_series({nrows + 1}, {nrows + addrows})" + ) nrows += addrows expect_nrows = nrows expect_sum = ( @@ -332,6 +340,224 @@ def test_pgdata_import_smoke( br_initdb_endpoint.safe_psql("select * from othertable") +def test_fast_import_with_pageserver_ingest( + test_output_dir, + vanilla_pg: VanillaPostgres, + port_distributor: PortDistributor, + fast_import: FastImport, + pg_distrib_dir: Path, + pg_version: PgVersion, + mock_s3_server: MockS3Server, + mock_kms: KMSClient, + mock_s3_client: S3Client, + neon_env_builder: NeonEnvBuilder, + make_httpserver: HTTPServer, +): + # Prepare KMS and S3 + key_response = mock_kms.create_key( + Description="Test key", + KeyUsage="ENCRYPT_DECRYPT", + Origin="AWS_KMS", + ) + key_id = key_response["KeyMetadata"]["KeyId"] + + def encrypt(x: str) -> EncryptResponseTypeDef: + return mock_kms.encrypt(KeyId=key_id, Plaintext=x) + + # Start source postgres and ingest data + vanilla_pg.start() + vanilla_pg.safe_psql("CREATE TABLE foo (a int); INSERT INTO foo SELECT generate_series(1, 10);") + + # Setup pageserver and fake cplane for import progress + def handler(request: Request) -> Response: + log.info(f"control plane request: {request.json}") + return Response(json.dumps({}), status=200) + + cplane_mgmt_api_server = make_httpserver + cplane_mgmt_api_server.expect_request(re.compile(".*")).respond_with_handler(handler) + + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3) + env = neon_env_builder.init_start() + + env.pageserver.patch_config_toml_nonrecursive( + { + "import_pgdata_upcall_api": f"http://{cplane_mgmt_api_server.host}:{cplane_mgmt_api_server.port}/path/to/mgmt/api", + # because import_pgdata code uses this endpoint, not the one in common remote storage config + # TODO: maybe use common remote_storage config in pageserver? 
+ "import_pgdata_aws_endpoint_url": env.s3_mock_server.endpoint(), + } + ) + env.pageserver.stop() + env.pageserver.start() + + # Encrypt connstrings and put spec into S3 + source_connstring_encrypted = encrypt(vanilla_pg.connstr()) + spec = { + "encryption_secret": {"KMS": {"key_id": key_id}}, + "source_connstring_ciphertext_base64": base64.b64encode( + source_connstring_encrypted["CiphertextBlob"] + ).decode("utf-8"), + "project_id": "someproject", + "branch_id": "somebranch", + } + + bucket = "test-bucket" + key_prefix = "test-prefix" + mock_s3_client.create_bucket(Bucket=bucket) + mock_s3_client.put_object(Bucket=bucket, Key=f"{key_prefix}/spec.json", Body=json.dumps(spec)) + + # Create timeline with import_pgdata + tenant_id = TenantId.generate() + env.storage_controller.tenant_create(tenant_id) + + timeline_id = TimelineId.generate() + log.info("starting import") + start = time.monotonic() + + idempotency = ImportPgdataIdemptencyKey.random() + log.info(f"idempotency key {idempotency}") + # TODO: teach neon_local CLI about the idempotency & 429 error so we can run inside the loop + # and check for 429 + + import_branch_name = "imported" + env.storage_controller.timeline_create( + tenant_id, + { + "new_timeline_id": str(timeline_id), + "import_pgdata": { + "idempotency_key": str(idempotency), + "location": { + "AwsS3": { + "region": env.s3_mock_server.region(), + "bucket": bucket, + "key": key_prefix, + } + }, + }, + }, + ) + env.neon_cli.mappings_map_branch(import_branch_name, tenant_id, timeline_id) + + # Run fast_import + if fast_import.extra_env is None: + fast_import.extra_env = {} + fast_import.extra_env["AWS_ACCESS_KEY_ID"] = mock_s3_server.access_key() + fast_import.extra_env["AWS_SECRET_ACCESS_KEY"] = mock_s3_server.secret_key() + fast_import.extra_env["AWS_SESSION_TOKEN"] = mock_s3_server.session_token() + fast_import.extra_env["AWS_REGION"] = mock_s3_server.region() + fast_import.extra_env["AWS_ENDPOINT_URL"] = mock_s3_server.endpoint() + fast_import.extra_env["RUST_LOG"] = "aws_config=debug,aws_sdk_kms=debug" + pg_port = port_distributor.get_port() + fast_import.run_pgdata(pg_port=pg_port, s3prefix=f"s3://{bucket}/{key_prefix}") + vanilla_pg.stop() + + def validate_vanilla_equivalence(ep): + res = ep.safe_psql("SELECT count(*), sum(a) FROM foo;", dbname="neondb") + assert res[0] == (10, 55), f"got result: {res}" + + # Sanity check that data in pgdata is expected: + pgbin = PgBin(test_output_dir, fast_import.pg_distrib_dir, fast_import.pg_version) + with VanillaPostgres( + fast_import.workdir / "pgdata", pgbin, pg_port, False + ) as new_pgdata_vanilla_pg: + new_pgdata_vanilla_pg.start() + + # database name and user are hardcoded in fast_import binary, and they are different from normal vanilla postgres + conn = PgProtocol(dsn=f"postgresql://cloud_admin@localhost:{pg_port}/neondb") + validate_vanilla_equivalence(conn) + + # Poll pageserver statuses in s3 + while True: + locations = env.storage_controller.locate(tenant_id) + active_count = 0 + for location in locations: + shard_id = TenantShardId.parse(location["shard_id"]) + ps = env.get_pageserver(location["node_id"]) + try: + detail = ps.http_client().timeline_detail(shard_id, timeline_id) + log.info(f"timeline {tenant_id}/{timeline_id} detail: {detail}") + state = detail["state"] + log.info(f"shard {shard_id} state: {state}") + if state == "Active": + active_count += 1 + except PageserverApiException as e: + if e.status_code == 404: + log.info("not found, import is in progress") + continue + elif e.status_code == 429: + 
log.info("import is in progress") + continue + else: + raise + + if state == "Active": + key = f"{key_prefix}/status/shard-{shard_id.shard_index}" + shard_status_file_contents = ( + mock_s3_client.get_object(Bucket=bucket, Key=key)["Body"].read().decode("utf-8") + ) + shard_status = json.loads(shard_status_file_contents) + assert shard_status["done"] is True + + if active_count == len(locations): + log.info("all shards are active") + break + time.sleep(0.5) + + import_duration = time.monotonic() - start + log.info(f"import complete; duration={import_duration:.2f}s") + + ep = env.endpoints.create_start(branch_name=import_branch_name, tenant_id=tenant_id) + + # check that data is there + validate_vanilla_equivalence(ep) + + # check that we can do basic ops + + ep.safe_psql("create table othertable(values text)", dbname="neondb") + rw_lsn = Lsn(ep.safe_psql_scalar("select pg_current_wal_flush_lsn()")) + ep.stop() + + # ... at the tip + _ = env.create_branch( + new_branch_name="br-tip", + ancestor_branch_name=import_branch_name, + tenant_id=tenant_id, + ancestor_start_lsn=rw_lsn, + ) + br_tip_endpoint = env.endpoints.create_start( + branch_name="br-tip", endpoint_id="br-tip-ro", tenant_id=tenant_id + ) + validate_vanilla_equivalence(br_tip_endpoint) + br_tip_endpoint.safe_psql("select * from othertable", dbname="neondb") + br_tip_endpoint.stop() + + # ... at the initdb lsn + locations = env.storage_controller.locate(tenant_id) + [shard_zero] = [ + loc for loc in locations if TenantShardId.parse(loc["shard_id"]).shard_number == 0 + ] + shard_zero_ps = env.get_pageserver(shard_zero["node_id"]) + shard_zero_timeline_info = shard_zero_ps.http_client().timeline_detail( + shard_zero["shard_id"], timeline_id + ) + initdb_lsn = Lsn(shard_zero_timeline_info["initdb_lsn"]) + _ = env.create_branch( + new_branch_name="br-initdb", + ancestor_branch_name=import_branch_name, + tenant_id=tenant_id, + ancestor_start_lsn=initdb_lsn, + ) + br_initdb_endpoint = env.endpoints.create_start( + branch_name="br-initdb", endpoint_id="br-initdb-ro", tenant_id=tenant_id + ) + validate_vanilla_equivalence(br_initdb_endpoint) + with pytest.raises(psycopg2.errors.UndefinedTable): + br_initdb_endpoint.safe_psql("select * from othertable", dbname="neondb") + br_initdb_endpoint.stop() + + env.pageserver.stop(immediate=True) + + def test_fast_import_binary( test_output_dir, vanilla_pg: VanillaPostgres, @@ -342,7 +568,7 @@ def test_fast_import_binary( vanilla_pg.safe_psql("CREATE TABLE foo (a int); INSERT INTO foo SELECT generate_series(1, 10);") pg_port = port_distributor.get_port() - fast_import.run(pg_port, vanilla_pg.connstr()) + fast_import.run_pgdata(pg_port=pg_port, source_connection_string=vanilla_pg.connstr()) vanilla_pg.stop() pgbin = PgBin(test_output_dir, fast_import.pg_distrib_dir, fast_import.pg_version) @@ -358,6 +584,118 @@ def test_fast_import_binary( assert res[0][0] == 10 +def test_fast_import_restore_to_connstring( + test_output_dir, + vanilla_pg: VanillaPostgres, + port_distributor: PortDistributor, + fast_import: FastImport, + pg_distrib_dir: Path, + pg_version: PgVersion, +): + vanilla_pg.start() + vanilla_pg.safe_psql("CREATE TABLE foo (a int); INSERT INTO foo SELECT generate_series(1, 10);") + + pgdatadir = test_output_dir / "destination-pgdata" + pg_bin = PgBin(test_output_dir, pg_distrib_dir, pg_version) + port = port_distributor.get_port() + with VanillaPostgres(pgdatadir, pg_bin, port) as destination_vanilla_pg: + destination_vanilla_pg.configure(["shared_preload_libraries='neon_rmgr'"]) + 
destination_vanilla_pg.start() + + # create another database & role and try to restore there + destination_vanilla_pg.safe_psql(""" + CREATE ROLE testrole WITH + LOGIN + PASSWORD 'testpassword' + NOSUPERUSER + NOCREATEDB + NOCREATEROLE; + """) + destination_vanilla_pg.safe_psql("CREATE DATABASE testdb OWNER testrole;") + + destination_connstring = destination_vanilla_pg.connstr( + dbname="testdb", user="testrole", password="testpassword" + ) + fast_import.run_dump_restore( + source_connection_string=vanilla_pg.connstr(), + destination_connection_string=destination_connstring, + ) + vanilla_pg.stop() + conn = PgProtocol(dsn=destination_connstring) + res = conn.safe_psql("SELECT count(*) FROM foo;") + log.info(f"Result: {res}") + assert res[0][0] == 10 + + +def test_fast_import_restore_to_connstring_from_s3_spec( + test_output_dir, + vanilla_pg: VanillaPostgres, + port_distributor: PortDistributor, + fast_import: FastImport, + pg_distrib_dir: Path, + pg_version: PgVersion, + mock_s3_server: MockS3Server, + mock_kms: KMSClient, + mock_s3_client: S3Client, +): + # Prepare KMS and S3 + key_response = mock_kms.create_key( + Description="Test key", + KeyUsage="ENCRYPT_DECRYPT", + Origin="AWS_KMS", + ) + key_id = key_response["KeyMetadata"]["KeyId"] + + def encrypt(x: str) -> EncryptResponseTypeDef: + return mock_kms.encrypt(KeyId=key_id, Plaintext=x) + + # Start source postgres and ingest data + vanilla_pg.start() + vanilla_pg.safe_psql("CREATE TABLE foo (a int); INSERT INTO foo SELECT generate_series(1, 10);") + + # Start target postgres + pgdatadir = test_output_dir / "destination-pgdata" + pg_bin = PgBin(test_output_dir, pg_distrib_dir, pg_version) + port = port_distributor.get_port() + with VanillaPostgres(pgdatadir, pg_bin, port) as destination_vanilla_pg: + destination_vanilla_pg.configure(["shared_preload_libraries='neon_rmgr'"]) + destination_vanilla_pg.start() + + # Encrypt connstrings and put spec into S3 + source_connstring_encrypted = encrypt(vanilla_pg.connstr()) + destination_connstring_encrypted = encrypt(destination_vanilla_pg.connstr()) + spec = { + "encryption_secret": {"KMS": {"key_id": key_id}}, + "source_connstring_ciphertext_base64": base64.b64encode( + source_connstring_encrypted["CiphertextBlob"] + ).decode("utf-8"), + "destination_connstring_ciphertext_base64": base64.b64encode( + destination_connstring_encrypted["CiphertextBlob"] + ).decode("utf-8"), + } + + mock_s3_client.create_bucket(Bucket="test-bucket") + mock_s3_client.put_object( + Bucket="test-bucket", Key="test-prefix/spec.json", Body=json.dumps(spec) + ) + + # Run fast_import + if fast_import.extra_env is None: + fast_import.extra_env = {} + fast_import.extra_env["AWS_ACCESS_KEY_ID"] = mock_s3_server.access_key() + fast_import.extra_env["AWS_SECRET_ACCESS_KEY"] = mock_s3_server.secret_key() + fast_import.extra_env["AWS_SESSION_TOKEN"] = mock_s3_server.session_token() + fast_import.extra_env["AWS_REGION"] = mock_s3_server.region() + fast_import.extra_env["AWS_ENDPOINT_URL"] = mock_s3_server.endpoint() + fast_import.extra_env["RUST_LOG"] = "aws_config=debug,aws_sdk_kms=debug" + fast_import.run_dump_restore(s3prefix="s3://test-bucket/test-prefix") + vanilla_pg.stop() + + res = destination_vanilla_pg.safe_psql("SELECT count(*) FROM foo;") + log.info(f"Result: {res}") + assert res[0][0] == 10 + + # TODO: Maybe test with pageserver? # 1. run whole neon env # 2. create timeline with some s3 path??? 
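For reference, the spec.json round trip these import tests depend on can be sketched in isolation: encrypt the connection string with KMS, base64-wrap the ciphertext into the spec, upload it to S3, then fetch and decrypt it again, which is conceptually what fast_import does on the other side. A minimal sketch, assuming a moto-style mock endpoint; the endpoint URL, bucket, prefix, and connstring below are placeholders, not values from the diff:

```python
import base64
import json

import boto3

# Assumed mock endpoint and region, mirroring the mock_s3_server fixture above.
endpoint = "http://localhost:9000"
kms = boto3.client("kms", endpoint_url=endpoint, region_name="us-east-1")
s3 = boto3.client("s3", endpoint_url=endpoint, region_name="us-east-1")

key_id = kms.create_key(Description="Test key")["KeyMetadata"]["KeyId"]

# Encrypt the source connstring and embed it in the spec, as the tests do.
ciphertext = kms.encrypt(KeyId=key_id, Plaintext="postgres://source")["CiphertextBlob"]
spec = {
    "encryption_secret": {"KMS": {"key_id": key_id}},
    "source_connstring_ciphertext_base64": base64.b64encode(ciphertext).decode("utf-8"),
}

s3.create_bucket(Bucket="test-bucket")
s3.put_object(Bucket="test-bucket", Key="test-prefix/spec.json", Body=json.dumps(spec))

# What the importer conceptually does on its side: fetch the spec and decrypt.
fetched = json.loads(
    s3.get_object(Bucket="test-bucket", Key="test-prefix/spec.json")["Body"].read()
)
blob = base64.b64decode(fetched["source_connstring_ciphertext_base64"])
connstr = kms.decrypt(CiphertextBlob=blob)["Plaintext"].decode("utf-8")
assert connstr == "postgres://source"
```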
diff --git a/test_runner/regress/test_lfc_resize.py b/test_runner/regress/test_lfc_resize.py index 8762e6525b..ea7d38a3d9 100644 --- a/test_runner/regress/test_lfc_resize.py +++ b/test_runner/regress/test_lfc_resize.py @@ -72,6 +72,11 @@ def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin): thread.join() + # Fill LFC: seqscan should fetch the whole table in cache. + # It is needed for further correct evaluation of LFC file size + # (a sparse chunk of LFC takes less than 1 MB on disk). + cur.execute("select sum(abalance) from pgbench_accounts") + # Before shrinking the cache, check that it really is large now (lfc_file_size, lfc_file_blocks) = get_lfc_size() assert int(lfc_file_blocks) > 128 * 1024 diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 8a91a255d8..602d493ae6 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -903,6 +903,9 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder): remote_storage_kind=RemoteStorageKind.MOCK_S3, ) + tenant_conf = TENANT_CONF.copy() + tenant_conf["heatmap_period"] = "0s" + env = neon_env_builder.init_configs() env.start() @@ -910,7 +913,7 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder): tenant_id = TenantId.generate() timeline_id = TimelineId.generate() - env.create_tenant(tenant_id, timeline_id, conf=TENANT_CONF, placement_policy='{"Attached":1}') + env.create_tenant(tenant_id, timeline_id, conf=tenant_conf, placement_policy='{"Attached":1}') env.storage_controller.reconcile_until_idle() @@ -974,12 +977,22 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder): # The new layer map should contain all the layers in the pre-migration one # and a new in memory layer - assert len(heatmap_before_migration["timelines"][0]["layers"]) + 1 == len( - heatmap_after_migration["timelines"][0]["layers"] + after_migration_heatmap_layers_count = len(heatmap_after_migration["timelines"][0]["layers"]) + assert ( + len(heatmap_before_migration["timelines"][0]["layers"]) + 1 + == after_migration_heatmap_layers_count ) - log.info( - f'Heatmap size after cold migration is {len(heatmap_after_migration["timelines"][0]["layers"])}' + log.info(f"Heatmap size after cold migration is {after_migration_heatmap_layers_count}") + + env.storage_controller.download_heatmap_layers( + TenantShardId(tenant_id, shard_number=0, shard_count=0), timeline_id ) - # TODO: Once we have an endpoint for rescuing the cold location, exercise it here. 
+ def all_layers_downloaded(): + local_layers_count = len(ps_secondary.list_layers(tenant_id, timeline_id)) + + log.info(f"{local_layers_count=} {after_migration_heatmap_layers_count=}") + assert local_layers_count == after_migration_heatmap_layers_count + + wait_until(all_layers_downloaded) diff --git a/test_runner/regress/test_pgstat.py b/test_runner/regress/test_pgstat.py new file mode 100644 index 0000000000..c31e5ef7f8 --- /dev/null +++ b/test_runner/regress/test_pgstat.py @@ -0,0 +1,83 @@ +import pytest +from fixtures.neon_fixtures import NeonEnv +from fixtures.pg_version import PgVersion + + +# +# Test that pgstat statistics are preserved across sessions +# +def test_pgstat(neon_simple_env: NeonEnv): + env = neon_simple_env + if env.pg_version == PgVersion.V14: + pytest.skip("PG14 doesn't support pgstat statistics persistence") + + n = 10000 + endpoint = env.endpoints.create_start( + "main", config_lines=["neon_pgstat_file_size_limit=100kB", "autovacuum=off"] + ) + + con = endpoint.connect() + cur = con.cursor() + + cur.execute("create table t(x integer)") + cur.execute(f"insert into t values (generate_series(1,{n}))") + cur.execute("vacuum analyze t") + cur.execute("select sum(x) from t") + cur.execute("update t set x=x+1") + + cur.execute("select pg_stat_force_next_flush()") + + cur.execute( + "select seq_scan,seq_tup_read,n_tup_ins,n_tup_upd,n_live_tup,n_dead_tup, vacuum_count,analyze_count from pg_stat_user_tables" + ) + rec = cur.fetchall()[0] + assert rec == (2, n * 2, n, n, n * 2, n, 1, 1) + + endpoint.stop() + endpoint.start() + + con = endpoint.connect() + cur = con.cursor() + + cur.execute( + "select seq_scan,seq_tup_read,n_tup_ins,n_tup_upd,n_live_tup,n_dead_tup, vacuum_count,analyze_count from pg_stat_user_tables" + ) + rec = cur.fetchall()[0] + assert rec == (2, n * 2, n, n, n * 2, n, 1, 1) + + cur.execute("update t set x=x+1") + + # stop without checkpoint + endpoint.stop(mode="immediate") + endpoint.start() + + con = endpoint.connect() + cur = con.cursor() + + cur.execute( + "select seq_scan,seq_tup_read,n_tup_ins,n_tup_upd,n_live_tup,n_dead_tup, vacuum_count,analyze_count from pg_stat_user_tables" + ) + rec = cur.fetchall()[0] + # pgstat information should be discarded in case of abnormal termination + assert rec == (0, 0, 0, 0, 0, 0, 0, 0) + + cur.execute("select sum(x) from t") + + # create more relations to increase size of statistics + for i in range(1, 1000): + cur.execute(f"create table t{i}(pk integer primary key)") + + cur.execute("select pg_stat_force_next_flush()") + + endpoint.stop() + endpoint.start() + + con = endpoint.connect() + cur = con.cursor() + + cur.execute( + "select seq_scan,seq_tup_read,n_tup_ins,n_tup_upd,n_live_tup,n_dead_tup, vacuum_count,analyze_count from pg_stat_user_tables" + ) + rec = cur.fetchall()[0] + # pgstat information is not restored because its size exceeds the 100kB threshold + assert rec == (0, 0, 0, 0, 0, 0, 0, 0) diff --git a/test_runner/regress/test_relations.py b/test_runner/regress/test_relations.py new file mode 100644 index 0000000000..3e29c92a96 --- /dev/null +++ b/test_runner/regress/test_relations.py @@ -0,0 +1,68 @@ +from __future__ import annotations + +from fixtures.neon_fixtures import ( + NeonEnvBuilder, +) + + +def test_pageserver_reldir_v2( + neon_env_builder: NeonEnvBuilder, +): + env = neon_env_builder.init_start( + initial_tenant_conf={ + "rel_size_v2_enabled": "false", + } + ) + + endpoint = env.endpoints.create_start("main") + # Create a relation in v1 + endpoint.safe_psql("CREATE TABLE foo1 (id INTEGER PRIMARY
KEY, val text)") + endpoint.safe_psql("CREATE TABLE foo2 (id INTEGER PRIMARY KEY, val text)") + + # Switch to v2 + env.pageserver.http_client().update_tenant_config( + env.initial_tenant, + { + "rel_size_v2_enabled": True, + }, + ) + + # Check if both relations are still accessible + endpoint.safe_psql("SELECT * FROM foo1") + endpoint.safe_psql("SELECT * FROM foo2") + + # Restart the endpoint + endpoint.stop() + endpoint.start() + + # Check if both relations are still accessible again after restart + endpoint.safe_psql("SELECT * FROM foo1") + endpoint.safe_psql("SELECT * FROM foo2") + + # Create a relation in v2 + endpoint.safe_psql("CREATE TABLE foo3 (id INTEGER PRIMARY KEY, val text)") + # Delete a relation in v1 + endpoint.safe_psql("DROP TABLE foo1") + + # Check if both relations are still accessible + endpoint.safe_psql("SELECT * FROM foo2") + endpoint.safe_psql("SELECT * FROM foo3") + + # Restart the endpoint + endpoint.stop() + # This will acquire a basebackup, which lists all relations. + endpoint.start() + + # Check if both relations are still accessible + endpoint.safe_psql("DROP TABLE IF EXISTS foo1") + endpoint.safe_psql("SELECT * FROM foo2") + endpoint.safe_psql("SELECT * FROM foo3") + + endpoint.safe_psql("DROP TABLE foo3") + endpoint.stop() + endpoint.start() + + # Check if relations are still accessible + endpoint.safe_psql("DROP TABLE IF EXISTS foo1") + endpoint.safe_psql("SELECT * FROM foo2") + endpoint.safe_psql("DROP TABLE IF EXISTS foo3") diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 88d30308f7..d18cbb3393 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -3238,12 +3238,17 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder): newest_info = target.get_safekeeper(inserted["id"]) assert newest_info assert newest_info["scheduling_policy"] == "Pause" - target.safekeeper_scheduling_policy(inserted["id"], "Decomissioned") + target.safekeeper_scheduling_policy(inserted["id"], "Active") newest_info = target.get_safekeeper(inserted["id"]) assert newest_info - assert newest_info["scheduling_policy"] == "Decomissioned" + assert newest_info["scheduling_policy"] == "Active" # Ensure idempotency - target.safekeeper_scheduling_policy(inserted["id"], "Decomissioned") + target.safekeeper_scheduling_policy(inserted["id"], "Active") + newest_info = target.get_safekeeper(inserted["id"]) + assert newest_info + assert newest_info["scheduling_policy"] == "Active" + # change back to paused again + target.safekeeper_scheduling_policy(inserted["id"], "Pause") def storcon_heartbeat(): assert env.storage_controller.log_contains( @@ -3252,6 +3257,9 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder): wait_until(storcon_heartbeat) + # Now decomission it + target.safekeeper_scheduling_policy(inserted["id"], "Decomissioned") + def eq_safekeeper_records(a: dict[str, Any], b: dict[str, Any]) -> bool: compared = [dict(a), dict(b)] @@ -3756,3 +3764,96 @@ def test_storage_controller_node_flap_detach_race( assert len(locs) == 1, f"{shard} has {len(locs)} attached locations" wait_until(validate_locations, timeout=10) + + +def test_update_node_on_registration(neon_env_builder: NeonEnvBuilder): + """ + Check that storage controller handles node_register requests with updated fields correctly. + 1. Run storage controller and register 1 pageserver without https port. + 2. Register the same pageserver with https port. 
Check that port has been updated. + 3. Restart the storage controller. Check that https port is persistent. + 4. Register the same pageserver without https port again (rollback). Check that port has been removed. + """ + neon_env_builder.num_pageservers = 1 + env = neon_env_builder.init_configs() + + env.storage_controller.start() + env.storage_controller.wait_until_ready() + + pageserver = env.pageservers[0] + + # Step 1. Register pageserver without https port. + env.storage_controller.node_register(pageserver) + env.storage_controller.consistency_check() + + nodes = env.storage_controller.node_list() + assert len(nodes) == 1 + assert nodes[0]["listen_https_port"] is None + + # Step 2. Register pageserver with https port. + pageserver.service_port.https = 1234 + env.storage_controller.node_register(pageserver) + env.storage_controller.consistency_check() + + nodes = env.storage_controller.node_list() + assert len(nodes) == 1 + assert nodes[0]["listen_https_port"] == 1234 + + # Step 3. Restart storage controller. + env.storage_controller.stop() + env.storage_controller.start() + env.storage_controller.wait_until_ready() + env.storage_controller.consistency_check() + + nodes = env.storage_controller.node_list() + assert len(nodes) == 1 + assert nodes[0]["listen_https_port"] == 1234 + + # Step 4. Register pageserver with no https port again. + pageserver.service_port.https = None + env.storage_controller.node_register(pageserver) + env.storage_controller.consistency_check() + + nodes = env.storage_controller.node_list() + assert len(nodes) == 1 + assert nodes[0]["listen_https_port"] is None + + +def test_storage_controller_location_conf_equivalence(neon_env_builder: NeonEnvBuilder): + """ + Validate that a storage controller restart with no shards in a transient state + performs zero reconciliations at start-up. Implicitly, this means that the location + configs returned by the pageserver are identical to the persisted state in the + storage controller database. 
+ """ + neon_env_builder.num_pageservers = 1 + neon_env_builder.storage_controller_config = { + "start_as_candidate": False, + } + + env = neon_env_builder.init_configs() + env.start() + + tenant_id = TenantId.generate() + env.storage_controller.tenant_create( + tenant_id, shard_count=2, tenant_config={"pitr_interval": "1h2m3s"} + ) + + env.storage_controller.reconcile_until_idle() + + reconciles_before_restart = env.storage_controller.get_metric_value( + "storage_controller_reconcile_complete_total", filter={"status": "ok"} + ) + + assert reconciles_before_restart != 0 + + env.storage_controller.stop() + env.storage_controller.start() + + env.storage_controller.reconcile_until_idle() + + reconciles_after_restart = env.storage_controller.get_metric_value( + "storage_controller_reconcile_complete_total", filter={"status": "ok"} + ) + + assert reconciles_after_restart == 0 diff --git a/test_runner/regress/test_subscriber_branching.py b/test_runner/regress/test_subscriber_branching.py index 849d4f024d..6175643389 100644 --- a/test_runner/regress/test_subscriber_branching.py +++ b/test_runner/regress/test_subscriber_branching.py @@ -1,9 +1,10 @@ from __future__ import annotations +import threading import time from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv +from fixtures.neon_fixtures import NeonEnv, logical_replication_sync from fixtures.utils import query_scalar, wait_until @@ -239,3 +240,173 @@ def test_subscriber_branching(neon_simple_env: NeonEnv): res = scur_postgres.fetchall() assert len(res) == 1 assert str(sub_child_2_timeline_id) == res[0][0] + + +def test_multiple_subscription_branching(neon_simple_env: NeonEnv): + """ + Test that compute_ctl can handle concurrent deletion of subscriptions in a multiple databases + """ + env = neon_simple_env + + NUMBER_OF_DBS = 5 + + # Create and start endpoint so that neon_local put all the generated + # stuff into the spec.json file. + endpoint = env.endpoints.create_start( + "main", + config_lines=[ + "max_replication_slots = 10", + "max_logical_replication_workers=10", + "max_worker_processes=10", + ], + ) + + TEST_DB_NAMES = [ + { + "name": "neondb", + "owner": "cloud_admin", + }, + { + "name": "publisher_db", + "owner": "cloud_admin", + }, + ] + + for i in range(NUMBER_OF_DBS): + TEST_DB_NAMES.append( + { + "name": f"db{i}", + "owner": "cloud_admin", + } + ) + + # Update the spec.json file to create the databases + # and reconfigure the endpoint to apply the changes. 
+ endpoint.respec_deep( + **{ + "skip_pg_catalog_updates": False, + "cluster": { + "databases": TEST_DB_NAMES, + }, + } + ) + endpoint.reconfigure() + + connstr = endpoint.connstr(dbname="publisher_db").replace("'", "''") + + # create table, replication and subscription for each of the databases + with endpoint.cursor(dbname="publisher_db") as publisher_cursor: + for i in range(NUMBER_OF_DBS): + publisher_cursor.execute(f"CREATE TABLE t{i}(a int)") + publisher_cursor.execute(f"CREATE PUBLICATION mypub{i} FOR TABLE t{i}") + publisher_cursor.execute( + f"select pg_catalog.pg_create_logical_replication_slot('mysub{i}', 'pgoutput');" + ) + publisher_cursor.execute(f"INSERT INTO t{i} VALUES ({i})") + + with endpoint.cursor(dbname=f"db{i}") as cursor: + cursor.execute(f"CREATE TABLE t{i}(a int)") + cursor.execute( + f"CREATE SUBSCRIPTION mysub{i} CONNECTION '{connstr}' PUBLICATION mypub{i} WITH (create_slot = false) " + ) + + # wait for the subscription to be active + for i in range(NUMBER_OF_DBS): + logical_replication_sync( + endpoint, + endpoint, + f"mysub{i}", + sub_dbname=f"db{i}", + pub_dbname="publisher_db", + ) + + # Check that replication is working + for i in range(NUMBER_OF_DBS): + with endpoint.cursor(dbname=f"db{i}") as cursor: + cursor.execute(f"SELECT * FROM t{i}") + rows = cursor.fetchall() + assert len(rows) == 1 + assert rows[0][0] == i + + last_insert_lsn = query_scalar(cursor, "select pg_current_wal_insert_lsn();") + + def start_publisher_workload(table_num: int, duration: int): + start = time.time() + with endpoint.cursor(dbname="publisher_db") as cur: + while time.time() - start < duration: + cur.execute(f"INSERT INTO t{table_num} SELECT FROM generate_series(1,1000)") + + LOAD_DURATION = 5 + threads = [ + threading.Thread(target=start_publisher_workload, args=(i, LOAD_DURATION)) + for i in range(NUMBER_OF_DBS) + ] + + for thread in threads: + thread.start() + + sub_child_1_timeline_id = env.create_branch( + "subscriber_child_1", + ancestor_branch_name="main", + ancestor_start_lsn=last_insert_lsn, + ) + + sub_child_1 = env.endpoints.create("subscriber_child_1") + + sub_child_1.respec( + skip_pg_catalog_updates=False, + reconfigure_concurrency=5, + drop_subscriptions_before_start=True, + cluster={ + "databases": TEST_DB_NAMES, + "roles": [], + }, + ) + + sub_child_1.start() + + # ensure that subscription deletion happened on this timeline + with sub_child_1.cursor() as scur_postgres: + scur_postgres.execute("SELECT timeline_id from neon.drop_subscriptions_done") + res = scur_postgres.fetchall() + log.info(f"res = {res}") + assert len(res) == 1 + assert str(sub_child_1_timeline_id) == res[0][0] + + # ensure that there are no subscriptions in the databases + for i in range(NUMBER_OF_DBS): + with sub_child_1.cursor(dbname=f"db{i}") as cursor: + cursor.execute("SELECT * FROM pg_catalog.pg_subscription") + res = cursor.fetchall() + assert len(res) == 0 + + # ensure that there are no unexpected rows in the tables + cursor.execute(f"SELECT * FROM t{i}") + rows = cursor.fetchall() + assert len(rows) == 1 + assert rows[0][0] == i + + for thread in threads: + thread.join() + + # ensure that logical replication is still working in main endpoint + # wait for it to catch up + for i in range(NUMBER_OF_DBS): + logical_replication_sync( + endpoint, + endpoint, + f"mysub{i}", + sub_dbname=f"db{i}", + pub_dbname="publisher_db", + ) + + # verify that the data is the same in publisher and subscriber tables + with endpoint.cursor(dbname="publisher_db") as publisher_cursor: + for i in
range(NUMBER_OF_DBS): + with endpoint.cursor(dbname=f"db{i}") as cursor: + publisher_cursor.execute(f"SELECT count(*) FROM t{i}") + cursor.execute(f"SELECT count(*) FROM t{i}") + pub_res = publisher_cursor.fetchone() + sub_res = cursor.fetchone() + log.info(f"for table t{i}: pub_res = {pub_res}, sub_res = {sub_res}") + assert pub_res == sub_res diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index b4c968b217..afe444f227 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -481,7 +481,8 @@ def test_pageserver_metrics_many_relations(neon_env_builder: NeonEnvBuilder): counts = timeline_detail["directory_entries_counts"] assert counts log.info(f"directory counts: {counts}") - assert counts[2] > COUNT_AT_LEAST_EXPECTED + # We need to add up reldir v1 + v2 counts + assert counts[2] + counts[7] > COUNT_AT_LEAST_EXPECTED def test_timelines_parallel_endpoints(neon_simple_env: NeonEnv): diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py index 2706ddf2f0..c17840d31c 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -823,6 +823,8 @@ def test_timeline_retain_lsn( [ ".*initial size calculation failed: PageRead.MissingKey.could not find data for key.*", ".*page_service_conn_main.*could not find data for key.*", + ".*failed to get checkpoint bytes.*", + ".*failed to get control bytes.*", ] ) if offload_child is None or "no-restart" not in offload_child: diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 21b2ad479c..c5045fe4a4 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -1445,6 +1445,7 @@ def test_peer_recovery(neon_env_builder: NeonEnvBuilder): # roughly fills one segment endpoint.safe_psql("insert into t select generate_series(1,250000), 'payload'") + lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) endpoint.stop() # stop compute @@ -1473,7 +1474,15 @@ def test_peer_recovery(neon_env_builder: NeonEnvBuilder): "flush_lsn to get aligned", ) - cmp_sk_wal([sk1, sk2], tenant_id, timeline_id) + sk1_digest = sk1.http_client().timeline_digest( + tenant_id, timeline_id, sk1.get_timeline_start_lsn(tenant_id, timeline_id), lsn + ) + + sk2_digest = sk2.http_client().timeline_digest( + tenant_id, timeline_id, sk2.get_timeline_start_lsn(tenant_id, timeline_id), lsn + ) + + assert sk1_digest == sk2_digest # stop one of safekeepers which weren't recovering and insert a bit more to check we can commit env.safekeepers[2].stop() diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 62a86dfc91..6254ab9b44 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 62a86dfc91e0c35a72f2ea5e99e6969b830c0c26 +Subproject commit 6254ab9b4496c3e481bc037ae69d859bbc2bdd7d diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 80ed91ce25..6ff5044377 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 80ed91ce255c765d25be0bb4a02c942fe6311fbf +Subproject commit 6ff50443773b69749e16da6db9d4f4b19064b4b7 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 999cf81b10..261ed10e9b 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 999cf81b101ead40e597d5cd729458d8200f4537 +Subproject commit 261ed10e9b8c8dda01ad7aefb18e944e30aa161d diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index
4d3a722312..59b2fe851f 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit 4d3a722312b496ff7378156caa6d41c2e70c30e4 +Subproject commit 59b2fe851f8e0595f6c830b90ee766f4f1c17a0f diff --git a/vendor/revisions.json b/vendor/revisions.json index 888f09124e..f85cec3a0b 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,18 +1,18 @@ { "v17": [ - "17.3", - "4d3a722312b496ff7378156caa6d41c2e70c30e4" + "17.4", + "59b2fe851f8e0595f6c830b90ee766f4f1c17a0f" ], "v16": [ - "16.7", - "999cf81b101ead40e597d5cd729458d8200f4537" + "16.8", + "261ed10e9b8c8dda01ad7aefb18e944e30aa161d" ], "v15": [ - "15.11", - "80ed91ce255c765d25be0bb4a02c942fe6311fbf" + "15.12", + "6ff50443773b69749e16da6db9d4f4b19064b4b7" ], "v14": [ - "14.16", - "62a86dfc91e0c35a72f2ea5e99e6969b830c0c26" + "14.17", + "6254ab9b4496c3e481bc037ae69d859bbc2bdd7d" ] }
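The vendor bumps above have to advance in two places at once: the submodule pointer and the matching entry in vendor/revisions.json, which pins both the upstream Postgres minor version and the exact submodule commit. A small local consistency check (a sketch, assuming it runs from the repo root with git on PATH; not part of any tooling shown in this diff) could verify they agree:

```python
import json
import subprocess
from pathlib import Path

revisions = json.loads(Path("vendor/revisions.json").read_text())

for version, (pg_version, expected_sha) in revisions.items():
    # Resolve the commit each vendor submodule currently points at.
    actual_sha = subprocess.check_output(
        ["git", "rev-parse", "HEAD"], cwd=f"vendor/postgres-{version}", text=True
    ).strip()
    status = "ok" if actual_sha == expected_sha else "MISMATCH"
    print(f"{version} ({pg_version}): {status} ({actual_sha})")
```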