diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index 3a6fbf4234..3740e6dc9c 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -20,9 +20,14 @@ on: required: true type: string test-cfg: - description: 'a json object of postgres versions and lfc/sanitizers states to build and run regression tests on' + description: 'a json object of postgres versions and lfc states to run regression tests on' required: true type: string + sanitizers: + description: 'enabled or disabled' + required: false + default: 'disabled' + type: string defaults: run: @@ -48,8 +53,6 @@ jobs: # io_uring will account the memory of the CQ and SQ as locked. # More details: https://github.com/neondatabase/neon/issues/6373#issuecomment-1905814391 options: --init --shm-size=512mb --ulimit memlock=67108864:67108864 - strategy: - matrix: ${{ fromJSON(format('{{"include":{0}}}', inputs.test-cfg)) }} env: BUILD_TYPE: ${{ inputs.build-type }} GIT_VERSION: ${{ github.event.pull_request.head.sha || github.sha }} @@ -89,7 +92,7 @@ jobs: - name: Set env variables env: ARCH: ${{ inputs.arch }} - SANITIZERS: ${{ matrix.sanitizers }} + SANITIZERS: ${{ inputs.sanitizers }} run: | CARGO_FEATURES="--features testing" if [[ $BUILD_TYPE == "debug" && $ARCH == 'x64' ]]; then @@ -167,7 +170,7 @@ jobs: - name: Run cargo build env: - WITH_TESTS: ${{ matrix.sanitizers != 'enabled' && '--tests' || '' }} + WITH_TESTS: ${{ inputs.sanitizers != 'enabled' && '--tests' || '' }} run: | export ASAN_OPTIONS=detect_leaks=0 ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins ${WITH_TESTS} @@ -177,7 +180,7 @@ jobs: - name: Install rust binaries env: ARCH: ${{ inputs.arch }} - SANITIZERS: ${{ matrix.sanitizers }} + SANITIZERS: ${{ inputs.sanitizers }} run: | # Install target binaries mkdir -p /tmp/neon/bin/ @@ -225,7 +228,7 @@ jobs: role-duration-seconds: 18000 # 5 hours - name: Run rust tests - if: ${{ matrix.sanitizers != 'enabled' }} + if: ${{ inputs.sanitizers != 'enabled' }} env: NEXTEST_RETRIES: 3 run: | @@ -287,6 +290,7 @@ jobs: DATABASE_URL: postgresql://localhost:1235/storage_controller POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install run: | + export ASAN_OPTIONS=detect_leaks=0 /tmp/neon/bin/neon_local init /tmp/neon/bin/neon_local storage_controller start @@ -333,7 +337,7 @@ jobs: - name: Pytest regression tests continue-on-error: ${{ matrix.lfc_state == 'with-lfc' && inputs.build-type == 'debug' }} uses: ./.github/actions/run-python-test-set - timeout-minutes: ${{ matrix.sanitizers != 'enabled' && 60 || 180 }} + timeout-minutes: ${{ inputs.sanitizers != 'enabled' && 60 || 180 }} with: build_type: ${{ inputs.build-type }} test_selection: regress @@ -344,6 +348,10 @@ jobs: rerun_failed: true pg_version: ${{ matrix.pg_version }} aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + # `--session-timeout` is equal to (timeout-minutes - 10 minutes) * 60 seconds. + # Attempt to stop tests gracefully to generate test reports + # until they are forcibly stopped by the stricter `timeout-minutes` limit. 
+ extra_params: --session-timeout=${{ inputs.sanitizers != 'enabled' && 3000 || 10200 }} env: TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} CHECK_ONDISK_DATA_COMPATIBILITY: nonempty @@ -351,7 +359,7 @@ jobs: PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring PAGESERVER_GET_VECTORED_CONCURRENT_IO: sidecar-task USE_LFC: ${{ matrix.lfc_state == 'with-lfc' && 'true' || 'false' }} - SANITIZERS: ${{ matrix.sanitizers }} + SANITIZERS: ${{ inputs.sanitizers }} # Temporary disable this step until we figure out why it's so flaky # Ref https://github.com/neondatabase/neon/issues/4540 diff --git a/.github/workflows/_push-to-acr.yml b/.github/workflows/_push-to-acr.yml deleted file mode 100644 index c304172ff7..0000000000 --- a/.github/workflows/_push-to-acr.yml +++ /dev/null @@ -1,56 +0,0 @@ -name: Push images to ACR -on: - workflow_call: - inputs: - client_id: - description: Client ID of Azure managed identity or Entra app - required: true - type: string - image_tag: - description: Tag for the container image - required: true - type: string - images: - description: Images to push - required: true - type: string - registry_name: - description: Name of the container registry - required: true - type: string - subscription_id: - description: Azure subscription ID - required: true - type: string - tenant_id: - description: Azure tenant ID - required: true - type: string - -jobs: - push-to-acr: - runs-on: ubuntu-22.04 - permissions: - contents: read # This is required for actions/checkout - id-token: write # This is required for Azure Login to work. - - steps: - - name: Azure login - uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # @v2.1.1 - with: - client-id: ${{ inputs.client_id }} - subscription-id: ${{ inputs.subscription_id }} - tenant-id: ${{ inputs.tenant_id }} - - - name: Login to ACR - run: | - az acr login --name=${{ inputs.registry_name }} - - - name: Copy docker images to ACR ${{ inputs.registry_name }} - run: | - images='${{ inputs.images }}' - for image in ${images}; do - docker buildx imagetools create \ - -t ${{ inputs.registry_name }}.azurecr.io/neondatabase/${image}:${{ inputs.image_tag }} \ - neondatabase/${image}:${{ inputs.image_tag }} - done diff --git a/.github/workflows/_push-to-container-registry.yml b/.github/workflows/_push-to-container-registry.yml new file mode 100644 index 0000000000..3c97c8a67a --- /dev/null +++ b/.github/workflows/_push-to-container-registry.yml @@ -0,0 +1,101 @@ +name: Push images to Container Registry +on: + workflow_call: + inputs: + # Example: {"docker.io/neondatabase/neon:13196061314":["369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:13196061314","neoneastus2.azurecr.io/neondatabase/neon:13196061314"]} + image-map: + description: JSON map of images, mapping from a source image to an array of target images that should be pushed. + required: true + type: string + aws-region: + description: AWS region to log in to. Required when pushing to ECR. + required: false + type: string + aws-account-ids: + description: Comma separated AWS account IDs to log in to for pushing to ECR. Required when pushing to ECR. + required: false + type: string + azure-client-id: + description: Client ID of Azure managed identity or Entra app. Required when pushing to ACR. + required: false + type: string + azure-subscription-id: + description: Azure subscription ID. Required when pushing to ACR. + required: false + type: string + azure-tenant-id: + description: Azure tenant ID. Required when pushing to ACR. 
+ required: false + type: string + acr-registry-name: + description: ACR registry name. Required when pushing to ACR. + required: false + type: string + secrets: + docker-hub-username: + description: Docker Hub username. Required when pushing to Docker Hub. + required: false + docker-hub-password: + description: Docker Hub password. Required when pushing to Docker Hub. + required: false + aws-role-to-assume: + description: AWS role to assume. Required when pushing to ECR. + required: false + +permissions: {} + +defaults: + run: + shell: bash -euo pipefail {0} + +jobs: + push-to-container-registry: + runs-on: ubuntu-22.04 + permissions: + id-token: write # Required for aws/azure login + steps: + - uses: actions/checkout@v4 + with: + sparse-checkout: scripts/push_with_image_map.py + sparse-checkout-cone-mode: false + + - name: Print image-map + run: echo '${{ inputs.image-map }}' | jq + + - name: Configure AWS credentials + if: contains(inputs.image-map, 'amazonaws.com/') + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: "${{ inputs.aws-region }}" + role-to-assume: "${{ secrets.aws-role-to-assume }}" + role-duration-seconds: 3600 + + - name: Login to ECR + if: contains(inputs.image-map, 'amazonaws.com/') + uses: aws-actions/amazon-ecr-login@v2 + with: + registries: "${{ inputs.aws-account-ids }}" + + - name: Configure Azure credentials + if: contains(inputs.image-map, 'azurecr.io/') + uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # @v2.1.1 + with: + client-id: ${{ inputs.azure-client-id }} + subscription-id: ${{ inputs.azure-subscription-id }} + tenant-id: ${{ inputs.azure-tenant-id }} + + - name: Login to ACR + if: contains(inputs.image-map, 'azurecr.io/') + run: | + az acr login --name=${{ inputs.acr-registry-name }} + + - name: Log in to Docker Hub + uses: docker/login-action@v3 + with: + username: ${{ secrets.docker-hub-username }} + password: ${{ secrets.docker-hub-password }} + + - name: Copy docker images to target registries + run: python scripts/push_with_image_map.py + env: + IMAGE_MAP: ${{ inputs.image-map }} diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 5a4bdecb99..bc773600ea 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -263,8 +263,9 @@ jobs: echo "json=$(jq --compact-output '.' 
/tmp/benchmark_durations.json)" >> $GITHUB_OUTPUT benchmarks: - if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks') - needs: [ check-permissions, build-and-test-locally, build-build-tools-image, get-benchmarks-durations ] + # `!failure() && !cancelled()` is required because the workflow depends on the job that can be skipped: `deploy` in PRs + if: github.ref_name == 'main' || (contains(github.event.pull_request.labels.*.name, 'run-benchmarks') && !failure() && !cancelled()) + needs: [ check-permissions, build-build-tools-image, get-benchmarks-durations, deploy ] permissions: id-token: write # aws-actions/configure-aws-credentials statuses: write @@ -497,7 +498,7 @@ jobs: trigger-e2e-tests: if: ${{ !github.event.pull_request.draft || contains( github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft') || github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' }} - needs: [ check-permissions, promote-images-dev, tag ] + needs: [ check-permissions, push-neon-image-dev, push-compute-image-dev, tag ] uses: ./.github/workflows/trigger-e2e-tests.yml secrets: inherit @@ -571,21 +572,6 @@ jobs: neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm-x64 \ neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm-arm64 - - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@v4 - with: - aws-region: eu-central-1 - role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - role-duration-seconds: 3600 - - - name: Login to Amazon Dev ECR - uses: aws-actions/amazon-ecr-login@v2 - - - name: Push multi-arch image to ECR - run: | - docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{ needs.tag.outputs.build-tag }} \ - neondatabase/neon:${{ needs.tag.outputs.build-tag }} - compute-node-image-arch: needs: [ check-permissions, build-build-tools-image, tag ] permissions: @@ -632,16 +618,6 @@ jobs: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@v4 - with: - aws-region: eu-central-1 - role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - role-duration-seconds: 3600 - - - name: Login to Amazon Dev ECR - uses: aws-actions/amazon-ecr-login@v2 - - uses: docker/login-action@v3 with: registry: cache.neon.build @@ -729,21 +705,6 @@ jobs: neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \ neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64 - - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@v4 - with: - aws-region: eu-central-1 - role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - role-duration-seconds: 3600 - - - name: Login to Amazon Dev ECR - uses: aws-actions/amazon-ecr-login@v2 - - - name: Push multi-arch compute-node-${{ matrix.version.pg }} image to ECR - run: | - docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \ - neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} - vm-compute-node-image: needs: [ check-permissions, tag, compute-node-image ] runs-on: [ self-hosted, large ] @@ -876,133 +837,109 @@ jobs: docker compose --profile test-extensions -f 
./docker-compose/docker-compose.yml logs || true docker compose --profile test-extensions -f ./docker-compose/docker-compose.yml down - promote-images-dev: - needs: [ check-permissions, tag, vm-compute-node-image, neon-image ] + generate-image-maps: + needs: [ tag ] runs-on: ubuntu-22.04 - - permissions: - id-token: write # aws-actions/configure-aws-credentials - statuses: write - contents: read - - env: - VERSIONS: v14 v15 v16 v17 - + outputs: + neon-dev: ${{ steps.generate.outputs.neon-dev }} + neon-prod: ${{ steps.generate.outputs.neon-prod }} + compute-dev: ${{ steps.generate.outputs.compute-dev }} + compute-prod: ${{ steps.generate.outputs.compute-prod }} steps: - - uses: docker/login-action@v3 + - uses: actions/checkout@v4 with: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + sparse-checkout: scripts/generate_image_maps.py + sparse-checkout-cone-mode: false - - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@v4 - with: - aws-region: eu-central-1 - role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - role-duration-seconds: 3600 + - name: Generate Image Maps + id: generate + run: python scripts/generate_image_maps.py + env: + BUILD_TAG: "${{ needs.tag.outputs.build-tag }}" + BRANCH: "${{ github.ref_name }}" + DEV_ACR: "${{ vars.AZURE_DEV_REGISTRY_NAME }}" + PROD_ACR: "${{ vars.AZURE_PROD_REGISTRY_NAME }}" - - name: Login to Amazon Dev ECR - uses: aws-actions/amazon-ecr-login@v2 - - - name: Copy vm-compute-node images to ECR - run: | - for version in ${VERSIONS}; do - docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }} \ - neondatabase/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }} - done - - promote-images-prod: - needs: [ check-permissions, tag, test-images, promote-images-dev ] - runs-on: ubuntu-22.04 - if: github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' - - permissions: - id-token: write # aws-actions/configure-aws-credentials - statuses: write - contents: read - - env: - VERSIONS: v14 v15 v16 v17 - - steps: - - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@v4 - with: - aws-region: eu-central-1 - role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - role-duration-seconds: 3600 - - - name: Login to Amazon Dev ECR - uses: aws-actions/amazon-ecr-login@v2 - - - uses: docker/login-action@v3 - with: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - - - name: Add latest tag to images - if: github.ref_name == 'main' - run: | - for repo in neondatabase 369495373322.dkr.ecr.eu-central-1.amazonaws.com; do - docker buildx imagetools create -t $repo/neon:latest \ - $repo/neon:${{ needs.tag.outputs.build-tag }} - - for version in ${VERSIONS}; do - docker buildx imagetools create -t $repo/compute-node-${version}:latest \ - $repo/compute-node-${version}:${{ needs.tag.outputs.build-tag }} - - docker buildx imagetools create -t $repo/vm-compute-node-${version}:latest \ - $repo/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }} - done - done - docker buildx imagetools create -t neondatabase/neon-test-extensions-v16:latest \ - neondatabase/neon-test-extensions-v16:${{ needs.tag.outputs.build-tag }} - - - name: Configure AWS-prod credentials - if: github.ref_name == 'release'|| github.ref_name == 'release-proxy' || github.ref_name == 
'release-compute' - uses: aws-actions/configure-aws-credentials@v4 - with: - aws-region: eu-central-1 - mask-aws-account-id: true - role-to-assume: ${{ secrets.PROD_GHA_OIDC_ROLE }} - - - name: Login to prod ECR - uses: docker/login-action@v3 - if: github.ref_name == 'release'|| github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' - with: - registry: 093970136003.dkr.ecr.eu-central-1.amazonaws.com - - - name: Copy all images to prod ECR - if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' - run: | - for image in neon {vm-,}compute-node-{v14,v15,v16,v17}; do - docker buildx imagetools create -t 093970136003.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }} \ - 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }} - done - - push-to-acr-dev: - if: github.ref_name == 'main' - needs: [ tag, promote-images-dev ] - uses: ./.github/workflows/_push-to-acr.yml + push-neon-image-dev: + needs: [ generate-image-maps, neon-image ] + uses: ./.github/workflows/_push-to-container-registry.yml with: - client_id: ${{ vars.AZURE_DEV_CLIENT_ID }} - image_tag: ${{ needs.tag.outputs.build-tag }} - images: neon vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17 - registry_name: ${{ vars.AZURE_DEV_REGISTRY_NAME }} - subscription_id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }} - tenant_id: ${{ vars.AZURE_TENANT_ID }} + image-map: '${{ needs.generate-image-maps.outputs.neon-dev }}' + aws-region: eu-central-1 + aws-account-ids: "369495373322" + azure-client-id: ${{ vars.AZURE_DEV_CLIENT_ID }} + azure-subscription-id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }} + azure-tenant-id: ${{ vars.AZURE_TENANT_ID }} + acr-registry-name: ${{ vars.AZURE_DEV_REGISTRY_NAME }} + secrets: + aws-role-to-assume: "${{ vars.DEV_AWS_OIDC_ROLE_ARN }}" + docker-hub-username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - push-to-acr-prod: + push-compute-image-dev: + needs: [ generate-image-maps, vm-compute-node-image ] + uses: ./.github/workflows/_push-to-container-registry.yml + with: + image-map: '${{ needs.generate-image-maps.outputs.compute-dev }}' + aws-region: eu-central-1 + aws-account-ids: "369495373322" + azure-client-id: ${{ vars.AZURE_DEV_CLIENT_ID }} + azure-subscription-id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }} + azure-tenant-id: ${{ vars.AZURE_TENANT_ID }} + acr-registry-name: ${{ vars.AZURE_DEV_REGISTRY_NAME }} + secrets: + aws-role-to-assume: "${{ vars.DEV_AWS_OIDC_ROLE_ARN }}" + docker-hub-username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + + push-neon-image-prod: if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' - needs: [ tag, promote-images-prod ] - uses: ./.github/workflows/_push-to-acr.yml + needs: [ generate-image-maps, neon-image, test-images ] + uses: ./.github/workflows/_push-to-container-registry.yml with: - client_id: ${{ vars.AZURE_PROD_CLIENT_ID }} - image_tag: ${{ needs.tag.outputs.build-tag }} - images: neon vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17 - registry_name: ${{ vars.AZURE_PROD_REGISTRY_NAME }} - subscription_id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }} - tenant_id: ${{ vars.AZURE_TENANT_ID }} 
+ image-map: '${{ needs.generate-image-maps.outputs.neon-prod }}' + aws-region: eu-central-1 + aws-account-ids: "093970136003" + azure-client-id: ${{ vars.AZURE_PROD_CLIENT_ID }} + azure-subscription-id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }} + azure-tenant-id: ${{ vars.AZURE_TENANT_ID }} + acr-registry-name: ${{ vars.AZURE_PROD_REGISTRY_NAME }} + secrets: + aws-role-to-assume: "${{ secrets.PROD_GHA_OIDC_ROLE }}" + docker-hub-username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + + push-compute-image-prod: + if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' + needs: [ generate-image-maps, vm-compute-node-image, test-images ] + uses: ./.github/workflows/_push-to-container-registry.yml + with: + image-map: '${{ needs.generate-image-maps.outputs.compute-prod }}' + aws-region: eu-central-1 + aws-account-ids: "093970136003" + azure-client-id: ${{ vars.AZURE_PROD_CLIENT_ID }} + azure-subscription-id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }} + azure-tenant-id: ${{ vars.AZURE_TENANT_ID }} + acr-registry-name: ${{ vars.AZURE_PROD_REGISTRY_NAME }} + secrets: + aws-role-to-assume: "${{ secrets.PROD_GHA_OIDC_ROLE }}" + docker-hub-username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + + # This is a bit of a special case, so we're not using a generated image map. + add-latest-tag-to-neon-extensions-test-image: + if: github.ref_name == 'main' + needs: [ tag, compute-node-image ] + uses: ./.github/workflows/_push-to-container-registry.yml + with: + image-map: | + { + "docker.io/neondatabase/neon-test-extensions-v16:${{ needs.tag.outputs.build-tag }}": ["docker.io/neondatabase/neon-test-extensions-v16:latest"], + "docker.io/neondatabase/neon-test-extensions-v17:${{ needs.tag.outputs.build-tag }}": ["docker.io/neondatabase/neon-test-extensions-v17:latest"] + } + secrets: + docker-hub-username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} trigger-custom-extensions-build-and-wait: needs: [ check-permissions, tag ] @@ -1084,7 +1021,7 @@ jobs: exit 1 deploy: - needs: [ check-permissions, promote-images-prod, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait, push-to-acr-dev, push-to-acr-prod ] + needs: [ check-permissions, push-neon-image-prod, push-compute-image-prod, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait ] - # `!failure() && !cancelled()` is required because the workflow depends on the job that can be skipped: `push-to-acr-dev` and `push-to-acr-prod` + # `!failure() && !cancelled()` is required because this job depends on jobs that can be skipped: `push-neon-image-prod` and `push-compute-image-prod` if: (github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute') && !failure() && !cancelled() permissions: @@ -1337,7 +1274,7 @@ jobs: done pin-build-tools-image: - needs: [ build-build-tools-image, promote-images-prod, build-and-test-locally ] + needs: [ build-build-tools-image, push-compute-image-prod, push-neon-image-prod, build-and-test-locally ] if: github.ref_name == 'main' uses: ./.github/workflows/pin-build-tools-image.yml with: @@ -1362,7 +1299,8 @@ jobs: - check-codestyle-rust - check-dependencies-rust - files-changed - - promote-images-dev + - push-compute-image-dev + - push-neon-image-dev - test-images - trigger-custom-extensions-build-and-wait runs-on: ubuntu-22.04 @@ -1379,6 +1317,7 @@ jobs: || needs.check-codestyle-python.result == 'skipped' || needs.check-codestyle-rust.result == 'skipped' || 
needs.files-changed.result == 'skipped' - || needs.promote-images-dev.result == 'skipped' + || needs.push-compute-image-dev.result == 'skipped' + || needs.push-neon-image-dev.result == 'skipped' || needs.test-images.result == 'skipped' || needs.trigger-custom-extensions-build-and-wait.result == 'skipped' diff --git a/.github/workflows/build_and_test_with_sanitizers.yml b/.github/workflows/build_and_test_with_sanitizers.yml index cf0de3f8dc..2bc938509f 100644 --- a/.github/workflows/build_and_test_with_sanitizers.yml +++ b/.github/workflows/build_and_test_with_sanitizers.yml @@ -74,7 +74,8 @@ jobs: build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm build-tag: ${{ needs.tag.outputs.build-tag }} build-type: ${{ matrix.build-type }} - test-cfg: '[{"pg_version":"v17", "sanitizers": "enabled"}]' + test-cfg: '[{"pg_version":"v17"}]' + sanitizers: enabled secrets: inherit diff --git a/.github/workflows/force-test-extensions-upgrade.yml b/.github/workflows/force-test-extensions-upgrade.yml new file mode 100644 index 0000000000..71c5158ef6 --- /dev/null +++ b/.github/workflows/force-test-extensions-upgrade.yml @@ -0,0 +1,76 @@ +name: Force Test Upgrading of Extension +on: + schedule: + # * is a special character in YAML so you have to quote this string + # ┌───────────── minute (0 - 59) + # │ ┌───────────── hour (0 - 23) + # │ │ ┌───────────── day of the month (1 - 31) + # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) + # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) + - cron: '45 2 * * *' # run once a day, timezone is utc + workflow_dispatch: # adds ability to run this manually + +defaults: + run: + shell: bash -euxo pipefail {0} + +concurrency: + # Allow only one workflow + group: ${{ github.workflow }} + cancel-in-progress: true + +permissions: + id-token: write # aws-actions/configure-aws-credentials + statuses: write + contents: read + +jobs: + regress: + strategy: + fail-fast: false + matrix: + pg-version: [16, 17] + + runs-on: small + + steps: + - uses: actions/checkout@v4 + with: + submodules: false + + - name: Get the last compute release tag + id: get-last-compute-release-tag + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + tag=$(gh api -q '[.[].tag_name | select(startswith("release-compute"))][0]'\ + -H "Accept: application/vnd.github+json" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + "/repos/${GITHUB_REPOSITORY}/releases") + echo tag=${tag} >> ${GITHUB_OUTPUT} + + - name: Test extension upgrade + timeout-minutes: 20 + env: + NEWTAG: latest + OLDTAG: ${{ steps.get-last-compute-release-tag.outputs.tag }} + PG_VERSION: ${{ matrix.pg-version }} + FORCE_ALL_UPGRADE_TESTS: true + run: ./docker-compose/test_extensions_upgrade.sh + + - name: Print logs and clean up + if: always() + run: | + docker compose --profile test-extensions -f ./docker-compose/docker-compose.yml logs || true + docker compose --profile test-extensions -f ./docker-compose/docker-compose.yml down + + - name: Post to the Slack channel + if: ${{ github.event.schedule && failure() }} + uses: slackapi/slack-github-action@v1 + with: + channel-id: ${{ vars.SLACK_ON_CALL_QA_STAGING_STREAM }} + slack-message: | + Test upgrading of extensions: ${{ job.status }} + <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} diff --git a/.github/workflows/regenerate-pg-setting.yml b/.github/workflows/regenerate-pg-setting.yml new file mode 100644 index 0000000000..1e9d2ec5e2 --- /dev/null +++ 
b/.github/workflows/regenerate-pg-setting.yml @@ -0,0 +1,41 @@ +name: Regenerate Postgres Settings + +on: + pull_request: + types: + - opened + - synchronize + - reopened + paths: + - pgxn/neon/**.c + - vendor/postgres-v* + - vendor/revisions.json + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref }} + cancel-in-progress: true + +permissions: + pull-requests: write + +jobs: + regenerate-pg-settings: + runs-on: ubuntu-22.04 + + steps: + - name: Add comment + uses: thollander/actions-comment-pull-request@v3 + with: + comment-tag: ${{ github.job }} + pr-number: ${{ github.event.number }} + message: | + If this PR added a GUC in the Postgres fork or `neon` extension, + please regenerate the Postgres settings in the `cloud` repo: + + ``` + make NEON_WORKDIR=path/to/neon/checkout \ + -C goapp/internal/shareddomain/postgres generate + ``` + + If you're an external contributor, a Neon employee will assist in + making sure this step is done. diff --git a/.github/workflows/trigger-e2e-tests.yml b/.github/workflows/trigger-e2e-tests.yml index 31696248b0..be6a7a7901 100644 --- a/.github/workflows/trigger-e2e-tests.yml +++ b/.github/workflows/trigger-e2e-tests.yml @@ -15,7 +15,14 @@ env: E2E_CONCURRENCY_GROUP: ${{ github.repository }}-e2e-tests-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }} jobs: + check-permissions: + if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }} + uses: ./.github/workflows/check-permissions.yml + with: + github-event-name: ${{ github.event_name }} + cancel-previous-e2e-tests: + needs: [ check-permissions ] if: github.event_name == 'pull_request' runs-on: ubuntu-22.04 @@ -29,6 +36,7 @@ jobs: --field concurrency_group="${{ env.E2E_CONCURRENCY_GROUP }}" tag: + needs: [ check-permissions ] runs-on: ubuntu-22.04 outputs: build-tag: ${{ steps.build-tag.outputs.tag }} @@ -68,7 +76,7 @@ jobs: GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} TAG: ${{ needs.tag.outputs.build-tag }} steps: - - name: Wait for `promote-images-dev` job to finish + - name: Wait for `push-{neon,compute}-image-dev` job to finish # It's important to have a timeout here, the script in the step can run infinitely timeout-minutes: 60 run: | @@ -79,20 +87,20 @@ jobs: # For PRs we use the run id as the tag BUILD_AND_TEST_RUN_ID=${TAG} while true; do - conclusion=$(gh run --repo ${GITHUB_REPOSITORY} view ${BUILD_AND_TEST_RUN_ID} --json jobs --jq '.jobs[] | select(.name == "promote-images-dev") | .conclusion') - case "$conclusion" in - success) - break - ;; - failure | cancelled | skipped) - echo "The 'promote-images-dev' job didn't succeed: '${conclusion}'. Exiting..." - exit 1 - ;; - *) - echo "The 'promote-images-dev' hasn't succeed yet. Waiting..." - sleep 60 - ;; - esac + gh run --repo ${GITHUB_REPOSITORY} view ${BUILD_AND_TEST_RUN_ID} --json jobs --jq '[.jobs[] | select((.name | startswith("push-neon-image-dev")) or (.name | startswith("push-compute-image-dev"))) | {"name": .name, "conclusion": .conclusion, "url": .url}]' > jobs.json + if [ $(jq '[.[] | select(.conclusion == "success")] | length' jobs.json) -eq 2 ]; then + break + fi + jq -c '.[]' jobs.json | while read -r job; do + case $(echo $job | jq .conclusion) in + failure | cancelled | skipped) + echo "The '$(echo $job | jq .name)' job didn't succeed: '$(echo $job | jq .conclusion)'. See log in '$(echo $job | jq .url)' Exiting..." + exit 1 + ;; + esac + done + echo "The 'push-{neon,compute}-image-dev' jobs haven't succeeded yet. Waiting..." 
+ sleep 60 done - name: Set e2e-platforms diff --git a/Cargo.lock b/Cargo.lock index 2c5b0a113f..12c12bc771 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -300,9 +300,9 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" [[package]] name = "aws-config" -version = "1.5.15" +version = "1.5.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc47e70fc35d054c8fcd296d47a61711f043ac80534a10b4f741904f81e73a90" +checksum = "9b49afaa341e8dd8577e1a2200468f98956d6eda50bcf4a53246cc00174ba924" dependencies = [ "aws-credential-types", "aws-runtime", @@ -311,7 +311,7 @@ dependencies = [ "aws-sdk-sts", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-json", + "aws-smithy-json 0.60.7", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", @@ -342,9 +342,9 @@ dependencies = [ [[package]] name = "aws-runtime" -version = "1.5.4" +version = "1.4.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bee7643696e7fdd74c10f9eb42848a87fe469d35eae9c3323f80aa98f350baac" +checksum = "b5ac934720fbb46206292d2c75b57e67acfc56fe7dfd34fb9a02334af08409ea" dependencies = [ "aws-credential-types", "aws-sigv4", @@ -368,15 +368,15 @@ dependencies = [ [[package]] name = "aws-sdk-iam" -version = "1.60.0" +version = "1.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a43daa438f8e7e4ebbbcb5c712b3b85db50d62e637a7da4ba9da51095d327460" +checksum = "fb8a6fea8d335cde419176b1f2c6d2d6e97997719e7df4b51e59064310f48e4a" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-json", + "aws-smithy-json 0.61.1", "aws-smithy-query", "aws-smithy-runtime", "aws-smithy-runtime-api", @@ -391,15 +391,15 @@ dependencies = [ [[package]] name = "aws-sdk-kms" -version = "1.58.0" +version = "1.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "40b7a24700ac548025a47a5c579886f5198895bb1eccd8964dfd71cd66c16912" +checksum = "3c30f6fd5646b99d9b45ec3a0c22e67112c175b2383100c960d7ee39d96c8d96" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-json", + "aws-smithy-json 0.61.1", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", @@ -413,9 +413,9 @@ dependencies = [ [[package]] name = "aws-sdk-s3" -version = "1.68.0" +version = "1.65.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc5ddf1dc70287dc9a2f953766a1fe15e3e74aef02fd1335f2afa475c9b4f4fc" +checksum = "d3ba2c5c0f2618937ce3d4a5ad574b86775576fa24006bcb3128c6e2cbf3c34e" dependencies = [ "aws-credential-types", "aws-runtime", @@ -424,7 +424,7 @@ dependencies = [ "aws-smithy-checksums", "aws-smithy-eventstream", "aws-smithy-http", - "aws-smithy-json", + "aws-smithy-json 0.61.1", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", @@ -447,15 +447,15 @@ dependencies = [ [[package]] name = "aws-sdk-sso" -version = "1.57.0" +version = "1.50.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c54bab121fe1881a74c338c5f723d1592bf3b53167f80268a1274f404e1acc38" +checksum = "05ca43a4ef210894f93096039ef1d6fa4ad3edfabb3be92b80908b9f2e4b4eab" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-json", + "aws-smithy-json 0.61.1", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", @@ -469,15 +469,15 @@ dependencies = [ [[package]] name = "aws-sdk-ssooidc" -version = "1.58.0" +version = 
"1.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c8234fd024f7ac61c4e44ea008029bde934250f371efe7d4a39708397b1080c" +checksum = "abaf490c2e48eed0bb8e2da2fb08405647bd7f253996e0f93b981958ea0f73b0" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-json", + "aws-smithy-json 0.61.1", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", @@ -491,15 +491,15 @@ dependencies = [ [[package]] name = "aws-sdk-sts" -version = "1.58.0" +version = "1.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba60e1d519d6f23a9df712c04fdeadd7872ac911c84b2f62a8bda92e129b7962" +checksum = "b68fde0d69c8bfdc1060ea7da21df3e39f6014da316783336deff0a9ec28f4bf" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-json", + "aws-smithy-json 0.61.1", "aws-smithy-query", "aws-smithy-runtime", "aws-smithy-runtime-api", @@ -514,9 +514,9 @@ dependencies = [ [[package]] name = "aws-sigv4" -version = "1.2.8" +version = "1.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0bc5bbd1e4a2648fd8c5982af03935972c24a2f9846b396de661d351ee3ce837" +checksum = "7d3820e0c08d0737872ff3c7c1f21ebbb6693d832312d6152bf18ef50a5471c2" dependencies = [ "aws-credential-types", "aws-smithy-eventstream", @@ -543,9 +543,9 @@ dependencies = [ [[package]] name = "aws-smithy-async" -version = "1.2.4" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa59d1327d8b5053c54bf2eaae63bf629ba9e904434d0835a28ed3c0ed0a614e" +checksum = "62220bc6e97f946ddd51b5f1361f78996e704677afc518a4ff66b7a72ea1378c" dependencies = [ "futures-util", "pin-project-lite", @@ -575,9 +575,9 @@ dependencies = [ [[package]] name = "aws-smithy-eventstream" -version = "0.60.6" +version = "0.60.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b18559a41e0c909b77625adf2b8c50de480a8041e5e4a3f5f7d177db70abc5a" +checksum = "cef7d0a272725f87e51ba2bf89f8c21e4df61b9e49ae1ac367a6d69916ef7c90" dependencies = [ "aws-smithy-types", "bytes", @@ -586,9 +586,9 @@ dependencies = [ [[package]] name = "aws-smithy-http" -version = "0.60.12" +version = "0.60.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7809c27ad8da6a6a68c454e651d4962479e81472aa19ae99e59f9aba1f9713cc" +checksum = "5c8bc3e8fdc6b8d07d976e301c02fe553f72a39b7a9fea820e023268467d7ab6" dependencies = [ "aws-smithy-eventstream", "aws-smithy-runtime-api", @@ -607,9 +607,18 @@ dependencies = [ [[package]] name = "aws-smithy-json" -version = "0.61.2" +version = "0.60.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "623a51127f24c30776c8b374295f2df78d92517386f77ba30773f15a30ce1422" +checksum = "4683df9469ef09468dad3473d129960119a0d3593617542b7d52086c8486f2d6" +dependencies = [ + "aws-smithy-types", +] + +[[package]] +name = "aws-smithy-json" +version = "0.61.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee4e69cc50921eb913c6b662f8d909131bb3e6ad6cb6090d3a39b66fc5c52095" dependencies = [ "aws-smithy-types", ] @@ -626,9 +635,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime" -version = "1.7.7" +version = "1.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "865f7050bbc7107a6c98a397a9fcd9413690c27fa718446967cf03b2d3ac517e" +checksum = "9f20685047ca9d6f17b994a07f629c813f08b5bce65523e47124879e60103d45" dependencies = [ 
"aws-smithy-async", "aws-smithy-http", @@ -670,9 +679,9 @@ dependencies = [ [[package]] name = "aws-smithy-types" -version = "1.2.13" +version = "1.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7b8a53819e42f10d0821f56da995e1470b199686a1809168db6ca485665f042" +checksum = "4fbd94a32b3a7d55d3806fe27d98d3ad393050439dd05eb53ece36ec5e3d3510" dependencies = [ "base64-simd", "bytes", @@ -705,9 +714,9 @@ dependencies = [ [[package]] name = "aws-types" -version = "1.3.5" +version = "1.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dfbd0a668309ec1f66c0f6bda4840dd6d4796ae26d699ebc266d7cc95c6d040f" +checksum = "5221b91b3e441e6675310829fd8984801b772cb1546ef6c0e54dec9f1ac13fef" dependencies = [ "aws-credential-types", "aws-smithy-async", @@ -777,7 +786,7 @@ dependencies = [ [[package]] name = "azure_core" version = "0.21.0" -source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#66e77bdd87bf87e773acf3b0c84b532c1124367d" +source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#f64bd57262ced51afce5d8909c06dcb11a6dd85a" dependencies = [ "async-trait", "base64 0.22.1", @@ -806,7 +815,7 @@ dependencies = [ [[package]] name = "azure_identity" version = "0.21.0" -source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#66e77bdd87bf87e773acf3b0c84b532c1124367d" +source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#f64bd57262ced51afce5d8909c06dcb11a6dd85a" dependencies = [ "async-lock", "async-trait", @@ -825,7 +834,7 @@ dependencies = [ [[package]] name = "azure_storage" version = "0.21.0" -source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#66e77bdd87bf87e773acf3b0c84b532c1124367d" +source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#f64bd57262ced51afce5d8909c06dcb11a6dd85a" dependencies = [ "RustyXML", "async-lock", @@ -843,7 +852,7 @@ dependencies = [ [[package]] name = "azure_storage_blobs" version = "0.21.0" -source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#66e77bdd87bf87e773acf3b0c84b532c1124367d" +source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#f64bd57262ced51afce5d8909c06dcb11a6dd85a" dependencies = [ "RustyXML", "azure_core", @@ -863,7 +872,7 @@ dependencies = [ [[package]] name = "azure_svc_blobstorage" version = "0.21.0" -source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#66e77bdd87bf87e773acf3b0c84b532c1124367d" +source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#f64bd57262ced51afce5d8909c06dcb11a6dd85a" dependencies = [ "azure_core", "bytes", @@ -1020,12 +1029,6 @@ dependencies = [ "generic-array", ] -[[package]] -name = "boxcar" -version = "0.2.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2721c3c5a6f0e7f7e607125d963fedeb765f545f67adc9d71ed934693881eb42" - [[package]] name = "bstr" version = "1.5.0" @@ -1284,6 +1287,7 @@ version = "0.1.0" dependencies = [ "anyhow", "chrono", + "jsonwebtoken", "regex", "remote_storage", "serde", @@ -1299,6 +1303,7 @@ dependencies = [ "aws-config", "aws-sdk-kms", "aws-sdk-s3", + "aws-smithy-types", "axum", "base64 0.13.1", "bytes", @@ -1311,6 +1316,7 @@ dependencies = [ "flate2", "futures", "http 1.1.0", + "jsonwebtoken", "metrics", "nix 0.27.1", "notify", @@ -1346,6 +1352,7 @@ dependencies = [ "utils", "uuid", "vm_monitor", + "walkdir", "workspace_hack", "zstd", ] @@ -1424,6 
+1431,7 @@ dependencies = [ "comfy-table", "compute_api", "futures", + "http-utils", "humantime", "humantime-serde", "hyper 0.14.30", @@ -2748,6 +2756,38 @@ dependencies = [ "url", ] +[[package]] +name = "http-utils" +version = "0.1.0" +dependencies = [ + "anyhow", + "backtrace", + "bytes", + "fail", + "flate2", + "hyper 0.14.30", + "inferno 0.12.0", + "itertools 0.10.5", + "jemalloc_pprof", + "metrics", + "once_cell", + "pprof", + "regex", + "routerify", + "serde", + "serde_json", + "serde_path_to_error", + "thiserror 1.0.69", + "tokio", + "tokio-stream", + "tokio-util", + "tracing", + "url", + "utils", + "uuid", + "workspace_hack", +] + [[package]] name = "httparse" version = "1.8.0" @@ -4102,6 +4142,7 @@ dependencies = [ "futures", "hex", "hex-literal", + "http-utils", "humantime", "humantime-serde", "hyper 0.14.30", @@ -4202,6 +4243,7 @@ dependencies = [ "anyhow", "bytes", "futures", + "http-utils", "pageserver_api", "postgres", "reqwest", @@ -4883,7 +4925,6 @@ dependencies = [ "aws-sdk-iam", "aws-sigv4", "base64 0.13.1", - "boxcar", "bstr", "bytes", "camino", @@ -4908,6 +4949,7 @@ dependencies = [ "hostname", "http 1.1.0", "http-body-util", + "http-utils", "humantime", "humantime-serde", "hyper 0.14.30", @@ -4934,7 +4976,6 @@ dependencies = [ "postgres-protocol2", "postgres_backend", "pq_proto", - "prometheus", "rand 0.8.5", "rand_distr", "rcgen", @@ -4959,7 +5000,6 @@ dependencies = [ "smallvec", "smol_str", "socket2", - "strum", "strum_macros", "subtle", "thiserror 1.0.69", @@ -4974,7 +5014,6 @@ dependencies = [ "tracing", "tracing-log", "tracing-opentelemetry", - "tracing-serde", "tracing-subscriber", "tracing-utils", "try-lock", @@ -5755,6 +5794,7 @@ dependencies = [ "futures", "hex", "http 1.1.0", + "http-utils", "humantime", "hyper 0.14.30", "itertools 0.10.5", @@ -5819,6 +5859,7 @@ dependencies = [ name = "safekeeper_client" version = "0.1.0" dependencies = [ + "http-utils", "reqwest", "safekeeper_api", "serde", @@ -6401,6 +6442,7 @@ dependencies = [ "fail", "futures", "hex", + "http-utils", "humantime", "hyper 0.14.30", "itertools 0.10.5", @@ -6412,10 +6454,13 @@ dependencies = [ "pageserver_client", "postgres_connection", "rand 0.8.5", + "regex", "reqwest", "routerify", "rustls 0.23.18", "rustls-native-certs 0.8.0", + "safekeeper_api", + "safekeeper_client", "scoped-futures", "scopeguard", "serde", @@ -7565,48 +7610,38 @@ dependencies = [ "criterion", "diatomic-waker", "fail", - "flate2", "futures", "git-version", "hex", "hex-literal", "humantime", - "hyper 0.14.30", "inferno 0.12.0", - "itertools 0.10.5", - "jemalloc_pprof", "jsonwebtoken", "metrics", "nix 0.27.1", "once_cell", "pin-project-lite", "postgres_connection", - "pprof", "pq_proto", "rand 0.8.5", "regex", - "routerify", "scopeguard", "sentry", "serde", "serde_assert", "serde_json", - "serde_path_to_error", "serde_with", "signal-hook", "strum", "strum_macros", "thiserror 1.0.69", "tokio", - "tokio-stream", "tokio-tar", "tokio-util", "toml_edit", "tracing", "tracing-error", "tracing-subscriber", - "url", - "uuid", "walkdir", ] @@ -8201,6 +8236,7 @@ dependencies = [ "tracing-core", "tracing-log", "url", + "uuid", "zerocopy", "zeroize", "zstd", diff --git a/Cargo.toml b/Cargo.toml index 76b54ae1d8..7228623c6b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,6 +18,7 @@ members = [ "storage_scrubber", "workspace_hack", "libs/compute_api", + "libs/http-utils", "libs/pageserver_api", "libs/postgres_ffi", "libs/safekeeper_api", @@ -229,6 +230,7 @@ azure_storage_blobs = { git = 
"https://github.com/neondatabase/azure-sdk-for-rus ## Local libraries compute_api = { version = "0.1", path = "./libs/compute_api/" } consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" } +http-utils = { version = "0.1", path = "./libs/http-utils/" } metrics = { version = "0.1", path = "./libs/metrics/" } pageserver = { path = "./pageserver" } pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" } diff --git a/Dockerfile b/Dockerfile index 7ba54c8ca5..83ad86badb 100644 --- a/Dockerfile +++ b/Dockerfile @@ -10,6 +10,28 @@ ARG STABLE_PG_VERSION=16 ARG DEBIAN_VERSION=bookworm ARG DEBIAN_FLAVOR=${DEBIAN_VERSION}-slim +# Here are the INDEX DIGESTS for the images we use. +# You can get them following next steps for now: +# 1. Get an authentication token from DockerHub: +# TOKEN=$(curl -s "https://auth.docker.io/token?service=registry.docker.io&scope=repository:library/debian:pull" | jq -r .token) +# 2. Using that token, query index for the given tag: +# curl -s -H "Authorization: Bearer $TOKEN" \ +# -H "Accept: application/vnd.docker.distribution.manifest.list.v2+json" \ +# "https://registry.hub.docker.com/v2/library/debian/manifests/bullseye-slim" \ +# -I | grep -i docker-content-digest +# 3. As a next step, TODO(fedordikarev): create script and schedule workflow to run these checks +# and updates on regular bases and in automated way. +ARG BOOKWORM_SLIM_SHA=sha256:40b107342c492725bc7aacbe93a49945445191ae364184a6d24fedb28172f6f7 +ARG BULLSEYE_SLIM_SHA=sha256:e831d9a884d63734fe3dd9c491ed9a5a3d4c6a6d32c5b14f2067357c49b0b7e1 + +# Here we use ${var/search/replace} syntax, to check +# if base image is one of the images, we pin image index for. +# If var will match one the known images, we will replace it with the known sha. +# If no match, than value will be unaffected, and will process with no-pinned image. +ARG BASE_IMAGE_SHA=debian:${DEBIAN_FLAVOR} +ARG BASE_IMAGE_SHA=${BASE_IMAGE_SHA/debian:bookworm-slim/debian@$BOOKWORM_SLIM_SHA} +ARG BASE_IMAGE_SHA=${BASE_IMAGE_SHA/debian:bullseye-slim/debian@$BULLSEYE_SLIM_SHA} + # Build Postgres FROM $REPOSITORY/$IMAGE:$TAG AS pg-build WORKDIR /home/nonroot @@ -28,6 +50,14 @@ RUN set -e \ && rm -rf pg_install/build \ && tar -C pg_install -czf /home/nonroot/postgres_install.tar.gz . +# Prepare cargo-chef recipe +FROM $REPOSITORY/$IMAGE:$TAG AS plan +WORKDIR /home/nonroot + +COPY --chown=nonroot . . + +RUN cargo chef prepare --recipe-path recipe.json + # Build neon binaries FROM $REPOSITORY/$IMAGE:$TAG AS build WORKDIR /home/nonroot @@ -41,9 +71,15 @@ COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_i COPY --from=pg-build /home/nonroot/pg_install/v17/include/postgresql/server pg_install/v17/include/postgresql/server COPY --from=pg-build /home/nonroot/pg_install/v16/lib pg_install/v16/lib COPY --from=pg-build /home/nonroot/pg_install/v17/lib pg_install/v17/lib +COPY --from=plan /home/nonroot/recipe.json recipe.json + +ARG ADDITIONAL_RUSTFLAGS="" + +RUN set -e \ + && RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment -Cforce-frame-pointers=yes ${ADDITIONAL_RUSTFLAGS}" cargo chef cook --locked --release --recipe-path recipe.json + COPY --chown=nonroot . . 
-ARG ADDITIONAL_RUSTFLAGS RUN set -e \ && RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment -Cforce-frame-pointers=yes ${ADDITIONAL_RUSTFLAGS}" cargo build \ --bin pg_sni_router \ @@ -59,7 +95,7 @@ RUN set -e \ # Build final image # -FROM debian:${DEBIAN_FLAVOR} +FROM $BASE_IMAGE_SHA ARG DEFAULT_PG_VERSION WORKDIR /data @@ -112,4 +148,3 @@ EXPOSE 6400 EXPOSE 9898 CMD ["/usr/local/bin/pageserver", "-D", "/data/.neon"] - diff --git a/build-tools.Dockerfile b/build-tools.Dockerfile index 52874d2ef6..317eded26e 100644 --- a/build-tools.Dockerfile +++ b/build-tools.Dockerfile @@ -1,6 +1,29 @@ ARG DEBIAN_VERSION=bookworm +ARG DEBIAN_FLAVOR=${DEBIAN_VERSION}-slim -FROM debian:bookworm-slim AS pgcopydb_builder +# Here are the INDEX DIGESTS for the images we use. +# You can get them by following these steps for now: +# 1. Get an authentication token from DockerHub: +# TOKEN=$(curl -s "https://auth.docker.io/token?service=registry.docker.io&scope=repository:library/debian:pull" | jq -r .token) +# 2. Using that token, query the index for the given tag: +# curl -s -H "Authorization: Bearer $TOKEN" \ +# -H "Accept: application/vnd.docker.distribution.manifest.list.v2+json" \ +# "https://registry.hub.docker.com/v2/library/debian/manifests/bullseye-slim" \ +# -I | grep -i docker-content-digest +# 3. As a next step, TODO(fedordikarev): create a script and schedule a workflow to run these checks +# and updates on a regular basis in an automated way. +ARG BOOKWORM_SLIM_SHA=sha256:40b107342c492725bc7aacbe93a49945445191ae364184a6d24fedb28172f6f7 +ARG BULLSEYE_SLIM_SHA=sha256:e831d9a884d63734fe3dd9c491ed9a5a3d4c6a6d32c5b14f2067357c49b0b7e1 + +# Here we use ${var/search/replace} syntax to check +# whether the base image is one of the images we pin the image index for. +# If the variable matches one of the known images, it is replaced with the known SHA. +# If there is no match, the value is left unchanged and the build proceeds with an unpinned image. +ARG BASE_IMAGE_SHA=debian:${DEBIAN_FLAVOR} +ARG BASE_IMAGE_SHA=${BASE_IMAGE_SHA/debian:bookworm-slim/debian@$BOOKWORM_SLIM_SHA} +ARG BASE_IMAGE_SHA=${BASE_IMAGE_SHA/debian:bullseye-slim/debian@$BULLSEYE_SLIM_SHA} + +FROM $BASE_IMAGE_SHA AS pgcopydb_builder ARG DEBIAN_VERSION # Use strict mode for bash to catch errors early @@ -9,7 +32,7 @@ SHELL ["/bin/bash", "-euo", "pipefail", "-c"] # By default, /bin/sh used in debian images will treat '\n' as eol, # but as we use bash as SHELL, and built-in echo in bash requires '-e' flag for that. 
RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \ - echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc && \ + echo -e "retry_connrefused=on\ntimeout=15\ntries=5\nretry-on-host-error=on\n" > /root/.wgetrc && \ echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc COPY build_tools/patches/pgcopydbv017.patch /pgcopydbv017.patch @@ -58,7 +81,7 @@ RUN if [ "${DEBIAN_VERSION}" = "bookworm" ]; then \ mkdir -p mkdir -p /pgcopydb/lib && touch /pgcopydb/lib/libpq.so.5; \ fi -FROM debian:${DEBIAN_VERSION}-slim AS build_tools +FROM $BASE_IMAGE_SHA AS build_tools ARG DEBIAN_VERSION # Add nonroot user @@ -75,7 +98,7 @@ COPY --from=pgcopydb_builder /usr/lib/postgresql/16/bin/pgcopydb /pgcopydb/bin/p COPY --from=pgcopydb_builder /pgcopydb/lib/libpq.so.5 /pgcopydb/lib/libpq.so.5 RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \ - echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc && \ + echo -e "retry_connrefused=on\ntimeout=15\ntries=5\nretry-on-host-error=on\n" > /root/.wgetrc && \ echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc # System deps @@ -138,7 +161,8 @@ RUN curl -fsSL \ --output sql_exporter.tar.gz \ && mkdir /tmp/sql_exporter \ && tar xzvf sql_exporter.tar.gz -C /tmp/sql_exporter --strip-components=1 \ - && mv /tmp/sql_exporter/sql_exporter /usr/local/bin/sql_exporter + && mv /tmp/sql_exporter/sql_exporter /usr/local/bin/sql_exporter \ + && rm sql_exporter.tar.gz # protobuf-compiler (protoc) ENV PROTOC_VERSION=25.1 @@ -276,6 +300,7 @@ ARG CARGO_HAKARI_VERSION=0.9.33 ARG CARGO_DENY_VERSION=0.16.2 ARG CARGO_HACK_VERSION=0.6.33 ARG CARGO_NEXTEST_VERSION=0.9.85 +ARG CARGO_CHEF_VERSION=0.1.71 ARG CARGO_DIESEL_CLI_VERSION=2.2.6 RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \ chmod +x rustup-init && \ @@ -290,6 +315,7 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux cargo install cargo-deny --locked --version ${CARGO_DENY_VERSION} && \ cargo install cargo-hack --version ${CARGO_HACK_VERSION} && \ cargo install cargo-nextest --version ${CARGO_NEXTEST_VERSION} && \ + cargo install cargo-chef --locked --version ${CARGO_CHEF_VERSION} && \ cargo install diesel_cli --version ${CARGO_DIESEL_CLI_VERSION} \ --features postgres-bundled --no-default-features && \ rm -rf /home/nonroot/.cargo/registry && \ diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 43910f2622..0491abe965 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -83,7 +83,28 @@ ARG TAG=pinned ARG BUILD_TAG ARG DEBIAN_VERSION=bookworm ARG DEBIAN_FLAVOR=${DEBIAN_VERSION}-slim -ARG ALPINE_CURL_VERSION=8.11.1 + +# Here are the INDEX DIGESTS for the images we use. +# You can get them following next steps for now: +# 1. Get an authentication token from DockerHub: +# TOKEN=$(curl -s "https://auth.docker.io/token?service=registry.docker.io&scope=repository:library/debian:pull" | jq -r .token) +# 2. Using that token, query index for the given tag: +# curl -s -H "Authorization: Bearer $TOKEN" \ +# -H "Accept: application/vnd.docker.distribution.manifest.list.v2+json" \ +# "https://registry.hub.docker.com/v2/library/debian/manifests/bullseye-slim" \ +# -I | grep -i docker-content-digest +# 3. 
As a next step, TODO(fedordikarev): create script and schedule workflow to run these checks +# and updates on regular bases and in automated way. +ARG BOOKWORM_SLIM_SHA=sha256:40b107342c492725bc7aacbe93a49945445191ae364184a6d24fedb28172f6f7 +ARG BULLSEYE_SLIM_SHA=sha256:e831d9a884d63734fe3dd9c491ed9a5a3d4c6a6d32c5b14f2067357c49b0b7e1 + +# Here we use ${var/search/replace} syntax, to check +# if base image is one of the images, we pin image index for. +# If var will match one the known images, we will replace it with the known sha. +# If no match, than value will be unaffected, and will process with no-pinned image. +ARG BASE_IMAGE_SHA=debian:${DEBIAN_FLAVOR} +ARG BASE_IMAGE_SHA=${BASE_IMAGE_SHA/debian:bookworm-slim/debian@$BOOKWORM_SLIM_SHA} +ARG BASE_IMAGE_SHA=${BASE_IMAGE_SHA/debian:bullseye-slim/debian@$BULLSEYE_SLIM_SHA} # By default, build all PostgreSQL extensions. For quick local testing when you don't # care about the extensions, pass EXTENSIONS=none or EXTENSIONS=minimal @@ -94,7 +115,7 @@ ARG EXTENSIONS=all # Layer "build-deps" # ######################################################################################### -FROM debian:$DEBIAN_FLAVOR AS build-deps +FROM $BASE_IMAGE_SHA AS build-deps ARG DEBIAN_VERSION # Use strict mode for bash to catch errors early @@ -103,7 +124,7 @@ SHELL ["/bin/bash", "-euo", "pipefail", "-c"] # By default, /bin/sh used in debian images will treat '\n' as eol, # but as we use bash as SHELL, and built-in echo in bash requires '-e' flag for that. RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \ - echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc && \ + echo -e "retry_connrefused = on\ntimeout=15\ntries=5\nretry-on-host-error=on\n" > /root/.wgetrc && \ echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc RUN case $DEBIAN_VERSION in \ @@ -127,7 +148,7 @@ RUN case $DEBIAN_VERSION in \ apt install --no-install-recommends --no-install-suggests -y \ ninja-build git autoconf automake libtool build-essential bison flex libreadline-dev \ zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget ca-certificates pkg-config libssl-dev \ - libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd \ + libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd curl unzip g++ \ $VERSION_INSTALLS \ && apt clean && rm -rf /var/lib/apt/lists/* @@ -139,11 +160,11 @@ RUN case $DEBIAN_VERSION in \ ######################################################################################### FROM build-deps AS pg-build ARG PG_VERSION -COPY vendor/postgres-${PG_VERSION} postgres +COPY vendor/postgres-${PG_VERSION:?} postgres RUN cd postgres && \ export CONFIGURE_CMD="./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp \ --with-icu --with-libxml --with-libxslt --with-lz4" && \ - if [ "${PG_VERSION}" != "v14" ]; then \ + if [ "${PG_VERSION:?}" != "v14" ]; then \ # zstd is available only from PG15 export CONFIGURE_CMD="${CONFIGURE_CMD} --with-zstd"; \ fi && \ @@ -237,7 +258,7 @@ RUN case "${DEBIAN_VERSION}" in \ # Postgis 3.5.0 supports v17 WORKDIR /ext-src -RUN case "${PG_VERSION}" in \ +RUN case "${PG_VERSION:?}" in \ "v17") \ export POSTGIS_VERSION=3.5.0 \ export POSTGIS_CHECKSUM=ca698a22cc2b2b3467ac4e063b43a28413f3004ddd505bdccdd74c56a647f510 \ @@ -312,7 +333,7 @@ FROM build-deps AS pgrouting-src ARG DEBIAN_VERSION ARG PG_VERSION WORKDIR /ext-src -RUN case "${PG_VERSION}" in \ +RUN case "${PG_VERSION:?}" in \ "v17") \ export PGROUTING_VERSION=3.6.2 \ export 
PGROUTING_CHECKSUM=f4a1ed79d6f714e52548eca3bb8e5593c6745f1bde92eb5fb858efd8984dffa2 \ @@ -358,7 +379,7 @@ COPY compute/patches/plv8-3.1.10.patch . # # Use new version only for v17 # because since v3.2, plv8 doesn't include plcoffee and plls extensions -RUN case "${PG_VERSION}" in \ +RUN case "${PG_VERSION:?}" in \ "v17") \ export PLV8_TAG=v3.2.3 \ ;; \ @@ -372,7 +393,7 @@ RUN case "${PG_VERSION}" in \ git clone --recurse-submodules --depth 1 --branch ${PLV8_TAG} https://github.com/plv8/plv8.git plv8-src && \ tar -czf plv8.tar.gz --exclude .git plv8-src && \ cd plv8-src && \ - if [[ "${PG_VERSION}" < "v17" ]]; then patch -p1 < /ext-src/plv8-3.1.10.patch; fi + if [[ "${PG_VERSION:?}" < "v17" ]]; then patch -p1 < /ext-src/plv8-3.1.10.patch; fi FROM pg-build AS plv8-build ARG PG_VERSION @@ -392,7 +413,7 @@ RUN \ find /usr/local/pgsql/ -name "plv8-*.so" | xargs strip && \ # don't break computes with installed old version of plv8 cd /usr/local/pgsql/lib/ && \ - case "${PG_VERSION}" in \ + case "${PG_VERSION:?}" in \ "v17") \ ln -s plv8-3.2.3.so plv8-3.1.8.so && \ ln -s plv8-3.2.3.so plv8-3.1.5.so && \ @@ -729,7 +750,7 @@ FROM build-deps AS timescaledb-src ARG PG_VERSION WORKDIR /ext-src -RUN case "${PG_VERSION}" in \ +RUN case "${PG_VERSION:?}" in \ "v14" | "v15") \ export TIMESCALEDB_VERSION=2.10.1 \ export TIMESCALEDB_CHECKSUM=6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 \ @@ -767,7 +788,7 @@ ARG PG_VERSION # version-specific, has separate releases for each version WORKDIR /ext-src -RUN case "${PG_VERSION}" in \ +RUN case "${PG_VERSION:?}" in \ "v14") \ export PG_HINT_PLAN_VERSION=14_1_4_1 \ export PG_HINT_PLAN_CHECKSUM=c3501becf70ead27f70626bce80ea401ceac6a77e2083ee5f3ff1f1444ec1ad1 \ @@ -843,7 +864,7 @@ ARG PG_VERSION # https://github.com/rdkit/rdkit/releases/tag/Release_2024_09_1 WORKDIR /ext-src -RUN case "${PG_VERSION}" in \ +RUN case "${PG_VERSION:?}" in \ "v17") \ export RDKIT_VERSION=Release_2024_09_1 \ export RDKIT_CHECKSUM=034c00d6e9de323506834da03400761ed8c3721095114369d06805409747a60f \ @@ -970,7 +991,7 @@ ARG PG_VERSION # # last release v0.40.0 - Jul 22, 2024 WORKDIR /ext-src -RUN case "${PG_VERSION}" in \ +RUN case "${PG_VERSION:?}" in \ "v17") \ export SEMVER_VERSION=0.40.0 \ export SEMVER_CHECKSUM=3e50bcc29a0e2e481e7b6d2bc937cadc5f5869f55d983b5a1aafeb49f5425cfc \ @@ -1006,7 +1027,7 @@ ARG PG_VERSION # This is our extension, support stopped in favor of pgvector # TODO: deprecate it WORKDIR /ext-src -RUN case "${PG_VERSION}" in \ +RUN case "${PG_VERSION:?}" in \ "v14" | "v15") \ export PG_EMBEDDING_VERSION=0.3.5 \ export PG_EMBEDDING_CHECKSUM=0e95b27b8b6196e2cf0a0c9ec143fe2219b82e54c5bb4ee064e76398cbe69ae9 \ @@ -1039,7 +1060,7 @@ ARG PG_VERSION # This is an experimental extension, never got to real production. # !Do not remove! It can be present in shared_preload_libraries and compute will fail to start if library is not found. WORKDIR /ext-src -RUN case "${PG_VERSION}" in "v17") \ +RUN case "${PG_VERSION:?}" in "v17") \ echo "postgresql_anonymizer does not yet support PG17" && exit 0;; \ esac && \ wget https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \ @@ -1091,7 +1112,7 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux FROM pg-build-nonroot-with-cargo AS rust-extensions-build ARG PG_VERSION -RUN case "${PG_VERSION}" in \ +RUN case "${PG_VERSION:?}" in \ 'v17') \ echo 'v17 is not supported yet by pgrx. 
Quit' && exit 0;; \ esac && \ @@ -1270,7 +1291,7 @@ FROM build-deps AS pgx_ulid-src ARG PG_VERSION WORKDIR /ext-src -RUN case "${PG_VERSION}" in \ +RUN case "${PG_VERSION:?}" in \ "v14" | "v15" | "v16") \ ;; \ *) \ @@ -1302,7 +1323,7 @@ FROM build-deps AS pgx_ulid-pgrx12-src ARG PG_VERSION WORKDIR /ext-src -RUN case "${PG_VERSION}" in \ +RUN case "${PG_VERSION:?}" in \ "v17") \ ;; \ *) \ @@ -1430,8 +1451,8 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) && \ FROM build-deps AS pg_mooncake-src ARG PG_VERSION WORKDIR /ext-src -RUN wget https://github.com/Mooncake-Labs/pg_mooncake/releases/download/v0.1.1/pg_mooncake-0.1.1.tar.gz -O pg_mooncake.tar.gz && \ - echo "a2d16eff7948dde64f072609ca5d2962d6b4d07cb89d45952add473529c55f55 pg_mooncake.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/Mooncake-Labs/pg_mooncake/releases/download/v0.1.2/pg_mooncake-0.1.2.tar.gz -O pg_mooncake.tar.gz && \ + echo "4550473784fcdd2e1e18062bc01eb9c286abd27cdf5e11a4399be6c0a426ba90 pg_mooncake.tar.gz" | sha256sum --check && \ mkdir pg_mooncake-src && cd pg_mooncake-src && tar xzf ../pg_mooncake.tar.gz --strip-components=1 -C . && \ echo "make -f pg_mooncake-src/Makefile.build installcheck TEST_DIR=./test SQL_DIR=./sql SRC_DIR=./src" > neon-test.sh && \ chmod a+x neon-test.sh @@ -1443,6 +1464,31 @@ RUN make release -j $(getconf _NPROCESSORS_ONLN) && \ make install -j $(getconf _NPROCESSORS_ONLN) && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_mooncake.control +######################################################################################### +# +# Layer "pg-duckdb-pg-build" +# compile pg_duckdb extension +# +######################################################################################### +FROM build-deps AS pg_duckdb-src +WORKDIR /ext-src +COPY compute/patches/pg_duckdb_v031.patch . 
+# The pg_duckdb build requires the source dir to be a git repo so that submodules can be fetched. +# The patch below allows neon_superuser to execute some functions that are superuser-only in upstream pg_duckdb: +# - the extension management function duckdb.install_extension() +# - access to the duckdb.extensions table and its sequence +RUN git clone --depth 1 --branch v0.3.1 https://github.com/duckdb/pg_duckdb.git pg_duckdb-src && \ + cd pg_duckdb-src && \ + git submodule update --init --recursive && \ + patch -p1 < /ext-src/pg_duckdb_v031.patch + +FROM pg-build AS pg_duckdb-build +ARG PG_VERSION +COPY --from=pg_duckdb-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_duckdb-src +RUN make install -j $(getconf _NPROCESSORS_ONLN) && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_duckdb.control + ######################################################################################### # # Layer "pg_repack" @@ -1463,6 +1509,73 @@ WORKDIR /ext-src/pg_repack-src RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install + +######################################################################################### +# +# Layer "pgaudit" +# compile pgaudit extension +# +######################################################################################### + +FROM build-deps AS pgaudit-src +ARG PG_VERSION +WORKDIR /ext-src +RUN case "${PG_VERSION}" in \ + "v14") \ + export PGAUDIT_VERSION=1.6.2 \ + export PGAUDIT_CHECKSUM=1f350d70a0cbf488c0f2b485e3a5c9b11f78ad9e3cbb95ef6904afa1eb3187eb \ + ;; \ + "v15") \ + export PGAUDIT_VERSION=1.7.0 \ + export PGAUDIT_CHECKSUM=8f4a73e451c88c567e516e6cba7dc1e23bc91686bb6f1f77f8f3126d428a8bd8 \ + ;; \ + "v16") \ + export PGAUDIT_VERSION=16.0 \ + export PGAUDIT_CHECKSUM=d53ef985f2d0b15ba25c512c4ce967dce07b94fd4422c95bd04c4c1a055fe738 \ + ;; \ + "v17") \ + export PGAUDIT_VERSION=17.0 \ + export PGAUDIT_CHECKSUM=7d0d08d030275d525f36cd48b38c6455f1023da863385badff0cec44965bfd8c \ + ;; \ + *) \ + echo "pgaudit is not supported on this PostgreSQL version" && exit 1;; \ + esac && \ + wget https://github.com/pgaudit/pgaudit/archive/refs/tags/${PGAUDIT_VERSION}.tar.gz -O pgaudit.tar.gz && \ + echo "${PGAUDIT_CHECKSUM} pgaudit.tar.gz" | sha256sum --check && \ + mkdir pgaudit-src && cd pgaudit-src && tar xzf ../pgaudit.tar.gz --strip-components=1 -C . + +FROM pg-build AS pgaudit-build +COPY --from=pgaudit-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pgaudit-src +RUN make install USE_PGXS=1 -j $(getconf _NPROCESSORS_ONLN) + +######################################################################################### +# +# Layer "pgauditlogtofile" +# compile pgauditlogtofile extension +# +######################################################################################### + +FROM build-deps AS pgauditlogtofile-src +ARG PG_VERSION +WORKDIR /ext-src +RUN case "${PG_VERSION}" in \ + "v14" | "v15" | "v16" | "v17") \ + export PGAUDITLOGTOFILE_VERSION=v1.6.4 \ + export PGAUDITLOGTOFILE_CHECKSUM=ef801eb09c26aaa935c0dabd92c81eb9ebe338930daa9674d420a280c6bc2d70 \ + ;; \ + *) \ + echo "pgauditlogtofile is not supported on this PostgreSQL version" && exit 1;; \ + esac && \ + wget https://github.com/fmbiete/pgauditlogtofile/archive/refs/tags/${PGAUDITLOGTOFILE_VERSION}.tar.gz -O pgauditlogtofile.tar.gz && \ + echo "${PGAUDITLOGTOFILE_CHECKSUM} pgauditlogtofile.tar.gz" | sha256sum --check && \ + mkdir pgauditlogtofile-src && cd pgauditlogtofile-src && tar xzf ../pgauditlogtofile.tar.gz --strip-components=1 -C . 
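Like the other extension source stages in this file, the two pgaudit stages above pin an exact release per PostgreSQL version and reject the download on a checksum mismatch. For illustration only, here is a minimal Rust sketch of that same pin-and-verify step (it assumes the `sha2`, `hex`, and `anyhow` crates; the expected value is the v17 pgaudit checksum from the stage above):

```rust
use sha2::{Digest, Sha256};

/// Reject an artifact whose SHA-256 doesn't match the pinned value,
/// mirroring the `sha256sum --check` step in the Dockerfile stages.
fn verify_sha256(bytes: &[u8], expected_hex: &str) -> anyhow::Result<()> {
    let actual_hex = hex::encode(Sha256::digest(bytes));
    anyhow::ensure!(
        actual_hex == expected_hex,
        "checksum mismatch: expected {expected_hex}, got {actual_hex}"
    );
    Ok(())
}

fn main() -> anyhow::Result<()> {
    let bytes = std::fs::read("pgaudit.tar.gz")?;
    verify_sha256(
        &bytes,
        "7d0d08d030275d525f36cd48b38c6455f1023da863385badff0cec44965bfd8c",
    )
}
```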
+ +FROM pg-build AS pgauditlogtofile-build +COPY --from=pgauditlogtofile-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pgauditlogtofile-src +RUN make install USE_PGXS=1 -j $(getconf _NPROCESSORS_ONLN) + ######################################################################################### # # Layer "neon-ext-build" @@ -1556,7 +1669,10 @@ COPY --from=pg_anon-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_ivm-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_partman-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_mooncake-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_duckdb-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_repack-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pgaudit-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pgauditlogtofile-build /usr/local/pgsql/ /usr/local/pgsql/ ######################################################################################### # @@ -1578,7 +1694,15 @@ ENV BUILD_TAG=$BUILD_TAG USER nonroot # Copy entire project to get Cargo.* files with proper dependencies for the whole project COPY --chown=nonroot . . -RUN mold -run cargo build --locked --profile release-line-debug-size-lto --bin compute_ctl --bin fast_import --bin local_proxy +RUN --mount=type=cache,uid=1000,target=/home/nonroot/.cargo/registry \ + --mount=type=cache,uid=1000,target=/home/nonroot/.cargo/git \ + --mount=type=cache,uid=1000,target=/home/nonroot/target \ + mold -run cargo build --locked --profile release-line-debug-size-lto --bin compute_ctl --bin fast_import --bin local_proxy && \ + mkdir target-bin && \ + cp target/release-line-debug-size-lto/compute_ctl \ + target/release-line-debug-size-lto/fast_import \ + target/release-line-debug-size-lto/local_proxy \ + target-bin ######################################################################################### # @@ -1586,7 +1710,7 @@ RUN mold -run cargo build --locked --profile release-line-debug-size-lto --bin c # ######################################################################################### -FROM debian:$DEBIAN_FLAVOR AS pgbouncer +FROM $BASE_IMAGE_SHA AS pgbouncer RUN set -e \ && echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries \ && apt update \ @@ -1607,7 +1731,7 @@ RUN set -e \ && git clone --recurse-submodules --depth 1 --branch ${PGBOUNCER_TAG} https://github.com/pgbouncer/pgbouncer.git pgbouncer \ && cd pgbouncer \ && ./autogen.sh \ - && LDFLAGS=-static ./configure --prefix=/usr/local/pgbouncer --without-openssl \ + && ./configure --prefix=/usr/local/pgbouncer --without-openssl \ && make -j $(nproc) dist_man_MANS= \ && make install dist_man_MANS= @@ -1616,13 +1740,12 @@ RUN set -e \ # Layer "exporters" # ######################################################################################### -FROM alpine/curl:${ALPINE_CURL_VERSION} AS exporters +FROM build-deps AS exporters ARG TARGETARCH # Keep sql_exporter version same as in build-tools.Dockerfile and # test_runner/regress/test_compute_metrics.py # See the comment at the top of the file regarding `echo`, `-e` and `\n` -RUN echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc; \ - if [ "$TARGETARCH" = "amd64" ]; then\ +RUN if [ "$TARGETARCH" = "amd64" ]; then\ postgres_exporter_sha256='027e75dda7af621237ff8f5ac66b78a40b0093595f06768612b92b1374bd3105';\ pgbouncer_exporter_sha256='c9f7cf8dcff44f0472057e9bf52613d93f3ffbc381ad7547a959daa63c5e84ac';\ sql_exporter_sha256='38e439732bbf6e28ca4a94d7bc3686d3fa1abdb0050773d5617a9efdb9e64d08';\ @@ -1673,7 
+1796,7 @@ USER nonroot COPY --chown=nonroot compute compute -RUN make PG_VERSION="${PG_VERSION}" -C compute +RUN make PG_VERSION="${PG_VERSION:?}" -C compute ######################################################################################### # @@ -1699,15 +1822,15 @@ COPY --from=pg_graphql-src /ext-src/ /ext-src/ COPY --from=hypopg-src /ext-src/ /ext-src/ COPY --from=pg_hashids-src /ext-src/ /ext-src/ COPY --from=rum-src /ext-src/ /ext-src/ -#COPY --from=pgtap-src /ext-src/ /ext-src/ +COPY --from=pgtap-src /ext-src/ /ext-src/ COPY --from=ip4r-src /ext-src/ /ext-src/ COPY --from=prefix-src /ext-src/ /ext-src/ COPY --from=hll-src /ext-src/ /ext-src/ COPY --from=plpgsql_check-src /ext-src/ /ext-src/ #COPY --from=timescaledb-src /ext-src/ /ext-src/ COPY --from=pg_hint_plan-src /ext-src/ /ext-src/ -COPY compute/patches/pg_hint_plan_${PG_VERSION}.patch /ext-src -RUN cd /ext-src/pg_hint_plan-src && patch -p1 < /ext-src/pg_hint_plan_${PG_VERSION}.patch +COPY compute/patches/pg_hint_plan_${PG_VERSION:?}.patch /ext-src +RUN cd /ext-src/pg_hint_plan-src && patch -p1 < /ext-src/pg_hint_plan_${PG_VERSION:?}.patch COPY --from=pg_cron-src /ext-src/ /ext-src/ #COPY --from=pgx_ulid-src /ext-src/ /ext-src/ #COPY --from=pgx_ulid-pgrx12-src /ext-src/ /ext-src/ @@ -1736,51 +1859,12 @@ ENV PGDATABASE=postgres # Put it all together into the final image # ######################################################################################### -FROM debian:$DEBIAN_FLAVOR +FROM $BASE_IMAGE_SHA ARG DEBIAN_VERSION # Use strict mode for bash to catch errors early SHELL ["/bin/bash", "-euo", "pipefail", "-c"] -# Add user postgres -RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ - echo "postgres:test_console_pass" | chpasswd && \ - mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \ - mkdir /var/db/postgres/pgbouncer && \ - chown -R postgres:postgres /var/db/postgres && \ - chmod 0750 /var/db/postgres/compute && \ - chmod 0750 /var/db/postgres/pgbouncer && \ - echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig && \ - # create folder for file cache - mkdir -p -m 777 /neon/cache - -COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local -COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl -COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/fast_import /usr/local/bin/fast_import - -# pgbouncer and its config -COPY --from=pgbouncer /usr/local/pgbouncer/bin/pgbouncer /usr/local/bin/pgbouncer -COPY --chmod=0666 --chown=postgres compute/etc/pgbouncer.ini /etc/pgbouncer.ini - -# local_proxy and its config -COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/local_proxy /usr/local/bin/local_proxy -RUN mkdir -p /etc/local_proxy && chown postgres:postgres /etc/local_proxy - -# Metrics exporter binaries and configuration files -COPY --from=exporters ./postgres_exporter /bin/postgres_exporter -COPY --from=exporters ./pgbouncer_exporter /bin/pgbouncer_exporter -COPY --from=exporters ./sql_exporter /bin/sql_exporter - -COPY --chown=postgres compute/etc/postgres_exporter.yml /etc/postgres_exporter.yml - -COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/sql_exporter.yml /etc/sql_exporter.yml -COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neon_collector.yml /etc/neon_collector.yml -COPY --from=sql_exporter_preprocessor --chmod=0644 
/home/nonroot/compute/etc/sql_exporter_autoscaling.yml /etc/sql_exporter_autoscaling.yml -COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neon_collector_autoscaling.yml /etc/neon_collector_autoscaling.yml - -# Create remote extension download directory -RUN mkdir /usr/local/download_extensions && chown -R postgres:postgres /usr/local/download_extensions - # Install: # libreadline8 for psql # liblz4-1 for lz4 @@ -1790,10 +1874,9 @@ RUN mkdir /usr/local/download_extensions && chown -R postgres:postgres /usr/loca # libzstd1 for zstd # libboost* for rdkit # ca-certificates for communicating with s3 by compute_ctl - +# libevent for pgbouncer RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \ echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc - RUN apt update && \ case $DEBIAN_VERSION in \ # Version-specific installs for Bullseye (PG14-PG16): @@ -1828,33 +1911,54 @@ RUN apt update && \ libxslt1.1 \ libzstd1 \ libcurl4 \ + libevent-2.1-7 \ locales \ procps \ ca-certificates \ - curl \ - unzip \ $VERSION_INSTALLS && \ apt clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8 -# aws cli is used by fast_import (curl and unzip above are at this time only used for this installation step) -ARG TARGETARCH -RUN set -ex; \ - if [ "${TARGETARCH}" = "amd64" ]; then \ - TARGETARCH_ALT="x86_64"; \ - CHECKSUM="c9a9df3770a3ff9259cb469b6179e02829687a464e0824d5c32d378820b53a00"; \ - elif [ "${TARGETARCH}" = "arm64" ]; then \ - TARGETARCH_ALT="aarch64"; \ - CHECKSUM="8181730be7891582b38b028112e81b4899ca817e8c616aad807c9e9d1289223a"; \ - else \ - echo "Unsupported architecture: ${TARGETARCH}"; exit 1; \ - fi; \ - curl --retry 5 -L "https://awscli.amazonaws.com/awscli-exe-linux-${TARGETARCH_ALT}-2.17.5.zip" -o /tmp/awscliv2.zip; \ - echo "${CHECKSUM} /tmp/awscliv2.zip" | sha256sum -c -; \ - unzip /tmp/awscliv2.zip -d /tmp/awscliv2; \ - /tmp/awscliv2/aws/install; \ - rm -rf /tmp/awscliv2.zip /tmp/awscliv2; \ - true +# Add user postgres +RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ + echo "postgres:test_console_pass" | chpasswd && \ + mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \ + mkdir /var/db/postgres/pgbouncer && \ + chown -R postgres:postgres /var/db/postgres && \ + chmod 0750 /var/db/postgres/compute && \ + chmod 0750 /var/db/postgres/pgbouncer && \ + # create folder for file cache + mkdir -p -m 777 /neon/cache && \ + # Create remote extension download directory + mkdir /usr/local/download_extensions && \ + chown -R postgres:postgres /usr/local/download_extensions + +# pgbouncer and its config +COPY --from=pgbouncer /usr/local/pgbouncer/bin/pgbouncer /usr/local/bin/pgbouncer +COPY --chmod=0666 --chown=postgres compute/etc/pgbouncer.ini /etc/pgbouncer.ini + +COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local +COPY --from=compute-tools --chown=postgres /home/nonroot/target-bin/compute_ctl /usr/local/bin/compute_ctl +COPY --from=compute-tools --chown=postgres /home/nonroot/target-bin/fast_import /usr/local/bin/fast_import + +# local_proxy and its config +COPY --from=compute-tools --chown=postgres /home/nonroot/target-bin/local_proxy /usr/local/bin/local_proxy +RUN mkdir -p /etc/local_proxy && chown postgres:postgres /etc/local_proxy + +# Metrics exporter binaries and configuration files +COPY --from=exporters ./postgres_exporter /bin/postgres_exporter +COPY --from=exporters ./pgbouncer_exporter 
/bin/pgbouncer_exporter +COPY --from=exporters ./sql_exporter /bin/sql_exporter + +COPY --chown=postgres compute/etc/postgres_exporter.yml /etc/postgres_exporter.yml + +COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/sql_exporter.yml /etc/sql_exporter.yml +COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neon_collector.yml /etc/neon_collector.yml +COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/sql_exporter_autoscaling.yml /etc/sql_exporter_autoscaling.yml +COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neon_collector_autoscaling.yml /etc/neon_collector_autoscaling.yml + +# Make the libraries we built available +RUN echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig ENV LANG=en_US.utf8 USER postgres diff --git a/compute/patches/pg_duckdb_v031.patch b/compute/patches/pg_duckdb_v031.patch new file mode 100644 index 0000000000..a7e188d69e --- /dev/null +++ b/compute/patches/pg_duckdb_v031.patch @@ -0,0 +1,11 @@ +diff --git a/sql/pg_duckdb--0.2.0--0.3.0.sql b/sql/pg_duckdb--0.2.0--0.3.0.sql +index d777d76..af60106 100644 +--- a/sql/pg_duckdb--0.2.0--0.3.0.sql ++++ b/sql/pg_duckdb--0.2.0--0.3.0.sql +@@ -1056,3 +1056,6 @@ GRANT ALL ON FUNCTION duckdb.cache(TEXT, TEXT) TO PUBLIC; + GRANT ALL ON FUNCTION duckdb.cache_info() TO PUBLIC; + GRANT ALL ON FUNCTION duckdb.cache_delete(TEXT) TO PUBLIC; + GRANT ALL ON PROCEDURE duckdb.recycle_ddb() TO PUBLIC; ++GRANT ALL ON FUNCTION duckdb.install_extension(TEXT) TO neon_superuser; ++GRANT ALL ON TABLE duckdb.extensions TO neon_superuser; ++GRANT ALL ON SEQUENCE duckdb.extensions_table_seq TO neon_superuser; diff --git a/compute/patches/pg_hint_plan_v16.patch b/compute/patches/pg_hint_plan_v16.patch index 4039a036df..1fc3ffa609 100644 --- a/compute/patches/pg_hint_plan_v16.patch +++ b/compute/patches/pg_hint_plan_v16.patch @@ -6,16 +6,16 @@ index da723b8..5328114 100644 ---- -- No.A-1-1-3 CREATE EXTENSION pg_hint_plan; -+LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan ++LOG: Sending request to compute_ctl: http://localhost:3081/extension_server/pg_hint_plan -- No.A-1-2-3 DROP EXTENSION pg_hint_plan; -- No.A-1-1-4 CREATE SCHEMA other_schema; CREATE EXTENSION pg_hint_plan SCHEMA other_schema; -+LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan ++LOG: Sending request to compute_ctl: http://localhost:3081/extension_server/pg_hint_plan ERROR: extension "pg_hint_plan" must be installed in schema "hint_plan" CREATE EXTENSION pg_hint_plan; -+LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan ++LOG: Sending request to compute_ctl: http://localhost:3081/extension_server/pg_hint_plan DROP SCHEMA other_schema; ---- ---- No. 
A-5-1 comment pattern @@ -35,7 +35,7 @@ index d372459..6282afe 100644 SET client_min_messages TO LOG; SET pg_hint_plan.enable_hint TO on; CREATE EXTENSION file_fdw; -+LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/file_fdw ++LOG: Sending request to compute_ctl: http://localhost:3081/extension_server/file_fdw CREATE SERVER file_server FOREIGN DATA WRAPPER file_fdw; CREATE USER MAPPING FOR PUBLIC SERVER file_server; CREATE FOREIGN TABLE ft1 (id int, val int) SERVER file_server OPTIONS (format 'csv', filename :'filename'); diff --git a/compute/patches/pg_hint_plan_v17.patch b/compute/patches/pg_hint_plan_v17.patch index dbf4e470ea..3442a094eb 100644 --- a/compute/patches/pg_hint_plan_v17.patch +++ b/compute/patches/pg_hint_plan_v17.patch @@ -6,16 +6,16 @@ index e7d68a1..65a056c 100644 ---- -- No.A-1-1-3 CREATE EXTENSION pg_hint_plan; -+LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan ++LOG: Sending request to compute_ctl: http://localhost:3081/extension_server/pg_hint_plan -- No.A-1-2-3 DROP EXTENSION pg_hint_plan; -- No.A-1-1-4 CREATE SCHEMA other_schema; CREATE EXTENSION pg_hint_plan SCHEMA other_schema; -+LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan ++LOG: Sending request to compute_ctl: http://localhost:3081/extension_server/pg_hint_plan ERROR: extension "pg_hint_plan" must be installed in schema "hint_plan" CREATE EXTENSION pg_hint_plan; -+LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan ++LOG: Sending request to compute_ctl: http://localhost:3081/extension_server/pg_hint_plan DROP SCHEMA other_schema; ---- ---- No. A-5-1 comment pattern @@ -168,7 +168,7 @@ index 017fa4b..98d989b 100644 SET client_min_messages TO LOG; SET pg_hint_plan.enable_hint TO on; CREATE EXTENSION file_fdw; -+LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/file_fdw ++LOG: Sending request to compute_ctl: http://localhost:3081/extension_server/file_fdw CREATE SERVER file_server FOREIGN DATA WRAPPER file_fdw; CREATE USER MAPPING FOR PUBLIC SERVER file_server; CREATE FOREIGN TABLE ft1 (id int, val int) SERVER file_server OPTIONS (format 'csv', filename :'filename'); diff --git a/compute/vm-image-spec-bookworm.yaml b/compute/vm-image-spec-bookworm.yaml index 005143fff3..568f0b0444 100644 --- a/compute/vm-image-spec-bookworm.yaml +++ b/compute/vm-image-spec-bookworm.yaml @@ -47,7 +47,9 @@ files: # Allow postgres user (which is what compute_ctl runs as) to run /neonvm/bin/resize-swap # and /neonvm/bin/set-disk-quota as root without requiring entering a password (NOPASSWD), # regardless of hostname (ALL) - postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota + # + # Also allow it to shut down the VM. The fast_import job does that when it's finished. + postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota, /neonvm/bin/poweroff - filename: cgconfig.conf content: | # Configuration for cgroups in VM compute nodes @@ -72,8 +74,8 @@ build: | # At time of migration to bookworm (2024-10-09), debian has a version of libcgroup/cgroup-tools 2.0.2, # and it _probably_ can be used as-is. However, we'll build it ourselves to minimise the changeset # for debian version migration. 
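The digest-pinned `FROM debian@sha256:...` lines introduced just below (and in compute-node.Dockerfile above) replace a moving tag with an immutable index digest. For illustration, here is the same two-step Docker Hub lookup that the compute-node.Dockerfile comment sketches with curl, written as a hedged Rust sketch (assuming the `reqwest` crate with its `json` feature, plus `serde_json`, `tokio`, and `anyhow`):

```rust
use anyhow::Context;

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let client = reqwest::Client::new();

    // 1. Get an anonymous pull token for library/debian.
    let token: serde_json::Value = client
        .get("https://auth.docker.io/token?service=registry.docker.io&scope=repository:library/debian:pull")
        .send().await?
        .json().await?;
    let token = token["token"].as_str().context("no token in response")?;

    // 2. HEAD the manifest list for the tag; the index digest comes back
    //    in the `docker-content-digest` response header.
    let resp = client
        .head("https://registry.hub.docker.com/v2/library/debian/manifests/bookworm-slim")
        .bearer_auth(token)
        .header("Accept", "application/vnd.docker.distribution.manifest.list.v2+json")
        .send().await?;
    let digest = resp
        .headers()
        .get("docker-content-digest")
        .context("no digest header")?;

    println!("debian@{}", digest.to_str()?);
    Ok(())
}
```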
- # - FROM debian:bookworm-slim as libcgroup-builder + ARG BOOKWORM_SLIM_SHA=sha256:40b107342c492725bc7aacbe93a49945445191ae364184a6d24fedb28172f6f7 + FROM debian@$BOOKWORM_SLIM_SHA as libcgroup-builder ENV LIBCGROUP_VERSION=v2.0.3 RUN set -exu \ diff --git a/compute/vm-image-spec-bullseye.yaml b/compute/vm-image-spec-bullseye.yaml index 2fe50c3a45..6617c98599 100644 --- a/compute/vm-image-spec-bullseye.yaml +++ b/compute/vm-image-spec-bullseye.yaml @@ -47,7 +47,9 @@ files: # Allow postgres user (which is what compute_ctl runs as) to run /neonvm/bin/resize-swap # and /neonvm/bin/set-disk-quota as root without requiring entering a password (NOPASSWD), # regardless of hostname (ALL) - postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota + # + # Also allow it to shut down the VM. The fast_import job does that when it's finished. + postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota, /neonvm/bin/poweroff - filename: cgconfig.conf content: | # Configuration for cgroups in VM compute nodes @@ -68,7 +70,8 @@ build: | # At time of writing (2023-03-14), debian bullseye has a version of cgroup-tools (technically # libcgroup) that doesn't support cgroup v2 (version 0.41-11). Unfortunately, the vm-monitor # requires cgroup v2, so we'll build cgroup-tools ourselves. - FROM debian:bullseye-slim as libcgroup-builder + ARG BULLSEYE_SLIM_SHA=sha256:e831d9a884d63734fe3dd9c491ed9a5a3d4c6a6d32c5b14f2067357c49b0b7e1 + FROM debian@$BULLSEYE_SLIM_SHA as libcgroup-builder ENV LIBCGROUP_VERSION=v2.0.3 RUN set -exu \ diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index b04f364cbb..81dcf99560 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -14,6 +14,7 @@ base64.workspace = true aws-config.workspace = true aws-sdk-s3.workspace = true aws-sdk-kms.workspace = true +aws-smithy-types.workspace = true anyhow.workspace = true axum = { workspace = true, features = [] } camino.workspace = true @@ -24,6 +25,7 @@ fail.workspace = true flate2.workspace = true futures.workspace = true http.workspace = true +jsonwebtoken.workspace = true metrics.workspace = true nix.workspace = true notify.workspace = true @@ -53,6 +55,7 @@ thiserror.workspace = true url.workspace = true uuid.workspace = true prometheus.workspace = true +walkdir.workspace = true postgres_initdb.workspace = true compute_api.workspace = true diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 47fc9cb7fe..a8803ec793 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -41,19 +41,21 @@ use std::process::exit; use std::str::FromStr; use std::sync::atomic::Ordering; use std::sync::{mpsc, Arc, Condvar, Mutex, RwLock}; +use std::time::SystemTime; use std::{thread, time::Duration}; use anyhow::{Context, Result}; use chrono::Utc; use clap::Parser; use compute_tools::disk_quota::set_disk_quota; +use compute_tools::http::server::Server; use compute_tools::lsn_lease::launch_lsn_lease_bg_task_for_static; use signal_hook::consts::{SIGQUIT, SIGTERM}; use signal_hook::{consts::SIGINT, iterator::Signals}; use tracing::{error, info, warn}; use url::Url; -use compute_api::responses::ComputeStatus; +use compute_api::responses::{ComputeCtlConfig, ComputeStatus}; use compute_api::spec::ComputeSpec; use compute_tools::compute::{ @@ -61,7 +63,6 @@ use compute_tools::compute::{ }; use compute_tools::configurator::launch_configurator; use compute_tools::extension_server::get_pg_version_string; -use 
compute_tools::http::launch_http_server; use compute_tools::logger::*; use compute_tools::monitor::launch_monitor; use compute_tools::params::*; @@ -85,6 +86,19 @@ fn parse_remote_ext_config(arg: &str) -> Result { } } +/// Generate a compute ID if one is not supplied. This exists to keep forward +/// compatibility tests working, but will be removed in a future iteration. +fn generate_compute_id() -> String { + let now = SystemTime::now(); + + format!( + "compute-{}", + now.duration_since(SystemTime::UNIX_EPOCH) + .unwrap() + .as_secs() + ) +} + #[derive(Parser)] #[command(rename_all = "kebab-case")] struct Cli { @@ -94,8 +108,20 @@ struct Cli { #[arg(short = 'r', long, value_parser = parse_remote_ext_config)] pub remote_ext_config: Option, - #[arg(long, default_value_t = 3080)] - pub http_port: u16, + /// The port to bind the external listening HTTP server to. Clients running + /// outside the compute will talk to the compute through this port. Keep + /// the previous name for this argument around for a smoother release + /// with the control plane. + /// + /// TODO: Remove the alias after the control plane release which teaches the + /// control plane about the renamed argument. + #[arg(long, alias = "http-port", default_value_t = 3080)] + pub external_http_port: u16, + + /// The port to bind the internal listening HTTP server to. Clients like + /// the neon extension (for installing remote extensions) and local_proxy. + #[arg(long)] + pub internal_http_port: Option, #[arg(short = 'D', long, value_name = "DATADIR")] pub pgdata: String, @@ -130,17 +156,26 @@ struct Cli { #[arg(short = 'S', long, group = "spec-path")] pub spec_path: Option, - #[arg(short = 'i', long, group = "compute-id", conflicts_with_all = ["spec", "spec-path"])] - pub compute_id: Option, + #[arg(short = 'i', long, group = "compute-id", default_value = generate_compute_id())] + pub compute_id: String, - #[arg(short = 'p', long, conflicts_with_all = ["spec", "spec-path"], requires = "compute-id", value_name = "CONTROL_PLANE_API_BASE_URL")] + #[arg(short = 'p', long, conflicts_with_all = ["spec", "spec-path"], value_name = "CONTROL_PLANE_API_BASE_URL")] pub control_plane_uri: Option, } fn main() -> Result<()> { let cli = Cli::parse(); - let build_tag = init()?; + // For historical reasons, the main thread that processes the spec and launches postgres + // is synchronous, but we always have this tokio runtime available and we "enter" it so + // that you can use tokio::spawn() and tokio::runtime::Handle::current().block_on(...) + // from all parts of compute_ctl. 
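The comment above is the crux of this refactor: one runtime is built eagerly and entered, so the historically synchronous main thread can still spawn tasks and block on futures without juggling ad-hoc runtimes. A standalone sketch of that pattern, not compute_ctl's actual code:

```rust
fn main() -> anyhow::Result<()> {
    let runtime = tokio::runtime::Builder::new_multi_thread()
        .enable_all()
        .build()?;
    // The guard makes this runtime the "current" one for this thread...
    let _rt_guard = runtime.enter();

    // ...so purely synchronous code can spawn background tasks:
    let task = tokio::spawn(async { 21 * 2 });

    // ...and can block on async work whenever it needs a result:
    let answer = tokio::runtime::Handle::current().block_on(task)?;
    assert_eq!(answer, 42);
    Ok(())
}
```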
+ let runtime = tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build()?; + let _rt_guard = runtime.enter(); + + let build_tag = runtime.block_on(init())?; let scenario = failpoint_support::init(); @@ -172,8 +207,8 @@ fn main() -> Result<()> { deinit_and_exit(wait_pg_result); } -fn init() -> Result { - init_tracing_and_logging(DEFAULT_LOG_LEVEL)?; +async fn init() -> Result { + init_tracing_and_logging(DEFAULT_LOG_LEVEL).await?; let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?; thread::spawn(move || { @@ -246,6 +281,7 @@ fn try_spec_from_cli(cli: &Cli) -> Result { info!("got spec from cli argument {}", spec_json); return Ok(CliSpecParams { spec: Some(serde_json::from_str(spec_json)?), + compute_ctl_config: ComputeCtlConfig::default(), live_config_allowed: false, }); } @@ -255,26 +291,19 @@ fn try_spec_from_cli(cli: &Cli) -> Result { let file = File::open(Path::new(spec_path))?; return Ok(CliSpecParams { spec: Some(serde_json::from_reader(file)?), + compute_ctl_config: ComputeCtlConfig::default(), live_config_allowed: true, }); } - if cli.compute_id.is_none() { - panic!( - "compute spec should be provided by one of the following ways: \ - --spec OR --spec-path OR --control-plane-uri and --compute-id" - ); - }; if cli.control_plane_uri.is_none() { - panic!("must specify both --control-plane-uri and --compute-id or none"); + panic!("must specify --control-plane-uri"); }; - match get_spec_from_control_plane( - cli.control_plane_uri.as_ref().unwrap(), - cli.compute_id.as_ref().unwrap(), - ) { - Ok(spec) => Ok(CliSpecParams { - spec, + match get_spec_from_control_plane(cli.control_plane_uri.as_ref().unwrap(), &cli.compute_id) { + Ok(resp) => Ok(CliSpecParams { + spec: resp.0, + compute_ctl_config: resp.1, live_config_allowed: true, }), Err(e) => { @@ -291,6 +320,8 @@ fn try_spec_from_cli(cli: &Cli) -> Result { struct CliSpecParams { /// If a spec was provided via CLI or file, the [`ComputeSpec`] spec: Option, + #[allow(dead_code)] + compute_ctl_config: ComputeCtlConfig, live_config_allowed: bool, } @@ -300,6 +331,7 @@ fn wait_spec( CliSpecParams { spec, live_config_allowed, + compute_ctl_config: _, }: CliSpecParams, ) -> Result> { let mut new_state = ComputeState::new(); @@ -319,13 +351,15 @@ fn wait_spec( let tokio_conn_conf = tokio_postgres::config::Config::from_str(connstr.as_str()) .context("cannot build tokio postgres config from connstr")?; let compute_node = ComputeNode { + compute_id: cli.compute_id.clone(), connstr, conn_conf, tokio_conn_conf, pgdata: cli.pgdata.clone(), pgbin: cli.pgbin.clone(), pgversion: get_pg_version_string(&cli.pgbin), - http_port: cli.http_port, + external_http_port: cli.external_http_port, + internal_http_port: cli.internal_http_port.unwrap_or(cli.external_http_port + 1), live_config_allowed, state: Mutex::new(new_state), state_changed: Condvar::new(), @@ -343,10 +377,13 @@ fn wait_spec( compute.prewarm_postgres()?; } - // Launch http service first, so that we can serve control-plane requests - // while configuration is still in progress. - let _http_handle = - launch_http_server(cli.http_port, &compute).expect("cannot launch http endpoint thread"); + // Launch the external HTTP server first, so that we can serve control plane + // requests while configuration is still in progress. + Server::External(cli.external_http_port).launch(&compute); + + // The internal HTTP server could be launched later, but there isn't much + // sense in waiting. 
+ Server::Internal(cli.internal_http_port.unwrap_or(cli.external_http_port + 1)).launch(&compute); if !spec_set { // No spec provided, hang waiting for it. @@ -484,21 +521,6 @@ fn start_postgres( use std::env; use tokio_util::sync::CancellationToken; - // Note: it seems like you can make a runtime in an inner scope and - // if you start a task in it it won't be dropped. However, make it - // in the outermost scope just to be safe. - let rt = if env::var_os("AUTOSCALING").is_some() { - Some( - tokio::runtime::Builder::new_multi_thread() - .worker_threads(4) - .enable_all() - .build() - .expect("failed to create tokio runtime for monitor") - ) - } else { - None - }; - // This token is used internally by the monitor to clean up all threads let token = CancellationToken::new(); @@ -509,16 +531,19 @@ fn start_postgres( Some(cli.filecache_connstr.clone()) }; - let vm_monitor = rt.as_ref().map(|rt| { - rt.spawn(vm_monitor::start( + let vm_monitor = if env::var_os("AUTOSCALING").is_some() { + let vm_monitor = tokio::spawn(vm_monitor::start( Box::leak(Box::new(vm_monitor::Args { cgroup: Some(cli.cgroup.clone()), pgconnstr, addr: cli.vm_monitor_addr.clone(), })), token.clone(), - )) - }); + )); + Some(vm_monitor) + } else { + None + }; } } @@ -528,8 +553,6 @@ fn start_postgres( delay_exit, compute, #[cfg(target_os = "linux")] - rt, - #[cfg(target_os = "linux")] token, #[cfg(target_os = "linux")] vm_monitor, @@ -537,15 +560,13 @@ fn start_postgres( )) } -type PostgresHandle = (std::process::Child, std::thread::JoinHandle<()>); +type PostgresHandle = (std::process::Child, tokio::task::JoinHandle>); struct StartPostgresResult { delay_exit: bool, // passed through from WaitSpecResult compute: Arc, - #[cfg(target_os = "linux")] - rt: Option, #[cfg(target_os = "linux")] token: tokio_util::sync::CancellationToken, #[cfg(target_os = "linux")] @@ -564,10 +585,10 @@ fn wait_postgres(pg: Option) -> Result { .expect("failed to start waiting on Postgres process"); PG_PID.store(0, Ordering::SeqCst); - // Process has exited, so we can join the logs thread. - let _ = logs_handle - .join() - .map_err(|e| tracing::error!("log thread panicked: {:?}", e)); + // Process has exited. Wait for the log collecting task to finish. + let _ = tokio::runtime::Handle::current() + .block_on(logs_handle) + .map_err(|e| tracing::error!("log task panicked: {:?}", e)); info!("Postgres exited with code {}, shutting down", ecode); exit_code = ecode.code() @@ -588,8 +609,6 @@ fn cleanup_after_postgres_exit( vm_monitor, #[cfg(target_os = "linux")] token, - #[cfg(target_os = "linux")] - rt, }: StartPostgresResult, ) -> Result { // Terminate the vm_monitor so it releases the file watcher on @@ -602,10 +621,6 @@ fn cleanup_after_postgres_exit( token.cancel(); // Kills the actual task running the monitor handle.abort(); - - // If handle is some, rt must have been used to produce it, and - // hence is also some - rt.unwrap().shutdown_timeout(Duration::from_secs(2)); } } } diff --git a/compute_tools/src/bin/fast_import.rs b/compute_tools/src/bin/fast_import.rs index 1398f443dd..585f3e4e1d 100644 --- a/compute_tools/src/bin/fast_import.rs +++ b/compute_tools/src/bin/fast_import.rs @@ -25,10 +25,10 @@ //! docker push localhost:3030/localregistry/compute-node-v14:latest //! 
``` -use anyhow::Context; +use anyhow::{bail, Context}; use aws_config::BehaviorVersion; use camino::{Utf8Path, Utf8PathBuf}; -use clap::Parser; +use clap::{Parser, Subcommand}; use compute_tools::extension_server::{get_pg_version, PostgresMajorVersion}; use nix::unistd::Pid; use tracing::{error, info, info_span, warn, Instrument}; @@ -44,22 +44,59 @@ mod s3_uri; const PG_WAIT_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(600); const PG_WAIT_RETRY_INTERVAL: std::time::Duration = std::time::Duration::from_millis(300); +#[derive(Subcommand, Debug)] +enum Command { + /// Runs local postgres (neon binary), restores into it, + /// uploads pgdata to s3 to be consumed by pageservers + Pgdata { + /// Raw connection string to the source database. Used only in tests, + /// real scenario uses encrypted connection string in spec.json from s3. + #[clap(long)] + source_connection_string: Option, + /// If specified, will not shut down the local postgres after the import. Used in local testing + #[clap(short, long)] + interactive: bool, + /// Port to run postgres on. Default is 5432. + #[clap(long, default_value_t = 5432)] + pg_port: u16, // port to run postgres on, 5432 is default + + /// Number of CPUs in the system. This is used to configure # of + /// parallel worker processes, for index creation. + #[clap(long, env = "NEON_IMPORTER_NUM_CPUS")] + num_cpus: Option, + + /// Amount of RAM in the system. This is used to configure shared_buffers + /// and maintenance_work_mem. + #[clap(long, env = "NEON_IMPORTER_MEMORY_MB")] + memory_mb: Option, + }, + + /// Runs pg_dump-pg_restore from source to destination without running local postgres. + DumpRestore { + /// Raw connection string to the source database. Used only in tests, + /// real scenario uses encrypted connection string in spec.json from s3. + #[clap(long)] + source_connection_string: Option, + /// Raw connection string to the destination database. Used only in tests, + /// real scenario uses encrypted connection string in spec.json from s3. 
+ #[clap(long)] + destination_connection_string: Option, + }, +} + #[derive(clap::Parser)] struct Args { - #[clap(long)] + #[clap(long, env = "NEON_IMPORTER_WORKDIR")] working_directory: Utf8PathBuf, #[clap(long, env = "NEON_IMPORTER_S3_PREFIX")] s3_prefix: Option, - #[clap(long)] - source_connection_string: Option, - #[clap(short, long)] - interactive: bool, - #[clap(long)] + #[clap(long, env = "NEON_IMPORTER_PG_BIN_DIR")] pg_bin_dir: Utf8PathBuf, - #[clap(long)] + #[clap(long, env = "NEON_IMPORTER_PG_LIB_DIR")] pg_lib_dir: Utf8PathBuf, - #[clap(long)] - pg_port: Option, // port to run postgres on, 5432 is default + + #[clap(subcommand)] + command: Command, } #[serde_with::serde_as] @@ -68,6 +105,8 @@ struct Spec { encryption_secret: EncryptionSecret, #[serde_as(as = "serde_with::base64::Base64")] source_connstring_ciphertext_base64: Vec, + #[serde_as(as = "Option")] + destination_connstring_ciphertext_base64: Option>, } #[derive(serde::Deserialize)] @@ -83,180 +122,150 @@ const DEFAULT_LOCALE: &str = if cfg!(target_os = "macos") { "C.UTF-8" }; -#[tokio::main] -pub(crate) async fn main() -> anyhow::Result<()> { - utils::logging::init( - utils::logging::LogFormat::Plain, - utils::logging::TracingErrorLayerEnablement::EnableWithRustLogFilter, - utils::logging::Output::Stdout, - )?; - - info!("starting"); - - let args = Args::parse(); - - // Validate arguments - if args.s3_prefix.is_none() && args.source_connection_string.is_none() { - anyhow::bail!("either s3_prefix or source_connection_string must be specified"); - } - if args.s3_prefix.is_some() && args.source_connection_string.is_some() { - anyhow::bail!("only one of s3_prefix or source_connection_string can be specified"); - } - - let working_directory = args.working_directory; - let pg_bin_dir = args.pg_bin_dir; - let pg_lib_dir = args.pg_lib_dir; - let pg_port = args.pg_port.unwrap_or_else(|| { - info!("pg_port not specified, using default 5432"); - 5432 - }); - - // Initialize AWS clients only if s3_prefix is specified - let (aws_config, kms_client) = if args.s3_prefix.is_some() { - let config = aws_config::load_defaults(BehaviorVersion::v2024_03_28()).await; - let kms = aws_sdk_kms::Client::new(&config); - (Some(config), Some(kms)) - } else { - (None, None) - }; - - // Get source connection string either from S3 spec or direct argument - let source_connection_string = if let Some(s3_prefix) = &args.s3_prefix { - let spec: Spec = { - let spec_key = s3_prefix.append("/spec.json"); - let s3_client = aws_sdk_s3::Client::new(aws_config.as_ref().unwrap()); - let object = s3_client - .get_object() - .bucket(&spec_key.bucket) - .key(spec_key.key) - .send() - .await - .context("get spec from s3")? - .body - .collect() - .await - .context("download spec body")?; - serde_json::from_slice(&object.into_bytes()).context("parse spec as json")? - }; - - match spec.encryption_secret { - EncryptionSecret::KMS { key_id } => { - let mut output = kms_client - .unwrap() - .decrypt() - .key_id(key_id) - .ciphertext_blob(aws_sdk_s3::primitives::Blob::new( - spec.source_connstring_ciphertext_base64, - )) - .send() - .await - .context("decrypt source connection string")?; - let plaintext = output - .plaintext - .take() - .context("get plaintext source connection string")?; - String::from_utf8(plaintext.into_inner()) - .context("parse source connection string as utf8")? 
- } - } - } else { - args.source_connection_string.unwrap() - }; - - match tokio::fs::create_dir(&working_directory).await { - Ok(()) => {} - Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => { - if !is_directory_empty(&working_directory) - .await - .context("check if working directory is empty")? - { - anyhow::bail!("working directory is not empty"); - } else { - // ok - } - } - Err(e) => return Err(anyhow::Error::new(e).context("create working directory")), - } - - let pgdata_dir = working_directory.join("pgdata"); - tokio::fs::create_dir(&pgdata_dir) +async fn decode_connstring( + kms_client: &aws_sdk_kms::Client, + key_id: &String, + connstring_ciphertext_base64: Vec, +) -> Result { + let mut output = kms_client + .decrypt() + .key_id(key_id) + .ciphertext_blob(aws_sdk_s3::primitives::Blob::new( + connstring_ciphertext_base64, + )) + .send() .await - .context("create pgdata directory")?; + .context("decrypt connection string")?; - let pgbin = pg_bin_dir.join("postgres"); - let pg_version = match get_pg_version(pgbin.as_ref()) { - PostgresMajorVersion::V14 => 14, - PostgresMajorVersion::V15 => 15, - PostgresMajorVersion::V16 => 16, - PostgresMajorVersion::V17 => 17, - }; - let superuser = "cloud_admin"; // XXX: this shouldn't be hard-coded - postgres_initdb::do_run_initdb(postgres_initdb::RunInitdbArgs { - superuser, - locale: DEFAULT_LOCALE, // XXX: this shouldn't be hard-coded, - pg_version, - initdb_bin: pg_bin_dir.join("initdb").as_ref(), - library_search_path: &pg_lib_dir, // TODO: is this right? Prob works in compute image, not sure about neon_local. - pgdata: &pgdata_dir, - }) - .await - .context("initdb")?; + let plaintext = output + .plaintext + .take() + .context("get plaintext connection string")?; - let nproc = num_cpus::get(); + String::from_utf8(plaintext.into_inner()).context("parse connection string as utf8") +} - // - // Launch postgres process - // - let mut postgres_proc = tokio::process::Command::new(pgbin) - .arg("-D") - .arg(&pgdata_dir) - .args(["-p", &format!("{pg_port}")]) - .args(["-c", "wal_level=minimal"]) - .args(["-c", "shared_buffers=10GB"]) - .args(["-c", "max_wal_senders=0"]) - .args(["-c", "fsync=off"]) - .args(["-c", "full_page_writes=off"]) - .args(["-c", "synchronous_commit=off"]) - .args(["-c", "maintenance_work_mem=8388608"]) - .args(["-c", &format!("max_parallel_maintenance_workers={nproc}")]) - .args(["-c", &format!("max_parallel_workers={nproc}")]) - .args(["-c", &format!("max_parallel_workers_per_gather={nproc}")]) - .args(["-c", &format!("max_worker_processes={nproc}")]) - .args([ - "-c", - &format!( - "effective_io_concurrency={}", - if cfg!(target_os = "macos") { 0 } else { 100 } - ), - ]) - .env_clear() - .env("LD_LIBRARY_PATH", &pg_lib_dir) - .env( - "ASAN_OPTIONS", - std::env::var("ASAN_OPTIONS").unwrap_or_default(), +struct PostgresProcess { + pgdata_dir: Utf8PathBuf, + pg_bin_dir: Utf8PathBuf, + pgbin: Utf8PathBuf, + pg_lib_dir: Utf8PathBuf, + postgres_proc: Option, +} + +impl PostgresProcess { + fn new(pgdata_dir: Utf8PathBuf, pg_bin_dir: Utf8PathBuf, pg_lib_dir: Utf8PathBuf) -> Self { + Self { + pgdata_dir, + pgbin: pg_bin_dir.join("postgres"), + pg_bin_dir, + pg_lib_dir, + postgres_proc: None, + } + } + + async fn prepare(&self, initdb_user: &str) -> Result<(), anyhow::Error> { + tokio::fs::create_dir(&self.pgdata_dir) + .await + .context("create pgdata directory")?; + + let pg_version = match get_pg_version(self.pgbin.as_ref()) { + PostgresMajorVersion::V14 => 14, + PostgresMajorVersion::V15 => 15, + 
PostgresMajorVersion::V16 => 16, + PostgresMajorVersion::V17 => 17, + }; + postgres_initdb::do_run_initdb(postgres_initdb::RunInitdbArgs { + superuser: initdb_user, + locale: DEFAULT_LOCALE, // XXX: this shouldn't be hard-coded, + pg_version, + initdb_bin: self.pg_bin_dir.join("initdb").as_ref(), + library_search_path: &self.pg_lib_dir, // TODO: is this right? Prob works in compute image, not sure about neon_local. + pgdata: &self.pgdata_dir, + }) + .await + .context("initdb") + } + + async fn start( + &mut self, + initdb_user: &str, + port: u16, + nproc: usize, + memory_mb: usize, + ) -> Result<&tokio::process::Child, anyhow::Error> { + self.prepare(initdb_user).await?; + + // Somewhat arbitrarily, use 10 % of memory for shared buffer cache, 70% for + // maintenance_work_mem (i.e. for sorting during index creation), and leave the rest + // available for misc other stuff that PostgreSQL uses memory for. + let shared_buffers_mb = ((memory_mb as f32) * 0.10) as usize; + let maintenance_work_mem_mb = ((memory_mb as f32) * 0.70) as usize; + + // + // Launch postgres process + // + let mut proc = tokio::process::Command::new(&self.pgbin) + .arg("-D") + .arg(&self.pgdata_dir) + .args(["-p", &format!("{port}")]) + .args(["-c", "wal_level=minimal"]) + .args(["-c", &format!("shared_buffers={shared_buffers_mb}MB")]) + .args(["-c", "max_wal_senders=0"]) + .args(["-c", "fsync=off"]) + .args(["-c", "full_page_writes=off"]) + .args(["-c", "synchronous_commit=off"]) + .args([ + "-c", + &format!("maintenance_work_mem={maintenance_work_mem_mb}MB"), + ]) + .args(["-c", &format!("max_parallel_maintenance_workers={nproc}")]) + .args(["-c", &format!("max_parallel_workers={nproc}")]) + .args(["-c", &format!("max_parallel_workers_per_gather={nproc}")]) + .args(["-c", &format!("max_worker_processes={nproc}")]) + .args(["-c", "effective_io_concurrency=100"]) + .env_clear() + .env("LD_LIBRARY_PATH", &self.pg_lib_dir) + .env( + "ASAN_OPTIONS", + std::env::var("ASAN_OPTIONS").unwrap_or_default(), + ) + .env( + "UBSAN_OPTIONS", + std::env::var("UBSAN_OPTIONS").unwrap_or_default(), + ) + .stdout(std::process::Stdio::piped()) + .stderr(std::process::Stdio::piped()) + .spawn() + .context("spawn postgres")?; + + info!("spawned postgres, waiting for it to become ready"); + tokio::spawn( + child_stdio_to_log::relay_process_output(proc.stdout.take(), proc.stderr.take()) + .instrument(info_span!("postgres")), + ); + + self.postgres_proc = Some(proc); + Ok(self.postgres_proc.as_ref().unwrap()) + } + + async fn shutdown(&mut self) -> Result<(), anyhow::Error> { + let proc: &mut tokio::process::Child = self.postgres_proc.as_mut().unwrap(); + info!("shutdown postgres"); + nix::sys::signal::kill( + Pid::from_raw(i32::try_from(proc.id().unwrap()).expect("convert child pid to i32")), + nix::sys::signal::SIGTERM, ) - .env( - "UBSAN_OPTIONS", - std::env::var("UBSAN_OPTIONS").unwrap_or_default(), - ) - .stdout(std::process::Stdio::piped()) - .stderr(std::process::Stdio::piped()) - .spawn() - .context("spawn postgres")?; - - info!("spawned postgres, waiting for it to become ready"); - tokio::spawn( - child_stdio_to_log::relay_process_output( - postgres_proc.stdout.take(), - postgres_proc.stderr.take(), - ) - .instrument(info_span!("postgres")), - ); + .context("signal postgres to shut down")?; + proc.wait() + .await + .context("wait for postgres to shut down") + .map(|_| ()) + } +} +async fn wait_until_ready(connstring: String, create_dbname: String) { // Create neondb database in the running postgres - let restore_pg_connstring = - 
format!("host=localhost port={pg_port} user={superuser} dbname=postgres"); - let start_time = std::time::Instant::now(); loop { @@ -267,7 +276,12 @@ pub(crate) async fn main() -> anyhow::Result<()> { std::process::exit(1); } - match tokio_postgres::connect(&restore_pg_connstring, tokio_postgres::NoTls).await { + match tokio_postgres::connect( + &connstring.replace("dbname=neondb", "dbname=postgres"), + tokio_postgres::NoTls, + ) + .await + { Ok((client, connection)) => { // Spawn the connection handling task to maintain the connection tokio::spawn(async move { @@ -276,9 +290,12 @@ pub(crate) async fn main() -> anyhow::Result<()> { } }); - match client.simple_query("CREATE DATABASE neondb;").await { + match client + .simple_query(format!("CREATE DATABASE {create_dbname};").as_str()) + .await + { Ok(_) => { - info!("created neondb database"); + info!("created {} database", create_dbname); break; } Err(e) => { @@ -302,10 +319,16 @@ pub(crate) async fn main() -> anyhow::Result<()> { } } } +} - let restore_pg_connstring = restore_pg_connstring.replace("dbname=postgres", "dbname=neondb"); - - let dumpdir = working_directory.join("dumpdir"); +async fn run_dump_restore( + workdir: Utf8PathBuf, + pg_bin_dir: Utf8PathBuf, + pg_lib_dir: Utf8PathBuf, + source_connstring: String, + destination_connstring: String, +) -> Result<(), anyhow::Error> { + let dumpdir = workdir.join("dumpdir"); let common_args = [ // schema mapping (prob suffices to specify them on one side) @@ -334,10 +357,18 @@ pub(crate) async fn main() -> anyhow::Result<()> { .arg("--no-sync") // POSITIONAL args // source db (db name included in connection string) - .arg(&source_connection_string) + .arg(&source_connstring) // how we run it .env_clear() .env("LD_LIBRARY_PATH", &pg_lib_dir) + .env( + "ASAN_OPTIONS", + std::env::var("ASAN_OPTIONS").unwrap_or_default(), + ) + .env( + "UBSAN_OPTIONS", + std::env::var("UBSAN_OPTIONS").unwrap_or_default(), + ) .kill_on_drop(true) .stdout(std::process::Stdio::piped()) .stderr(std::process::Stdio::piped()) @@ -354,24 +385,31 @@ pub(crate) async fn main() -> anyhow::Result<()> { let st = pg_dump.wait().await.context("wait for pg_dump")?; info!(status=?st, "pg_dump exited"); if !st.success() { - warn!(status=%st, "pg_dump failed, restore will likely fail as well"); + error!(status=%st, "pg_dump failed, restore will likely fail as well"); + bail!("pg_dump failed"); } } - // TODO: do it in a streaming way, plenty of internal research done on this already + // TODO: maybe do it in a streaming way, plenty of internal research done on this already // TODO: do the unlogged table trick - - info!("restore from working directory into vanilla postgres"); { let mut pg_restore = tokio::process::Command::new(pg_bin_dir.join("pg_restore")) .args(&common_args) .arg("-d") - .arg(&restore_pg_connstring) + .arg(&destination_connstring) // POSITIONAL args .arg(&dumpdir) // how we run it .env_clear() .env("LD_LIBRARY_PATH", &pg_lib_dir) + .env( + "ASAN_OPTIONS", + std::env::var("ASAN_OPTIONS").unwrap_or_default(), + ) + .env( + "UBSAN_OPTIONS", + std::env::var("UBSAN_OPTIONS").unwrap_or_default(), + ) .kill_on_drop(true) .stdout(std::process::Stdio::piped()) .stderr(std::process::Stdio::piped()) @@ -389,48 +427,259 @@ pub(crate) async fn main() -> anyhow::Result<()> { let st = pg_restore.wait().await.context("wait for pg_restore")?; info!(status=?st, "pg_restore exited"); if !st.success() { - warn!(status=%st, "pg_restore failed, restore will likely fail as well"); - } - } - - // If interactive mode, wait for Ctrl+C - 
if args.interactive { - info!("Running in interactive mode. Press Ctrl+C to shut down."); - tokio::signal::ctrl_c().await.context("wait for ctrl-c")?; - } - - info!("shutdown postgres"); - { - nix::sys::signal::kill( - Pid::from_raw( - i32::try_from(postgres_proc.id().unwrap()).expect("convert child pid to i32"), - ), - nix::sys::signal::SIGTERM, - ) - .context("signal postgres to shut down")?; - postgres_proc - .wait() - .await - .context("wait for postgres to shut down")?; - } - - // Only sync if s3_prefix was specified - if let Some(s3_prefix) = args.s3_prefix { - info!("upload pgdata"); - aws_s3_sync::sync(Utf8Path::new(&pgdata_dir), &s3_prefix.append("/pgdata/")) - .await - .context("sync dump directory to destination")?; - - info!("write status"); - { - let status_dir = working_directory.join("status"); - std::fs::create_dir(&status_dir).context("create status directory")?; - let status_file = status_dir.join("pgdata"); - std::fs::write(&status_file, serde_json::json!({"done": true}).to_string()) - .context("write status file")?; - aws_s3_sync::sync(&status_dir, &s3_prefix.append("/status/")) - .await - .context("sync status directory to destination")?; + error!(status=%st, "pg_restore failed"); + bail!("pg_restore failed"); + } + } + + Ok(()) +} + +#[allow(clippy::too_many_arguments)] +async fn cmd_pgdata( + s3_client: Option<aws_sdk_s3::Client>, + kms_client: Option<aws_sdk_kms::Client>, + maybe_s3_prefix: Option<S3Uri>, + maybe_spec: Option<Spec>, + source_connection_string: Option<String>, + interactive: bool, + pg_port: u16, + workdir: Utf8PathBuf, + pg_bin_dir: Utf8PathBuf, + pg_lib_dir: Utf8PathBuf, + num_cpus: Option<usize>, + memory_mb: Option<usize>, +) -> Result<(), anyhow::Error> { + if maybe_spec.is_none() && source_connection_string.is_none() { + bail!("spec must be provided for pgdata command"); + } + if maybe_spec.is_some() && source_connection_string.is_some() { + bail!("only one of spec or source_connection_string can be provided"); + } + + let source_connection_string = if let Some(spec) = maybe_spec { + match spec.encryption_secret { + EncryptionSecret::KMS { key_id } => { + decode_connstring( + kms_client.as_ref().unwrap(), + &key_id, + spec.source_connstring_ciphertext_base64, + ) + .await? + } + } + } else { + source_connection_string.unwrap() + }; + + let superuser = "cloud_admin"; + let destination_connstring = format!( + "host=localhost port={} user={} dbname=neondb", + pg_port, superuser + ); + + let pgdata_dir = workdir.join("pgdata"); + let mut proc = PostgresProcess::new(pgdata_dir.clone(), pg_bin_dir.clone(), pg_lib_dir.clone()); + let nproc = num_cpus.unwrap_or_else(num_cpus::get); + let memory_mb = memory_mb.unwrap_or(256); + proc.start(superuser, pg_port, nproc, memory_mb).await?; + wait_until_ready(destination_connstring.clone(), "neondb".to_string()).await; + + run_dump_restore( + workdir.clone(), + pg_bin_dir, + pg_lib_dir, + source_connection_string, + destination_connstring, + ) + .await?; + + // If interactive mode, wait for Ctrl+C + if interactive { + info!("Running in interactive mode. 
Press Ctrl+C to shut down."); + tokio::signal::ctrl_c().await.context("wait for ctrl-c")?; + } + + proc.shutdown().await?; + + // Only sync if s3_prefix was specified + if let Some(s3_prefix) = maybe_s3_prefix { + info!("upload pgdata"); + aws_s3_sync::upload_dir_recursive( + s3_client.as_ref().unwrap(), + Utf8Path::new(&pgdata_dir), + &s3_prefix.append("/pgdata/"), + ) + .await + .context("sync dump directory to destination")?; + + info!("write status"); + { + let status_dir = workdir.join("status"); + std::fs::create_dir(&status_dir).context("create status directory")?; + let status_file = status_dir.join("pgdata"); + std::fs::write(&status_file, serde_json::json!({"done": true}).to_string()) + .context("write status file")?; + aws_s3_sync::upload_dir_recursive( + s3_client.as_ref().unwrap(), + &status_dir, + &s3_prefix.append("/status/"), + ) + .await + .context("sync status directory to destination")?; + } + } + + Ok(()) +} + +async fn cmd_dumprestore( + kms_client: Option, + maybe_spec: Option, + source_connection_string: Option, + destination_connection_string: Option, + workdir: Utf8PathBuf, + pg_bin_dir: Utf8PathBuf, + pg_lib_dir: Utf8PathBuf, +) -> Result<(), anyhow::Error> { + let (source_connstring, destination_connstring) = if let Some(spec) = maybe_spec { + match spec.encryption_secret { + EncryptionSecret::KMS { key_id } => { + let source = decode_connstring( + kms_client.as_ref().unwrap(), + &key_id, + spec.source_connstring_ciphertext_base64, + ) + .await?; + + let dest = if let Some(dest_ciphertext) = + spec.destination_connstring_ciphertext_base64 + { + decode_connstring(kms_client.as_ref().unwrap(), &key_id, dest_ciphertext) + .await? + } else { + bail!("destination connection string must be provided in spec for dump_restore command"); + }; + + (source, dest) + } + } + } else { + ( + source_connection_string.unwrap(), + if let Some(val) = destination_connection_string { + val + } else { + bail!("destination connection string must be provided for dump_restore command"); + }, + ) + }; + + run_dump_restore( + workdir, + pg_bin_dir, + pg_lib_dir, + source_connstring, + destination_connstring, + ) + .await +} + +#[tokio::main] +pub(crate) async fn main() -> anyhow::Result<()> { + utils::logging::init( + utils::logging::LogFormat::Json, + utils::logging::TracingErrorLayerEnablement::EnableWithRustLogFilter, + utils::logging::Output::Stdout, + )?; + + info!("starting"); + + let args = Args::parse(); + + // Initialize AWS clients only if s3_prefix is specified + let (s3_client, kms_client) = if args.s3_prefix.is_some() { + let config = aws_config::load_defaults(BehaviorVersion::v2024_03_28()).await; + let s3_client = aws_sdk_s3::Client::new(&config); + let kms = aws_sdk_kms::Client::new(&config); + (Some(s3_client), Some(kms)) + } else { + (None, None) + }; + + let spec: Option = if let Some(s3_prefix) = &args.s3_prefix { + let spec_key = s3_prefix.append("/spec.json"); + let object = s3_client + .as_ref() + .unwrap() + .get_object() + .bucket(&spec_key.bucket) + .key(spec_key.key) + .send() + .await + .context("get spec from s3")? + .body + .collect() + .await + .context("download spec body")?; + serde_json::from_slice(&object.into_bytes()).context("parse spec as json")? + } else { + None + }; + + match tokio::fs::create_dir(&args.working_directory).await { + Ok(()) => {} + Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => { + if !is_directory_empty(&args.working_directory) + .await + .context("check if working directory is empty")? 
+ { + bail!("working directory is not empty"); + } else { + // ok + } + } + Err(e) => return Err(anyhow::Error::new(e).context("create working directory")), + } + + match args.command { + Command::Pgdata { + source_connection_string, + interactive, + pg_port, + num_cpus, + memory_mb, + } => { + cmd_pgdata( + s3_client, + kms_client, + args.s3_prefix, + spec, + source_connection_string, + interactive, + pg_port, + args.working_directory, + args.pg_bin_dir, + args.pg_lib_dir, + num_cpus, + memory_mb, + ) + .await?; + } + Command::DumpRestore { + source_connection_string, + destination_connection_string, + } => { + cmd_dumprestore( + kms_client, + spec, + source_connection_string, + destination_connection_string, + args.working_directory, + args.pg_bin_dir, + args.pg_lib_dir, + ) + .await?; } } diff --git a/compute_tools/src/bin/fast_import/aws_s3_sync.rs b/compute_tools/src/bin/fast_import/aws_s3_sync.rs index 5fa58c8f87..1be10b36d6 100644 --- a/compute_tools/src/bin/fast_import/aws_s3_sync.rs +++ b/compute_tools/src/bin/fast_import/aws_s3_sync.rs @@ -1,24 +1,102 @@ -use anyhow::Context; -use camino::Utf8Path; +use camino::{Utf8Path, Utf8PathBuf}; +use tokio::task::JoinSet; +use walkdir::WalkDir; use super::s3_uri::S3Uri; -pub(crate) async fn sync(local: &Utf8Path, remote: &S3Uri) -> anyhow::Result<()> { - let mut builder = tokio::process::Command::new("aws"); - builder - .arg("s3") - .arg("sync") - .arg(local.as_str()) - .arg(remote.to_string()); - let st = builder - .spawn() - .context("spawn aws s3 sync")? - .wait() - .await - .context("wait for aws s3 sync")?; - if st.success() { - Ok(()) - } else { - Err(anyhow::anyhow!("aws s3 sync failed")) +use tracing::{info, warn}; + +const MAX_PARALLEL_UPLOADS: usize = 10; + +/// Upload all files from 'local' to 'remote' +pub(crate) async fn upload_dir_recursive( + s3_client: &aws_sdk_s3::Client, + local: &Utf8Path, + remote: &S3Uri, +) -> anyhow::Result<()> { + // Recursively scan directory + let mut dirwalker = WalkDir::new(local) + .into_iter() + .map(|entry| { + let entry = entry?; + let file_type = entry.file_type(); + let path = <&Utf8Path>::try_from(entry.path())?.to_path_buf(); + Ok((file_type, path)) + }) + .filter_map(|e: anyhow::Result<(std::fs::FileType, Utf8PathBuf)>| { + match e { + Ok((file_type, path)) if file_type.is_file() => Some(Ok(path)), + Ok((file_type, _path)) if file_type.is_dir() => { + // The WalkDir iterator will recurse into directories, but we don't want + // to do anything with directories as such. There's no concept of uploading + // an empty directory to S3. + None + } + Ok((file_type, path)) if file_type.is_symlink() => { + // huh, didn't expect a symlink. Can't upload that to S3. Warn and skip. + warn!("cannot upload symlink ({})", path); + None + } + Ok((_file_type, path)) => { + // should not happen + warn!("directory entry has unexpected type ({})", path); + None + } + Err(e) => Some(Err(e)), + } + }); + + // Spawn upload tasks for each file, keeping MAX_PARALLEL_UPLOADS active in + // parallel. + let mut joinset = JoinSet::new(); + loop { + // Could we upload more? 
+ while joinset.len() < MAX_PARALLEL_UPLOADS { + if let Some(full_local_path) = dirwalker.next() { + let full_local_path = full_local_path?; + let relative_local_path = full_local_path + .strip_prefix(local) + .expect("all paths start from the walkdir root"); + let remote_path = remote.append(relative_local_path.as_str()); + info!( + "starting upload of {} to {}", + &full_local_path, &remote_path + ); + let upload_task = upload_file(s3_client.clone(), full_local_path, remote_path); + joinset.spawn(upload_task); + } else { + info!("draining upload tasks"); + break; + } + } + + // Wait for an upload to complete + if let Some(res) = joinset.join_next().await { + let _ = res?; + } else { + // all done! + break; + } } + Ok(()) +} + +pub(crate) async fn upload_file( + s3_client: aws_sdk_s3::Client, + local_path: Utf8PathBuf, + remote: S3Uri, +) -> anyhow::Result<()> { + use aws_smithy_types::byte_stream::ByteStream; + let stream = ByteStream::from_path(&local_path).await?; + + let _result = s3_client + .put_object() + .bucket(remote.bucket) + .key(&remote.key) + .body(stream) + .send() + .await?; + info!("upload of {} to {} finished", &local_path, &remote.key); + + Ok(()) } diff --git a/compute_tools/src/catalog.rs b/compute_tools/src/catalog.rs index 4a297cfacf..28b10ce21c 100644 --- a/compute_tools/src/catalog.rs +++ b/compute_tools/src/catalog.rs @@ -140,5 +140,34 @@ pub async fn get_database_schema( warn!("pg_dump stderr: {}", line) } }); - Ok(initial_stream.chain(stdout_reader.map(|res| res.map(|b| b.freeze())))) + + #[allow(dead_code)] + struct SchemaStream { + // We keep a reference to the child process to ensure it stays alive + // while the stream is being consumed. When SchemaStream is dropped, + // cmd will be dropped, which triggers kill_on_drop and terminates pg_dump + cmd: tokio::process::Child, + stream: S, + } + + impl Stream for SchemaStream + where + S: Stream> + Unpin, + { + type Item = Result; + + fn poll_next( + mut self: std::pin::Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> std::task::Poll> { + Stream::poll_next(std::pin::Pin::new(&mut self.stream), cx) + } + } + + let schema_stream = SchemaStream { + cmd, + stream: initial_stream.chain(stdout_reader.map(|res| res.map(|b| b.freeze()))), + }; + + Ok(schema_stream) } diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index fd76e404c6..d323ea3dcd 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -9,7 +9,6 @@ use std::str::FromStr; use std::sync::atomic::AtomicU32; use std::sync::atomic::Ordering; use std::sync::{Arc, Condvar, Mutex, RwLock}; -use std::thread; use std::time::Duration; use std::time::Instant; @@ -59,6 +58,8 @@ pub static PG_PID: AtomicU32 = AtomicU32::new(0); /// Compute node info shared across several `compute_ctl` threads. 
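The `upload_dir_recursive` loop above is an instance of a general bounded-concurrency pattern with `JoinSet`: top the set up to the cap, then wait for one task before spawning more. A self-contained sketch of that pattern with illustrative names; unlike the original (which only propagates join errors), this version also surfaces errors returned by the tasks themselves:

```rust
use tokio::task::JoinSet;

// Keep at most `max_parallel` tasks in flight, refilling as tasks finish.
async fn for_each_bounded<I, F, Fut>(items: I, max_parallel: usize, f: F) -> anyhow::Result<()>
where
    I: IntoIterator,
    F: Fn(I::Item) -> Fut,
    Fut: std::future::Future<Output = anyhow::Result<()>> + Send + 'static,
{
    let mut items = items.into_iter();
    let mut joinset = JoinSet::new();
    loop {
        // Refill: spawn work until we hit the parallelism cap or run out.
        while joinset.len() < max_parallel {
            match items.next() {
                Some(item) => {
                    joinset.spawn(f(item));
                }
                None => break,
            }
        }
        // Drain: wait for one task. The first `?` catches panics/aborts
        // (JoinError), the second catches the task's own error.
        match joinset.join_next().await {
            Some(res) => res??,
            None => break, // no running tasks and no items left: done
        }
    }
    Ok(())
}
```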
pub struct ComputeNode { + /// The ID of the compute + pub compute_id: String, // Url type maintains proper escaping pub connstr: url::Url, // We connect to Postgres from many different places, so build configs once @@ -81,8 +82,10 @@ pub struct ComputeNode { /// - we push spec and it does configuration /// - but then it is restarted without any spec again pub live_config_allowed: bool, - /// The port that the compute's HTTP server listens on - pub http_port: u16, + /// The port that the compute's external HTTP server listens on + pub external_http_port: u16, + /// The port that the compute's internal HTTP server listens on + pub internal_http_port: u16, /// Volatile part of the `ComputeNode`, which should be used under `Mutex`. /// To allow HTTP API server to serving status requests, while configuration /// is in progress, lock should be held only for short periods of time to do @@ -546,11 +549,7 @@ impl ComputeNode { pub fn check_safekeepers_synced(&self, compute_state: &ComputeState) -> Result> { let start_time = Utc::now(); - // Run actual work with new tokio runtime - let rt = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build() - .expect("failed to create rt"); + let rt = tokio::runtime::Handle::current(); let result = rt.block_on(self.check_safekeepers_synced_async(compute_state)); // Record runtime @@ -597,9 +596,9 @@ impl ComputeNode { SYNC_SAFEKEEPERS_PID.store(0, Ordering::SeqCst); // Process has exited, so we can join the logs thread. - let _ = logs_handle - .join() - .map_err(|e| tracing::error!("log thread panicked: {:?}", e)); + let _ = tokio::runtime::Handle::current() + .block_on(logs_handle) + .map_err(|e| tracing::error!("log task panicked: {:?}", e)); if !sync_output.status.success() { anyhow::bail!( @@ -634,7 +633,7 @@ impl ComputeNode { config::write_postgres_conf( &pgdata_path.join("postgresql.conf"), &pspec.spec, - self.http_port, + self.internal_http_port, )?; // Syncing safekeepers is only safe with primary nodes: if a primary @@ -784,7 +783,7 @@ impl ComputeNode { pub fn start_postgres( &self, storage_auth_token: Option, - ) -> Result<(std::process::Child, std::thread::JoinHandle<()>)> { + ) -> Result<(std::process::Child, tokio::task::JoinHandle>)> { let pgdata_path = Path::new(&self.pgdata); // Run postgres as a child process. @@ -800,7 +799,7 @@ impl ComputeNode { .expect("cannot start postgres process"); PG_PID.store(pg.id(), Ordering::SeqCst); - // Start a thread to collect logs from stderr. + // Start a task to collect logs from stderr. let stderr = pg.stderr.take().expect("stderr should be captured"); let logs_handle = handle_postgres_logs(stderr); @@ -809,20 +808,28 @@ impl ComputeNode { Ok((pg, logs_handle)) } - /// Do post configuration of the already started Postgres. This function spawns a background thread to + /// Do post configuration of the already started Postgres. This function spawns a background task to /// configure the database after applying the compute spec. Currently, it upgrades the neon extension /// version. In the future, it may upgrade all 3rd-party extensions. 
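Several changes in this file replace throwaway `tokio::runtime::Builder` runtimes with a handle to the ambient runtime. A minimal sketch of that pattern, assuming a multi-thread runtime: `Handle::block_on` panics when called on a runtime worker thread, so the blocking caller lives on its own OS thread, just as the configurator thread does. The function name is illustrative:

```rust
use tokio::runtime::Handle;

// Reuse the ambient runtime from synchronous code on a plain OS thread.
fn block_on_from_plain_thread() -> anyhow::Result<u32> {
    let handle = Handle::current(); // must be captured inside the runtime
    std::thread::spawn(move || {
        handle.block_on(async {
            // any async work: tokio_postgres queries, HTTP requests, ...
            42
        })
    })
    .join()
    .map_err(|_| anyhow::anyhow!("worker thread panicked"))
}
```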
#[instrument(skip_all)] pub fn post_apply_config(&self) -> Result<()> { - let conf = self.get_conn_conf(Some("compute_ctl:post_apply_config")); - thread::spawn(move || { - let func = || { - let mut client = conf.connect(NoTls)?; + let conf = self.get_tokio_conn_conf(Some("compute_ctl:post_apply_config")); + tokio::spawn(async move { + let res = async { + let (mut client, connection) = conf.connect(NoTls).await?; + tokio::spawn(async move { + if let Err(e) = connection.await { + eprintln!("connection error: {}", e); + } + }); + handle_neon_extension_upgrade(&mut client) + .await .context("handle_neon_extension_upgrade")?; Ok::<_, anyhow::Error>(()) - }; - if let Err(err) = func() { + } + .await; + if let Err(err) = res { error!("error while post_apply_config: {err:#}"); } }); @@ -919,13 +926,10 @@ impl ComputeNode { conf: Arc, concurrency: usize, ) -> Result<()> { - let rt = tokio::runtime::Builder::new_multi_thread() - .enable_all() - .build()?; - info!("Applying config with max {} concurrency", concurrency); debug!("Config: {:?}", spec); + let rt = tokio::runtime::Handle::current(); rt.block_on(async { // Proceed with post-startup configuration. Note, that order of operations is important. let client = Self::get_maintenance_client(&conf).await?; @@ -1319,14 +1323,18 @@ impl ComputeNode { } // Run migrations separately to not hold up cold starts - thread::spawn(move || { - let conf = conf.as_ref().clone(); - let mut conf = postgres::config::Config::from(conf); + tokio::spawn(async move { + let mut conf = conf.as_ref().clone(); conf.application_name("compute_ctl:migrations"); - match conf.connect(NoTls) { - Ok(mut client) => { - if let Err(e) = handle_migrations(&mut client) { + match conf.connect(NoTls).await { + Ok((mut client, connection)) => { + tokio::spawn(async move { + if let Err(e) = connection.await { + eprintln!("connection error: {}", e); + } + }); + if let Err(e) = handle_migrations(&mut client).await { error!("Failed to run migrations: {}", e); } } @@ -1363,16 +1371,11 @@ impl ComputeNode { if let Some(ref pgbouncer_settings) = spec.pgbouncer_settings { info!("tuning pgbouncer"); - let rt = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build() - .expect("failed to create rt"); - - // Spawn a thread to do the tuning, + // Spawn a background task to do the tuning, // so that we don't block the main thread that starts Postgres. let pgbouncer_settings = pgbouncer_settings.clone(); - let _handle = thread::spawn(move || { - let res = rt.block_on(tune_pgbouncer(pgbouncer_settings)); + tokio::spawn(async move { + let res = tune_pgbouncer(pgbouncer_settings).await; if let Err(err) = res { error!("error while tuning pgbouncer: {err:?}"); } @@ -1382,41 +1385,42 @@ impl ComputeNode { if let Some(ref local_proxy) = spec.local_proxy_config { info!("configuring local_proxy"); - // Spawn a thread to do the configuration, + // Spawn a background task to do the configuration, // so that we don't block the main thread that starts Postgres. 
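The async-ified paths above all follow the same `tokio_postgres` idiom: `connect` yields a `Client` plus a `Connection` future, and the connection must be spawned onto its own task or every client call stalls. A minimal self-contained sketch (the function and query are illustrative):

```rust
// Client/connection split: spawn the Connection, then use the Client.
async fn connect_and_query(conf: tokio_postgres::Config) -> anyhow::Result<()> {
    let (client, connection) = conf.connect(tokio_postgres::NoTls).await?;
    tokio::spawn(async move {
        if let Err(e) = connection.await {
            tracing::error!("connection error: {}", e);
        }
    });
    // The client is usable from here on; dropping it ends the connection task.
    let rows = client.query("SELECT 1", &[]).await?;
    let one: i32 = rows[0].get(0);
    anyhow::ensure!(one == 1);
    Ok(())
}
```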
let local_proxy = local_proxy.clone(); - let _handle = Some(thread::spawn(move || { + tokio::spawn(async move { if let Err(err) = local_proxy::configure(&local_proxy) { error!("error while configuring local_proxy: {err:?}"); } - })); + }); } // Write new config let pgdata_path = Path::new(&self.pgdata); let postgresql_conf_path = pgdata_path.join("postgresql.conf"); - config::write_postgres_conf(&postgresql_conf_path, &spec, self.http_port)?; + config::write_postgres_conf(&postgresql_conf_path, &spec, self.internal_http_port)?; - let max_concurrent_connections = spec.reconfigure_concurrency; + if !spec.skip_pg_catalog_updates { + let max_concurrent_connections = spec.reconfigure_concurrency; + // Temporarily reset max_cluster_size in config + // to avoid the possibility of hitting the limit, while we are reconfiguring: + // creating new extensions, roles, etc. + config::with_compute_ctl_tmp_override(pgdata_path, "neon.max_cluster_size=-1", || { + self.pg_reload_conf()?; - // Temporarily reset max_cluster_size in config - // to avoid the possibility of hitting the limit, while we are reconfiguring: - // creating new extensions, roles, etc. - config::with_compute_ctl_tmp_override(pgdata_path, "neon.max_cluster_size=-1", || { - self.pg_reload_conf()?; + if spec.mode == ComputeMode::Primary { + let mut conf = tokio_postgres::Config::from_str(self.connstr.as_str()).unwrap(); + conf.application_name("apply_config"); + let conf = Arc::new(conf); - if spec.mode == ComputeMode::Primary { - let mut conf = tokio_postgres::Config::from_str(self.connstr.as_str()).unwrap(); - conf.application_name("apply_config"); - let conf = Arc::new(conf); + let spec = Arc::new(spec.clone()); - let spec = Arc::new(spec.clone()); + self.apply_spec_sql(spec, conf, max_concurrent_connections)?; + } - self.apply_spec_sql(spec, conf, max_concurrent_connections)?; - } - - Ok(()) - })?; + Ok(()) + })?; + } self.pg_reload_conf()?; @@ -1431,7 +1435,9 @@ impl ComputeNode { } #[instrument(skip_all)] - pub fn start_compute(&self) -> Result<(std::process::Child, std::thread::JoinHandle<()>)> { + pub fn start_compute( + &self, + ) -> Result<(std::process::Child, tokio::task::JoinHandle>)> { let compute_state = self.state.lock().unwrap().clone(); let pspec = compute_state.pspec.as_ref().expect("spec must be set"); info!( @@ -1446,16 +1452,11 @@ impl ComputeNode { if let Some(pgbouncer_settings) = &pspec.spec.pgbouncer_settings { info!("tuning pgbouncer"); - let rt = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build() - .expect("failed to create rt"); - - // Spawn a thread to do the tuning, + // Spawn a background task to do the tuning, // so that we don't block the main thread that starts Postgres. let pgbouncer_settings = pgbouncer_settings.clone(); - let _handle = thread::spawn(move || { - let res = rt.block_on(tune_pgbouncer(pgbouncer_settings)); + let _handle = tokio::spawn(async move { + let res = tune_pgbouncer(pgbouncer_settings).await; if let Err(err) = res { error!("error while tuning pgbouncer: {err:?}"); } @@ -1465,10 +1466,10 @@ impl ComputeNode { if let Some(local_proxy) = &pspec.spec.local_proxy_config { info!("configuring local_proxy"); - // Spawn a thread to do the configuration, + // Spawn a background task to do the configuration, // so that we don't block the main thread that starts Postgres. 
let local_proxy = local_proxy.clone(); - let _handle = thread::spawn(move || { + let _handle = tokio::spawn(async move { if let Err(err) = local_proxy::configure(&local_proxy) { error!("error while configuring local_proxy: {err:?}"); } @@ -1487,7 +1488,8 @@ impl ComputeNode { extension_server::create_control_files(remote_extensions, &self.pgbin); let library_load_start_time = Utc::now(); - let remote_ext_metrics = self.prepare_preload_libraries(&pspec.spec)?; + let rt = tokio::runtime::Handle::current(); + let remote_ext_metrics = rt.block_on(self.prepare_preload_libraries(&pspec.spec))?; let library_load_time = Utc::now() .signed_duration_since(library_load_start_time) @@ -1542,7 +1544,7 @@ impl ComputeNode { self.post_apply_config()?; let conf = self.get_conn_conf(None); - thread::spawn(move || { + tokio::task::spawn_blocking(|| { let res = get_installed_extensions(conf); match res { Ok(extensions) => { @@ -1891,7 +1893,6 @@ LIMIT 100", Ok(ext_version) } - #[tokio::main] pub async fn prepare_preload_libraries( &self, spec: &ComputeSpec, diff --git a/compute_tools/src/configurator.rs b/compute_tools/src/configurator.rs index a2043529a1..d88f26ca20 100644 --- a/compute_tools/src/configurator.rs +++ b/compute_tools/src/configurator.rs @@ -51,9 +51,12 @@ fn configurator_main_loop(compute: &Arc) { pub fn launch_configurator(compute: &Arc) -> thread::JoinHandle<()> { let compute = Arc::clone(compute); + let runtime = tokio::runtime::Handle::current(); + thread::Builder::new() .name("compute-configurator".into()) .spawn(move || { + let _rt_guard = runtime.enter(); configurator_main_loop(&compute); info!("configurator thread is exited"); }) diff --git a/compute_tools/src/http/mod.rs b/compute_tools/src/http/mod.rs index a596bea504..93eb6ef5b7 100644 --- a/compute_tools/src/http/mod.rs +++ b/compute_tools/src/http/mod.rs @@ -4,11 +4,9 @@ use http::{header::CONTENT_TYPE, StatusCode}; use serde::Serialize; use tracing::error; -pub use server::launch_http_server; - mod extract; mod routes; -mod server; +pub mod server; /// Convenience response builder for JSON responses struct JsonResponse; diff --git a/compute_tools/src/http/routes/failpoints.rs b/compute_tools/src/http/routes/failpoints.rs index 2ec4511676..836417d784 100644 --- a/compute_tools/src/http/routes/failpoints.rs +++ b/compute_tools/src/http/routes/failpoints.rs @@ -1,7 +1,21 @@ use axum::response::{IntoResponse, Response}; use http::StatusCode; +use serde::{Deserialize, Serialize}; use tracing::info; -use utils::failpoint_support::{apply_failpoint, ConfigureFailpointsRequest}; +use utils::failpoint_support::apply_failpoint; + +pub type ConfigureFailpointsRequest = Vec; + +/// Information for configuring a single fail point +#[derive(Debug, Serialize, Deserialize)] +pub struct FailpointConfig { + /// Name of the fail point + pub name: String, + /// List of actions to take, using the format described in `fail::cfg` + /// + /// We also support `actions = "exit"` to cause the fail point to immediately exit. 
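The configurator change above uses the companion idiom: hand a `Handle` to a plain thread and `enter()` it, so runtime-dependent calls made from that thread resolve to the main runtime. A sketch, with an illustrative thread name:

```rust
use tokio::runtime::Handle;

// An ordinary OS thread "enters" the runtime via a guard; while the guard
// lives, Handle::current() and task spawns on this thread reach the main
// runtime even though the thread is not a runtime worker.
fn spawn_entered_thread() -> std::io::Result<std::thread::JoinHandle<()>> {
    let runtime = Handle::current(); // capture inside the runtime
    std::thread::Builder::new()
        .name("entered-worker".into())
        .spawn(move || {
            let _rt_guard = runtime.enter();
            Handle::current().spawn(async {
                tracing::info!("spawned from an entered thread");
            });
        })
}
```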
+ pub actions: String, +} use crate::http::{extract::Json, JsonResponse}; diff --git a/compute_tools/src/http/server.rs b/compute_tools/src/http/server.rs index e41ed9df2d..a523ecd96f 100644 --- a/compute_tools/src/http/server.rs +++ b/compute_tools/src/http/server.rs @@ -1,7 +1,7 @@ use std::{ + fmt::Display, net::{IpAddr, Ipv6Addr, SocketAddr}, sync::Arc, - thread, time::Duration, }; @@ -26,46 +26,65 @@ use super::routes::{ }; use crate::compute::ComputeNode; -async fn handle_404() -> Response { - StatusCode::NOT_FOUND.into_response() -} - const X_REQUEST_ID: &str = "x-request-id"; -/// This middleware function allows compute_ctl to generate its own request ID -/// if one isn't supplied. The control plane will always send one as a UUID. The -/// neon Postgres extension on the other hand does not send one. -async fn maybe_add_request_id_header(mut request: Request, next: Next) -> Response { - let headers = request.headers_mut(); - - if headers.get(X_REQUEST_ID).is_none() { - headers.append(X_REQUEST_ID, Uuid::new_v4().to_string().parse().unwrap()); - } - - next.run(request).await +/// `compute_ctl` has two servers: internal and external. The internal server +/// binds to the loopback interface and handles communication from clients on +/// the compute. The external server is what receives communication from the +/// control plane, the metrics scraper, etc. We make the distinction because +/// certain routes in `compute_ctl` only need to be exposed to local processes +/// like Postgres via the neon extension and local_proxy. +#[derive(Clone, Copy, Debug)] +pub enum Server { + Internal(u16), + External(u16), } -/// Run the HTTP server and wait on it forever. -#[tokio::main] -async fn serve(port: u16, compute: Arc) { - let mut app = Router::new() - .route("/check_writability", post(check_writability::is_writable)) - .route("/configure", post(configure::configure)) - .route("/database_schema", get(database_schema::get_schema_dump)) - .route("/dbs_and_roles", get(dbs_and_roles::get_catalog_objects)) - .route( - "/extension_server/{*filename}", - post(extension_server::download_extension), - ) - .route("/extensions", post(extensions::install_extension)) - .route("/grants", post(grants::add_grant)) - .route("/insights", get(insights::get_insights)) - .route("/metrics", get(metrics::get_metrics)) - .route("/metrics.json", get(metrics_json::get_metrics)) - .route("/status", get(status::get_status)) - .route("/terminate", post(terminate::terminate)) - .fallback(handle_404) - .layer( +impl Display for Server { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Server::Internal(_) => f.write_str("internal"), + Server::External(_) => f.write_str("external"), + } + } +} + +impl From for Router> { + fn from(server: Server) -> Self { + let mut router = Router::>::new(); + + router = match server { + Server::Internal(_) => { + router = router + .route( + "/extension_server/{*filename}", + post(extension_server::download_extension), + ) + .route("/extensions", post(extensions::install_extension)) + .route("/grants", post(grants::add_grant)); + + // Add in any testing support + if cfg!(feature = "testing") { + use super::routes::failpoints; + + router = router.route("/failpoints", post(failpoints::configure_failpoints)); + } + + router + } + Server::External(_) => router + .route("/check_writability", post(check_writability::is_writable)) + .route("/configure", post(configure::configure)) + .route("/database_schema", get(database_schema::get_schema_dump)) + 
.route("/dbs_and_roles", get(dbs_and_roles::get_catalog_objects)) + .route("/insights", get(insights::get_insights)) + .route("/metrics", get(metrics::get_metrics)) + .route("/metrics.json", get(metrics_json::get_metrics)) + .route("/status", get(status::get_status)) + .route("/terminate", post(terminate::terminate)), + }; + + router.fallback(Server::handle_404).method_not_allowed_fallback(Server::handle_405).layer( ServiceBuilder::new() // Add this middleware since we assume the request ID exists .layer(middleware::from_fn(maybe_add_request_id_header)) @@ -105,45 +124,88 @@ async fn serve(port: u16, compute: Arc) { ) .layer(PropagateRequestIdLayer::x_request_id()), ) - .with_state(compute); + } +} - // Add in any testing support - if cfg!(feature = "testing") { - use super::routes::failpoints; - - app = app.route("/failpoints", post(failpoints::configure_failpoints)) +impl Server { + async fn handle_404() -> impl IntoResponse { + StatusCode::NOT_FOUND } - // This usually binds to both IPv4 and IPv6 on Linux, see - // https://github.com/rust-lang/rust/pull/34440 for more information - let addr = SocketAddr::new(IpAddr::from(Ipv6Addr::UNSPECIFIED), port); - let listener = match TcpListener::bind(&addr).await { - Ok(listener) => listener, - Err(e) => { - error!( - "failed to bind the compute_ctl HTTP server to port {}: {}", - port, e - ); - return; + async fn handle_405() -> impl IntoResponse { + StatusCode::METHOD_NOT_ALLOWED + } + + async fn listener(&self) -> Result { + let addr = SocketAddr::new(self.ip(), self.port()); + let listener = TcpListener::bind(&addr).await?; + + Ok(listener) + } + + fn ip(&self) -> IpAddr { + match self { + // TODO: Change this to Ipv6Addr::LOCALHOST when the GitHub runners + // allow binding to localhost + Server::Internal(_) => IpAddr::from(Ipv6Addr::UNSPECIFIED), + Server::External(_) => IpAddr::from(Ipv6Addr::UNSPECIFIED), } - }; - - if let Ok(local_addr) = listener.local_addr() { - info!("compute_ctl HTTP server listening on {}", local_addr); - } else { - info!("compute_ctl HTTP server listening on port {}", port); } - if let Err(e) = axum::serve(listener, app).await { - error!("compute_ctl HTTP server error: {}", e); + fn port(self) -> u16 { + match self { + Server::Internal(port) => port, + Server::External(port) => port, + } + } + + async fn serve(self, compute: Arc) { + let listener = self.listener().await.unwrap_or_else(|e| { + // If we can't bind, the compute cannot operate correctly + panic!( + "failed to bind the compute_ctl {} HTTP server to {}: {}", + self, + SocketAddr::new(self.ip(), self.port()), + e + ); + }); + + if tracing::enabled!(tracing::Level::INFO) { + let local_addr = match listener.local_addr() { + Ok(local_addr) => local_addr, + Err(_) => SocketAddr::new(self.ip(), self.port()), + }; + + info!( + "compute_ctl {} HTTP server listening at {}", + self, local_addr + ); + } + + let router = Router::from(self).with_state(compute); + + if let Err(e) = axum::serve(listener, router).await { + error!("compute_ctl {} HTTP server error: {}", self, e); + } + } + + pub fn launch(self, compute: &Arc) { + let state = Arc::clone(compute); + + info!("Launching the {} server", self); + + tokio::spawn(self.serve(state)); } } -/// Launch a separate HTTP server thread and return its `JoinHandle`. -pub fn launch_http_server(port: u16, state: &Arc) -> Result> { - let state = Arc::clone(state); +/// This middleware function allows compute_ctl to generate its own request ID +/// if one isn't supplied. The control plane will always send one as a UUID. 
The +/// neon Postgres extension on the other hand does not send one. +async fn maybe_add_request_id_header(mut request: Request, next: Next) -> Response { + let headers = request.headers_mut(); + if headers.get(X_REQUEST_ID).is_none() { + headers.append(X_REQUEST_ID, Uuid::new_v4().to_string().parse().unwrap()); + } - Ok(thread::Builder::new() - .name("http-server".into()) - .spawn(move || serve(port, state))?) + next.run(request).await } diff --git a/compute_tools/src/logger.rs b/compute_tools/src/logger.rs index 00be5c13f9..3749dfc844 100644 --- a/compute_tools/src/logger.rs +++ b/compute_tools/src/logger.rs @@ -11,7 +11,7 @@ use tracing_subscriber::prelude::*; /// set `OTEL_EXPORTER_OTLP_ENDPOINT=http://jaeger:4318`. See /// `tracing-utils` package description. /// -pub fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result<()> { +pub async fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result<()> { // Initialize Logging let env_filter = tracing_subscriber::EnvFilter::try_from_default_env() .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new(default_log_level)); @@ -22,7 +22,7 @@ pub fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result<()> { .with_writer(std::io::stderr); // Initialize OpenTelemetry - let otlp_layer = tracing_utils::init_tracing_without_runtime("compute_ctl"); + let otlp_layer = tracing_utils::init_tracing("compute_ctl").await; // Put it all together tracing_subscriber::registry() diff --git a/compute_tools/src/migration.rs b/compute_tools/src/migration.rs index 7b7b042d84..c5e05822c0 100644 --- a/compute_tools/src/migration.rs +++ b/compute_tools/src/migration.rs @@ -1,6 +1,6 @@ use anyhow::{Context, Result}; use fail::fail_point; -use postgres::{Client, Transaction}; +use tokio_postgres::{Client, Transaction}; use tracing::{error, info}; use crate::metrics::DB_MIGRATION_FAILED; @@ -21,10 +21,11 @@ impl<'m> MigrationRunner<'m> { } /// Get the current value neon_migration.migration_id - fn get_migration_id(&mut self) -> Result { + async fn get_migration_id(&mut self) -> Result { let row = self .client - .query_one("SELECT id FROM neon_migration.migration_id", &[])?; + .query_one("SELECT id FROM neon_migration.migration_id", &[]) + .await?; Ok(row.get::<&str, i64>("id")) } @@ -34,7 +35,7 @@ impl<'m> MigrationRunner<'m> { /// This function has a fail point called compute-migration, which can be /// used if you would like to fail the application of a series of migrations /// at some point. 
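Given the `Server` enum above, startup presumably brings up one server of each flavor. A hedged sketch of that wiring, reusing `ComputeNode` and `Server` from this file; the wrapper function and its parameters are assumptions, while `launch`'s signature and the route split follow the diff:

```rust
// Hypothetical wiring of the two compute_ctl HTTP servers.
fn launch_http_servers(
    compute: &std::sync::Arc<ComputeNode>,
    external_http_port: u16,
    internal_http_port: u16,
) {
    // External server: control plane, metrics scrapers, etc.
    Server::External(external_http_port).launch(compute);
    // Internal server: local clients such as the neon extension and
    // local_proxy; only it exposes /extensions, /grants and (in testing
    // builds) /failpoints.
    Server::Internal(internal_http_port).launch(compute);
}
```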
- fn update_migration_id(txn: &mut Transaction, migration_id: i64) -> Result<()> { + async fn update_migration_id(txn: &mut Transaction<'_>, migration_id: i64) -> Result<()> { // We use this fail point in order to check that failing in the // middle of applying a series of migrations fails in an expected // manner @@ -59,31 +60,38 @@ impl<'m> MigrationRunner<'m> { "UPDATE neon_migration.migration_id SET id = $1", &[&migration_id], ) + .await .with_context(|| format!("update neon_migration.migration_id to {migration_id}"))?; Ok(()) } /// Prepare the migrations the target database for handling migrations - fn prepare_database(&mut self) -> Result<()> { + async fn prepare_database(&mut self) -> Result<()> { self.client - .simple_query("CREATE SCHEMA IF NOT EXISTS neon_migration")?; - self.client.simple_query("CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)")?; - self.client.simple_query( - "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING", - )?; + .simple_query("CREATE SCHEMA IF NOT EXISTS neon_migration") + .await?; + self.client.simple_query("CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)").await?; self.client - .simple_query("ALTER SCHEMA neon_migration OWNER TO cloud_admin")?; + .simple_query( + "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING", + ) + .await?; self.client - .simple_query("REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC")?; + .simple_query("ALTER SCHEMA neon_migration OWNER TO cloud_admin") + .await?; + self.client + .simple_query("REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC") + .await?; Ok(()) } /// Run an individual migration in a separate transaction block. - fn run_migration(client: &mut Client, migration_id: i64, migration: &str) -> Result<()> { + async fn run_migration(client: &mut Client, migration_id: i64, migration: &str) -> Result<()> { let mut txn = client .transaction() + .await .with_context(|| format!("begin transaction for migration {migration_id}"))?; if migration.starts_with("-- SKIP") { @@ -92,35 +100,38 @@ impl<'m> MigrationRunner<'m> { // Even though we are skipping the migration, updating the // migration ID should help keep logic easy to understand when // trying to understand the state of a cluster. - Self::update_migration_id(&mut txn, migration_id)?; + Self::update_migration_id(&mut txn, migration_id).await?; } else { info!("Running migration id={}:\n{}\n", migration_id, migration); txn.simple_query(migration) + .await .with_context(|| format!("apply migration {migration_id}"))?; - Self::update_migration_id(&mut txn, migration_id)?; + Self::update_migration_id(&mut txn, migration_id).await?; } txn.commit() + .await .with_context(|| format!("commit transaction for migration {migration_id}"))?; Ok(()) } /// Run the configured set of migrations - pub fn run_migrations(mut self) -> Result<()> { + pub async fn run_migrations(mut self) -> Result<()> { self.prepare_database() + .await .context("prepare database to handle migrations")?; - let mut current_migration = self.get_migration_id()? as usize; + let mut current_migration = self.get_migration_id().await? 
as usize; while current_migration < self.migrations.len() { // The index lags the migration ID by 1, so the current migration // ID is also the next index let migration_id = (current_migration + 1) as i64; let migration = self.migrations[current_migration]; - match Self::run_migration(self.client, migration_id, migration) { + match Self::run_migration(self.client, migration_id, migration).await { Ok(_) => { info!("Finished migration id={}", migration_id); } diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index e03b410699..86fcf99085 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -7,7 +7,6 @@ use std::os::unix::fs::PermissionsExt; use std::path::Path; use std::process::Child; use std::str::FromStr; -use std::thread::JoinHandle; use std::time::{Duration, Instant}; use anyhow::{bail, Result}; @@ -16,6 +15,7 @@ use ini::Ini; use notify::{RecursiveMode, Watcher}; use postgres::config::Config; use tokio::io::AsyncBufReadExt; +use tokio::task::JoinHandle; use tokio::time::timeout; use tokio_postgres; use tokio_postgres::NoTls; @@ -477,23 +477,13 @@ pub async fn tune_pgbouncer(pgbouncer_config: HashMap) -> Result Ok(()) } -/// Spawn a thread that will read Postgres logs from `stderr`, join multiline logs +/// Spawn a task that will read Postgres logs from `stderr`, join multiline logs /// and send them to the logger. In the future we may also want to add context to /// these logs. -pub fn handle_postgres_logs(stderr: std::process::ChildStderr) -> JoinHandle<()> { - std::thread::spawn(move || { - let runtime = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build() - .expect("failed to build tokio runtime"); - - let res = runtime.block_on(async move { - let stderr = tokio::process::ChildStderr::from_std(stderr)?; - handle_postgres_logs_async(stderr).await - }); - if let Err(e) = res { - tracing::error!("error while processing postgres logs: {}", e); - } +pub fn handle_postgres_logs(stderr: std::process::ChildStderr) -> JoinHandle> { + tokio::spawn(async move { + let stderr = tokio::process::ChildStderr::from_std(stderr)?; + handle_postgres_logs_async(stderr).await }) } diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 37d5d3a1a6..6f28bd9733 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -1,8 +1,8 @@ use anyhow::{anyhow, bail, Result}; -use postgres::Client; use reqwest::StatusCode; use std::fs::File; use std::path::Path; +use tokio_postgres::Client; use tracing::{error, info, instrument, warn}; use crate::config; @@ -11,7 +11,9 @@ use crate::migration::MigrationRunner; use crate::params::PG_HBA_ALL_MD5; use crate::pg_helpers::*; -use compute_api::responses::{ControlPlaneComputeStatus, ControlPlaneSpecResponse}; +use compute_api::responses::{ + ComputeCtlConfig, ControlPlaneComputeStatus, ControlPlaneSpecResponse, +}; use compute_api::spec::ComputeSpec; // Do control plane request and return response if any. 
In case of error it @@ -73,14 +75,13 @@ fn do_control_plane_request( pub fn get_spec_from_control_plane( base_uri: &str, compute_id: &str, -) -> Result> { +) -> Result<(Option, ComputeCtlConfig)> { let cp_uri = format!("{base_uri}/compute/api/v2/computes/{compute_id}/spec"); let jwt: String = match std::env::var("NEON_CONTROL_PLANE_TOKEN") { Ok(v) => v, Err(_) => "".to_string(), }; let mut attempt = 1; - let mut spec: Result> = Ok(None); info!("getting spec from control plane: {}", cp_uri); @@ -90,7 +91,7 @@ pub fn get_spec_from_control_plane( // - no spec for compute yet (Empty state) -> return Ok(None) // - got spec -> return Ok(Some(spec)) while attempt < 4 { - spec = match do_control_plane_request(&cp_uri, &jwt) { + let result = match do_control_plane_request(&cp_uri, &jwt) { Ok(spec_resp) => { CPLANE_REQUESTS_TOTAL .with_label_values(&[ @@ -99,10 +100,10 @@ pub fn get_spec_from_control_plane( ]) .inc(); match spec_resp.status { - ControlPlaneComputeStatus::Empty => Ok(None), + ControlPlaneComputeStatus::Empty => Ok((None, spec_resp.compute_ctl_config)), ControlPlaneComputeStatus::Attached => { if let Some(spec) = spec_resp.spec { - Ok(Some(spec)) + Ok((Some(spec), spec_resp.compute_ctl_config)) } else { bail!("compute is attached, but spec is empty") } @@ -121,10 +122,10 @@ pub fn get_spec_from_control_plane( } }; - if let Err(e) = &spec { + if let Err(e) = &result { error!("attempt {} to get spec failed with: {}", attempt, e); } else { - return spec; + return result; } attempt += 1; @@ -132,7 +133,9 @@ pub fn get_spec_from_control_plane( } // All attempts failed, return error. - spec + Err(anyhow::anyhow!( + "Exhausted all attempts to retrieve the spec from the control plane" + )) } /// Check `pg_hba.conf` and update if needed to allow external connections. @@ -166,17 +169,17 @@ pub fn add_standby_signal(pgdata_path: &Path) -> Result<()> { } #[instrument(skip_all)] -pub fn handle_neon_extension_upgrade(client: &mut Client) -> Result<()> { +pub async fn handle_neon_extension_upgrade(client: &mut Client) -> Result<()> { info!("handle neon extension upgrade"); let query = "ALTER EXTENSION neon UPDATE"; info!("update neon extension version with query: {}", query); - client.simple_query(query)?; + client.simple_query(query).await?; Ok(()) } #[instrument(skip_all)] -pub fn handle_migrations(client: &mut Client) -> Result<()> { +pub async fn handle_migrations(client: &mut Client) -> Result<()> { info!("handle migrations"); // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
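The rewritten `get_spec_from_control_plane` follows a simple bounded-retry contract: up to three attempts, first success wins, and an explicit exhaustion error otherwise. A generic skeleton of that contract; `fetch_once` and the pause between attempts are illustrative stand-ins for `do_control_plane_request` and whatever backoff the real code uses:

```rust
// Bounded retries around a fallible fetch, mirroring the loop above.
fn get_with_retries<T>(mut fetch_once: impl FnMut() -> anyhow::Result<T>) -> anyhow::Result<T> {
    let mut attempt = 1;
    while attempt < 4 {
        match fetch_once() {
            Ok(result) => return Ok(result),
            Err(e) => tracing::error!("attempt {} to get spec failed with: {}", attempt, e),
        }
        attempt += 1;
        std::thread::sleep(std::time::Duration::from_millis(100));
    }
    Err(anyhow::anyhow!(
        "Exhausted all attempts to retrieve the spec from the control plane"
    ))
}
```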
@@ -206,7 +209,9 @@ pub fn handle_migrations(client: &mut Client) -> Result<()> { ), ]; - MigrationRunner::new(client, &migrations).run_migrations()?; + MigrationRunner::new(client, &migrations) + .run_migrations() + .await?; Ok(()) } @@ -214,7 +219,7 @@ pub fn handle_migrations(client: &mut Client) -> Result<()> { /// Connect to the database as superuser and pre-create anon extension /// if it is present in shared_preload_libraries #[instrument(skip_all)] -pub fn handle_extension_anon( +pub async fn handle_extension_anon( spec: &ComputeSpec, db_owner: &str, db_client: &mut Client, @@ -227,7 +232,7 @@ pub fn handle_extension_anon( if !grants_only { // check if extension is already initialized using anon.is_initialized() let query = "SELECT anon.is_initialized()"; - match db_client.query(query, &[]) { + match db_client.query(query, &[]).await { Ok(rows) => { if !rows.is_empty() { let is_initialized: bool = rows[0].get(0); @@ -249,7 +254,7 @@ pub fn handle_extension_anon( // Users cannot create it themselves, because superuser is required. let mut query = "CREATE EXTENSION IF NOT EXISTS anon CASCADE"; info!("creating anon extension with query: {}", query); - match db_client.query(query, &[]) { + match db_client.query(query, &[]).await { Ok(_) => {} Err(e) => { error!("anon extension creation failed with error: {}", e); @@ -259,7 +264,7 @@ pub fn handle_extension_anon( // check that extension is installed query = "SELECT extname FROM pg_extension WHERE extname = 'anon'"; - let rows = db_client.query(query, &[])?; + let rows = db_client.query(query, &[]).await?; if rows.is_empty() { error!("anon extension is not installed"); return Ok(()); @@ -268,7 +273,7 @@ pub fn handle_extension_anon( // Initialize anon extension // This also requires superuser privileges, so users cannot do it themselves. query = "SELECT anon.init()"; - match db_client.query(query, &[]) { + match db_client.query(query, &[]).await { Ok(_) => {} Err(e) => { error!("anon.init() failed with error: {}", e); @@ -279,7 +284,7 @@ pub fn handle_extension_anon( // check that extension is installed, if not bail early let query = "SELECT extname FROM pg_extension WHERE extname = 'anon'"; - match db_client.query(query, &[]) { + match db_client.query(query, &[]).await { Ok(rows) => { if rows.is_empty() { error!("anon extension is not installed"); @@ -294,12 +299,12 @@ pub fn handle_extension_anon( let query = format!("GRANT ALL ON SCHEMA anon TO {}", db_owner); info!("granting anon extension permissions with query: {}", query); - db_client.simple_query(&query)?; + db_client.simple_query(&query).await?; // Grant permissions to db_owner to use anon extension functions let query = format!("GRANT ALL ON ALL FUNCTIONS IN SCHEMA anon TO {}", db_owner); info!("granting anon extension permissions with query: {}", query); - db_client.simple_query(&query)?; + db_client.simple_query(&query).await?; // This is needed, because some functions are defined as SECURITY DEFINER. 
// In Postgres SECURITY DEFINER functions are executed with the privileges @@ -314,16 +319,16 @@ pub fn handle_extension_anon( where nsp.nspname = 'anon';", db_owner); info!("change anon extension functions owner to db owner"); - db_client.simple_query(&query)?; + db_client.simple_query(&query).await?; // affects views as well let query = format!("GRANT ALL ON ALL TABLES IN SCHEMA anon TO {}", db_owner); info!("granting anon extension permissions with query: {}", query); - db_client.simple_query(&query)?; + db_client.simple_query(&query).await?; let query = format!("GRANT ALL ON ALL SEQUENCES IN SCHEMA anon TO {}", db_owner); info!("granting anon extension permissions with query: {}", query); - db_client.simple_query(&query)?; + db_client.simple_query(&query).await?; } } diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index f718102847..162c49ec7c 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -33,6 +33,7 @@ postgres_backend.workspace = true safekeeper_api.workspace = true postgres_connection.workspace = true storage_broker.workspace = true +http-utils.workspace = true utils.workspace = true whoami.workspace = true diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index ba67ffa2dd..02d793400a 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -552,8 +552,10 @@ struct EndpointCreateCmdArgs { lsn: Option, #[clap(long)] pg_port: Option, + #[clap(long, alias = "http-port")] + external_http_port: Option, #[clap(long)] - http_port: Option, + internal_http_port: Option, #[clap(long = "pageserver-id")] endpoint_pageserver_id: Option, @@ -1353,7 +1355,8 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res tenant_id, timeline_id, args.pg_port, - args.http_port, + args.external_http_port, + args.internal_http_port, args.pg_version, mode, !args.update_catalog, diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index bc86d09103..c3c8229c38 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -37,6 +37,8 @@ //! ``` //! 
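The `neon_local` change above renames `--http-port` to `--external-http-port` while keeping the old spelling working via a clap alias. A minimal sketch of that flag-rename pattern:

```rust
use clap::Parser;

// Old invocations passing --http-port keep working and fill the same field.
#[derive(Parser, Debug)]
struct Args {
    #[clap(long, alias = "http-port")]
    external_http_port: Option<u16>,
    #[clap(long)]
    internal_http_port: Option<u16>,
}

fn main() {
    // `prog --external-http-port 55433` and `prog --http-port 55433`
    // both populate external_http_port.
    let args = Args::parse();
    println!("{:?}", args.external_http_port);
}
```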
use std::collections::BTreeMap; +use std::net::IpAddr; +use std::net::Ipv4Addr; use std::net::SocketAddr; use std::net::TcpStream; use std::path::PathBuf; @@ -46,6 +48,8 @@ use std::sync::Arc; use std::time::Duration; use anyhow::{anyhow, bail, Context, Result}; +use compute_api::requests::ConfigurationRequest; +use compute_api::responses::ComputeCtlConfig; use compute_api::spec::Database; use compute_api::spec::PgIdent; use compute_api::spec::RemoteExtSpec; @@ -73,7 +77,8 @@ pub struct EndpointConf { timeline_id: TimelineId, mode: ComputeMode, pg_port: u16, - http_port: u16, + external_http_port: u16, + internal_http_port: u16, pg_version: u32, skip_pg_catalog_updates: bool, drop_subscriptions_before_start: bool, @@ -128,7 +133,7 @@ impl ComputeControlPlane { 1 + self .endpoints .values() - .map(|ep| std::cmp::max(ep.pg_address.port(), ep.http_address.port())) + .map(|ep| std::cmp::max(ep.pg_address.port(), ep.external_http_address.port())) .max() .unwrap_or(self.base_port) } @@ -140,18 +145,27 @@ impl ComputeControlPlane { tenant_id: TenantId, timeline_id: TimelineId, pg_port: Option, - http_port: Option, + external_http_port: Option, + internal_http_port: Option, pg_version: u32, mode: ComputeMode, skip_pg_catalog_updates: bool, drop_subscriptions_before_start: bool, ) -> Result> { let pg_port = pg_port.unwrap_or_else(|| self.get_port()); - let http_port = http_port.unwrap_or_else(|| self.get_port() + 1); + let external_http_port = external_http_port.unwrap_or_else(|| self.get_port() + 1); + let internal_http_port = internal_http_port.unwrap_or_else(|| external_http_port + 1); let ep = Arc::new(Endpoint { endpoint_id: endpoint_id.to_owned(), - pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), pg_port), - http_address: SocketAddr::new("127.0.0.1".parse().unwrap(), http_port), + pg_address: SocketAddr::new(IpAddr::from(Ipv4Addr::LOCALHOST), pg_port), + external_http_address: SocketAddr::new( + IpAddr::from(Ipv4Addr::UNSPECIFIED), + external_http_port, + ), + internal_http_address: SocketAddr::new( + IpAddr::from(Ipv4Addr::LOCALHOST), + internal_http_port, + ), env: self.env.clone(), timeline_id, mode, @@ -176,7 +190,8 @@ impl ComputeControlPlane { tenant_id, timeline_id, mode, - http_port, + external_http_port, + internal_http_port, pg_port, pg_version, skip_pg_catalog_updates, @@ -230,9 +245,10 @@ pub struct Endpoint { pub timeline_id: TimelineId, pub mode: ComputeMode, - // port and address of the Postgres server and `compute_ctl`'s HTTP API + // port and address of the Postgres server and `compute_ctl`'s HTTP APIs pub pg_address: SocketAddr, - pub http_address: SocketAddr, + pub external_http_address: SocketAddr, + pub internal_http_address: SocketAddr, // postgres major version in the format: 14, 15, etc. 
pg_version: u32, @@ -287,8 +303,15 @@ impl Endpoint { serde_json::from_slice(&std::fs::read(entry.path().join("endpoint.json"))?)?; Ok(Endpoint { - pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.pg_port), - http_address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.http_port), + pg_address: SocketAddr::new(IpAddr::from(Ipv4Addr::LOCALHOST), conf.pg_port), + external_http_address: SocketAddr::new( + IpAddr::from(Ipv4Addr::UNSPECIFIED), + conf.external_http_port, + ), + internal_http_address: SocketAddr::new( + IpAddr::from(Ipv4Addr::LOCALHOST), + conf.internal_http_port, + ), endpoint_id, env: env.clone(), timeline_id: conf.timeline_id, @@ -650,24 +673,51 @@ impl Endpoint { println!("Also at '{}'", conn_str); } let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl")); - cmd.args(["--http-port", &self.http_address.port().to_string()]) - .args(["--pgdata", self.pgdata().to_str().unwrap()]) - .args(["--connstr", &conn_str]) - .args([ - "--spec-path", - self.endpoint_path().join("spec.json").to_str().unwrap(), - ]) - .args([ - "--pgbin", - self.env - .pg_bin_dir(self.pg_version)? - .join("postgres") - .to_str() - .unwrap(), - ]) - .stdin(std::process::Stdio::null()) - .stderr(logfile.try_clone()?) - .stdout(logfile); + //cmd.args([ + // "--external-http-port", + // &self.external_http_address.port().to_string(), + //]) + //.args([ + // "--internal-http-port", + // &self.internal_http_address.port().to_string(), + //]) + cmd.args([ + "--http-port", + &self.external_http_address.port().to_string(), + ]) + .args(["--pgdata", self.pgdata().to_str().unwrap()]) + .args(["--connstr", &conn_str]) + .args([ + "--spec-path", + self.endpoint_path().join("spec.json").to_str().unwrap(), + ]) + .args([ + "--pgbin", + self.env + .pg_bin_dir(self.pg_version)? + .join("postgres") + .to_str() + .unwrap(), + ]) + // TODO: It would be nice if we generated compute IDs with the same + // algorithm as the real control plane. + // + // TODO: Add this back when + // https://github.com/neondatabase/neon/pull/10747 is merged. + // + //.args([ + // "--compute-id", + // &format!( + // "compute-{}", + // SystemTime::now() + // .duration_since(UNIX_EPOCH) + // .unwrap() + // .as_secs() + // ), + //]) + .stdin(std::process::Stdio::null()) + .stderr(logfile.try_clone()?) + .stdout(logfile); if let Some(remote_ext_config) = remote_ext_config { cmd.args(["--remote-ext-config", remote_ext_config]); @@ -754,8 +804,8 @@ impl Endpoint { reqwest::Method::GET, format!( "http://{}:{}/status", - self.http_address.ip(), - self.http_address.port() + self.external_http_address.ip(), + self.external_http_address.port() ), ) .send() @@ -828,14 +878,17 @@ impl Endpoint { let response = client .post(format!( "http://{}:{}/configure", - self.http_address.ip(), - self.http_address.port() + self.external_http_address.ip(), + self.external_http_address.port() )) .header(CONTENT_TYPE.as_str(), "application/json") - .body(format!( - "{{\"spec\":{}}}", - serde_json::to_string_pretty(&spec)? 
- )) + .body( + serde_json::to_string(&ConfigurationRequest { + spec, + compute_ctl_config: ComputeCtlConfig::default(), + }) + .unwrap(), + ) .send() .await?; diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index dd37bfc407..28d130d9e0 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -357,6 +357,16 @@ impl PageServerNode { .map(serde_json::from_str) .transpose() .context("Failed to parse 'compaction_algorithm' json")?, + compaction_l0_first: settings + .remove("compaction_l0_first") + .map(|x| x.parse::<bool>()) + .transpose() + .context("Failed to parse 'compaction_l0_first' as a bool")?, + compaction_l0_semaphore: settings + .remove("compaction_l0_semaphore") + .map(|x| x.parse::<bool>()) + .transpose() + .context("Failed to parse 'compaction_l0_semaphore' as a bool")?, l0_flush_delay_threshold: settings .remove("l0_flush_delay_threshold") .map(|x| x.parse::()) diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index f0c3722925..ce7751fb14 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -17,8 +17,10 @@ use camino::Utf8PathBuf; use postgres_connection::PgConnectionConfig; use reqwest::{IntoUrl, Method}; use thiserror::Error; + +use http_utils::error::HttpErrorBody; use utils::auth::{Claims, Scope}; -use utils::{http::error::HttpErrorBody, id::NodeId}; +use utils::id::NodeId; use crate::{ background_process, diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index 9a2d30c861..0fadb9c5fe 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -838,7 +838,10 @@ impl StorageController { self.dispatch( Method::PUT, format!("control/v1/tenant/{tenant_shard_id}/migrate"), - Some(TenantShardMigrateRequest { node_id }), + Some(TenantShardMigrateRequest { + node_id, + migration_config: None, + }), ) .await } diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index 985fe6b3b1..3c574efc63 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -22,7 +22,7 @@ use pageserver_api::{ }; use pageserver_client::mgmt_api::{self}; use reqwest::{Method, StatusCode, Url}; -use utils::id::{NodeId, TenantId}; +use utils::id::{NodeId, TenantId, TimelineId}; use pageserver_api::controller_api::{ NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy, @@ -239,6 +239,19 @@ enum Command { #[arg(long)] scheduling_policy: SkSchedulingPolicyArg, }, + /// Downloads any missing heatmap layers for all shards of a given timeline + DownloadHeatmapLayers { + /// Tenant ID or tenant shard ID. When an unsharded tenant ID is specified, + /// the operation is performed on all shards. When a sharded tenant ID is + /// specified, the operation is only performed on the specified shard.
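The `/configure` change a little above (in `endpoint.rs`) replaces hand-built JSON with the typed request struct. A short sketch of building that body; the helper function is illustrative, while the struct fields and the `Default` for `ComputeCtlConfig` mirror the diff:

```rust
use compute_api::requests::ConfigurationRequest;
use compute_api::responses::ComputeCtlConfig;
use compute_api::spec::ComputeSpec;

// Build the JSON body POSTed to compute_ctl's /configure endpoint.
fn build_configure_body(spec: ComputeSpec) -> String {
    serde_json::to_string(&ConfigurationRequest {
        spec,
        compute_ctl_config: ComputeCtlConfig::default(),
    })
    .expect("ConfigurationRequest is serializable")
}
```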
+ #[arg(long)] + tenant_shard_id: TenantShardId, + #[arg(long)] + timeline_id: TimelineId, + /// Optional: Maximum download concurrency (default is 16) + #[arg(long)] + concurrency: Option, + }, } #[derive(Parser)] @@ -609,7 +622,10 @@ async fn main() -> anyhow::Result<()> { tenant_shard_id, node, } => { - let req = TenantShardMigrateRequest { node_id: node }; + let req = TenantShardMigrateRequest { + node_id: node, + migration_config: None, + }; storcon_client .dispatch::( @@ -623,7 +639,10 @@ async fn main() -> anyhow::Result<()> { tenant_shard_id, node, } => { - let req = TenantShardMigrateRequest { node_id: node }; + let req = TenantShardMigrateRequest { + node_id: node, + migration_config: None, + }; storcon_client .dispatch::( @@ -1082,7 +1101,10 @@ async fn main() -> anyhow::Result<()> { .dispatch::( Method::PUT, format!("control/v1/tenant/{}/migrate", mv.tenant_shard_id), - Some(TenantShardMigrateRequest { node_id: mv.to }), + Some(TenantShardMigrateRequest { + node_id: mv.to, + migration_config: None, + }), ) .await .map_err(|e| (mv.tenant_shard_id, mv.from, mv.to, e)) @@ -1238,6 +1260,24 @@ async fn main() -> anyhow::Result<()> { String::from(scheduling_policy) ); } + Command::DownloadHeatmapLayers { + tenant_shard_id, + timeline_id, + concurrency, + } => { + let mut path = format!( + "/v1/tenant/{}/timeline/{}/download_heatmap_layers", + tenant_shard_id, timeline_id, + ); + + if let Some(c) = concurrency { + path = format!("{path}?concurrency={c}"); + } + + storcon_client + .dispatch::<(), ()>(Method::POST, path, None) + .await?; + } } Ok(()) diff --git a/docker-compose/docker_compose_test.sh b/docker-compose/docker_compose_test.sh index c4ff86ab66..dd520d4986 100755 --- a/docker-compose/docker_compose_test.sh +++ b/docker-compose/docker_compose_test.sh @@ -71,7 +71,7 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do cat ../compute/patches/contrib_pg${pg_version}.patch | docker exec -i $TEST_CONTAINER_NAME bash -c "(cd /postgres && patch -p1)" # We are running tests now rm -f testout.txt testout_contrib.txt - docker exec -e USE_PGXS=1 -e SKIP=timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,kq_imcx-src,wal2json_2_5-src \ + docker exec -e USE_PGXS=1 -e SKIP=timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pg_tiktoken-src,pg_jsonschema-src,kq_imcx-src,wal2json_2_5-src \ $TEST_CONTAINER_NAME /run-tests.sh /ext-src | tee testout.txt && EXT_SUCCESS=1 || EXT_SUCCESS=0 docker exec -e SKIP=start-scripts,postgres_fdw,ltree_plpython,jsonb_plpython,jsonb_plperl,hstore_plpython,hstore_plperl,dblink,bool_plperl \ $TEST_CONTAINER_NAME /run-tests.sh /postgres/contrib | tee testout_contrib.txt && CONTRIB_SUCCESS=1 || CONTRIB_SUCCESS=0 diff --git a/docker-compose/ext-src/pgjwt-src/test-upgrade.sh b/docker-compose/ext-src/pgjwt-src/test-upgrade.sh index b7158d2340..efb8bfc184 100755 --- a/docker-compose/ext-src/pgjwt-src/test-upgrade.sh +++ b/docker-compose/ext-src/pgjwt-src/test-upgrade.sh @@ -2,4 +2,4 @@ set -ex cd "$(dirname ${0})" patch -p1 ), @@ -313,9 +313,9 @@ similarly, in the first version it is ok to trigger it manually). #### Schema `safekeepers` table mirroring current `nodes` should be added, except that for -`scheduling_policy` field (seems like `status` is a better name for it): it is enough -to have at least in the beginning only 3 fields: 1) `active` 2) `offline` 3) -`decomissioned`. 
+`scheduling_policy`: it is enough to have at least in the beginning only 3 +values: 1) `active` 2) `paused` (initially means only: do not assign new timelines there) +3) `decomissioned` (node is removed). `timelines` table: ``` table! { timelines (tenant_id, timeline_id) { timeline_id -> Varchar, tenant_id -> Varchar, + start_lsn -> pg_lsn, generation -> Int4, sk_set -> Array<Int8>, // list of safekeeper ids - new_sk_set -> Nullable<Array<Int8>>, // list of safekeeper ids, null if not joint conf + new_sk_set -> Nullable<Array<Int8>>, // list of safekeeper ids, null if not joint conf cplane_notified_generation -> Int4, + deleted_at -> Nullable<Timestamptz>, } } ``` +`start_lsn` is needed to create the timeline on safekeepers properly, see below. We +might also want to add ancestor_timeline_id to preserve the hierarchy, but for +this RFC it is not needed. + #### API Node management is similar to pageserver: -1) POST `/control/v1/safekeepers` upserts safekeeper. +1) POST `/control/v1/safekeepers` inserts safekeeper. 2) GET `/control/v1/safekeepers` lists safekeepers. 3) GET `/control/v1/safekeepers/:node_id` gets safekeeper. 4) PUT `/control/v1/safekeepers/:node_id/status` changes status to e.g. @@ -345,25 +351,15 @@ Node management is similar to pageserver: Safekeeper deploy scripts should register the safekeeper at the storage_controller as they currently do with cplane, under the same id. -Timeline creation/deletion: already existing POST `tenant/:tenant_id/timeline` -would 1) choose initial set of safekeepers; 2) write to the db initial -`Configuration` with `INSERT ON CONFLICT DO NOTHING` returning existing row in -case of conflict; 3) create timeline on the majority of safekeepers (already -created is ok). +Timeline creation/deletion will work through the already existing POST and DELETE +`tenant/:tenant_id/timeline`. Cplane is expected to retry both until they +succeed. See the next section on the implementation details. -We don't want to block timeline creation when one safekeeper is down. Currently -this is solved by compute implicitly creating timeline on any safekeeper it is -connected to. This creates ugly timeline state on safekeeper when timeline is -created, but start LSN is not defined yet. It would be nice to remove this; to -do that, controller can in the background retry to create timeline on -safekeeper(s) which missed that during initial creation call. It can do that -through `pull_timeline` from majority so it doesn't need to remember -`parent_lsn` in its db. - -Timeline deletion removes the row from the db and forwards deletion to the -current configuration members. Without additional actions deletions might leak, -see below on this; initially let's ignore these, reporting to cplane success if -at least one safekeeper deleted the timeline (this will remove s3 data). +We don't want to block timeline creation/deletion when one safekeeper is down. +Currently this is worked around by compute implicitly creating the timeline on any +safekeeper it is connected to. This creates an ugly timeline state on the safekeeper +when the timeline is created but its start LSN is not defined yet. The next section +describes dealing with this. Tenant deletion repeats timeline deletion for all timelines. @@ -395,26 +391,6 @@ Similar call should be added for the tenant. It would be great to have some way of subscribing to the results (apart from looking at logs/metrics). -Migration is executed as described above.
One subtlety is that (local) deletion on -source safekeeper might fail, which is not a problem if we are going to -decomission the node but leaves garbage otherwise. I'd propose in the first version -1) Don't attempt deletion at all if node status is `offline`. -2) If it failed, just issue warning. -And add PUT `/control/v1/safekeepers/:node_id/scrub` endpoint which would find and -remove garbage timelines for manual use. It will 1) list all timelines on the -safekeeper 2) compare each one against configuration storage: if timeline -doesn't exist at all (had been deleted), it can be deleted. Otherwise, it can -be deleted under generation number if node is not member of current generation. - -Automating this is untrivial; we'd need to register all potential missing -deletions in the same transaction -which switches configurations. Similarly when timeline is fully deleted to -prevent cplane operation from blocking when some safekeeper is not available -deletion should be also registered. - -One more task pool should infinitely retry notifying control plane about changed -safekeeper sets. - 3) GET `/control/v1/tenant/:tenant_id/timeline/:timeline_id/` should return current in memory state of the timeline and pending `MigrationRequest`, if any. @@ -423,12 +399,153 @@ safekeeper sets. migration by switching configuration from the joint to the one with (previous) `sk_set` under CAS (incrementing generation as always). +#### API implementation and reconciliation + +For timeline creation/deletion we want to preserve the basic assumption that +unreachable minority (1 sk of 3) doesn't block their completion, but eventually +we want to finish creation/deletion on nodes which missed it (unless they are +removed). Similarly for migration; it may and should finish even though excluded +members missed their exclusion. And of course e.g. such pending exclusion on +node C after migration ABC -> ABD must not prevent next migration ABD -> ABE. As +another example, if some node missed timeline creation it clearly must not block +migration from it. Hence it is natural to have per safekeeper background +reconciler which retries these ops until they succeed. There are 3 possible +operation types, and the type is defined by timeline state (membership +configuration and whether it is deleted) and safekeeper id: we may need to +create timeline on sk (node added), locally delete it (node excluded, somewhat +similar to detach) or globally delete it (timeline is deleted). + +Next, on storage controller restart in principle these pending operations can be +figured out by comparing safekeepers state against storcon state. But it seems +better to me to materialize them in the database; it is not expensive, avoids +these startup scans which themselves can fail etc and makes it very easy to see +outstanding work directly at the source of truth -- the db. So we can add table +`safekeeper_timeline_pending_ops` +``` +table! { + // timeline_id, sk_id is primary key + safekeeper_timeline_pending_ops (sk_id, tenant_id, timeline_id) { + sk_id -> int8, + tenant_id -> Varchar, + timeline_id -> Varchar, + generation -> Int4, + op_type -> Varchar, + } +} +``` + +`op_type` can be `include` (seed from peers and ensure generation is up to +date), `exclude` (remove locally) and `delete`. Field is actually not strictly +needed as it can be computed from current configuration, but gives more explicit +observability. 
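To make the computability concrete, here is a minimal sketch of deriving the op type for one safekeeper from the timeline state. The type and field names are hypothetical, not the actual storcon code:

```
/// Hypothetical mirror of `op_type`; the real storcon types differ.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum PendingOpType {
    Include, // node added: seed from peers, ensure generation is up to date
    Exclude, // node excluded: remove the timeline locally
    Delete,  // timeline deleted: remove it globally
}

/// Hypothetical in-memory view of a `timelines` row.
struct TimelineState {
    deleted: bool,
    sk_set: Vec<u64>,
    new_sk_set: Option<Vec<u64>>, // Some iff this is a joint conf
}

fn op_for_safekeeper(tl: &TimelineState, sk_id: u64) -> PendingOpType {
    if tl.deleted {
        PendingOpType::Delete
    } else if tl.sk_set.contains(&sk_id)
        || tl.new_sk_set.as_ref().is_some_and(|s| s.contains(&sk_id))
    {
        PendingOpType::Include
    } else {
        PendingOpType::Exclude
    }
}
```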
+ +`generation` is necessary there because after an op is done the reconciler must remove +it and not remove another row with a higher gen which in theory might appear. + +Any insert of a row should overwrite (remove) all rows with the same sk and +timeline id but lower `generation`, as the next op makes the previous one obsolete. Insertion +of `op_type` `delete` overwrites all rows. + +About `exclude`: rather than adding an explicit safekeeper http endpoint, it is +reasonable to reuse the membership switch endpoint: if the safekeeper is not a member +of the configuration it locally removes the timeline on the switch. In this case +404 should also be considered an 'ok' answer by the caller. + +So, the main loop of the per-sk reconciler reads `safekeeper_timeline_pending_ops` +joined with the timeline configuration to get the current conf (with generation `n`) +for the safekeeper and does the jobs, infinitely retrying failures: +1) If the node is a member (`include`): + - Check if the timeline exists on it; if not, call pull_timeline on it from + other members + - Call switch configuration to the current one +2) If the node is not a member (`exclude`): + - Call switch configuration to the current one, 404 is ok. +3) If the timeline is deleted (`delete`), call delete. + +In cases 1 and 2 remove `safekeeper_timeline_pending_ops` for the sk and +timeline with generation <= `n` if `op_type` is not `delete`. +In case 3 also remove the `safekeeper_timeline_pending_ops` +entry + remove the `timelines` entry if there is nothing left in `safekeeper_timeline_pending_ops` for the timeline. + +Let's consider in detail how the APIs can be implemented from this angle. + +Timeline creation. It is assumed that cplane retries it until success, so all +actions must be idempotent. Now, a tricky point here is the timeline start LSN. For +the initial (tenant creation) call cplane doesn't know it. However, setting +start_lsn on safekeepers during creation is a good thing -- it provides a +guarantee that walproposer can always find a common point in the WAL histories of +a safekeeper and its own, and so absence of it would be a clear sign of +corruption. The following sequence works: +1) Create the timeline (or observe that it exists) on the pageserver, + figuring out last_record_lsn in response. +2) Choose safekeepers and insert (ON CONFLICT DO NOTHING) the timeline row into the + db. Note that last_record_lsn returned on the previous step is movable as it + changes once ingestion starts; the insert must not overwrite it (as well as other + fields like the membership conf). On the contrary, start_lsn used in the next + step must be set to the value in the db. cplane_notified_generation can be set + to 1 (initial generation) in the insert to avoid notifying cplane about the initial + conf, as cplane will receive it in the timeline creation request anyway. +3) Issue timeline creation calls to at least a majority of safekeepers. Using + a majority here is not necessary but handy because it guarantees that any live + majority will have at least one sk with the created timeline, and so the + reconciliation task can use pull_timeline shared with migration instead of + a special create-timeline init case. Of course, if the timeline already exists the call is + ignored. +4) For the minority of safekeepers which could have missed creation, insert + entries to `safekeeper_timeline_pending_ops`. We won't miss this insertion + because the response to cplane is sent only after it has happened, and cplane + retries the call until a 200 response. + + There is a small question of how the request handler (timeline creation in this + case) would interact with the per-sk reconciler.
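As an aside before that question is addressed: a sketch of the idempotent insert from step 2, as illustrative SQL embedded in Rust. This is not the actual storcon query; the columns follow the `timelines` table above:

```
// Illustrative only. ON CONFLICT DO NOTHING makes cplane retries harmless: a
// retry must not overwrite start_lsn or the membership conf chosen by the
// first attempt. The caller then re-reads the row, since the authoritative
// start_lsn is whatever the db already holds.
const INSERT_TIMELINE: &str = "\
INSERT INTO timelines (tenant_id, timeline_id, start_lsn, generation, sk_set,
                       new_sk_set, cplane_notified_generation, deleted_at)
VALUES ($1, $2, $3, 1, $4, NULL, 1, NULL)
ON CONFLICT (tenant_id, timeline_id) DO NOTHING";
```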
As always I prefer to do the + simplest possible thing, and here it seems to be just waking it up so it + re-reads the db for work to do. Passing work in memory is faster, but + that shouldn't matter, and the path to scan the db for work will exist anyway, + so it is simpler to reuse it. + +For pg version / wal segment size: while we may persist them in the `timelines` +table, it is not necessary, as initial creation at step 3 can take them from +the pageserver or the cplane creation call, and later pull_timeline will carry them +around. + +Timeline migration. +1) CAS to the db to create the joint conf, and in the same transaction create + `safekeeper_timeline_pending_ops` `include` entries to initialize new members + as well as deliver this conf to current ones; poke per-sk reconcilers to work + on it. Any conf change should also poke the cplane notifier task(s). +2) Once it becomes possible per the algorithm description above, get out of the joint conf + with another CAS. The task should get wakeups from per-sk reconcilers because + a conf switch is required for advancement; however, retries should be sleep + based as well, since LSN advancement might be needed, though in the happy path + it isn't. To see whether a further transition is possible, on wakeup the migration + executor polls safekeepers per the algorithm. The CAS creating the new conf with only + new members should again insert entries to `safekeeper_timeline_pending_ops` + to switch them there, as well as `exclude` rows to remove the timeline from + old members. + +Timeline deletion: just set `deleted_at` on the timeline row and insert +`safekeeper_timeline_pending_ops` entries in the same xact; the rest is done by +per-sk reconcilers. + +When a node is removed (set to `decomissioned`), `safekeeper_timeline_pending_ops` +for it must be cleared in the same transaction. + +One more task pool should infinitely retry notifying the control plane about changed +safekeeper sets (trying to make `cplane_notified_generation` equal `generation`). + #### Dealing with multiple instances of storage_controller Operations described above executed concurrently might create some errors but do not prevent progress, so while we normally don't want to run multiple instances of storage_controller it is fine to have it temporarily, e.g. during redeploy. +To harden against some controller instance creating some work in +`safekeeper_timeline_pending_ops` and then disappearing without anyone picking up +the job, per-sk reconcilers should, apart from explicit wakeups, scan for work +periodically. It is possible to remove that, though, if all db updates are +protected with a leadership token/term -- then such scans are needed only after +leadership is acquired. + Any interaction with the db updates in-memory controller state, e.g. if a migration request failed because a different one is in progress, the controller remembers that and tries to finish it. @@ -545,7 +662,7 @@ Aurora does this but similarly I don't think this is needed. We should use Compute <-> safekeeper protocol change to include other (long yearned) modifications: -- send data in network order to make arm work.
+- send data in network order without putting whole structs to be arch independent - remove term_start_lsn from AppendRequest - add horizon to TermHistory - add to ProposerGreeting number of connection from this wp to sk diff --git a/libs/compute_api/Cargo.toml b/libs/compute_api/Cargo.toml index c0ec40a6c2..c11a1b6688 100644 --- a/libs/compute_api/Cargo.toml +++ b/libs/compute_api/Cargo.toml @@ -7,6 +7,7 @@ license.workspace = true [dependencies] anyhow.workspace = true chrono.workspace = true +jsonwebtoken.workspace = true serde.workspace = true serde_json.workspace = true regex.workspace = true diff --git a/libs/compute_api/src/requests.rs b/libs/compute_api/src/requests.rs index fc3757d981..0c256cae2e 100644 --- a/libs/compute_api/src/requests.rs +++ b/libs/compute_api/src/requests.rs @@ -1,18 +1,20 @@ //! Structs representing the JSON formats used in the compute_ctl's HTTP API. use crate::{ privilege::Privilege, + responses::ComputeCtlConfig, spec::{ComputeSpec, ExtVersion, PgIdent}, }; -use serde::Deserialize; +use serde::{Deserialize, Serialize}; /// Request of the /configure API /// /// We now pass only `spec` in the configuration request, but later we can /// extend it and something like `restart: bool` or something else. So put /// `spec` into a struct initially to be more flexible in the future. -#[derive(Deserialize, Debug)] +#[derive(Debug, Deserialize, Serialize)] pub struct ConfigurationRequest { pub spec: ComputeSpec, + pub compute_ctl_config: ComputeCtlConfig, } #[derive(Deserialize, Debug)] diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs index 5286e0e61d..a6248019d9 100644 --- a/libs/compute_api/src/responses.rs +++ b/libs/compute_api/src/responses.rs @@ -3,6 +3,7 @@ use std::fmt::Display; use chrono::{DateTime, Utc}; +use jsonwebtoken::jwk::JwkSet; use serde::{Deserialize, Serialize, Serializer}; use crate::{ @@ -135,13 +136,27 @@ pub struct CatalogObjects { pub databases: Vec, } +#[derive(Debug, Deserialize, Serialize)] +pub struct ComputeCtlConfig { + pub jwks: JwkSet, +} + +impl Default for ComputeCtlConfig { + fn default() -> Self { + Self { + jwks: JwkSet { + keys: Vec::default(), + }, + } + } +} + /// Response of the `/computes/{compute_id}/spec` control-plane API. -/// This is not actually a compute API response, so consider moving -/// to a different place. 
#[derive(Deserialize, Debug)] pub struct ControlPlaneSpecResponse { pub spec: Option, pub status: ControlPlaneComputeStatus, + pub compute_ctl_config: ComputeCtlConfig, } #[derive(Deserialize, Clone, Copy, Debug, PartialEq, Eq)] diff --git a/libs/http-utils/Cargo.toml b/libs/http-utils/Cargo.toml new file mode 100644 index 0000000000..d72e4bd012 --- /dev/null +++ b/libs/http-utils/Cargo.toml @@ -0,0 +1,37 @@ +[package] +name = "http-utils" +version = "0.1.0" +edition.workspace = true +license.workspace = true + +[dependencies] +anyhow.workspace = true +backtrace.workspace = true +bytes.workspace = true +inferno.workspace = true +fail.workspace = true +flate2.workspace = true +hyper0.workspace = true +itertools.workspace = true +jemalloc_pprof.workspace = true +once_cell.workspace = true +pprof.workspace = true +regex.workspace = true +routerify.workspace = true +serde.workspace = true +serde_json.workspace = true +serde_path_to_error.workspace = true +thiserror.workspace = true +tracing.workspace = true +tokio.workspace = true +tokio-util.workspace = true +url.workspace = true +uuid.workspace = true + +# to use tokio channels as streams, this is faster to compile than async_stream +# why is it only here? no other crate should use it, streams are rarely needed. +tokio-stream = { version = "0.1.14" } + +metrics.workspace = true +utils.workspace = true +workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/libs/utils/src/http/endpoint.rs b/libs/http-utils/src/endpoint.rs similarity index 99% rename from libs/utils/src/http/endpoint.rs rename to libs/http-utils/src/endpoint.rs index 9f38373ca0..be97b341d1 100644 --- a/libs/utils/src/http/endpoint.rs +++ b/libs/http-utils/src/endpoint.rs @@ -1,7 +1,6 @@ -use crate::auth::{AuthError, Claims, SwappableJwtAuth}; -use crate::http::error::{api_error_handler, route_error_handler, ApiError}; -use crate::http::request::{get_query_param, parse_query_param}; +use crate::error::{api_error_handler, route_error_handler, ApiError}; use crate::pprof; +use crate::request::{get_query_param, parse_query_param}; use ::pprof::protos::Message as _; use ::pprof::ProfilerGuardBuilder; use anyhow::{anyhow, Context}; @@ -19,6 +18,7 @@ use tokio::sync::{mpsc, Mutex, Notify}; use tokio_stream::wrappers::ReceiverStream; use tokio_util::io::ReaderStream; use tracing::{debug, info, info_span, warn, Instrument}; +use utils::auth::{AuthError, Claims, SwappableJwtAuth}; use std::future::Future; use std::io::Write as _; @@ -718,9 +718,9 @@ pub fn check_permission_with( #[cfg(test)] mod tests { use super::*; - use futures::future::poll_fn; use hyper::service::Service; use routerify::RequestServiceBuilder; + use std::future::poll_fn; use std::net::{IpAddr, SocketAddr}; #[tokio::test] diff --git a/libs/utils/src/http/error.rs b/libs/http-utils/src/error.rs similarity index 93% rename from libs/utils/src/http/error.rs rename to libs/http-utils/src/error.rs index 02fc9e3b99..746305caec 100644 --- a/libs/utils/src/http/error.rs +++ b/libs/http-utils/src/error.rs @@ -5,6 +5,8 @@ use std::error::Error as StdError; use thiserror::Error; use tracing::{error, info, warn}; +use utils::auth::AuthError; + #[derive(Debug, Error)] pub enum ApiError { #[error("Bad request: {0:#?}")] @@ -96,6 +98,15 @@ impl ApiError { } } +impl From for ApiError { + fn from(_value: AuthError) -> Self { + // Don't pass on the value of the AuthError as a precautionary measure. + // Being intentionally vague in public error communication hurts debugability + // but it is more secure. 
+ ApiError::Forbidden("JWT authentication error".to_string()) + } +} + #[derive(Serialize, Deserialize)] pub struct HttpErrorBody { pub msg: String, diff --git a/libs/http-utils/src/failpoints.rs b/libs/http-utils/src/failpoints.rs new file mode 100644 index 0000000000..8a1e0c8cf0 --- /dev/null +++ b/libs/http-utils/src/failpoints.rs @@ -0,0 +1,50 @@ +use crate::error::ApiError; +use crate::json::{json_request, json_response}; + +use hyper::{Body, Request, Response, StatusCode}; +use serde::{Deserialize, Serialize}; +use tokio_util::sync::CancellationToken; + +use utils::failpoint_support::apply_failpoint; + +pub type ConfigureFailpointsRequest = Vec<FailpointConfig>; + +/// Information for configuring a single fail point +#[derive(Debug, Serialize, Deserialize)] +pub struct FailpointConfig { + /// Name of the fail point + pub name: String, + /// List of actions to take, using the format described in `fail::cfg` + /// + /// We also support `actions = "exit"` to cause the fail point to immediately exit. + pub actions: String, +} + +/// Configure failpoints through http. +pub async fn failpoints_handler( + mut request: Request<Body>, + _cancel: CancellationToken, +) -> Result<Response<Body>, ApiError> { + if !fail::has_failpoints() { + return Err(ApiError::BadRequest(anyhow::anyhow!( + "Cannot manage failpoints because neon was compiled without failpoints support" + ))); + } + + let failpoints: ConfigureFailpointsRequest = json_request(&mut request).await?; + for fp in failpoints { + tracing::info!("cfg failpoint: {} {}", fp.name, fp.actions); + + // We recognize one extra "action" that's not natively recognized + // by the failpoints crate: exit, to immediately kill the process + let cfg_result = apply_failpoint(&fp.name, &fp.actions); + + if let Err(err_msg) = cfg_result { + return Err(ApiError::BadRequest(anyhow::anyhow!( + "Failed to configure failpoints: {err_msg}" + ))); + } + } + + json_response(StatusCode::OK, ()) +} diff --git a/libs/utils/src/http/json.rs b/libs/http-utils/src/json.rs similarity index 100% rename from libs/utils/src/http/json.rs rename to libs/http-utils/src/json.rs diff --git a/libs/utils/src/http/mod.rs b/libs/http-utils/src/lib.rs similarity index 82% rename from libs/utils/src/http/mod.rs rename to libs/http-utils/src/lib.rs index 74ed6bb5b2..ae6a27aaa8 100644 --- a/libs/utils/src/http/mod.rs +++ b/libs/http-utils/src/lib.rs @@ -1,8 +1,12 @@ pub mod endpoint; pub mod error; +pub mod failpoints; pub mod json; +pub mod pprof; pub mod request; +extern crate hyper0 as hyper; + /// Current fast way to apply simple http routing in various Neon binaries. /// Re-exported for sake of uniform approach, that could be later replaced with better alternatives, if needed.
pub use routerify::{ext::RequestExt, RouterBuilder, RouterService}; diff --git a/libs/utils/src/pprof.rs b/libs/http-utils/src/pprof.rs similarity index 100% rename from libs/utils/src/pprof.rs rename to libs/http-utils/src/pprof.rs diff --git a/libs/utils/src/http/request.rs b/libs/http-utils/src/request.rs similarity index 100% rename from libs/utils/src/http/request.rs rename to libs/http-utils/src/request.rs diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index a0b5feea94..e64052c73d 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -121,6 +121,7 @@ pub struct ConfigToml { pub wal_receiver_protocol: PostgresClientProtocol, pub page_service_pipelining: PageServicePipeliningConfig, pub get_vectored_concurrent_io: GetVectoredConcurrentIo, + pub enable_read_path_debugging: Option, } #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] @@ -262,6 +263,11 @@ pub struct TenantConfigToml { /// size exceeds `compaction_upper_limit * checkpoint_distance`. pub compaction_upper_limit: usize, pub compaction_algorithm: crate::models::CompactionAlgorithmSettings, + /// If true, compact down L0 across all tenant timelines before doing regular compaction. + pub compaction_l0_first: bool, + /// If true, use a separate semaphore (i.e. concurrency limit) for the L0 compaction pass. Only + /// has an effect if `compaction_l0_first` is `true`. + pub compaction_l0_semaphore: bool, /// Level0 delta layer threshold at which to delay layer flushes for compaction backpressure, /// such that they take 2x as long, and start waiting for layer flushes during ephemeral layer /// rolls. This helps compaction keep up with WAL ingestion, and avoids read amplification @@ -345,7 +351,7 @@ pub struct TenantConfigToml { /// Enable rel_size_v2 for this tenant. Once enabled, the tenant will persist this information into /// `index_part.json`, and it cannot be reversed. - pub rel_size_v2_enabled: Option, + pub rel_size_v2_enabled: bool, // gc-compaction related configs /// Enable automatic gc-compaction trigger on this tenant. @@ -490,7 +496,7 @@ impl Default for ConfigToml { NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(), )), image_compression: (DEFAULT_IMAGE_COMPRESSION), - timeline_offloading: false, + timeline_offloading: true, ephemeral_bytes_per_memory_kb: (DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB), l0_flush: None, virtual_file_io_mode: None, @@ -510,6 +516,11 @@ impl Default for ConfigToml { } else { GetVectoredConcurrentIo::SidecarTask }, + enable_read_path_debugging: if cfg!(test) || cfg!(feature = "testing") { + Some(true) + } else { + None + }, } } } @@ -537,6 +548,8 @@ pub mod tenant_conf_defaults { // most of our pageservers. Compaction ~50 layers requires about 2GB memory (could be reduced later by optimizing L0 hole // calculation to avoid loading all keys into the memory). So with this config, we can get a maximum peak compaction usage of 18GB. 
pub const DEFAULT_COMPACTION_UPPER_LIMIT: usize = 50; + pub const DEFAULT_COMPACTION_L0_FIRST: bool = false; + pub const DEFAULT_COMPACTION_L0_SEMAPHORE: bool = true; pub const DEFAULT_COMPACTION_ALGORITHM: crate::models::CompactionAlgorithm = crate::models::CompactionAlgorithm::Legacy; @@ -586,6 +599,8 @@ impl Default for TenantConfigToml { compaction_algorithm: crate::models::CompactionAlgorithmSettings { kind: DEFAULT_COMPACTION_ALGORITHM, }, + compaction_l0_first: DEFAULT_COMPACTION_L0_FIRST, + compaction_l0_semaphore: DEFAULT_COMPACTION_L0_SEMAPHORE, l0_flush_delay_threshold: None, l0_flush_stall_threshold: None, l0_flush_wait_upload: DEFAULT_L0_FLUSH_WAIT_UPLOAD, @@ -616,9 +631,9 @@ impl Default for TenantConfigToml { image_creation_preempt_threshold: DEFAULT_IMAGE_CREATION_PREEMPT_THRESHOLD, lsn_lease_length: LsnLease::DEFAULT_LENGTH, lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS, - timeline_offloading: false, + timeline_offloading: true, wal_receiver_protocol_override: None, - rel_size_v2_enabled: None, + rel_size_v2_enabled: false, gc_compaction_enabled: DEFAULT_GC_COMPACTION_ENABLED, gc_compaction_initial_threshold_kb: DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB, gc_compaction_ratio_percent: DEFAULT_GC_COMPACTION_RATIO_PERCENT, diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index 78e080981a..42f6e47e63 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -182,6 +182,18 @@ pub struct TenantDescribeResponseShard { #[derive(Serialize, Deserialize, Debug)] pub struct TenantShardMigrateRequest { pub node_id: NodeId, + #[serde(default)] + pub migration_config: Option, +} + +#[derive(Serialize, Deserialize, Debug)] +pub struct MigrationConfig { + #[serde(default)] + #[serde(with = "humantime_serde")] + pub secondary_warmup_timeout: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub secondary_download_request_timeout: Option, } #[derive(Serialize, Clone, Debug)] diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index dbd45da314..b88a2e46a1 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -1,10 +1,12 @@ use anyhow::{bail, Result}; use byteorder::{ByteOrder, BE}; +use bytes::Bytes; use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::Oid; use postgres_ffi::RepOriginId; use serde::{Deserialize, Serialize}; use std::{fmt, ops::Range}; +use utils::const_assert; use crate::reltag::{BlockNumber, RelTag, SlruKind}; @@ -49,6 +51,64 @@ pub const AUX_KEY_PREFIX: u8 = 0x62; /// The key prefix of ReplOrigin keys. pub const REPL_ORIGIN_KEY_PREFIX: u8 = 0x63; +/// The key prefix of db directory keys. +pub const DB_DIR_KEY_PREFIX: u8 = 0x64; + +/// The key prefix of rel directory keys. +pub const REL_DIR_KEY_PREFIX: u8 = 0x65; + +#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)] +pub enum RelDirExists { + Exists, + Removed, +} + +#[derive(Debug)] +pub struct DecodeError; + +impl fmt::Display for DecodeError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "invalid marker") + } +} + +impl std::error::Error for DecodeError {} + +impl RelDirExists { + /// The value of the rel directory keys that indicates the existence of a relation. 
+ const REL_EXISTS_MARKER: Bytes = Bytes::from_static(b"r"); + + pub fn encode(&self) -> Bytes { + match self { + Self::Exists => Self::REL_EXISTS_MARKER.clone(), + Self::Removed => SPARSE_TOMBSTONE_MARKER.clone(), + } + } + + pub fn decode_option(data: Option>) -> Result { + match data { + Some(marker) if marker.as_ref() == Self::REL_EXISTS_MARKER => Ok(Self::Exists), + // Any other marker is invalid + Some(_) => Err(DecodeError), + None => Ok(Self::Removed), + } + } + + pub fn decode(data: impl AsRef<[u8]>) -> Result { + let data = data.as_ref(); + if data == Self::REL_EXISTS_MARKER { + Ok(Self::Exists) + } else if data == SPARSE_TOMBSTONE_MARKER { + Ok(Self::Removed) + } else { + Err(DecodeError) + } + } +} + +/// A tombstone in the sparse keyspace, which is an empty buffer. +pub const SPARSE_TOMBSTONE_MARKER: Bytes = Bytes::from_static(b""); + /// Check if the key falls in the range of metadata keys. pub const fn is_metadata_key_slice(key: &[u8]) -> bool { key[0] >= METADATA_KEY_BEGIN_PREFIX && key[0] < METADATA_KEY_END_PREFIX @@ -110,6 +170,24 @@ impl Key { } } + pub fn rel_dir_sparse_key_range() -> Range { + Key { + field1: REL_DIR_KEY_PREFIX, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, + }..Key { + field1: REL_DIR_KEY_PREFIX + 1, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, + } + } + /// This function checks more extensively what keys we can take on the write path. /// If a key beginning with 00 does not have a global/default tablespace OID, it /// will be rejected on the write path. @@ -440,6 +518,36 @@ pub fn rel_dir_to_key(spcnode: Oid, dbnode: Oid) -> Key { } } +#[inline(always)] +pub fn rel_tag_sparse_key(spcnode: Oid, dbnode: Oid, relnode: Oid, forknum: u8) -> Key { + Key { + field1: REL_DIR_KEY_PREFIX, + field2: spcnode, + field3: dbnode, + field4: relnode, + field5: forknum, + field6: 1, + } +} + +pub fn rel_tag_sparse_key_range(spcnode: Oid, dbnode: Oid) -> Range { + Key { + field1: REL_DIR_KEY_PREFIX, + field2: spcnode, + field3: dbnode, + field4: 0, + field5: 0, + field6: 0, + }..Key { + field1: REL_DIR_KEY_PREFIX, + field2: spcnode, + field3: dbnode, + field4: u32::MAX, + field5: u8::MAX, + field6: u32::MAX, + } // it's fine to exclude the last key b/c we only use field6 == 1 +} + #[inline(always)] pub fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key { Key { @@ -734,9 +842,9 @@ impl Key { self.field1 == RELATION_SIZE_PREFIX } - pub fn sparse_non_inherited_keyspace() -> Range { + pub const fn sparse_non_inherited_keyspace() -> Range { // The two keys are adjacent; if we will have non-adjancent keys in the future, we should return a keyspace - debug_assert_eq!(AUX_KEY_PREFIX + 1, REPL_ORIGIN_KEY_PREFIX); + const_assert!(AUX_KEY_PREFIX + 1 == REPL_ORIGIN_KEY_PREFIX); Key { field1: AUX_KEY_PREFIX, field2: 0, diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 19beb37ab3..dd7bea2916 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -464,6 +464,10 @@ pub struct TenantConfigPatch { #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub compaction_algorithm: FieldPatch, #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub compaction_l0_first: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub compaction_l0_semaphore: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub l0_flush_delay_threshold: FieldPatch, #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub l0_flush_stall_threshold: FieldPatch, 
@@ -529,6 +533,8 @@ pub struct TenantConfig { pub compaction_upper_limit: Option, // defer parsing compaction_algorithm, like eviction_policy pub compaction_algorithm: Option, + pub compaction_l0_first: Option, + pub compaction_l0_semaphore: Option, pub l0_flush_delay_threshold: Option, pub l0_flush_stall_threshold: Option, pub l0_flush_wait_upload: Option, @@ -567,6 +573,8 @@ impl TenantConfig { mut compaction_threshold, mut compaction_upper_limit, mut compaction_algorithm, + mut compaction_l0_first, + mut compaction_l0_semaphore, mut l0_flush_delay_threshold, mut l0_flush_stall_threshold, mut l0_flush_wait_upload, @@ -606,6 +614,10 @@ impl TenantConfig { .compaction_upper_limit .apply(&mut compaction_upper_limit); patch.compaction_algorithm.apply(&mut compaction_algorithm); + patch.compaction_l0_first.apply(&mut compaction_l0_first); + patch + .compaction_l0_semaphore + .apply(&mut compaction_l0_semaphore); patch .l0_flush_delay_threshold .apply(&mut l0_flush_delay_threshold); @@ -669,6 +681,8 @@ impl TenantConfig { compaction_threshold, compaction_upper_limit, compaction_algorithm, + compaction_l0_first, + compaction_l0_semaphore, l0_flush_delay_threshold, l0_flush_stall_threshold, l0_flush_wait_upload, @@ -1066,8 +1080,7 @@ pub struct TenantInfo { /// Opaque explanation if gc is being blocked. /// - /// Only looked up for the individual tenant detail, not the listing. This is purely for - /// debugging, not included in openapi. + /// Only looked up for the individual tenant detail, not the listing. #[serde(skip_serializing_if = "Option::is_none")] pub gc_blocking: Option, } @@ -1122,7 +1135,26 @@ pub struct TimelineInfo { pub ancestor_lsn: Option, pub last_record_lsn: Lsn, pub prev_record_lsn: Option, + + /// Legacy field for compat with control plane. Synonym of `min_readable_lsn`. + /// TODO: remove once control plane no longer reads it. pub latest_gc_cutoff_lsn: Lsn, + + /// The LSN up to which GC has advanced: older data may still exist but it is not available for clients. + /// This LSN is not suitable for deciding where to create branches etc: use [`TimelineInfo::min_readable_lsn`] instead, + /// as it is easier to reason about. + #[serde(default)] + pub applied_gc_cutoff_lsn: Lsn, + + /// The upper bound of data which is either already GC'ed, or elegible to be GC'ed at any time based on PITR interval. + /// This LSN represents the "end of history" for this timeline, and callers should use it to figure out the oldest + /// LSN at which it is legal to create a branch or ephemeral endpoint. + /// + /// Note that holders of valid LSN leases may be able to create branches and read pages earlier + /// than this LSN, but new leases may not be taken out earlier than this LSN. 
+ #[serde(default)] + pub min_readable_lsn: Lsn, + pub disk_consistent_lsn: Lsn, /// The LSN that we have succesfully uploaded to remote storage diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index 8c024375c1..f74b229ac4 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -9,6 +9,8 @@ use bytes::Bytes; use serde::{Deserialize, Serialize}; use std::io::ErrorKind; use std::net::SocketAddr; +use std::os::fd::AsRawFd; +use std::os::fd::RawFd; use std::pin::Pin; use std::sync::Arc; use std::task::{ready, Poll}; @@ -268,6 +270,7 @@ impl MaybeWriteOnly { } pub struct PostgresBackend { + pub socket_fd: RawFd, framed: MaybeWriteOnly, pub state: ProtoState, @@ -293,9 +296,11 @@ impl PostgresBackend { tls_config: Option>, ) -> io::Result { let peer_addr = socket.peer_addr()?; + let socket_fd = socket.as_raw_fd(); let stream = MaybeTlsStream::Unencrypted(socket); Ok(Self { + socket_fd, framed: MaybeWriteOnly::Full(Framed::new(stream)), state: ProtoState::Initialization, auth_type, @@ -307,6 +312,7 @@ impl PostgresBackend { impl PostgresBackend { pub fn new_from_io( + socket_fd: RawFd, socket: IO, peer_addr: SocketAddr, auth_type: AuthType, @@ -315,6 +321,7 @@ impl PostgresBackend { let stream = MaybeTlsStream::Unencrypted(socket); Ok(Self { + socket_fd, framed: MaybeWriteOnly::Full(Framed::new(stream)), state: ProtoState::Initialization, auth_type, diff --git a/libs/proxy/tokio-postgres2/src/client.rs b/libs/proxy/tokio-postgres2/src/client.rs index 9bbbd4c260..46151ab924 100644 --- a/libs/proxy/tokio-postgres2/src/client.rs +++ b/libs/proxy/tokio-postgres2/src/client.rs @@ -10,8 +10,8 @@ use crate::simple_query::SimpleQueryStream; use crate::types::{Oid, ToSql, Type}; use crate::{ - prepare, query, simple_query, slice_iter, CancelToken, Error, ReadyForQueryStatus, Row, - SimpleQueryMessage, Statement, ToStatement, Transaction, TransactionBuilder, + query, simple_query, slice_iter, CancelToken, Error, ReadyForQueryStatus, Row, + SimpleQueryMessage, Statement, Transaction, TransactionBuilder, }; use bytes::BytesMut; use fallible_iterator::FallibleIterator; @@ -54,18 +54,18 @@ impl Responses { } /// A cache of type info and prepared statements for fetching type info -/// (corresponding to the queries in the [prepare] module). +/// (corresponding to the queries in the [crate::prepare] module). #[derive(Default)] struct CachedTypeInfo { /// A statement for basic information for a type from its - /// OID. Corresponds to [TYPEINFO_QUERY](prepare::TYPEINFO_QUERY) (or its + /// OID. Corresponds to [TYPEINFO_QUERY](crate::prepare::TYPEINFO_QUERY) (or its /// fallback). typeinfo: Option, /// A statement for getting information for a composite type from its OID. - /// Corresponds to [TYPEINFO_QUERY](prepare::TYPEINFO_COMPOSITE_QUERY). + /// Corresponds to [TYPEINFO_QUERY](crate::prepare::TYPEINFO_COMPOSITE_QUERY). typeinfo_composite: Option, /// A statement for getting information for a composite type from its OID. - /// Corresponds to [TYPEINFO_QUERY](prepare::TYPEINFO_COMPOSITE_QUERY) (or + /// Corresponds to [TYPEINFO_QUERY](crate::prepare::TYPEINFO_COMPOSITE_QUERY) (or /// its fallback). typeinfo_enum: Option, @@ -190,26 +190,6 @@ impl Client { &self.inner } - /// Creates a new prepared statement. - /// - /// Prepared statements can be executed repeatedly, and may contain query parameters (indicated by `$1`, `$2`, etc), - /// which are set when executed. Prepared statements can only be used with the connection that created them. 
- pub async fn prepare(&self, query: &str) -> Result { - self.prepare_typed(query, &[]).await - } - - /// Like `prepare`, but allows the types of query parameters to be explicitly specified. - /// - /// The list of types may be smaller than the number of parameters - the types of the remaining parameters will be - /// inferred. For example, `client.prepare_typed(query, &[])` is equivalent to `client.prepare(query)`. - pub async fn prepare_typed( - &self, - query: &str, - parameter_types: &[Type], - ) -> Result { - prepare::prepare(&self.inner, query, parameter_types).await - } - /// Executes a statement, returning a vector of the resulting rows. /// /// A statement may contain parameters, specified by `$n`, where `n` is the index of the parameter of the list @@ -222,14 +202,11 @@ impl Client { /// # Panics /// /// Panics if the number of parameters provided does not match the number expected. - pub async fn query( + pub async fn query( &self, - statement: &T, + statement: Statement, params: &[&(dyn ToSql + Sync)], - ) -> Result, Error> - where - T: ?Sized + ToStatement, - { + ) -> Result, Error> { self.query_raw(statement, slice_iter(params)) .await? .try_collect() @@ -250,13 +227,15 @@ impl Client { /// Panics if the number of parameters provided does not match the number expected. /// /// [`query`]: #method.query - pub async fn query_raw<'a, T, I>(&self, statement: &T, params: I) -> Result + pub async fn query_raw<'a, I>( + &self, + statement: Statement, + params: I, + ) -> Result where - T: ?Sized + ToStatement, I: IntoIterator, I::IntoIter: ExactSizeIterator, { - let statement = statement.__convert().into_statement(self).await?; query::query(&self.inner, statement, params).await } @@ -271,55 +250,6 @@ impl Client { query::query_txt(&self.inner, statement, params).await } - /// Executes a statement, returning the number of rows modified. - /// - /// A statement may contain parameters, specified by `$n`, where `n` is the index of the parameter of the list - /// provided, 1-indexed. - /// - /// The `statement` argument can either be a `Statement`, or a raw query string. If the same statement will be - /// repeatedly executed (perhaps with different query parameters), consider preparing the statement up front - /// with the `prepare` method. - /// - /// If the statement does not modify any rows (e.g. `SELECT`), 0 is returned. - /// - /// # Panics - /// - /// Panics if the number of parameters provided does not match the number expected. - pub async fn execute( - &self, - statement: &T, - params: &[&(dyn ToSql + Sync)], - ) -> Result - where - T: ?Sized + ToStatement, - { - self.execute_raw(statement, slice_iter(params)).await - } - - /// The maximally flexible version of [`execute`]. - /// - /// A statement may contain parameters, specified by `$n`, where `n` is the index of the parameter of the list - /// provided, 1-indexed. - /// - /// The `statement` argument can either be a `Statement`, or a raw query string. If the same statement will be - /// repeatedly executed (perhaps with different query parameters), consider preparing the statement up front - /// with the `prepare` method. - /// - /// # Panics - /// - /// Panics if the number of parameters provided does not match the number expected. 
- /// - /// [`execute`]: #method.execute - pub async fn execute_raw<'a, T, I>(&self, statement: &T, params: I) -> Result - where - T: ?Sized + ToStatement, - I: IntoIterator, - I::IntoIter: ExactSizeIterator, - { - let statement = statement.__convert().into_statement(self).await?; - query::execute(self.inner(), statement, params).await - } - /// Executes a sequence of SQL statements using the simple query protocol, returning the resulting rows. /// /// Statements should be separated by semicolons. If an error occurs, execution of the sequence will stop at that diff --git a/libs/proxy/tokio-postgres2/src/generic_client.rs b/libs/proxy/tokio-postgres2/src/generic_client.rs index 768213f8ed..042b5a675e 100644 --- a/libs/proxy/tokio-postgres2/src/generic_client.rs +++ b/libs/proxy/tokio-postgres2/src/generic_client.rs @@ -1,7 +1,8 @@ +#![allow(async_fn_in_trait)] + use crate::query::RowStream; use crate::types::Type; use crate::{Client, Error, Transaction}; -use async_trait::async_trait; use postgres_protocol2::Oid; mod private { @@ -11,7 +12,6 @@ mod private { /// A trait allowing abstraction over connections and transactions. /// /// This trait is "sealed", and cannot be implemented outside of this crate. -#[async_trait] pub trait GenericClient: private::Sealed { /// Like `Client::query_raw_txt`. async fn query_raw_txt(&self, statement: &str, params: I) -> Result @@ -26,7 +26,6 @@ pub trait GenericClient: private::Sealed { impl private::Sealed for Client {} -#[async_trait] impl GenericClient for Client { async fn query_raw_txt(&self, statement: &str, params: I) -> Result where @@ -39,14 +38,12 @@ impl GenericClient for Client { /// Query for type information async fn get_type(&self, oid: Oid) -> Result { - self.get_type(oid).await + crate::prepare::get_type(self.inner(), oid).await } } impl private::Sealed for Transaction<'_> {} -#[async_trait] -#[allow(clippy::needless_lifetimes)] impl GenericClient for Transaction<'_> { async fn query_raw_txt(&self, statement: &str, params: I) -> Result where diff --git a/libs/proxy/tokio-postgres2/src/lib.rs b/libs/proxy/tokio-postgres2/src/lib.rs index 9155dd8279..7426279167 100644 --- a/libs/proxy/tokio-postgres2/src/lib.rs +++ b/libs/proxy/tokio-postgres2/src/lib.rs @@ -14,7 +14,6 @@ pub use crate::row::{Row, SimpleQueryRow}; pub use crate::simple_query::SimpleQueryStream; pub use crate::statement::{Column, Statement}; pub use crate::tls::NoTls; -pub use crate::to_statement::ToStatement; pub use crate::transaction::Transaction; pub use crate::transaction_builder::{IsolationLevel, TransactionBuilder}; use crate::types::ToSql; @@ -65,7 +64,6 @@ pub mod row; mod simple_query; mod statement; pub mod tls; -mod to_statement; mod transaction; mod transaction_builder; pub mod types; diff --git a/libs/proxy/tokio-postgres2/src/prepare.rs b/libs/proxy/tokio-postgres2/src/prepare.rs index da0c755c5b..58bbb26cbc 100644 --- a/libs/proxy/tokio-postgres2/src/prepare.rs +++ b/libs/proxy/tokio-postgres2/src/prepare.rs @@ -1,7 +1,6 @@ use crate::client::InnerClient; use crate::codec::FrontendMessage; use crate::connection::RequestMessages; -use crate::error::SqlState; use crate::types::{Field, Kind, Oid, Type}; use crate::{query, slice_iter}; use crate::{Column, Error, Statement}; @@ -13,7 +12,6 @@ use postgres_protocol2::message::backend::Message; use postgres_protocol2::message::frontend; use std::future::Future; use std::pin::Pin; -use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Arc; pub(crate) const TYPEINFO_QUERY: &str = "\ @@ -24,14 +22,6 @@ INNER 
JOIN pg_catalog.pg_namespace n ON t.typnamespace = n.oid WHERE t.oid = $1 "; -// Range types weren't added until Postgres 9.2, so pg_range may not exist -const TYPEINFO_FALLBACK_QUERY: &str = "\ -SELECT t.typname, t.typtype, t.typelem, NULL::OID, t.typbasetype, n.nspname, t.typrelid -FROM pg_catalog.pg_type t -INNER JOIN pg_catalog.pg_namespace n ON t.typnamespace = n.oid -WHERE t.oid = $1 -"; - const TYPEINFO_ENUM_QUERY: &str = "\ SELECT enumlabel FROM pg_catalog.pg_enum @@ -39,14 +29,6 @@ WHERE enumtypid = $1 ORDER BY enumsortorder "; -// Postgres 9.0 didn't have enumsortorder -const TYPEINFO_ENUM_FALLBACK_QUERY: &str = "\ -SELECT enumlabel -FROM pg_catalog.pg_enum -WHERE enumtypid = $1 -ORDER BY oid -"; - pub(crate) const TYPEINFO_COMPOSITE_QUERY: &str = "\ SELECT attname, atttypid FROM pg_catalog.pg_attribute @@ -56,15 +38,13 @@ AND attnum > 0 ORDER BY attnum "; -static NEXT_ID: AtomicUsize = AtomicUsize::new(0); - pub async fn prepare( client: &Arc, + name: &'static str, query: &str, types: &[Type], ) -> Result { - let name = format!("s{}", NEXT_ID.fetch_add(1, Ordering::SeqCst)); - let buf = encode(client, &name, query, types)?; + let buf = encode(client, name, query, types)?; let mut responses = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)))?; match responses.next().await? { @@ -105,10 +85,11 @@ pub async fn prepare( fn prepare_rec<'a>( client: &'a Arc, + name: &'static str, query: &'a str, types: &'a [Type], ) -> Pin> + 'a + Send>> { - Box::pin(prepare(client, query, types)) + Box::pin(prepare(client, name, query, types)) } fn encode(client: &InnerClient, name: &str, query: &str, types: &[Type]) -> Result { @@ -192,13 +173,8 @@ async fn typeinfo_statement(client: &Arc) -> Result stmt, - Err(ref e) if e.code() == Some(&SqlState::UNDEFINED_TABLE) => { - prepare_rec(client, TYPEINFO_FALLBACK_QUERY, &[]).await? - } - Err(e) => return Err(e), - }; + let typeinfo = "neon_proxy_typeinfo"; + let stmt = prepare_rec(client, typeinfo, TYPEINFO_QUERY, &[]).await?; client.set_typeinfo(&stmt); Ok(stmt) @@ -219,13 +195,8 @@ async fn typeinfo_enum_statement(client: &Arc) -> Result stmt, - Err(ref e) if e.code() == Some(&SqlState::UNDEFINED_COLUMN) => { - prepare_rec(client, TYPEINFO_ENUM_FALLBACK_QUERY, &[]).await? - } - Err(e) => return Err(e), - }; + let typeinfo = "neon_proxy_typeinfo_enum"; + let stmt = prepare_rec(client, typeinfo, TYPEINFO_ENUM_QUERY, &[]).await?; client.set_typeinfo_enum(&stmt); Ok(stmt) @@ -255,7 +226,8 @@ async fn typeinfo_composite_statement(client: &Arc) -> Result( - client: &InnerClient, - statement: Statement, - params: I, -) -> Result -where - I: IntoIterator, - I::IntoIter: ExactSizeIterator, -{ - let buf = if log_enabled!(Level::Debug) { - let params = params.into_iter().collect::>(); - debug!( - "executing statement {} with parameters: {:?}", - statement.name(), - BorrowToSqlParamsDebug(params.as_slice()), - ); - encode(client, &statement, params)? - } else { - encode(client, &statement, params)? - }; - let mut responses = start(client, buf).await?; - - let mut rows = 0; - loop { - match responses.next().await? { - Message::DataRow(_) => {} - Message::CommandComplete(body) => { - rows = body - .tag() - .map_err(Error::parse)? 
- .rsplit(' ') - .next() - .unwrap() - .parse() - .unwrap_or(0); - } - Message::EmptyQueryResponse => rows = 0, - Message::ReadyForQuery(_) => return Ok(rows), - _ => return Err(Error::unexpected_message()), - } - } -} - async fn start(client: &InnerClient, buf: Bytes) -> Result { let mut responses = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)))?; diff --git a/libs/proxy/tokio-postgres2/src/statement.rs b/libs/proxy/tokio-postgres2/src/statement.rs index 22e160fc05..591872fbc5 100644 --- a/libs/proxy/tokio-postgres2/src/statement.rs +++ b/libs/proxy/tokio-postgres2/src/statement.rs @@ -13,7 +13,7 @@ use std::{ struct StatementInner { client: Weak, - name: String, + name: &'static str, params: Vec, columns: Vec, } @@ -22,7 +22,7 @@ impl Drop for StatementInner { fn drop(&mut self) { if let Some(client) = self.client.upgrade() { let buf = client.with_buf(|buf| { - frontend::close(b'S', &self.name, buf).unwrap(); + frontend::close(b'S', self.name, buf).unwrap(); frontend::sync(buf); buf.split().freeze() }); @@ -40,7 +40,7 @@ pub struct Statement(Arc); impl Statement { pub(crate) fn new( inner: &Arc, - name: String, + name: &'static str, params: Vec, columns: Vec, ) -> Statement { @@ -55,14 +55,14 @@ impl Statement { pub(crate) fn new_anonymous(params: Vec, columns: Vec) -> Statement { Statement(Arc::new(StatementInner { client: Weak::new(), - name: String::new(), + name: "", params, columns, })) } pub(crate) fn name(&self) -> &str { - &self.0.name + self.0.name } /// Returns the expected types of the statement's parameters. diff --git a/libs/proxy/tokio-postgres2/src/to_statement.rs b/libs/proxy/tokio-postgres2/src/to_statement.rs deleted file mode 100644 index 7e12992728..0000000000 --- a/libs/proxy/tokio-postgres2/src/to_statement.rs +++ /dev/null @@ -1,57 +0,0 @@ -use crate::to_statement::private::{Sealed, ToStatementType}; -use crate::Statement; - -mod private { - use crate::{Client, Error, Statement}; - - pub trait Sealed {} - - pub enum ToStatementType<'a> { - Statement(&'a Statement), - Query(&'a str), - } - - impl ToStatementType<'_> { - pub async fn into_statement(self, client: &Client) -> Result { - match self { - ToStatementType::Statement(s) => Ok(s.clone()), - ToStatementType::Query(s) => client.prepare(s).await, - } - } - } -} - -/// A trait abstracting over prepared and unprepared statements. -/// -/// Many methods are generic over this bound, so that they support both a raw query string as well as a statement which -/// was prepared previously. -/// -/// This trait is "sealed" and cannot be implemented by anything outside this crate. -pub trait ToStatement: Sealed { - #[doc(hidden)] - fn __convert(&self) -> ToStatementType<'_>; -} - -impl ToStatement for Statement { - fn __convert(&self) -> ToStatementType<'_> { - ToStatementType::Statement(self) - } -} - -impl Sealed for Statement {} - -impl ToStatement for str { - fn __convert(&self) -> ToStatementType<'_> { - ToStatementType::Query(self) - } -} - -impl Sealed for str {} - -impl ToStatement for String { - fn __convert(&self) -> ToStatementType<'_> { - ToStatementType::Query(self) - } -} - -impl Sealed for String {} diff --git a/libs/safekeeper_api/src/membership.rs b/libs/safekeeper_api/src/membership.rs index a39fda526f..8b14a4f290 100644 --- a/libs/safekeeper_api/src/membership.rs +++ b/libs/safekeeper_api/src/membership.rs @@ -9,13 +9,43 @@ use anyhow::bail; use serde::{Deserialize, Serialize}; use utils::id::NodeId; -/// Number uniquely identifying safekeeper configuration. 
-/// Note: it is a part of sk control file. -pub type Generation = u32; /// 1 is the first valid generation, 0 is used as /// a placeholder before we fully migrate to generations. -pub const INVALID_GENERATION: Generation = 0; -pub const INITIAL_GENERATION: Generation = 1; +pub const INVALID_GENERATION: SafekeeperGeneration = SafekeeperGeneration::new(0); +pub const INITIAL_GENERATION: SafekeeperGeneration = SafekeeperGeneration::new(1); + +/// Number uniquely identifying safekeeper configuration. +/// Note: it is a part of sk control file. +/// +/// Like tenant generations, but for safekeepers. +#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)] +pub struct SafekeeperGeneration(u32); + +impl SafekeeperGeneration { + pub const fn new(v: u32) -> Self { + Self(v) + } + + #[track_caller] + pub fn previous(&self) -> Option { + Some(Self(self.0.checked_sub(1)?)) + } + + #[track_caller] + pub fn next(&self) -> Self { + Self(self.0 + 1) + } + + pub fn into_inner(self) -> u32 { + self.0 + } +} + +impl Display for SafekeeperGeneration { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.0) + } +} /// Membership is defined by ids so e.g. walproposer uses them to figure out /// quorums, but we also carry host and port to give wp idea where to connect. @@ -89,7 +119,7 @@ impl Display for MemberSet { #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] pub struct Configuration { /// Unique id. - pub generation: Generation, + pub generation: SafekeeperGeneration, /// Current members of the configuration. pub members: MemberSet, /// Some means it is a joint conf. diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs index 30418b0efd..41ccdaa428 100644 --- a/libs/safekeeper_api/src/models.rs +++ b/libs/safekeeper_api/src/models.rs @@ -282,3 +282,18 @@ pub struct TimelineTermBumpResponse { pub struct SafekeeperUtilization { pub timeline_count: u64, } + +/// pull_timeline request body. +#[derive(Debug, Deserialize, Serialize)] +pub struct PullTimelineRequest { + pub tenant_id: TenantId, + pub timeline_id: TimelineId, + pub http_hosts: Vec, +} + +#[derive(Debug, Serialize, Deserialize)] +pub struct PullTimelineResponse { + // Donor safekeeper host + pub safekeeper_host: String, + // TODO: add more fields? 
+} diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index edb451a02c..e9611a0f12 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -21,23 +21,17 @@ bytes.workspace = true camino.workspace = true chrono.workspace = true diatomic-waker.workspace = true -flate2.workspace = true git-version.workspace = true hex = { workspace = true, features = ["serde"] } humantime.workspace = true -hyper0 = { workspace = true, features = ["full"] } inferno.workspace = true -itertools.workspace = true fail.workspace = true futures = { workspace = true } -jemalloc_pprof.workspace = true jsonwebtoken.workspace = true -nix.workspace = true +nix = {workspace = true, features = [ "ioctl" ] } once_cell.workspace = true pin-project-lite.workspace = true -pprof.workspace = true regex.workspace = true -routerify.workspace = true serde.workspace = true serde_with.workspace = true serde_json.workspace = true @@ -54,8 +48,6 @@ rand.workspace = true scopeguard.workspace = true strum.workspace = true strum_macros.workspace = true -url.workspace = true -uuid.workspace = true walkdir.workspace = true pq_proto.workspace = true @@ -64,12 +56,6 @@ metrics.workspace = true const_format.workspace = true -# to use tokio channels as streams, this is faster to compile than async_stream -# why is it only here? no other crate should use it, streams are rarely needed. -tokio-stream = { version = "0.1.14" } - -serde_path_to_error.workspace = true - [dev-dependencies] byteorder.workspace = true bytes.workspace = true diff --git a/libs/utils/src/auth.rs b/libs/utils/src/auth.rs index f7acc61ac1..4bfd0ab055 100644 --- a/libs/utils/src/auth.rs +++ b/libs/utils/src/auth.rs @@ -10,7 +10,7 @@ use jsonwebtoken::{ }; use serde::{Deserialize, Serialize}; -use crate::{http::error::ApiError, id::TenantId}; +use crate::id::TenantId; /// Algorithm to use. We require EdDSA. const STORAGE_TOKEN_ALGORITHM: Algorithm = Algorithm::EdDSA; @@ -90,15 +90,6 @@ impl Display for AuthError { } } -impl From for ApiError { - fn from(_value: AuthError) -> Self { - // Don't pass on the value of the AuthError as a precautionary measure. - // Being intentionally vague in public error communication hurts debugability - // but it is more secure. 
- ApiError::Forbidden("JWT authentication error".to_string()) - } -} - pub struct JwtAuth { decoding_keys: Vec, validation: Validation, diff --git a/libs/utils/src/backoff.rs b/libs/utils/src/backoff.rs index 096c7e5854..e6503fe377 100644 --- a/libs/utils/src/backoff.rs +++ b/libs/utils/src/backoff.rs @@ -1,4 +1,5 @@ use std::fmt::{Debug, Display}; +use std::time::Duration; use futures::Future; use tokio_util::sync::CancellationToken; @@ -29,6 +30,11 @@ pub async fn exponential_backoff( } } +pub fn exponential_backoff_duration(n: u32, base_increment: f64, max_seconds: f64) -> Duration { + let seconds = exponential_backoff_duration_seconds(n, base_increment, max_seconds); + Duration::from_secs_f64(seconds) +} + pub fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 { if n == 0 { 0.0 diff --git a/libs/utils/src/bin_ser.rs b/libs/utils/src/bin_ser.rs index 42b45eeea0..4d173d0726 100644 --- a/libs/utils/src/bin_ser.rs +++ b/libs/utils/src/bin_ser.rs @@ -286,6 +286,11 @@ mod tests { const SHORT2_ENC_LE: &[u8] = &[8, 0, 0, 3, 7]; const SHORT2_ENC_LE_TRAILING: &[u8] = &[8, 0, 0, 3, 7, 0xff, 0xff, 0xff]; + #[derive(Debug, PartialEq, Eq, Serialize, Deserialize)] + struct NewTypeStruct(u32); + const NT1: NewTypeStruct = NewTypeStruct(414243); + const NT1_INNER: u32 = 414243; + #[derive(Debug, PartialEq, Eq, Serialize, Deserialize)] pub struct LongMsg { pub tag: u8, @@ -408,4 +413,42 @@ mod tests { let msg2 = LongMsg::des(&encoded).unwrap(); assert_eq!(msg, msg2); } + + #[test] + /// Ensure that newtype wrappers around u32 don't change the serialization format + fn be_nt() { + use super::BeSer; + + assert_eq!(NT1.serialized_size().unwrap(), 4); + + let msg = NT1; + + let encoded = msg.ser().unwrap(); + let expected = hex_literal::hex!("0006 5223"); + assert_eq!(encoded, expected); + + assert_eq!(encoded, NT1_INNER.ser().unwrap()); + + let msg2 = NewTypeStruct::des(&encoded).unwrap(); + assert_eq!(msg, msg2); + } + + #[test] + /// Ensure that newtype wrappers around u32 don't change the serialization format + fn le_nt() { + use super::LeSer; + + assert_eq!(NT1.serialized_size().unwrap(), 4); + + let msg = NT1; + + let encoded = msg.ser().unwrap(); + let expected = hex_literal::hex!("2352 0600"); + assert_eq!(encoded, expected); + + assert_eq!(encoded, NT1_INNER.ser().unwrap()); + + let msg2 = NewTypeStruct::des(&encoded).unwrap(); + assert_eq!(msg, msg2); + } } diff --git a/libs/utils/src/failpoint_support.rs b/libs/utils/src/failpoint_support.rs index 272c6ebb26..fc998ad9a9 100644 --- a/libs/utils/src/failpoint_support.rs +++ b/libs/utils/src/failpoint_support.rs @@ -1,13 +1,6 @@ //! Failpoint support code shared between pageserver and safekeepers. -use crate::http::{ - error::ApiError, - json::{json_request, json_response}, -}; -use hyper::{Body, Request, Response, StatusCode}; -use serde::{Deserialize, Serialize}; use tokio_util::sync::CancellationToken; -use tracing::*; /// Declare a failpoint that can use to `pause` failpoint action. /// We don't want to block the executor thread, hence, spawn_blocking + await. 
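For reference, the move of `failpoints_handler` into `http-utils` (see the new `libs/http-utils/src/failpoints.rs` above) does not change the request body it accepts: a JSON array of `FailpointConfig` objects. A minimal sketch of building such a body with `serde_json` follows; the failpoint names here are made up for illustration:

```
use serde_json::json;

fn main() {
    // Body for the failpoint-configuration endpoint. "exit" is the one extra
    // action the handler recognizes on top of the `fail` crate's native ones.
    let body = json!([
        { "name": "some-failpoint", "actions": "return" },
        { "name": "another-failpoint", "actions": "exit" },
    ]);
    println!("{body}");
}
```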
@@ -184,45 +177,3 @@ fn exit_failpoint() {
     tracing::info!("Exit requested by failpoint");
     std::process::exit(1);
 }
-
-pub type ConfigureFailpointsRequest = Vec<FailpointConfig>;
-
-/// Information for configuring a single fail point
-#[derive(Debug, Serialize, Deserialize)]
-pub struct FailpointConfig {
-    /// Name of the fail point
-    pub name: String,
-    /// List of actions to take, using the format described in `fail::cfg`
-    ///
-    /// We also support `actions = "exit"` to cause the fail point to immediately exit.
-    pub actions: String,
-}
-
-/// Configure failpoints through http.
-pub async fn failpoints_handler(
-    mut request: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    if !fail::has_failpoints() {
-        return Err(ApiError::BadRequest(anyhow::anyhow!(
-            "Cannot manage failpoints because neon was compiled without failpoints support"
-        )));
-    }
-
-    let failpoints: ConfigureFailpointsRequest = json_request(&mut request).await?;
-    for fp in failpoints {
-        info!("cfg failpoint: {} {}", fp.name, fp.actions);
-
-        // We recognize one extra "action" that's not natively recognized
-        // by the failpoints crate: exit, to immediately kill the process
-        let cfg_result = apply_failpoint(&fp.name, &fp.actions);
-
-        if let Err(err_msg) = cfg_result {
-            return Err(ApiError::BadRequest(anyhow::anyhow!(
-                "Failed to configure failpoints: {err_msg}"
-            )));
-        }
-    }
-
-    json_response(StatusCode::OK, ())
-}

diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs
index 1fb18e9e9a..9389a27bf3 100644
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -2,8 +2,6 @@
 //! between other crates in this repository.
 #![deny(clippy::undocumented_unsafe_blocks)]

-extern crate hyper0 as hyper;
-
 pub mod backoff;

 /// `Lsn` type implements common tasks on Log Sequence Numbers
@@ -33,9 +31,6 @@ pub mod shard;
 mod hex;
 pub use hex::Hex;

-// http endpoint utils
-pub mod http;
-
 // definition of the Generation type for pageserver attachment APIs
 pub mod generation;
@@ -96,10 +91,11 @@ pub mod circuit_breaker;

 pub mod try_rcu;

-pub mod pprof;
-
 pub mod guard_arc_swap;

+#[cfg(target_os = "linux")]
+pub mod linux_socket_ioctl;
+
 // Re-export used in macro. Avoids adding git-version as dep in target crates.
 #[doc(hidden)]
 pub use git_version;

diff --git a/libs/utils/src/linux_socket_ioctl.rs b/libs/utils/src/linux_socket_ioctl.rs
new file mode 100644
index 0000000000..5ae0e86af8
--- /dev/null
+++ b/libs/utils/src/linux_socket_ioctl.rs
@@ -0,0 +1,35 @@
+//! Linux-specific socket ioctls.
+//!
+//!
+
+use std::{
+    io,
+    mem::MaybeUninit,
+    os::{fd::RawFd, raw::c_int},
+};
+
+use nix::libc::{FIONREAD, TIOCOUTQ};
+
+unsafe fn do_ioctl(socket_fd: RawFd, cmd: nix::libc::Ioctl) -> io::Result<c_int> {
+    let mut inq: MaybeUninit<c_int> = MaybeUninit::uninit();
+    let err = nix::libc::ioctl(socket_fd, cmd, inq.as_mut_ptr());
+    if err == 0 {
+        Ok(inq.assume_init())
+    } else {
+        Err(io::Error::last_os_error())
+    }
+}
+
+/// # Safety
+///
+/// Caller must ensure that `socket_fd` is a valid TCP socket file descriptor.
+pub unsafe fn inq(socket_fd: RawFd) -> io::Result<c_int> {
+    do_ioctl(socket_fd, FIONREAD)
+}
+
+/// # Safety
+///
+/// Caller must ensure that `socket_fd` is a valid TCP socket file descriptor.
+pub unsafe fn outq(socket_fd: RawFd) -> io::Result<c_int> {
+    do_ioctl(socket_fd, TIOCOUTQ)
+}

diff --git a/libs/utils/src/shard.rs b/libs/utils/src/shard.rs
index 6352ea9f92..d98284f969 100644
--- a/libs/utils/src/shard.rs
+++ b/libs/utils/src/shard.rs
@@ -117,6 +117,10 @@ impl TenantShardId {
         )
     }

+    pub fn range(&self) -> RangeInclusive<TenantShardId> {
+        RangeInclusive::new(*self, *self)
+    }
+
     pub fn shard_slug(&self) -> impl std::fmt::Display + '_ {
         ShardSlug(self)
     }

diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml
index 6e4eaa0efd..41ac3b69b8 100644
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -79,6 +79,7 @@ pq_proto.workspace = true
 remote_storage.workspace = true
 storage_broker.workspace = true
 tenant_size_model.workspace = true
+http-utils.workspace = true
 utils.workspace = true
 workspace_hack.workspace = true
 reqwest.workspace = true

diff --git a/pageserver/client/Cargo.toml b/pageserver/client/Cargo.toml
index f582d307a7..db77a395e0 100644
--- a/pageserver/client/Cargo.toml
+++ b/pageserver/client/Cargo.toml
@@ -11,6 +11,7 @@ testing = [ "pageserver_api/testing" ]
 pageserver_api.workspace = true
 thiserror.workspace = true
 reqwest = { workspace = true, features = [ "stream" ] }
+http-utils.workspace = true
 utils.workspace = true
 serde.workspace = true
 workspace_hack = { version = "0.1", path = "../../workspace_hack" }

diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs
index 0359bfcd0b..bb0f64ca32 100644
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -1,11 +1,12 @@
 use std::{collections::HashMap, error::Error as _};

 use bytes::Bytes;
-use detach_ancestor::AncestorDetached;
-use pageserver_api::{models::*, shard::TenantShardId};
 use reqwest::{IntoUrl, Method, StatusCode};
+
+use detach_ancestor::AncestorDetached;
+use http_utils::error::HttpErrorBody;
+use pageserver_api::{models::*, shard::TenantShardId};
 use utils::{
-    http::error::HttpErrorBody,
     id::{TenantId, TimelineId},
     lsn::Lsn,
 };
@@ -476,6 +477,26 @@ impl Client {
         self.request(Method::POST, &uri, ()).await.map(|_| ())
     }

+    pub async fn timeline_download_heatmap_layers(
+        &self,
+        tenant_shard_id: TenantShardId,
+        timeline_id: TimelineId,
+        concurrency: Option<usize>,
+    ) -> Result<()> {
+        let mut path = reqwest::Url::parse(&format!(
+            "{}/v1/tenant/{}/timeline/{}/download_heatmap_layers",
+            self.mgmt_api_endpoint, tenant_shard_id, timeline_id
+        ))
+        .expect("Cannot build URL");
+
+        if let Some(concurrency) = concurrency {
+            path.query_pairs_mut()
+                .append_pair("concurrency", &format!("{}", concurrency));
+        }
+
+        self.request(Method::POST, path, ()).await.map(|_| ())
+    }
+
     pub async fn tenant_reset(&self, tenant_shard_id: TenantShardId) -> Result<()> {
         let uri = format!(
             "{}/v1/tenant/{}/reset",

diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs
index a6087920fd..e03b1bbe96 100644
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -13,7 +13,7 @@ use anyhow::{anyhow, Context};
 use bytes::{BufMut, Bytes, BytesMut};
 use fail::fail_point;

-use pageserver_api::key::Key;
+use pageserver_api::key::{rel_block_to_key, Key};
 use postgres_ffi::pg_constants;
 use std::fmt::Write as FmtWrite;
 use std::time::{Instant, SystemTime};
@@ -42,8 +42,8 @@ use utils::lsn::Lsn;
 pub enum BasebackupError {
     #[error("basebackup pageserver error {0:#}")]
     Server(#[from] anyhow::Error),
-    #[error("basebackup client error {0:#}")]
-    Client(#[source] io::Error),
+    #[error("basebackup client error {0:#} when {1}")]
+    Client(#[source] io::Error, &'static
str), } /// Create basebackup with non-rel data in it. @@ -234,7 +234,7 @@ where self.ar .append(&header, self.buf.as_slice()) .await - .map_err(BasebackupError::Client)?; + .map_err(|e| BasebackupError::Client(e, "flush"))?; self.total_blocks += nblocks; debug!("Added to basebackup slru {} relsize {}", segname, nblocks); @@ -273,9 +273,9 @@ where for dir in subdirs.iter() { let header = new_tar_header_dir(dir)?; self.ar - .append(&header, &mut io::empty()) + .append(&header, io::empty()) .await - .context("could not add directory to basebackup tarball")?; + .map_err(|e| BasebackupError::Client(e, "send_tarball"))?; } // Send config files. @@ -286,13 +286,13 @@ where self.ar .append(&header, data) .await - .context("could not add config file to basebackup tarball")?; + .map_err(|e| BasebackupError::Client(e, "send_tarball,pg_hba.conf"))?; } else { let header = new_tar_header(filepath, 0)?; self.ar - .append(&header, &mut io::empty()) + .append(&header, io::empty()) .await - .context("could not add config file to basebackup tarball")?; + .map_err(|e| BasebackupError::Client(e, "send_tarball,add_config_file"))?; } } if !lazy_slru_download { @@ -406,7 +406,7 @@ where self.ar .append(&header, &*content) .await - .context("could not add aux file to basebackup tarball")?; + .map_err(|e| BasebackupError::Client(e, "send_tarball,add_aux_file"))?; } if min_restart_lsn != Lsn::MAX { @@ -419,7 +419,7 @@ where self.ar .append(&header, &data[..]) .await - .context("could not add restart.lsn file to basebackup tarball")?; + .map_err(|e| BasebackupError::Client(e, "send_tarball,restart.lsn"))?; } for xid in self .timeline @@ -451,9 +451,9 @@ where let crc32 = crc32c::crc32c(&content); content.extend_from_slice(&crc32.to_le_bytes()); let header = new_tar_header("pg_logical/replorigin_checkpoint", content.len() as u64)?; - self.ar.append(&header, &*content).await.context( - "could not add pg_logical/replorigin_checkpoint file to basebackup tarball", - )?; + self.ar.append(&header, &*content).await.map_err(|e| { + BasebackupError::Client(e, "send_tarball,pg_logical/replorigin_checkpoint") + })?; } fail_point!("basebackup-before-control-file", |_| { @@ -464,7 +464,10 @@ where // Generate pg_control and bootstrap WAL segment. self.add_pgcontrol_file().await?; - self.ar.finish().await.map_err(BasebackupError::Client)?; + self.ar + .finish() + .await + .map_err(|e| BasebackupError::Client(e, "send_tarball,finish"))?; debug!("all tarred up!"); Ok(()) } @@ -482,9 +485,9 @@ where let file_name = dst.to_segfile_name(0); let header = new_tar_header(&file_name, 0)?; self.ar - .append(&header, &mut io::empty()) + .append(&header, io::empty()) .await - .map_err(BasebackupError::Client)?; + .map_err(|e| BasebackupError::Client(e, "add_rel,empty"))?; return Ok(()); } @@ -498,13 +501,9 @@ where for blknum in startblk..endblk { let img = self .timeline - .get_rel_page_at_lsn( - src, - blknum, - Version::Lsn(self.lsn), - self.ctx, - self.io_concurrency.clone(), - ) + // TODO: investigate using get_vectored for the entire startblk..endblk range. + // But this code path is not on the critical path for most basebackups (?). 
+ .get(rel_block_to_key(src, blknum), self.lsn, self.ctx) .await .map_err(|e| BasebackupError::Server(e.into()))?; segment_data.extend_from_slice(&img[..]); @@ -515,7 +514,7 @@ where self.ar .append(&header, segment_data.as_slice()) .await - .map_err(BasebackupError::Client)?; + .map_err(|e| BasebackupError::Client(e, "add_rel,segment"))?; seg += 1; startblk = endblk; @@ -566,7 +565,7 @@ where self.ar .append(&header, pg_version_str.as_bytes()) .await - .map_err(BasebackupError::Client)?; + .map_err(|e| BasebackupError::Client(e, "add_dbdir,PG_VERSION"))?; info!("timeline.pg_version {}", self.timeline.pg_version); @@ -576,7 +575,7 @@ where self.ar .append(&header, &img[..]) .await - .map_err(BasebackupError::Client)?; + .map_err(|e| BasebackupError::Client(e, "add_dbdir,global/pg_filenode.map"))?; } else { warn!("global/pg_filenode.map is missing"); } @@ -612,9 +611,9 @@ where let path = format!("base/{}", dbnode); let header = new_tar_header_dir(&path)?; self.ar - .append(&header, &mut io::empty()) + .append(&header, io::empty()) .await - .map_err(BasebackupError::Client)?; + .map_err(|e| BasebackupError::Client(e, "add_dbdir,base"))?; if let Some(img) = relmap_img { let dst_path = format!("base/{}/PG_VERSION", dbnode); @@ -627,14 +626,14 @@ where self.ar .append(&header, pg_version_str.as_bytes()) .await - .map_err(BasebackupError::Client)?; + .map_err(|e| BasebackupError::Client(e, "add_dbdir,base/PG_VERSION"))?; let relmap_path = format!("base/{}/pg_filenode.map", dbnode); let header = new_tar_header(&relmap_path, img.len() as u64)?; self.ar .append(&header, &img[..]) .await - .map_err(BasebackupError::Client)?; + .map_err(|e| BasebackupError::Client(e, "add_dbdir,base/pg_filenode.map"))?; } }; Ok(()) @@ -663,7 +662,7 @@ where self.ar .append(&header, &buf[..]) .await - .map_err(BasebackupError::Client)?; + .map_err(|e| BasebackupError::Client(e, "add_twophase_file"))?; Ok(()) } @@ -693,7 +692,7 @@ where zenith_signal.as_bytes(), ) .await - .map_err(BasebackupError::Client)?; + .map_err(|e| BasebackupError::Client(e, "add_pgcontrol_file,zenith.signal"))?; let checkpoint_bytes = self .timeline @@ -718,7 +717,7 @@ where self.ar .append(&header, &pg_control_bytes[..]) .await - .map_err(BasebackupError::Client)?; + .map_err(|e| BasebackupError::Client(e, "add_pgcontrol_file,pg_control"))?; //send wal segment let segno = self.lsn.segment_number(WAL_SEGMENT_SIZE); @@ -742,7 +741,7 @@ where self.ar .append(&header, &wal_seg[..]) .await - .map_err(BasebackupError::Client)?; + .map_err(|e| BasebackupError::Client(e, "add_pgcontrol_file,wal_segment"))?; Ok(()) } } diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 5764728505..fa098e9364 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -592,7 +592,7 @@ fn start_pageserver( let router = http::make_router(router_state, launch_ts, http_auth.clone())? .build() .map_err(|err| anyhow!(err))?; - let service = utils::http::RouterService::new(router).unwrap(); + let service = http_utils::RouterService::new(router).unwrap(); let server = hyper0::Server::from_tcp(http_listener)? 
.serve(service) .with_graceful_shutdown({ diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index ce480c70a0..c5368f6806 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -193,6 +193,10 @@ pub struct PageServerConf { pub page_service_pipelining: pageserver_api::config::PageServicePipeliningConfig, pub get_vectored_concurrent_io: pageserver_api::config::GetVectoredConcurrentIo, + + /// Enable read path debugging. If enabled, read key errors will print a backtrace of the layer + /// files read. + pub enable_read_path_debugging: bool, } /// Token for authentication to safekeepers @@ -355,6 +359,7 @@ impl PageServerConf { wal_receiver_protocol, page_service_pipelining, get_vectored_concurrent_io, + enable_read_path_debugging, } = config_toml; let mut conf = PageServerConf { @@ -440,6 +445,7 @@ impl PageServerConf { .unwrap_or_default(), virtual_file_io_mode: virtual_file_io_mode.unwrap_or(virtual_file::IoMode::preferred()), no_sync: no_sync.unwrap_or(false), + enable_read_path_debugging: enable_read_path_debugging.unwrap_or(false), }; // ------------------------------------------------------------ diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index 1d508f5fe9..a2395b0dca 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -8,7 +8,6 @@ use std::time::Duration; use crate::controller_upcall_client::ControlPlaneGenerationsApi; use crate::metrics; -use crate::tenant::remote_timeline_client::remote_layer_path; use crate::tenant::remote_timeline_client::remote_timeline_path; use crate::tenant::remote_timeline_client::LayerFileMetadata; use crate::virtual_file::MaybeFatalIo; @@ -463,45 +462,18 @@ impl DeletionQueueClient { /// /// The `current_generation` is the generation of this pageserver's current attachment. The /// generations in `layers` are the generations in which those layers were written. - pub(crate) async fn push_layers( + pub(crate) fn push_layers( &self, tenant_shard_id: TenantShardId, timeline_id: TimelineId, current_generation: Generation, layers: Vec<(LayerName, LayerFileMetadata)>, ) -> Result<(), DeletionQueueError> { - if current_generation.is_none() { - debug!("Enqueuing deletions in legacy mode, skipping queue"); + // None generations are not valid for attached tenants: they must always be attached in + // a known generation. None generations are still permitted for layers in the index because + // they may be historical. + assert!(!current_generation.is_none()); - let mut layer_paths = Vec::new(); - for (layer, meta) in layers { - layer_paths.push(remote_layer_path( - &tenant_shard_id.tenant_id, - &timeline_id, - meta.shard, - &layer, - meta.generation, - )); - } - self.push_immediate(layer_paths).await?; - return self.flush_immediate().await; - } - - self.push_layers_sync(tenant_shard_id, timeline_id, current_generation, layers) - } - - /// When a Tenant has a generation, push_layers is always synchronous because - /// the ListValidator channel is an unbounded channel. 
- /// - /// This can be merged into push_layers when we remove the Generation-less mode - /// support (``) - pub(crate) fn push_layers_sync( - &self, - tenant_shard_id: TenantShardId, - timeline_id: TimelineId, - current_generation: Generation, - layers: Vec<(LayerName, LayerFileMetadata)>, - ) -> Result<(), DeletionQueueError> { metrics::DELETION_QUEUE .keys_submitted .inc_by(layers.len() as u64); @@ -957,14 +929,12 @@ mod test { // File should still be there after we push it to the queue (we haven't pushed enough to flush anything) info!("Pushing"); - client - .push_layers( - tenant_shard_id, - TIMELINE_ID, - now_generation, - [(layer_file_name_1.clone(), layer_metadata)].to_vec(), - ) - .await?; + client.push_layers( + tenant_shard_id, + TIMELINE_ID, + now_generation, + [(layer_file_name_1.clone(), layer_metadata)].to_vec(), + )?; assert_remote_files(&[&remote_layer_file_name_1], &remote_timeline_path); assert_local_files(&[], &deletion_prefix); @@ -1017,14 +987,12 @@ mod test { assert_remote_files(&[&remote_layer_name], &remote_timeline_path); tracing::debug!("Pushing..."); - client - .push_layers( - tenant_shard_id, - TIMELINE_ID, - stale_generation, - [(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(), - ) - .await?; + client.push_layers( + tenant_shard_id, + TIMELINE_ID, + stale_generation, + [(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(), + )?; // We enqueued the operation in a stale generation: it should have failed validation tracing::debug!("Flushing..."); @@ -1032,14 +1000,12 @@ mod test { assert_remote_files(&[&remote_layer_name], &remote_timeline_path); tracing::debug!("Pushing..."); - client - .push_layers( - tenant_shard_id, - TIMELINE_ID, - latest_generation, - [(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(), - ) - .await?; + client.push_layers( + tenant_shard_id, + TIMELINE_ID, + latest_generation, + [(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(), + )?; // We enqueued the operation in a fresh generation: it should have passed validation tracing::debug!("Flushing..."); @@ -1074,28 +1040,24 @@ mod test { // generation gets that treatment) let remote_layer_file_name_historical = ctx.write_remote_layer(EXAMPLE_LAYER_NAME, layer_generation)?; - client - .push_layers( - tenant_shard_id, - TIMELINE_ID, - now_generation.previous(), - [(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(), - ) - .await?; + client.push_layers( + tenant_shard_id, + TIMELINE_ID, + now_generation.previous(), + [(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(), + )?; // Inject a deletion in the generation before generation_now: after restart, // this deletion should get executed, because we execute deletions in the // immediately previous generation on the same node. 
let remote_layer_file_name_previous = ctx.write_remote_layer(EXAMPLE_LAYER_NAME_ALT, layer_generation)?; - client - .push_layers( - tenant_shard_id, - TIMELINE_ID, - now_generation, - [(EXAMPLE_LAYER_NAME_ALT.clone(), layer_metadata.clone())].to_vec(), - ) - .await?; + client.push_layers( + tenant_shard_id, + TIMELINE_ID, + now_generation, + [(EXAMPLE_LAYER_NAME_ALT.clone(), layer_metadata.clone())].to_vec(), + )?; client.flush().await?; assert_remote_files( @@ -1139,6 +1101,7 @@ pub(crate) mod mock { use tracing::info; use super::*; + use crate::tenant::remote_timeline_client::remote_layer_path; use std::sync::atomic::{AtomicUsize, Ordering}; pub struct ConsumerState { diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index ca44fbe6ae..738a783813 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -61,6 +61,7 @@ use crate::{ remote_timeline_client::LayerFileMetadata, secondary::SecondaryTenant, storage_layer::{AsLayerDesc, EvictionError, Layer, LayerName, LayerVisibilityHint}, + tasks::sleep_random, }, CancellableTask, DiskUsageEvictionTask, }; @@ -210,14 +211,8 @@ async fn disk_usage_eviction_task( info!("disk usage based eviction task finishing"); }; - use crate::tenant::tasks::random_init_delay; - { - if random_init_delay(task_config.period, &cancel) - .await - .is_err() - { - return; - } + if sleep_random(task_config.period, &cancel).await.is_err() { + return; } let mut iteration_no = 0; diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 4b976e7f6f..12252739fd 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -824,6 +824,38 @@ paths: schema: $ref: "#/components/schemas/TenantConfigResponse" + /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/download_heatmap_layers: + parameters: + - name: tenant_shard_id + in: path + required: true + schema: + type: string + - name: timeline_id + in: path + required: true + schema: + type: string + - name: concurrency + description: Maximum number of concurrent downloads (capped at remote storage concurrency) + in: query + required: false + schema: + type: integer + post: + description: | + Download all layers in the specified timeline's heatmap. The `tenant_shard_id` parameter + may be used to target all shards of a tenant when the unsharded form is used, or a specific + tenant shard with the sharded form. + responses: + "200": + description: Success + delete: + description: Stop any on-going background downloads of heatmap layers for the specified timeline. 
+ responses: + "200": + description: Success + /v1/utilization: get: description: | @@ -882,6 +914,8 @@ components: properties: reason: type: string + gc_blocking: + type: string TenantCreateRequest: allOf: @@ -1080,9 +1114,15 @@ components: type: integer state: type: string + min_readable_lsn: + type: string + format: hex latest_gc_cutoff_lsn: type: string format: hex + applied_gc_cutoff_lsn: + type: string + format: hex SyntheticSizeResponse: type: object diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 94f7510a4a..329bf82bde 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -13,6 +13,12 @@ use enumset::EnumSet; use futures::future::join_all; use futures::StreamExt; use futures::TryFutureExt; +use http_utils::endpoint::{ + profile_cpu_handler, profile_heap_handler, prometheus_metrics_handler, request_span, +}; +use http_utils::failpoints::failpoints_handler; +use http_utils::request::must_parse_query_param; +use http_utils::request::{get_request_param, must_get_query_param, parse_query_param}; use humantime::format_rfc3339; use hyper::header; use hyper::StatusCode; @@ -60,13 +66,6 @@ use tokio::time::Instant; use tokio_util::io::StreamReader; use tokio_util::sync::CancellationToken; use tracing::*; -use utils::auth::JwtAuth; -use utils::failpoint_support::failpoints_handler; -use utils::http::endpoint::{ - profile_cpu_handler, profile_heap_handler, prometheus_metrics_handler, request_span, -}; -use utils::http::request::must_parse_query_param; -use utils::http::request::{get_request_param, must_get_query_param, parse_query_param}; use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; @@ -104,6 +103,13 @@ use crate::tenant::OffloadedTimeline; use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError}; use crate::DEFAULT_PG_VERSION; use crate::{disk_usage_eviction_task, tenant}; +use http_utils::{ + endpoint::{self, attach_openapi_ui, auth_middleware, check_permission_with}, + error::{ApiError, HttpErrorBody}, + json::{json_request, json_request_maybe, json_response}, + request::parse_request_param, + RequestExt, RouterBuilder, +}; use pageserver_api::models::{ StatusResponse, TenantConfigRequest, TenantInfo, TimelineCreateRequest, TimelineGcRequest, TimelineInfo, @@ -111,13 +117,6 @@ use pageserver_api::models::{ use utils::{ auth::SwappableJwtAuth, generation::Generation, - http::{ - endpoint::{self, attach_openapi_ui, auth_middleware, check_permission_with}, - error::{ApiError, HttpErrorBody}, - json::{json_request, json_request_maybe, json_response}, - request::parse_request_param, - RequestExt, RouterBuilder, - }, id::{TenantId, TimelineId}, lsn::Lsn, }; @@ -483,6 +482,11 @@ async fn build_timeline_info_common( let (pitr_history_size, within_ancestor_pitr) = timeline.get_pitr_history_stats(); + let min_readable_lsn = std::cmp::max( + timeline.get_gc_cutoff_lsn(), + *timeline.get_applied_gc_cutoff_lsn(), + ); + let info = TimelineInfo { tenant_id: timeline.tenant_shard_id, timeline_id: timeline.timeline_id, @@ -494,7 +498,12 @@ async fn build_timeline_info_common( initdb_lsn, last_record_lsn, prev_record_lsn: Some(timeline.get_prev_record_lsn()), - latest_gc_cutoff_lsn: *timeline.get_latest_gc_cutoff_lsn(), + // Externally, expose the lowest LSN that can be used to create a branch as the "GC cutoff", although internally + // we distinguish between the "planned" GC cutoff (PITR point) and the "latest" GC cutoff (where we + // actually trimmed data to), which can pass each 
other when PITR is changed.
+        latest_gc_cutoff_lsn: min_readable_lsn,
+        min_readable_lsn,
+        applied_gc_cutoff_lsn: *timeline.get_applied_gc_cutoff_lsn(),
         current_logical_size: current_logical_size.size_dont_care_about_accuracy(),
         current_logical_size_is_accurate: match current_logical_size.accuracy() {
             tenant::timeline::logical_size::Accuracy::Approximate => false,
@@ -561,7 +570,7 @@ async fn reload_auth_validation_keys_handler(
     let key_path = config.auth_validation_public_key_path.as_ref().unwrap();
     info!("Reloading public key(s) for verifying JWT tokens from {key_path:?}");

-    match JwtAuth::from_key_path(key_path) {
+    match utils::auth::JwtAuth::from_key_path(key_path) {
         Ok(new_auth) => {
             shared_auth.swap(new_auth);
             json_response(StatusCode::OK, ())
         }
@@ -1454,6 +1463,59 @@ async fn timeline_layer_scan_disposable_keys(
     )
 }

+async fn timeline_download_heatmap_layers_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    // Only used in the case where remote storage is not configured.
+    const DEFAULT_MAX_CONCURRENCY: usize = 100;
+    // A conservative default.
+    const DEFAULT_CONCURRENCY: usize = 16;
+
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
+
+    let desired_concurrency =
+        parse_query_param(&request, "concurrency")?.unwrap_or(DEFAULT_CONCURRENCY);
+
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+
+    let state = get_state(&request);
+    let timeline =
+        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
+            .await?;
+
+    let max_concurrency = get_config(&request)
+        .remote_storage_config
+        .as_ref()
+        .map(|c| c.concurrency_limit())
+        .unwrap_or(DEFAULT_MAX_CONCURRENCY);
+    let concurrency = std::cmp::min(max_concurrency, desired_concurrency);
+
+    timeline.start_heatmap_layers_download(concurrency).await?;
+
+    json_response(StatusCode::ACCEPTED, ())
+}
+
+async fn timeline_shutdown_download_heatmap_layers_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
+
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+
+    let state = get_state(&request);
+    let timeline =
+        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
+            .await?;
+
+    timeline.stop_and_drain_heatmap_layers_download().await;
+
+    json_response(StatusCode::OK, ())
+}
+
 async fn layer_download_handler(
     request: Request<Body>,
     _cancel: CancellationToken,
@@ -2152,6 +2214,7 @@ async fn timeline_compact_handler(
     let state = get_state(&request);

     let mut flags = EnumSet::empty();
+    flags |= CompactFlags::NoYield; // run compaction to completion

     if Some(true) == parse_query_param::<_, bool>(&request, "force_l0_compaction")? {
         flags |= CompactFlags::ForceL0Compaction;
@@ -3616,6 +3679,14 @@ pub fn make_router(
             "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer",
             |r| api_handler(r, layer_map_info_handler),
         )
+        .post(
+            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_heatmap_layers",
+            |r| api_handler(r, timeline_download_heatmap_layers_handler),
+        )
+        .delete(
+            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_heatmap_layers",
+            |r| api_handler(r, timeline_shutdown_download_heatmap_layers_handler),
+        )
         .get(
             "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer/:layer_file_name",
             |r| api_handler(r, layer_download_handler),

diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 3b8612a3fa..e1c26b0684 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1,5 +1,6 @@
 use std::collections::HashMap;
 use std::num::NonZeroUsize;
+use std::os::fd::RawFd;
 use std::pin::Pin;
 use std::sync::atomic::AtomicU64;
 use std::sync::{Arc, Mutex};
@@ -129,7 +130,7 @@ pub(crate) static LAYERS_PER_READ: Lazy<HistogramVec> = Lazy::new(|| {
         "Layers visited to serve a single read (read amplification). In a batch, all visited layers count towards every read.",
         &["tenant_id", "shard_id", "timeline_id"],
         // Low resolution to reduce cardinality.
-        vec![1.0, 5.0, 10.0, 25.0, 50.0, 100.0],
+        vec![4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0],
     )
     .expect("failed to define a metric")
});
@@ -1366,10 +1367,7 @@ impl SmgrOpTimer {
     /// The first callers receives Some, subsequent ones None.
     ///
     /// See [`SmgrOpTimerState`] for more context.
-    pub(crate) fn observe_execution_end_flush_start(
-        &mut self,
-        at: Instant,
-    ) -> Option<SmgrOpFlushInProgress> {
+    pub(crate) fn observe_execution_end(&mut self, at: Instant) -> Option<SmgrOpFlushInProgress> {
         // NB: unlike the other observe_* methods, this one take()s.
         #[allow(clippy::question_mark)] // maintain similar code pattern.
         let Some(mut inner) = self.0.take() else {
@@ -1403,7 +1401,6 @@ impl SmgrOpTimer {
             ..
         } = inner;
         Some(SmgrOpFlushInProgress {
-            flush_started_at: at,
             global_micros: global_flush_in_progress_micros,
             per_timeline_micros: per_timeline_flush_in_progress_micros,
         })
@@ -1419,7 +1416,6 @@ impl SmgrOpTimer {
 /// add another `observe_*` method to [`SmgrOpTimer`], follow the existing pattern there,
 /// and remove this struct from the code base.
 pub(crate) struct SmgrOpFlushInProgress {
-    flush_started_at: Instant,
     global_micros: IntCounter,
     per_timeline_micros: IntCounter,
 }
@@ -1438,32 +1434,72 @@ impl Drop for SmgrOpTimer {
         self.observe_throttle_start(now);
         self.observe_throttle_done(ThrottleResult::NotThrottled { end: now });
         self.observe_execution_start(now);
-        self.observe_execution_end_flush_start(now);
+        let maybe_flush_timer = self.observe_execution_end(now);
+        drop(maybe_flush_timer);
     }
 }

 impl SmgrOpFlushInProgress {
-    pub(crate) async fn measure<Fut, O>(mut self, mut fut: Fut) -> O
+    /// The caller must guarantee that `socket_fd`` outlives this function.
+    pub(crate) async fn measure<Fut, O>(
+        self,
+        started_at: Instant,
+        mut fut: Fut,
+        socket_fd: RawFd,
+    ) -> O
     where
         Fut: std::future::Future<Output = O>,
     {
         let mut fut = std::pin::pin!(fut);

-        // Whenever observe_guard gets called, or dropped,
-        // it adds the time elapsed since its last call to metrics.
-        // Last call is tracked in `now`.
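Before the new `measure` body continues below, a simplified, std-only sketch of the observation pattern it uses: one closure that the periodic timeout path calls with `is_timeout = true` and that completion calls once more, so the in-progress counters advance while the flush is still pending. Names here are local to the example:

use std::time::{Duration, Instant};

fn main() {
    let started_at = Instant::now();
    let mut total_micros: u64 = 0;
    let mut last_observed = started_at;
    let mut observe = |is_timeout: bool| {
        let now = Instant::now();
        // Accumulate only the time since the previous observation, so
        // repeated timeout ticks never double-count an interval.
        total_micros += (now - last_observed).as_micros() as u64;
        last_observed = now;
        if is_timeout {
            println!("still flushing after {:?}", now - started_at);
        }
    };
    // Simulate two 10ms "timeout ticks" followed by completion.
    for _ in 0..2 {
        std::thread::sleep(Duration::from_millis(10));
        observe(true);
    }
    observe(false);
    println!("accumulated {total_micros} us");
}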
+ let mut logged = false; + let mut last_counter_increment_at = started_at; let mut observe_guard = scopeguard::guard( - || { + |is_timeout| { let now = Instant::now(); - let elapsed = now - self.flush_started_at; - self.global_micros - .inc_by(u64::try_from(elapsed.as_micros()).unwrap()); - self.per_timeline_micros - .inc_by(u64::try_from(elapsed.as_micros()).unwrap()); - self.flush_started_at = now; + + // Increment counter + { + let elapsed_since_last_observe = now - last_counter_increment_at; + self.global_micros + .inc_by(u64::try_from(elapsed_since_last_observe.as_micros()).unwrap()); + self.per_timeline_micros + .inc_by(u64::try_from(elapsed_since_last_observe.as_micros()).unwrap()); + last_counter_increment_at = now; + } + + // Log something on every timeout, and on completion but only if we hit a timeout. + if is_timeout || logged { + logged = true; + let elapsed_total = now - started_at; + let msg = if is_timeout { + "slow flush ongoing" + } else { + "slow flush completed or cancelled" + }; + + let (inq, outq) = { + // SAFETY: caller guarantees that `socket_fd` outlives this function. + #[cfg(target_os = "linux")] + unsafe { + ( + utils::linux_socket_ioctl::inq(socket_fd).unwrap_or(-2), + utils::linux_socket_ioctl::outq(socket_fd).unwrap_or(-2), + ) + } + #[cfg(not(target_os = "linux"))] + { + _ = socket_fd; // appease unused lint on macOS + (-1, -1) + } + }; + + let elapsed_total_secs = format!("{:.6}", elapsed_total.as_secs_f64()); + tracing::info!(elapsed_total_secs, inq, outq, msg); + } }, |mut observe| { - observe(); + observe(false); }, ); @@ -1471,7 +1507,7 @@ impl SmgrOpFlushInProgress { match tokio::time::timeout(Duration::from_secs(10), &mut fut).await { Ok(v) => return v, Err(_timeout) => { - (*observe_guard)(); + (*observe_guard)(true); } } } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 24a350399d..0c8da6f2a8 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -73,6 +73,7 @@ use pageserver_api::models::PageTraceEvent; use pageserver_api::reltag::SlruKind; use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID; use postgres_ffi::BLCKSZ; +use std::os::fd::AsRawFd; /// How long we may wait for a [`crate::tenant::mgr::TenantSlot::InProgress`]` and/or a [`crate::tenant::Tenant`] which /// is not yet in state [`TenantState::Active`]. 
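The `inq`/`outq` values logged above come from the new `utils::linux_socket_ioctl` module. A self-contained, Linux-only sketch of the same two ioctls, using the `libc` crate directly rather than the module under review:

#[cfg(target_os = "linux")]
fn main() -> std::io::Result<()> {
    use std::io::Write;
    use std::net::{TcpListener, TcpStream};
    use std::os::fd::AsRawFd;

    let listener = TcpListener::bind("127.0.0.1:0")?;
    let mut client = TcpStream::connect(listener.local_addr()?)?;
    let (server, _) = listener.accept()?;

    client.write_all(b"hello")?;

    // TIOCOUTQ: bytes the kernel still holds in the sender's send queue
    // (unsent or unacknowledged). FIONREAD: bytes readable on the receiver.
    // A large outq during a stalled flush suggests the peer is not draining.
    let mut outq: libc::c_int = 0;
    let mut inq: libc::c_int = 0;
    // SAFETY: both fds are valid, open TCP sockets for the duration of the calls.
    let rc1 = unsafe { libc::ioctl(client.as_raw_fd(), libc::TIOCOUTQ, &mut outq) };
    let rc2 = unsafe { libc::ioctl(server.as_raw_fd(), libc::FIONREAD, &mut inq) };
    assert_eq!((rc1, rc2), (0, 0));
    println!("client outq={outq} bytes, server inq={inq} bytes");
    Ok(())
}

#[cfg(not(target_os = "linux"))]
fn main() {}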
@@ -236,7 +237,7 @@ pub async fn libpq_listener_main( type ConnectionHandlerResult = anyhow::Result<()>; -#[instrument(skip_all, fields(peer_addr))] +#[instrument(skip_all, fields(peer_addr, application_name))] #[allow(clippy::too_many_arguments)] async fn page_service_conn_main( conf: &'static PageServerConf, @@ -257,6 +258,8 @@ async fn page_service_conn_main( .set_nodelay(true) .context("could not set TCP_NODELAY")?; + let socket_fd = socket.as_raw_fd(); + let peer_addr = socket.peer_addr().context("get peer address")?; tracing::Span::current().record("peer_addr", field::display(peer_addr)); @@ -305,7 +308,7 @@ async fn page_service_conn_main( cancel.clone(), gate_guard, ); - let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?; + let pgbackend = PostgresBackend::new_from_io(socket_fd, socket, peer_addr, auth_type, None)?; match pgbackend.run(&mut conn_handler, &cancel).await { Ok(()) => { @@ -489,7 +492,6 @@ impl timeline::handle::TenantManager for TenantManagerWrappe let timeline = tenant_shard .get_timeline(timeline_id, true) .map_err(GetActiveTimelineError::Timeline)?; - set_tracing_field_shard_id(&timeline); Ok(timeline) } } @@ -774,11 +776,11 @@ impl PageServerHandler { let batched_msg = match neon_fe_msg { PagestreamFeMessage::Exists(req) => { - let span = tracing::info_span!(parent: parent_span, "handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.hdr.request_lsn); let shard = timeline_handles .get(tenant_id, timeline_id, ShardSelector::Zero) - .instrument(span.clone()) // sets `shard_id` field .await?; + debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id(); + let span = tracing::info_span!(parent: &parent_span, "handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.hdr.request_lsn, shard_id = %shard.tenant_shard_id.shard_slug()); let timer = record_op_start_and_throttle( &shard, metrics::SmgrQueryType::GetRelExists, @@ -793,11 +795,10 @@ impl PageServerHandler { } } PagestreamFeMessage::Nblocks(req) => { - let span = tracing::info_span!(parent: parent_span, "handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.hdr.request_lsn); let shard = timeline_handles .get(tenant_id, timeline_id, ShardSelector::Zero) - .instrument(span.clone()) // sets `shard_id` field .await?; + let span = tracing::info_span!(parent: &parent_span, "handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.hdr.request_lsn, shard_id = %shard.tenant_shard_id.shard_slug()); let timer = record_op_start_and_throttle( &shard, metrics::SmgrQueryType::GetRelSize, @@ -812,11 +813,10 @@ impl PageServerHandler { } } PagestreamFeMessage::DbSize(req) => { - let span = tracing::info_span!(parent: parent_span, "handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.hdr.request_lsn); let shard = timeline_handles .get(tenant_id, timeline_id, ShardSelector::Zero) - .instrument(span.clone()) // sets `shard_id` field .await?; + let span = tracing::info_span!(parent: &parent_span, "handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.hdr.request_lsn, shard_id = %shard.tenant_shard_id.shard_slug()); let timer = record_op_start_and_throttle( &shard, metrics::SmgrQueryType::GetDbSize, @@ -831,11 +831,10 @@ impl PageServerHandler { } } PagestreamFeMessage::GetSlruSegment(req) => { - let span = tracing::info_span!(parent: parent_span, "handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.hdr.request_lsn); let shard = timeline_handles .get(tenant_id, timeline_id, ShardSelector::Zero) - .instrument(span.clone()) 
// sets `shard_id` field .await?; + let span = tracing::info_span!(parent: &parent_span, "handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.hdr.request_lsn, shard_id = %shard.tenant_shard_id.shard_slug()); let timer = record_op_start_and_throttle( &shard, metrics::SmgrQueryType::GetSlruSegment, @@ -850,12 +849,20 @@ impl PageServerHandler { } } PagestreamFeMessage::GetPage(req) => { - let span = tracing::info_span!(parent: parent_span, "handle_get_page_at_lsn_request_batched", req_lsn = %req.hdr.request_lsn); + // avoid a somewhat costly Span::record() by constructing the entire span in one go. + macro_rules! mkspan { + (before shard routing) => {{ + tracing::info_span!(parent: &parent_span, "handle_get_page_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.hdr.request_lsn) + }}; + ($shard_id:expr) => {{ + tracing::info_span!(parent: &parent_span, "handle_get_page_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.hdr.request_lsn, shard_id = %$shard_id) + }}; + } macro_rules! respond_error { - ($error:expr) => {{ + ($span:expr, $error:expr) => {{ let error = BatchedFeMessage::RespondError { - span, + span: $span, error: BatchedPageStreamError { req: req.hdr, err: $error, @@ -868,27 +875,35 @@ impl PageServerHandler { let key = rel_block_to_key(req.rel, req.blkno); let shard = match timeline_handles .get(tenant_id, timeline_id, ShardSelector::Page(key)) - .instrument(span.clone()) // sets `shard_id` field .await { Ok(tl) => tl, - Err(GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_))) => { - // We already know this tenant exists in general, because we resolved it at - // start of connection. Getting a NotFound here indicates that the shard containing - // the requested page is not present on this node: the client's knowledge of shard->pageserver - // mapping is out of date. - // - // Closing the connection by returning ``::Reconnect` has the side effect of rate-limiting above message, via - // client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration - // and talk to a different pageserver. - return respond_error!(PageStreamError::Reconnect( - "getpage@lsn request routed to wrong shard".into() - )); - } Err(e) => { - return respond_error!(e.into()); + let span = mkspan!(before shard routing); + match e { + GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_)) => { + // We already know this tenant exists in general, because we resolved it at + // start of connection. Getting a NotFound here indicates that the shard containing + // the requested page is not present on this node: the client's knowledge of shard->pageserver + // mapping is out of date. + // + // Closing the connection by returning ``::Reconnect` has the side effect of rate-limiting above message, via + // client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration + // and talk to a different pageserver. 
+ return respond_error!( + span, + PageStreamError::Reconnect( + "getpage@lsn request routed to wrong shard".into() + ) + ); + } + e => { + return respond_error!(span, e.into()); + } + } } }; + let span = mkspan!(shard.tenant_shard_id.shard_slug()); let timer = record_op_start_and_throttle( &shard, @@ -902,7 +917,7 @@ impl PageServerHandler { &shard, req.hdr.request_lsn, req.hdr.not_modified_since, - &shard.get_latest_gc_cutoff_lsn(), + &shard.get_applied_gc_cutoff_lsn(), ctx, ) // TODO: if we actually need to wait for lsn here, it delays the entire batch which doesn't need to wait @@ -910,7 +925,7 @@ impl PageServerHandler { { Ok(lsn) => lsn, Err(e) => { - return respond_error!(e); + return respond_error!(span, e); } }; BatchedFeMessage::GetPage { @@ -922,11 +937,10 @@ impl PageServerHandler { } #[cfg(feature = "testing")] PagestreamFeMessage::Test(req) => { - let span = tracing::info_span!(parent: parent_span, "handle_test_request"); let shard = timeline_handles .get(tenant_id, timeline_id, ShardSelector::Zero) - .instrument(span.clone()) // sets `shard_id` field .await?; + let span = tracing::info_span!(parent: &parent_span, "handle_test_request", shard_id = %shard.tenant_shard_id.shard_slug()); let timer = record_op_start_and_throttle(&shard, metrics::SmgrQueryType::Test, received_at) .await?; @@ -1063,7 +1077,7 @@ impl PageServerHandler { }; // invoke handler function - let (handler_results, span): ( + let (mut handler_results, span): ( Vec>, _, ) = match batch { @@ -1190,11 +1204,49 @@ impl PageServerHandler { } }; + // We purposefully don't count flush time into the smgr operation timer. + // + // The reason is that current compute client will not perform protocol processing + // if the postgres backend process is doing things other than `->smgr_read()`. + // This is especially the case for prefetch. + // + // If the compute doesn't read from the connection, eventually TCP will backpressure + // all the way into our flush call below. + // + // The timer's underlying metric is used for a storage-internal latency SLO and + // we don't want to include latency in it that we can't control. + // And as pointed out above, in this case, we don't control the time that flush will take. + // + // We put each response in the batch onto the wire in a separate pgb_writer.flush() + // call, which (all unmeasured) adds syscall overhead but reduces time to first byte + // and avoids building up a "giant" contiguous userspace buffer to hold the entire response. + // TODO: vectored socket IO would be great, but pgb_writer doesn't support that. + let flush_timers = { + let flushing_start_time = Instant::now(); + let mut flush_timers = Vec::with_capacity(handler_results.len()); + for handler_result in &mut handler_results { + let flush_timer = match handler_result { + Ok((_, timer)) => Some( + timer + .observe_execution_end(flushing_start_time) + .expect("we are the first caller"), + ), + Err(_) => { + // TODO: measure errors + None + } + }; + flush_timers.push(flush_timer); + } + assert_eq!(flush_timers.len(), handler_results.len()); + flush_timers + }; + // Map handler result to protocol behavior. // Some handler errors cause exit from pagestream protocol. // Other handler errors are sent back as an error message and we stay in pagestream protocol. 
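The loop that follows applies the rationale spelled out in the comment block above: the smgr operation timer stops at `observe_execution_end`, and the socket flush is measured on its own because its duration is dominated by the client draining the connection. A toy illustration of splitting the two intervals:

use std::time::Instant;

struct OpTimings {
    execution: std::time::Duration,
    flush: std::time::Duration,
}

fn handle_and_flush(mut work: impl FnMut(), mut flush: impl FnMut()) -> OpTimings {
    let start = Instant::now();
    work();
    // The execution timer "ends" here, before the flush begins, so the
    // latency SLO metric excludes time the server cannot control.
    let execution_end = Instant::now();
    flush();
    let flush_end = Instant::now();
    OpTimings {
        execution: execution_end - start,
        flush: flush_end - execution_end,
    }
}

fn main() {
    let t = handle_and_flush(
        || std::thread::sleep(std::time::Duration::from_millis(5)),
        || std::thread::sleep(std::time::Duration::from_millis(1)),
    );
    println!("execution={:?} flush={:?}", t.execution, t.flush);
}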
- for handler_result in handler_results { - let (response_msg, timer) = match handler_result { + for (handler_result, flushing_timer) in handler_results.into_iter().zip(flush_timers) { + let response_msg = match handler_result { Err(e) => match &e.err { PageStreamError::Shutdown => { // If we fail to fulfil a request during shutdown, which may be _because_ of @@ -1218,16 +1270,14 @@ impl PageServerHandler { span.in_scope(|| { error!("error reading relation or page version: {full:#}") }); - ( - PagestreamBeMessage::Error(PagestreamErrorResponse { - req: e.req, - message: e.err.to_string(), - }), - None, // TODO: measure errors - ) + + PagestreamBeMessage::Error(PagestreamErrorResponse { + req: e.req, + message: e.err.to_string(), + }) } }, - Ok((response_msg, timer)) => (response_msg, Some(timer)), + Ok((response_msg, _op_timer_already_observed)) => response_msg, }; // @@ -1238,31 +1288,16 @@ impl PageServerHandler { &response_msg.serialize(protocol_version), ))?; - // We purposefully don't count flush time into the timer. - // - // The reason is that current compute client will not perform protocol processing - // if the postgres backend process is doing things other than `->smgr_read()`. - // This is especially the case for prefetch. - // - // If the compute doesn't read from the connection, eventually TCP will backpressure - // all the way into our flush call below. - // - // The timer's underlying metric is used for a storage-internal latency SLO and - // we don't want to include latency in it that we can't control. - // And as pointed out above, in this case, we don't control the time that flush will take. - let flushing_timer = timer.map(|mut timer| { - timer - .observe_execution_end_flush_start(Instant::now()) - .expect("we are the first caller") - }); - // what we want to do + let socket_fd = pgb_writer.socket_fd; let flush_fut = pgb_writer.flush(); // metric for how long flushing takes let flush_fut = match flushing_timer { - Some(flushing_timer) => { - futures::future::Either::Left(flushing_timer.measure(flush_fut)) - } + Some(flushing_timer) => futures::future::Either::Left(flushing_timer.measure( + Instant::now(), + flush_fut, + socket_fd, + )), None => futures::future::Either::Right(flush_fut), }; // do it while respecting cancellation @@ -1340,7 +1375,7 @@ impl PageServerHandler { .take() .expect("implementation error: timeline_handles should not be locked"); - let request_span = info_span!("request", shard_id = tracing::field::Empty); + let request_span = info_span!("request"); let ((pgb_reader, timeline_handles), result) = match self.pipelining_config.clone() { PageServicePipeliningConfig::Pipelined(pipelining_config) => { self.handle_pagerequests_pipelined( @@ -1764,6 +1799,13 @@ impl PageServerHandler { .as_millis() .to_string() }); + + info!( + "acquired lease for {} until {}", + lsn, + valid_until_str.as_deref().unwrap_or("") + ); + let bytes = valid_until_str.as_ref().map(|x| x.as_bytes()); pgb.write_message_noflush(&BeMessage::RowDescription(&[RowDescriptor::text_col( @@ -1781,7 +1823,7 @@ impl PageServerHandler { req: &PagestreamExistsRequest, ctx: &RequestContext, ) -> Result { - let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); + let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( timeline, req.hdr.request_lsn, @@ -1808,7 +1850,7 @@ impl PageServerHandler { req: &PagestreamNblocksRequest, ctx: &RequestContext, ) -> Result { - let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); + let 
latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( timeline, req.hdr.request_lsn, @@ -1835,7 +1877,7 @@ impl PageServerHandler { req: &PagestreamDbSizeRequest, ctx: &RequestContext, ) -> Result { - let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); + let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( timeline, req.hdr.request_lsn, @@ -1925,7 +1967,7 @@ impl PageServerHandler { req: &PagestreamGetSlruSegmentRequest, ctx: &RequestContext, ) -> Result { - let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); + let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( timeline, req.hdr.request_lsn, @@ -2021,7 +2063,8 @@ impl PageServerHandler { { fn map_basebackup_error(err: BasebackupError) -> QueryError { match err { - BasebackupError::Client(e) => QueryError::Disconnected(ConnectionError::Io(e)), + // TODO: passthrough the error site to the final error message? + BasebackupError::Client(e, _) => QueryError::Disconnected(ConnectionError::Io(e)), BasebackupError::Server(e) => QueryError::Other(e), } } @@ -2034,6 +2077,7 @@ impl PageServerHandler { .unwrap() .get(tenant_id, timeline_id, ShardSelector::Zero) .await?; + set_tracing_field_shard_id(&timeline); if timeline.is_archived() == Some(true) { // TODO after a grace period, turn this log line into a hard error @@ -2041,7 +2085,7 @@ impl PageServerHandler { //return Err(QueryError::NotFound("timeline is archived".into())) } - let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); + let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn(); if let Some(lsn) = lsn { // Backup was requested at a particular LSN. Wait for it to arrive. info!("waiting for {}", lsn); @@ -2121,10 +2165,12 @@ impl PageServerHandler { .await .map_err(map_basebackup_error)?; } - writer - .flush() - .await - .map_err(|e| map_basebackup_error(BasebackupError::Client(e)))?; + writer.flush().await.map_err(|e| { + map_basebackup_error(BasebackupError::Client( + e, + "handle_basebackup_request,flush", + )) + })?; } pgb.write_message_noflush(&BeMessage::CopyDone) @@ -2424,9 +2470,16 @@ where fn startup( &mut self, _pgb: &mut PostgresBackend, - _sm: &FeStartupPacket, + sm: &FeStartupPacket, ) -> Result<(), QueryError> { fail::fail_point!("ps::connection-start::startup-packet"); + + if let FeStartupPacket::StartupMessage { params, .. 
} = sm { + if let Some(app_name) = params.get("application_name") { + Span::current().record("application_name", field::display(app_name)); + } + }; + Ok(()) } diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 00f332d797..ae2762bd1e 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -23,13 +23,14 @@ use anyhow::{ensure, Context}; use bytes::{Buf, Bytes, BytesMut}; use enum_map::Enum; use itertools::Itertools; -use pageserver_api::key::Key; use pageserver_api::key::{ dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key, - relmap_file_key, repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key, - slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range, - CompactKey, AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY, + rel_tag_sparse_key_range, relmap_file_key, repl_origin_key, repl_origin_key_range, + slru_block_to_key, slru_dir_to_key, slru_segment_key_range, slru_segment_size_to_key, + twophase_file_key, twophase_key_range, CompactKey, RelDirExists, AUX_FILES_KEY, CHECKPOINT_KEY, + CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY, }; +use pageserver_api::key::{rel_tag_sparse_key, Key}; use pageserver_api::keyspace::SparseKeySpace; use pageserver_api::record::NeonWalRecord; use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; @@ -490,12 +491,33 @@ impl Timeline { if !dbdirs.contains_key(&(tag.spcnode, tag.dbnode)) { return Ok(false); } - // fetch directory listing + + // Read path: first read the new reldir keyspace. Early return if the relation exists. + // Otherwise, read the old reldir keyspace. + // TODO: if IndexPart::rel_size_migration is `Migrated`, we only need to read from v2. + + if self.get_rel_size_v2_enabled() { + // fetch directory listing (new) + let key = rel_tag_sparse_key(tag.spcnode, tag.dbnode, tag.relnode, tag.forknum); + let buf = RelDirExists::decode_option(version.sparse_get(self, key, ctx).await?) + .map_err(|_| PageReconstructError::Other(anyhow::anyhow!("invalid reldir key")))?; + let exists_v2 = buf == RelDirExists::Exists; + // Fast path: if the relation exists in the new format, return true. + // TODO: we should have a verification mode that checks both keyspaces + // to ensure the relation only exists in one of them. + if exists_v2 { + return Ok(true); + } + } + + // fetch directory listing (old) + let key = rel_dir_to_key(tag.spcnode, tag.dbnode); let buf = version.get(self, key, ctx).await?; let dir = RelDirectory::des(&buf)?; - Ok(dir.rels.contains(&(tag.relnode, tag.forknum))) + let exists_v1 = dir.rels.contains(&(tag.relnode, tag.forknum)); + Ok(exists_v1) } /// Get a list of all existing relations in given tablespace and database. 
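A toy model of the two-keyspace existence check above: probe the new per-relation sparse key first, then fall back to the old directory blob. The types are simplified stand-ins for the real pageserver key encoding:

use std::collections::{HashMap, HashSet};

#[derive(Hash, PartialEq, Eq, Clone, Copy)]
struct RelTag { spcnode: u32, dbnode: u32, relnode: u32, forknum: u8 }

struct Keyspaces {
    rel_size_v2_enabled: bool,
    sparse: HashMap<RelTag, bool>, // v2: one "exists" marker key per relation
    rel_dir: HashSet<(u32, u8)>,   // v1: (relnode, forknum) set in the directory blob
}

fn rel_exists(ks: &Keyspaces, tag: RelTag) -> bool {
    if ks.rel_size_v2_enabled {
        // Fast path: relation recorded in the new sparse keyspace.
        if ks.sparse.get(&tag).copied().unwrap_or(false) {
            return true;
        }
    }
    // Fallback: old directory listing.
    ks.rel_dir.contains(&(tag.relnode, tag.forknum))
}

fn main() {
    let tag = RelTag { spcnode: 1663, dbnode: 5, relnode: 16384, forknum: 0 };
    let ks = Keyspaces {
        rel_size_v2_enabled: true,
        sparse: HashMap::from([(tag, true)]),
        rel_dir: HashSet::new(),
    };
    assert!(rel_exists(&ks, tag)); // found via the v2 fast path
}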
@@ -513,12 +535,12 @@ impl Timeline {
         version: Version<'_>,
         ctx: &RequestContext,
     ) -> Result<HashSet<RelTag>, PageReconstructError> {
-        // fetch directory listing
+        // fetch directory listing (old)
         let key = rel_dir_to_key(spcnode, dbnode);
         let buf = version.get(self, key, ctx).await?;

         let dir = RelDirectory::des(&buf)?;
-        let rels: HashSet<RelTag> =
+        let rels_v1: HashSet<RelTag> =
             HashSet::from_iter(dir.rels.iter().map(|(relnode, forknum)| RelTag {
                 spcnode,
                 dbnode,
@@ -526,6 +548,46 @@ impl Timeline {
                 forknum: *forknum,
             }));

+        if !self.get_rel_size_v2_enabled() {
+            return Ok(rels_v1);
+        }
+
+        // scan directory listing (new), merge with the old results
+        let key_range = rel_tag_sparse_key_range(spcnode, dbnode);
+        let io_concurrency = IoConcurrency::spawn_from_conf(
+            self.conf,
+            self.gate
+                .enter()
+                .map_err(|_| PageReconstructError::Cancelled)?,
+        );
+        let results = self
+            .scan(
+                KeySpace::single(key_range),
+                version.get_lsn(),
+                ctx,
+                io_concurrency,
+            )
+            .await?;
+        let mut rels = rels_v1;
+        for (key, val) in results {
+            let val = RelDirExists::decode(&val?)
+                .map_err(|_| PageReconstructError::Other(anyhow::anyhow!("invalid reldir key")))?;
+            assert_eq!(key.field6, 1);
+            assert_eq!(key.field2, spcnode);
+            assert_eq!(key.field3, dbnode);
+            let tag = RelTag {
+                spcnode,
+                dbnode,
+                relnode: key.field4,
+                forknum: key.field5,
+            };
+            if val == RelDirExists::Removed {
+                debug_assert!(!rels.contains(&tag), "removed reltag in v2");
+                continue;
+            }
+            let did_not_contain = rels.insert(tag);
+            debug_assert!(did_not_contain, "duplicate reltag in v2");
+        }
         Ok(rels)
     }
@@ -611,7 +673,7 @@ impl Timeline {
     ) -> Result<LsnForTimestamp, PageReconstructError> {
         pausable_failpoint!("find-lsn-for-timestamp-pausable");

-        let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn();
+        let gc_cutoff_lsn_guard = self.get_applied_gc_cutoff_lsn();
         let gc_cutoff_planned = {
             let gc_info = self.gc_info.read().unwrap();
             gc_info.min_cutoff()
@@ -1144,7 +1206,11 @@ impl Timeline {
         let dense_keyspace = result.to_keyspace();
         let sparse_keyspace = SparseKeySpace(KeySpace {
-            ranges: vec![Key::metadata_aux_key_range(), repl_origin_key_range()],
+            ranges: vec![
+                Key::metadata_aux_key_range(),
+                repl_origin_key_range(),
+                Key::rel_dir_sparse_key_range(),
+            ],
         });

         if cfg!(debug_assertions) {
@@ -1274,12 +1340,22 @@ pub struct DatadirModification<'a> {
     /// For special "directory" keys that store key-value maps, track the size of the map
     /// if it was updated in this modification.
-    pending_directory_entries: Vec<(DirectoryKind, usize)>,
+    pending_directory_entries: Vec<(DirectoryKind, MetricsUpdate)>,

     /// An **approximation** of how many metadata bytes will be written to the EphemeralFile.
     pending_metadata_bytes: usize,
 }

+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum MetricsUpdate {
+    /// Set the metrics to this value
+    Set(u64),
+    /// Increment the metrics by this value
+    Add(u64),
+    /// Decrement the metrics by this value
+    Sub(u64),
+}
+
 impl DatadirModification<'_> {
     // When a DatadirModification is committed, we do a monolithic serialization of all its contents.
WAL records can // contain multiple pages, so the pageserver's record-based batch size isn't sufficient to bound this allocation: we @@ -1359,7 +1435,8 @@ impl DatadirModification<'_> { let buf = DbDirectory::ser(&DbDirectory { dbdirs: HashMap::new(), })?; - self.pending_directory_entries.push((DirectoryKind::Db, 0)); + self.pending_directory_entries + .push((DirectoryKind::Db, MetricsUpdate::Set(0))); self.put(DBDIR_KEY, Value::Image(buf.into())); let buf = if self.tline.pg_version >= 17 { @@ -1372,7 +1449,7 @@ impl DatadirModification<'_> { }) }?; self.pending_directory_entries - .push((DirectoryKind::TwoPhase, 0)); + .push((DirectoryKind::TwoPhase, MetricsUpdate::Set(0))); self.put(TWOPHASEDIR_KEY, Value::Image(buf.into())); let buf: Bytes = SlruSegmentDirectory::ser(&SlruSegmentDirectory::default())?.into(); @@ -1382,17 +1459,23 @@ impl DatadirModification<'_> { // harmless but they'd just be dropped on later compaction. if self.tline.tenant_shard_id.is_shard_zero() { self.put(slru_dir_to_key(SlruKind::Clog), empty_dir.clone()); - self.pending_directory_entries - .push((DirectoryKind::SlruSegment(SlruKind::Clog), 0)); + self.pending_directory_entries.push(( + DirectoryKind::SlruSegment(SlruKind::Clog), + MetricsUpdate::Set(0), + )); self.put( slru_dir_to_key(SlruKind::MultiXactMembers), empty_dir.clone(), ); - self.pending_directory_entries - .push((DirectoryKind::SlruSegment(SlruKind::Clog), 0)); + self.pending_directory_entries.push(( + DirectoryKind::SlruSegment(SlruKind::Clog), + MetricsUpdate::Set(0), + )); self.put(slru_dir_to_key(SlruKind::MultiXactOffsets), empty_dir); - self.pending_directory_entries - .push((DirectoryKind::SlruSegment(SlruKind::MultiXactOffsets), 0)); + self.pending_directory_entries.push(( + DirectoryKind::SlruSegment(SlruKind::MultiXactOffsets), + MetricsUpdate::Set(0), + )); } Ok(()) @@ -1658,10 +1741,16 @@ impl DatadirModification<'_> { } if r.is_none() { // Create RelDirectory + // TODO: if we have fully migrated to v2, no need to create this directory let buf = RelDirectory::ser(&RelDirectory { rels: HashSet::new(), })?; - self.pending_directory_entries.push((DirectoryKind::Rel, 0)); + self.pending_directory_entries + .push((DirectoryKind::Rel, MetricsUpdate::Set(0))); + if self.tline.get_rel_size_v2_enabled() { + self.pending_directory_entries + .push((DirectoryKind::RelV2, MetricsUpdate::Set(0))); + } self.put( rel_dir_to_key(spcnode, dbnode), Value::Image(Bytes::from(buf)), @@ -1685,8 +1774,10 @@ impl DatadirModification<'_> { if !dir.xids.insert(xid) { anyhow::bail!("twophase file for xid {} already exists", xid); } - self.pending_directory_entries - .push((DirectoryKind::TwoPhase, dir.xids.len())); + self.pending_directory_entries.push(( + DirectoryKind::TwoPhase, + MetricsUpdate::Set(dir.xids.len() as u64), + )); Bytes::from(TwoPhaseDirectoryV17::ser(&dir)?) } else { let xid = xid as u32; @@ -1694,8 +1785,10 @@ impl DatadirModification<'_> { if !dir.xids.insert(xid) { anyhow::bail!("twophase file for xid {} already exists", xid); } - self.pending_directory_entries - .push((DirectoryKind::TwoPhase, dir.xids.len())); + self.pending_directory_entries.push(( + DirectoryKind::TwoPhase, + MetricsUpdate::Set(dir.xids.len() as u64), + )); Bytes::from(TwoPhaseDirectory::ser(&dir)?) 
@@ -1658,10 +1741,16 @@ impl DatadirModification<'_> {
         }
         if r.is_none() {
             // Create RelDirectory
+            // TODO: if we have fully migrated to v2, no need to create this directory
             let buf = RelDirectory::ser(&RelDirectory {
                 rels: HashSet::new(),
             })?;
-            self.pending_directory_entries.push((DirectoryKind::Rel, 0));
+            self.pending_directory_entries
+                .push((DirectoryKind::Rel, MetricsUpdate::Set(0)));
+            if self.tline.get_rel_size_v2_enabled() {
+                self.pending_directory_entries
+                    .push((DirectoryKind::RelV2, MetricsUpdate::Set(0)));
+            }
             self.put(
                 rel_dir_to_key(spcnode, dbnode),
                 Value::Image(Bytes::from(buf)),
@@ -1685,8 +1774,10 @@ impl DatadirModification<'_> {
             if !dir.xids.insert(xid) {
                 anyhow::bail!("twophase file for xid {} already exists", xid);
             }
-            self.pending_directory_entries
-                .push((DirectoryKind::TwoPhase, dir.xids.len()));
+            self.pending_directory_entries.push((
+                DirectoryKind::TwoPhase,
+                MetricsUpdate::Set(dir.xids.len() as u64),
+            ));
             Bytes::from(TwoPhaseDirectoryV17::ser(&dir)?)
         } else {
             let xid = xid as u32;
@@ -1694,8 +1785,10 @@ impl DatadirModification<'_> {
             if !dir.xids.insert(xid) {
                 anyhow::bail!("twophase file for xid {} already exists", xid);
             }
-            self.pending_directory_entries
-                .push((DirectoryKind::TwoPhase, dir.xids.len()));
+            self.pending_directory_entries.push((
+                DirectoryKind::TwoPhase,
+                MetricsUpdate::Set(dir.xids.len() as u64),
+            ));
             Bytes::from(TwoPhaseDirectory::ser(&dir)?)
         };
         self.put(TWOPHASEDIR_KEY, Value::Image(newdirbuf));
@@ -1744,8 +1837,10 @@ impl DatadirModification<'_> {
         let mut dir = DbDirectory::des(&buf)?;
         if dir.dbdirs.remove(&(spcnode, dbnode)).is_some() {
             let buf = DbDirectory::ser(&dir)?;
-            self.pending_directory_entries
-                .push((DirectoryKind::Db, dir.dbdirs.len()));
+            self.pending_directory_entries.push((
+                DirectoryKind::Db,
+                MetricsUpdate::Set(dir.dbdirs.len() as u64),
+            ));
             self.put(DBDIR_KEY, Value::Image(buf.into()));
         } else {
             warn!(
@@ -1778,39 +1873,85 @@ impl DatadirModification<'_> {
         // tablespace. Create the reldir entry for it if so.
         let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await.context("read db")?)
             .context("deserialize db")?;
-        let rel_dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode);
-        let mut rel_dir =
+
+        let dbdir_exists =
             if let hash_map::Entry::Vacant(e) = dbdir.dbdirs.entry((rel.spcnode, rel.dbnode)) {
                 // Didn't exist. Update dbdir
                 e.insert(false);
                 let buf = DbDirectory::ser(&dbdir).context("serialize db")?;
-                self.pending_directory_entries
-                    .push((DirectoryKind::Db, dbdir.dbdirs.len()));
+                self.pending_directory_entries.push((
+                    DirectoryKind::Db,
+                    MetricsUpdate::Set(dbdir.dbdirs.len() as u64),
+                ));
                 self.put(DBDIR_KEY, Value::Image(buf.into()));
-
-                // and create the RelDirectory
-                RelDirectory::default()
+                false
             } else {
-                // reldir already exists, fetch it
-                RelDirectory::des(&self.get(rel_dir_key, ctx).await.context("read db")?)
-                    .context("deserialize db")?
+                true
             };
 
+        let rel_dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode);
+        let mut rel_dir = if !dbdir_exists {
+            // Create the RelDirectory
+            RelDirectory::default()
+        } else {
+            // reldir already exists, fetch it
+            RelDirectory::des(&self.get(rel_dir_key, ctx).await.context("read db")?)
+                .context("deserialize db")?
+        };
+
         // Add the new relation to the rel directory entry, and write it back
         if !rel_dir.rels.insert((rel.relnode, rel.forknum)) {
             return Err(RelationError::AlreadyExists);
         }
 
-        self.pending_directory_entries
-            .push((DirectoryKind::Rel, rel_dir.rels.len()));
-
-        self.put(
-            rel_dir_key,
-            Value::Image(Bytes::from(
-                RelDirectory::ser(&rel_dir).context("serialize")?,
-            )),
-        );
-
+        if self.tline.get_rel_size_v2_enabled() {
+            let sparse_rel_dir_key =
+                rel_tag_sparse_key(rel.spcnode, rel.dbnode, rel.relnode, rel.forknum);
+            // check if the rel_dir_key exists in v2
+            let val = self
+                .sparse_get(sparse_rel_dir_key, ctx)
+                .await
+                .map_err(|e| RelationError::Other(e.into()))?;
+            let val = RelDirExists::decode_option(val)
+                .map_err(|_| RelationError::Other(anyhow::anyhow!("invalid reldir key")))?;
+            if val == RelDirExists::Exists {
+                return Err(RelationError::AlreadyExists);
+            }
+            self.put(
+                sparse_rel_dir_key,
+                Value::Image(RelDirExists::Exists.encode()),
+            );
+            if !dbdir_exists {
+                self.pending_directory_entries
+                    .push((DirectoryKind::Rel, MetricsUpdate::Set(0)));
+                self.pending_directory_entries
+                    .push((DirectoryKind::RelV2, MetricsUpdate::Set(0)));
+                // We don't write `rel_dir_key -> rel_dir.rels` back to the storage in the v2 path unless it's the initial creation.
+                // TODO: if we have fully migrated to v2, no need to create this directory. Otherwise, there
+                // will be key not found errors if we don't create an empty one for rel_size_v2.
+                self.put(
+                    rel_dir_key,
+                    Value::Image(Bytes::from(
+                        RelDirectory::ser(&RelDirectory::default()).context("serialize")?,
+                    )),
+                );
+            }
+            self.pending_directory_entries
+                .push((DirectoryKind::RelV2, MetricsUpdate::Add(1)));
+        } else {
+            if !dbdir_exists {
+                self.pending_directory_entries
+                    .push((DirectoryKind::Rel, MetricsUpdate::Set(0)))
+            }
+            self.pending_directory_entries
+                .push((DirectoryKind::Rel, MetricsUpdate::Add(1)));
+            self.put(
+                rel_dir_key,
+                Value::Image(Bytes::from(
+                    RelDirectory::ser(&rel_dir).context("serialize")?,
+                )),
+            );
+        }
         // Put size
         let size_key = rel_size_to_key(rel);
         let buf = nblocks.to_le_bytes();
@@ -1896,9 +2037,34 @@ impl DatadirModification<'_> {
 
         let mut dirty = false;
         for rel_tag in rel_tags {
-            if dir.rels.remove(&(rel_tag.relnode, rel_tag.forknum)) {
+            let found = if dir.rels.remove(&(rel_tag.relnode, rel_tag.forknum)) {
+                self.pending_directory_entries
+                    .push((DirectoryKind::Rel, MetricsUpdate::Sub(1)));
                 dirty = true;
+                true
+            } else if self.tline.get_rel_size_v2_enabled() {
+                // The rel is not found in the old reldir key, so we need to check the new sparse keyspace.
+                // Note that a relation can only exist in one of the two keyspaces (guaranteed by the ingestion
+                // logic).
+                let key =
+                    rel_tag_sparse_key(spc_node, db_node, rel_tag.relnode, rel_tag.forknum);
+                let val = RelDirExists::decode_option(self.sparse_get(key, ctx).await?)
+                    .map_err(|_| RelationError::Other(anyhow::anyhow!("invalid reldir key")))?;
+                if val == RelDirExists::Exists {
+                    self.pending_directory_entries
+                        .push((DirectoryKind::RelV2, MetricsUpdate::Sub(1)));
+                    // put tombstone
+                    self.put(key, Value::Image(RelDirExists::Removed.encode()));
+                    // no need to set dirty to true
+                    true
+                } else {
+                    false
+                }
+            } else {
+                false
+            };
+
+            if found {
                 // update logical size
                 let size_key = rel_size_to_key(rel_tag);
                 let old_size = self.get(size_key, ctx).await?.get_u32_le();
@@ -1914,8 +2080,6 @@ impl DatadirModification<'_> {
 
         if dirty {
             self.put(dir_key, Value::Image(Bytes::from(RelDirectory::ser(&dir)?)));
-            self.pending_directory_entries
-                .push((DirectoryKind::Rel, dir.rels.len()));
         }
     }
@@ -1939,8 +2103,10 @@ impl DatadirModification<'_> {
         if !dir.segments.insert(segno) {
             anyhow::bail!("slru segment {kind:?}/{segno} already exists");
         }
-        self.pending_directory_entries
-            .push((DirectoryKind::SlruSegment(kind), dir.segments.len()));
+        self.pending_directory_entries.push((
+            DirectoryKind::SlruSegment(kind),
+            MetricsUpdate::Set(dir.segments.len() as u64),
+        ));
         self.put(
             dir_key,
             Value::Image(Bytes::from(SlruSegmentDirectory::ser(&dir)?)),
@@ -1987,8 +2153,10 @@ impl DatadirModification<'_> {
         if !dir.segments.remove(&segno) {
             warn!("slru segment {:?}/{} does not exist", kind, segno);
         }
-        self.pending_directory_entries
-            .push((DirectoryKind::SlruSegment(kind), dir.segments.len()));
+        self.pending_directory_entries.push((
+            DirectoryKind::SlruSegment(kind),
+            MetricsUpdate::Set(dir.segments.len() as u64),
+        ));
         self.put(
             dir_key,
             Value::Image(Bytes::from(SlruSegmentDirectory::ser(&dir)?)),
@@ -2020,8 +2188,10 @@ impl DatadirModification<'_> {
             if !dir.xids.remove(&xid) {
                 warn!("twophase file for xid {} does not exist", xid);
             }
-            self.pending_directory_entries
-                .push((DirectoryKind::TwoPhase, dir.xids.len()));
+            self.pending_directory_entries.push((
+                DirectoryKind::TwoPhase,
+                MetricsUpdate::Set(dir.xids.len() as u64),
+            ));
             Bytes::from(TwoPhaseDirectoryV17::ser(&dir)?)
         } else {
             let xid: u32 = u32::try_from(xid)?;
             let mut dir = TwoPhaseDirectory::des(&buf)?;
             if !dir.xids.remove(&xid) {
                 warn!("twophase file for xid {} does not exist", xid);
             }
-            self.pending_directory_entries
-                .push((DirectoryKind::TwoPhase, dir.xids.len()));
+            self.pending_directory_entries.push((
+                DirectoryKind::TwoPhase,
+                MetricsUpdate::Set(dir.xids.len() as u64),
+            ));
             Bytes::from(TwoPhaseDirectory::ser(&dir)?)
         };
         self.put(TWOPHASEDIR_KEY, Value::Image(newdirbuf));
@@ -2147,7 +2319,7 @@ impl DatadirModification<'_> {
         }
 
         for (kind, count) in std::mem::take(&mut self.pending_directory_entries) {
-            writer.update_directory_entries_count(kind, count as u64);
+            writer.update_directory_entries_count(kind, count);
         }
 
         Ok(())
@@ -2233,7 +2405,7 @@ impl DatadirModification<'_> {
         }
 
         for (kind, count) in std::mem::take(&mut self.pending_directory_entries) {
-            writer.update_directory_entries_count(kind, count as u64);
+            writer.update_directory_entries_count(kind, count);
         }
 
         self.pending_metadata_bytes = 0;
@@ -2297,6 +2469,22 @@ impl DatadirModification<'_> {
         self.tline.get(key, lsn, ctx).await
     }
 
+    /// Get a key from the sparse keyspace. Automatically converts the missing key error
+    /// and the empty value into None.
+    async fn sparse_get(
+        &self,
+        key: Key,
+        ctx: &RequestContext,
+    ) -> Result<Option<Bytes>, PageReconstructError> {
+        let val = self.get(key, ctx).await;
+        match val {
+            Ok(val) if val.is_empty() => Ok(None),
+            Ok(val) => Ok(Some(val)),
+            Err(PageReconstructError::MissingKey(_)) => Ok(None),
+            Err(e) => Err(e),
+        }
+    }
+
     fn put(&mut self, key: Key, val: Value) {
         if Self::is_data_key(&key) {
             self.put_data(key.to_compact(), val)
@@ -2379,6 +2567,23 @@ impl Version<'_> {
         }
     }
 
+    /// Get a key from the sparse keyspace. Automatically converts the missing key error
+    /// and the empty value into None.
+    async fn sparse_get(
+        &self,
+        timeline: &Timeline,
+        key: Key,
+        ctx: &RequestContext,
+    ) -> Result<Option<Bytes>, PageReconstructError> {
+        let val = self.get(timeline, key, ctx).await;
+        match val {
+            Ok(val) if val.is_empty() => Ok(None),
+            Ok(val) => Ok(Some(val)),
+            Err(PageReconstructError::MissingKey(_)) => Ok(None),
+            Err(e) => Err(e),
+        }
+    }
+
     fn get_lsn(&self) -> Lsn {
         match self {
             Version::Lsn(lsn) => *lsn,
@@ -2438,6 +2643,7 @@ pub(crate) enum DirectoryKind {
     Rel,
     AuxFiles,
     SlruSegment(SlruKind),
+    RelV2,
 }
 
 impl DirectoryKind {
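Both `sparse_get` helpers encode the same convention: in the sparse keyspace, a missing key and a zero-length value both mean "no entry", and only a non-empty value is surfaced to the caller. Standalone, the mapping looks like this (a sketch with a stand-in error type, since only the missing-key variant matters here):

#[derive(Debug, PartialEq)]
enum GetError {
    MissingKey,
    Other(String),
}

/// Sparse keyspace read convention: absent key == empty value == None.
fn sparse_to_option(res: Result<Vec<u8>, GetError>) -> Result<Option<Vec<u8>>, GetError> {
    match res {
        Ok(v) if v.is_empty() => Ok(None),
        Ok(v) => Ok(Some(v)),
        Err(GetError::MissingKey) => Ok(None),
        Err(e) => Err(e),
    }
}

fn main() {
    assert_eq!(sparse_to_option(Ok(vec![])), Ok(None));
    assert_eq!(sparse_to_option(Err(GetError::MissingKey)), Ok(None));
    assert_eq!(sparse_to_option(Ok(vec![1])), Ok(Some(vec![1])));
}

The empty-value case suggests that deletions in the sparse keyspace can surface as zero-length images rather than hard key removals; the helper treats both uniformly.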
diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs
index 622738022a..cc93a06ccd 100644
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -328,8 +328,8 @@ pub enum TaskKind {
     // Eviction. One per timeline.
     Eviction,
 
-    // Ingest housekeeping (flushing ephemeral layers on time threshold or disk pressure)
-    IngestHousekeeping,
+    // Tenant housekeeping (flush idle ephemeral layers, shut down idle walredo, etc.).
+    TenantHousekeeping,
 
     /// See [`crate::disk_usage_eviction_task`].
     DiskUsageEviction,
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 3c6996dd51..5d917da574 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -20,6 +20,7 @@ use chrono::NaiveDateTime;
 use enumset::EnumSet;
 use futures::stream::FuturesUnordered;
 use futures::StreamExt;
+use itertools::Itertools as _;
 use pageserver_api::models;
 use pageserver_api::models::CompactInfoResponse;
 use pageserver_api::models::LsnLease;
@@ -39,6 +40,8 @@ use remote_timeline_client::manifest::{
 use remote_timeline_client::UploadQueueNotReadyError;
 use remote_timeline_client::FAILED_REMOTE_OP_RETRIES;
 use remote_timeline_client::FAILED_UPLOAD_WARN_THRESHOLD;
+use secondary::heatmap::HeatMapTenant;
+use secondary::heatmap::HeatMapTimeline;
 use std::collections::BTreeMap;
 use std::fmt;
 use std::future::Future;
@@ -51,10 +54,14 @@ use timeline::compaction::GcCompactionQueue;
 use timeline::import_pgdata;
 use timeline::offload::offload_timeline;
 use timeline::offload::OffloadError;
+use timeline::CompactFlags;
 use timeline::CompactOptions;
+use timeline::CompactionError;
+use timeline::PreviousHeatmap;
 use timeline::ShutdownMode;
 use tokio::io::BufReader;
 use tokio::sync::watch;
+use tokio::sync::Notify;
 use tokio::task::JoinSet;
 use tokio_util::sync::CancellationToken;
 use tracing::*;
@@ -258,6 +265,7 @@ struct TimelinePreload {
     timeline_id: TimelineId,
     client: RemoteTimelineClient,
     index_part: Result<MaybeDeletedIndexPart, DownloadError>,
+    previous_heatmap: Option<PreviousHeatmap>,
 }
 
 pub(crate) struct TenantPreload {
@@ -349,6 +357,9 @@ pub struct Tenant {
     /// Overhead of mutex is acceptable because compaction is done with a multi-second period.
     compaction_circuit_breaker: std::sync::Mutex<CircuitBreaker>,
 
+    /// Signals the tenant compaction loop that there is L0 compaction work to be done.
+    pub(crate) l0_compaction_trigger: Arc<Notify>,
+
     /// Scheduled gc-compaction tasks.
     scheduled_compaction_tasks: std::sync::Mutex<HashMap<TimelineId, Arc<GcCompactionQueue>>>,
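`l0_compaction_trigger` is the wake-up channel between ingest/flush and the compaction loop rewritten further down: `compaction_loop` selects on `notified()`, and whoever detects pending L0 work calls `notify_one()`. A minimal sketch of that pattern (the task bodies here are hypothetical):

use std::sync::Arc;
use std::time::Duration;
use tokio::sync::Notify;

#[tokio::main]
async fn main() {
    let trigger = Arc::new(Notify::new());

    // Compaction-loop side: wait for either the periodic tick or an explicit nudge.
    let waiter = {
        let trigger = trigger.clone();
        tokio::spawn(async move {
            tokio::select! {
                _ = tokio::time::sleep(Duration::from_secs(20)) => "tick",
                _ = trigger.notified() => "nudged",
            }
        })
    };

    // Ingest side: too many L0 layers accumulated, nudge compaction now.
    // notify_one() stores a permit if no task is waiting yet, so this never races.
    trigger.notify_one();
    assert_eq!(waiter.await.unwrap(), "nudged");
}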
@@ -1121,6 +1132,7 @@ impl Tenant {
         resources: TimelineResources,
         mut index_part: IndexPart,
         metadata: TimelineMetadata,
+        previous_heatmap: Option<PreviousHeatmap>,
         ancestor: Option<Arc<Timeline>>,
         cause: LoadTimelineCause,
         ctx: &RequestContext,
@@ -1151,6 +1163,7 @@ impl Tenant {
         let timeline = self.create_timeline_struct(
             timeline_id,
             &metadata,
+            previous_heatmap,
             ancestor.clone(),
             resources,
             CreateTimelineCause::Load,
@@ -1550,8 +1563,18 @@ impl Tenant {
             }
         }
 
+        // TODO(vlad): Could go to S3 if the secondary is freezing cold and hasn't even
+        // pulled the first heatmap. Not entirely necessary since the storage controller
+        // will kick the secondary in any case and cause a download.
+        let maybe_heatmap_at = self.read_on_disk_heatmap().await;
+
         let timelines = self
-            .load_timelines_metadata(remote_timeline_ids, remote_storage, cancel)
+            .load_timelines_metadata(
+                remote_timeline_ids,
+                remote_storage,
+                maybe_heatmap_at,
+                cancel,
+            )
             .await?;
 
         Ok(TenantPreload {
@@ -1564,6 +1587,26 @@ impl Tenant {
         })
     }
 
+    async fn read_on_disk_heatmap(&self) -> Option<(HeatMapTenant, std::time::Instant)> {
+        let on_disk_heatmap_path = self.conf.tenant_heatmap_path(&self.tenant_shard_id);
+        match tokio::fs::read_to_string(on_disk_heatmap_path).await {
+            Ok(heatmap) => match serde_json::from_str::<HeatMapTenant>(&heatmap) {
+                Ok(heatmap) => Some((heatmap, std::time::Instant::now())),
+                Err(err) => {
+                    error!("Failed to deserialize old heatmap: {err}");
+                    None
+                }
+            },
+            Err(err) => match err.kind() {
+                std::io::ErrorKind::NotFound => None,
+                _ => {
+                    error!("Unexpected IO error reading old heatmap: {err}");
+                    None
+                }
+            },
+        }
+    }
+
     ///
     /// Background task that downloads all data for a tenant and brings it to Active state.
     ///
@@ -1651,7 +1694,10 @@ impl Tenant {
             match index_part {
                 MaybeDeletedIndexPart::IndexPart(index_part) => {
                     timeline_ancestors.insert(timeline_id, index_part.metadata.clone());
-                    remote_index_and_client.insert(timeline_id, (index_part, preload.client));
+                    remote_index_and_client.insert(
+                        timeline_id,
+                        (index_part, preload.client, preload.previous_heatmap),
+                    );
                 }
                 MaybeDeletedIndexPart::Deleted(index_part) => {
                     info!(
@@ -1670,7 +1716,7 @@ impl Tenant {
         // layer file.
         let sorted_timelines = tree_sort_timelines(timeline_ancestors, |m| m.ancestor_timeline())?;
         for (timeline_id, remote_metadata) in sorted_timelines {
-            let (index_part, remote_client) = remote_index_and_client
+            let (index_part, remote_client, previous_heatmap) = remote_index_and_client
                 .remove(&timeline_id)
                 .expect("just put it in above");
@@ -1690,12 +1736,8 @@ impl Tenant {
                 timeline_id,
                 index_part,
                 remote_metadata,
-                TimelineResources {
-                    remote_client,
-                    pagestream_throttle: self.pagestream_throttle.clone(),
-                    pagestream_throttle_metrics: self.pagestream_throttle_metrics.clone(),
-                    l0_flush_global_state: self.l0_flush_global_state.clone(),
-                },
+                previous_heatmap,
+                self.get_timeline_resources_for(remote_client),
                 LoadTimelineCause::Attach,
                 ctx,
             )
@@ -1844,11 +1886,13 @@ impl Tenant {
     }
 
     #[instrument(skip_all, fields(timeline_id=%timeline_id))]
+    #[allow(clippy::too_many_arguments)]
     async fn load_remote_timeline(
         self: &Arc<Self>,
         timeline_id: TimelineId,
         index_part: IndexPart,
         remote_metadata: TimelineMetadata,
+        previous_heatmap: Option<PreviousHeatmap>,
         resources: TimelineResources,
         cause: LoadTimelineCause,
         ctx: &RequestContext,
@@ -1878,6 +1922,7 @@ impl Tenant {
             resources,
             index_part,
             remote_metadata,
+            previous_heatmap,
             ancestor,
             cause,
             ctx,
@@ -1889,14 +1934,29 @@ impl Tenant {
         self: &Arc<Self>,
         timeline_ids: HashSet<TimelineId>,
         remote_storage: &GenericRemoteStorage,
+        heatmap: Option<(HeatMapTenant, std::time::Instant)>,
         cancel: CancellationToken,
     ) -> anyhow::Result<Vec<TimelinePreload>> {
+        let mut timeline_heatmaps = heatmap.map(|h| (h.0.into_timelines_index(), h.1));
+
         let mut part_downloads = JoinSet::new();
         for timeline_id in timeline_ids {
             let cancel_clone = cancel.clone();
+
+            let previous_timeline_heatmap = timeline_heatmaps.as_mut().and_then(|hs| {
+                hs.0.remove(&timeline_id).map(|h| PreviousHeatmap::Active {
+                    heatmap: h,
+                    read_at: hs.1,
+                })
+            });
             part_downloads.spawn(
-                self.load_timeline_metadata(timeline_id, remote_storage.clone(), cancel_clone)
-                    .instrument(info_span!("download_index_part", %timeline_id)),
+                self.load_timeline_metadata(
+                    timeline_id,
+                    remote_storage.clone(),
+                    previous_timeline_heatmap,
+                    cancel_clone,
+                )
+                .instrument(info_span!("download_index_part", %timeline_id)),
             );
         }
 
@@ -1944,6 +2004,7 @@ impl Tenant {
         self: &Arc<Self>,
         timeline_id: TimelineId,
         remote_storage: GenericRemoteStorage,
+        previous_heatmap: Option<PreviousHeatmap>,
         cancel: CancellationToken,
     ) -> impl Future<Output = TimelinePreload> {
         let client = self.build_timeline_client(timeline_id, remote_storage);
@@ -1959,6 +2020,7 @@ impl Tenant {
                 client,
                 timeline_id,
                 index_part,
+                previous_heatmap,
             }
         }
     }
@@ -2070,7 +2132,12 @@ impl Tenant {
         })?;
 
         let timeline_preload = self
-            .load_timeline_metadata(timeline_id, self.remote_storage.clone(), cancel.clone())
+            .load_timeline_metadata(
+                timeline_id,
+                self.remote_storage.clone(),
+                None,
+                cancel.clone(),
+            )
             .await;
 
         let index_part = match timeline_preload.index_part {
@@ -2104,6 +2171,7 @@ impl Tenant {
             timeline_id,
             index_part,
             remote_metadata,
+            None,
             timeline_resources,
             LoadTimelineCause::Unoffload,
             &ctx,
@@ -2819,7 +2887,7 @@ impl Tenant {
         };
         let metadata = index_part.metadata.clone();
         self
-            .load_remote_timeline(timeline_id, index_part, metadata, resources, LoadTimelineCause::ImportPgdata{
+            .load_remote_timeline(timeline_id, index_part, metadata, None, resources, LoadTimelineCause::ImportPgdata{
                 create_guard: timeline_create_guard, activate,
             }, &ctx)
             .await?
             .ready_to_activate()
@@ -2898,150 +2966,197 @@ impl Tenant {
             .await
     }
 
-    /// Perform one compaction iteration.
-    /// This function is periodically called by compactor task.
-    /// Also it can be explicitly requested per timeline through page server
-    /// api's 'compact' command.
+    /// Performs one compaction iteration. Called periodically from the compaction loop. Returns
+    /// whether another compaction is needed, if we still have pending work or if we yield for
+    /// immediate L0 compaction.
     ///
-    /// Returns whether we have pending compaction task.
+    /// Compaction can also be explicitly requested for a timeline via the HTTP API.
     async fn compaction_iteration(
         self: &Arc<Self>,
         cancel: &CancellationToken,
         ctx: &RequestContext,
-    ) -> Result<CompactionOutcome, timeline::CompactionError> {
-        // Don't start doing work during shutdown, or when broken, we do not need those in the logs
+    ) -> Result<CompactionOutcome, CompactionError> {
+        // Don't compact inactive tenants.
         if !self.is_active() {
-            return Ok(CompactionOutcome::Done);
+            return Ok(CompactionOutcome::Skipped);
         }
 
-        {
-            let conf = self.tenant_conf.load();
-
-            // Note that compaction usually requires deletions, but we don't respect
-            // may_delete_layers_hint here: that is because tenants in AttachedMulti
-            // should proceed with compaction even if they can't do deletion, to avoid
-            // accumulating dangerously deep stacks of L0 layers. Deletions will be
-            // enqueued inside RemoteTimelineClient, and executed layer if/when we transition
-            // to AttachedSingle state.
-            if !conf.location.may_upload_layers_hint() {
-                info!("Skipping compaction in location state {:?}", conf.location);
-                return Ok(CompactionOutcome::Done);
-            }
+        // Don't compact tenants that can't upload layers. We don't check `may_delete_layers_hint`,
+        // since we need to compact L0 even in AttachedMulti to bound read amplification.
+        let location = self.tenant_conf.load().location;
+        if !location.may_upload_layers_hint() {
+            info!("skipping compaction in location state {location:?}");
+            return Ok(CompactionOutcome::Skipped);
         }
 
-        // Scan through the hashmap and collect a list of all the timelines,
-        // while holding the lock. Then drop the lock and actually perform the
-        // compactions. We don't want to block everything else while the
-        // compaction runs.
-        let timelines_to_compact_or_offload;
-        {
-            let timelines = self.timelines.lock().unwrap();
-            timelines_to_compact_or_offload = timelines
-                .iter()
-                .filter_map(|(timeline_id, timeline)| {
-                    let (is_active, (can_offload, _)) =
-                        (timeline.is_active(), timeline.can_offload());
-                    let has_no_unoffloaded_children = {
-                        !timelines
-                            .iter()
-                            .any(|(_id, tl)| tl.get_ancestor_timeline_id() == Some(*timeline_id))
-                    };
-                    let config_allows_offload = self.conf.timeline_offloading
-                        || self
-                            .tenant_conf
-                            .load()
-                            .tenant_conf
-                            .timeline_offloading
-                            .unwrap_or_default();
-                    let can_offload =
-                        can_offload && has_no_unoffloaded_children && config_allows_offload;
-                    if (is_active, can_offload) == (false, false) {
-                        None
-                    } else {
-                        Some((*timeline_id, timeline.clone(), (is_active, can_offload)))
-                    }
-                })
-                .collect::<Vec<_>>();
-            drop(timelines);
-        }
-
-        // Before doing any I/O work, check our circuit breaker
+        // Don't compact if the circuit breaker is tripped.
         if self.compaction_circuit_breaker.lock().unwrap().is_broken() {
-            info!("Skipping compaction due to previous failures");
-            return Ok(CompactionOutcome::Done);
+            info!("skipping compaction due to previous failures");
+            return Ok(CompactionOutcome::Skipped);
         }
 
-        let mut has_pending_task = false;
+        // Collect all timelines to compact, along with offload instructions and L0 counts.
+        let mut compact: Vec<Arc<Timeline>> = Vec::new();
+        let mut offload: HashSet<TimelineId> = HashSet::new();
+        let mut l0_counts: HashMap<TimelineId, usize> = HashMap::new();
 
-        for (timeline_id, timeline, (can_compact, can_offload)) in &timelines_to_compact_or_offload
         {
-            // pending_task_left == None: cannot compact, maybe still pending tasks
-            // pending_task_left == Some(Pending): compaction task left
-            // pending_task_left == Some(Done): no compaction task left
-            let pending_task_left = if *can_compact {
-                let compaction_outcome = timeline
-                    .compact(cancel, EnumSet::empty(), ctx)
-                    .instrument(info_span!("compact_timeline", %timeline_id))
-                    .await
-                    .inspect_err(|e| match e {
-                        timeline::CompactionError::ShuttingDown => (),
-                        timeline::CompactionError::Offload(_) => {
-                            // Failures to offload timelines do not trip the circuit breaker, because
-                            // they do not do lots of writes the way compaction itself does: it is cheap
-                            // to retry, and it would be bad to stop all compaction because of an issue with offloading.
-                        }
-                        timeline::CompactionError::Other(e) => {
-                            self.compaction_circuit_breaker
-                                .lock()
-                                .unwrap()
-                                .fail(&CIRCUIT_BREAKERS_BROKEN, e);
-                        }
-                    })?;
-                if let CompactionOutcome::Pending = compaction_outcome {
-                    Some(CompactionOutcome::Pending)
-                } else {
-                    let queue = {
-                        let guard = self.scheduled_compaction_tasks.lock().unwrap();
-                        guard.get(timeline_id).cloned()
-                    };
-                    if let Some(queue) = queue {
-                        let outcome = queue
-                            .iteration(cancel, ctx, &self.gc_block, timeline)
-                            .await?;
-                        Some(outcome)
-                    } else {
-                        Some(CompactionOutcome::Done)
-                    }
+            let offload_enabled = self.get_timeline_offloading_enabled();
+            let timelines = self.timelines.lock().unwrap();
+            for (&timeline_id, timeline) in timelines.iter() {
+                // Skip inactive timelines.
+                if !timeline.is_active() {
+                    continue;
                 }
-            } else {
-                None
-            };
-            has_pending_task |= pending_task_left == Some(CompactionOutcome::Pending);
-            if pending_task_left == Some(CompactionOutcome::Done) && *can_offload {
-                pausable_failpoint!("before-timeline-auto-offload");
-                match offload_timeline(self, timeline)
-                    .instrument(info_span!("offload_timeline", %timeline_id))
-                    .await
-                {
-                    Err(OffloadError::NotArchived) => {
-                        // Ignore this, we likely raced with unarchival
-                        Ok(())
-                    }
-                    other => other,
-                }?;
+
+                // Schedule the timeline for compaction.
+                compact.push(timeline.clone());
+
+                // Schedule the timeline for offloading if eligible.
+                let can_offload = offload_enabled
+                    && timeline.can_offload().0
+                    && !timelines
+                        .iter()
+                        .any(|(_, tli)| tli.get_ancestor_timeline_id() == Some(timeline_id));
+                if can_offload {
+                    offload.insert(timeline_id);
+                }
+            }
+        } // release timelines lock
+
+        for timeline in &compact {
+            // Collect L0 counts. Can't await while holding lock above.
+            if let Ok(lm) = timeline.layers.read().await.layer_map() {
+                l0_counts.insert(timeline.timeline_id, lm.level0_deltas().len());
            }
        }
 
+        // Pass 1: L0 compaction across all timelines, in order of L0 count. We prioritize this to
+        // bound read amplification.
+        //
+        // TODO: this may spin on one or more ingest-heavy timelines, starving out image/GC
+        // compaction and offloading. We leave that as a potential problem to solve later. Consider
+        // splitting L0 and image/GC compaction to separate background jobs.
+        if self.get_compaction_l0_first() {
+            let compaction_threshold = self.get_compaction_threshold();
+            let compact_l0 = compact
+                .iter()
+                .map(|tli| (tli, l0_counts.get(&tli.timeline_id).copied().unwrap_or(0)))
+                .filter(|&(_, l0)| l0 >= compaction_threshold)
+                .sorted_by_key(|&(_, l0)| l0)
+                .rev()
+                .map(|(tli, _)| tli.clone())
+                .collect_vec();
+
+            let mut has_pending_l0 = false;
+            for timeline in compact_l0 {
+                let outcome = timeline
+                    .compact(cancel, CompactFlags::OnlyL0Compaction.into(), ctx)
+                    .instrument(info_span!("compact_timeline", timeline_id = %timeline.timeline_id))
+                    .await
+                    .inspect_err(|err| self.maybe_trip_compaction_breaker(err))?;
+                match outcome {
+                    CompactionOutcome::Done => {}
+                    CompactionOutcome::Skipped => {}
+                    CompactionOutcome::Pending => has_pending_l0 = true,
+                    CompactionOutcome::YieldForL0 => has_pending_l0 = true,
+                }
+            }
+            if has_pending_l0 {
+                return Ok(CompactionOutcome::YieldForL0); // do another pass
+            }
+        }
+
+        // Pass 2: image compaction and timeline offloading. If any timelines have accumulated
+        // more L0 layers, they may also be compacted here.
+        //
+        // NB: image compaction may yield if there is pending L0 compaction.
+        //
+        // TODO: it will only yield if there is pending L0 compaction on the same timeline. If a
+        // different timeline needs compaction, it won't. It should check `l0_compaction_trigger`.
+        // We leave this for a later PR.
+        //
+        // TODO: consider ordering timelines by some priority, e.g. time since last full compaction,
+        // amount of L1 delta debt or garbage, offload-eligible timelines first, etc.
+        let mut has_pending = false;
+        for timeline in compact {
+            if !timeline.is_active() {
+                continue;
+            }
+
+            let mut outcome = timeline
+                .compact(cancel, EnumSet::default(), ctx)
+                .instrument(info_span!("compact_timeline", timeline_id = %timeline.timeline_id))
+                .await
+                .inspect_err(|err| self.maybe_trip_compaction_breaker(err))?;
+
+            // If we're done compacting, check the scheduled GC compaction queue for more work.
+            if outcome == CompactionOutcome::Done {
+                let queue = self
+                    .scheduled_compaction_tasks
+                    .lock()
+                    .unwrap()
+                    .get(&timeline.timeline_id)
+                    .cloned();
+                if let Some(queue) = queue {
+                    outcome = queue
+                        .iteration(cancel, ctx, &self.gc_block, &timeline)
+                        .instrument(
+                            info_span!("gc_compact_timeline", timeline_id = %timeline.timeline_id),
+                        )
+                        .await?;
+                }
+            }
+
+            // If we're done compacting, offload the timeline if requested.
+            if outcome == CompactionOutcome::Done && offload.contains(&timeline.timeline_id) {
+                pausable_failpoint!("before-timeline-auto-offload");
+                offload_timeline(self, &timeline)
+                    .instrument(info_span!("offload_timeline", timeline_id = %timeline.timeline_id))
+                    .await
+                    .or_else(|err| match err {
+                        // Ignore this, we likely raced with unarchival.
+                        OffloadError::NotArchived => Ok(()),
+                        err => Err(err),
+                    })?;
+            }
+
+            match outcome {
+                CompactionOutcome::Done => {}
+                CompactionOutcome::Skipped => {}
+                CompactionOutcome::Pending => has_pending = true,
+                // This mostly makes sense when the L0-only pass above is enabled, since there's
+                // otherwise no guarantee that we'll start with the timeline that has high L0.
+                CompactionOutcome::YieldForL0 => return Ok(CompactionOutcome::YieldForL0),
+            }
+        }
+
+        // Success! Untrip the breaker if necessary.
         self.compaction_circuit_breaker
             .lock()
             .unwrap()
             .success(&CIRCUIT_BREAKERS_UNBROKEN);
 
-        Ok(if has_pending_task {
-            CompactionOutcome::Pending
-        } else {
-            CompactionOutcome::Done
-        })
+        match has_pending {
+            true => Ok(CompactionOutcome::Pending),
+            false => Ok(CompactionOutcome::Done),
+        }
+    }
+
+    /// Trips the compaction circuit breaker if appropriate.
+    pub(crate) fn maybe_trip_compaction_breaker(&self, err: &CompactionError) {
+        match err {
+            CompactionError::ShuttingDown => (),
+            // Offload failures don't trip the circuit breaker, since they're cheap to retry and
+            // shouldn't block compaction.
+            CompactionError::Offload(_) => {}
+            CompactionError::Other(err) => {
+                self.compaction_circuit_breaker
+                    .lock()
+                    .unwrap()
+                    .fail(&CIRCUIT_BREAKERS_BROKEN, err);
+            }
+        }
    }
 
     /// Cancel scheduled compaction tasks
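`CircuitBreaker` itself is defined elsewhere; the trip/untrip protocol used above is the standard consecutive-failure breaker. A simplified sketch of the state machine (ignoring the metrics arguments the real `fail`/`success` take):

/// Sketch of a consecutive-failure circuit breaker; field names are illustrative.
struct CircuitBreaker {
    broken: bool,
    consecutive_failures: u32,
    threshold: u32,
}

impl CircuitBreaker {
    fn is_broken(&self) -> bool {
        self.broken
    }

    fn fail(&mut self) {
        self.consecutive_failures += 1;
        if self.consecutive_failures >= self.threshold {
            self.broken = true; // stop scheduling work until a success resets it
        }
    }

    fn success(&mut self) {
        self.consecutive_failures = 0;
        self.broken = false;
    }
}

fn main() {
    let mut cb = CircuitBreaker { broken: false, consecutive_failures: 0, threshold: 5 };
    for _ in 0..5 {
        cb.fail();
    }
    assert!(cb.is_broken());
    cb.success();
    assert!(!cb.is_broken());
}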
@@ -3088,32 +3203,28 @@ impl Tenant {
         Ok(rx)
     }
 
-    // Call through to all timelines to freeze ephemeral layers if needed. Usually
-    // this happens during ingest: this background housekeeping is for freezing layers
-    // that are open but haven't been written to for some time.
-    async fn ingest_housekeeping(&self) {
-        // Scan through the hashmap and collect a list of all the timelines,
-        // while holding the lock. Then drop the lock and actually perform the
-        // compactions. We don't want to block everything else while the
-        // compaction runs.
-        let timelines = {
-            self.timelines
-                .lock()
-                .unwrap()
-                .values()
-                .filter_map(|timeline| {
-                    if timeline.is_active() {
-                        Some(timeline.clone())
-                    } else {
-                        None
-                    }
-                })
-                .collect::<Vec<_>>()
-        };
+    /// Performs periodic housekeeping, via the tenant housekeeping background task.
+    async fn housekeeping(&self) {
+        // Call through to all timelines to freeze ephemeral layers as needed. This usually happens
+        // during ingest, but we don't want idle timelines to hold open layers for too long.
+        let timelines = self
+            .timelines
+            .lock()
+            .unwrap()
+            .values()
+            .filter(|tli| tli.is_active())
+            .cloned()
+            .collect_vec();
 
-        for timeline in &timelines {
+        for timeline in timelines {
             timeline.maybe_freeze_ephemeral_layer().await;
         }
+
+        // Shut down walredo if idle.
+        const WALREDO_IDLE_TIMEOUT: Duration = Duration::from_secs(180);
+        if let Some(ref walredo_mgr) = self.walredo_mgr {
+            walredo_mgr.maybe_quiesce(WALREDO_IDLE_TIMEOUT);
+        }
     }
 
     pub fn timeline_has_no_attached_children(&self, timeline_id: TimelineId) -> bool {
@@ -3816,6 +3927,13 @@ impl Tenant {
             .unwrap_or(self.conf.default_tenant_conf.compaction_threshold)
     }
 
+    pub fn get_rel_size_v2_enabled(&self) -> bool {
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
+        tenant_conf
+            .rel_size_v2_enabled
+            .unwrap_or(self.conf.default_tenant_conf.rel_size_v2_enabled)
+    }
+
     pub fn get_compaction_upper_limit(&self) -> usize {
         let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
         tenant_conf
@@ -3823,6 +3941,13 @@ impl Tenant {
             .unwrap_or(self.conf.default_tenant_conf.compaction_upper_limit)
     }
 
+    pub fn get_compaction_l0_first(&self) -> bool {
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
+        tenant_conf
+            .compaction_l0_first
+            .unwrap_or(self.conf.default_tenant_conf.compaction_l0_first)
+    }
+
     pub fn get_gc_horizon(&self) -> u64 {
         let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
         tenant_conf
@@ -3877,6 +4002,16 @@ impl Tenant {
             .unwrap_or(self.conf.default_tenant_conf.lsn_lease_length)
     }
 
+    pub fn get_timeline_offloading_enabled(&self) -> bool {
+        if self.conf.timeline_offloading {
+            return true;
+        }
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
+        tenant_conf
+            .timeline_offloading
+            .unwrap_or(self.conf.default_tenant_conf.timeline_offloading)
+    }
+
     /// Generate an up-to-date TenantManifest based on the state of this Tenant.
     fn build_tenant_manifest(&self) -> TenantManifest {
         let timelines_offloaded = self.timelines_offloaded.lock().unwrap();
@@ -3971,6 +4106,7 @@ impl Tenant {
         &self,
         new_timeline_id: TimelineId,
         new_metadata: &TimelineMetadata,
+        previous_heatmap: Option<PreviousHeatmap>,
         ancestor: Option<Arc<Timeline>>,
         resources: TimelineResources,
         cause: CreateTimelineCause,
@@ -3994,6 +4130,7 @@ impl Tenant {
             self.conf,
             Arc::clone(&self.tenant_conf),
             new_metadata,
+            previous_heatmap,
             ancestor,
             new_timeline_id,
             self.tenant_shard_id,
@@ -4115,6 +4252,7 @@ impl Tenant {
                 // use an extremely long backoff.
                 Some(Duration::from_secs(3600 * 24)),
             )),
+            l0_compaction_trigger: Arc::new(Notify::new()),
             scheduled_compaction_tasks: Mutex::new(Default::default()),
             activate_now_sem: tokio::sync::Semaphore::new(0),
             attach_wal_lag_cooldown: Arc::new(std::sync::OnceLock::new()),
@@ -4635,24 +4773,24 @@ impl Tenant {
         // We check it against both the planned GC cutoff stored in 'gc_info',
         // and the 'latest_gc_cutoff' of the last GC that was performed.  The
         // planned GC cutoff in 'gc_info' is normally larger than
-        // 'latest_gc_cutoff_lsn', but beware of corner cases like if you just
+        // 'applied_gc_cutoff_lsn', but beware of corner cases like if you just
         // changed the GC settings for the tenant to make the PITR window
         // larger, but some of the data was already removed by an earlier GC
         // iteration.
 
         // check against last actual 'latest_gc_cutoff' first
-        let latest_gc_cutoff_lsn = src_timeline.get_latest_gc_cutoff_lsn();
+        let applied_gc_cutoff_lsn = src_timeline.get_applied_gc_cutoff_lsn();
         {
             let gc_info = src_timeline.gc_info.read().unwrap();
             let planned_cutoff = gc_info.min_cutoff();
             if gc_info.lsn_covered_by_lease(start_lsn) {
-                tracing::info!("skipping comparison of {start_lsn} with gc cutoff {} and planned gc cutoff {planned_cutoff} due to lsn lease", *latest_gc_cutoff_lsn);
+                tracing::info!("skipping comparison of {start_lsn} with gc cutoff {} and planned gc cutoff {planned_cutoff} due to lsn lease", *applied_gc_cutoff_lsn);
             } else {
                 src_timeline
-                    .check_lsn_is_in_scope(start_lsn, &latest_gc_cutoff_lsn)
+                    .check_lsn_is_in_scope(start_lsn, &applied_gc_cutoff_lsn)
                     .context(format!(
                         "invalid branch start lsn: less than latest GC cutoff {}",
-                        *latest_gc_cutoff_lsn,
+                        *applied_gc_cutoff_lsn,
                     ))
                     .map_err(CreateTimelineError::AncestorLsn)?;
 
@@ -4691,7 +4829,7 @@ impl Tenant {
             dst_prev,
             Some(src_id),
             start_lsn,
-            *src_timeline.latest_gc_cutoff_lsn.read(), // FIXME: should we hold onto this guard longer?
+            *src_timeline.applied_gc_cutoff_lsn.read(), // FIXME: should we hold onto this guard longer?
             src_timeline.initdb_lsn,
             src_timeline.pg_version,
         );
@@ -5023,12 +5161,19 @@ impl Tenant {
         )
     }
 
-    /// Call this before constructing a timeline, to build its required structures
+    /// Builds required resources for a new timeline.
     fn build_timeline_resources(&self, timeline_id: TimelineId) -> TimelineResources {
+        let remote_client = self.build_timeline_remote_client(timeline_id);
+        self.get_timeline_resources_for(remote_client)
+    }
+
+    /// Builds timeline resources for the given remote client.
+    fn get_timeline_resources_for(&self, remote_client: RemoteTimelineClient) -> TimelineResources {
         TimelineResources {
-            remote_client: self.build_timeline_remote_client(timeline_id),
+            remote_client,
             pagestream_throttle: self.pagestream_throttle.clone(),
             pagestream_throttle_metrics: self.pagestream_throttle_metrics.clone(),
+            l0_compaction_trigger: self.l0_compaction_trigger.clone(),
             l0_flush_global_state: self.l0_flush_global_state.clone(),
         }
     }
@@ -5057,6 +5202,7 @@ impl Tenant {
             .create_timeline_struct(
                 new_timeline_id,
                 new_metadata,
+                None,
                 ancestor,
                 resources,
                 CreateTimelineCause::Load,
@@ -5474,6 +5620,8 @@ pub(crate) mod harness {
             compaction_threshold: Some(tenant_conf.compaction_threshold),
             compaction_upper_limit: Some(tenant_conf.compaction_upper_limit),
             compaction_algorithm: Some(tenant_conf.compaction_algorithm),
+            compaction_l0_first: Some(tenant_conf.compaction_l0_first),
+            compaction_l0_semaphore: Some(tenant_conf.compaction_l0_semaphore),
             l0_flush_delay_threshold: tenant_conf.l0_flush_delay_threshold,
             l0_flush_stall_threshold: tenant_conf.l0_flush_stall_threshold,
             l0_flush_wait_upload: Some(tenant_conf.l0_flush_wait_upload),
@@ -5502,7 +5650,7 @@ pub(crate) mod harness {
             lsn_lease_length_for_ts: Some(tenant_conf.lsn_lease_length_for_ts),
             timeline_offloading: Some(tenant_conf.timeline_offloading),
             wal_receiver_protocol_override: tenant_conf.wal_receiver_protocol_override,
-            rel_size_v2_enabled: tenant_conf.rel_size_v2_enabled,
+            rel_size_v2_enabled: Some(tenant_conf.rel_size_v2_enabled),
             gc_compaction_enabled: Some(tenant_conf.gc_compaction_enabled),
             gc_compaction_initial_threshold_kb: Some(
                 tenant_conf.gc_compaction_initial_threshold_kb,
@@ -6061,8 +6209,8 @@ mod tests {
         make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?;
 
         repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO)?;
-        let latest_gc_cutoff_lsn = tline.get_latest_gc_cutoff_lsn();
-        assert!(*latest_gc_cutoff_lsn > Lsn(0x25));
+        let applied_gc_cutoff_lsn = tline.get_applied_gc_cutoff_lsn();
+        assert!(*applied_gc_cutoff_lsn > Lsn(0x25));
         match tline.get(*TEST_KEY, Lsn(0x25)) {
             Ok(_) => panic!("request for page should have failed"),
             Err(err) => assert!(err.to_string().contains("not found at")),
@@ -8344,7 +8492,7 @@ mod tests {
             .await?;
         {
             tline
-                .latest_gc_cutoff_lsn
+                .applied_gc_cutoff_lsn
                 .lock_for_write()
                 .store_and_unlock(Lsn(0x30))
                 .wait()
@@ -8452,7 +8600,7 @@ mod tests {
         // increase GC horizon and compact again
         {
             tline
-                .latest_gc_cutoff_lsn
+                .applied_gc_cutoff_lsn
                 .lock_for_write()
                 .store_and_unlock(Lsn(0x40))
                 .wait()
@@ -8620,8 +8768,8 @@ mod tests {
 
         // Force set disk consistent lsn so we can get the cutoff at `end_lsn`.
         info!(
-            "latest_gc_cutoff_lsn: {}",
-            *timeline.get_latest_gc_cutoff_lsn()
+            "applied_gc_cutoff_lsn: {}",
+            *timeline.get_applied_gc_cutoff_lsn()
         );
         timeline.force_set_disk_consistent_lsn(end_lsn);
@@ -8647,7 +8795,7 @@ mod tests {
 
         // Make lease on a already GC-ed LSN.
         // 0/80 does not have a valid lease + is below latest_gc_cutoff
-        assert!(Lsn(0x80) < *timeline.get_latest_gc_cutoff_lsn());
+        assert!(Lsn(0x80) < *timeline.get_applied_gc_cutoff_lsn());
         timeline
             .init_lsn_lease(Lsn(0x80), timeline.get_lsn_lease_length(), &ctx)
             .expect_err("lease request on GC-ed LSN should fail");
@@ -8838,7 +8986,7 @@ mod tests {
         };
         {
             tline
-                .latest_gc_cutoff_lsn
+                .applied_gc_cutoff_lsn
                 .lock_for_write()
                 .store_and_unlock(Lsn(0x30))
                 .wait()
@@ -8925,7 +9073,7 @@ mod tests {
         // increase GC horizon and compact again
         {
             tline
-                .latest_gc_cutoff_lsn
+                .applied_gc_cutoff_lsn
                 .lock_for_write()
                 .store_and_unlock(Lsn(0x40))
                 .wait()
@@ -9378,7 +9526,7 @@ mod tests {
            .await?;
         {
             tline
-                .latest_gc_cutoff_lsn
+                .applied_gc_cutoff_lsn
                 .lock_for_write()
                 .store_and_unlock(Lsn(0x30))
                 .wait()
@@ -9525,7 +9673,7 @@ mod tests {
         // increase GC horizon and compact again
         {
             tline
-                .latest_gc_cutoff_lsn
+                .applied_gc_cutoff_lsn
                 .lock_for_write()
                 .store_and_unlock(Lsn(0x38))
                 .wait()
@@ -9626,7 +9774,7 @@ mod tests {
             .await?;
         {
             tline
-                .latest_gc_cutoff_lsn
+                .applied_gc_cutoff_lsn
                 .lock_for_write()
                 .store_and_unlock(Lsn(0x30))
                 .wait()
@@ -9877,7 +10025,7 @@ mod tests {
 
         {
             parent_tline
-                .latest_gc_cutoff_lsn
+                .applied_gc_cutoff_lsn
                 .lock_for_write()
                 .store_and_unlock(Lsn(0x10))
                 .wait()
@@ -9897,7 +10045,7 @@ mod tests {
 
         {
             branch_tline
-                .latest_gc_cutoff_lsn
+                .applied_gc_cutoff_lsn
                 .lock_for_write()
                 .store_and_unlock(Lsn(0x50))
                 .wait()
@@ -10253,7 +10401,7 @@ mod tests {
 
         {
             tline
-                .latest_gc_cutoff_lsn
+                .applied_gc_cutoff_lsn
                 .lock_for_write()
                 .store_and_unlock(Lsn(0x30))
                 .wait()
@@ -10638,7 +10786,7 @@ mod tests {
             .await?;
         {
             tline
-                .latest_gc_cutoff_lsn
+                .applied_gc_cutoff_lsn
                 .lock_for_write()
                 .store_and_unlock(Lsn(0x30))
                 .wait()
@@ -10889,7 +11037,7 @@ mod tests {
             .await?;
         {
             tline
-                .latest_gc_cutoff_lsn
+                .applied_gc_cutoff_lsn
                 .lock_for_write()
                 .store_and_unlock(Lsn(0x30))
                 .wait()
diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs
index 972837dc44..c6bcfdf2fb 100644
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -285,6 +285,14 @@ pub struct TenantConfOpt {
     #[serde(default)]
     pub compaction_algorithm: Option<CompactionAlgorithmSettings>,
 
+    #[serde(skip_serializing_if = "Option::is_none")]
+    #[serde(default)]
+    pub compaction_l0_first: Option<bool>,
+
+    #[serde(skip_serializing_if = "Option::is_none")]
+    #[serde(default)]
+    pub compaction_l0_semaphore: Option<bool>,
+
     #[serde(skip_serializing_if = "Option::is_none")]
     #[serde(default)]
     pub l0_flush_delay_threshold: Option<usize>,
@@ -416,6 +424,12 @@ impl TenantConfOpt {
                 .as_ref()
                 .unwrap_or(&global_conf.compaction_algorithm)
                 .clone(),
+            compaction_l0_first: self
+                .compaction_l0_first
+                .unwrap_or(global_conf.compaction_l0_first),
+            compaction_l0_semaphore: self
+                .compaction_l0_semaphore
+                .unwrap_or(global_conf.compaction_l0_semaphore),
             l0_flush_delay_threshold: self
                 .l0_flush_delay_threshold
                 .or(global_conf.l0_flush_delay_threshold),
@@ -466,12 +480,14 @@ impl TenantConfOpt {
                 .lsn_lease_length_for_ts
                 .unwrap_or(global_conf.lsn_lease_length_for_ts),
             timeline_offloading: self
-                .lazy_slru_download
+                .timeline_offloading
                 .unwrap_or(global_conf.timeline_offloading),
             wal_receiver_protocol_override: self
                 .wal_receiver_protocol_override
                 .or(global_conf.wal_receiver_protocol_override),
-            rel_size_v2_enabled: self.rel_size_v2_enabled.or(global_conf.rel_size_v2_enabled),
+            rel_size_v2_enabled: self
+                .rel_size_v2_enabled
+                .unwrap_or(global_conf.rel_size_v2_enabled),
             gc_compaction_enabled: self
                 .gc_compaction_enabled
                 .unwrap_or(global_conf.gc_compaction_enabled),
@@ -493,6 +509,8 @@ impl TenantConfOpt {
             mut compaction_threshold,
             mut compaction_upper_limit,
             mut compaction_algorithm,
+            mut compaction_l0_first,
+            mut compaction_l0_semaphore,
             mut l0_flush_delay_threshold,
             mut l0_flush_stall_threshold,
             mut l0_flush_wait_upload,
@@ -538,6 +556,10 @@ impl TenantConfOpt {
             .compaction_upper_limit
             .apply(&mut compaction_upper_limit);
         patch.compaction_algorithm.apply(&mut compaction_algorithm);
+        patch.compaction_l0_first.apply(&mut compaction_l0_first);
+        patch
+            .compaction_l0_semaphore
+            .apply(&mut compaction_l0_semaphore);
         patch
             .l0_flush_delay_threshold
             .apply(&mut l0_flush_delay_threshold);
@@ -619,6 +641,8 @@ impl TenantConfOpt {
             compaction_threshold,
             compaction_upper_limit,
             compaction_algorithm,
+            compaction_l0_first,
+            compaction_l0_semaphore,
             l0_flush_delay_threshold,
             l0_flush_stall_threshold,
             l0_flush_wait_upload,
@@ -681,6 +705,8 @@ impl From<TenantConfOpt> for models::TenantConfig {
             compaction_period: value.compaction_period.map(humantime),
             compaction_threshold: value.compaction_threshold,
             compaction_upper_limit: value.compaction_upper_limit,
+            compaction_l0_first: value.compaction_l0_first,
+            compaction_l0_semaphore: value.compaction_l0_semaphore,
             l0_flush_delay_threshold: value.l0_flush_delay_threshold,
             l0_flush_stall_threshold: value.l0_flush_stall_threshold,
             l0_flush_wait_upload: value.l0_flush_wait_upload,
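The `patch.<field>.apply(&mut ...)` calls follow the crate's field-patch pattern for partial config updates: each new tenant config field must be threaded through destructuring, apply, and re-assembly, or PATCH requests would silently drop it. Conceptually it is a tri-state per field (a sketch with a hypothetical `FieldPatch`; the real type lives in `pageserver_api`):

/// Hypothetical stand-in for the per-field patch type: leave the field alone,
/// set it, or clear it back to "inherit the default".
enum FieldPatch<T> {
    Keep,
    Set(T),
    Clear,
}

impl<T> FieldPatch<T> {
    fn apply(self, target: &mut Option<T>) {
        match self {
            FieldPatch::Keep => {}
            FieldPatch::Set(v) => *target = Some(v),
            FieldPatch::Clear => *target = None,
        }
    }
}

fn main() {
    let mut compaction_l0_first: Option<bool> = None;
    FieldPatch::Set(true).apply(&mut compaction_l0_first);
    assert_eq!(compaction_l0_first, Some(true));
    FieldPatch::<bool>::Clear.apply(&mut compaction_l0_first);
    assert_eq!(compaction_l0_first, None);
}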
diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs
index d281eb305f..15c6955260 100644
--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -130,7 +130,10 @@ struct TimelineMetadataBodyV2 {
     prev_record_lsn: Option<Lsn>,
     ancestor_timeline: Option<TimelineId>,
     ancestor_lsn: Lsn,
+
+    // The LSN at which GC was last executed. Synonym of [`Timeline::applied_gc_cutoff_lsn`].
     latest_gc_cutoff_lsn: Lsn,
+
     initdb_lsn: Lsn,
     pg_version: u32,
 }
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index dfa89a765c..22ee560dbf 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -2816,8 +2816,8 @@ where
 }
 
 use {
-    crate::tenant::gc_result::GcResult, pageserver_api::models::TimelineGcRequest,
-    utils::http::error::ApiError,
+    crate::tenant::gc_result::GcResult, http_utils::error::ApiError,
+    pageserver_api::models::TimelineGcRequest,
 };
 
 #[cfg(test)]
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index ad6d8dfae8..713efbb9a4 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -517,7 +517,7 @@ impl RemoteTimelineClient {
         if let Ok(queue) = queue_locked.initialized_mut() {
             let blocked_deletions = std::mem::take(&mut queue.blocked_deletions);
             for d in blocked_deletions {
-                if let Err(e) = self.deletion_queue_client.push_layers_sync(
+                if let Err(e) = self.deletion_queue_client.push_layers(
                     self.tenant_shard_id,
                     self.timeline_id,
                     self.generation,
@@ -2151,7 +2151,6 @@ impl RemoteTimelineClient {
                 self.generation,
                 delete.layers.clone(),
             )
-            .await
             .map_err(|e| anyhow::anyhow!(e))
     }
 }
diff --git a/pageserver/src/tenant/secondary/heatmap.rs b/pageserver/src/tenant/secondary/heatmap.rs
index 4a8e66d38a..0fa10ca294 100644
--- a/pageserver/src/tenant/secondary/heatmap.rs
+++ b/pageserver/src/tenant/secondary/heatmap.rs
@@ -1,4 +1,4 @@
-use std::time::SystemTime;
+use std::{collections::HashMap, time::SystemTime};
 
 use crate::tenant::{remote_timeline_client::index::LayerFileMetadata, storage_layer::LayerName};
 
@@ -8,7 +8,7 @@ use serde_with::{serde_as, DisplayFromStr, TimestampSeconds};
 use utils::{generation::Generation, id::TimelineId};
 
 #[derive(Serialize, Deserialize)]
-pub(super) struct HeatMapTenant {
+pub(crate) struct HeatMapTenant {
     /// Generation of the attached location that uploaded the heatmap: this is not required
     /// for correctness, but acts as a hint to secondary locations in order to detect thrashing
     /// in the unlikely event that two attached locations are both uploading conflicting heatmaps.
@@ -25,8 +25,17 @@ pub(super) struct HeatMapTenant {
     pub(super) upload_period_ms: Option<u128>,
 }
 
+impl HeatMapTenant {
+    pub(crate) fn into_timelines_index(self) -> HashMap<TimelineId, HeatMapTimeline> {
+        self.timelines
+            .into_iter()
+            .map(|htl| (htl.timeline_id, htl))
+            .collect()
+    }
+}
+
 #[serde_as]
-#[derive(Serialize, Deserialize)]
+#[derive(Serialize, Deserialize, Clone)]
 pub(crate) struct HeatMapTimeline {
     #[serde_as(as = "DisplayFromStr")]
     pub(crate) timeline_id: TimelineId,
@@ -35,13 +44,13 @@ pub(crate) struct HeatMapTimeline {
 }
 
 #[serde_as]
-#[derive(Serialize, Deserialize)]
+#[derive(Serialize, Deserialize, Clone)]
 pub(crate) struct HeatMapLayer {
     pub(crate) name: LayerName,
     pub(crate) metadata: LayerFileMetadata,
     #[serde_as(as = "TimestampSeconds")]
-    pub(super) access_time: SystemTime,
+    pub(crate) access_time: SystemTime,
 
     // TODO: an actual 'heat' score that would let secondary locations prioritize downloading
     // the hottest layers, rather than trying to simply mirror whatever layers are on-disk on the primary.
 }
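`into_timelines_index` consumes the tenant heatmap so that preload can move each timeline's slice out exactly once, instead of scanning the `timelines` Vec once per timeline. The access pattern it enables, in isolation (a sketch with a placeholder ID type):

use std::collections::HashMap;

// Stand-in for a per-timeline heatmap record; `id` plays the role of TimelineId.
struct TimelineRecord {
    id: u64,
}

fn into_index(timelines: Vec<TimelineRecord>) -> HashMap<u64, TimelineRecord> {
    timelines.into_iter().map(|t| (t.id, t)).collect()
}

fn main() {
    let mut idx = into_index(vec![TimelineRecord { id: 7 }]);
    // `remove` transfers ownership to exactly one timeline's preload.
    assert!(idx.remove(&7).is_some());
    assert!(idx.remove(&7).is_none());
}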
diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs
index 6c3276ea3c..1e84a9d9dc 100644
--- a/pageserver/src/tenant/size.rs
+++ b/pageserver/src/tenant/size.rs
@@ -394,7 +394,7 @@ pub(super) async fn gather_inputs(
             ancestor_lsn,
             last_record: last_record_lsn,
             // this is not used above, because it might not have updated recently enough
-            latest_gc_cutoff: *timeline.get_latest_gc_cutoff_lsn(),
+            latest_gc_cutoff: *timeline.get_applied_gc_cutoff_lsn(),
             next_pitr_cutoff,
             retention_param_cutoff,
             lease_points,
diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs
index 3800852ccc..f9f843ef6b 100644
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -44,7 +44,7 @@ pub(crate) use layer::{EvictionError, Layer, ResidentLayer};
 
 use self::inmemory_layer::InMemoryLayerFileId;
 
-use super::timeline::GetVectoredError;
+use super::timeline::{GetVectoredError, ReadPath};
 use super::PageReconstructError;
 
 pub fn range_overlaps<T>(a: &Range<T>, b: &Range<T>) -> bool
@@ -262,6 +262,8 @@ pub(crate) struct ValuesReconstructState {
     pub(crate) io_concurrency: IoConcurrency,
     num_active_ios: Arc<AtomicUsize>,
+
+    pub(crate) read_path: Option<ReadPath>,
 }
 
 /// The level of IO concurrency to be used on the read path
@@ -609,6 +611,7 @@ impl ValuesReconstructState {
             delta_layers_visited: 0,
             io_concurrency,
             num_active_ios: Arc::new(AtomicUsize::new(0)),
+            read_path: None,
         }
     }
diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index 92313afba7..0bf606cf0a 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -136,6 +136,22 @@ pub(crate) fn local_layer_path(
     }
 }
 
+pub(crate) enum LastEviction {
+    Never,
+    At(std::time::Instant),
+    Evicting,
+}
+
+impl LastEviction {
+    pub(crate) fn happened_after(&self, timepoint: std::time::Instant) -> bool {
+        match self {
+            LastEviction::Never => false,
+            LastEviction::At(evicted_at) => evicted_at > &timepoint,
+            LastEviction::Evicting => true,
+        }
+    }
+}
+
 impl Layer {
     /// Creates a layer value for a file we know to not be resident.
     pub(crate) fn for_evicted(
@@ -353,7 +369,6 @@ impl Layer {
     /// while the guard exists.
     ///
     /// Returns None if the layer is currently evicted or becoming evicted.
-    #[cfg(test)]
     pub(crate) async fn keep_resident(&self) -> Option<ResidentLayer> {
         let downloaded = self.0.inner.get().and_then(|rowe| rowe.get())?;
 
@@ -406,6 +421,17 @@ impl Layer {
         self.0.metadata()
     }
 
+    pub(crate) fn last_evicted_at(&self) -> LastEviction {
+        match self.0.last_evicted_at.try_lock() {
+            Ok(lock) => match *lock {
+                None => LastEviction::Never,
+                Some(at) => LastEviction::At(at),
+            },
+            Err(std::sync::TryLockError::WouldBlock) => LastEviction::Evicting,
+            Err(std::sync::TryLockError::Poisoned(p)) => panic!("Lock poisoned: {p}"),
+        }
+    }
+
     pub(crate) fn get_timeline_id(&self) -> Option<TimelineId> {
         self.0
             .timeline
@@ -530,7 +556,6 @@ impl ResidentOrWantedEvicted {
     /// This is not used on the read path (anything that calls
     /// [`LayerInner::get_or_maybe_download`]) because it was decided that reads always win
     /// evictions, and part of that winning is using [`ResidentOrWantedEvicted::get_and_upgrade`].
-    #[cfg(test)]
     fn get(&self) -> Option<Arc<DownloadedLayer>> {
         match self {
             ResidentOrWantedEvicted::Resident(strong) => Some(strong.clone()),
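Per the updated `last_evicted_at` comment in the next hunk, the point of `happened_after` is heatmap generation: a layer evicted (or currently being evicted) after the previous heatmap was read should not be carried into the new heatmap. A sketch of that check (the call site here is hypothetical; only the enum comes from this diff):

use std::time::{Duration, Instant};

enum LastEviction {
    Never,
    At(Instant),
    Evicting,
}

impl LastEviction {
    fn happened_after(&self, timepoint: Instant) -> bool {
        match self {
            LastEviction::Never => false,
            LastEviction::At(evicted_at) => *evicted_at > timepoint,
            // An eviction in flight is conservatively treated as "just now".
            LastEviction::Evicting => true,
        }
    }
}

fn main() {
    let heatmap_read_at = Instant::now();
    let eviction = LastEviction::At(heatmap_read_at + Duration::from_secs(1));
    // Layer was evicted after the heatmap snapshot: drop it from the new heatmap.
    assert!(eviction.happened_after(heatmap_read_at));
    assert!(!LastEviction::Never.happened_after(heatmap_read_at));
}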
@@ -658,7 +683,9 @@ struct LayerInner {
     /// When the Layer was last evicted but has not been downloaded since.
     ///
-    /// This is used solely for updating metrics. See [`LayerImplMetrics::redownload_after`].
+    /// This is used for skipping evicted layers from the previous heatmap (see
+    /// `[Timeline::generate_heatmap]`) and for updating metrics
+    /// (see [`LayerImplMetrics::redownload_after`]).
     last_evicted_at: std::sync::Mutex<Option<std::time::Instant>>,
 
     #[cfg(test)]
diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs
index 0f10dd7e10..029444e973 100644
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -1,48 +1,64 @@
-//! This module contains functions to serve per-tenant background processes,
-//! such as compaction and GC
+//! This module contains per-tenant background processes, e.g. compaction and GC.
 
-use std::ops::ControlFlow;
-use std::str::FromStr;
-use std::sync::{Arc, Mutex};
+use std::cmp::max;
+use std::future::Future;
+use std::ops::{ControlFlow, RangeInclusive};
+use std::pin::pin;
+use std::sync::Arc;
 use std::time::{Duration, Instant};
 
+use once_cell::sync::Lazy;
+use rand::Rng;
+use scopeguard::defer;
+use tokio::sync::{Semaphore, SemaphorePermit};
+use tokio_util::sync::CancellationToken;
+use tracing::*;
+
 use crate::context::{DownloadBehavior, RequestContext};
-use crate::metrics::{BackgroundLoopSemaphoreMetricsRecorder, TENANT_TASK_EVENTS};
-use crate::task_mgr;
-use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
+use crate::metrics::{self, BackgroundLoopSemaphoreMetricsRecorder, TENANT_TASK_EVENTS};
+use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME, TOKIO_WORKER_THREADS};
 use crate::tenant::throttle::Stats;
 use crate::tenant::timeline::compaction::CompactionOutcome;
 use crate::tenant::timeline::CompactionError;
 use crate::tenant::{Tenant, TenantState};
-use once_cell::sync::Lazy;
-use rand::Rng;
-use tokio_util::sync::CancellationToken;
-use tracing::*;
-use utils::rate_limit::RateLimit;
-use utils::{backoff, completion, pausable_failpoint};
+use pageserver_api::config::tenant_conf_defaults::DEFAULT_COMPACTION_PERIOD;
+use utils::backoff::exponential_backoff_duration;
+use utils::completion::Barrier;
+use utils::pausable_failpoint;
 
-static CONCURRENT_BACKGROUND_TASKS: once_cell::sync::Lazy<tokio::sync::Semaphore> =
-    once_cell::sync::Lazy::new(|| {
-        let total_threads = task_mgr::TOKIO_WORKER_THREADS.get();
-        let permits = usize::max(
-            1,
-            // while a lot of the work is done on spawn_blocking, we still do
-            // repartitioning in the async context. this should give leave us some workers
-            // unblocked to be blocked on other work, hopefully easing any outside visible
-            // effects of restarts.
-            //
-            // 6/8 is a guess; previously we ran with unlimited 8 and more from
-            // spawn_blocking.
-            (total_threads * 3).checked_div(4).unwrap_or(0),
-        );
-        assert_ne!(permits, 0, "we will not be adding in permits later");
-        assert!(
-            permits < total_threads,
-            "need threads avail for shorter work"
-        );
-        tokio::sync::Semaphore::new(permits)
-    });
+/// Semaphore limiting concurrent background tasks (across all tenants).
+///
+/// We use 3/4 Tokio threads, to avoid blocking all threads in case we do any CPU-heavy work.
+static CONCURRENT_BACKGROUND_TASKS: Lazy<Semaphore> = Lazy::new(|| {
+    let total_threads = TOKIO_WORKER_THREADS.get();
+    let permits = max(1, (total_threads * 3).checked_div(4).unwrap_or(0));
+    assert_ne!(permits, 0, "we will not be adding in permits later");
+    assert!(permits < total_threads, "need threads for other work");
+    Semaphore::new(permits)
+});
+
+/// Semaphore limiting concurrent L0 compaction tasks (across all tenants). This is only used if
+/// both `compaction_l0_semaphore` and `compaction_l0_first` are enabled.
+///
+/// This is a separate semaphore from background tasks, because L0 compaction needs to be responsive
+/// to avoid high read amp during heavy write workloads. Regular image/GC compaction is less
+/// important (e.g. due to page images in delta layers) and can wait for other background tasks.
+///
+/// We use 3/4 Tokio threads, to avoid blocking all threads in case we do any CPU-heavy work. Note
+/// that this runs on the same Tokio runtime as `CONCURRENT_BACKGROUND_TASKS`, and shares the same
+/// thread pool.
+static CONCURRENT_L0_COMPACTION_TASKS: Lazy<Semaphore> = Lazy::new(|| {
+    let total_threads = TOKIO_WORKER_THREADS.get();
+    let permits = max(1, (total_threads * 3).checked_div(4).unwrap_or(0));
+    assert_ne!(permits, 0, "we will not be adding in permits later");
+    assert!(permits < total_threads, "need threads for other work");
+    Semaphore::new(permits)
+});
+
+/// Background jobs.
+///
+/// NB: not all of these acquire a CONCURRENT_BACKGROUND_TASKS semaphore permit, only the ones that
+/// do any significant IO or CPU work.
 #[derive(
     Debug,
     PartialEq,
@@ -55,10 +71,13 @@ static CONCURRENT_BACKGROUND_TASKS: once_cell::sync::Lazy<tokio::sync::Semaphore>
 pub(crate) struct BackgroundLoopSemaphorePermit<'a> {
-    _permit: tokio::sync::SemaphorePermit<'static>,
+    _permit: SemaphorePermit<'static>,
     _recorder: BackgroundLoopSemaphoreMetricsRecorder<'a>,
 }
 
-/// Cancellation safe.
-pub(crate) async fn concurrent_background_tasks_rate_limit_permit(
+/// Acquires a semaphore permit, to limit concurrent background jobs.
+pub(crate) async fn acquire_concurrency_permit(
     loop_kind: BackgroundLoopKind,
     _ctx: &RequestContext,
 ) -> BackgroundLoopSemaphorePermit<'static> {
-    // TODO: use a lower threshold and remove the pacer once we resolve some blockage.
-    const WARN_THRESHOLD: Duration = Duration::from_secs(600);
-    static WARN_PACER: Lazy<Mutex<RateLimit>> =
-        Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10))));
-
-    let mut recorder = crate::metrics::BACKGROUND_LOOP_SEMAPHORE.record(loop_kind);
+    let mut recorder = metrics::BACKGROUND_LOOP_SEMAPHORE.record(loop_kind);
 
     if loop_kind == BackgroundLoopKind::InitialLogicalSizeCalculation {
         pausable_failpoint!("initial-size-calculation-permit-pause");
     }
 
     // TODO: assert that we run on BACKGROUND_RUNTIME; requires tokio_unstable Handle::id();
-    let permit = CONCURRENT_BACKGROUND_TASKS
-        .acquire()
-        .await
-        .expect("should never close");
+    let semaphore = match loop_kind {
+        BackgroundLoopKind::L0Compaction => &CONCURRENT_L0_COMPACTION_TASKS,
+        _ => &CONCURRENT_BACKGROUND_TASKS,
+    };
+    let permit = semaphore.acquire().await.expect("should never close");
 
-    let waited = recorder.acquired();
-    if waited >= WARN_THRESHOLD {
-        let waited = waited.as_secs_f64();
-        WARN_PACER
-            .lock()
-            .unwrap()
-            .call(|| warn!("{loop_kind} task waited {waited:.3}s for semaphore permit"));
-    }
+    recorder.acquired();
 
     BackgroundLoopSemaphorePermit {
         _permit: permit,
@@ -108,12 +116,10 @@ pub(crate) async fn concurrent_background_tasks_rate_limit_permit(
     }
 }
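Both statics size their pool the same way; the arithmetic is worth pinning down (a sketch):

// 3/4 of the Tokio worker threads, but always at least one permit.
fn permits(total_threads: usize) -> usize {
    std::cmp::max(1, (total_threads * 3).checked_div(4).unwrap_or(0))
}

fn main() {
    assert_eq!(permits(1), 1); // max(1, 0): tiny runtimes still make progress
    assert_eq!(permits(8), 6); // the "6/8" guess mentioned in the old comment
    assert_eq!(permits(16), 12);
}

Note that the accompanying `assert!(permits < total_threads, ...)` would panic on a single-threaded runtime, since `max(1, 0)` yields one permit; the code effectively assumes at least two worker threads.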
 
-/// Start per tenant background loops: compaction and gc.
-pub fn start_background_loops(
-    tenant: &Arc<Tenant>,
-    background_jobs_can_start: Option<&completion::Barrier>,
-) {
+/// Start per tenant background loops: compaction, GC, and tenant housekeeping.
+pub fn start_background_loops(tenant: &Arc<Tenant>, can_start: Option<&Barrier>) {
     let tenant_shard_id = tenant.tenant_shard_id;
+
     task_mgr::spawn(
         BACKGROUND_RUNTIME.handle(),
         TaskKind::Compaction,
@@ -122,13 +128,15 @@ pub fn start_background_loops(
         &format!("compactor for tenant {tenant_shard_id}"),
         {
             let tenant = Arc::clone(tenant);
-            let background_jobs_can_start = background_jobs_can_start.cloned();
+            let can_start = can_start.cloned();
             async move {
-                let cancel = task_mgr::shutdown_token();
+                let cancel = task_mgr::shutdown_token(); // NB: must be in async context
                 tokio::select! {
-                    _ = cancel.cancelled() => { return Ok(()) },
-                    _ = completion::Barrier::maybe_wait(background_jobs_can_start) => {}
+                    _ = cancel.cancelled() => return Ok(()),
+                    _ = Barrier::maybe_wait(can_start) => {}
                 };
+                TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
+                defer!(TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc());
                 compaction_loop(tenant, cancel)
                     // If you rename this span, change the RUST_LOG env variable in test_runner/performance/test_branch_creation.py
                     .instrument(info_span!("compaction_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()))
                     .await;
                 Ok(())
             }
         },
     );
+
     task_mgr::spawn(
         BACKGROUND_RUNTIME.handle(),
         TaskKind::GarbageCollector,
@@ -145,13 +154,15 @@ pub fn start_background_loops(
         &format!("garbage collector for tenant {tenant_shard_id}"),
         {
             let tenant = Arc::clone(tenant);
-            let background_jobs_can_start = background_jobs_can_start.cloned();
+            let can_start = can_start.cloned();
             async move {
-                let cancel = task_mgr::shutdown_token();
+                let cancel = task_mgr::shutdown_token(); // NB: must be in async context
                 tokio::select! {
-                    _ = cancel.cancelled() => { return Ok(()) },
-                    _ = completion::Barrier::maybe_wait(background_jobs_can_start) => {}
+                    _ = cancel.cancelled() => return Ok(()),
+                    _ = Barrier::maybe_wait(can_start) => {}
                 };
+                TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
+                defer!(TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc());
                 gc_loop(tenant, cancel)
                     .instrument(info_span!("gc_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()))
                     .await;
                 Ok(())
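`sleep_random` is referenced by the rewritten `compaction_loop` below but its definition is not part of this diff; from its call sites (stagger loop start across tenants, abort on cancellation) a plausible shape is the following (a sketch, not the real helper):

use std::time::Duration;
use rand::Rng;
use tokio_util::sync::CancellationToken;

/// Sleep for a uniformly random duration in [0, max), returning Err if cancelled
/// first. Staggers periodic loops so all tenants don't fire in lockstep.
async fn sleep_random(max: Duration, cancel: &CancellationToken) -> Result<(), ()> {
    let delay = max.mul_f64(rand::thread_rng().gen_range(0.0..1.0));
    tokio::select! {
        _ = tokio::time::sleep(delay) => Ok(()),
        _ = cancel.cancelled() => Err(()),
    }
}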
{ - _ = cancel.cancelled() => { return Ok(()) }, - _ = completion::Barrier::maybe_wait(background_jobs_can_start) => {} + _ = cancel.cancelled() => return Ok(()), + _ = Barrier::maybe_wait(can_start) => {} }; - ingest_housekeeping_loop(tenant, cancel) - .instrument(info_span!("ingest_housekeeping_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug())) + TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); + defer!(TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc()); + tenant_housekeeping_loop(tenant, cancel) + .instrument(info_span!("tenant_housekeeping_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug())) .await; Ok(()) } @@ -184,372 +197,292 @@ pub fn start_background_loops( ); } -/// -/// Compaction task's main loop -/// +/// Compaction task's main loop. async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { + const BASE_BACKOFF_SECS: f64 = 1.0; const MAX_BACKOFF_SECS: f64 = 300.0; - // How many errors we have seen consequtively - let mut error_run_count = 0; + const RECHECK_CONFIG_INTERVAL: Duration = Duration::from_secs(10); - TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); - async { - let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download); - let mut first = true; - loop { + let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download); + let mut period = tenant.get_compaction_period(); + let mut error_run = 0; // consecutive errors + + // Stagger the compaction loop across tenants. + if wait_for_active_tenant(&tenant, &cancel).await.is_break() { + return; + } + if sleep_random(period, &cancel).await.is_err() { + return; + } + + loop { + // Recheck that we're still active. + if wait_for_active_tenant(&tenant, &cancel).await.is_break() { + return; + } + + // Refresh the period. If compaction is disabled, check again in a bit. + period = tenant.get_compaction_period(); + if period == Duration::ZERO { + #[cfg(not(feature = "testing"))] + info!("automatic compaction is disabled"); tokio::select! { - _ = cancel.cancelled() => { - return; - }, - tenant_wait_result = wait_for_active_tenant(&tenant) => match tenant_wait_result { - ControlFlow::Break(()) => return, - ControlFlow::Continue(()) => (), - }, + _ = tokio::time::sleep(RECHECK_CONFIG_INTERVAL) => {}, + _ = cancel.cancelled() => return, } + continue; + } - let period = tenant.get_compaction_period(); + // Wait for the next compaction run. + let backoff = exponential_backoff_duration(error_run, BASE_BACKOFF_SECS, MAX_BACKOFF_SECS); + tokio::select! { + _ = tokio::time::sleep(backoff), if error_run > 0 => {}, + _ = tokio::time::sleep(period), if error_run == 0 => {}, + _ = tenant.l0_compaction_trigger.notified(), if error_run == 0 => {}, + _ = cancel.cancelled() => return, + } - // TODO: we shouldn't need to await to find tenant and this could be moved outside of - // loop, #3501. There are also additional "allowed_errors" in tests. - if first { - first = false; - if random_init_delay(period, &cancel).await.is_err() { - break; + // Run compaction. + let iteration = Iteration { + started_at: Instant::now(), + period, + kind: BackgroundLoopKind::Compaction, + }; + let IterationResult { output, elapsed } = iteration + .run(tenant.compaction_iteration(&cancel, &ctx)) + .await; + + match output { + Ok(outcome) => { + error_run = 0; + // If there's more compaction work, L0 or not, schedule an immediate run. 
+ match outcome { + CompactionOutcome::Done => {} + CompactionOutcome::Skipped => {} + CompactionOutcome::YieldForL0 => tenant.l0_compaction_trigger.notify_one(), + CompactionOutcome::Pending => tenant.l0_compaction_trigger.notify_one(), } } - let sleep_duration; - if period == Duration::ZERO { - #[cfg(not(feature = "testing"))] - info!("automatic compaction is disabled"); - // check again in 10 seconds, in case it's been enabled again. - sleep_duration = Duration::from_secs(10) - } else { - let iteration = Iteration { - started_at: Instant::now(), - period, - kind: BackgroundLoopKind::Compaction, - }; - - // Run compaction - let IterationResult { output, elapsed } = iteration - .run(tenant.compaction_iteration(&cancel, &ctx)) - .await; - match output { - Ok(outcome) => { - error_run_count = 0; - // schedule the next compaction immediately in case there is a pending compaction task - sleep_duration = if let CompactionOutcome::Pending = outcome { - Duration::from_secs(1) - } else { - period - }; - } - Err(e) => { - let wait_duration = backoff::exponential_backoff_duration_seconds( - error_run_count + 1, - 1.0, - MAX_BACKOFF_SECS, - ); - error_run_count += 1; - let wait_duration = Duration::from_secs_f64(wait_duration); - log_compaction_error( - &e, - error_run_count, - &wait_duration, - cancel.is_cancelled(), - ); - sleep_duration = wait_duration; - } - } - - // the duration is recorded by performance tests by enabling debug in this function - tracing::debug!( - elapsed_ms = elapsed.as_millis(), - "compaction iteration complete" - ); - }; - - // Perhaps we did no work and the walredo process has been idle for some time: - // give it a chance to shut down to avoid leaving walredo process running indefinitely. - // TODO: move this to a separate task (housekeeping loop) that isn't affected by the back-off, - // so we get some upper bound guarantee on when walredo quiesce / this throttling reporting here happens. - if let Some(walredo_mgr) = &tenant.walredo_mgr { - walredo_mgr.maybe_quiesce(period * 10); - } - - // Sleep - if tokio::time::timeout(sleep_duration, cancel.cancelled()) - .await - .is_ok() - { - break; + Err(err) => { + error_run += 1; + let backoff = + exponential_backoff_duration(error_run, BASE_BACKOFF_SECS, MAX_BACKOFF_SECS); + log_compaction_error(&err, error_run, backoff, cancel.is_cancelled()); + continue; } } + + // NB: this log entry is recorded by performance tests. 
+        debug!(
+            elapsed_ms = elapsed.as_millis(),
+            "compaction iteration complete"
+        );
     }
-    .await;
-    TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc();
 }
 
 fn log_compaction_error(
-    e: &CompactionError,
-    error_run_count: u32,
-    sleep_duration: &std::time::Duration,
+    err: &CompactionError,
+    error_count: u32,
+    sleep_duration: Duration,
     task_cancelled: bool,
 ) {
     use crate::tenant::upload_queue::NotInitialized;
     use crate::tenant::PageReconstructError;
     use CompactionError::*;
 
-    enum LooksLike {
-        Info,
-        Error,
-    }
+    let level = match err {
+        ShuttingDown => return,
+        Offload(_) => Level::ERROR,
+        _ if task_cancelled => Level::INFO,
+        Other(err) => {
+            let root_cause = err.root_cause();
 
-    let decision = match e {
-        ShuttingDown => None,
-        Offload(_) => Some(LooksLike::Error),
-        _ if task_cancelled => Some(LooksLike::Info),
-        Other(e) => {
-            let root_cause = e.root_cause();
-
-            let is_stopping = {
-                let upload_queue = root_cause
-                    .downcast_ref::<NotInitialized>()
-                    .is_some_and(|e| e.is_stopping());
-
-                let timeline = root_cause
-                    .downcast_ref::<PageReconstructError>()
-                    .is_some_and(|e| e.is_stopping());
-
-                upload_queue || timeline
-            };
+            let upload_queue = root_cause
+                .downcast_ref::<NotInitialized>()
+                .is_some_and(|e| e.is_stopping());
+            let timeline = root_cause
+                .downcast_ref::<PageReconstructError>()
+                .is_some_and(|e| e.is_stopping());
+            let is_stopping = upload_queue || timeline;
 
             if is_stopping {
-                Some(LooksLike::Info)
+                Level::INFO
             } else {
-                Some(LooksLike::Error)
+                Level::ERROR
             }
         }
     };
 
-    match decision {
-        Some(LooksLike::Info) => info!(
-            "Compaction failed {error_run_count} times, retrying in {sleep_duration:?}: {e:#}",
-        ),
-        Some(LooksLike::Error) => error!(
-            "Compaction failed {error_run_count} times, retrying in {sleep_duration:?}: {e:?}",
-        ),
-        None => {}
+    match level {
+        Level::ERROR => {
+            error!("Compaction failed {error_count} times, retrying in {sleep_duration:?}: {err:#}")
+        }
+        Level::INFO => {
+            info!("Compaction failed {error_count} times, retrying in {sleep_duration:?}: {err:#}")
+        }
+        level => unimplemented!("unexpected level {level:?}"),
     }
 }
 
-///
-/// GC task's main loop
-///
+/// GC task's main loop.
 async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
     const MAX_BACKOFF_SECS: f64 = 300.0;
-    // How many errors we have seen consequtively
-    let mut error_run_count = 0;
+    let mut error_run = 0; // consecutive errors
 
-    TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
-    async {
-        // GC might require downloading, to find the cutoff LSN that corresponds to the
-        // cutoff specified as time.
-        let ctx =
-            RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
+    // GC might require downloading, to find the cutoff LSN that corresponds to the
+    // cutoff specified as time.
+    let ctx = RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
+    let mut first = true;
 
-        let mut first = true;
-        loop {
-            tokio::select!
{ - _ = cancel.cancelled() => { - return; - }, - tenant_wait_result = wait_for_active_tenant(&tenant) => match tenant_wait_result { - ControlFlow::Break(()) => return, - ControlFlow::Continue(()) => (), - }, - } + loop { + if wait_for_active_tenant(&tenant, &cancel).await.is_break() { + return; + } - let period = tenant.get_gc_period(); + let period = tenant.get_gc_period(); - if first { - first = false; - - let delays = async { - random_init_delay(period, &cancel).await?; - Ok::<_, Cancelled>(()) - }; - - if delays.await.is_err() { - break; - } - } - - let gc_horizon = tenant.get_gc_horizon(); - let sleep_duration; - if period == Duration::ZERO || gc_horizon == 0 { - #[cfg(not(feature = "testing"))] - info!("automatic GC is disabled"); - // check again in 10 seconds, in case it's been enabled again. - sleep_duration = Duration::from_secs(10); - } else { - let iteration = Iteration { - started_at: Instant::now(), - period, - kind: BackgroundLoopKind::Gc, - }; - // Run gc - let IterationResult { output, elapsed: _ } = - iteration.run(tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &cancel, &ctx)) - .await; - match output { - Ok(_) => { - error_run_count = 0; - sleep_duration = period; - } - Err(crate::tenant::GcError::TenantCancelled) => { - return; - } - Err(e) => { - let wait_duration = backoff::exponential_backoff_duration_seconds( - error_run_count + 1, - 1.0, - MAX_BACKOFF_SECS, - ); - error_run_count += 1; - let wait_duration = Duration::from_secs_f64(wait_duration); - - if matches!(e, crate::tenant::GcError::TimelineCancelled) { - // Timeline was cancelled during gc. We might either be in an event - // that affects the entire tenant (tenant deletion, pageserver shutdown), - // or in one that affects the timeline only (timeline deletion). - // Therefore, don't exit the loop. - info!("Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}"); - } else { - error!("Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}"); - } - - sleep_duration = wait_duration; - } - } - }; - - if tokio::time::timeout(sleep_duration, cancel.cancelled()) - .await - .is_ok() - { + if first { + first = false; + if sleep_random(period, &cancel).await.is_err() { break; } } - } - .await; - TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc(); -} - -async fn ingest_housekeeping_loop(tenant: Arc, cancel: CancellationToken) { - TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); - async { - let mut last_throttle_flag_reset_at = Instant::now(); - loop { - tokio::select! { - _ = cancel.cancelled() => { - return; - }, - tenant_wait_result = wait_for_active_tenant(&tenant) => match tenant_wait_result { - ControlFlow::Break(()) => return, - ControlFlow::Continue(()) => (), - }, - } - - // We run ingest housekeeping with the same frequency as compaction: it is not worth - // having a distinct setting. But we don't run it in the same task, because compaction - // blocks on acquiring the background job semaphore. 
- let period = tenant.get_compaction_period(); - - // If compaction period is set to zero (to disable it), then we will use a reasonable default - let period = if period == Duration::ZERO { - humantime::Duration::from_str( - pageserver_api::config::tenant_conf_defaults::DEFAULT_COMPACTION_PERIOD, - ) - .unwrap() - .into() - } else { - period - }; - - // Jitter the period by +/- 5% - let period = - rand::thread_rng().gen_range((period * (95)) / 100..(period * (105)) / 100); - - // Always sleep first: we do not need to do ingest housekeeping early in the lifetime of - // a tenant, since it won't have started writing any ephemeral files yet. - if tokio::time::timeout(period, cancel.cancelled()) - .await - .is_ok() - { - break; - } + let gc_horizon = tenant.get_gc_horizon(); + let sleep_duration; + if period == Duration::ZERO || gc_horizon == 0 { + #[cfg(not(feature = "testing"))] + info!("automatic GC is disabled"); + // check again in 10 seconds, in case it's been enabled again. + sleep_duration = Duration::from_secs(10); + } else { let iteration = Iteration { started_at: Instant::now(), period, - kind: BackgroundLoopKind::IngestHouseKeeping, + kind: BackgroundLoopKind::Gc, }; - iteration.run(tenant.ingest_housekeeping()).await; - - // TODO: rename the background loop kind to something more generic, like, tenant housekeeping. - // Or just spawn another background loop for this throttle, it's not like it's super costly. - info_span!(parent: None, "pagestream_throttle", tenant_id=%tenant.tenant_shard_id, shard_id=%tenant.tenant_shard_id.shard_slug()).in_scope(|| { - let now = Instant::now(); - let prev = std::mem::replace(&mut last_throttle_flag_reset_at, now); - let Stats { count_accounted_start, count_accounted_finish, count_throttled, sum_throttled_usecs} = tenant.pagestream_throttle.reset_stats(); - if count_throttled == 0 { + // Run gc + let IterationResult { output, elapsed: _ } = iteration + .run(tenant.gc_iteration( + None, + gc_horizon, + tenant.get_pitr_interval(), + &cancel, + &ctx, + )) + .await; + match output { + Ok(_) => { + error_run = 0; + sleep_duration = period; + } + Err(crate::tenant::GcError::TenantCancelled) => { return; } - let allowed_rps = tenant.pagestream_throttle.steady_rps(); - let delta = now - prev; - info!( - n_seconds=%format_args!("{:.3}", delta.as_secs_f64()), - count_accounted = count_accounted_finish, // don't break existing log scraping - count_throttled, - sum_throttled_usecs, - count_accounted_start, // log after pre-existing fields to not break existing log scraping - allowed_rps=%format_args!("{allowed_rps:.0}"), - "shard was throttled in the last n_seconds" - ); - }); - } - } - .await; - TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc(); -} + Err(e) => { + error_run += 1; + let wait_duration = + exponential_backoff_duration(error_run, 1.0, MAX_BACKOFF_SECS); -async fn wait_for_active_tenant(tenant: &Arc) -> ControlFlow<()> { - // if the tenant has a proper status already, no need to wait for anything - if tenant.current_state() == TenantState::Active { - ControlFlow::Continue(()) - } else { - let mut tenant_state_updates = tenant.subscribe_for_state_updates(); - loop { - match tenant_state_updates.changed().await { - Ok(()) => { - let new_state = &*tenant_state_updates.borrow(); - match new_state { - TenantState::Active => { - debug!("Tenant state changed to active, continuing the task loop"); - return ControlFlow::Continue(()); - } - state => { - debug!("Not running the task loop, tenant is not active: {state:?}"); - continue; - } + if 
matches!(e, crate::tenant::GcError::TimelineCancelled) { + // Timeline was cancelled during gc. We might either be in an event + // that affects the entire tenant (tenant deletion, pageserver shutdown), + // or in one that affects the timeline only (timeline deletion). + // Therefore, don't exit the loop. + info!("Gc failed {error_run} times, retrying in {wait_duration:?}: {e:?}"); + } else { + error!("Gc failed {error_run} times, retrying in {wait_duration:?}: {e:?}"); } - } - Err(_sender_dropped_error) => { - return ControlFlow::Break(()); + + sleep_duration = wait_duration; } } + }; + + if tokio::time::timeout(sleep_duration, cancel.cancelled()) + .await + .is_ok() + { + break; + } + } +} + +/// Tenant housekeeping's main loop. +async fn tenant_housekeeping_loop(tenant: Arc, cancel: CancellationToken) { + let mut last_throttle_flag_reset_at = Instant::now(); + loop { + if wait_for_active_tenant(&tenant, &cancel).await.is_break() { + return; + } + + // Use the same period as compaction; it's not worth a separate setting. But if it's set to + // zero (to disable compaction), then use a reasonable default. Jitter it by 5%. + let period = match tenant.get_compaction_period() { + Duration::ZERO => humantime::parse_duration(DEFAULT_COMPACTION_PERIOD).unwrap(), + period => period, + }; + + let Ok(period) = sleep_jitter(period, period * 5 / 100, &cancel).await else { + break; + }; + + // Do tenant housekeeping. + let iteration = Iteration { + started_at: Instant::now(), + period, + kind: BackgroundLoopKind::TenantHouseKeeping, + }; + iteration.run(tenant.housekeeping()).await; + + // Log any getpage throttling. + info_span!(parent: None, "pagestream_throttle", tenant_id=%tenant.tenant_shard_id, shard_id=%tenant.tenant_shard_id.shard_slug()).in_scope(|| { + let now = Instant::now(); + let prev = std::mem::replace(&mut last_throttle_flag_reset_at, now); + let Stats { count_accounted_start, count_accounted_finish, count_throttled, sum_throttled_usecs} = tenant.pagestream_throttle.reset_stats(); + if count_throttled == 0 { + return; + } + let allowed_rps = tenant.pagestream_throttle.steady_rps(); + let delta = now - prev; + info!( + n_seconds=%format_args!("{:.3}", delta.as_secs_f64()), + count_accounted = count_accounted_finish, // don't break existing log scraping + count_throttled, + sum_throttled_usecs, + count_accounted_start, // log after pre-existing fields to not break existing log scraping + allowed_rps=%format_args!("{allowed_rps:.0}"), + "shard was throttled in the last n_seconds" + ); + }); + } +} + +/// Waits until the tenant becomes active, or returns `ControlFlow::Break()` to shut down. +async fn wait_for_active_tenant( + tenant: &Arc, + cancel: &CancellationToken, +) -> ControlFlow<()> { + if tenant.current_state() == TenantState::Active { + return ControlFlow::Continue(()); + } + + let mut update_rx = tenant.subscribe_for_state_updates(); + loop { + tokio::select! { + _ = cancel.cancelled() => return ControlFlow::Break(()), + result = update_rx.changed() => if result.is_err() { + return ControlFlow::Break(()); + } + } + + match &*update_rx.borrow() { + TenantState::Active => { + debug!("Tenant state changed to active, continuing the task loop"); + return ControlFlow::Continue(()); + } + state => debug!("Not running the task loop, tenant is not active: {state:?}"), } } } @@ -558,26 +491,41 @@ async fn wait_for_active_tenant(tenant: &Arc) -> ControlFlow<()> { #[error("cancelled")] pub(crate) struct Cancelled; -/// Provide a random delay for background task initialization. 
+/// Sleeps for a random interval up to the given max value.
 ///
 /// This delay prevents a thundering herd of background tasks and will likely keep them running on
 /// different periods for more stable load.
-pub(crate) async fn random_init_delay(
-    period: Duration,
+pub(crate) async fn sleep_random(
+    max: Duration,
     cancel: &CancellationToken,
-) -> Result<(), Cancelled> {
-    if period == Duration::ZERO {
-        return Ok(());
-    }
+) -> Result<Duration, Cancelled> {
+    sleep_random_range(Duration::ZERO..=max, cancel).await
+}
 
-    let d = {
-        let mut rng = rand::thread_rng();
-        rng.gen_range(Duration::ZERO..=period)
-    };
-    match tokio::time::timeout(d, cancel.cancelled()).await {
-        Ok(_) => Err(Cancelled),
-        Err(_) => Ok(()),
+/// Sleeps for a random interval in the given range. Returns the duration.
+pub(crate) async fn sleep_random_range(
+    interval: RangeInclusive<Duration>,
+    cancel: &CancellationToken,
+) -> Result<Duration, Cancelled> {
+    let delay = rand::thread_rng().gen_range(interval);
+    if delay == Duration::ZERO {
+        return Ok(delay);
     }
+    tokio::select! {
+        _ = cancel.cancelled() => Err(Cancelled),
+        _ = tokio::time::sleep(delay) => Ok(delay),
+    }
+}
+
+/// Sleeps for an interval with a random jitter.
+pub(crate) async fn sleep_jitter(
+    duration: Duration,
+    jitter: Duration,
+    cancel: &CancellationToken,
+) -> Result<Duration, Cancelled> {
+    let from = duration.saturating_sub(jitter);
+    let to = duration.saturating_add(jitter);
+    sleep_random_range(from..=to, cancel).await
 }
 
 struct Iteration {
@@ -593,42 +541,25 @@ struct IterationResult<O> {
 
 impl Iteration {
     #[instrument(skip_all)]
-    pub(crate) async fn run<Fut, O>(self, fut: Fut) -> IterationResult<O>
-    where
-        Fut: std::future::Future<Output = O>,
-    {
-        let Self {
-            started_at,
-            period,
-            kind,
-        } = self;
-
-        let mut fut = std::pin::pin!(fut);
+    pub(crate) async fn run<F: Future<Output = O>, O>(self, fut: F) -> IterationResult<O> {
+        let mut fut = pin!(fut);
 
         // Wrap `fut` into a future that logs a message every `period` so that we get a
         // very obvious breadcrumb in the logs _while_ a slow iteration is happening.
-        let liveness_logger = async move {
-            loop {
-                match tokio::time::timeout(period, &mut fut).await {
-                    Ok(x) => return x,
-                    Err(_) => {
-                        // info level as per the same rationale why warn_when_period_overrun is info
-                        // => https://github.com/neondatabase/neon/pull/5724
-                        info!("still running");
-                    }
-                }
+        let output = loop {
+            match tokio::time::timeout(self.period, &mut fut).await {
+                Ok(r) => break r,
+                Err(_) => info!("still running"),
             }
         };
-
-        let output = liveness_logger.await;
-
-        let elapsed = started_at.elapsed();
-        warn_when_period_overrun(elapsed, period, kind);
+        let elapsed = self.started_at.elapsed();
+        warn_when_period_overrun(elapsed, self.period, self.kind);
 
         IterationResult { output, elapsed }
     }
 }
-/// Attention: the `task` and `period` beocme labels of a pageserver-wide prometheus metric.
+
+// NB: the `task` and `period` are used for metrics labels.
pub(crate) fn warn_when_period_overrun( elapsed: Duration, period: Duration, @@ -645,7 +576,7 @@ pub(crate) fn warn_when_period_overrun( ?task, "task iteration took longer than the configured period" ); - crate::metrics::BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT + metrics::BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT .with_label_values(&[task.into(), &format!("{}", period.as_secs())]) .inc(); } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 908356c459..94b4abb7e9 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -4,6 +4,7 @@ pub mod delete; pub(crate) mod detach_ancestor; mod eviction_task; pub(crate) mod handle; +mod heatmap_layers_downloader; pub(crate) mod import_pgdata; mod init; pub mod layer_manager; @@ -45,11 +46,9 @@ use rand::Rng; use remote_storage::DownloadError; use serde_with::serde_as; use storage_broker::BrokerClientChannel; +use tokio::runtime::Handle; use tokio::sync::mpsc::Sender; -use tokio::{ - runtime::Handle, - sync::{oneshot, watch}, -}; +use tokio::sync::{oneshot, watch, Notify}; use tokio_util::sync::CancellationToken; use tracing::*; use utils::critical; @@ -119,7 +118,7 @@ use pageserver_api::config::tenant_conf_defaults::DEFAULT_PITR_INTERVAL; use crate::config::PageServerConf; use crate::keyspace::{KeyPartitioning, KeySpace}; use crate::metrics::{TimelineMetrics, DELTAS_PER_READ_GLOBAL, LAYERS_PER_READ_GLOBAL}; -use crate::pgdatadir_mapping::CalculateLogicalSizeError; +use crate::pgdatadir_mapping::{CalculateLogicalSizeError, MetricsUpdate}; use crate::tenant::config::TenantConfOpt; use pageserver_api::reltag::RelTag; use pageserver_api::shard::ShardIndex; @@ -152,16 +151,15 @@ use super::{ config::TenantConf, storage_layer::LayerVisibilityHint, upload_queue::NotInitialized, MaybeOffloaded, }; -use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf}; +use super::{ + debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf, HeatMapTimeline, +}; use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe}; use super::{ remote_timeline_client::RemoteTimelineClient, remote_timeline_client::WaitCompletionError, storage_layer::ReadableLayer, }; -use super::{ - secondary::heatmap::{HeatMapLayer, HeatMapTimeline}, - GcError, -}; +use super::{secondary::heatmap::HeatMapLayer, GcError}; #[cfg(test)] use pageserver_api::value::Value; @@ -227,6 +225,7 @@ pub struct TimelineResources { pub remote_client: RemoteTimelineClient, pub pagestream_throttle: Arc, pub pagestream_throttle_metrics: Arc, + pub l0_compaction_trigger: Arc, pub l0_flush_global_state: l0_flush::L0FlushGlobalState, } @@ -329,6 +328,7 @@ pub struct Timeline { // in `crate::page_service` writes these metrics. pub(crate) query_metrics: crate::metrics::SmgrQueryTimePerTimeline, + directory_metrics_inited: [AtomicBool; DirectoryKind::KINDS_NUM], directory_metrics: [AtomicU64; DirectoryKind::KINDS_NUM], /// Ensures layers aren't frozen by checkpointer between @@ -353,8 +353,11 @@ pub struct Timeline { /// to be notified when layer flushing has finished, subscribe to the layer_flush_done channel layer_flush_done_tx: tokio::sync::watch::Sender<(u64, Result<(), FlushLayerError>)>, - // Needed to ensure that we can't create a branch at a point that was already garbage collected - pub latest_gc_cutoff_lsn: Rcu, + // The LSN at which we have executed GC: whereas [`Self::gc_info`] records the LSN at which + // we _intend_ to GC (i.e. 
the PITR cutoff), this LSN records where we actually last did it. + // Because PITR interval is mutable, it's possible for this LSN to be earlier or later than + // the planned GC cutoff. + pub applied_gc_cutoff_lsn: Rcu, pub(crate) gc_compaction_layer_update_lock: tokio::sync::RwLock<()>, @@ -426,6 +429,9 @@ pub struct Timeline { /// If true, the last compaction failed. compaction_failed: AtomicBool, + /// Notifies the tenant compaction loop that there is pending L0 compaction work. + l0_compaction_trigger: Arc, + /// Make sure we only have one running gc at a time. /// /// Must only be taken in two places: @@ -460,6 +466,20 @@ pub struct Timeline { /// If Some, collects GetPage metadata for an ongoing PageTrace. pub(crate) page_trace: ArcSwapOption>, + + previous_heatmap: ArcSwapOption, + + /// May host a background Tokio task which downloads all the layers from the current + /// heatmap on demand. + heatmap_layers_downloader: Mutex>, +} + +pub(crate) enum PreviousHeatmap { + Active { + heatmap: HeatMapTimeline, + read_at: std::time::Instant, + }, + Obsolete, } pub type TimelineDeleteProgress = Arc>; @@ -626,6 +646,71 @@ impl From for GetVectoredError { } } +/// A layer identifier when used in the [`ReadPath`] structure. This enum is for observability purposes +/// only and not used by the "real read path". +pub enum ReadPathLayerId { + PersistentLayer(PersistentLayerKey), + InMemoryLayer(Range), +} + +impl std::fmt::Display for ReadPathLayerId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + ReadPathLayerId::PersistentLayer(key) => write!(f, "{}", key), + ReadPathLayerId::InMemoryLayer(range) => { + write!(f, "in-mem {}..{}", range.start, range.end) + } + } + } +} +pub struct ReadPath { + keyspace: KeySpace, + lsn: Lsn, + path: Vec<(ReadPathLayerId, KeySpace, Range)>, +} + +impl ReadPath { + pub fn new(keyspace: KeySpace, lsn: Lsn) -> Self { + Self { + keyspace, + lsn, + path: Vec::new(), + } + } + + pub fn record_layer_visit( + &mut self, + layer_to_read: &ReadableLayer, + keyspace_to_read: &KeySpace, + lsn_range: &Range, + ) { + let id = match layer_to_read { + ReadableLayer::PersistentLayer(layer) => { + ReadPathLayerId::PersistentLayer(layer.layer_desc().key()) + } + ReadableLayer::InMemoryLayer(layer) => { + ReadPathLayerId::InMemoryLayer(layer.get_lsn_range()) + } + }; + self.path + .push((id, keyspace_to_read.clone(), lsn_range.clone())); + } +} + +impl std::fmt::Display for ReadPath { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + writeln!(f, "Read path for {} at lsn {}:", self.keyspace, self.lsn)?; + for (idx, (layer_id, keyspace, lsn_range)) in self.path.iter().enumerate() { + writeln!( + f, + "{}: {} {}..{} {}", + idx, layer_id, lsn_range.start, lsn_range.end, keyspace + )?; + } + Ok(()) + } +} + #[derive(thiserror::Error)] pub struct MissingKeyError { key: Key, @@ -633,6 +718,8 @@ pub struct MissingKeyError { cont_lsn: Lsn, request_lsn: Lsn, ancestor_lsn: Option, + /// Debug information about the read path if there's an error + read_path: Option, backtrace: Option, } @@ -649,10 +736,15 @@ impl std::fmt::Display for MissingKeyError { "could not find data for key {} (shard {:?}) at LSN {}, request LSN {}", self.key, self.shard, self.cont_lsn, self.request_lsn )?; + if let Some(ref ancestor_lsn) = self.ancestor_lsn { write!(f, ", ancestor {}", ancestor_lsn)?; } + if let Some(ref read_path) = self.read_path { + write!(f, "\n{}", read_path)?; + } + if let Some(ref backtrace) = self.backtrace { write!(f, "\n{}", 
backtrace)?; } @@ -802,8 +894,12 @@ pub(crate) enum CompactFlags { ForceRepartition, ForceImageLayerCreation, ForceL0Compaction, + OnlyL0Compaction, EnhancedGcBottomMostCompaction, DryRun, + /// Disables compaction yielding e.g. due to high L0 count. This is set e.g. when requesting + /// compaction via HTTP API. + NoYield, } #[serde_with::serde_as] @@ -999,9 +1095,15 @@ impl Timeline { (history, gc_info.within_ancestor_pitr) } - /// Lock and get timeline's GC cutoff - pub(crate) fn get_latest_gc_cutoff_lsn(&self) -> RcuReadGuard { - self.latest_gc_cutoff_lsn.read() + /// Read timeline's GC cutoff: this is the LSN at which GC has started to happen + pub(crate) fn get_applied_gc_cutoff_lsn(&self) -> RcuReadGuard { + self.applied_gc_cutoff_lsn.read() + } + + /// Read timeline's planned GC cutoff: this is the logical end of history that users + /// are allowed to read (based on configured PITR), even if physically we have more history. + pub(crate) fn get_gc_cutoff_lsn(&self) -> Lsn { + self.gc_info.read().unwrap().cutoffs.time } /// Look up given page version. @@ -1069,6 +1171,7 @@ impl Timeline { request_lsn: lsn, ancestor_lsn: None, backtrace: None, + read_path: None, })), } } @@ -1195,6 +1298,13 @@ impl Timeline { reconstruct_state: &mut ValuesReconstructState, ctx: &RequestContext, ) -> Result>, GetVectoredError> { + let read_path = if self.conf.enable_read_path_debugging { + Some(ReadPath::new(keyspace.clone(), lsn)) + } else { + None + }; + reconstruct_state.read_path = read_path; + let traversal_res: Result<(), _> = self .get_vectored_reconstruct_data(keyspace.clone(), lsn, reconstruct_state, ctx) .await; @@ -1471,6 +1581,7 @@ impl Timeline { let lsn = xlog_utils::normalize_lsn(lsn, WAL_SEGMENT_SIZE); let mut gc_info = self.gc_info.write().unwrap(); + let planned_cutoff = gc_info.min_cutoff(); let valid_until = SystemTime::now() + length; @@ -1491,7 +1602,7 @@ impl Timeline { existing_lease.clone() } Entry::Vacant(vacant) => { - // Reject already GC-ed LSN (lsn < latest_gc_cutoff) if we are in AttachedSingle and + // Reject already GC-ed LSN if we are in AttachedSingle and // not blocked by the lsn lease deadline. let validate = { let conf = self.tenant_conf.load(); @@ -1500,9 +1611,12 @@ impl Timeline { }; if init || validate { - let latest_gc_cutoff_lsn = self.get_latest_gc_cutoff_lsn(); + let latest_gc_cutoff_lsn = self.get_applied_gc_cutoff_lsn(); if lsn < *latest_gc_cutoff_lsn { - bail!("tried to request a page version that was garbage collected. requested at {} gc cutoff {}", lsn, *latest_gc_cutoff_lsn); + bail!("tried to request an lsn lease for an lsn below the latest gc cutoff. requested at {} gc cutoff {}", lsn, *latest_gc_cutoff_lsn); + } + if lsn < planned_cutoff { + bail!("tried to request an lsn lease for an lsn below the planned gc cutoff. requested at {} planned gc cutoff {}", lsn, planned_cutoff); } } @@ -1704,35 +1818,48 @@ impl Timeline { .await } - /// Outermost timeline compaction operation; downloads needed layers. Returns whether we have pending - /// compaction tasks. + /// Outermost timeline compaction operation; downloads needed layers. + /// + /// NB: the cancellation token is usually from a background task, but can also come from a + /// request task. pub(crate) async fn compact_with_options( self: &Arc, cancel: &CancellationToken, options: CompactOptions, ctx: &RequestContext, ) -> Result { - // most likely the cancellation token is from background task, but in tests it could be the - // request task as well. 
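The hunk below turns permit acquisition into a three-way race: the permit itself, an L0-compaction trigger, and cancellation. Reduced to a standalone sketch (illustrative names, assuming `tokio` and `tokio-util` as dependencies; this is not the patch's code):

use std::sync::Arc;

use tokio::sync::{Notify, OwnedSemaphorePermit, Semaphore};
use tokio_util::sync::CancellationToken;

enum Acquire {
    Permit(OwnedSemaphorePermit),
    YieldForL0,
    Cancelled,
}

// Wait for a background permit, but give up early if L0 work arrives or
// the task is cancelled, mirroring the tokio::select! in the hunk below.
async fn acquire_or_yield(
    semaphore: Arc<Semaphore>,
    l0_trigger: Arc<Notify>,
    cancel: CancellationToken,
) -> Acquire {
    tokio::select! {
        permit = semaphore.acquire_owned() => {
            Acquire::Permit(permit.expect("semaphore never closed"))
        }
        _ = l0_trigger.notified() => Acquire::YieldForL0,
        _ = cancel.cancelled() => Acquire::Cancelled,
    }
}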
+ // Acquire the compaction lock and task semaphore. + // + // L0-only compaction uses a separate semaphore (if enabled) to make sure it isn't starved + // out by other background tasks (including image compaction). We request this via + // `BackgroundLoopKind::L0Compaction`. + // + // If this is a regular compaction pass, and L0-only compaction is enabled in the config, + // then we should yield for immediate L0 compaction if necessary while we're waiting for the + // background task semaphore. There's no point yielding otherwise, since we'd just end up + // right back here. + let is_l0_only = options.flags.contains(CompactFlags::OnlyL0Compaction); + let semaphore_kind = match is_l0_only && self.get_compaction_l0_semaphore() { + true => BackgroundLoopKind::L0Compaction, + false => BackgroundLoopKind::Compaction, + }; + let yield_for_l0 = !is_l0_only + && self.get_compaction_l0_first() + && !options.flags.contains(CompactFlags::NoYield); - let prepare = async move { + let acquire = async move { let guard = self.compaction_lock.lock().await; - - let permit = super::tasks::concurrent_background_tasks_rate_limit_permit( - BackgroundLoopKind::Compaction, - ctx, - ) - .await; - + let permit = super::tasks::acquire_concurrency_permit(semaphore_kind, ctx).await; (guard, permit) }; - // this wait probably never needs any "long time spent" logging, because we already nag if - // compaction task goes over it's period (20s) which is quite often in production. let (_guard, _permit) = tokio::select! { - tuple = prepare => { tuple }, - _ = self.cancel.cancelled() => return Ok(CompactionOutcome::Done), - _ = cancel.cancelled() => return Ok(CompactionOutcome::Done), + (guard, permit) = acquire => (guard, permit), + _ = self.l0_compaction_trigger.notified(), if yield_for_l0 => { + return Ok(CompactionOutcome::YieldForL0); + } + _ = self.cancel.cancelled() => return Ok(CompactionOutcome::Skipped), + _ = cancel.cancelled() => return Ok(CompactionOutcome::Skipped), }; let last_record_lsn = self.get_last_record_lsn(); @@ -1740,7 +1867,7 @@ impl Timeline { // Last record Lsn could be zero in case the timeline was just created if !last_record_lsn.is_valid() { warn!("Skipping compaction for potentially just initialized timeline, it has invalid last record lsn: {last_record_lsn}"); - return Ok(CompactionOutcome::Done); + return Ok(CompactionOutcome::Skipped); } let result = match self.get_compaction_algorithm_settings().kind { @@ -1917,6 +2044,11 @@ impl Timeline { tracing::debug!("Cancelling CancellationToken"); self.cancel.cancel(); + // If we have a background task downloading heatmap layers stop it. + // The background downloads are sensitive to timeline cancellation (done above), + // so the drain will be immediate. + self.stop_and_drain_heatmap_layers_download().await; + // Ensure Prevent new page service requests from starting. 
self.handles.shutdown(); @@ -2234,6 +2366,14 @@ impl Timeline { .unwrap_or(self.conf.default_tenant_conf.compaction_threshold) } + pub(crate) fn get_rel_size_v2_enabled(&self) -> bool { + let tenant_conf = self.tenant_conf.load(); + tenant_conf + .tenant_conf + .rel_size_v2_enabled + .unwrap_or(self.conf.default_tenant_conf.rel_size_v2_enabled) + } + fn get_compaction_upper_limit(&self) -> usize { let tenant_conf = self.tenant_conf.load(); tenant_conf @@ -2242,6 +2382,20 @@ impl Timeline { .unwrap_or(self.conf.default_tenant_conf.compaction_upper_limit) } + pub fn get_compaction_l0_first(&self) -> bool { + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); + tenant_conf + .compaction_l0_first + .unwrap_or(self.conf.default_tenant_conf.compaction_l0_first) + } + + pub fn get_compaction_l0_semaphore(&self) -> bool { + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); + tenant_conf + .compaction_l0_semaphore + .unwrap_or(self.conf.default_tenant_conf.compaction_l0_semaphore) + } + fn get_l0_flush_delay_threshold(&self) -> Option { // Disable L0 flushes by default. This and compaction needs further tuning. const DEFAULT_L0_FLUSH_DELAY_FACTOR: usize = 0; // TODO: default to e.g. 3 @@ -2442,6 +2596,7 @@ impl Timeline { conf: &'static PageServerConf, tenant_conf: Arc>, metadata: &TimelineMetadata, + previous_heatmap: Option, ancestor: Option>, timeline_id: TimelineId, tenant_shard_id: TenantShardId, @@ -2528,6 +2683,7 @@ impl Timeline { ), directory_metrics: array::from_fn(|_| AtomicU64::new(0)), + directory_metrics_inited: array::from_fn(|_| AtomicBool::new(false)), flush_loop_state: Mutex::new(FlushLoopState::NotStarted), @@ -2542,7 +2698,7 @@ impl Timeline { LastImageLayerCreationStatus::default(), )), - latest_gc_cutoff_lsn: Rcu::new(metadata.latest_gc_cutoff_lsn()), + applied_gc_cutoff_lsn: Rcu::new(metadata.latest_gc_cutoff_lsn()), initdb_lsn: metadata.initdb_lsn(), current_logical_size: if disk_consistent_lsn.is_valid() { @@ -2583,6 +2739,7 @@ impl Timeline { compaction_lock: tokio::sync::Mutex::default(), compaction_failed: AtomicBool::default(), + l0_compaction_trigger: resources.l0_compaction_trigger, gc_lock: tokio::sync::Mutex::default(), standby_horizon: AtomicLsn::new(0), @@ -2603,6 +2760,10 @@ impl Timeline { create_idempotency, page_trace: Default::default(), + + previous_heatmap: ArcSwapOption::from_pointee(previous_heatmap), + + heatmap_layers_downloader: Mutex::new(None), }; result.repartition_threshold = @@ -2632,7 +2793,7 @@ impl Timeline { return; } FlushLoopState::Exited => { - warn!( + info!( "ignoring attempt to restart exited flush_loop {}/{}", self.tenant_shard_id, self.timeline_id ); @@ -3056,7 +3217,7 @@ impl Timeline { let self_ref = &self; let skip_concurrency_limiter = &skip_concurrency_limiter; async move { - let wait_for_permit = super::tasks::concurrent_background_tasks_rate_limit_permit( + let wait_for_permit = super::tasks::acquire_concurrency_permit( BackgroundLoopKind::InitialLogicalSizeCalculation, background_ctx, ); @@ -3291,8 +3452,42 @@ impl Timeline { } } - pub(crate) fn update_directory_entries_count(&self, kind: DirectoryKind, count: u64) { - self.directory_metrics[kind.offset()].store(count, AtomicOrdering::Relaxed); + pub(crate) fn update_directory_entries_count(&self, kind: DirectoryKind, count: MetricsUpdate) { + // TODO: this directory metrics is not correct -- we could have multiple reldirs in the system + // for each of the database, but we only store one value, and therefore each pgdirmodification + // would overwrite 
the previous value if they modify different databases.
+
+        match count {
+            MetricsUpdate::Set(count) => {
+                self.directory_metrics[kind.offset()].store(count, AtomicOrdering::Relaxed);
+                self.directory_metrics_inited[kind.offset()].store(true, AtomicOrdering::Relaxed);
+            }
+            MetricsUpdate::Add(count) => {
+                // TODO: these operations are not atomic; but we only have one writer to the metrics, so
+                // it's fine.
+                if self.directory_metrics_inited[kind.offset()].load(AtomicOrdering::Relaxed) {
+                    // The metric has been initialized with `MetricsUpdate::Set` before, so we can add/sub
+                    // the value reliably.
+                    self.directory_metrics[kind.offset()].fetch_add(count, AtomicOrdering::Relaxed);
+                }
+                // Otherwise, ignore this update
+            }
+            MetricsUpdate::Sub(count) => {
+                // TODO: these operations are not atomic; but we only have one writer to the metrics, so
+                // it's fine.
+                if self.directory_metrics_inited[kind.offset()].load(AtomicOrdering::Relaxed) {
+                    // The metric has been initialized with `MetricsUpdate::Set` before.
+                    // The operation could overflow so we need to normalize the value.
+                    let prev_val =
+                        self.directory_metrics[kind.offset()].load(AtomicOrdering::Relaxed);
+                    let res = prev_val.saturating_sub(count);
+                    self.directory_metrics[kind.offset()].store(res, AtomicOrdering::Relaxed);
+                }
+                // Otherwise, ignore this update
+            }
+        };
+
+        // TODO: remove this, there's no place in the code that updates this aux metric.
         let aux_metric =
             self.directory_metrics[DirectoryKind::AuxFiles.offset()].load(AtomicOrdering::Relaxed);
@@ -3341,12 +3536,52 @@
 
         let guard = self.layers.read().await;
 
+        // Firstly, if there's any heatmap left over from when this location
+        // was a secondary, take that into account. Keep layers that are:
+        // * present in the layer map
+        // * visible
+        // * non-resident
+        // * not evicted since we read the heatmap
+        //
+        // Without this, a new cold, attached location would clobber the previous
+        // heatmap.
+        let previous_heatmap = self.previous_heatmap.load();
+        let visible_non_resident = match previous_heatmap.as_deref() {
+            Some(PreviousHeatmap::Active { heatmap, read_at }) => {
+                Some(heatmap.layers.iter().filter_map(|hl| {
+                    let desc: PersistentLayerDesc = hl.name.clone().into();
+                    let layer = guard.try_get_from_key(&desc.key())?;
+
+                    if layer.visibility() == LayerVisibilityHint::Covered {
+                        return None;
+                    }
+
+                    if layer.is_likely_resident() {
+                        return None;
+                    }
+
+                    if layer.last_evicted_at().happened_after(*read_at) {
+                        return None;
+                    }
+
+                    Some((desc, hl.metadata.clone(), hl.access_time))
+                }))
+            }
+            Some(PreviousHeatmap::Obsolete) => None,
+            None => None,
+        };
+
+        // Secondly, all currently visible, resident layers are included.
         let resident = guard.likely_resident_layers().filter_map(|layer| {
             match layer.visibility() {
                 LayerVisibilityHint::Visible => {
                     // Layer is visible to one or more read LSNs: elegible for inclusion in layer map
                     let last_activity_ts = layer.latest_activity();
-                    Some((layer.layer_desc(), layer.metadata(), last_activity_ts))
+                    Some((
+                        layer.layer_desc().clone(),
+                        layer.metadata(),
+                        last_activity_ts,
+                    ))
                 }
                 LayerVisibilityHint::Covered => {
                     // Layer is resident but unlikely to be read: not elegible for inclusion in heatmap.
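The previous-heatmap filter in the hunk above reduces to four checks per layer. A toy model with hypothetical stand-in types (the real code consults the layer map and `LayerVisibilityHint`; this is an illustration, not the patch's code):

use std::time::Instant;

// Hypothetical stand-in for what the layer map knows about one layer.
struct LayerInfo {
    covered: bool,  // visibility hint says no read LSN needs it
    resident: bool, // already on local disk
    evicted_at: Option<Instant>,
}

/// A previous-heatmap entry is carried forward only if the layer still
/// exists, is visible, is not already resident (resident layers are listed
/// separately), and was not evicted after the heatmap was read.
fn keep_from_previous_heatmap(layer: Option<&LayerInfo>, read_at: Instant) -> bool {
    let Some(layer) = layer else { return false }; // gone from the layer map
    if layer.covered || layer.resident {
        return false;
    }
    // An eviction after `read_at` means the heatmap's view of it is stale.
    !layer.evicted_at.is_some_and(|at| at > read_at)
}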
@@ -3355,7 +3590,18 @@
             }
         });
 
-        let mut layers = resident.collect::<Vec<_>>();
+        let mut layers = match visible_non_resident {
+            Some(non_resident) => {
+                let mut non_resident = non_resident.peekable();
+                if non_resident.peek().is_none() {
+                    self.previous_heatmap
+                        .store(Some(PreviousHeatmap::Obsolete.into()));
+                }
+
+                non_resident.chain(resident).collect::<Vec<_>>()
+            }
+            None => resident.collect::<Vec<_>>(),
+        };
 
         // Sort layers in order of which to download first. For a large set of layers to download, we
         // want to prioritize those layers which are most likely to still be in the resident many minutes
@@ -3459,7 +3705,9 @@
         // space. If that's not the case, we had at least one key encounter a gap in the image layer
        // and stop the search as a result of that.
         let mut removed = keyspace.remove_overlapping_with(&image_covered_keyspace);
-        // Do not fire missing key error for sparse keys.
+        // Do not fire missing key error and end early for sparse keys. Note that we have already removed
+        // non-inherited keyspaces before, so we can safely do a full `SPARSE_RANGE` remove instead of
+        // figuring out what the inherited key range is and doing fine-grained pruning.
         removed.remove_overlapping_with(&KeySpace {
             ranges: vec![SPARSE_RANGE],
         });
@@ -3502,6 +3750,7 @@
                 request_lsn,
                 ancestor_lsn: Some(timeline.ancestor_lsn),
                 backtrace: None,
+                read_path: std::mem::take(&mut reconstruct_state.read_path),
             }));
         }
 
@@ -3543,7 +3792,7 @@
         // the timeline, then it will remove layers that are required for fulfilling
         // the current get request (read-path cannot "look back" and notice the new
         // image layer).
-        let _gc_cutoff_holder = timeline.get_latest_gc_cutoff_lsn();
+        let _gc_cutoff_holder = timeline.get_applied_gc_cutoff_lsn();
 
         // See `compaction::compact_with_gc` for why we need this.
         let _guard = timeline.gc_compaction_layer_update_lock.read().await;
@@ -3620,6 +3869,9 @@
         }
 
         if let Some((layer_to_read, keyspace_to_read, lsn_range)) = fringe.next_layer() {
+            if let Some(ref mut read_path) = reconstruct_state.read_path {
+                read_path.record_layer_visit(&layer_to_read, &keyspace_to_read, &lsn_range);
+            }
             let next_cont_lsn = lsn_range.start;
             layer_to_read
                 .get_values_reconstruct_data(
@@ -3920,6 +4172,12 @@
         }
         let flush_duration = flush_timer.stop_and_record();
 
+        // Notify the tenant compaction loop if L0 compaction is needed.
+        let l0_count = *watch_l0.borrow();
+        if l0_count >= self.get_compaction_threshold() {
+            self.l0_compaction_trigger.notify_one();
+        }
+
         // Delay the next flush to backpressure if compaction can't keep up. We delay by the
         // flush duration such that the flush takes 2x as long. This is propagated up to WAL
         // ingestion by having ephemeral layer rolls wait for flushes.
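The notify-on-threshold step added at the end of the flush path above is a small, reusable pattern: a `watch` channel publishes the current L0 count and a `Notify` wakes the parked compaction loop. A hypothetical sketch of both sides (illustrative names, not the patch's code):

use std::sync::Arc;
use std::time::Duration;

use tokio::sync::{watch, Notify};

// Flush side: publish the new L0 count and wake compaction if it crossed
// the threshold. notify_one() stores a wakeup even if nobody waits yet.
fn after_flush(l0_count: &watch::Sender<usize>, trigger: &Notify, threshold: usize) {
    if *l0_count.borrow() >= threshold {
        trigger.notify_one();
    }
}

// Compaction side: sleep until the period elapses or the trigger fires,
// whichever comes first.
async fn wait_for_work(trigger: Arc<Notify>, period: Duration) {
    tokio::select! {
        _ = tokio::time::sleep(period) => {}
        _ = trigger.notified() => {}
    }
}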
@@ -4092,6 +4350,7 @@ impl Timeline { ImageLayerCreationMode::Initial, ctx, LastImageLayerCreationStatus::Initial, + false, // don't yield for L0, we're flushing L0 ) .await?; debug_assert!( @@ -4220,7 +4479,7 @@ impl Timeline { let update = crate::tenant::metadata::MetadataUpdate::new( disk_consistent_lsn, ondisk_prev_record_lsn, - *self.latest_gc_cutoff_lsn.read(), + *self.applied_gc_cutoff_lsn.read(), ); fail_point!("checkpoint-before-saving-metadata", |x| bail!( @@ -4664,6 +4923,7 @@ impl Timeline { mode: ImageLayerCreationMode, ctx: &RequestContext, last_status: LastImageLayerCreationStatus, + yield_for_l0: bool, ) -> Result<(Vec, LastImageLayerCreationStatus), CreateImageLayersError> { let timer = self.metrics.create_images_time_histo.start_timer(); @@ -4860,7 +5120,7 @@ impl Timeline { if let ImageLayerCreationMode::Try = mode { // We have at least made some progress - if batch_image_writer.pending_layer_num() >= 1 { + if yield_for_l0 && batch_image_writer.pending_layer_num() >= 1 { // The `Try` mode is currently only used on the compaction path. We want to avoid // image layer generation taking too long time and blocking L0 compaction. So in this // mode, we also inspect the current number of L0 layers and skip image layer generation @@ -5447,7 +5707,7 @@ impl Timeline { // PITR interval is set & we didn't look up a timestamp successfully. Conservatively assume PITR // cannot advance beyond what was already GC'd, and respect space-based retention GcCutoffs { - time: *self.get_latest_gc_cutoff_lsn(), + time: *self.get_applied_gc_cutoff_lsn(), space: space_cutoff, } } @@ -5568,7 +5828,7 @@ impl Timeline { let mut result: GcResult = GcResult::default(); // Nothing to GC. Return early. - let latest_gc_cutoff = *self.get_latest_gc_cutoff_lsn(); + let latest_gc_cutoff = *self.get_applied_gc_cutoff_lsn(); if latest_gc_cutoff >= new_gc_cutoff { info!( "Nothing to GC: new_gc_cutoff_lsn {new_gc_cutoff}, latest_gc_cutoff_lsn {latest_gc_cutoff}", @@ -5582,7 +5842,7 @@ impl Timeline { // // The GC cutoff should only ever move forwards. 
let waitlist = { - let write_guard = self.latest_gc_cutoff_lsn.lock_for_write(); + let write_guard = self.applied_gc_cutoff_lsn.lock_for_write(); if *write_guard > new_gc_cutoff { return Err(GcError::BadLsn { why: format!( @@ -6522,18 +6782,32 @@ fn is_send() { #[cfg(test)] mod tests { + use std::sync::Arc; + use pageserver_api::key::Key; use pageserver_api::value::Value; + use tracing::Instrument; use utils::{id::TimelineId, lsn::Lsn}; use crate::tenant::{ harness::{test_img, TenantHarness}, layer_map::LayerMap, - storage_layer::{Layer, LayerName}, + storage_layer::{Layer, LayerName, LayerVisibilityHint}, timeline::{DeltaLayerTestDesc, EvictionError}, - Timeline, + PreviousHeatmap, Timeline, }; + use super::HeatMapTimeline; + + fn assert_heatmaps_have_same_layers(lhs: &HeatMapTimeline, rhs: &HeatMapTimeline) { + assert_eq!(lhs.layers.len(), rhs.layers.len()); + let lhs_rhs = lhs.layers.iter().zip(rhs.layers.iter()); + for (l, r) in lhs_rhs { + assert_eq!(l.name, r.name); + assert_eq!(l.metadata, r.metadata); + } + } + #[tokio::test] async fn test_heatmap_generation() { let harness = TenantHarness::create("heatmap_generation").await.unwrap(); @@ -6607,7 +6881,7 @@ mod tests { assert_eq!(heatmap.layers.last().unwrap().name, l0_delta.layer_name()); let mut last_lsn = Lsn::MAX; - for layer in heatmap.layers { + for layer in &heatmap.layers { // Covered layer should be omitted assert!(layer.name != covered_delta.layer_name()); @@ -6622,6 +6896,144 @@ mod tests { last_lsn = layer_lsn; } } + + // Evict all the layers and stash the old heatmap in the timeline. + // This simulates a migration to a cold secondary location. + + let guard = timeline.layers.read().await; + let mut all_layers = Vec::new(); + let forever = std::time::Duration::from_secs(120); + for layer in guard.likely_resident_layers() { + all_layers.push(layer.clone()); + layer.evict_and_wait(forever).await.unwrap(); + } + drop(guard); + + timeline + .previous_heatmap + .store(Some(Arc::new(PreviousHeatmap::Active { + heatmap: heatmap.clone(), + read_at: std::time::Instant::now(), + }))); + + // Generate a new heatmap and assert that it contains the same layers as the old one. + let post_migration_heatmap = timeline.generate_heatmap().await.unwrap(); + assert_heatmaps_have_same_layers(&heatmap, &post_migration_heatmap); + + // Download each layer one by one. Generate the heatmap at each step and check + // that it's stable. + for layer in all_layers { + if layer.visibility() == LayerVisibilityHint::Covered { + continue; + } + + eprintln!("Downloading {layer} and re-generating heatmap"); + + let _resident = layer + .download_and_keep_resident() + .instrument(tracing::info_span!( + parent: None, + "download_layer", + tenant_id = %timeline.tenant_shard_id.tenant_id, + shard_id = %timeline.tenant_shard_id.shard_slug(), + timeline_id = %timeline.timeline_id + )) + .await + .unwrap(); + + let post_download_heatmap = timeline.generate_heatmap().await.unwrap(); + assert_heatmaps_have_same_layers(&heatmap, &post_download_heatmap); + } + + // Everything from the post-migration heatmap is now resident. + // Check that we drop it from memory. 
+ assert!(matches!( + timeline.previous_heatmap.load().as_deref(), + Some(PreviousHeatmap::Obsolete) + )); + } + + #[tokio::test] + async fn test_previous_heatmap_obsoletion() { + let harness = TenantHarness::create("heatmap_previous_heatmap_obsoletion") + .await + .unwrap(); + + let l0_delta = DeltaLayerTestDesc::new( + Lsn(0x20)..Lsn(0x30), + Key::from_hex("000000000000000000000000000000000000").unwrap() + ..Key::from_hex("FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF").unwrap(), + vec![( + Key::from_hex("720000000033333333444444445500000000").unwrap(), + Lsn(0x25), + Value::Image(test_img("foo")), + )], + ); + + let image_layer = ( + Lsn(0x40), + vec![( + Key::from_hex("620000000033333333444444445500000000").unwrap(), + test_img("bar"), + )], + ); + + let delta_layers = vec![l0_delta]; + let image_layers = vec![image_layer]; + + let (tenant, ctx) = harness.load().await; + let timeline = tenant + .create_test_timeline_with_layers( + TimelineId::generate(), + Lsn(0x10), + 14, + &ctx, + delta_layers, + image_layers, + Lsn(0x100), + ) + .await + .unwrap(); + + // Layer visibility is an input to heatmap generation, so refresh it first + timeline.update_layer_visibility().await.unwrap(); + + let heatmap = timeline + .generate_heatmap() + .await + .expect("Infallible while timeline is not shut down"); + + // Both layers should be in the heatmap + assert!(!heatmap.layers.is_empty()); + + // Now simulate a migration. + timeline + .previous_heatmap + .store(Some(Arc::new(PreviousHeatmap::Active { + heatmap: heatmap.clone(), + read_at: std::time::Instant::now(), + }))); + + // Evict all the layers in the previous heatmap + let guard = timeline.layers.read().await; + let forever = std::time::Duration::from_secs(120); + for layer in guard.likely_resident_layers() { + layer.evict_and_wait(forever).await.unwrap(); + } + drop(guard); + + // Generate a new heatmap and check that the previous heatmap + // has been marked obsolete. + let post_eviction_heatmap = timeline + .generate_heatmap() + .await + .expect("Infallible while timeline is not shut down"); + + assert!(post_eviction_heatmap.layers.is_empty()); + assert!(matches!( + timeline.previous_heatmap.load().as_deref(), + Some(PreviousHeatmap::Obsolete) + )); } #[tokio::test] diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index b9f4954453..9e082d74b5 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -301,18 +301,12 @@ impl GcCompactionQueue { let mut guard = self.inner.lock().unwrap(); guard.gc_guards.insert(id, gc_guard); } - let _ = timeline - .compact_with_options(cancel, options, ctx) - .instrument(info_span!("scheduled_compact_timeline", %timeline.timeline_id)) - .await?; + let _ = timeline.compact_with_options(cancel, options, ctx).await?; self.notify_and_unblock(id); } } GcCompactionQueueItem::SubCompactionJob(options) => { - let _ = timeline - .compact_with_options(cancel, options, ctx) - .instrument(info_span!("scheduled_compact_timeline", %timeline.timeline_id)) - .await?; + let _ = timeline.compact_with_options(cancel, options, ctx).await?; } GcCompactionQueueItem::Notify(id) => { self.notify_and_unblock(id); @@ -609,6 +603,11 @@ pub enum CompactionOutcome { /// Still has pending layers to be compacted after this round. Ideally, the scheduler /// should immediately schedule another compaction. Pending, + /// A timeline needs L0 compaction. 
Yield and schedule an immediate L0 compaction pass (only + /// guaranteed when `compaction_l0_first` is enabled). + YieldForL0, + /// Compaction was skipped, because the timeline is ineligible for compaction. + Skipped, } impl Timeline { @@ -688,9 +687,9 @@ impl Timeline { // Define partitioning schema if needed // 1. L0 Compact - let l0_compaction_outcome = { + let l0_outcome = { let timer = self.metrics.compact_time_histo.start_timer(); - let l0_compaction_outcome = self + let l0_outcome = self .compact_level0( target_file_size, options.flags.contains(CompactFlags::ForceL0Compaction), @@ -698,21 +697,25 @@ impl Timeline { ) .await?; timer.stop_and_record(); - l0_compaction_outcome + l0_outcome }; - if let CompactionOutcome::Pending = l0_compaction_outcome { - // Yield and do not do any other kind of compaction. True means - // that we have pending L0 compaction tasks and the compaction scheduler - // will prioritize compacting this tenant/timeline again. - info!("skipping image layer generation and shard ancestor compaction due to L0 compaction did not include all layers."); - return Ok(CompactionOutcome::Pending); + if options.flags.contains(CompactFlags::OnlyL0Compaction) { + return Ok(l0_outcome); + } + + // Yield if we have pending L0 compaction. The scheduler will do another pass. + if (l0_outcome == CompactionOutcome::Pending || l0_outcome == CompactionOutcome::YieldForL0) + && !options.flags.contains(CompactFlags::NoYield) + { + info!("image/ancestor compaction yielding for L0 compaction"); + return Ok(CompactionOutcome::YieldForL0); } // 2. Repartition and create image layers if necessary - let partition_count = match self + match self .repartition( - self.get_last_record_lsn(), // TODO: use L0-L1 boundary + self.get_last_record_lsn(), self.get_compaction_target_size(), options.flags, ctx, @@ -748,6 +751,7 @@ impl Timeline { .load() .as_ref() .clone(), + !options.flags.contains(CompactFlags::NoYield), ) .await .inspect_err(|err| { @@ -766,9 +770,8 @@ impl Timeline { if let LastImageLayerCreationStatus::Incomplete { .. } = outcome { // Yield and do not do any other kind of compaction. info!("skipping shard ancestor compaction due to pending image layer generation tasks (preempted by L0 compaction)."); - return Ok(CompactionOutcome::Pending); + return Ok(CompactionOutcome::YieldForL0); } - partitioning.parts.len() } Err(err) => { // no partitioning? This is normal, if the timeline was just created @@ -780,10 +783,11 @@ impl Timeline { if !self.cancel.is_cancelled() && !err.is_cancelled() { tracing::error!("could not compact, repartitioning keyspace failed: {err:?}"); } - 1 } }; + let partition_count = self.partitioning.read().0 .0.parts.len(); + // 4. Shard ancestor compaction if self.shard_identity.count >= ShardCount::new(2) { @@ -820,7 +824,7 @@ impl Timeline { // // Holding this read guard also blocks [`Self::gc_timeline`] from entering while we // are rewriting layers. - let latest_gc_cutoff = self.get_latest_gc_cutoff_lsn(); + let latest_gc_cutoff = self.get_applied_gc_cutoff_lsn(); tracing::info!( "latest_gc_cutoff: {}, pitr cutoff {}", @@ -2170,7 +2174,7 @@ impl Timeline { // TODO: ensure the child branches will not use anything below the watermark, or consider // them when computing the watermark. - gc_cutoff_lsn.min(*self.get_latest_gc_cutoff_lsn()) + gc_cutoff_lsn.min(*self.get_applied_gc_cutoff_lsn()) } /// Split a gc-compaction job into multiple compaction jobs. The split is based on the key range and the estimated size of the compaction job. 
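The job splitting described in this doc comment — accumulate key ranges until the layers overlapping them exceed a size budget, then cut a job — can be modeled without the layer map. A simplified sketch (hypothetical types; the real code sums `PersistentLayerDesc` file sizes below an LSN cutoff, as the next hunk shows):

/// A candidate key range plus the estimated bytes of layers overlapping it.
struct RangePart {
    start: u64,
    end: u64,
    est_size: u64,
}

/// Greedily merge consecutive partitions into jobs of at most
/// `max_job_size` bytes; the final partition always closes the last job.
fn split_jobs(parts: &[RangePart], max_job_size: u64) -> Vec<(u64, u64)> {
    let mut jobs = Vec::new();
    let mut current: Option<(u64, u64)> = None; // (job start key, accumulated bytes)
    for (idx, part) in parts.iter().enumerate() {
        let (start, acc) = current.take().unwrap_or((part.start, 0));
        let acc = acc + part.est_size;
        if acc > max_job_size || idx + 1 == parts.len() {
            jobs.push((start, part.end)); // cut the job at this partition's end
        } else {
            current = Some((start, acc)); // keep accumulating
        }
    }
    jobs
}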
@@ -2238,8 +2242,11 @@ impl Timeline { split_key_ranges.push((start, end)); } split_key_ranges.sort(); - let guard = self.layers.read().await; - let layer_map = guard.layer_map()?; + let all_layers = { + let guard = self.layers.read().await; + let layer_map = guard.layer_map()?; + layer_map.iter_historic_layers().collect_vec() + }; let mut current_start = None; let ranges_num = split_key_ranges.len(); for (idx, (start, end)) in split_key_ranges.into_iter().enumerate() { @@ -2251,14 +2258,23 @@ impl Timeline { // We have already processed this partition. continue; } - let res = layer_map.range_search(start..end, compact_below_lsn); - let total_size = res.found.keys().map(|x| x.layer.file_size()).sum::(); + let overlapping_layers = { + let mut desc = Vec::new(); + for layer in all_layers.iter() { + if overlaps_with(&layer.get_key_range(), &(start..end)) + && layer.get_lsn_range().start <= compact_below_lsn + { + desc.push(layer.clone()); + } + } + desc + }; + let total_size = overlapping_layers.iter().map(|x| x.file_size).sum::(); if total_size > sub_compaction_max_job_size_mb * 1024 * 1024 || ranges_num == idx + 1 { // Try to extend the compaction range so that we include at least one full layer file. - let extended_end = res - .found - .keys() - .map(|layer| layer.layer.key_range.end) + let extended_end = overlapping_layers + .iter() + .map(|layer| layer.key_range.end) .min(); // It is possible that the search range does not contain any layer files when we reach the end of the loop. // In this case, we simply use the specified key range end. @@ -2285,7 +2301,6 @@ impl Timeline { current_start = Some(end); } } - drop(guard); Ok(compact_jobs) } diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 3c828c8a9e..841b2fa1c7 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -17,13 +17,11 @@ use crate::{ metadata::TimelineMetadata, remote_timeline_client::{PersistIndexPartWithDeletedFlagError, RemoteTimelineClient}, CreateTimelineCause, DeleteTimelineError, MaybeDeletedIndexPart, Tenant, - TenantManifestError, TimelineOrOffloaded, + TenantManifestError, Timeline, TimelineOrOffloaded, }, virtual_file::MaybeFatalIo, }; -use super::{Timeline, TimelineResources}; - /// Mark timeline as deleted in S3 so we won't pick it up next time /// during attach or pageserver restart. /// See comment in persist_index_part_with_deleted_flag. @@ -296,12 +294,8 @@ impl DeleteTimelineFlow { timeline_id, local_metadata, None, // Ancestor is not needed for deletion. - TimelineResources { - remote_client, - pagestream_throttle: tenant.pagestream_throttle.clone(), - pagestream_throttle_metrics: tenant.pagestream_throttle_metrics.clone(), - l0_flush_global_state: tenant.l0_flush_global_state.clone(), - }, + None, // Previous heatmap is not needed for deletion + tenant.get_timeline_resources_for(remote_client), // Important. We dont pass ancestor above because it can be missing. // Thus we need to skip the validation here. CreateTimelineCause::Delete, @@ -341,6 +335,13 @@ impl DeleteTimelineFlow { let tenant_shard_id = timeline.tenant_shard_id(); let timeline_id = timeline.timeline_id(); + // Take a tenant gate guard, because timeline deletion needs access to the tenant to update its manifest. + let Ok(tenant_guard) = tenant.gate.enter() else { + // It is safe to simply skip here, because we only schedule background work once the timeline is durably marked for deletion. 
+ info!("Tenant is shutting down, timeline deletion will be resumed when it next starts"); + return; + }; + task_mgr::spawn( task_mgr::BACKGROUND_RUNTIME.handle(), TaskKind::TimelineDeletionWorker, @@ -348,6 +349,8 @@ impl DeleteTimelineFlow { Some(timeline_id), "timeline_delete", async move { + let _guard = tenant_guard; + if let Err(err) = Self::background(guard, conf, &tenant, &timeline, remote_client).await { // Only log as an error if it's not a cancellation. if matches!(err, DeleteTimelineError::Cancelled) { diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index f8bc4352e2..e0084d3eef 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -6,17 +6,20 @@ use crate::{ task_mgr::TaskKind, tenant::{ remote_timeline_client::index::GcBlockingReason::DetachAncestor, - storage_layer::{AsLayerDesc as _, DeltaLayerWriter, Layer, ResidentLayer}, + storage_layer::{ + layer::local_layer_path, AsLayerDesc as _, DeltaLayerWriter, Layer, ResidentLayer, + }, Tenant, }, virtual_file::{MaybeFatalIo, VirtualFile}, }; use anyhow::Context; +use http_utils::error::ApiError; use pageserver_api::{models::detach_ancestor::AncestorDetached, shard::ShardIdentity}; use tokio::sync::Semaphore; use tokio_util::sync::CancellationToken; use tracing::Instrument; -use utils::{completion, generation::Generation, http::error::ApiError, id::TimelineId, lsn::Lsn}; +use utils::{completion, generation::Generation, id::TimelineId, lsn::Lsn}; #[derive(Debug, thiserror::Error)] pub(crate) enum Error { @@ -351,18 +354,7 @@ pub(super) async fn prepare( // FIXME: the fsync should be mandatory, after both rewrites and copies if wrote_any { - let timeline_dir = VirtualFile::open( - &detached - .conf - .timeline_path(&detached.tenant_shard_id, &detached.timeline_id), - ctx, - ) - .await - .fatal_err("VirtualFile::open for timeline dir fsync"); - timeline_dir - .sync_all() - .await - .fatal_err("VirtualFile::sync_all timeline dir"); + fsync_timeline_dir(detached, ctx).await; } } @@ -376,7 +368,7 @@ pub(super) async fn prepare( tasks.spawn( async move { let _permit = limiter.acquire().await; - let owned = remote_copy( + let (owned, did_hardlink) = remote_copy( &adopted, &timeline, timeline.generation, @@ -384,16 +376,20 @@ pub(super) async fn prepare( &timeline.cancel, ) .await?; - tracing::info!(layer=%owned, "remote copied"); - Ok(owned) + tracing::info!(layer=%owned, did_hard_link=%did_hardlink, "remote copied"); + Ok((owned, did_hardlink)) } .in_current_span(), ); } + let mut should_fsync = false; while let Some(res) = tasks.join_next().await { match res { - Ok(Ok(owned)) => { + Ok(Ok((owned, did_hardlink))) => { + if did_hardlink { + should_fsync = true; + } new_layers.push(owned); } Ok(Err(failed)) => { @@ -403,7 +399,10 @@ pub(super) async fn prepare( } } - // TODO: fsync directory again if we hardlinked something + // fsync directory again if we hardlinked something + if should_fsync { + fsync_timeline_dir(detached, ctx).await; + } let prepared = PreparedTimelineDetach { layers: new_layers }; @@ -629,35 +628,52 @@ async fn copy_lsn_prefix( } } -/// Creates a new Layer instance for the adopted layer, and ensures it is found from the remote -/// storage on successful return without the adopted layer being added to `index_part.json`. +/// Creates a new Layer instance for the adopted layer, and ensures it is found in the remote +/// storage on successful return. 
without the adopted layer being added to `index_part.json`. +/// Returns (Layer, did hardlink) async fn remote_copy( adopted: &Layer, adoptee: &Arc, generation: Generation, shard_identity: ShardIdentity, cancel: &CancellationToken, -) -> Result { - // depending if Layer::keep_resident we could hardlink - +) -> Result<(Layer, bool), Error> { let mut metadata = adopted.metadata(); debug_assert!(metadata.generation <= generation); metadata.generation = generation; metadata.shard = shard_identity.shard_index(); - let owned = crate::tenant::storage_layer::Layer::for_evicted( - adoptee.conf, - adoptee, - adopted.layer_desc().layer_name(), - metadata, - ); + let conf = adoptee.conf; + let file_name = adopted.layer_desc().layer_name(); - adoptee + // depending if Layer::keep_resident, do a hardlink + let did_hardlink; + let owned = if let Some(adopted_resident) = adopted.keep_resident().await { + let adopted_path = adopted_resident.local_path(); + let adoptee_path = local_layer_path( + conf, + &adoptee.tenant_shard_id, + &adoptee.timeline_id, + &file_name, + &metadata.generation, + ); + std::fs::hard_link(adopted_path, &adoptee_path) + .map_err(|e| Error::launder(e.into(), Error::Prepare))?; + did_hardlink = true; + Layer::for_resident(conf, adoptee, adoptee_path, file_name, metadata).drop_eviction_guard() + } else { + did_hardlink = false; + Layer::for_evicted(conf, adoptee, file_name, metadata) + }; + + let layer = adoptee .remote_client .copy_timeline_layer(adopted, &owned, cancel) .await .map(move |()| owned) - .map_err(|e| Error::launder(e, Error::Prepare)) + .map_err(|e| Error::launder(e, Error::Prepare))?; + + Ok((layer, did_hardlink)) } pub(crate) enum DetachingAndReparenting { @@ -1001,3 +1017,16 @@ fn check_no_archived_children_of_ancestor( } Ok(()) } + +async fn fsync_timeline_dir(timeline: &Timeline, ctx: &RequestContext) { + let path = &timeline + .conf + .timeline_path(&timeline.tenant_shard_id, &timeline.timeline_id); + let timeline_dir = VirtualFile::open(&path, ctx) + .await + .fatal_err("VirtualFile::open for timeline dir fsync"); + timeline_dir + .sync_all() + .await + .fatal_err("VirtualFile::sync_all timeline dir"); +} diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 9836aafecb..77c33349e0 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -32,7 +32,7 @@ use crate::{ tenant::{ size::CalculateSyntheticSizeError, storage_layer::LayerVisibilityHint, - tasks::{BackgroundLoopKind, BackgroundLoopSemaphorePermit}, + tasks::{sleep_random, BackgroundLoopKind, BackgroundLoopSemaphorePermit}, timeline::EvictionError, LogicalSizeCalculationCause, Tenant, }, @@ -83,8 +83,6 @@ impl Timeline { #[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))] async fn eviction_task(self: Arc, tenant: Arc) { - use crate::tenant::tasks::random_init_delay; - // acquire the gate guard only once within a useful span let Ok(guard) = self.gate.enter() else { return; @@ -97,7 +95,7 @@ impl Timeline { EvictionPolicy::OnlyImitiate(lat) => lat.period, EvictionPolicy::NoEviction => Duration::from_secs(10), }; - if random_init_delay(period, &self.cancel).await.is_err() { + if sleep_random(period, &self.cancel).await.is_err() { return; } } @@ -334,10 +332,8 @@ impl Timeline { cancel: &CancellationToken, ctx: &RequestContext, ) -> ControlFlow<(), BackgroundLoopSemaphorePermit<'static>> 
{ - let acquire_permit = crate::tenant::tasks::concurrent_background_tasks_rate_limit_permit( - BackgroundLoopKind::Eviction, - ctx, - ); + let acquire_permit = + crate::tenant::tasks::acquire_concurrency_permit(BackgroundLoopKind::Eviction, ctx); tokio::select! { permit = acquire_permit => ControlFlow::Continue(permit), diff --git a/pageserver/src/tenant/timeline/heatmap_layers_downloader.rs b/pageserver/src/tenant/timeline/heatmap_layers_downloader.rs new file mode 100644 index 0000000000..0ba9753e85 --- /dev/null +++ b/pageserver/src/tenant/timeline/heatmap_layers_downloader.rs @@ -0,0 +1,162 @@ +//! Timeline utility module to hydrate everything from the current heatmap. +//! +//! Provides utilities to spawn and abort a background task where the downloads happen. +//! See /v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_heatmap_layers. + +use futures::StreamExt; +use http_utils::error::ApiError; +use std::sync::{Arc, Mutex}; +use tokio_util::sync::CancellationToken; +use utils::sync::gate::Gate; + +use super::Timeline; + +// This status is not strictly necessary now, but gives us a nice place +// to store progress information if we ever wish to expose it. +pub(super) enum HeatmapLayersDownloadStatus { + InProgress, + Complete, +} + +pub(super) struct HeatmapLayersDownloader { + handle: tokio::task::JoinHandle<()>, + status: Arc>, + cancel: CancellationToken, + downloads_guard: Arc, +} + +impl HeatmapLayersDownloader { + fn new( + timeline: Arc, + concurrency: usize, + ) -> Result { + let tl_guard = timeline.gate.enter().map_err(|_| ApiError::Cancelled)?; + + let cancel = timeline.cancel.child_token(); + let downloads_guard = Arc::new(Gate::default()); + + let status = Arc::new(Mutex::new(HeatmapLayersDownloadStatus::InProgress)); + + let handle = tokio::task::spawn({ + let status = status.clone(); + let downloads_guard = downloads_guard.clone(); + let cancel = cancel.clone(); + + async move { + let _guard = tl_guard; + + scopeguard::defer! { + *status.lock().unwrap() = HeatmapLayersDownloadStatus::Complete; + } + + let Some(heatmap) = timeline.generate_heatmap().await else { + tracing::info!("Heatmap layers download failed to generate heatmap"); + return; + }; + + tracing::info!( + resident_size=%timeline.resident_physical_size(), + heatmap_layers=%heatmap.layers.len(), + "Starting heatmap layers download" + ); + + let stream = futures::stream::iter(heatmap.layers.into_iter().filter_map( + |layer| { + let tl = timeline.clone(); + let dl_guard = match downloads_guard.enter() { + Ok(g) => g, + Err(_) => { + // [`Self::shutdown`] was called. Don't spawn any more downloads. + return None; + } + }; + + Some(async move { + let _dl_guard = dl_guard; + + let res = tl.download_layer(&layer.name).await; + if let Err(err) = res { + if !err.is_cancelled() { + tracing::warn!(layer=%layer.name,"Failed to download heatmap layer: {err}") + } + } + }) + } + )).buffered(concurrency); + + tokio::select! { + _ = stream.collect::<()>() => { + tracing::info!( + resident_size=%timeline.resident_physical_size(), + "Heatmap layers download completed" + ); + }, + _ = cancel.cancelled() => { + tracing::info!("Heatmap layers download cancelled"); + } + } + } + }); + + Ok(Self { + status, + handle, + cancel, + downloads_guard, + }) + } + + fn is_complete(&self) -> bool { + matches!( + *self.status.lock().unwrap(), + HeatmapLayersDownloadStatus::Complete + ) + } + + /// Drive any in-progress downloads to completion and stop spawning any new ones. 
+ /// + /// This has two callers and they behave differently: + /// 1. [`Timeline::shutdown`]: the drain will be immediate since downloads themselves + /// are sensitive to timeline cancellation. + /// + /// 2. Endpoint handler in [`crate::http::routes`]: the drain will wait for any in-progress + /// downloads to complete. + async fn stop_and_drain(self) { + // Counterintuitive: close the guard before cancelling. + // Something needs to poll the already created download futures to completion. + // If we cancel first, then the underlying task exits and we lose + // the poller. + self.downloads_guard.close().await; + self.cancel.cancel(); + if let Err(err) = self.handle.await { + tracing::warn!("Failed to join heatmap layer downloader task: {err}"); + } + } +} + +impl Timeline { + pub(crate) async fn start_heatmap_layers_download( + self: &Arc, + concurrency: usize, + ) -> Result<(), ApiError> { + let mut locked = self.heatmap_layers_downloader.lock().unwrap(); + if locked.as_ref().map(|dl| dl.is_complete()).unwrap_or(true) { + let dl = HeatmapLayersDownloader::new(self.clone(), concurrency)?; + *locked = Some(dl); + Ok(()) + } else { + Err(ApiError::Conflict("Already running".to_string())) + } + } + + pub(crate) async fn stop_and_drain_heatmap_layers_download(&self) { + // This can race with the start of a new downloader and lead to a situation + // where one downloader is shutting down and another one is in-flight. + // The only impact is that we'd end up using more remote storage semaphore + // units than expected. + let downloader = self.heatmap_layers_downloader.lock().unwrap().take(); + if let Some(dl) = downloader { + dl.stop_and_drain().await; + } + } +} diff --git a/pageserver/src/tenant/timeline/offload.rs b/pageserver/src/tenant/timeline/offload.rs index 3b5bf8290c..93e5a1100d 100644 --- a/pageserver/src/tenant/timeline/offload.rs +++ b/pageserver/src/tenant/timeline/offload.rs @@ -7,7 +7,9 @@ use super::Timeline; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::remote_timeline_client::ShutdownIfArchivedError; use crate::tenant::timeline::delete::{make_timeline_delete_guard, TimelineDeleteGuardKind}; -use crate::tenant::{OffloadedTimeline, Tenant, TenantManifestError, TimelineOrOffloaded}; +use crate::tenant::{ + DeleteTimelineError, OffloadedTimeline, Tenant, TenantManifestError, TimelineOrOffloaded, +}; #[derive(thiserror::Error, Debug)] pub(crate) enum OffloadError { @@ -37,12 +39,25 @@ pub(crate) async fn offload_timeline( debug_assert_current_span_has_tenant_and_timeline_id(); tracing::info!("offloading archived timeline"); - let (timeline, guard) = make_timeline_delete_guard( + let delete_guard_res = make_timeline_delete_guard( tenant, timeline.timeline_id, TimelineDeleteGuardKind::Offload, - ) - .map_err(|e| OffloadError::Other(anyhow::anyhow!(e)))?; + ); + if let Err(DeleteTimelineError::HasChildren(children)) = delete_guard_res { + let is_archived = timeline.is_archived(); + if is_archived == Some(true) { + tracing::error!("timeline is archived but has non-archived children: {children:?}"); + return Err(OffloadError::NotArchived); + } + tracing::info!( + ?is_archived, + "timeline is not archived and has unarchived children" + ); + return Err(OffloadError::NotArchived); + }; + let (timeline, guard) = + delete_guard_res.map_err(|e| OffloadError::Other(anyhow::anyhow!(e)))?; let TimelineOrOffloaded::Timeline(timeline) = timeline else { tracing::error!("timeline already offloaded, but given timeline object"); diff --git
a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 9d539198c7..c966ad813f 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -496,7 +496,8 @@ pub(crate) fn is_fatal_io_error(e: &std::io::Error) -> bool { /// bad storage or bad configuration, and we can't fix that from inside /// a running process. pub(crate) fn on_fatal_io_error(e: &std::io::Error, context: &str) -> ! { - tracing::error!("Fatal I/O error: {e}: {context})"); + let backtrace = std::backtrace::Backtrace::force_capture(); + tracing::error!("Fatal I/O error: {e}: {context})\n{backtrace}"); std::process::abort(); } @@ -947,13 +948,18 @@ impl VirtualFileInner { where Buf: tokio_epoll_uring::IoBufMut + Send, { - let file_guard = match self.lock_file().await { + let file_guard = match self + .lock_file() + .await + .maybe_fatal_err("lock_file inside VirtualFileInner::read_at") + { Ok(file_guard) => file_guard, Err(e) => return (buf, Err(e)), }; observe_duration!(StorageIoOperation::Read, { let ((_file_guard, buf), res) = io_engine::get().read_at(file_guard, offset, buf).await; + let res = res.maybe_fatal_err("io_engine read_at inside VirtualFileInner::read_at"); if let Ok(size) = res { STORAGE_IO_SIZE .with_label_values(&[ diff --git a/pgxn/neon/extension_server.c b/pgxn/neon/extension_server.c index e38af08f89..0331f961b4 100644 --- a/pgxn/neon/extension_server.c +++ b/pgxn/neon/extension_server.c @@ -14,10 +14,12 @@ #include "utils/guc.h" -#include "extension_server.h" +#include "extension_server.h" #include "neon_utils.h" static int extension_server_port = 0; +static int extension_server_request_timeout = 60; +static int extension_server_connect_timeout = 60; static download_extension_file_hook_type prev_download_extension_file_hook = NULL; @@ -34,19 +36,18 @@ static download_extension_file_hook_type prev_download_extension_file_hook = NUL static bool neon_download_extension_file_http(const char *filename, bool is_library) { - static CURL *handle = NULL; - CURLcode res; - char *compute_ctl_url; bool ret = false; + CURL *handle = NULL; + char *compute_ctl_url; - if (handle == NULL) - { - handle = alloc_curl_handle(); + handle = alloc_curl_handle(); - curl_easy_setopt(handle, CURLOPT_CUSTOMREQUEST, "POST"); - curl_easy_setopt(handle, CURLOPT_TIMEOUT, 3L /* seconds */ ); - } + curl_easy_setopt(handle, CURLOPT_CUSTOMREQUEST, "POST"); + if (extension_server_request_timeout > 0) + curl_easy_setopt(handle, CURLOPT_TIMEOUT, (long)extension_server_request_timeout /* seconds */ ); + if (extension_server_connect_timeout > 0) + curl_easy_setopt(handle, CURLOPT_CONNECTTIMEOUT, (long)extension_server_connect_timeout /* seconds */ ); compute_ctl_url = psprintf("http://localhost:%d/extension_server/%s%s", extension_server_port, filename, is_library ? 
"?is_library=true" : ""); @@ -57,6 +58,8 @@ neon_download_extension_file_http(const char *filename, bool is_library) /* Perform the request, res will get the return code */ res = curl_easy_perform(handle); + curl_easy_cleanup(handle); + /* Check for errors */ if (res == CURLE_OK) { @@ -88,6 +91,24 @@ pg_init_extension_server() 0, /* no flags required */ NULL, NULL, NULL); + DefineCustomIntVariable("neon.extension_server_request_timeout", + "timeout for fetching extensions in seconds", + NULL, + &extension_server_request_timeout, + 60, 0, INT_MAX, + PGC_SUSET, + GUC_UNIT_S, + NULL, NULL, NULL); + + DefineCustomIntVariable("neon.extension_server_connect_timeout", + "timeout for connecting to the extension server in seconds", + NULL, + &extension_server_connect_timeout, + 60, 0, INT_MAX, + PGC_SUSET, + GUC_UNIT_S, + NULL, NULL, NULL); + /* set download_extension_file_hook */ prev_download_extension_file_hook = download_extension_file_hook; download_extension_file_hook = neon_download_extension_file_http; diff --git a/pgxn/neon/hll.c b/pgxn/neon/hll.c index 1f53c8fd36..bbaad09f5f 100644 --- a/pgxn/neon/hll.c +++ b/pgxn/neon/hll.c @@ -122,8 +122,8 @@ addSHLL(HyperLogLogState *cState, uint32 hash) index = hash >> HLL_C_BITS; /* Compute the rank of the remaining 32 - "k" (registerWidth) bits */ - count = rho(hash << HLL_BIT_WIDTH, HLL_C_BITS); - + count = rho(hash << HLL_BIT_WIDTH, HLL_C_BITS) - 1; + Assert(count <= HLL_C_BITS); cState->regs[index][count] = now; } @@ -136,7 +136,7 @@ getMaximum(const TimestampTz* reg, TimestampTz since) { if (reg[i] >= since) { - max = i; + max = i + 1; } } diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 22aeb2e2d6..fc1aecd340 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -378,8 +378,9 @@ pageserver_connect(shardno_t shard_no, int elevel) { case PS_Disconnected: { - const char *keywords[3]; - const char *values[3]; + const char *keywords[4]; + const char *values[4]; + char pid_str[16]; int n_pgsql_params; TimestampTz now; int64 us_since_last_attempt; @@ -424,14 +425,30 @@ pageserver_connect(shardno_t shard_no, int elevel) * can override the password from the env variable. Seems useful, although * we don't currently use that capability anywhere. */ - keywords[0] = "dbname"; - values[0] = connstr; - n_pgsql_params = 1; + n_pgsql_params = 0; + + /* + * Pageserver logs include this in the connection's tracing span. + * This allows for reasier log correlation between compute and pageserver. + */ + keywords[n_pgsql_params] = "application_name"; + { + int ret = snprintf(pid_str, sizeof(pid_str), "%d", MyProcPid); + if (ret < 0 || ret >= (int)(sizeof(pid_str))) + elog(FATAL, "stack-allocated buffer too small to hold pid"); + } + /* lifetime: PQconnectStartParams strdups internally */ + values[n_pgsql_params] = (const char*) pid_str; + n_pgsql_params++; + + keywords[n_pgsql_params] = "dbname"; + values[n_pgsql_params] = connstr; + n_pgsql_params++; if (neon_auth_token) { - keywords[1] = "password"; - values[1] = neon_auth_token; + keywords[n_pgsql_params] = "password"; + values[n_pgsql_params] = neon_auth_token; n_pgsql_params++; } diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 8051970176..f1087a8ccb 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -3765,7 +3765,7 @@ neon_dbsize(Oid dbNode) * neon_truncate() -- Truncate relation to specified number of blocks. 
*/ static void -neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) +neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber old_blocks, BlockNumber nblocks) { XLogRecPtr lsn; @@ -3780,7 +3780,7 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) case RELPERSISTENCE_TEMP: case RELPERSISTENCE_UNLOGGED: - mdtruncate(reln, forknum, nblocks); + mdtruncate(reln, forknum, old_blocks, nblocks); return; default: @@ -3818,7 +3818,7 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) - mdtruncate(reln, forknum, nblocks); + mdtruncate(reln, forknum, old_blocks, nblocks); #endif } diff --git a/pgxn/neon_walredo/inmem_smgr.c b/pgxn/neon_walredo/inmem_smgr.c index a45e8f5c4a..74cd5ac601 100644 --- a/pgxn/neon_walredo/inmem_smgr.c +++ b/pgxn/neon_walredo/inmem_smgr.c @@ -96,7 +96,7 @@ static void inmem_writeback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks); static BlockNumber inmem_nblocks(SMgrRelation reln, ForkNumber forknum); static void inmem_truncate(SMgrRelation reln, ForkNumber forknum, - BlockNumber nblocks); + BlockNumber old_blocks, BlockNumber nblocks); static void inmem_immedsync(SMgrRelation reln, ForkNumber forknum); #if PG_MAJORVERSION_NUM >= 17 static void inmem_registersync(SMgrRelation reln, ForkNumber forknum); @@ -345,7 +345,7 @@ inmem_nblocks(SMgrRelation reln, ForkNumber forknum) * inmem_truncate() -- Truncate relation to specified number of blocks. */ static void -inmem_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) +inmem_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber old_blocks, BlockNumber nblocks) { } diff --git a/poetry.lock b/poetry.lock index c471d3e69c..d66c3aae7a 100644 --- a/poetry.lock +++ b/poetry.lock @@ -412,6 +412,7 @@ files = [ [package.dependencies] botocore-stubs = "*" +mypy-boto3-kms = {version = ">=1.26.0,<1.27.0", optional = true, markers = "extra == \"kms\""} mypy-boto3-s3 = {version = ">=1.26.0,<1.27.0", optional = true, markers = "extra == \"s3\""} types-s3transfer = "*" typing-extensions = ">=4.1.0" @@ -1030,52 +1031,56 @@ files = [ [[package]] name = "cryptography" -version = "43.0.1" +version = "44.0.1" description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." 
optional = false -python-versions = ">=3.7" +python-versions = "!=3.9.0,!=3.9.1,>=3.7" groups = ["main"] files = [ - {file = "cryptography-43.0.1-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:8385d98f6a3bf8bb2d65a73e17ed87a3ba84f6991c155691c51112075f9ffc5d"}, - {file = "cryptography-43.0.1-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:27e613d7077ac613e399270253259d9d53872aaf657471473ebfc9a52935c062"}, - {file = "cryptography-43.0.1-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:68aaecc4178e90719e95298515979814bda0cbada1256a4485414860bd7ab962"}, - {file = "cryptography-43.0.1-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:de41fd81a41e53267cb020bb3a7212861da53a7d39f863585d13ea11049cf277"}, - {file = "cryptography-43.0.1-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:f98bf604c82c416bc829e490c700ca1553eafdf2912a91e23a79d97d9801372a"}, - {file = "cryptography-43.0.1-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:61ec41068b7b74268fa86e3e9e12b9f0c21fcf65434571dbb13d954bceb08042"}, - {file = "cryptography-43.0.1-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:014f58110f53237ace6a408b5beb6c427b64e084eb451ef25a28308270086494"}, - {file = "cryptography-43.0.1-cp37-abi3-win32.whl", hash = "sha256:2bd51274dcd59f09dd952afb696bf9c61a7a49dfc764c04dd33ef7a6b502a1e2"}, - {file = "cryptography-43.0.1-cp37-abi3-win_amd64.whl", hash = "sha256:666ae11966643886c2987b3b721899d250855718d6d9ce41b521252a17985f4d"}, - {file = "cryptography-43.0.1-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:ac119bb76b9faa00f48128b7f5679e1d8d437365c5d26f1c2c3f0da4ce1b553d"}, - {file = "cryptography-43.0.1-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1bbcce1a551e262dfbafb6e6252f1ae36a248e615ca44ba302df077a846a8806"}, - {file = "cryptography-43.0.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:58d4e9129985185a06d849aa6df265bdd5a74ca6e1b736a77959b498e0505b85"}, - {file = "cryptography-43.0.1-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:d03a475165f3134f773d1388aeb19c2d25ba88b6a9733c5c590b9ff7bbfa2e0c"}, - {file = "cryptography-43.0.1-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:511f4273808ab590912a93ddb4e3914dfd8a388fed883361b02dea3791f292e1"}, - {file = "cryptography-43.0.1-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:80eda8b3e173f0f247f711eef62be51b599b5d425c429b5d4ca6a05e9e856baa"}, - {file = "cryptography-43.0.1-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:38926c50cff6f533f8a2dae3d7f19541432610d114a70808f0926d5aaa7121e4"}, - {file = "cryptography-43.0.1-cp39-abi3-win32.whl", hash = "sha256:a575913fb06e05e6b4b814d7f7468c2c660e8bb16d8d5a1faf9b33ccc569dd47"}, - {file = "cryptography-43.0.1-cp39-abi3-win_amd64.whl", hash = "sha256:d75601ad10b059ec832e78823b348bfa1a59f6b8d545db3a24fd44362a1564cb"}, - {file = "cryptography-43.0.1-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:ea25acb556320250756e53f9e20a4177515f012c9eaea17eb7587a8c4d8ae034"}, - {file = "cryptography-43.0.1-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:c1332724be35d23a854994ff0b66530119500b6053d0bd3363265f7e5e77288d"}, - {file = "cryptography-43.0.1-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:fba1007b3ef89946dbbb515aeeb41e30203b004f0b4b00e5e16078b518563289"}, - {file = "cryptography-43.0.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:5b43d1ea6b378b54a1dc99dd8a2b5be47658fe9a7ce0a58ff0b55f4b43ef2b84"}, - {file = 
"cryptography-43.0.1-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:88cce104c36870d70c49c7c8fd22885875d950d9ee6ab54df2745f83ba0dc365"}, - {file = "cryptography-43.0.1-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:9d3cdb25fa98afdd3d0892d132b8d7139e2c087da1712041f6b762e4f807cc96"}, - {file = "cryptography-43.0.1-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:e710bf40870f4db63c3d7d929aa9e09e4e7ee219e703f949ec4073b4294f6172"}, - {file = "cryptography-43.0.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:7c05650fe8023c5ed0d46793d4b7d7e6cd9c04e68eabe5b0aeea836e37bdcec2"}, - {file = "cryptography-43.0.1.tar.gz", hash = "sha256:203e92a75716d8cfb491dc47c79e17d0d9207ccffcbcb35f598fbe463ae3444d"}, + {file = "cryptography-44.0.1-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:bf688f615c29bfe9dfc44312ca470989279f0e94bb9f631f85e3459af8efc009"}, + {file = "cryptography-44.0.1-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dd7c7e2d71d908dc0f8d2027e1604102140d84b155e658c20e8ad1304317691f"}, + {file = "cryptography-44.0.1-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:887143b9ff6bad2b7570da75a7fe8bbf5f65276365ac259a5d2d5147a73775f2"}, + {file = "cryptography-44.0.1-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:322eb03ecc62784536bc173f1483e76747aafeb69c8728df48537eb431cd1911"}, + {file = "cryptography-44.0.1-cp37-abi3-manylinux_2_28_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:21377472ca4ada2906bc313168c9dc7b1d7ca417b63c1c3011d0c74b7de9ae69"}, + {file = "cryptography-44.0.1-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:df978682c1504fc93b3209de21aeabf2375cb1571d4e61907b3e7a2540e83026"}, + {file = "cryptography-44.0.1-cp37-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:eb3889330f2a4a148abead555399ec9a32b13b7c8ba969b72d8e500eb7ef84cd"}, + {file = "cryptography-44.0.1-cp37-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:8e6a85a93d0642bd774460a86513c5d9d80b5c002ca9693e63f6e540f1815ed0"}, + {file = "cryptography-44.0.1-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:6f76fdd6fd048576a04c5210d53aa04ca34d2ed63336d4abd306d0cbe298fddf"}, + {file = "cryptography-44.0.1-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:6c8acf6f3d1f47acb2248ec3ea261171a671f3d9428e34ad0357148d492c7864"}, + {file = "cryptography-44.0.1-cp37-abi3-win32.whl", hash = "sha256:24979e9f2040c953a94bf3c6782e67795a4c260734e5264dceea65c8f4bae64a"}, + {file = "cryptography-44.0.1-cp37-abi3-win_amd64.whl", hash = "sha256:fd0ee90072861e276b0ff08bd627abec29e32a53b2be44e41dbcdf87cbee2b00"}, + {file = "cryptography-44.0.1-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:a2d8a7045e1ab9b9f803f0d9531ead85f90c5f2859e653b61497228b18452008"}, + {file = "cryptography-44.0.1-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b8272f257cf1cbd3f2e120f14c68bff2b6bdfcc157fafdee84a1b795efd72862"}, + {file = "cryptography-44.0.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1e8d181e90a777b63f3f0caa836844a1182f1f265687fac2115fcf245f5fbec3"}, + {file = "cryptography-44.0.1-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:436df4f203482f41aad60ed1813811ac4ab102765ecae7a2bbb1dbb66dcff5a7"}, + {file = "cryptography-44.0.1-cp39-abi3-manylinux_2_28_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:4f422e8c6a28cf8b7f883eb790695d6d45b0c385a2583073f3cec434cc705e1a"}, + {file = "cryptography-44.0.1-cp39-abi3-manylinux_2_28_x86_64.whl", hash = 
"sha256:72198e2b5925155497a5a3e8c216c7fb3e64c16ccee11f0e7da272fa93b35c4c"}, + {file = "cryptography-44.0.1-cp39-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:2a46a89ad3e6176223b632056f321bc7de36b9f9b93b2cc1cccf935a3849dc62"}, + {file = "cryptography-44.0.1-cp39-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:53f23339864b617a3dfc2b0ac8d5c432625c80014c25caac9082314e9de56f41"}, + {file = "cryptography-44.0.1-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:888fcc3fce0c888785a4876ca55f9f43787f4c5c1cc1e2e0da71ad481ff82c5b"}, + {file = "cryptography-44.0.1-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:00918d859aa4e57db8299607086f793fa7813ae2ff5a4637e318a25ef82730f7"}, + {file = "cryptography-44.0.1-cp39-abi3-win32.whl", hash = "sha256:9b336599e2cb77b1008cb2ac264b290803ec5e8e89d618a5e978ff5eb6f715d9"}, + {file = "cryptography-44.0.1-cp39-abi3-win_amd64.whl", hash = "sha256:e403f7f766ded778ecdb790da786b418a9f2394f36e8cc8b796cc056ab05f44f"}, + {file = "cryptography-44.0.1-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:1f9a92144fa0c877117e9748c74501bea842f93d21ee00b0cf922846d9d0b183"}, + {file = "cryptography-44.0.1-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:610a83540765a8d8ce0f351ce42e26e53e1f774a6efb71eb1b41eb01d01c3d12"}, + {file = "cryptography-44.0.1-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:5fed5cd6102bb4eb843e3315d2bf25fede494509bddadb81e03a859c1bc17b83"}, + {file = "cryptography-44.0.1-pp310-pypy310_pp73-manylinux_2_34_aarch64.whl", hash = "sha256:f4daefc971c2d1f82f03097dc6f216744a6cd2ac0f04c68fb935ea2ba2a0d420"}, + {file = "cryptography-44.0.1-pp310-pypy310_pp73-manylinux_2_34_x86_64.whl", hash = "sha256:94f99f2b943b354a5b6307d7e8d19f5c423a794462bde2bf310c770ba052b1c4"}, + {file = "cryptography-44.0.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:d9c5b9f698a83c8bd71e0f4d3f9f839ef244798e5ffe96febfa9714717db7af7"}, + {file = "cryptography-44.0.1.tar.gz", hash = "sha256:f51f5705ab27898afda1aaa430f34ad90dc117421057782022edf0600bec5f14"}, ] [package.dependencies] cffi = {version = ">=1.12", markers = "platform_python_implementation != \"PyPy\""} [package.extras] -docs = ["sphinx (>=5.3.0)", "sphinx-rtd-theme (>=1.1.1)"] -docstest = ["pyenchant (>=1.6.11)", "readme-renderer", "sphinxcontrib-spelling (>=4.0.1)"] -nox = ["nox"] -pep8test = ["check-sdist", "click", "mypy", "ruff"] -sdist = ["build"] +docs = ["sphinx (>=5.3.0)", "sphinx-rtd-theme (>=3.0.0)"] +docstest = ["pyenchant (>=3)", "readme-renderer (>=30.0)", "sphinxcontrib-spelling (>=7.3.1)"] +nox = ["nox (>=2024.4.15)", "nox[uv] (>=2024.3.2)"] +pep8test = ["check-sdist", "click (>=8.0.1)", "mypy (>=1.4)", "ruff (>=0.3.6)"] +sdist = ["build (>=1.0.0)"] ssh = ["bcrypt (>=3.1.5)"] -test = ["certifi", "cryptography-vectors (==43.0.1)", "pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-xdist"] +test = ["certifi (>=2024)", "cryptography-vectors (==44.0.1)", "pretend (>=0.7)", "pytest (>=7.4.0)", "pytest-benchmark (>=4.0)", "pytest-cov (>=2.10.1)", "pytest-xdist (>=3.5.0)"] test-randomorder = ["pytest-randomly"] [[package]] @@ -2018,6 +2023,18 @@ install-types = ["pip"] mypyc = ["setuptools (>=50)"] reports = ["lxml"] +[[package]] +name = "mypy-boto3-kms" +version = "1.26.147" +description = "Type annotations for boto3.KMS 1.26.147 service generated with mypy-boto3-builder 7.14.5" +optional = false +python-versions = ">=3.7" +groups = ["main"] +files = [ + {file = "mypy-boto3-kms-1.26.147.tar.gz", hash = 
"sha256:816a4d1bb0585e1b9620a3f96c1d69a06f53b7b5621858579dd77c60dbb5fa5c"}, + {file = "mypy_boto3_kms-1.26.147-py3-none-any.whl", hash = "sha256:493f0db674a25c88769f5cb8ab8ac00d3dda5dfc903d5cda34c990ee64689f79"}, +] + [[package]] name = "mypy-boto3-s3" version = "1.26.0.post1" @@ -2754,18 +2771,18 @@ pytest = ">=5,<8" [[package]] name = "pytest-timeout" -version = "2.1.0" +version = "2.3.1" description = "pytest plugin to abort hanging tests" optional = false -python-versions = ">=3.6" +python-versions = ">=3.7" groups = ["main"] files = [ - {file = "pytest-timeout-2.1.0.tar.gz", hash = "sha256:c07ca07404c612f8abbe22294b23c368e2e5104b521c1790195561f37e1ac3d9"}, - {file = "pytest_timeout-2.1.0-py3-none-any.whl", hash = "sha256:f6f50101443ce70ad325ceb4473c4255e9d74e3c7cd0ef827309dfa4c0d975c6"}, + {file = "pytest-timeout-2.3.1.tar.gz", hash = "sha256:12397729125c6ecbdaca01035b9e5239d4db97352320af155b3f5de1ba5165d9"}, + {file = "pytest_timeout-2.3.1-py3-none-any.whl", hash = "sha256:68188cb703edfc6a18fad98dc25a3c61e9f24d644b0b70f33af545219fc7813e"}, ] [package.dependencies] -pytest = ">=5.0.0" +pytest = ">=7.0.0" [[package]] name = "pytest-xdist" @@ -3803,4 +3820,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.1" python-versions = "^3.11" -content-hash = "4dc3165fe22c0e0f7a030ea0f8a680ae2ff74561d8658c393abbe9112caaf5d7" +content-hash = "00ddc42c32e235b6171845fc066dcab078282ed832cd464d5e8a0afa959dd04a" diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index d7880ea7b9..6a381bf094 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -19,7 +19,6 @@ aws-config.workspace = true aws-sdk-iam.workspace = true aws-sigv4.workspace = true base64.workspace = true -boxcar = "0.2.8" bstr.workspace = true bytes = { workspace = true, features = ["serde"] } camino.workspace = true @@ -37,6 +36,7 @@ hex.workspace = true hmac.workspace = true hostname.workspace = true http.workspace = true +http-utils.workspace = true humantime.workspace = true humantime-serde.workspace = true hyper0.workspace = true @@ -62,7 +62,6 @@ postgres_backend.workspace = true postgres-client = { package = "tokio-postgres2", path = "../libs/proxy/tokio-postgres2" } postgres-protocol = { package = "postgres-protocol2", path = "../libs/proxy/postgres-protocol2" } pq_proto.workspace = true -prometheus.workspace = true rand.workspace = true regex.workspace = true remote_storage = { version = "0.1", path = "../libs/remote_storage/" } @@ -80,7 +79,6 @@ sha2 = { workspace = true, features = ["asm", "oid"] } smol_str.workspace = true smallvec.workspace = true socket2.workspace = true -strum.workspace = true strum_macros.workspace = true subtle.workspace = true thiserror.workspace = true @@ -94,7 +92,6 @@ tracing-subscriber.workspace = true tracing-utils.workspace = true tracing.workspace = true tracing-log.workspace = true -tracing-serde.workspace = true tracing-opentelemetry.workspace = true try-lock.workspace = true typed-json.workspace = true diff --git a/proxy/README.md b/proxy/README.md index ecd54fbbd8..1156bfd352 100644 --- a/proxy/README.md +++ b/proxy/README.md @@ -37,8 +37,8 @@ To play with it locally one may start proxy over a local postgres installation If both postgres and proxy are running you may send a SQL query: ```console -curl -k -X POST 'https://proxy.localtest.me:4444/sql' \ - -H 'Neon-Connection-String: postgres://stas:pass@proxy.localtest.me:4444/postgres' \ +curl -k -X POST 'https://proxy.local.neon.build:4444/sql' \ + -H 'Neon-Connection-String: postgres://stas:pass@proxy.local.neon.build:4444/postgres' \ -H 
'Content-Type: application/json' \ --data '{ "query":"SELECT $1::int[] as arr, $2::jsonb as obj, 42 as num", @@ -104,7 +104,7 @@ cases where it is hard to use rows represented as objects (e.g. when several fie ## Test proxy locally -Proxy determines project name from the subdomain, request to the `round-rice-566201.somedomain.tld` will be routed to the project named `round-rice-566201`. Unfortunately, `/etc/hosts` does not support domain wildcards, so we can use *.localtest.me` which resolves to `127.0.0.1`. +Proxy determines project name from the subdomain, request to the `round-rice-566201.somedomain.tld` will be routed to the project named `round-rice-566201`. Unfortunately, `/etc/hosts` does not support domain wildcards, so we can use `*.local.neon.build` which resolves to `127.0.0.1`. We will need to have a postgres instance. Assuming that we have set up docker we can set it up as follows: ```sh @@ -125,7 +125,7 @@ docker exec -it proxy-postgres psql -U postgres -c "CREATE ROLE proxy WITH SUPER Let's create self-signed certificate by running: ```sh -openssl req -new -x509 -days 365 -nodes -text -out server.crt -keyout server.key -subj "/CN=*.localtest.me" +openssl req -new -x509 -days 365 -nodes -text -out server.crt -keyout server.key -subj "/CN=*.local.neon.build" ``` Then we need to build proxy with 'testing' feature and run, e.g.: @@ -136,5 +136,5 @@ RUST_LOG=proxy cargo run -p proxy --bin proxy --features testing -- --auth-backe Now from client you can start a new session: ```sh -PGSSLROOTCERT=./server.crt psql "postgresql://proxy:password@endpoint.localtest.me:4432/postgres?sslmode=verify-full" +PGSSLROOTCERT=./server.crt psql "postgresql://proxy:password@endpoint.local.neon.build:4432/postgres?sslmode=verify-full" ``` diff --git a/proxy/src/auth/backend/console_redirect.rs b/proxy/src/auth/backend/console_redirect.rs index 9be29c38c9..7503b4eac9 100644 --- a/proxy/src/auth/backend/console_redirect.rs +++ b/proxy/src/auth/backend/console_redirect.rs @@ -140,9 +140,8 @@ async fn authenticate( let (psql_session_id, waiter) = loop { let psql_session_id = new_psql_session_id(); - match control_plane::mgmt::get_waiter(&psql_session_id) { - Ok(waiter) => break (psql_session_id, waiter), - Err(_e) => continue, + if let Ok(waiter) = control_plane::mgmt::get_waiter(&psql_session_id) { + break (psql_session_id, waiter); } }; diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs index e05a693cee..5d032c0deb 100644 --- a/proxy/src/auth/backend/jwt.rs +++ b/proxy/src/auth/backend/jwt.rs @@ -220,11 +220,11 @@ async fn fetch_jwks( } impl JwkCacheEntryLock { - async fn acquire_permit<'a>(self: &'a Arc) -> JwkRenewalPermit<'a> { + async fn acquire_permit(self: &Arc) -> JwkRenewalPermit<'_> { JwkRenewalPermit::acquire_permit(self).await } - fn try_acquire_permit<'a>(self: &'a Arc) -> Option> { + fn try_acquire_permit(self: &Arc) -> Option> { JwkRenewalPermit::try_acquire_permit(self) } @@ -393,7 +393,7 @@ impl JwkCacheEntryLock { verify_rsa_signature(header_payload.as_bytes(), &sig, key, &header.algorithm)?; } key => return Err(JwtError::UnsupportedKeyType(key.into())), - }; + } tracing::debug!(?payload, "JWT signature valid with claims"); @@ -510,7 +510,7 @@ fn verify_rsa_signature( key.verify(data, &sig)?; } _ => return Err(JwtError::InvalidRsaSigningAlgorithm), - }; + } Ok(()) } diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs index 7ef096207a..dc595844c5 100644 --- a/proxy/src/auth/backend/mod.rs +++ b/proxy/src/auth/backend/mod.rs @@ -108,6
+108,10 @@ impl Backend<'_, T> { Self::Local(_) => panic!("Local backend has no API"), } } + + pub(crate) fn is_local_proxy(&self) -> bool { + matches!(self, Self::Local(_)) + } } impl<'a, T> Backend<'a, T> { diff --git a/proxy/src/bin/local_proxy.rs b/proxy/src/bin/local_proxy.rs index 7a855bf54b..8f225dc1e0 100644 --- a/proxy/src/bin/local_proxy.rs +++ b/proxy/src/bin/local_proxy.rs @@ -1,416 +1,7 @@ -use std::net::SocketAddr; -use std::pin::pin; -use std::str::FromStr; -use std::sync::Arc; -use std::time::Duration; - -use anyhow::{bail, ensure, Context}; -use camino::{Utf8Path, Utf8PathBuf}; -use compute_api::spec::LocalProxySpec; -use futures::future::Either; -use proxy::auth::backend::jwt::JwkCache; -use proxy::auth::backend::local::{LocalBackend, JWKS_ROLE_MAP}; -use proxy::auth::{self}; -use proxy::cancellation::CancellationHandler; -use proxy::config::{ - self, AuthenticationConfig, ComputeConfig, HttpConfig, ProxyConfig, RetryConfig, -}; -use proxy::control_plane::locks::ApiLocks; -use proxy::control_plane::messages::{EndpointJwksResponse, JwksSettings}; -use proxy::http::health_server::AppMetrics; -use proxy::intern::RoleNameInt; -use proxy::metrics::{Metrics, ThreadPoolMetrics}; -use proxy::rate_limiter::{ - BucketRateLimiter, EndpointRateLimiter, LeakyBucketConfig, RateBucketInfo, -}; -use proxy::scram::threadpool::ThreadPool; -use proxy::serverless::cancel_set::CancelSet; -use proxy::serverless::{self, GlobalConnPoolOptions}; -use proxy::tls::client_config::compute_client_config_with_root_certs; -use proxy::types::RoleName; -use proxy::url::ApiUrl; - -project_git_version!(GIT_VERSION); -project_build_tag!(BUILD_TAG); - -use clap::Parser; -use thiserror::Error; -use tokio::net::TcpListener; -use tokio::sync::Notify; -use tokio::task::JoinSet; -use tokio_util::sync::CancellationToken; -use tracing::{debug, error, info, warn}; -use utils::sentry_init::init_sentry; -use utils::{pid_file, project_build_tag, project_git_version}; - #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; -/// Neon proxy/router -#[derive(Parser)] -#[command(version = GIT_VERSION, about)] -struct LocalProxyCliArgs { - /// listen for incoming metrics connections on ip:port - #[clap(long, default_value = "127.0.0.1:7001")] - metrics: String, - /// listen for incoming http connections on ip:port - #[clap(long)] - http: String, - /// timeout for the TLS handshake - #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] - handshake_timeout: tokio::time::Duration, - /// lock for `connect_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable). - #[clap(long, default_value = config::ConcurrencyLockOptions::DEFAULT_OPTIONS_CONNECT_COMPUTE_LOCK)] - connect_compute_lock: String, - #[clap(flatten)] - sql_over_http: SqlOverHttpArgs, - /// User rate limiter max number of requests per second. - /// - /// Provided in the form `@`. - /// Can be given multiple times for different bucket sizes. - #[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)] - user_rps_limit: Vec, - /// Whether the auth rate limiter actually takes effect (for testing) - #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] - auth_rate_limit_enabled: bool, - /// Authentication rate limiter max number of hashes per second. 
- #[clap(long, default_values_t = RateBucketInfo::DEFAULT_AUTH_SET)] - auth_rate_limit: Vec, - /// The IP subnet to use when considering whether two IP addresses are considered the same. - #[clap(long, default_value_t = 64)] - auth_rate_limit_ip_subnet: u8, - /// Whether to retry the connection to the compute node - #[clap(long, default_value = config::RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES)] - connect_to_compute_retry: String, - /// Address of the postgres server - #[clap(long, default_value = "127.0.0.1:5432")] - postgres: SocketAddr, - /// Address of the compute-ctl api service - #[clap(long, default_value = "http://127.0.0.1:3080/")] - compute_ctl: ApiUrl, - /// Path of the local proxy config file - #[clap(long, default_value = "./local_proxy.json")] - config_path: Utf8PathBuf, - /// Path of the local proxy PID file - #[clap(long, default_value = "./local_proxy.pid")] - pid_path: Utf8PathBuf, -} - -#[derive(clap::Args, Clone, Copy, Debug)] -struct SqlOverHttpArgs { - /// How many connections to pool for each endpoint. Excess connections are discarded - #[clap(long, default_value_t = 200)] - sql_over_http_pool_max_total_conns: usize, - - /// How long pooled connections should remain idle for before closing - #[clap(long, default_value = "5m", value_parser = humantime::parse_duration)] - sql_over_http_idle_timeout: tokio::time::Duration, - - #[clap(long, default_value_t = 100)] - sql_over_http_client_conn_threshold: u64, - - #[clap(long, default_value_t = 16)] - sql_over_http_cancel_set_shards: usize, - - #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB - sql_over_http_max_request_size_bytes: usize, - - #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB - sql_over_http_max_response_size_bytes: usize, -} - #[tokio::main] async fn main() -> anyhow::Result<()> { - let _logging_guard = proxy::logging::init_local_proxy()?; - let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook(); - let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); - - Metrics::install(Arc::new(ThreadPoolMetrics::new(0))); - - // TODO: refactor these to use labels - debug!("Version: {GIT_VERSION}"); - debug!("Build_tag: {BUILD_TAG}"); - let neon_metrics = ::metrics::NeonMetrics::new(::metrics::BuildInfo { - revision: GIT_VERSION, - build_tag: BUILD_TAG, - }); - - let jemalloc = match proxy::jemalloc::MetricRecorder::new() { - Ok(t) => Some(t), - Err(e) => { - tracing::error!(error = ?e, "could not start jemalloc metrics loop"); - None - } - }; - - let args = LocalProxyCliArgs::parse(); - let config = build_config(&args)?; - let auth_backend = build_auth_backend(&args)?; - - // before we bind to any ports, write the process ID to a file - // so that compute-ctl can find our process later - // in order to trigger the appropriate SIGHUP on config change. - // - // This also claims a "lock" that makes sure only one instance - // of local_proxy runs at a time. - let _process_guard = loop { - match pid_file::claim_for_current_process(&args.pid_path) { - Ok(guard) => break guard, - Err(e) => { - // compute-ctl might have tried to read the pid-file to let us - // know about some config change. We should try again. 
- error!(path=?args.pid_path, "could not claim PID file guard: {e:?}"); - tokio::time::sleep(Duration::from_secs(1)).await; - } - } - }; - - let metrics_listener = TcpListener::bind(args.metrics).await?.into_std()?; - let http_listener = TcpListener::bind(args.http).await?; - let shutdown = CancellationToken::new(); - - // todo: should scale with CU - let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards( - LeakyBucketConfig { - rps: 10.0, - max: 100.0, - }, - 16, - )); - - let mut maintenance_tasks = JoinSet::new(); - - let refresh_config_notify = Arc::new(Notify::new()); - maintenance_tasks.spawn(proxy::signals::handle(shutdown.clone(), { - let refresh_config_notify = Arc::clone(&refresh_config_notify); - move || { - refresh_config_notify.notify_one(); - } - })); - - // trigger the first config load **after** setting up the signal hook - // to avoid the race condition where: - // 1. No config file registered when local_proxy starts up - // 2. The config file is written but the signal hook is not yet received - // 3. local_proxy completes startup but has no config loaded, despite there being a registerd config. - refresh_config_notify.notify_one(); - tokio::spawn(refresh_config_loop(args.config_path, refresh_config_notify)); - - maintenance_tasks.spawn(proxy::http::health_server::task_main( - metrics_listener, - AppMetrics { - jemalloc, - neon_metrics, - proxy: proxy::metrics::Metrics::get(), - }, - )); - - let task = serverless::task_main( - config, - auth_backend, - http_listener, - shutdown.clone(), - Arc::new(CancellationHandler::new(&config.connect_to_compute, None)), - endpoint_rate_limiter, - ); - - match futures::future::select(pin!(maintenance_tasks.join_next()), pin!(task)).await { - // exit immediately on maintenance task completion - Either::Left((Some(res), _)) => match proxy::error::flatten_err(res)? {}, - // exit with error immediately if all maintenance tasks have ceased (should be caught by branch above) - Either::Left((None, _)) => bail!("no maintenance tasks running. invalid state"), - // exit immediately on client task error - Either::Right((res, _)) => res?, - } - - Ok(()) -} - -/// ProxyConfig is created at proxy startup, and lives forever. 
-fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { - let config::ConcurrencyLockOptions { - shards, - limiter, - epoch, - timeout, - } = args.connect_compute_lock.parse()?; - info!( - ?limiter, - shards, - ?epoch, - "Using NodeLocks (connect_compute)" - ); - let connect_compute_locks = ApiLocks::new( - "connect_compute_lock", - limiter, - shards, - timeout, - epoch, - &Metrics::get().proxy.connect_compute_lock, - )?; - - let http_config = HttpConfig { - accept_websockets: false, - pool_options: GlobalConnPoolOptions { - gc_epoch: Duration::from_secs(60), - pool_shards: 2, - idle_timeout: args.sql_over_http.sql_over_http_idle_timeout, - opt_in: false, - - max_conns_per_endpoint: args.sql_over_http.sql_over_http_pool_max_total_conns, - max_total_conns: args.sql_over_http.sql_over_http_pool_max_total_conns, - }, - cancel_set: CancelSet::new(args.sql_over_http.sql_over_http_cancel_set_shards), - client_conn_threshold: args.sql_over_http.sql_over_http_client_conn_threshold, - max_request_size_bytes: args.sql_over_http.sql_over_http_max_request_size_bytes, - max_response_size_bytes: args.sql_over_http.sql_over_http_max_response_size_bytes, - }; - - let compute_config = ComputeConfig { - retry: RetryConfig::parse(RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES)?, - tls: Arc::new(compute_client_config_with_root_certs()?), - timeout: Duration::from_secs(2), - }; - - Ok(Box::leak(Box::new(ProxyConfig { - tls_config: None, - metric_collection: None, - http_config, - authentication_config: AuthenticationConfig { - jwks_cache: JwkCache::default(), - thread_pool: ThreadPool::new(0), - scram_protocol_timeout: Duration::from_secs(10), - rate_limiter_enabled: false, - rate_limiter: BucketRateLimiter::new(vec![]), - rate_limit_ip_subnet: 64, - ip_allowlist_check_enabled: true, - is_vpc_acccess_proxy: false, - is_auth_broker: false, - accept_jwts: true, - console_redirect_confirmation_timeout: Duration::ZERO, - }, - proxy_protocol_v2: config::ProxyProtocolV2::Rejected, - handshake_timeout: Duration::from_secs(10), - region: "local".into(), - wake_compute_retry_config: RetryConfig::parse(RetryConfig::WAKE_COMPUTE_DEFAULT_VALUES)?, - connect_compute_locks, - connect_to_compute: compute_config, - }))) -} - -/// auth::Backend is created at proxy startup, and lives forever. -fn build_auth_backend( - args: &LocalProxyCliArgs, -) -> anyhow::Result<&'static auth::Backend<'static, ()>> { - let auth_backend = proxy::auth::Backend::Local(proxy::auth::backend::MaybeOwned::Owned( - LocalBackend::new(args.postgres, args.compute_ctl.clone()), - )); - - Ok(Box::leak(Box::new(auth_backend))) -} - -#[derive(Error, Debug)] -enum RefreshConfigError { - #[error(transparent)] - Read(#[from] std::io::Error), - #[error(transparent)] - Parse(#[from] serde_json::Error), - #[error(transparent)] - Validate(anyhow::Error), -} - -async fn refresh_config_loop(path: Utf8PathBuf, rx: Arc) { - let mut init = true; - loop { - rx.notified().await; - - match refresh_config_inner(&path).await { - Ok(()) => {} - // don't log for file not found errors if this is the first time we are checking - // for computes that don't use local_proxy, this is not an error. 
- Err(RefreshConfigError::Read(e)) - if init && e.kind() == std::io::ErrorKind::NotFound => - { - debug!(error=?e, ?path, "could not read config file"); - } - Err(e) => { - error!(error=?e, ?path, "could not read config file"); - } - } - - init = false; - } -} - -async fn refresh_config_inner(path: &Utf8Path) -> Result<(), RefreshConfigError> { - let bytes = tokio::fs::read(&path).await?; - let data: LocalProxySpec = serde_json::from_slice(&bytes)?; - - let mut jwks_set = vec![]; - - fn parse_jwks_settings(jwks: compute_api::spec::JwksSettings) -> anyhow::Result { - let mut jwks_url = url::Url::from_str(&jwks.jwks_url).context("parsing JWKS url")?; - - ensure!( - jwks_url.has_authority() - && (jwks_url.scheme() == "http" || jwks_url.scheme() == "https"), - "Invalid JWKS url. Must be HTTP", - ); - - ensure!( - jwks_url.host().is_some_and(|h| h != url::Host::Domain("")), - "Invalid JWKS url. No domain listed", - ); - - // clear username, password and ports - jwks_url - .set_username("") - .expect("url can be a base and has a valid host and is not a file. should not error"); - jwks_url - .set_password(None) - .expect("url can be a base and has a valid host and is not a file. should not error"); - // local testing is hard if we need to have a specific restricted port - if cfg!(not(feature = "testing")) { - jwks_url.set_port(None).expect( - "url can be a base and has a valid host and is not a file. should not error", - ); - } - - // clear query params - jwks_url.set_fragment(None); - jwks_url.query_pairs_mut().clear().finish(); - - if jwks_url.scheme() != "https" { - // local testing is hard if we need to set up https support. - if cfg!(not(feature = "testing")) { - jwks_url - .set_scheme("https") - .expect("should not error to set the scheme to https if it was http"); - } else { - warn!(scheme = jwks_url.scheme(), "JWKS url is not HTTPS"); - } - } - - Ok(JwksSettings { - id: jwks.id, - jwks_url, - provider_name: jwks.provider_name, - jwt_audience: jwks.jwt_audience, - role_names: jwks - .role_names - .into_iter() - .map(RoleName::from) - .map(|s| RoleNameInt::from(&s)) - .collect(), - }) - } - - for jwks in data.jwks.into_iter().flatten() { - jwks_set.push(parse_jwks_settings(jwks).map_err(RefreshConfigError::Validate)?); - } - - info!("successfully loaded new config"); - JWKS_ROLE_MAP.store(Some(Arc::new(EndpointJwksResponse { jwks: jwks_set }))); - - Ok(()) + proxy::binary::local_proxy::run().await } diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index 97d870a83a..0c3326af85 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -1,299 +1,10 @@ -/// A stand-alone program that routes connections, e.g. from -/// `aaa--bbb--1234.external.domain` to `aaa.bbb.internal.domain:1234`. -/// -/// This allows connecting to pods/services running in the same Kubernetes cluster from -/// the outside. Similar to an ingress controller for HTTPS. 
-use std::{net::SocketAddr, sync::Arc}; - -use anyhow::{anyhow, bail, ensure, Context}; -use clap::Arg; -use futures::future::Either; -use futures::TryFutureExt; -use itertools::Itertools; -use proxy::context::RequestContext; -use proxy::metrics::{Metrics, ThreadPoolMetrics}; -use proxy::protocol2::ConnectionInfo; -use proxy::proxy::{copy_bidirectional_client_compute, run_until_cancelled, ErrorSource}; -use proxy::stream::{PqStream, Stream}; -use proxy::tls::TlsServerEndPoint; -use rustls::crypto::ring; -use rustls::pki_types::PrivateKeyDer; -use tokio::io::{AsyncRead, AsyncWrite}; -use tokio::net::TcpListener; -use tokio_util::sync::CancellationToken; -use tracing::{error, info, Instrument}; -use utils::project_git_version; -use utils::sentry_init::init_sentry; - -project_git_version!(GIT_VERSION); - -fn cli() -> clap::Command { - clap::Command::new("Neon proxy/router") - .version(GIT_VERSION) - .arg( - Arg::new("listen") - .short('l') - .long("listen") - .help("listen for incoming client connections on ip:port") - .default_value("127.0.0.1:4432"), - ) - .arg( - Arg::new("tls-key") - .short('k') - .long("tls-key") - .help("path to TLS key for client postgres connections") - .required(true), - ) - .arg( - Arg::new("tls-cert") - .short('c') - .long("tls-cert") - .help("path to TLS cert for client postgres connections") - .required(true), - ) - .arg( - Arg::new("dest") - .short('d') - .long("destination") - .help("append this domain zone to the SNI hostname to get the destination address") - .required(true), - ) -} +//! A stand-alone program that routes connections, e.g. from +//! `aaa--bbb--1234.external.domain` to `aaa.bbb.internal.domain:1234`. +//! +//! This allows connecting to pods/services running in the same Kubernetes cluster from +//! the outside. Similar to an ingress controller for HTTPS. #[tokio::main] async fn main() -> anyhow::Result<()> { - let _logging_guard = proxy::logging::init().await?; - let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook(); - let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); - - Metrics::install(Arc::new(ThreadPoolMetrics::new(0))); - - let args = cli().get_matches(); - let destination: String = args.get_one::("dest").unwrap().parse()?; - - // Configure TLS - let (tls_config, tls_server_end_point): (Arc, TlsServerEndPoint) = match ( - args.get_one::("tls-key"), - args.get_one::("tls-cert"), - ) { - (Some(key_path), Some(cert_path)) => { - let key = { - let key_bytes = std::fs::read(key_path).context("TLS key file")?; - - let mut keys = - rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..]).collect_vec(); - - ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len()); - PrivateKeyDer::Pkcs8( - keys.pop() - .unwrap() - .context(format!("Failed to read TLS keys at '{key_path}'"))?, - ) - }; - - let cert_chain_bytes = std::fs::read(cert_path) - .context(format!("Failed to read TLS cert file at '{cert_path}.'"))?; - - let cert_chain: Vec<_> = { - rustls_pemfile::certs(&mut &cert_chain_bytes[..]) - .try_collect() - .with_context(|| { - format!("Failed to read TLS certificate chain from bytes from file at '{cert_path}'.") - })? 
- }; - - // needed for channel bindings - let first_cert = cert_chain.first().context("missing certificate")?; - let tls_server_end_point = TlsServerEndPoint::new(first_cert)?; - - let tls_config = - rustls::ServerConfig::builder_with_provider(Arc::new(ring::default_provider())) - .with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12]) - .context("ring should support TLS1.2 and TLS1.3")? - .with_no_client_auth() - .with_single_cert(cert_chain, key)? - .into(); - - (tls_config, tls_server_end_point) - } - _ => bail!("tls-key and tls-cert must be specified"), - }; - - // Start listening for incoming client connections - let proxy_address: SocketAddr = args.get_one::("listen").unwrap().parse()?; - info!("Starting sni router on {proxy_address}"); - let proxy_listener = TcpListener::bind(proxy_address).await?; - - let cancellation_token = CancellationToken::new(); - - let main = tokio::spawn(task_main( - Arc::new(destination), - tls_config, - tls_server_end_point, - proxy_listener, - cancellation_token.clone(), - )); - let signals_task = tokio::spawn(proxy::signals::handle(cancellation_token, || {})); - - // the signal task cant ever succeed. - // the main task can error, or can succeed on cancellation. - // we want to immediately exit on either of these cases - let signal = match futures::future::select(signals_task, main).await { - Either::Left((res, _)) => proxy::error::flatten_err(res)?, - Either::Right((res, _)) => return proxy::error::flatten_err(res), - }; - - // maintenance tasks return `Infallible` success values, this is an impossible value - // so this match statically ensures that there are no possibilities for that value - match signal {} -} - -async fn task_main( - dest_suffix: Arc, - tls_config: Arc, - tls_server_end_point: TlsServerEndPoint, - listener: tokio::net::TcpListener, - cancellation_token: CancellationToken, -) -> anyhow::Result<()> { - // When set for the server socket, the keepalive setting - // will be inherited by all accepted client sockets. - socket2::SockRef::from(&listener).set_keepalive(true)?; - - let connections = tokio_util::task::task_tracker::TaskTracker::new(); - - while let Some(accept_result) = - run_until_cancelled(listener.accept(), &cancellation_token).await - { - let (socket, peer_addr) = accept_result?; - - let session_id = uuid::Uuid::new_v4(); - let tls_config = Arc::clone(&tls_config); - let dest_suffix = Arc::clone(&dest_suffix); - - connections.spawn( - async move { - socket - .set_nodelay(true) - .context("failed to set socket option")?; - - info!(%peer_addr, "serving"); - let ctx = RequestContext::new( - session_id, - ConnectionInfo { - addr: peer_addr, - extra: None, - }, - proxy::metrics::Protocol::SniRouter, - "sni", - ); - handle_client(ctx, dest_suffix, tls_config, tls_server_end_point, socket).await - } - .unwrap_or_else(|e| { - // Acknowledge that the task has finished with an error. 
- error!("per-client task finished with an error: {e:#}"); - }) - .instrument(tracing::info_span!("handle_client", ?session_id)), - ); - } - - connections.close(); - drop(listener); - - connections.wait().await; - - info!("all client connections have finished"); - Ok(()) -} - -const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)"; - -async fn ssl_handshake( - ctx: &RequestContext, - raw_stream: S, - tls_config: Arc, - tls_server_end_point: TlsServerEndPoint, -) -> anyhow::Result> { - let mut stream = PqStream::new(Stream::from_raw(raw_stream)); - - let msg = stream.read_startup_packet().await?; - use pq_proto::FeStartupPacket::*; - - match msg { - SslRequest { direct: false } => { - stream - .write_message(&pq_proto::BeMessage::EncryptionResponse(true)) - .await?; - - // Upgrade raw stream into a secure TLS-backed stream. - // NOTE: We've consumed `tls`; this fact will be used later. - - let (raw, read_buf) = stream.into_inner(); - // TODO: Normally, client doesn't send any data before - // server says TLS handshake is ok and read_buf is empty. - // However, you could imagine pipelining of postgres - // SSLRequest + TLS ClientHello in one hunk similar to - // pipelining in our node js driver. We should probably - // support that by chaining read_buf with the stream. - if !read_buf.is_empty() { - bail!("data is sent before server replied with EncryptionResponse"); - } - - Ok(Stream::Tls { - tls: Box::new( - raw.upgrade(tls_config, !ctx.has_private_peer_addr()) - .await?, - ), - tls_server_end_point, - }) - } - unexpected => { - info!( - ?unexpected, - "unexpected startup packet, rejecting connection" - ); - stream - .throw_error_str(ERR_INSECURE_CONNECTION, proxy::error::ErrorKind::User) - .await? - } - } -} - -async fn handle_client( - ctx: RequestContext, - dest_suffix: Arc, - tls_config: Arc, - tls_server_end_point: TlsServerEndPoint, - stream: impl AsyncRead + AsyncWrite + Unpin, -) -> anyhow::Result<()> { - let mut tls_stream = ssl_handshake(&ctx, stream, tls_config, tls_server_end_point).await?; - - // Cut off first part of the SNI domain - // We receive required destination details in the format of - // `{k8s_service_name}--{k8s_namespace}--{port}.non-sni-domain` - let sni = tls_stream.sni_hostname().ok_or(anyhow!("SNI missing"))?; - let dest: Vec<&str> = sni - .split_once('.') - .context("invalid SNI")? - .0 - .splitn(3, "--") - .collect(); - let port = dest[2].parse::().context("invalid port")?; - let destination = format!("{}.{}.{}:{}", dest[0], dest[1], dest_suffix, port); - - info!("destination: {}", destination); - - let mut client = tokio::net::TcpStream::connect(destination).await?; - - // doesn't yet matter as pg-sni-router doesn't report analytics logs - ctx.set_success(); - ctx.log_connect(); - - // Starting from here we only proxy the client's traffic. 
- info!("performing the proxy pass..."); - - match copy_bidirectional_client_compute(&mut tls_stream, &mut client).await { - Ok(_) => Ok(()), - Err(ErrorSource::Client(err)) => Err(err).context("client"), - Err(ErrorSource::Compute(err)) => Err(err).context("compute"), - } + proxy::binary::pg_sni_router::run().await } diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index de685a82c6..7d4b44841d 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -1,831 +1,7 @@ -use std::net::SocketAddr; -use std::pin::pin; -use std::sync::Arc; -use std::time::Duration; - -use anyhow::bail; -use futures::future::Either; -use proxy::auth::backend::jwt::JwkCache; -use proxy::auth::backend::{AuthRateLimiter, ConsoleRedirectBackend, MaybeOwned}; -use proxy::cancellation::{handle_cancel_messages, CancellationHandler}; -use proxy::config::{ - self, remote_storage_from_toml, AuthenticationConfig, CacheOptions, ComputeConfig, HttpConfig, - ProjectInfoCacheOptions, ProxyConfig, ProxyProtocolV2, -}; -use proxy::context::parquet::ParquetUploadArgs; -use proxy::http::health_server::AppMetrics; -use proxy::metrics::Metrics; -use proxy::rate_limiter::{ - EndpointRateLimiter, LeakyBucketConfig, RateBucketInfo, WakeComputeRateLimiter, -}; -use proxy::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider; -use proxy::redis::kv_ops::RedisKVClient; -use proxy::redis::{elasticache, notifications}; -use proxy::scram::threadpool::ThreadPool; -use proxy::serverless::cancel_set::CancelSet; -use proxy::serverless::GlobalConnPoolOptions; -use proxy::tls::client_config::compute_client_config_with_root_certs; -use proxy::{auth, control_plane, http, serverless, usage_metrics}; -use remote_storage::RemoteStorageConfig; -use tokio::net::TcpListener; -use tokio::task::JoinSet; -use tokio_util::sync::CancellationToken; -use tracing::{info, warn, Instrument}; -use utils::sentry_init::init_sentry; -use utils::{project_build_tag, project_git_version}; - -project_git_version!(GIT_VERSION); -project_build_tag!(BUILD_TAG); - -use clap::{Parser, ValueEnum}; - #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; -#[derive(Clone, Debug, ValueEnum)] -enum AuthBackendType { - #[value(name("cplane-v1"), alias("control-plane"))] - ControlPlaneV1, - - #[value(name("link"), alias("control-redirect"))] - ConsoleRedirect, - - #[cfg(feature = "testing")] - Postgres, -} - -/// Neon proxy/router -#[derive(Parser)] -#[command(version = GIT_VERSION, about)] -struct ProxyCliArgs { - /// Name of the region this proxy is deployed in - #[clap(long, default_value_t = String::new())] - region: String, - /// listen for incoming client connections on ip:port - #[clap(short, long, default_value = "127.0.0.1:4432")] - proxy: String, - #[clap(value_enum, long, default_value_t = AuthBackendType::ConsoleRedirect)] - auth_backend: AuthBackendType, - /// listen for management callback connection on ip:port - #[clap(short, long, default_value = "127.0.0.1:7000")] - mgmt: String, - /// listen for incoming http connections (metrics, etc) on ip:port - #[clap(long, default_value = "127.0.0.1:7001")] - http: String, - /// listen for incoming wss connections on ip:port - #[clap(long)] - wss: Option, - /// redirect unauthenticated users to the given uri in case of console redirect auth - #[clap(short, long, default_value = "http://localhost:3000/psql_session/")] - uri: String, - /// cloud API endpoint for authenticating users - #[clap( - short, - long, - default_value = 
"http://localhost:3000/authenticate_proxy_request/" - )] - auth_endpoint: String, - /// JWT used to connect to control plane. - #[clap( - long, - value_name = "JWT", - default_value = "", - env = "NEON_PROXY_TO_CONTROLPLANE_TOKEN" - )] - control_plane_token: Arc, - /// if this is not local proxy, this toggles whether we accept jwt or passwords for http - #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] - is_auth_broker: bool, - /// path to TLS key for client postgres connections - /// - /// tls-key and tls-cert are for backwards compatibility, we can put all certs in one dir - #[clap(short = 'k', long, alias = "ssl-key")] - tls_key: Option, - /// path to TLS cert for client postgres connections - /// - /// tls-key and tls-cert are for backwards compatibility, we can put all certs in one dir - #[clap(short = 'c', long, alias = "ssl-cert")] - tls_cert: Option, - /// Allow writing TLS session keys to the given file pointed to by the environment variable `SSLKEYLOGFILE`. - #[clap(long, alias = "allow-ssl-keylogfile")] - allow_tls_keylogfile: bool, - /// path to directory with TLS certificates for client postgres connections - #[clap(long)] - certs_dir: Option, - /// timeout for the TLS handshake - #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] - handshake_timeout: tokio::time::Duration, - /// http endpoint to receive periodic metric updates - #[clap(long)] - metric_collection_endpoint: Option, - /// how often metrics should be sent to a collection endpoint - #[clap(long)] - metric_collection_interval: Option, - /// cache for `wake_compute` api method (use `size=0` to disable) - #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] - wake_compute_cache: String, - /// lock for `wake_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable). - #[clap(long, default_value = config::ConcurrencyLockOptions::DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK)] - wake_compute_lock: String, - /// lock for `connect_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable). - #[clap(long, default_value = config::ConcurrencyLockOptions::DEFAULT_OPTIONS_CONNECT_COMPUTE_LOCK)] - connect_compute_lock: String, - #[clap(flatten)] - sql_over_http: SqlOverHttpArgs, - /// timeout for scram authentication protocol - #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] - scram_protocol_timeout: tokio::time::Duration, - /// size of the threadpool for password hashing - #[clap(long, default_value_t = 4)] - scram_thread_pool_size: u8, - /// Endpoint rate limiter max number of requests per second. - /// - /// Provided in the form `@`. - /// Can be given multiple times for different bucket sizes. - #[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)] - endpoint_rps_limit: Vec, - /// Wake compute rate limiter max number of requests per second. - #[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)] - wake_compute_limit: Vec, - /// Whether the auth rate limiter actually takes effect (for testing) - #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] - auth_rate_limit_enabled: bool, - /// Authentication rate limiter max number of hashes per second. 
- #[clap(long, default_values_t = RateBucketInfo::DEFAULT_AUTH_SET)] - auth_rate_limit: Vec, - /// The IP subnet to use when considering whether two IP addresses are considered the same. - #[clap(long, default_value_t = 64)] - auth_rate_limit_ip_subnet: u8, - /// Redis rate limiter max number of requests per second. - #[clap(long, default_values_t = RateBucketInfo::DEFAULT_REDIS_SET)] - redis_rps_limit: Vec, - /// Cancellation channel size (max queue size for redis kv client) - #[clap(long, default_value = "1024")] - cancellation_ch_size: usize, - /// cache for `allowed_ips` (use `size=0` to disable) - #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] - allowed_ips_cache: String, - /// cache for `role_secret` (use `size=0` to disable) - #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] - role_secret_cache: String, - /// redis url for notifications (if empty, redis_host:port will be used for both notifications and streaming connections) - #[clap(long)] - redis_notifications: Option, - /// what from the available authentications type to use for the regional redis we have. Supported are "irsa" and "plain". - #[clap(long, default_value = "irsa")] - redis_auth_type: String, - /// redis host for streaming connections (might be different from the notifications host) - #[clap(long)] - redis_host: Option, - /// redis port for streaming connections (might be different from the notifications host) - #[clap(long)] - redis_port: Option, - /// redis cluster name, used in aws elasticache - #[clap(long)] - redis_cluster_name: Option, - /// redis user_id, used in aws elasticache - #[clap(long)] - redis_user_id: Option, - /// aws region to retrieve credentials - #[clap(long, default_value_t = String::new())] - aws_region: String, - /// cache for `project_info` (use `size=0` to disable) - #[clap(long, default_value = config::ProjectInfoCacheOptions::CACHE_DEFAULT_OPTIONS)] - project_info_cache: String, - /// cache for all valid endpoints - #[clap(long, default_value = config::EndpointCacheConfig::CACHE_DEFAULT_OPTIONS)] - endpoint_cache_config: String, - #[clap(flatten)] - parquet_upload: ParquetUploadArgs, - - /// interval for backup metric collection - #[clap(long, default_value = "10m", value_parser = humantime::parse_duration)] - metric_backup_collection_interval: std::time::Duration, - /// remote storage configuration for backup metric collection - /// Encoded as toml (same format as pageservers), eg - /// `{bucket_name='the-bucket',bucket_region='us-east-1',prefix_in_bucket='proxy',endpoint='http://minio:9000'}` - #[clap(long, value_parser = remote_storage_from_toml)] - metric_backup_collection_remote_storage: Option, - /// chunk size for backup metric collection - /// Size of each event is no more than 400 bytes, so 2**22 is about 200MB before the compression. 
- #[clap(long, default_value = "4194304")] - metric_backup_collection_chunk_size: usize, - /// Whether to retry the connection to the compute node - #[clap(long, default_value = config::RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES)] - connect_to_compute_retry: String, - /// Whether to retry the wake_compute request - #[clap(long, default_value = config::RetryConfig::WAKE_COMPUTE_DEFAULT_VALUES)] - wake_compute_retry: String, - - /// Configure if this is a private access proxy for the POC: In that case the proxy will ignore the IP allowlist - #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] - is_private_access_proxy: bool, - - /// Configure whether all incoming requests have a Proxy Protocol V2 packet. - // TODO(conradludgate): switch default to rejected or required once we've updated all deployments - #[clap(value_enum, long, default_value_t = ProxyProtocolV2::Supported)] - proxy_protocol_v2: ProxyProtocolV2, - - /// Time the proxy waits for the webauth session to be confirmed by the control plane. - // TODO: rename to `console_redirect_confirmation_timeout`. - #[clap(long, default_value = "2m", value_parser = humantime::parse_duration)] - webauth_confirmation_timeout: std::time::Duration, -} - -#[derive(clap::Args, Clone, Copy, Debug)] -struct SqlOverHttpArgs { - /// timeout for http connection requests - #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] - sql_over_http_timeout: tokio::time::Duration, - - /// Whether the SQL over http pool is opt-in - #[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] - sql_over_http_pool_opt_in: bool, - - /// How many connections to pool for each endpoint. Excess connections are discarded - #[clap(long, default_value_t = 20)] - sql_over_http_pool_max_conns_per_endpoint: usize, - - /// How many connections to pool for each endpoint. Excess connections are discarded - #[clap(long, default_value_t = 20000)] - sql_over_http_pool_max_total_conns: usize, - - /// How long pooled connections should remain idle for before closing - #[clap(long, default_value = "5m", value_parser = humantime::parse_duration)] - sql_over_http_idle_timeout: tokio::time::Duration, - - /// Duration each shard will wait on average before a GC sweep. - /// A longer time will causes sweeps to take longer but will interfere less frequently. - #[clap(long, default_value = "10m", value_parser = humantime::parse_duration)] - sql_over_http_pool_gc_epoch: tokio::time::Duration, - - /// How many shards should the global pool have. Must be a power of two. 
- /// More shards will introduce less contention for pool operations, but can - /// increase memory used by the pool - #[clap(long, default_value_t = 128)] - sql_over_http_pool_shards: usize, - - #[clap(long, default_value_t = 10000)] - sql_over_http_client_conn_threshold: u64, - - #[clap(long, default_value_t = 64)] - sql_over_http_cancel_set_shards: usize, - - #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB - sql_over_http_max_request_size_bytes: usize, - - #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB - sql_over_http_max_response_size_bytes: usize, -} - #[tokio::main] async fn main() -> anyhow::Result<()> { - let _logging_guard = proxy::logging::init().await?; - let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook(); - let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); - - // TODO: refactor these to use labels - info!("Version: {GIT_VERSION}"); - info!("Build_tag: {BUILD_TAG}"); - let neon_metrics = ::metrics::NeonMetrics::new(::metrics::BuildInfo { - revision: GIT_VERSION, - build_tag: BUILD_TAG, - }); - - let jemalloc = match proxy::jemalloc::MetricRecorder::new() { - Ok(t) => Some(t), - Err(e) => { - tracing::error!(error = ?e, "could not start jemalloc metrics loop"); - None - } - }; - - let args = ProxyCliArgs::parse(); - let config = build_config(&args)?; - let auth_backend = build_auth_backend(&args)?; - - match auth_backend { - Either::Left(auth_backend) => info!("Authentication backend: {auth_backend}"), - Either::Right(auth_backend) => info!("Authentication backend: {auth_backend:?}"), - }; - info!("Using region: {}", args.aws_region); - - // TODO: untangle the config args - let regional_redis_client = match (args.redis_auth_type.as_str(), &args.redis_notifications) { - ("plain", redis_url) => match redis_url { - None => { - bail!("plain auth requires redis_notifications to be set"); - } - Some(url) => Some( - ConnectionWithCredentialsProvider::new_with_static_credentials(url.to_string()), - ), - }, - ("irsa", _) => match (&args.redis_host, args.redis_port) { - (Some(host), Some(port)) => Some( - ConnectionWithCredentialsProvider::new_with_credentials_provider( - host.to_string(), - port, - elasticache::CredentialsProvider::new( - args.aws_region, - args.redis_cluster_name, - args.redis_user_id, - ) - .await, - ), - ), - (None, None) => { - warn!("irsa auth requires redis-host and redis-port to be set, continuing without regional_redis_client"); - None - } - _ => { - bail!("redis-host and redis-port must be specified together"); - } - }, - _ => { - bail!("unknown auth type given"); - } - }; - - let redis_notifications_client = if let Some(url) = args.redis_notifications { - Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url.to_string())) - } else { - regional_redis_client.clone() - }; - - // Check that we can bind to address before further initialization - let http_address: SocketAddr = args.http.parse()?; - info!("Starting http on {http_address}"); - let http_listener = TcpListener::bind(http_address).await?.into_std()?; - - let mgmt_address: SocketAddr = args.mgmt.parse()?; - info!("Starting mgmt on {mgmt_address}"); - let mgmt_listener = TcpListener::bind(mgmt_address).await?; - - let proxy_listener = if !args.is_auth_broker { - let proxy_address: SocketAddr = args.proxy.parse()?; - info!("Starting proxy on {proxy_address}"); - - Some(TcpListener::bind(proxy_address).await?) - } else { - None - }; - - // TODO: rename the argument to something like serverless. 
- // It now covers more than just websockets, it also covers SQL over HTTP. - let serverless_listener = if let Some(serverless_address) = args.wss { - let serverless_address: SocketAddr = serverless_address.parse()?; - info!("Starting wss on {serverless_address}"); - Some(TcpListener::bind(serverless_address).await?) - } else if args.is_auth_broker { - bail!("wss arg must be present for auth-broker") - } else { - None - }; - - let cancellation_token = CancellationToken::new(); - - let redis_rps_limit = Vec::leak(args.redis_rps_limit.clone()); - RateBucketInfo::validate(redis_rps_limit)?; - - let redis_kv_client = regional_redis_client - .as_ref() - .map(|redis_publisher| RedisKVClient::new(redis_publisher.clone(), redis_rps_limit)); - - // channel size should be higher than redis client limit to avoid blocking - let cancel_ch_size = args.cancellation_ch_size; - let (tx_cancel, rx_cancel) = tokio::sync::mpsc::channel(cancel_ch_size); - let cancellation_handler = Arc::new(CancellationHandler::new( - &config.connect_to_compute, - Some(tx_cancel), - )); - - // bit of a hack - find the min rps and max rps supported and turn it into - // leaky bucket config instead - let max = args - .endpoint_rps_limit - .iter() - .map(|x| x.rps()) - .max_by(f64::total_cmp) - .unwrap_or(EndpointRateLimiter::DEFAULT.max); - let rps = args - .endpoint_rps_limit - .iter() - .map(|x| x.rps()) - .min_by(f64::total_cmp) - .unwrap_or(EndpointRateLimiter::DEFAULT.rps); - let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards( - LeakyBucketConfig { rps, max }, - 64, - )); - - // client facing tasks. these will exit on error or on cancellation - // cancellation returns Ok(()) - let mut client_tasks = JoinSet::new(); - match auth_backend { - Either::Left(auth_backend) => { - if let Some(proxy_listener) = proxy_listener { - client_tasks.spawn(proxy::proxy::task_main( - config, - auth_backend, - proxy_listener, - cancellation_token.clone(), - cancellation_handler.clone(), - endpoint_rate_limiter.clone(), - )); - } - - if let Some(serverless_listener) = serverless_listener { - client_tasks.spawn(serverless::task_main( - config, - auth_backend, - serverless_listener, - cancellation_token.clone(), - cancellation_handler.clone(), - endpoint_rate_limiter.clone(), - )); - } - } - Either::Right(auth_backend) => { - if let Some(proxy_listener) = proxy_listener { - client_tasks.spawn(proxy::console_redirect_proxy::task_main( - config, - auth_backend, - proxy_listener, - cancellation_token.clone(), - cancellation_handler.clone(), - )); - } - } - } - - client_tasks.spawn(proxy::context::parquet::worker( - cancellation_token.clone(), - args.parquet_upload, - )); - - // maintenance tasks. these never return unless there's an error - let mut maintenance_tasks = JoinSet::new(); - maintenance_tasks.spawn(proxy::signals::handle(cancellation_token.clone(), || {})); - maintenance_tasks.spawn(http::health_server::task_main( - http_listener, - AppMetrics { - jemalloc, - neon_metrics, - proxy: proxy::metrics::Metrics::get(), - }, - )); - maintenance_tasks.spawn(control_plane::mgmt::task_main(mgmt_listener)); - - if let Some(metrics_config) = &config.metric_collection { - // TODO: Add gc regardles of the metric collection being enabled. 
- maintenance_tasks.spawn(usage_metrics::task_main(metrics_config)); - } - - if let Either::Left(auth::Backend::ControlPlane(api, _)) = &auth_backend { - if let proxy::control_plane::client::ControlPlaneClient::ProxyV1(api) = &**api { - match (redis_notifications_client, regional_redis_client.clone()) { - (None, None) => {} - (client1, client2) => { - let cache = api.caches.project_info.clone(); - if let Some(client) = client1 { - maintenance_tasks.spawn(notifications::task_main( - client, - cache.clone(), - args.region.clone(), - )); - } - if let Some(client) = client2 { - maintenance_tasks.spawn(notifications::task_main( - client, - cache.clone(), - args.region.clone(), - )); - } - maintenance_tasks.spawn(async move { cache.clone().gc_worker().await }); - } - } - - if let Some(mut redis_kv_client) = redis_kv_client { - maintenance_tasks.spawn(async move { - redis_kv_client.try_connect().await?; - handle_cancel_messages(&mut redis_kv_client, rx_cancel).await - }); - } - - if let Some(regional_redis_client) = regional_redis_client { - let cache = api.caches.endpoints_cache.clone(); - let con = regional_redis_client; - let span = tracing::info_span!("endpoints_cache"); - maintenance_tasks.spawn( - async move { cache.do_read(con, cancellation_token.clone()).await } - .instrument(span), - ); - } - } - } - - let maintenance = loop { - // get one complete task - match futures::future::select( - pin!(maintenance_tasks.join_next()), - pin!(client_tasks.join_next()), - ) - .await - { - // exit immediately on maintenance task completion - Either::Left((Some(res), _)) => break proxy::error::flatten_err(res)?, - // exit with error immediately if all maintenance tasks have ceased (should be caught by branch above) - Either::Left((None, _)) => bail!("no maintenance tasks running. invalid state"), - // exit immediately on client task error - Either::Right((Some(res), _)) => proxy::error::flatten_err(res)?, - // exit if all our client tasks have shutdown gracefully - Either::Right((None, _)) => return Ok(()), - } - }; - - // maintenance tasks return Infallible success values, this is an impossible value - // so this match statically ensures that there are no possibilities for that value - match maintenance {} -} - -/// ProxyConfig is created at proxy startup, and lives forever. 
-fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { - let thread_pool = ThreadPool::new(args.scram_thread_pool_size); - Metrics::install(thread_pool.metrics.clone()); - - let tls_config = match (&args.tls_key, &args.tls_cert) { - (Some(key_path), Some(cert_path)) => Some(config::configure_tls( - key_path, - cert_path, - args.certs_dir.as_ref(), - args.allow_tls_keylogfile, - )?), - (None, None) => None, - _ => bail!("either both or neither tls-key and tls-cert must be specified"), - }; - - let backup_metric_collection_config = config::MetricBackupCollectionConfig { - interval: args.metric_backup_collection_interval, - remote_storage_config: args.metric_backup_collection_remote_storage.clone(), - chunk_size: args.metric_backup_collection_chunk_size, - }; - - let metric_collection = match ( - &args.metric_collection_endpoint, - &args.metric_collection_interval, - ) { - (Some(endpoint), Some(interval)) => Some(config::MetricCollectionConfig { - endpoint: endpoint.parse()?, - interval: humantime::parse_duration(interval)?, - backup_metric_collection_config, - }), - (None, None) => None, - _ => bail!( - "either both or neither metric-collection-endpoint \ - and metric-collection-interval must be specified" - ), - }; - - let config::ConcurrencyLockOptions { - shards, - limiter, - epoch, - timeout, - } = args.connect_compute_lock.parse()?; - info!( - ?limiter, - shards, - ?epoch, - "Using NodeLocks (connect_compute)" - ); - let connect_compute_locks = control_plane::locks::ApiLocks::new( - "connect_compute_lock", - limiter, - shards, - timeout, - epoch, - &Metrics::get().proxy.connect_compute_lock, - )?; - - let http_config = HttpConfig { - accept_websockets: !args.is_auth_broker, - pool_options: GlobalConnPoolOptions { - max_conns_per_endpoint: args.sql_over_http.sql_over_http_pool_max_conns_per_endpoint, - gc_epoch: args.sql_over_http.sql_over_http_pool_gc_epoch, - pool_shards: args.sql_over_http.sql_over_http_pool_shards, - idle_timeout: args.sql_over_http.sql_over_http_idle_timeout, - opt_in: args.sql_over_http.sql_over_http_pool_opt_in, - max_total_conns: args.sql_over_http.sql_over_http_pool_max_total_conns, - }, - cancel_set: CancelSet::new(args.sql_over_http.sql_over_http_cancel_set_shards), - client_conn_threshold: args.sql_over_http.sql_over_http_client_conn_threshold, - max_request_size_bytes: args.sql_over_http.sql_over_http_max_request_size_bytes, - max_response_size_bytes: args.sql_over_http.sql_over_http_max_response_size_bytes, - }; - let authentication_config = AuthenticationConfig { - jwks_cache: JwkCache::default(), - thread_pool, - scram_protocol_timeout: args.scram_protocol_timeout, - rate_limiter_enabled: args.auth_rate_limit_enabled, - rate_limiter: AuthRateLimiter::new(args.auth_rate_limit.clone()), - rate_limit_ip_subnet: args.auth_rate_limit_ip_subnet, - ip_allowlist_check_enabled: !args.is_private_access_proxy, - is_vpc_acccess_proxy: args.is_private_access_proxy, - is_auth_broker: args.is_auth_broker, - accept_jwts: args.is_auth_broker, - console_redirect_confirmation_timeout: args.webauth_confirmation_timeout, - }; - - let compute_config = ComputeConfig { - retry: config::RetryConfig::parse(&args.connect_to_compute_retry)?, - tls: Arc::new(compute_client_config_with_root_certs()?), - timeout: Duration::from_secs(2), - }; - - let config = ProxyConfig { - tls_config, - metric_collection, - http_config, - authentication_config, - proxy_protocol_v2: args.proxy_protocol_v2, - handshake_timeout: args.handshake_timeout, - region: 
args.region.clone(), - wake_compute_retry_config: config::RetryConfig::parse(&args.wake_compute_retry)?, - connect_compute_locks, - connect_to_compute: compute_config, - }; - - let config = Box::leak(Box::new(config)); - - tokio::spawn(config.connect_compute_locks.garbage_collect_worker()); - - Ok(config) -} - -/// auth::Backend is created at proxy startup, and lives forever. -fn build_auth_backend( - args: &ProxyCliArgs, -) -> anyhow::Result, &'static ConsoleRedirectBackend>> { - match &args.auth_backend { - AuthBackendType::ControlPlaneV1 => { - let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; - let project_info_cache_config: ProjectInfoCacheOptions = - args.project_info_cache.parse()?; - let endpoint_cache_config: config::EndpointCacheConfig = - args.endpoint_cache_config.parse()?; - - info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}"); - info!( - "Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}" - ); - info!("Using EndpointCacheConfig with options={endpoint_cache_config:?}"); - let caches = Box::leak(Box::new(control_plane::caches::ApiCaches::new( - wake_compute_cache_config, - project_info_cache_config, - endpoint_cache_config, - ))); - - let config::ConcurrencyLockOptions { - shards, - limiter, - epoch, - timeout, - } = args.wake_compute_lock.parse()?; - info!(?limiter, shards, ?epoch, "Using NodeLocks (wake_compute)"); - let locks = Box::leak(Box::new(control_plane::locks::ApiLocks::new( - "wake_compute_lock", - limiter, - shards, - timeout, - epoch, - &Metrics::get().wake_compute_lock, - )?)); - tokio::spawn(locks.garbage_collect_worker()); - - let url: proxy::url::ApiUrl = args.auth_endpoint.parse()?; - - let endpoint = http::Endpoint::new(url, http::new_client()); - - let mut wake_compute_rps_limit = args.wake_compute_limit.clone(); - RateBucketInfo::validate(&mut wake_compute_rps_limit)?; - let wake_compute_endpoint_rate_limiter = - Arc::new(WakeComputeRateLimiter::new(wake_compute_rps_limit)); - - let api = control_plane::client::cplane_proxy_v1::NeonControlPlaneClient::new( - endpoint, - args.control_plane_token.clone(), - caches, - locks, - wake_compute_endpoint_rate_limiter, - ); - - let api = control_plane::client::ControlPlaneClient::ProxyV1(api); - let auth_backend = auth::Backend::ControlPlane(MaybeOwned::Owned(api), ()); - let config = Box::leak(Box::new(auth_backend)); - - Ok(Either::Left(config)) - } - - #[cfg(feature = "testing")] - AuthBackendType::Postgres => { - let url = args.auth_endpoint.parse()?; - let api = control_plane::client::mock::MockControlPlane::new( - url, - !args.is_private_access_proxy, - ); - let api = control_plane::client::ControlPlaneClient::PostgresMock(api); - - let auth_backend = auth::Backend::ControlPlane(MaybeOwned::Owned(api), ()); - - let config = Box::leak(Box::new(auth_backend)); - - Ok(Either::Left(config)) - } - - AuthBackendType::ConsoleRedirect => { - let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; - let project_info_cache_config: ProjectInfoCacheOptions = - args.project_info_cache.parse()?; - let endpoint_cache_config: config::EndpointCacheConfig = - args.endpoint_cache_config.parse()?; - - info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}"); - info!( - "Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}" - ); - info!("Using EndpointCacheConfig with options={endpoint_cache_config:?}"); - let caches = 
Box::leak(Box::new(control_plane::caches::ApiCaches::new( - wake_compute_cache_config, - project_info_cache_config, - endpoint_cache_config, - ))); - - let config::ConcurrencyLockOptions { - shards, - limiter, - epoch, - timeout, - } = args.wake_compute_lock.parse()?; - info!(?limiter, shards, ?epoch, "Using NodeLocks (wake_compute)"); - let locks = Box::leak(Box::new(control_plane::locks::ApiLocks::new( - "wake_compute_lock", - limiter, - shards, - timeout, - epoch, - &Metrics::get().wake_compute_lock, - )?)); - - let url = args.uri.clone().parse()?; - let ep_url: proxy::url::ApiUrl = args.auth_endpoint.parse()?; - let endpoint = http::Endpoint::new(ep_url, http::new_client()); - let mut wake_compute_rps_limit = args.wake_compute_limit.clone(); - RateBucketInfo::validate(&mut wake_compute_rps_limit)?; - let wake_compute_endpoint_rate_limiter = - Arc::new(WakeComputeRateLimiter::new(wake_compute_rps_limit)); - - // Since we use only get_allowed_ips_and_secret() wake_compute_endpoint_rate_limiter - // and locks are not used in ConsoleRedirectBackend, - // but they are required by the NeonControlPlaneClient - let api = control_plane::client::cplane_proxy_v1::NeonControlPlaneClient::new( - endpoint, - args.control_plane_token.clone(), - caches, - locks, - wake_compute_endpoint_rate_limiter, - ); - - let backend = ConsoleRedirectBackend::new(url, api); - let config = Box::leak(Box::new(backend)); - - Ok(Either::Right(config)) - } - } -} - -#[cfg(test)] -mod tests { - use std::time::Duration; - - use clap::Parser; - use proxy::rate_limiter::RateBucketInfo; - - #[test] - fn parse_endpoint_rps_limit() { - let config = super::ProxyCliArgs::parse_from([ - "proxy", - "--endpoint-rps-limit", - "100@1s", - "--endpoint-rps-limit", - "20@30s", - ]); - - assert_eq!( - config.endpoint_rps_limit, - vec![ - RateBucketInfo::new(100, Duration::from_secs(1)), - RateBucketInfo::new(20, Duration::from_secs(30)), - ] - ); - } + proxy::binary::proxy::run().await } diff --git a/proxy/src/binary/local_proxy.rs b/proxy/src/binary/local_proxy.rs new file mode 100644 index 0000000000..4ab11f828c --- /dev/null +++ b/proxy/src/binary/local_proxy.rs @@ -0,0 +1,410 @@ +use std::net::SocketAddr; +use std::pin::pin; +use std::str::FromStr; +use std::sync::Arc; +use std::time::Duration; + +use anyhow::{bail, ensure, Context}; +use camino::{Utf8Path, Utf8PathBuf}; +use clap::Parser; +use compute_api::spec::LocalProxySpec; +use futures::future::Either; +use thiserror::Error; +use tokio::net::TcpListener; +use tokio::sync::Notify; +use tokio::task::JoinSet; +use tokio_util::sync::CancellationToken; +use tracing::{debug, error, info, warn}; +use utils::sentry_init::init_sentry; +use utils::{pid_file, project_build_tag, project_git_version}; + +use crate::auth::backend::jwt::JwkCache; +use crate::auth::backend::local::{LocalBackend, JWKS_ROLE_MAP}; +use crate::auth::{self}; +use crate::cancellation::CancellationHandler; +use crate::config::{ + self, AuthenticationConfig, ComputeConfig, HttpConfig, ProxyConfig, RetryConfig, +}; +use crate::control_plane::locks::ApiLocks; +use crate::control_plane::messages::{EndpointJwksResponse, JwksSettings}; +use crate::http::health_server::AppMetrics; +use crate::intern::RoleNameInt; +use crate::metrics::{Metrics, ThreadPoolMetrics}; +use crate::rate_limiter::{ + BucketRateLimiter, EndpointRateLimiter, LeakyBucketConfig, RateBucketInfo, +}; +use crate::scram::threadpool::ThreadPool; +use crate::serverless::cancel_set::CancelSet; +use crate::serverless::{self, GlobalConnPoolOptions}; +use 
crate::tls::client_config::compute_client_config_with_root_certs;
+use crate::types::RoleName;
+use crate::url::ApiUrl;
+
+project_git_version!(GIT_VERSION);
+project_build_tag!(BUILD_TAG);
+
+/// Neon proxy/router
+#[derive(Parser)]
+#[command(version = GIT_VERSION, about)]
+struct LocalProxyCliArgs {
+    /// listen for incoming metrics connections on ip:port
+    #[clap(long, default_value = "127.0.0.1:7001")]
+    metrics: String,
+    /// listen for incoming http connections on ip:port
+    #[clap(long)]
+    http: String,
+    /// timeout for the TLS handshake
+    #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
+    handshake_timeout: tokio::time::Duration,
+    /// lock for `connect_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable).
+    #[clap(long, default_value = config::ConcurrencyLockOptions::DEFAULT_OPTIONS_CONNECT_COMPUTE_LOCK)]
+    connect_compute_lock: String,
+    #[clap(flatten)]
+    sql_over_http: SqlOverHttpArgs,
+    /// User rate limiter max number of requests per second.
+    ///
+    /// Provided in the form `<Requests Per Second>@<Bucket Duration Size>`.
+    /// Can be given multiple times for different bucket sizes.
+    #[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)]
+    user_rps_limit: Vec<RateBucketInfo>,
+    /// Whether the auth rate limiter actually takes effect (for testing)
+    #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
+    auth_rate_limit_enabled: bool,
+    /// Authentication rate limiter max number of hashes per second.
+    #[clap(long, default_values_t = RateBucketInfo::DEFAULT_AUTH_SET)]
+    auth_rate_limit: Vec<RateBucketInfo>,
+    /// The IP subnet to use when considering whether two IP addresses are considered the same.
+    #[clap(long, default_value_t = 64)]
+    auth_rate_limit_ip_subnet: u8,
+    /// Whether to retry the connection to the compute node
+    #[clap(long, default_value = config::RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES)]
+    connect_to_compute_retry: String,
+    /// Address of the postgres server
+    #[clap(long, default_value = "127.0.0.1:5432")]
+    postgres: SocketAddr,
+    /// Address of the internal compute-ctl api service
+    #[clap(long, default_value = "http://127.0.0.1:3081/")]
+    compute_ctl: ApiUrl,
+    /// Path of the local proxy config file
+    #[clap(long, default_value = "./local_proxy.json")]
+    config_path: Utf8PathBuf,
+    /// Path of the local proxy PID file
+    #[clap(long, default_value = "./local_proxy.pid")]
+    pid_path: Utf8PathBuf,
+}
+
+#[derive(clap::Args, Clone, Copy, Debug)]
+struct SqlOverHttpArgs {
+    /// How many connections to pool for each endpoint. Excess connections are discarded
+    #[clap(long, default_value_t = 200)]
+    sql_over_http_pool_max_total_conns: usize,
+
+    /// How long pooled connections should remain idle for before closing
+    #[clap(long, default_value = "5m", value_parser = humantime::parse_duration)]
+    sql_over_http_idle_timeout: tokio::time::Duration,
+
+    #[clap(long, default_value_t = 100)]
+    sql_over_http_client_conn_threshold: u64,
+
+    #[clap(long, default_value_t = 16)]
+    sql_over_http_cancel_set_shards: usize,
+
+    #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB
+    sql_over_http_max_request_size_bytes: usize,
+
+    #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB
+    sql_over_http_max_response_size_bytes: usize,
+}
+
+pub async fn run() -> anyhow::Result<()> {
+    let _logging_guard = crate::logging::init_local_proxy()?;
+    let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook();
+    let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);
+
+    Metrics::install(Arc::new(ThreadPoolMetrics::new(0)));
+
+    // TODO: refactor these to use labels
+    debug!("Version: {GIT_VERSION}");
+    debug!("Build_tag: {BUILD_TAG}");
+    let neon_metrics = ::metrics::NeonMetrics::new(::metrics::BuildInfo {
+        revision: GIT_VERSION,
+        build_tag: BUILD_TAG,
+    });
+
+    let jemalloc = match crate::jemalloc::MetricRecorder::new() {
+        Ok(t) => Some(t),
+        Err(e) => {
+            tracing::error!(error = ?e, "could not start jemalloc metrics loop");
+            None
+        }
+    };
+
+    let args = LocalProxyCliArgs::parse();
+    let config = build_config(&args)?;
+    let auth_backend = build_auth_backend(&args);
+
+    // before we bind to any ports, write the process ID to a file
+    // so that compute-ctl can find our process later
+    // in order to trigger the appropriate SIGHUP on config change.
+    //
+    // This also claims a "lock" that makes sure only one instance
+    // of local_proxy runs at a time.
+    let _process_guard = loop {
+        match pid_file::claim_for_current_process(&args.pid_path) {
+            Ok(guard) => break guard,
+            Err(e) => {
+                // compute-ctl might have tried to read the pid-file to let us
+                // know about some config change. We should try again.
+                error!(path=?args.pid_path, "could not claim PID file guard: {e:?}");
+                tokio::time::sleep(Duration::from_secs(1)).await;
+            }
+        }
+    };
+
+    let metrics_listener = TcpListener::bind(args.metrics).await?.into_std()?;
+    let http_listener = TcpListener::bind(args.http).await?;
+    let shutdown = CancellationToken::new();
+
+    // todo: should scale with CU
+    let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards(
+        LeakyBucketConfig {
+            rps: 10.0,
+            max: 100.0,
+        },
+        16,
+    ));
+
+    let mut maintenance_tasks = JoinSet::new();
+
+    let refresh_config_notify = Arc::new(Notify::new());
+    maintenance_tasks.spawn(crate::signals::handle(shutdown.clone(), {
+        let refresh_config_notify = Arc::clone(&refresh_config_notify);
+        move || {
+            refresh_config_notify.notify_one();
+        }
+    }));
+
+    // trigger the first config load **after** setting up the signal hook
+    // to avoid the race condition where:
+    // 1. No config file registered when local_proxy starts up
+    // 2. The config file is written but the signal hook is not yet received
+    // 3. local_proxy completes startup but has no config loaded, despite there being a registered config.
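The `Notify` handoff here is what makes the "signal before first load" ordering safe: `notify_one()` stores a permit if no task is waiting yet, so the wakeup is not lost. A minimal standalone sketch (names are illustrative, not part of this crate):

    use std::sync::Arc;
    use tokio::sync::Notify;

    async fn notify_example() {
        let notify = Arc::new(Notify::new());
        let rx = Arc::clone(&notify);
        let worker = tokio::spawn(async move {
            // `notified()` consumes a stored permit if one exists,
            // so a `notify_one()` sent before this await still wakes us.
            rx.notified().await;
            // ... reload config here ...
        });
        notify.notify_one(); // safe even if the worker has not started waiting yet
        worker.await.unwrap();
    }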
+ refresh_config_notify.notify_one(); + tokio::spawn(refresh_config_loop(args.config_path, refresh_config_notify)); + + maintenance_tasks.spawn(crate::http::health_server::task_main( + metrics_listener, + AppMetrics { + jemalloc, + neon_metrics, + proxy: crate::metrics::Metrics::get(), + }, + )); + + let task = serverless::task_main( + config, + auth_backend, + http_listener, + shutdown.clone(), + Arc::new(CancellationHandler::new(&config.connect_to_compute, None)), + endpoint_rate_limiter, + ); + + match futures::future::select(pin!(maintenance_tasks.join_next()), pin!(task)).await { + // exit immediately on maintenance task completion + Either::Left((Some(res), _)) => match crate::error::flatten_err(res)? {}, + // exit with error immediately if all maintenance tasks have ceased (should be caught by branch above) + Either::Left((None, _)) => bail!("no maintenance tasks running. invalid state"), + // exit immediately on client task error + Either::Right((res, _)) => res?, + } + + Ok(()) +} + +/// ProxyConfig is created at proxy startup, and lives forever. +fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { + let config::ConcurrencyLockOptions { + shards, + limiter, + epoch, + timeout, + } = args.connect_compute_lock.parse()?; + info!( + ?limiter, + shards, + ?epoch, + "Using NodeLocks (connect_compute)" + ); + let connect_compute_locks = ApiLocks::new( + "connect_compute_lock", + limiter, + shards, + timeout, + epoch, + &Metrics::get().proxy.connect_compute_lock, + ); + + let http_config = HttpConfig { + accept_websockets: false, + pool_options: GlobalConnPoolOptions { + gc_epoch: Duration::from_secs(60), + pool_shards: 2, + idle_timeout: args.sql_over_http.sql_over_http_idle_timeout, + opt_in: false, + + max_conns_per_endpoint: args.sql_over_http.sql_over_http_pool_max_total_conns, + max_total_conns: args.sql_over_http.sql_over_http_pool_max_total_conns, + }, + cancel_set: CancelSet::new(args.sql_over_http.sql_over_http_cancel_set_shards), + client_conn_threshold: args.sql_over_http.sql_over_http_client_conn_threshold, + max_request_size_bytes: args.sql_over_http.sql_over_http_max_request_size_bytes, + max_response_size_bytes: args.sql_over_http.sql_over_http_max_response_size_bytes, + }; + + let compute_config = ComputeConfig { + retry: RetryConfig::parse(RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES)?, + tls: Arc::new(compute_client_config_with_root_certs()?), + timeout: Duration::from_secs(2), + }; + + Ok(Box::leak(Box::new(ProxyConfig { + tls_config: None, + metric_collection: None, + http_config, + authentication_config: AuthenticationConfig { + jwks_cache: JwkCache::default(), + thread_pool: ThreadPool::new(0), + scram_protocol_timeout: Duration::from_secs(10), + rate_limiter_enabled: false, + rate_limiter: BucketRateLimiter::new(vec![]), + rate_limit_ip_subnet: 64, + ip_allowlist_check_enabled: true, + is_vpc_acccess_proxy: false, + is_auth_broker: false, + accept_jwts: true, + console_redirect_confirmation_timeout: Duration::ZERO, + }, + proxy_protocol_v2: config::ProxyProtocolV2::Rejected, + handshake_timeout: Duration::from_secs(10), + region: "local".into(), + wake_compute_retry_config: RetryConfig::parse(RetryConfig::WAKE_COMPUTE_DEFAULT_VALUES)?, + connect_compute_locks, + connect_to_compute: compute_config, + }))) +} + +/// auth::Backend is created at proxy startup, and lives forever. 
+fn build_auth_backend(args: &LocalProxyCliArgs) -> &'static auth::Backend<'static, ()> {
+    let auth_backend = crate::auth::Backend::Local(crate::auth::backend::MaybeOwned::Owned(
+        LocalBackend::new(args.postgres, args.compute_ctl.clone()),
+    ));
+
+    Box::leak(Box::new(auth_backend))
+}
+
+#[derive(Error, Debug)]
+enum RefreshConfigError {
+    #[error(transparent)]
+    Read(#[from] std::io::Error),
+    #[error(transparent)]
+    Parse(#[from] serde_json::Error),
+    #[error(transparent)]
+    Validate(anyhow::Error),
+}
+
+async fn refresh_config_loop(path: Utf8PathBuf, rx: Arc<Notify>) {
+    let mut init = true;
+    loop {
+        rx.notified().await;
+
+        match refresh_config_inner(&path).await {
+            Ok(()) => {}
+            // don't log for file not found errors if this is the first time we are checking
+            // for computes that don't use local_proxy, this is not an error.
+            Err(RefreshConfigError::Read(e))
+                if init && e.kind() == std::io::ErrorKind::NotFound =>
+            {
+                debug!(error=?e, ?path, "could not read config file");
+            }
+            Err(e) => {
+                error!(error=?e, ?path, "could not read config file");
+            }
+        }
+
+        init = false;
+    }
+}
+
+async fn refresh_config_inner(path: &Utf8Path) -> Result<(), RefreshConfigError> {
+    let bytes = tokio::fs::read(&path).await?;
+    let data: LocalProxySpec = serde_json::from_slice(&bytes)?;
+
+    let mut jwks_set = vec![];
+
+    fn parse_jwks_settings(jwks: compute_api::spec::JwksSettings) -> anyhow::Result<JwksSettings> {
+        let mut jwks_url = url::Url::from_str(&jwks.jwks_url).context("parsing JWKS url")?;
+
+        ensure!(
+            jwks_url.has_authority()
+                && (jwks_url.scheme() == "http" || jwks_url.scheme() == "https"),
+            "Invalid JWKS url. Must be HTTP",
+        );
+
+        ensure!(
+            jwks_url.host().is_some_and(|h| h != url::Host::Domain("")),
+            "Invalid JWKS url. No domain listed",
+        );
+
+        // clear username, password and ports
+        jwks_url
+            .set_username("")
+            .expect("url can be a base and has a valid host and is not a file. should not error");
+        jwks_url
+            .set_password(None)
+            .expect("url can be a base and has a valid host and is not a file. should not error");
+        // local testing is hard if we need to have a specific restricted port
+        if cfg!(not(feature = "testing")) {
+            jwks_url.set_port(None).expect(
+                "url can be a base and has a valid host and is not a file. should not error",
+            );
+        }
+
+        // clear query params
+        jwks_url.set_fragment(None);
+        jwks_url.query_pairs_mut().clear().finish();
+
+        if jwks_url.scheme() != "https" {
+            // local testing is hard if we need to set up https support.
+            if cfg!(not(feature = "testing")) {
+                jwks_url
+                    .set_scheme("https")
+                    .expect("should not error to set the scheme to https if it was http");
+            } else {
+                warn!(scheme = jwks_url.scheme(), "JWKS url is not HTTPS");
+            }
+        }
+
+        Ok(JwksSettings {
+            id: jwks.id,
+            jwks_url,
+            _provider_name: jwks.provider_name,
+            jwt_audience: jwks.jwt_audience,
+            role_names: jwks
+                .role_names
+                .into_iter()
+                .map(RoleName::from)
+                .map(|s| RoleNameInt::from(&s))
+                .collect(),
+        })
+    }
+
+    for jwks in data.jwks.into_iter().flatten() {
+        jwks_set.push(parse_jwks_settings(jwks).map_err(RefreshConfigError::Validate)?);
+    }
+
+    info!("successfully loaded new config");
+    JWKS_ROLE_MAP.store(Some(Arc::new(EndpointJwksResponse { jwks: jwks_set })));
+
+    Ok(())
+}
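The URL scrubbing that `parse_jwks_settings` performs boils down to a handful of `url` crate calls. Roughly, as a sketch with a hypothetical input and without the testing-feature carve-outs:

    fn scrub(raw: &str) -> anyhow::Result<url::Url> {
        let mut u = url::Url::parse(raw)?;
        let _ = u.set_username("");   // drop credentials
        let _ = u.set_password(None);
        let _ = u.set_port(None);     // fall back to the scheme's default port
        u.set_fragment(None);
        u.query_pairs_mut().clear().finish(); // drop query params
        Ok(u)
    }

    // scrub("https://user:pw@idp.example.com:8443/keys?cache=no#frag")
    // strips the credentials, explicit port, query and fragment,
    // keeping only scheme, host and path.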
diff --git a/proxy/src/binary/mod.rs b/proxy/src/binary/mod.rs
new file mode 100644
index 0000000000..dc07d3e675
--- /dev/null
+++ b/proxy/src/binary/mod.rs
@@ -0,0 +1,7 @@
+//! All binaries have the body of their main() defined here, so that the code
+//! is also covered by code style configs in lib.rs and the unused-code check is
+//! more effective when practically all modules are private to the lib.
+
+pub mod local_proxy;
+pub mod pg_sni_router;
+pub mod proxy;
diff --git a/proxy/src/binary/pg_sni_router.rs b/proxy/src/binary/pg_sni_router.rs
new file mode 100644
index 0000000000..94e771a61c
--- /dev/null
+++ b/proxy/src/binary/pg_sni_router.rs
@@ -0,0 +1,305 @@
+/// A stand-alone program that routes connections, e.g. from
+/// `aaa--bbb--1234.external.domain` to `aaa.bbb.internal.domain:1234`.
+///
+/// This allows connecting to pods/services running in the same Kubernetes cluster from
+/// the outside. Similar to an ingress controller for HTTPS.
+use std::{net::SocketAddr, sync::Arc};
+
+use anyhow::{anyhow, bail, ensure, Context};
+use clap::Arg;
+use futures::future::Either;
+use futures::TryFutureExt;
+use itertools::Itertools;
+use rustls::crypto::ring;
+use rustls::pki_types::PrivateKeyDer;
+use tokio::io::{AsyncRead, AsyncWrite};
+use tokio::net::TcpListener;
+use tokio_util::sync::CancellationToken;
+use tracing::{error, info, Instrument};
+use utils::project_git_version;
+use utils::sentry_init::init_sentry;
+
+use crate::context::RequestContext;
+use crate::metrics::{Metrics, ThreadPoolMetrics};
+use crate::protocol2::ConnectionInfo;
+use crate::proxy::{copy_bidirectional_client_compute, run_until_cancelled, ErrorSource};
+use crate::stream::{PqStream, Stream};
+use crate::tls::TlsServerEndPoint;
+
+project_git_version!(GIT_VERSION);
+
+fn cli() -> clap::Command {
+    clap::Command::new("Neon proxy/router")
+        .version(GIT_VERSION)
+        .arg(
+            Arg::new("listen")
+                .short('l')
+                .long("listen")
+                .help("listen for incoming client connections on ip:port")
+                .default_value("127.0.0.1:4432"),
+        )
+        .arg(
+            Arg::new("tls-key")
+                .short('k')
+                .long("tls-key")
+                .help("path to TLS key for client postgres connections")
+                .required(true),
+        )
+        .arg(
+            Arg::new("tls-cert")
+                .short('c')
+                .long("tls-cert")
+                .help("path to TLS cert for client postgres connections")
+                .required(true),
+        )
+        .arg(
+            Arg::new("dest")
+                .short('d')
+                .long("destination")
+                .help("append this domain zone to the SNI hostname to get the destination address")
+                .required(true),
+        )
+}
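For illustration, the parser above can be exercised directly with `get_matches_from`; the paths and values below are hypothetical:

    let args = cli().get_matches_from([
        "pg-sni-router",
        "--listen", "0.0.0.0:4432",
        "--tls-key", "/certs/tls.key",
        "--tls-cert", "/certs/tls.crt",
        "--destination", "internal.domain",
    ]);
    assert_eq!(
        args.get_one::<String>("dest").map(String::as_str),
        Some("internal.domain"),
    );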
format!("Failed to read TLS certificate chain from bytes from file at '{cert_path}'.") + })? + }; + + // needed for channel bindings + let first_cert = cert_chain.first().context("missing certificate")?; + let tls_server_end_point = TlsServerEndPoint::new(first_cert)?; + + let tls_config = + rustls::ServerConfig::builder_with_provider(Arc::new(ring::default_provider())) + .with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12]) + .context("ring should support TLS1.2 and TLS1.3")? + .with_no_client_auth() + .with_single_cert(cert_chain, key)? + .into(); + + (tls_config, tls_server_end_point) + } + _ => bail!("tls-key and tls-cert must be specified"), + }; + + // Start listening for incoming client connections + let proxy_address: SocketAddr = args + .get_one::("listen") + .expect("string argument defined") + .parse()?; + info!("Starting sni router on {proxy_address}"); + let proxy_listener = TcpListener::bind(proxy_address).await?; + + let cancellation_token = CancellationToken::new(); + + let main = tokio::spawn(task_main( + Arc::new(destination), + tls_config, + tls_server_end_point, + proxy_listener, + cancellation_token.clone(), + )); + let signals_task = tokio::spawn(crate::signals::handle(cancellation_token, || {})); + + // the signal task cant ever succeed. + // the main task can error, or can succeed on cancellation. + // we want to immediately exit on either of these cases + let signal = match futures::future::select(signals_task, main).await { + Either::Left((res, _)) => crate::error::flatten_err(res)?, + Either::Right((res, _)) => return crate::error::flatten_err(res), + }; + + // maintenance tasks return `Infallible` success values, this is an impossible value + // so this match statically ensures that there are no possibilities for that value + match signal {} +} + +async fn task_main( + dest_suffix: Arc, + tls_config: Arc, + tls_server_end_point: TlsServerEndPoint, + listener: tokio::net::TcpListener, + cancellation_token: CancellationToken, +) -> anyhow::Result<()> { + // When set for the server socket, the keepalive setting + // will be inherited by all accepted client sockets. + socket2::SockRef::from(&listener).set_keepalive(true)?; + + let connections = tokio_util::task::task_tracker::TaskTracker::new(); + + while let Some(accept_result) = + run_until_cancelled(listener.accept(), &cancellation_token).await + { + let (socket, peer_addr) = accept_result?; + + let session_id = uuid::Uuid::new_v4(); + let tls_config = Arc::clone(&tls_config); + let dest_suffix = Arc::clone(&dest_suffix); + + connections.spawn( + async move { + socket + .set_nodelay(true) + .context("failed to set socket option")?; + + info!(%peer_addr, "serving"); + let ctx = RequestContext::new( + session_id, + ConnectionInfo { + addr: peer_addr, + extra: None, + }, + crate::metrics::Protocol::SniRouter, + "sni", + ); + handle_client(ctx, dest_suffix, tls_config, tls_server_end_point, socket).await + } + .unwrap_or_else(|e| { + // Acknowledge that the task has finished with an error. 
+ error!("per-client task finished with an error: {e:#}"); + }) + .instrument(tracing::info_span!("handle_client", ?session_id)), + ); + } + + connections.close(); + drop(listener); + + connections.wait().await; + + info!("all client connections have finished"); + Ok(()) +} + +const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)"; + +async fn ssl_handshake( + ctx: &RequestContext, + raw_stream: S, + tls_config: Arc, + tls_server_end_point: TlsServerEndPoint, +) -> anyhow::Result> { + let mut stream = PqStream::new(Stream::from_raw(raw_stream)); + + let msg = stream.read_startup_packet().await?; + use pq_proto::FeStartupPacket::SslRequest; + + match msg { + SslRequest { direct: false } => { + stream + .write_message(&pq_proto::BeMessage::EncryptionResponse(true)) + .await?; + + // Upgrade raw stream into a secure TLS-backed stream. + // NOTE: We've consumed `tls`; this fact will be used later. + + let (raw, read_buf) = stream.into_inner(); + // TODO: Normally, client doesn't send any data before + // server says TLS handshake is ok and read_buf is empty. + // However, you could imagine pipelining of postgres + // SSLRequest + TLS ClientHello in one hunk similar to + // pipelining in our node js driver. We should probably + // support that by chaining read_buf with the stream. + if !read_buf.is_empty() { + bail!("data is sent before server replied with EncryptionResponse"); + } + + Ok(Stream::Tls { + tls: Box::new( + raw.upgrade(tls_config, !ctx.has_private_peer_addr()) + .await?, + ), + tls_server_end_point, + }) + } + unexpected => { + info!( + ?unexpected, + "unexpected startup packet, rejecting connection" + ); + stream + .throw_error_str(ERR_INSECURE_CONNECTION, crate::error::ErrorKind::User) + .await? + } + } +} + +async fn handle_client( + ctx: RequestContext, + dest_suffix: Arc, + tls_config: Arc, + tls_server_end_point: TlsServerEndPoint, + stream: impl AsyncRead + AsyncWrite + Unpin, +) -> anyhow::Result<()> { + let mut tls_stream = ssl_handshake(&ctx, stream, tls_config, tls_server_end_point).await?; + + // Cut off first part of the SNI domain + // We receive required destination details in the format of + // `{k8s_service_name}--{k8s_namespace}--{port}.non-sni-domain` + let sni = tls_stream.sni_hostname().ok_or(anyhow!("SNI missing"))?; + let dest: Vec<&str> = sni + .split_once('.') + .context("invalid SNI")? + .0 + .splitn(3, "--") + .collect(); + let port = dest[2].parse::().context("invalid port")?; + let destination = format!("{}.{}.{}:{}", dest[0], dest[1], dest_suffix, port); + + info!("destination: {}", destination); + + let mut client = tokio::net::TcpStream::connect(destination).await?; + + // doesn't yet matter as pg-sni-router doesn't report analytics logs + ctx.set_success(); + ctx.log_connect(); + + // Starting from here we only proxy the client's traffic. 
+ info!("performing the proxy pass..."); + + match copy_bidirectional_client_compute(&mut tls_stream, &mut client).await { + Ok(_) => Ok(()), + Err(ErrorSource::Client(err)) => Err(err).context("client"), + Err(ErrorSource::Compute(err)) => Err(err).context("compute"), + } +} diff --git a/proxy/src/binary/proxy.rs b/proxy/src/binary/proxy.rs new file mode 100644 index 0000000000..b72799df54 --- /dev/null +++ b/proxy/src/binary/proxy.rs @@ -0,0 +1,829 @@ +use std::net::SocketAddr; +use std::pin::pin; +use std::sync::Arc; +use std::time::Duration; + +use anyhow::bail; +use futures::future::Either; +use remote_storage::RemoteStorageConfig; +use tokio::net::TcpListener; +use tokio::task::JoinSet; +use tokio_util::sync::CancellationToken; +use tracing::{info, warn, Instrument}; +use utils::sentry_init::init_sentry; +use utils::{project_build_tag, project_git_version}; + +use crate::auth::backend::jwt::JwkCache; +use crate::auth::backend::{AuthRateLimiter, ConsoleRedirectBackend, MaybeOwned}; +use crate::cancellation::{handle_cancel_messages, CancellationHandler}; +use crate::config::{ + self, remote_storage_from_toml, AuthenticationConfig, CacheOptions, ComputeConfig, HttpConfig, + ProjectInfoCacheOptions, ProxyConfig, ProxyProtocolV2, +}; +use crate::context::parquet::ParquetUploadArgs; +use crate::http::health_server::AppMetrics; +use crate::metrics::Metrics; +use crate::rate_limiter::{ + EndpointRateLimiter, LeakyBucketConfig, RateBucketInfo, WakeComputeRateLimiter, +}; +use crate::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider; +use crate::redis::kv_ops::RedisKVClient; +use crate::redis::{elasticache, notifications}; +use crate::scram::threadpool::ThreadPool; +use crate::serverless::cancel_set::CancelSet; +use crate::serverless::GlobalConnPoolOptions; +use crate::tls::client_config::compute_client_config_with_root_certs; +use crate::{auth, control_plane, http, serverless, usage_metrics}; + +project_git_version!(GIT_VERSION); +project_build_tag!(BUILD_TAG); + +use clap::{Parser, ValueEnum}; + +#[derive(Clone, Debug, ValueEnum)] +enum AuthBackendType { + #[value(name("cplane-v1"), alias("control-plane"))] + ControlPlaneV1, + + #[value(name("link"), alias("control-redirect"))] + ConsoleRedirect, + + #[cfg(any(test, feature = "testing"))] + Postgres, +} + +/// Neon proxy/router +#[derive(Parser)] +#[command(version = GIT_VERSION, about)] +struct ProxyCliArgs { + /// Name of the region this proxy is deployed in + #[clap(long, default_value_t = String::new())] + region: String, + /// listen for incoming client connections on ip:port + #[clap(short, long, default_value = "127.0.0.1:4432")] + proxy: String, + #[clap(value_enum, long, default_value_t = AuthBackendType::ConsoleRedirect)] + auth_backend: AuthBackendType, + /// listen for management callback connection on ip:port + #[clap(short, long, default_value = "127.0.0.1:7000")] + mgmt: String, + /// listen for incoming http connections (metrics, etc) on ip:port + #[clap(long, default_value = "127.0.0.1:7001")] + http: String, + /// listen for incoming wss connections on ip:port + #[clap(long)] + wss: Option, + /// redirect unauthenticated users to the given uri in case of console redirect auth + #[clap(short, long, default_value = "http://localhost:3000/psql_session/")] + uri: String, + /// cloud API endpoint for authenticating users + #[clap( + short, + long, + default_value = "http://localhost:3000/authenticate_proxy_request/" + )] + auth_endpoint: String, + /// JWT used to connect to control plane. 
+ #[clap( + long, + value_name = "JWT", + default_value = "", + env = "NEON_PROXY_TO_CONTROLPLANE_TOKEN" + )] + control_plane_token: Arc<str>, + /// if this is not local proxy, this toggles whether we accept jwt or passwords for http + #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] + is_auth_broker: bool, + /// path to TLS key for client postgres connections + /// + /// tls-key and tls-cert are for backwards compatibility, we can put all certs in one dir + #[clap(short = 'k', long, alias = "ssl-key")] + tls_key: Option<String>, + /// path to TLS cert for client postgres connections + /// + /// tls-key and tls-cert are for backwards compatibility, we can put all certs in one dir + #[clap(short = 'c', long, alias = "ssl-cert")] + tls_cert: Option<String>, + /// Allow writing TLS session keys to the given file pointed to by the environment variable `SSLKEYLOGFILE`. + #[clap(long, alias = "allow-ssl-keylogfile")] + allow_tls_keylogfile: bool, + /// path to directory with TLS certificates for client postgres connections + #[clap(long)] + certs_dir: Option<String>, + /// timeout for the TLS handshake + #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] + handshake_timeout: tokio::time::Duration, + /// http endpoint to receive periodic metric updates + #[clap(long)] + metric_collection_endpoint: Option<String>, + /// how often metrics should be sent to a collection endpoint + #[clap(long)] + metric_collection_interval: Option<String>, + /// cache for `wake_compute` api method (use `size=0` to disable) + #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] + wake_compute_cache: String, + /// lock for `wake_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable). + #[clap(long, default_value = config::ConcurrencyLockOptions::DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK)] + wake_compute_lock: String, + /// lock for `connect_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable). + #[clap(long, default_value = config::ConcurrencyLockOptions::DEFAULT_OPTIONS_CONNECT_COMPUTE_LOCK)] + connect_compute_lock: String, + #[clap(flatten)] + sql_over_http: SqlOverHttpArgs, + /// timeout for scram authentication protocol + #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] + scram_protocol_timeout: tokio::time::Duration, + /// size of the threadpool for password hashing + #[clap(long, default_value_t = 4)] + scram_thread_pool_size: u8, + /// Endpoint rate limiter max number of requests per second. + /// + /// Provided in the form `<requests>@<duration>`. + /// Can be given multiple times for different bucket sizes. + #[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)] + endpoint_rps_limit: Vec<RateBucketInfo>, + /// Wake compute rate limiter max number of requests per second. + #[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)] + wake_compute_limit: Vec<RateBucketInfo>, + /// Whether the auth rate limiter actually takes effect (for testing) + #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] + auth_rate_limit_enabled: bool, + /// Authentication rate limiter max number of hashes per second. + #[clap(long, default_values_t = RateBucketInfo::DEFAULT_AUTH_SET)] + auth_rate_limit: Vec<RateBucketInfo>, + /// The IP subnet to use when considering whether two IP addresses are considered the same. 
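// (Illustrative aside, not from this patch: the auth rate limiter buckets clients
// by network prefix rather than by exact address. A minimal sketch, with
// hypothetical names, of how an IPv6 address could be reduced to its /64 (or
// other) prefix so that two addresses in the same subnet share one bucket key:
//
//     use std::net::Ipv6Addr;
//
//     fn subnet_key(ip: Ipv6Addr, prefix_len: u8) -> u128 {
//         // keep only the leading `prefix_len` bits of the address
//         let mask = if prefix_len == 0 { 0 } else { u128::MAX << (128 - u32::from(prefix_len)) };
//         u128::from(ip) & mask
//     }
// )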
+ #[clap(long, default_value_t = 64)] + auth_rate_limit_ip_subnet: u8, + /// Redis rate limiter max number of requests per second. + #[clap(long, default_values_t = RateBucketInfo::DEFAULT_REDIS_SET)] + redis_rps_limit: Vec, + /// Cancellation channel size (max queue size for redis kv client) + #[clap(long, default_value = "1024")] + cancellation_ch_size: usize, + /// cache for `allowed_ips` (use `size=0` to disable) + #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] + allowed_ips_cache: String, + /// cache for `role_secret` (use `size=0` to disable) + #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] + role_secret_cache: String, + /// redis url for notifications (if empty, redis_host:port will be used for both notifications and streaming connections) + #[clap(long)] + redis_notifications: Option, + /// what from the available authentications type to use for the regional redis we have. Supported are "irsa" and "plain". + #[clap(long, default_value = "irsa")] + redis_auth_type: String, + /// redis host for streaming connections (might be different from the notifications host) + #[clap(long)] + redis_host: Option, + /// redis port for streaming connections (might be different from the notifications host) + #[clap(long)] + redis_port: Option, + /// redis cluster name, used in aws elasticache + #[clap(long)] + redis_cluster_name: Option, + /// redis user_id, used in aws elasticache + #[clap(long)] + redis_user_id: Option, + /// aws region to retrieve credentials + #[clap(long, default_value_t = String::new())] + aws_region: String, + /// cache for `project_info` (use `size=0` to disable) + #[clap(long, default_value = config::ProjectInfoCacheOptions::CACHE_DEFAULT_OPTIONS)] + project_info_cache: String, + /// cache for all valid endpoints + #[clap(long, default_value = config::EndpointCacheConfig::CACHE_DEFAULT_OPTIONS)] + endpoint_cache_config: String, + #[clap(flatten)] + parquet_upload: ParquetUploadArgs, + + /// interval for backup metric collection + #[clap(long, default_value = "10m", value_parser = humantime::parse_duration)] + metric_backup_collection_interval: std::time::Duration, + /// remote storage configuration for backup metric collection + /// Encoded as toml (same format as pageservers), eg + /// `{bucket_name='the-bucket',bucket_region='us-east-1',prefix_in_bucket='proxy',endpoint='http://minio:9000'}` + #[clap(long, value_parser = remote_storage_from_toml)] + metric_backup_collection_remote_storage: Option, + /// chunk size for backup metric collection + /// Size of each event is no more than 400 bytes, so 2**22 is about 200MB before the compression. + #[clap(long, default_value = "4194304")] + metric_backup_collection_chunk_size: usize, + /// Whether to retry the connection to the compute node + #[clap(long, default_value = config::RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES)] + connect_to_compute_retry: String, + /// Whether to retry the wake_compute request + #[clap(long, default_value = config::RetryConfig::WAKE_COMPUTE_DEFAULT_VALUES)] + wake_compute_retry: String, + + /// Configure if this is a private access proxy for the POC: In that case the proxy will ignore the IP allowlist + #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] + is_private_access_proxy: bool, + + /// Configure whether all incoming requests have a Proxy Protocol V2 packet. 
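// (Illustrative aside, not from this patch: judging by the variants named in this
// file, the mode is expected to gate proxy-protocol header handling roughly like
// this — a hedged sketch of the intended semantics, not the crate's parsing code:
//
//     match proxy_protocol_v2 {
//         ProxyProtocolV2::Supported => { /* parse the header if present; bare TCP still allowed */ }
//         ProxyProtocolV2::Rejected  => { /* refuse connections that carry a header */ }
//         ProxyProtocolV2::Required  => { /* refuse connections that lack a header */ }
//     }
// )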
+ // TODO(conradludgate): switch default to rejected or required once we've updated all deployments + #[clap(value_enum, long, default_value_t = ProxyProtocolV2::Supported)] + proxy_protocol_v2: ProxyProtocolV2, + + /// Time the proxy waits for the webauth session to be confirmed by the control plane. + // TODO: rename to `console_redirect_confirmation_timeout`. + #[clap(long, default_value = "2m", value_parser = humantime::parse_duration)] + webauth_confirmation_timeout: std::time::Duration, +} + +#[derive(clap::Args, Clone, Copy, Debug)] +struct SqlOverHttpArgs { + /// timeout for http connection requests + #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] + sql_over_http_timeout: tokio::time::Duration, + + /// Whether the SQL over http pool is opt-in + #[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] + sql_over_http_pool_opt_in: bool, + + /// How many connections to pool for each endpoint. Excess connections are discarded + #[clap(long, default_value_t = 20)] + sql_over_http_pool_max_conns_per_endpoint: usize, + + /// How many connections to pool in total. Excess connections are discarded + #[clap(long, default_value_t = 20000)] + sql_over_http_pool_max_total_conns: usize, + + /// How long pooled connections should remain idle for before closing + #[clap(long, default_value = "5m", value_parser = humantime::parse_duration)] + sql_over_http_idle_timeout: tokio::time::Duration, + + /// Duration each shard will wait on average before a GC sweep. + /// A longer time will cause sweeps to take longer but will interfere less frequently. + #[clap(long, default_value = "10m", value_parser = humantime::parse_duration)] + sql_over_http_pool_gc_epoch: tokio::time::Duration, + + /// How many shards should the global pool have. Must be a power of two. 
+ /// More shards will introduce less contention for pool operations, but can + /// increase memory used by the pool + #[clap(long, default_value_t = 128)] + sql_over_http_pool_shards: usize, + + #[clap(long, default_value_t = 10000)] + sql_over_http_client_conn_threshold: u64, + + #[clap(long, default_value_t = 64)] + sql_over_http_cancel_set_shards: usize, + + #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB + sql_over_http_max_request_size_bytes: usize, + + #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB + sql_over_http_max_response_size_bytes: usize, +} + +pub async fn run() -> anyhow::Result<()> { + let _logging_guard = crate::logging::init().await?; + let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook(); + let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); + + // TODO: refactor these to use labels + info!("Version: {GIT_VERSION}"); + info!("Build_tag: {BUILD_TAG}"); + let neon_metrics = ::metrics::NeonMetrics::new(::metrics::BuildInfo { + revision: GIT_VERSION, + build_tag: BUILD_TAG, + }); + + let jemalloc = match crate::jemalloc::MetricRecorder::new() { + Ok(t) => Some(t), + Err(e) => { + tracing::error!(error = ?e, "could not start jemalloc metrics loop"); + None + } + }; + + let args = ProxyCliArgs::parse(); + let config = build_config(&args)?; + let auth_backend = build_auth_backend(&args)?; + + match auth_backend { + Either::Left(auth_backend) => info!("Authentication backend: {auth_backend}"), + Either::Right(auth_backend) => info!("Authentication backend: {auth_backend:?}"), + } + info!("Using region: {}", args.aws_region); + + // TODO: untangle the config args + let regional_redis_client = match (args.redis_auth_type.as_str(), &args.redis_notifications) { + ("plain", redis_url) => match redis_url { + None => { + bail!("plain auth requires redis_notifications to be set"); + } + Some(url) => Some( + ConnectionWithCredentialsProvider::new_with_static_credentials(url.to_string()), + ), + }, + ("irsa", _) => match (&args.redis_host, args.redis_port) { + (Some(host), Some(port)) => Some( + ConnectionWithCredentialsProvider::new_with_credentials_provider( + host.to_string(), + port, + elasticache::CredentialsProvider::new( + args.aws_region, + args.redis_cluster_name, + args.redis_user_id, + ) + .await, + ), + ), + (None, None) => { + warn!("irsa auth requires redis-host and redis-port to be set, continuing without regional_redis_client"); + None + } + _ => { + bail!("redis-host and redis-port must be specified together"); + } + }, + _ => { + bail!("unknown auth type given"); + } + }; + + let redis_notifications_client = if let Some(url) = args.redis_notifications { + Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url)) + } else { + regional_redis_client.clone() + }; + + // Check that we can bind to address before further initialization + let http_address: SocketAddr = args.http.parse()?; + info!("Starting http on {http_address}"); + let http_listener = TcpListener::bind(http_address).await?.into_std()?; + + let mgmt_address: SocketAddr = args.mgmt.parse()?; + info!("Starting mgmt on {mgmt_address}"); + let mgmt_listener = TcpListener::bind(mgmt_address).await?; + + let proxy_listener = if args.is_auth_broker { + None + } else { + let proxy_address: SocketAddr = args.proxy.parse()?; + info!("Starting proxy on {proxy_address}"); + + Some(TcpListener::bind(proxy_address).await?) + }; + + // TODO: rename the argument to something like serverless. 
+ // It now covers more than just websockets, it also covers SQL over HTTP. + let serverless_listener = if let Some(serverless_address) = args.wss { + let serverless_address: SocketAddr = serverless_address.parse()?; + info!("Starting wss on {serverless_address}"); + Some(TcpListener::bind(serverless_address).await?) + } else if args.is_auth_broker { + bail!("wss arg must be present for auth-broker") + } else { + None + }; + + let cancellation_token = CancellationToken::new(); + + let redis_rps_limit = Vec::leak(args.redis_rps_limit.clone()); + RateBucketInfo::validate(redis_rps_limit)?; + + let redis_kv_client = regional_redis_client + .as_ref() + .map(|redis_publisher| RedisKVClient::new(redis_publisher.clone(), redis_rps_limit)); + + // channel size should be higher than redis client limit to avoid blocking + let cancel_ch_size = args.cancellation_ch_size; + let (tx_cancel, rx_cancel) = tokio::sync::mpsc::channel(cancel_ch_size); + let cancellation_handler = Arc::new(CancellationHandler::new( + &config.connect_to_compute, + Some(tx_cancel), + )); + + // bit of a hack - find the min rps and max rps supported and turn it into + // leaky bucket config instead + let max = args + .endpoint_rps_limit + .iter() + .map(|x| x.rps()) + .max_by(f64::total_cmp) + .unwrap_or(EndpointRateLimiter::DEFAULT.max); + let rps = args + .endpoint_rps_limit + .iter() + .map(|x| x.rps()) + .min_by(f64::total_cmp) + .unwrap_or(EndpointRateLimiter::DEFAULT.rps); + let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards( + LeakyBucketConfig { rps, max }, + 64, + )); + + // client facing tasks. these will exit on error or on cancellation + // cancellation returns Ok(()) + let mut client_tasks = JoinSet::new(); + match auth_backend { + Either::Left(auth_backend) => { + if let Some(proxy_listener) = proxy_listener { + client_tasks.spawn(crate::proxy::task_main( + config, + auth_backend, + proxy_listener, + cancellation_token.clone(), + cancellation_handler.clone(), + endpoint_rate_limiter.clone(), + )); + } + + if let Some(serverless_listener) = serverless_listener { + client_tasks.spawn(serverless::task_main( + config, + auth_backend, + serverless_listener, + cancellation_token.clone(), + cancellation_handler.clone(), + endpoint_rate_limiter.clone(), + )); + } + } + Either::Right(auth_backend) => { + if let Some(proxy_listener) = proxy_listener { + client_tasks.spawn(crate::console_redirect_proxy::task_main( + config, + auth_backend, + proxy_listener, + cancellation_token.clone(), + cancellation_handler.clone(), + )); + } + } + } + + client_tasks.spawn(crate::context::parquet::worker( + cancellation_token.clone(), + args.parquet_upload, + )); + + // maintenance tasks. these never return unless there's an error + let mut maintenance_tasks = JoinSet::new(); + maintenance_tasks.spawn(crate::signals::handle(cancellation_token.clone(), || {})); + maintenance_tasks.spawn(http::health_server::task_main( + http_listener, + AppMetrics { + jemalloc, + neon_metrics, + proxy: crate::metrics::Metrics::get(), + }, + )); + maintenance_tasks.spawn(control_plane::mgmt::task_main(mgmt_listener)); + + if let Some(metrics_config) = &config.metric_collection { + // TODO: Add gc regardles of the metric collection being enabled. 
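// (Illustrative aside, not from this patch, on the endpoint rate limiter built
// above: folding a list of `RateBucketInfo` entries into one leaky bucket takes
// the smallest rate as the sustained `rps` and the largest as the burst `max`.
// A self-contained sketch of that reduction over plain floats, names hypothetical:
//
//     fn fold_buckets(rates: &[f64], default_rps: f64, default_max: f64) -> (f64, f64) {
//         let rps = rates.iter().copied().min_by(f64::total_cmp).unwrap_or(default_rps);
//         let max = rates.iter().copied().max_by(f64::total_cmp).unwrap_or(default_max);
//         (rps, max)
//     }
// )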
+ maintenance_tasks.spawn(usage_metrics::task_main(metrics_config)); + } + + #[cfg_attr(not(any(test, feature = "testing")), expect(irrefutable_let_patterns))] + if let Either::Left(auth::Backend::ControlPlane(api, ())) = &auth_backend { + if let crate::control_plane::client::ControlPlaneClient::ProxyV1(api) = &**api { + match (redis_notifications_client, regional_redis_client.clone()) { + (None, None) => {} + (client1, client2) => { + let cache = api.caches.project_info.clone(); + if let Some(client) = client1 { + maintenance_tasks.spawn(notifications::task_main( + client, + cache.clone(), + args.region.clone(), + )); + } + if let Some(client) = client2 { + maintenance_tasks.spawn(notifications::task_main( + client, + cache.clone(), + args.region.clone(), + )); + } + maintenance_tasks.spawn(async move { cache.clone().gc_worker().await }); + } + } + + if let Some(mut redis_kv_client) = redis_kv_client { + maintenance_tasks.spawn(async move { + redis_kv_client.try_connect().await?; + handle_cancel_messages(&mut redis_kv_client, rx_cancel).await + }); + } + + if let Some(regional_redis_client) = regional_redis_client { + let cache = api.caches.endpoints_cache.clone(); + let con = regional_redis_client; + let span = tracing::info_span!("endpoints_cache"); + maintenance_tasks.spawn( + async move { cache.do_read(con, cancellation_token.clone()).await } + .instrument(span), + ); + } + } + } + + let maintenance = loop { + // get one complete task + match futures::future::select( + pin!(maintenance_tasks.join_next()), + pin!(client_tasks.join_next()), + ) + .await + { + // exit immediately on maintenance task completion + Either::Left((Some(res), _)) => break crate::error::flatten_err(res)?, + // exit with error immediately if all maintenance tasks have ceased (should be caught by branch above) + Either::Left((None, _)) => bail!("no maintenance tasks running. invalid state"), + // exit immediately on client task error + Either::Right((Some(res), _)) => crate::error::flatten_err(res)?, + // exit if all our client tasks have shutdown gracefully + Either::Right((None, _)) => return Ok(()), + } + }; + + // maintenance tasks return Infallible success values, this is an impossible value + // so this match statically ensures that there are no possibilities for that value + match maintenance {} +} + +/// ProxyConfig is created at proxy startup, and lives forever. 
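// (Illustrative aside, not from this patch: "lives forever" is implemented with
// the `Box::leak` pattern used throughout these builders — allocate once at
// startup, leak the box, and hand out a `&'static` reference that every task can
// share without `Arc`. A minimal sketch with a hypothetical config type:
//
//     struct Config { region: String }
//
//     fn leak_config(c: Config) -> &'static Config {
//         Box::leak(Box::new(c)) // never freed; fine for process-lifetime data
//     }
// )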
+fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { + let thread_pool = ThreadPool::new(args.scram_thread_pool_size); + Metrics::install(thread_pool.metrics.clone()); + + let tls_config = match (&args.tls_key, &args.tls_cert) { + (Some(key_path), Some(cert_path)) => Some(config::configure_tls( + key_path, + cert_path, + args.certs_dir.as_ref(), + args.allow_tls_keylogfile, + )?), + (None, None) => None, + _ => bail!("either both or neither tls-key and tls-cert must be specified"), + }; + + let backup_metric_collection_config = config::MetricBackupCollectionConfig { + remote_storage_config: args.metric_backup_collection_remote_storage.clone(), + chunk_size: args.metric_backup_collection_chunk_size, + }; + + let metric_collection = match ( + &args.metric_collection_endpoint, + &args.metric_collection_interval, + ) { + (Some(endpoint), Some(interval)) => Some(config::MetricCollectionConfig { + endpoint: endpoint.parse()?, + interval: humantime::parse_duration(interval)?, + backup_metric_collection_config, + }), + (None, None) => None, + _ => bail!( + "either both or neither metric-collection-endpoint \ + and metric-collection-interval must be specified" + ), + }; + + let config::ConcurrencyLockOptions { + shards, + limiter, + epoch, + timeout, + } = args.connect_compute_lock.parse()?; + info!( + ?limiter, + shards, + ?epoch, + "Using NodeLocks (connect_compute)" + ); + let connect_compute_locks = control_plane::locks::ApiLocks::new( + "connect_compute_lock", + limiter, + shards, + timeout, + epoch, + &Metrics::get().proxy.connect_compute_lock, + ); + + let http_config = HttpConfig { + accept_websockets: !args.is_auth_broker, + pool_options: GlobalConnPoolOptions { + max_conns_per_endpoint: args.sql_over_http.sql_over_http_pool_max_conns_per_endpoint, + gc_epoch: args.sql_over_http.sql_over_http_pool_gc_epoch, + pool_shards: args.sql_over_http.sql_over_http_pool_shards, + idle_timeout: args.sql_over_http.sql_over_http_idle_timeout, + opt_in: args.sql_over_http.sql_over_http_pool_opt_in, + max_total_conns: args.sql_over_http.sql_over_http_pool_max_total_conns, + }, + cancel_set: CancelSet::new(args.sql_over_http.sql_over_http_cancel_set_shards), + client_conn_threshold: args.sql_over_http.sql_over_http_client_conn_threshold, + max_request_size_bytes: args.sql_over_http.sql_over_http_max_request_size_bytes, + max_response_size_bytes: args.sql_over_http.sql_over_http_max_response_size_bytes, + }; + let authentication_config = AuthenticationConfig { + jwks_cache: JwkCache::default(), + thread_pool, + scram_protocol_timeout: args.scram_protocol_timeout, + rate_limiter_enabled: args.auth_rate_limit_enabled, + rate_limiter: AuthRateLimiter::new(args.auth_rate_limit.clone()), + rate_limit_ip_subnet: args.auth_rate_limit_ip_subnet, + ip_allowlist_check_enabled: !args.is_private_access_proxy, + is_vpc_acccess_proxy: args.is_private_access_proxy, + is_auth_broker: args.is_auth_broker, + accept_jwts: args.is_auth_broker, + console_redirect_confirmation_timeout: args.webauth_confirmation_timeout, + }; + + let compute_config = ComputeConfig { + retry: config::RetryConfig::parse(&args.connect_to_compute_retry)?, + tls: Arc::new(compute_client_config_with_root_certs()?), + timeout: Duration::from_secs(2), + }; + + let config = ProxyConfig { + tls_config, + metric_collection, + http_config, + authentication_config, + proxy_protocol_v2: args.proxy_protocol_v2, + handshake_timeout: args.handshake_timeout, + region: args.region.clone(), + wake_compute_retry_config: 
config::RetryConfig::parse(&args.wake_compute_retry)?, + connect_compute_locks, + connect_to_compute: compute_config, + }; + + let config = Box::leak(Box::new(config)); + + tokio::spawn(config.connect_compute_locks.garbage_collect_worker()); + + Ok(config) +} + +/// auth::Backend is created at proxy startup, and lives forever. +fn build_auth_backend( + args: &ProxyCliArgs, +) -> anyhow::Result, &'static ConsoleRedirectBackend>> { + match &args.auth_backend { + AuthBackendType::ControlPlaneV1 => { + let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; + let project_info_cache_config: ProjectInfoCacheOptions = + args.project_info_cache.parse()?; + let endpoint_cache_config: config::EndpointCacheConfig = + args.endpoint_cache_config.parse()?; + + info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}"); + info!( + "Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}" + ); + info!("Using EndpointCacheConfig with options={endpoint_cache_config:?}"); + let caches = Box::leak(Box::new(control_plane::caches::ApiCaches::new( + wake_compute_cache_config, + project_info_cache_config, + endpoint_cache_config, + ))); + + let config::ConcurrencyLockOptions { + shards, + limiter, + epoch, + timeout, + } = args.wake_compute_lock.parse()?; + info!(?limiter, shards, ?epoch, "Using NodeLocks (wake_compute)"); + let locks = Box::leak(Box::new(control_plane::locks::ApiLocks::new( + "wake_compute_lock", + limiter, + shards, + timeout, + epoch, + &Metrics::get().wake_compute_lock, + ))); + tokio::spawn(locks.garbage_collect_worker()); + + let url: crate::url::ApiUrl = args.auth_endpoint.parse()?; + + let endpoint = http::Endpoint::new(url, http::new_client()); + + let mut wake_compute_rps_limit = args.wake_compute_limit.clone(); + RateBucketInfo::validate(&mut wake_compute_rps_limit)?; + let wake_compute_endpoint_rate_limiter = + Arc::new(WakeComputeRateLimiter::new(wake_compute_rps_limit)); + + let api = control_plane::client::cplane_proxy_v1::NeonControlPlaneClient::new( + endpoint, + args.control_plane_token.clone(), + caches, + locks, + wake_compute_endpoint_rate_limiter, + ); + + let api = control_plane::client::ControlPlaneClient::ProxyV1(api); + let auth_backend = auth::Backend::ControlPlane(MaybeOwned::Owned(api), ()); + let config = Box::leak(Box::new(auth_backend)); + + Ok(Either::Left(config)) + } + + #[cfg(any(test, feature = "testing"))] + AuthBackendType::Postgres => { + let url = args.auth_endpoint.parse()?; + let api = control_plane::client::mock::MockControlPlane::new( + url, + !args.is_private_access_proxy, + ); + let api = control_plane::client::ControlPlaneClient::PostgresMock(api); + + let auth_backend = auth::Backend::ControlPlane(MaybeOwned::Owned(api), ()); + + let config = Box::leak(Box::new(auth_backend)); + + Ok(Either::Left(config)) + } + + AuthBackendType::ConsoleRedirect => { + let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; + let project_info_cache_config: ProjectInfoCacheOptions = + args.project_info_cache.parse()?; + let endpoint_cache_config: config::EndpointCacheConfig = + args.endpoint_cache_config.parse()?; + + info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}"); + info!( + "Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}" + ); + info!("Using EndpointCacheConfig with options={endpoint_cache_config:?}"); + let caches = Box::leak(Box::new(control_plane::caches::ApiCaches::new( + 
wake_compute_cache_config, + project_info_cache_config, + endpoint_cache_config, + ))); + + let config::ConcurrencyLockOptions { + shards, + limiter, + epoch, + timeout, + } = args.wake_compute_lock.parse()?; + info!(?limiter, shards, ?epoch, "Using NodeLocks (wake_compute)"); + let locks = Box::leak(Box::new(control_plane::locks::ApiLocks::new( + "wake_compute_lock", + limiter, + shards, + timeout, + epoch, + &Metrics::get().wake_compute_lock, + ))); + + let url = args.uri.clone().parse()?; + let ep_url: crate::url::ApiUrl = args.auth_endpoint.parse()?; + let endpoint = http::Endpoint::new(ep_url, http::new_client()); + let mut wake_compute_rps_limit = args.wake_compute_limit.clone(); + RateBucketInfo::validate(&mut wake_compute_rps_limit)?; + let wake_compute_endpoint_rate_limiter = + Arc::new(WakeComputeRateLimiter::new(wake_compute_rps_limit)); + + // Since we use only get_allowed_ips_and_secret() wake_compute_endpoint_rate_limiter + // and locks are not used in ConsoleRedirectBackend, + // but they are required by the NeonControlPlaneClient + let api = control_plane::client::cplane_proxy_v1::NeonControlPlaneClient::new( + endpoint, + args.control_plane_token.clone(), + caches, + locks, + wake_compute_endpoint_rate_limiter, + ); + + let backend = ConsoleRedirectBackend::new(url, api); + let config = Box::leak(Box::new(backend)); + + Ok(Either::Right(config)) + } + } +} + +#[cfg(test)] +mod tests { + use std::time::Duration; + + use clap::Parser; + + use crate::rate_limiter::RateBucketInfo; + + #[test] + fn parse_endpoint_rps_limit() { + let config = super::ProxyCliArgs::parse_from([ + "proxy", + "--endpoint-rps-limit", + "100@1s", + "--endpoint-rps-limit", + "20@30s", + ]); + + assert_eq!( + config.endpoint_rps_limit, + vec![ + RateBucketInfo::new(100, Duration::from_secs(1)), + RateBucketInfo::new(20, Duration::from_secs(30)), + ] + ); + } +} diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs index b5c42cd23d..8ec1a4648b 100644 --- a/proxy/src/cache/endpoints.rs +++ b/proxy/src/cache/endpoints.rs @@ -242,7 +242,7 @@ impl EndpointsCache { }); tracing::error!("error parsing value {value:?}: {err:?}"); } - }; + } } if total.is_power_of_two() { tracing::debug!("endpoints read {}", total); diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index 4d919f374a..1f9c8a48b7 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -69,17 +69,35 @@ pub async fn handle_cancel_messages( value, resp_tx, _guard, - expire: _, + expire, } => { + let res = client.hset(&key, field, value).await; if let Some(resp_tx) = resp_tx { - resp_tx - .send(client.hset(key, field, value).await) - .inspect_err(|e| { - tracing::debug!("failed to send StoreCancelKey response: {:?}", e); - }) - .ok(); + if res.is_ok() { + resp_tx + .send(client.expire(key, expire).await) + .inspect_err(|e| { + tracing::debug!( + "failed to send StoreCancelKey response: {:?}", + e + ); + }) + .ok(); + } else { + resp_tx + .send(res) + .inspect_err(|e| { + tracing::debug!( + "failed to send StoreCancelKey response: {:?}", + e + ); + }) + .ok(); + } + } else if res.is_ok() { + drop(client.expire(key, expire).await); } else { - drop(client.hset(key, field, value).await); + tracing::warn!("failed to store cancel key: {:?}", res); } } CancelKeyOp::GetCancelData { @@ -436,7 +454,7 @@ impl Session { &self.key } - // Send the store key op to the cancellation handler + // Send the store key op to the cancellation handler and set TTL for the key pub(crate) async fn write_cancel_key( &self, 
cancel_closure: CancelClosure, @@ -483,7 +501,7 @@ impl Session { _guard: Metrics::get() .proxy .cancel_channel_size - .guard(RedisMsgKind::HSet), + .guard(RedisMsgKind::HDel), }; let _ = tx.send_timeout(op, REDIS_SEND_TIMEOUT).await.map_err(|e| { diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index d71465765f..5447a4a4c0 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -137,8 +137,8 @@ impl ConnCfg { match k { // Only set `user` if it's not present in the config. // Console redirect auth flow takes username from the console's response. - "user" if self.user_is_set() => continue, - "database" if self.db_is_set() => continue, + "user" if self.user_is_set() => {} + "database" if self.db_is_set() => {} "options" => { if let Some(options) = filtered_options(v) { self.set_param(k, &options); diff --git a/proxy/src/compute_ctl/mod.rs b/proxy/src/compute_ctl/mod.rs index 60fdf107d4..ab3179afb2 100644 --- a/proxy/src/compute_ctl/mod.rs +++ b/proxy/src/compute_ctl/mod.rs @@ -42,14 +42,14 @@ pub enum Privilege { #[derive(Error, Debug)] pub enum ComputeCtlError { #[error("connection error: {0}")] - ConnectionError(#[source] reqwest_middleware::Error), + Connection(#[source] reqwest_middleware::Error), #[error("request error [{status}]: {body:?}")] - RequestError { + Request { status: StatusCode, body: Option, }, #[error("response parsing error: {0}")] - ResponseError(#[source] reqwest::Error), + Response(#[source] reqwest::Error), } impl ComputeCtlApi { @@ -89,14 +89,14 @@ impl ComputeCtlApi { .json(req) .send() .await - .map_err(ComputeCtlError::ConnectionError)?; + .map_err(ComputeCtlError::Connection)?; let status = resp.status(); if status.is_client_error() || status.is_server_error() { let body = resp.json().await.ok(); - return Err(ComputeCtlError::RequestError { status, body }); + return Err(ComputeCtlError::Request { status, body }); } - resp.json().await.map_err(ComputeCtlError::ResponseError) + resp.json().await.map_err(ComputeCtlError::Response) } } diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 1dcd37712e..460e0cff54 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -151,7 +151,6 @@ impl FromStr for EndpointCacheConfig { } #[derive(Debug)] pub struct MetricBackupCollectionConfig { - pub interval: Duration, pub remote_storage_config: Option, pub chunk_size: usize, } diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs index c4548a7ddd..1044f5f8e2 100644 --- a/proxy/src/console_redirect_proxy.rs +++ b/proxy/src/console_redirect_proxy.rs @@ -82,7 +82,7 @@ pub async fn task_main( error!("per-client task finished with an error: failed to set socket option: {e:#}"); return; } - }; + } let ctx = RequestContext::new( session_id, diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index 4f1dd39d92..0537ae6a62 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -187,6 +187,10 @@ pub async fn worker( let rx = futures::stream::poll_fn(move |cx| rx.poll_recv(cx)); let rx = rx.map(RequestData::from); + let storage = GenericRemoteStorage::from_config(&remote_storage_config) + .await + .context("remote storage init")?; + let properties = WriterProperties::builder() .set_data_page_size_limit(config.parquet_upload_page_size) .set_compression(config.parquet_upload_compression); @@ -220,18 +224,18 @@ pub async fn worker( let rx_disconnect = futures::stream::poll_fn(move |cx| rx_disconnect.poll_recv(cx)); let rx_disconnect = rx_disconnect.map(RequestData::from); + let 
storage_disconnect = + GenericRemoteStorage::from_config(&disconnect_events_storage_config) + .await + .context("remote storage for disconnect events init")?; let parquet_config_disconnect = parquet_config.clone(); tokio::try_join!( - worker_inner(remote_storage_config, rx, parquet_config), - worker_inner( - disconnect_events_storage_config, - rx_disconnect, - parquet_config_disconnect - ) + worker_inner(storage, rx, parquet_config), + worker_inner(storage_disconnect, rx_disconnect, parquet_config_disconnect) ) .map(|_| ()) } else { - worker_inner(remote_storage_config, rx, parquet_config).await + worker_inner(storage, rx, parquet_config).await } } @@ -247,32 +251,18 @@ struct ParquetConfig { test_remote_failures: u64, } -impl ParquetConfig { - async fn storage( - &self, - storage_config: &RemoteStorageConfig, - ) -> anyhow::Result { - let storage = GenericRemoteStorage::from_config(storage_config) - .await - .context("remote storage init")?; - - #[cfg(any(test, feature = "testing"))] - if self.test_remote_failures > 0 { - return Ok(GenericRemoteStorage::unreliable_wrapper( - storage, - self.test_remote_failures, - )); - } - - Ok(storage) - } -} - async fn worker_inner( - storage_config: RemoteStorageConfig, + storage: GenericRemoteStorage, rx: impl Stream, config: ParquetConfig, ) -> anyhow::Result<()> { + #[cfg(any(test, feature = "testing"))] + let storage = if config.test_remote_failures > 0 { + GenericRemoteStorage::unreliable_wrapper(storage, config.test_remote_failures) + } else { + storage + }; + let mut rx = std::pin::pin!(rx); let mut rows = Vec::with_capacity(config.rows_per_group); @@ -295,7 +285,7 @@ async fn worker_inner( } if len > config.file_size || force { last_upload = time::Instant::now(); - let file = upload_parquet(w, len, &storage_config, &config).await?; + let file = upload_parquet(w, len, &storage).await?; w = SerializedFileWriter::new(file, schema.clone(), config.propeties.clone())?; len = 0; } @@ -308,7 +298,7 @@ async fn worker_inner( } if !w.flushed_row_groups().is_empty() { - let _rtchk: Writer = upload_parquet(w, len, &storage_config, &config).await?; + let _rtchk: Writer = upload_parquet(w, len, &storage).await?; } Ok(()) @@ -350,8 +340,7 @@ where async fn upload_parquet( mut w: SerializedFileWriter>, len: i64, - storage_config: &RemoteStorageConfig, - config: &ParquetConfig, + storage: &GenericRemoteStorage, ) -> anyhow::Result> { let len_uncompressed = w .flushed_row_groups() @@ -388,15 +377,6 @@ async fn upload_parquet( size, compression, "uploading request parquet file" ); - // A bug in azure-sdk means that the identity-token-file that expires after - // 1 hour is not refreshed. This identity-token is used to fetch the actual azure storage - // tokens that last for 24 hours. After this 24 hour period, azure-sdk tries to refresh - // the storage token, but the identity token has now expired. - // - // - // To work around this, we recreate the storage every time. 
- let storage = config.storage(storage_config).await?; - let year = now.year(); let month = now.month(); let day = now.day(); @@ -451,8 +431,8 @@ mod tests { use rand::rngs::StdRng; use rand::{Rng, SeedableRng}; use remote_storage::{ - RemoteStorageConfig, RemoteStorageKind, S3Config, DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, - DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT, + GenericRemoteStorage, RemoteStorageConfig, RemoteStorageKind, S3Config, + DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT, }; use tokio::sync::mpsc; use tokio::time; @@ -579,11 +559,12 @@ mod tests { timeout: std::time::Duration::from_secs(120), small_timeout: std::time::Duration::from_secs(30), }; - - worker_inner(remote_storage_config, rx, config) + let storage = GenericRemoteStorage::from_config(&remote_storage_config) .await .unwrap(); + worker_inner(storage, rx, config).await.unwrap(); + let mut files = WalkDir::new(tmpdir.as_std_path()) .into_iter() .filter_map(|entry| entry.ok()) diff --git a/proxy/src/control_plane/client/mod.rs b/proxy/src/control_plane/client/mod.rs index a06943726e..c28ff4789d 100644 --- a/proxy/src/control_plane/client/mod.rs +++ b/proxy/src/control_plane/client/mod.rs @@ -212,15 +212,15 @@ impl ApiLocks { timeout: Duration, epoch: std::time::Duration, metrics: &'static ApiLockMetrics, - ) -> prometheus::Result { - Ok(Self { + ) -> Self { + Self { name, node_locks: ClashMap::with_shard_amount(shards), config, timeout, epoch, metrics, - }) + } } pub(crate) async fn get_permit(&self, key: &K) -> Result { diff --git a/proxy/src/control_plane/messages.rs b/proxy/src/control_plane/messages.rs index 5883d02b92..8d6b2e96f5 100644 --- a/proxy/src/control_plane/messages.rs +++ b/proxy/src/control_plane/messages.rs @@ -361,7 +361,8 @@ pub struct EndpointJwksResponse { pub struct JwksSettings { pub id: String, pub jwks_url: url::Url, - pub provider_name: String, + #[serde(rename = "provider_name")] + pub _provider_name: String, pub jwt_audience: Option, pub role_names: Vec, } diff --git a/proxy/src/control_plane/mod.rs b/proxy/src/control_plane/mod.rs index f92e4f3f60..89ec4f9b33 100644 --- a/proxy/src/control_plane/mod.rs +++ b/proxy/src/control_plane/mod.rs @@ -19,8 +19,7 @@ use crate::cache::{Cached, TimedLru}; use crate::config::ComputeConfig; use crate::context::RequestContext; use crate::control_plane::messages::{ControlPlaneErrorMessage, MetricsAuxInfo}; -use crate::intern::AccountIdInt; -use crate::intern::ProjectIdInt; +use crate::intern::{AccountIdInt, ProjectIdInt}; use crate::types::{EndpointCacheKey, EndpointId}; use crate::{compute, scram}; diff --git a/proxy/src/http/health_server.rs b/proxy/src/http/health_server.rs index 6ca091feb7..141f319567 100644 --- a/proxy/src/http/health_server.rs +++ b/proxy/src/http/health_server.rs @@ -3,16 +3,16 @@ use std::net::TcpListener; use std::sync::{Arc, Mutex}; use anyhow::{anyhow, bail}; +use http_utils::endpoint::{self, request_span}; +use http_utils::error::ApiError; +use http_utils::json::json_response; +use http_utils::{RouterBuilder, RouterService}; use hyper0::header::CONTENT_TYPE; use hyper0::{Body, Request, Response, StatusCode}; use measured::text::BufferedTextEncoder; use measured::MetricGroup; use metrics::NeonMetrics; use tracing::{info, info_span}; -use utils::http::endpoint::{self, request_span}; -use utils::http::error::ApiError; -use utils::http::json::json_response; -use utils::http::{RouterBuilder, RouterService}; use crate::ext::{LockExt, TaskExt}; use crate::jemalloc; diff --git a/proxy/src/lib.rs 
b/proxy/src/lib.rs index c56474edd7..a9e5fbc85b 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -72,34 +72,36 @@ // List of temporarily allowed lints to unblock beta/nightly. #![allow(unknown_lints)] -pub mod auth; -pub mod cache; -pub mod cancellation; -pub mod compute; -pub mod compute_ctl; -pub mod config; -pub mod console_redirect_proxy; -pub mod context; -pub mod control_plane; -pub mod error; +pub mod binary; + +mod auth; +mod cache; +mod cancellation; +mod compute; +mod compute_ctl; +mod config; +mod console_redirect_proxy; +mod context; +mod control_plane; +mod error; mod ext; -pub mod http; -pub mod intern; -pub mod jemalloc; -pub mod logging; -pub mod metrics; -pub mod parse; -pub mod protocol2; -pub mod proxy; -pub mod rate_limiter; -pub mod redis; -pub mod sasl; -pub mod scram; -pub mod serverless; -pub mod signals; -pub mod stream; -pub mod tls; -pub mod types; -pub mod url; -pub mod usage_metrics; -pub mod waiters; +mod http; +mod intern; +mod jemalloc; +mod logging; +mod metrics; +mod parse; +mod protocol2; +mod proxy; +mod rate_limiter; +mod redis; +mod sasl; +mod scram; +mod serverless; +mod signals; +mod stream; +mod tls; +mod types; +mod url; +mod usage_metrics; +mod waiters; diff --git a/proxy/src/logging.rs b/proxy/src/logging.rs index 97c9f5a59c..fbd4811b54 100644 --- a/proxy/src/logging.rs +++ b/proxy/src/logging.rs @@ -7,9 +7,8 @@ use chrono::{DateTime, Utc}; use opentelemetry::trace::TraceContextExt; use scopeguard::defer; use serde::ser::{SerializeMap, Serializer}; -use tracing::span; use tracing::subscriber::Interest; -use tracing::{callsite, Event, Metadata, Span, Subscriber}; +use tracing::{callsite, span, Event, Metadata, Span, Subscriber}; use tracing_opentelemetry::OpenTelemetrySpanExt; use tracing_subscriber::filter::{EnvFilter, LevelFilter}; use tracing_subscriber::fmt::format::{Format, Full}; diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 25bcc81108..f3447e063e 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -205,7 +205,7 @@ pub enum Protocol { } impl Protocol { - pub fn as_str(&self) -> &'static str { + pub fn as_str(self) -> &'static str { match self { Protocol::Http => "http", Protocol::Ws => "ws", @@ -385,6 +385,7 @@ pub enum Waiting { #[derive(FixedCardinalityLabel, Copy, Clone)] #[label(singleton = "kind")] +#[allow(clippy::enum_variant_names)] pub enum RedisMsgKind { HSet, HSetMultiple, diff --git a/proxy/src/protocol2.rs b/proxy/src/protocol2.rs index 0dc97b7097..74a15d9bf4 100644 --- a/proxy/src/protocol2.rs +++ b/proxy/src/protocol2.rs @@ -119,7 +119,7 @@ pub(crate) async fn read_proxy_protocol( // if no more bytes available then exit if bytes_read == 0 { return Ok((ChainRW { inner: read, buf }, ConnectHeader::Missing)); - }; + } // check if we have enough bytes to continue if let Some(header) = buf.try_get::() { @@ -169,7 +169,7 @@ fn process_proxy_payload( header.version_and_command ), )), - }; + } let size_err = "invalid proxy protocol length. 
payload not large enough to fit requested IP addresses"; diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index dd145e6bb2..26fb1754bf 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -198,7 +198,7 @@ where warn!(error = ?e, num_retries, retriable = true, COULD_NOT_CONNECT); } - }; + } let wait_duration = retry_after(num_retries, compute.retry); num_retries += 1; diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index 8a407c8119..2a406fcb34 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -118,7 +118,7 @@ pub async fn task_main( error!("per-client task finished with an error: failed to set socket option: {e:#}"); return; } - }; + } let ctx = RequestContext::new( session_id, diff --git a/proxy/src/redis/cancellation_publisher.rs b/proxy/src/redis/cancellation_publisher.rs index 30d8b83e60..186fece4b2 100644 --- a/proxy/src/redis/cancellation_publisher.rs +++ b/proxy/src/redis/cancellation_publisher.rs @@ -5,9 +5,6 @@ use pq_proto::CancelKeyData; use tokio::sync::Mutex; use uuid::Uuid; -use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider; -use crate::rate_limiter::{GlobalRateLimiter, RateBucketInfo}; - pub trait CancellationPublisherMut: Send + Sync + 'static { #[allow(async_fn_in_trait)] async fn try_publish( @@ -79,36 +76,3 @@ impl CancellationPublisher for Arc> { .await } } - -pub struct RedisPublisherClient { - #[allow(dead_code)] - client: ConnectionWithCredentialsProvider, - _region_id: String, - _limiter: GlobalRateLimiter, -} - -impl RedisPublisherClient { - pub fn new( - client: ConnectionWithCredentialsProvider, - region_id: String, - info: &'static [RateBucketInfo], - ) -> anyhow::Result { - Ok(Self { - client, - _region_id: region_id, - _limiter: GlobalRateLimiter::new(info.into()), - }) - } - - #[allow(dead_code)] - pub(crate) async fn try_connect(&mut self) -> anyhow::Result<()> { - match self.client.connect().await { - Ok(()) => {} - Err(e) => { - tracing::error!("failed to connect to redis: {e}"); - return Err(e); - } - } - Ok(()) - } -} diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index 1a7024588a..5f9f2509e2 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -169,7 +169,7 @@ impl MessageHandler { }); tracing::error!("broken message: {e}"); } - }; + } return Ok(()); } Ok(msg) => msg, @@ -180,7 +180,7 @@ impl MessageHandler { match serde_json::from_str::(&payload) { Ok(header) => tracing::error!(topic = header.topic, "broken message: {e}"), Err(_) => tracing::error!("broken message: {e}"), - }; + } return Ok(()); } }; diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 0fb4a8a6cc..f35c375ba2 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -372,7 +372,7 @@ impl PoolingBackend { debug!("setting up backend session state"); // initiates the auth session - if let Err(e) = client.execute("select auth.init()", &[]).await { + if let Err(e) = client.batch_execute("select auth.init();").await { discard.discard(); return Err(e.into()); } @@ -400,9 +400,9 @@ fn create_random_jwk() -> (SigningKey, jose_jwk::Key) { pub(crate) enum HttpConnError { #[error("pooled connection closed at inconsistent state")] ConnectionClosedAbruptly(#[from] tokio::sync::watch::error::SendError), - #[error("could not connection to postgres in compute")] + #[error("could not connect to postgres in compute")] 
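// (Illustrative aside, not from this patch: with thiserror, `#[error("...")]`
// supplies the Display text for a variant, and `#[from]` also derives `From`,
// which is what lets `?` convert the source error. A minimal hypothetical enum:
//
//     #[derive(thiserror::Error, Debug)]
//     enum FetchError {
//         #[error("could not read spec file")]
//         Read(#[from] std::io::Error),
//     }
//
//     fn read_spec() -> Result<String, FetchError> {
//         Ok(std::fs::read_to_string("spec.json")?) // io::Error -> FetchError::Read
//     }
// )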
PostgresConnectionError(#[from] postgres_client::Error), - #[error("could not connection to local-proxy in compute")] + #[error("could not connect to local-proxy in compute")] LocalProxyConnectionError(#[from] LocalProxyConnError), #[error("could not parse JWT payload")] JwtPayloadError(serde_json::Error), @@ -651,7 +651,7 @@ e, ))); } - }; + } }; let (client, connection) = hyper::client::conn::http2::Builder::new(TokioExecutor::new()) diff --git a/proxy/src/serverless/http_util.rs b/proxy/src/serverless/http_util.rs index d5c948777c..95a28663a5 100644 --- a/proxy/src/serverless/http_util.rs +++ b/proxy/src/serverless/http_util.rs @@ -6,8 +6,8 @@ use bytes::Bytes; use http::{Response, StatusCode}; use http_body_util::combinators::BoxBody; use http_body_util::{BodyExt, Full}; +use http_utils::error::ApiError; use serde::Serialize; -use utils::http::error::ApiError; /// Like [`ApiError::into_response`] pub(crate) fn api_error_into_response(this: ApiError) -> Response<BoxBody<Bytes, hyper::Error>> { @@ -59,14 +59,14 @@ pub(crate) fn api_error_into_response(this: ApiError) -> Response Response> { Response::builder() .status(status) @@ -92,7 +92,7 @@ impl HttpErrorBody { } } -/// Same as [`utils::http::json::json_response`] +/// Same as [`http_utils::json::json_response`] pub(crate) fn json_response<T: Serialize>( status: StatusCode, data: T, diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs index fe33f0ff65..137a2d6377 100644 --- a/proxy/src/serverless/local_conn_pool.rs +++ b/proxy/src/serverless/local_conn_pool.rs @@ -23,7 +23,6 @@ use indexmap::IndexMap; use jose_jwk::jose_b64::base64ct::{Base64UrlUnpadded, Encoding}; use parking_lot::RwLock; use postgres_client::tls::NoTlsStream; -use postgres_client::types::ToSql; use postgres_client::AsyncMessage; use serde_json::value::RawValue; use tokio::net::TcpStream; @@ -280,14 +279,13 @@ impl ClientInnerCommon { local_data.jti += 1; let token = resign_jwt(&local_data.key, payload, local_data.jti)?; - // initiates the auth session + // discard all cannot run in a transaction. must be executed alone. self.inner.batch_execute("discard all").await?; - self.inner - .execute( - "select auth.jwt_session_init($1)", - &[&&*token as &(dyn ToSql + Sync)], - ) - .await?; + + // initiates the auth session + // this is safe from query injections as the jwt format is free of any escape characters. 
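// (Illustrative aside, not from this patch: a compact JWS token is three
// base64url segments joined by dots, so its alphabet is [A-Za-z0-9_-] plus '.',
// which cannot contain a single quote or backslash. A defensive shape-check one
// could add before interpolating, purely for illustration:
//
//     fn looks_like_compact_jws(t: &str) -> bool {
//         t.split('.').count() == 3
//             && t.bytes().all(|b| b.is_ascii_alphanumeric() || b"-_.".contains(&b))
//     }
// )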
+ let query = format!("select auth.jwt_session_init('{token}')"); + self.inner.batch_execute(&query).await?; let pid = self.inner.get_process_id(); info!(pid, jti = local_data.jti, "user session state init"); diff --git a/proxy/src/serverless/mod.rs b/proxy/src/serverless/mod.rs index 6888772362..8289500159 100644 --- a/proxy/src/serverless/mod.rs +++ b/proxy/src/serverless/mod.rs @@ -28,6 +28,7 @@ use futures::TryFutureExt; use http::{Method, Response, StatusCode}; use http_body_util::combinators::BoxBody; use http_body_util::{BodyExt, Empty}; +use http_utils::error::ApiError; use hyper::body::Incoming; use hyper_util::rt::TokioExecutor; use hyper_util::server::conn::auto::Builder; @@ -41,7 +42,6 @@ use tokio_rustls::TlsAcceptor; use tokio_util::sync::CancellationToken; use tokio_util::task::TaskTracker; use tracing::{info, warn, Instrument}; -use utils::http::error::ApiError; use crate::cancellation::CancellationHandler; use crate::config::{ProxyConfig, ProxyProtocolV2}; diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 3e42787a09..5982fe225d 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -8,21 +8,22 @@ use http::header::AUTHORIZATION; use http::Method; use http_body_util::combinators::BoxBody; use http_body_util::{BodyExt, Full}; +use http_utils::error::ApiError; use hyper::body::Incoming; use hyper::http::{HeaderName, HeaderValue}; use hyper::{header, HeaderMap, Request, Response, StatusCode}; +use indexmap::IndexMap; use postgres_client::error::{DbError, ErrorPosition, SqlState}; use postgres_client::{GenericClient, IsolationLevel, NoTls, ReadyForQueryStatus, Transaction}; use pq_proto::StartupMessageParamsBuilder; use serde::Serialize; +use serde_json::value::RawValue; use serde_json::Value; use tokio::time::{self, Instant}; use tokio_util::sync::CancellationToken; use tracing::{debug, error, info}; use typed_json::json; use url::Url; -use urlencoding; -use utils::http::error::ApiError; use uuid::Uuid; use super::backend::{LocalProxyConnError, PoolingBackend}; @@ -249,6 +250,50 @@ pub(crate) async fn handle( let mut response = match result { Ok(r) => { ctx.set_success(); + + // Handling the error response from local proxy here + if config.authentication_config.is_auth_broker && r.status().is_server_error() { + let status = r.status(); + + let body_bytes = r + .collect() + .await + .map_err(|e| { + ApiError::InternalServerError(anyhow::Error::msg(format!( + "could not collect http body: {e}" + ))) + })? 
+ .to_bytes(); + + if let Ok(mut json_map) = + serde_json::from_slice::>(&body_bytes) + { + let message = json_map.get("message"); + if let Some(message) = message { + let msg: String = match serde_json::from_str(message.get()) { + Ok(msg) => msg, + Err(_) => { + "Unable to parse the response message from server".to_string() + } + }; + + error!("Error response from local_proxy: {status} {msg}"); + + json_map.retain(|key, _| !key.starts_with("neon:")); // remove all the neon-related keys + + let resp_json = serde_json::to_string(&json_map) + .unwrap_or("failed to serialize the response message".to_string()); + + return json_response(status, resp_json); + } + } + + error!("Unable to parse the response message from local_proxy"); + return json_response( + status, + json!({ "message": "Unable to parse the response message from server".to_string() }), + ); + } r } Err(e @ SqlOverHttpError::Cancelled(_)) => { @@ -618,8 +663,6 @@ async fn handle_db_inner( let authenticate_and_connect = Box::pin( async { - let is_local_proxy = matches!(backend.auth_backend, crate::auth::Backend::Local(_)); - let keys = match auth { AuthData::Password(pw) => { backend @@ -634,7 +677,9 @@ async fn handle_db_inner( }; let client = match keys.keys { - ComputeCredentialKeys::JwtPayload(payload) if is_local_proxy => { + ComputeCredentialKeys::JwtPayload(payload) + if backend.auth_backend.is_local_proxy() => + { let mut client = backend.connect_to_local_postgres(ctx, conn_info).await?; let (cli_inner, _dsc) = client.client_inner(); cli_inner.set_jwt_session(&payload).await?; diff --git a/pyproject.toml b/pyproject.toml index e299c421e9..92a660c233 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,12 +17,12 @@ Jinja2 = "^3.1.5" types-requests = "^2.31.0.0" types-psycopg2 = "^2.9.21.20241019" boto3 = "^1.34.11" -boto3-stubs = {extras = ["s3"], version = "^1.26.16"} +boto3-stubs = {extras = ["s3", "kms"], version = "^1.26.16"} moto = {extras = ["server"], version = "^5.0.6"} backoff = "^2.2.1" pytest-lazy-fixture = "^0.6.3" prometheus-client = "^0.14.1" -pytest-timeout = "^2.1.0" +pytest-timeout = "^2.3.1" Werkzeug = "^3.0.6" pytest-order = "^1.1.0" allure-pytest = "^2.13.2" diff --git a/pytest.ini b/pytest.ini index 7197b078c6..237066b1f6 100644 --- a/pytest.ini +++ b/pytest.ini @@ -11,7 +11,7 @@ markers = testpaths = test_runner minversion = 6.0 -log_format = %(asctime)s.%(msecs)-3d %(levelname)s [%(filename)s:%(lineno)d] %(message)s +log_format = %(asctime)s.%(msecs)03d %(levelname)s [%(filename)s:%(lineno)d] %(message)s log_date_format = %Y-%m-%d %H:%M:%S log_cli = true timeout = 300 diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index 0eb511f1cc..d12ebc1030 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -63,6 +63,7 @@ sha2.workspace = true sd-notify.workspace = true storage_broker.workspace = true tokio-stream.workspace = true +http-utils.workspace = true utils.workspace = true wal_decoder.workspace = true env_logger.workspace = true diff --git a/safekeeper/client/Cargo.toml b/safekeeper/client/Cargo.toml index 6c5a52de3a..0b660aaf32 100644 --- a/safekeeper/client/Cargo.toml +++ b/safekeeper/client/Cargo.toml @@ -5,6 +5,7 @@ edition.workspace = true license.workspace = true [dependencies] +http-utils.workspace = true safekeeper_api.workspace = true thiserror.workspace = true reqwest = { workspace = true, features = [ "stream" ] } diff --git a/safekeeper/client/src/mgmt_api.rs b/safekeeper/client/src/mgmt_api.rs index f65bfaa6d5..40e5afc4aa 100644 --- 
a/safekeeper/client/src/mgmt_api.rs +++ b/safekeeper/client/src/mgmt_api.rs @@ -3,11 +3,14 @@ //! Partially copied from pageserver client; some parts might be better //! unified. +use http_utils::error::HttpErrorBody; use reqwest::{IntoUrl, Method, StatusCode}; -use safekeeper_api::models::{TimelineCreateRequest, TimelineStatus}; +use safekeeper_api::models::{ + PullTimelineRequest, PullTimelineResponse, SafekeeperUtilization, TimelineCreateRequest, + TimelineStatus, +}; use std::error::Error as _; use utils::{ - http::error::HttpErrorBody, id::{NodeId, TenantId, TimelineId}, logging::SecretString, }; @@ -32,6 +35,9 @@ pub enum Error { /// Status is not ok; parsed error in body as `HttpErrorBody`. #[error("safekeeper API: {1}")] ApiError(StatusCode, String), + + #[error("Cancelled")] + Cancelled, } pub type Result = std::result::Result; @@ -85,6 +91,12 @@ impl Client { resp.json().await.map_err(Error::ReceiveBody) } + pub async fn pull_timeline(&self, req: &PullTimelineRequest) -> Result { + let uri = format!("{}/v1/pull_timeline", self.mgmt_api_endpoint); + let resp = self.post(&uri, req).await?; + resp.json().await.map_err(Error::ReceiveBody) + } + pub async fn delete_timeline( &self, tenant_id: TenantId, @@ -124,9 +136,10 @@ impl Client { self.get(&uri).await } - pub async fn utilization(&self) -> Result { + pub async fn utilization(&self) -> Result { let uri = format!("{}/v1/utilization/", self.mgmt_api_endpoint); - self.get(&uri).await + let resp = self.get(&uri).await?; + resp.json().await.map_err(Error::ReceiveBody) + } async fn post( diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs index e92ca881e1..35aebfd8ad 100644 --- a/safekeeper/src/control_file.rs +++ b/safekeeper/src/control_file.rs @@ -235,7 +235,7 @@ impl Storage for FileStorage { #[cfg(test)] mod test { use super::*; - use safekeeper_api::membership::{Configuration, MemberSet}; + use safekeeper_api::membership::{Configuration, MemberSet, SafekeeperGeneration}; use tokio::fs; use utils::lsn::Lsn; @@ -246,7 +246,7 @@ mod test { let tempdir = camino_tempfile::tempdir()?; let mut state = TimelinePersistentState::empty(); state.mconf = Configuration { - generation: 42, + generation: SafekeeperGeneration::new(42), members: MemberSet::empty(), new_members: None, }; diff --git a/safekeeper/src/http/mod.rs b/safekeeper/src/http/mod.rs index d82a713f8a..6e160b7a5e 100644 --- a/safekeeper/src/http/mod.rs +++ b/safekeeper/src/http/mod.rs @@ -14,7 +14,7 @@ pub async fn task_main( let router = make_router(conf, global_timelines) .build() .map_err(|err| anyhow::anyhow!(err))?; - let service = utils::http::RouterService::new(router).unwrap(); + let service = http_utils::RouterService::new(router).unwrap(); let server = hyper::Server::from_tcp(http_listener)?; server.serve(service).await?; Ok(()) // unreachable diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 7ec08ecf9a..cd2ac5f44c 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -1,6 +1,8 @@ +use http_utils::failpoints::failpoints_handler; use hyper::{Body, Request, Response, StatusCode}; use safekeeper_api::models; use safekeeper_api::models::AcceptorStateStatus; +use safekeeper_api::models::PullTimelineRequest; use safekeeper_api::models::SafekeeperStatus; use safekeeper_api::models::TermSwitchApiEntry; use safekeeper_api::models::TimelineStatus; @@ -17,25 +19,23 @@ use tokio::task; use tokio_stream::wrappers::ReceiverStream; use tokio_util::sync::CancellationToken; use tracing::{info_span, 
Instrument}; -use utils::failpoint_support::failpoints_handler; -use utils::http::endpoint::{ + +use http_utils::endpoint::{ profile_cpu_handler, profile_heap_handler, prometheus_metrics_handler, request_span, - ChannelWriter, }; -use utils::http::request::parse_query_param; +use http_utils::{ + endpoint::{self, auth_middleware, check_permission_with, ChannelWriter}, + error::ApiError, + json::{json_request, json_response}, + request::{ensure_no_body, parse_query_param, parse_request_param}, + RequestExt, RouterBuilder, +}; use postgres_ffi::WAL_SEGMENT_SIZE; use safekeeper_api::models::{SkTimelineInfo, TimelineCopyRequest}; use safekeeper_api::models::{TimelineCreateRequest, TimelineTermBumpRequest}; use utils::{ auth::SwappableJwtAuth, - http::{ - endpoint::{self, auth_middleware, check_permission_with}, - error::ApiError, - json::{json_request, json_response}, - request::{ensure_no_body, parse_request_param}, - RequestExt, RouterBuilder, - }, id::{TenantId, TenantTimelineId, TimelineId}, lsn::Lsn, }; @@ -231,7 +231,7 @@ async fn timeline_delete_handler(mut request: Request) -> Result) -> Result, ApiError> { check_permission(&request, None)?; - let data: pull_timeline::Request = json_request(&mut request).await?; + let data: PullTimelineRequest = json_request(&mut request).await?; let conf = get_conf(&request); let global_timelines = get_global_timelines(&request); @@ -627,7 +627,7 @@ pub fn make_router( failpoints_handler(r, cancel).await }) }) - .get("/v1/uzilization", |r| request_span(r, utilization_handler)) + .get("/v1/utilization", |r| request_span(r, utilization_handler)) .delete("/v1/tenant/:tenant_id", |r| { request_span(r, tenant_delete_handler) }) diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs index f2d8e4c85f..4827b73074 100644 --- a/safekeeper/src/pull_timeline.rs +++ b/safekeeper/src/pull_timeline.rs @@ -4,10 +4,13 @@ use camino::Utf8PathBuf; use chrono::{DateTime, Utc}; use futures::{SinkExt, StreamExt, TryStreamExt}; use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI}; -use safekeeper_api::{models::TimelineStatus, Term}; +use safekeeper_api::{ + models::{PullTimelineRequest, PullTimelineResponse, TimelineStatus}, + Term, +}; use safekeeper_client::mgmt_api; use safekeeper_client::mgmt_api::Client; -use serde::{Deserialize, Serialize}; +use serde::Deserialize; use std::{ cmp::min, io::{self, ErrorKind}, @@ -33,7 +36,7 @@ use crate::{ }; use utils::{ crashsafe::fsync_async_opt, - id::{NodeId, TenantId, TenantTimelineId, TimelineId}, + id::{NodeId, TenantTimelineId}, logging::SecretString, lsn::Lsn, pausable_failpoint, @@ -378,21 +381,6 @@ impl WalResidentTimeline { } } -/// pull_timeline request body. -#[derive(Debug, Deserialize)] -pub struct Request { - pub tenant_id: TenantId, - pub timeline_id: TimelineId, - pub http_hosts: Vec, -} - -#[derive(Debug, Serialize)] -pub struct Response { - // Donor safekeeper host - pub safekeeper_host: String, - // TODO: add more fields? -} - /// Response for debug dump request. #[derive(Debug, Deserialize)] pub struct DebugDumpResponse { @@ -405,10 +393,10 @@ pub struct DebugDumpResponse { /// Find the most advanced safekeeper and pull timeline from it. 
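/// A hedged sketch of the wire format implied by this diff (field names come
/// from PullTimelineRequest/PullTimelineResponse and the client's
/// pull_timeline; hosts, ports and IDs are illustrative):
///   POST /v1/pull_timeline
///   {"tenant_id": "<32-hex id>", "timeline_id": "<32-hex id>",
///    "http_hosts": ["http://sk-1:7676", "http://sk-2:7676"]}
/// and the response names the donor that was used:
///   {"safekeeper_host": "http://sk-2:7676"}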
pub async fn handle_request( - request: Request, + request: PullTimelineRequest, sk_auth_token: Option, global_timelines: Arc, -) -> Result { +) -> Result { let existing_tli = global_timelines.get(TenantTimelineId::new( request.tenant_id, request.timeline_id, @@ -460,7 +448,7 @@ async fn pull_timeline( host: String, sk_auth_token: Option, global_timelines: Arc, -) -> Result { +) -> Result { let ttid = TenantTimelineId::new(status.tenant_id, status.timeline_id); info!( "pulling timeline {} from safekeeper {}, commit_lsn={}, flush_lsn={}, term={}, epoch={}", @@ -535,7 +523,7 @@ async fn pull_timeline( .load_temp_timeline(ttid, &tli_dir_path, false) .await?; - Ok(Response { + Ok(PullTimelineResponse { safekeeper_host: host, }) } diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 45e19c31b6..f816f8459a 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -1004,7 +1004,7 @@ mod tests { use postgres_ffi::{XLogSegNo, WAL_SEGMENT_SIZE}; use safekeeper_api::{ - membership::{Configuration, MemberSet, SafekeeperId}, + membership::{Configuration, MemberSet, SafekeeperGeneration, SafekeeperId}, ServerInfo, }; @@ -1303,7 +1303,7 @@ mod tests { tenant_id, timeline_id, mconf: Configuration { - generation: 42, + generation: SafekeeperGeneration::new(42), members: MemberSet::new(vec![SafekeeperId { id: NodeId(1), host: "hehe.org".to_owned(), diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 5eb0bd7146..4341f13824 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -14,6 +14,7 @@ use tokio_util::sync::CancellationToken; use utils::id::TenantId; use utils::sync::gate::Gate; +use http_utils::error::ApiError; use std::cmp::max; use std::ops::{Deref, DerefMut}; use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; @@ -22,7 +23,6 @@ use std::time::Duration; use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard}; use tokio::{sync::watch, time::Instant}; use tracing::*; -use utils::http::error::ApiError; use utils::{ id::{NodeId, TenantTimelineId}, lsn::Lsn, @@ -592,6 +592,8 @@ impl Timeline { assert!(self.cancel.is_cancelled()); assert!(self.gate.close_complete()); + info!("deleting timeline {} from disk", self.ttid); + // Close associated FDs. Nobody will be able to touch timeline data once // it is cancelled, so WAL storage won't be opened again. shared_state.sk.close_wal_store(); diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs index 01c6aff6c3..1ff6a72bce 100644 --- a/safekeeper/src/timelines_global_map.rs +++ b/safekeeper/src/timelines_global_map.rs @@ -475,6 +475,8 @@ impl GlobalTimelines { info!("deleting timeline {}, only_local={}", ttid, only_local); timeline.shutdown().await; + info!("timeline {ttid} shut down for deletion"); + // Take a lock and finish the deletion holding this mutex. let mut shared_state = timeline.write_shared_state().await; diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 8517fa0344..2f6b91cf47 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -310,9 +310,12 @@ impl WalBackupTask { retry_attempt = 0; } Err(e) => { + // We might have managed to upload some segments even though + // a later one in the range failed, so log backup_lsn + // separately. 
error!( - "failed while offloading range {}-{}: {:?}", - backup_lsn, commit_lsn, e + "failed while offloading range {}-{}, backup_lsn {}: {:?}", + backup_lsn, commit_lsn, backup_lsn, e ); retry_attempt = retry_attempt.saturating_add(1); @@ -338,6 +341,13 @@ async fn backup_lsn_range( let start_lsn = *backup_lsn; let segments = get_segments(start_lsn, end_lsn, wal_seg_size); + info!( + "offloading segnos {:?} of range [{}-{})", + segments.iter().map(|&s| s.seg_no).collect::>(), + start_lsn, + end_lsn, + ); + // Pool of concurrent upload tasks. We use `FuturesOrdered` to // preserve order of uploads, and update `backup_lsn` only after // all previous uploads are finished. @@ -374,10 +384,10 @@ async fn backup_lsn_range( } info!( - "offloaded segnos {:?} up to {}, previous backup_lsn {}", + "offloaded segnos {:?} of range [{}-{})", segments.iter().map(|&s| s.seg_no).collect::>(), - end_lsn, start_lsn, + end_lsn, ); Ok(()) } diff --git a/safekeeper/src/wal_service.rs b/safekeeper/src/wal_service.rs index 1ebcb060e7..e5ccbb3230 100644 --- a/safekeeper/src/wal_service.rs +++ b/safekeeper/src/wal_service.rs @@ -13,6 +13,8 @@ use tokio_util::sync::CancellationToken; use tracing::*; use utils::{auth::Scope, measured_stream::MeasuredStream}; +use std::os::fd::AsRawFd; + use crate::metrics::TrafficMetrics; use crate::SafeKeeperConf; use crate::{handler::SafekeeperPostgresHandler, GlobalTimelines}; @@ -62,6 +64,7 @@ async fn handle_socket( global_timelines: Arc, ) -> Result<(), QueryError> { socket.set_nodelay(true)?; + let socket_fd = socket.as_raw_fd(); let peer_addr = socket.peer_addr()?; // Set timeout on reading from the socket. It prevents hanged up connection @@ -107,7 +110,7 @@ async fn handle_socket( auth_pair, global_timelines, ); - let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?; + let pgbackend = PostgresBackend::new_from_io(socket_fd, socket, peer_addr, auth_type, None)?; // libpq protocol between safekeeper and walproposer / pageserver // We don't use shutdown. 
pgbackend diff --git a/scripts/generate_image_maps.py b/scripts/generate_image_maps.py new file mode 100644 index 0000000000..a2f553d290 --- /dev/null +++ b/scripts/generate_image_maps.py @@ -0,0 +1,58 @@ +import itertools +import json +import os + +build_tag = os.environ["BUILD_TAG"] +branch = os.environ["BRANCH"] +dev_acr = os.environ["DEV_ACR"] +prod_acr = os.environ["PROD_ACR"] + +components = { + "neon": ["neon"], + "compute": [ + "compute-node-v14", + "compute-node-v15", + "compute-node-v16", + "compute-node-v17", + "vm-compute-node-v14", + "vm-compute-node-v15", + "vm-compute-node-v16", + "vm-compute-node-v17", + ], +} + +registries = { + "dev": [ + "docker.io/neondatabase", + "369495373322.dkr.ecr.eu-central-1.amazonaws.com", + f"{dev_acr}.azurecr.io/neondatabase", + ], + "prod": [ + "093970136003.dkr.ecr.eu-central-1.amazonaws.com", + f"{prod_acr}.azurecr.io/neondatabase", + ], +} + +outputs: dict[str, dict[str, list[str]]] = {} + +target_tags = [build_tag, "latest"] if branch == "main" else [build_tag] +target_stages = ["dev", "prod"] if branch.startswith("release") else ["dev"] + +for component_name, component_images in components.items(): + for stage in target_stages: + outputs[f"{component_name}-{stage}"] = dict( + [ + ( + f"docker.io/neondatabase/{component_image}:{build_tag}", + [ + f"{combo[0]}/{component_image}:{combo[1]}" + for combo in itertools.product(registries[stage], target_tags) + ], + ) + for component_image in component_images + ] + ) + +with open(os.environ["GITHUB_OUTPUT"], "a") as f: + for key, value in outputs.items(): + f.write(f"{key}={json.dumps(value)}\n") diff --git a/scripts/push_with_image_map.py b/scripts/push_with_image_map.py new file mode 100644 index 0000000000..c68f6ad407 --- /dev/null +++ b/scripts/push_with_image_map.py @@ -0,0 +1,22 @@ +import json +import os +import subprocess + +image_map = os.getenv("IMAGE_MAP") +if not image_map: + raise ValueError("IMAGE_MAP environment variable is not set") + +try: + parsed_image_map: dict[str, list[str]] = json.loads(image_map) +except json.JSONDecodeError as e: + raise ValueError("Failed to parse IMAGE_MAP as JSON") from e + +for source, targets in parsed_image_map.items(): + for target in targets: + cmd = ["docker", "buildx", "imagetools", "create", "-t", target, source] + print(f"Running: {' '.join(cmd)}") + result = subprocess.run(cmd, text=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + + if result.returncode != 0: + print(f"Error: {result.stdout}") + raise RuntimeError(f"Command failed: {' '.join(cmd)}") diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml index 63f43cdf62..a93bbdeaaf 100644 --- a/storage_controller/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -32,6 +32,9 @@ postgres_connection.workspace = true rand.workspace = true reqwest = { workspace = true, features = ["stream"] } routerify.workspace = true +safekeeper_api.workspace = true +safekeeper_client.workspace = true +regex.workspace = true rustls-native-certs.workspace = true serde.workspace = true serde_json.workspace = true @@ -55,6 +58,7 @@ diesel-async = { version = "0.5.2", features = ["postgres", "bb8", "async-connec diesel_migrations = { version = "2.2.0" } scoped-futures = "0.1.4" +http-utils = { path = "../libs/http-utils/" } utils = { path = "../libs/utils/" } metrics = { path = "../libs/metrics/" } control_plane = { path = "../control_plane" } diff --git a/storage_controller/src/heartbeater.rs b/storage_controller/src/heartbeater.rs index b7e66d33eb..6f110d3294 100644 --- 
a/storage_controller/src/heartbeater.rs +++ b/storage_controller/src/heartbeater.rs @@ -1,6 +1,10 @@ use futures::{stream::FuturesUnordered, StreamExt}; +use safekeeper_api::models::SafekeeperUtilization; +use safekeeper_client::mgmt_api; use std::{ collections::HashMap, + fmt::Debug, + future::Future, sync::Arc, time::{Duration, Instant}, }; @@ -9,15 +13,15 @@ use tokio_util::sync::CancellationToken; use pageserver_api::{controller_api::NodeAvailability, models::PageserverUtilization}; use thiserror::Error; -use utils::id::NodeId; +use utils::{id::NodeId, logging::SecretString}; -use crate::node::Node; +use crate::{node::Node, safekeeper::Safekeeper}; -struct HeartbeaterTask { - receiver: tokio::sync::mpsc::UnboundedReceiver, +struct HeartbeaterTask { + receiver: tokio::sync::mpsc::UnboundedReceiver>, cancel: CancellationToken, - state: HashMap, + state: HashMap, max_offline_interval: Duration, max_warming_up_interval: Duration, @@ -36,8 +40,17 @@ pub(crate) enum PageserverState { Offline, } +#[derive(Debug, Clone)] +pub(crate) enum SafekeeperState { + Available { + last_seen_at: Instant, + utilization: SafekeeperUtilization, + }, + Offline, +} + #[derive(Debug)] -pub(crate) struct AvailablityDeltas(pub Vec<(NodeId, PageserverState)>); +pub(crate) struct AvailablityDeltas(pub Vec<(NodeId, State)>); #[derive(Debug, Error)] pub(crate) enum HeartbeaterError { @@ -45,23 +58,28 @@ pub(crate) enum HeartbeaterError { Cancel, } -struct HeartbeatRequest { - pageservers: Arc>, - reply: tokio::sync::oneshot::Sender>, +struct HeartbeatRequest { + servers: Arc>, + reply: tokio::sync::oneshot::Sender, HeartbeaterError>>, } -pub(crate) struct Heartbeater { - sender: tokio::sync::mpsc::UnboundedSender, +pub(crate) struct Heartbeater { + sender: tokio::sync::mpsc::UnboundedSender>, } -impl Heartbeater { +#[allow(private_bounds)] +impl Heartbeater +where + HeartbeaterTask: HeartBeat, +{ pub(crate) fn new( jwt_token: Option, max_offline_interval: Duration, max_warming_up_interval: Duration, cancel: CancellationToken, ) -> Self { - let (sender, receiver) = tokio::sync::mpsc::unbounded_channel::(); + let (sender, receiver) = + tokio::sync::mpsc::unbounded_channel::>(); let mut heartbeater = HeartbeaterTask::new( receiver, jwt_token, @@ -76,12 +94,12 @@ impl Heartbeater { pub(crate) async fn heartbeat( &self, - pageservers: Arc>, - ) -> Result { + servers: Arc>, + ) -> Result, HeartbeaterError> { let (sender, receiver) = tokio::sync::oneshot::channel(); self.sender .send(HeartbeatRequest { - pageservers, + servers, reply: sender, }) .map_err(|_| HeartbeaterError::Cancel)?; @@ -93,9 +111,12 @@ impl Heartbeater { } } -impl HeartbeaterTask { +impl HeartbeaterTask +where + HeartbeaterTask: HeartBeat, +{ fn new( - receiver: tokio::sync::mpsc::UnboundedReceiver, + receiver: tokio::sync::mpsc::UnboundedReceiver>, jwt_token: Option, max_offline_interval: Duration, max_warming_up_interval: Duration, @@ -110,14 +131,13 @@ impl HeartbeaterTask { jwt_token, } } - async fn run(&mut self) { loop { tokio::select! 
{ request = self.receiver.recv() => { match request { Some(req) => { - let res = self.heartbeat(req.pageservers).await; + let res = self.heartbeat(req.servers).await; req.reply.send(res).unwrap(); }, None => { return; } @@ -127,11 +147,20 @@ impl HeartbeaterTask { } } } +} +pub(crate) trait HeartBeat { + fn heartbeat( + &mut self, + pageservers: Arc>, + ) -> impl Future, HeartbeaterError>> + Send; +} + +impl HeartBeat for HeartbeaterTask { async fn heartbeat( &mut self, pageservers: Arc>, - ) -> Result { + ) -> Result, HeartbeaterError> { let mut new_state = HashMap::new(); let mut heartbeat_futs = FuturesUnordered::new(); @@ -272,3 +301,121 @@ impl HeartbeaterTask { Ok(AvailablityDeltas(deltas)) } } + +impl HeartBeat for HeartbeaterTask { + async fn heartbeat( + &mut self, + safekeepers: Arc>, + ) -> Result, HeartbeaterError> { + let mut new_state = HashMap::new(); + + let mut heartbeat_futs = FuturesUnordered::new(); + for (node_id, sk) in &*safekeepers { + heartbeat_futs.push({ + let jwt_token = self + .jwt_token + .as_ref() + .map(|t| SecretString::from(t.to_owned())); + let cancel = self.cancel.clone(); + + async move { + let response = sk + .with_client_retries( + |client| async move { client.get_utilization().await }, + &jwt_token, + 3, + 3, + Duration::from_secs(1), + &cancel, + ) + .await; + + let status = match response { + Ok(utilization) => SafekeeperState::Available { + last_seen_at: Instant::now(), + utilization, + }, + Err(mgmt_api::Error::Cancelled) => { + // This indicates cancellation of the request. + // We ignore the node in this case. + return None; + } + Err(_) => SafekeeperState::Offline, + }; + + Some((*node_id, status)) + } + }); + + loop { + let maybe_status = tokio::select! { + next = heartbeat_futs.next() => { + match next { + Some(result) => result, + None => { break; } + } + }, + _ = self.cancel.cancelled() => { return Err(HeartbeaterError::Cancel); } + }; + + if let Some((node_id, status)) = maybe_status { + new_state.insert(node_id, status); + } + } + } + + let mut offline = 0; + for state in new_state.values() { + match state { + SafekeeperState::Offline { .. } => offline += 1, + SafekeeperState::Available { .. } => {} + } + } + + tracing::info!( + "Heartbeat round complete for {} safekeepers, {} offline", + new_state.len(), + offline + ); + + let mut deltas = Vec::new(); + let now = Instant::now(); + for (node_id, sk_state) in new_state.iter_mut() { + use std::collections::hash_map::Entry::*; + let entry = self.state.entry(*node_id); + + let mut needs_update = false; + match entry { + Occupied(ref occ) => match (occ.get(), &sk_state) { + (SafekeeperState::Offline, SafekeeperState::Offline) => {} + (SafekeeperState::Available { last_seen_at, .. }, SafekeeperState::Offline) => { + if now - *last_seen_at >= self.max_offline_interval { + deltas.push((*node_id, sk_state.clone())); + needs_update = true; + } + } + _ => { + deltas.push((*node_id, sk_state.clone())); + needs_update = true; + } + }, + Vacant(_) => { + // This is a new node; we have no previous state for it, so emit a delta. 
+ deltas.push((*node_id, sk_state.clone())); + } + } + + match entry { + Occupied(mut occ) if needs_update => { + (*occ.get_mut()) = sk_state.clone(); + } + Vacant(vac) => { + vac.insert(sk_state.clone()); + } + _ => {} + } + } + + Ok(AvailablityDeltas(deltas)) + } +} diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index ac890b008f..8994721267 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -8,6 +8,14 @@ use crate::reconciler::ReconcileError; use crate::service::{LeadershipStatus, Service, RECONCILE_TIMEOUT, STARTUP_RECONCILE_TIMEOUT}; use anyhow::Context; use futures::Future; +use http_utils::{ + endpoint::{self, auth_middleware, check_permission_with, request_span}, + error::ApiError, + failpoints::failpoints_handler, + json::{json_request, json_response}, + request::{must_get_query_param, parse_query_param, parse_request_param}, + RequestExt, RouterBuilder, +}; use hyper::header::CONTENT_TYPE; use hyper::{Body, Request, Response}; use hyper::{StatusCode, Uri}; @@ -29,20 +37,7 @@ use std::sync::Arc; use std::time::{Duration, Instant}; use tokio_util::sync::CancellationToken; use utils::auth::{Scope, SwappableJwtAuth}; -use utils::failpoint_support::failpoints_handler; -use utils::http::endpoint::{auth_middleware, check_permission_with, request_span}; -use utils::http::request::{must_get_query_param, parse_query_param, parse_request_param}; -use utils::id::{TenantId, TimelineId}; - -use utils::{ - http::{ - endpoint::{self}, - error::ApiError, - json::{json_request, json_response}, - RequestExt, RouterBuilder, - }, - id::NodeId, -}; +use utils::id::{NodeId, TenantId, TimelineId}; use pageserver_api::controller_api::{ NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, TenantPolicyRequest, @@ -521,6 +516,35 @@ async fn handle_tenant_timeline_block_unblock_gc( json_response(StatusCode::OK, ()) } +async fn handle_tenant_timeline_download_heatmap_layers( + service: Arc, + req: Request, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?; + + check_permissions(&req, Scope::PageServerApi)?; + + let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; + let concurrency: Option = parse_query_param(&req, "concurrency")?; + + service + .tenant_timeline_download_heatmap_layers(tenant_shard_id, timeline_id, concurrency) + .await?; + + json_response(StatusCode::OK, ()) +} + +// For metric labels where we would like to include the approximate path, but exclude high-cardinality fields like query parameters +// and tenant/timeline IDs. Since we are proxying to arbitrary paths, we don't have routing templates to +// compare to, so we can just filter out our well known ID format with regexes. +fn path_without_ids(path: &str) -> String { + static ID_REGEX: std::sync::OnceLock = std::sync::OnceLock::new(); + ID_REGEX + .get_or_init(|| regex::Regex::new(r"([0-9a-fA-F]{32}(-[0-9]{4})?|\?.*)").unwrap()) + .replace_all(path, "") + .to_string() +} + async fn handle_tenant_timeline_passthrough( service: Arc, req: Request, @@ -556,10 +580,7 @@ async fn handle_tenant_timeline_passthrough( .metrics_group .storage_controller_passthrough_request_latency; - // This is a bit awkward. We remove the param from the request - // and join the words by '_' to get a label for the request. 
- let just_path = path.replace(&tenant_shard_str, ""); - let path_label = just_path + let path_label = path_without_ids(&path) .split('/') .filter(|token| !token.is_empty()) .collect::>() @@ -2075,6 +2096,16 @@ pub fn make_router( ) }, ) + .post( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_heatmap_layers", + |r| { + tenant_service_handler( + r, + handle_tenant_timeline_download_heatmap_layers, + RequestName("v1_tenant_timeline_download_heatmap_layers"), + ) + }, + ) // Tenant detail GET passthrough to shard zero: .get("/v1/tenant/:tenant_id", |r| { tenant_service_handler( @@ -2094,3 +2125,16 @@ pub fn make_router( ) }) } + +#[cfg(test)] +mod test { + + use super::path_without_ids; + + #[test] + fn test_path_without_ids() { + assert_eq!(path_without_ids("/v1/tenant/1a2b3344556677881122334455667788/timeline/AA223344556677881122334455667788"), "/v1/tenant//timeline/"); + assert_eq!(path_without_ids("/v1/tenant/1a2b3344556677881122334455667788-0108/timeline/AA223344556677881122334455667788"), "/v1/tenant//timeline/"); + assert_eq!(path_without_ids("/v1/tenant/1a2b3344556677881122334455667788-0108/timeline/AA223344556677881122334455667788?parameter=foo"), "/v1/tenant//timeline/"); + } +} diff --git a/storage_controller/src/lib.rs b/storage_controller/src/lib.rs index f5823935e1..5f2c081927 100644 --- a/storage_controller/src/lib.rs +++ b/storage_controller/src/lib.rs @@ -17,6 +17,8 @@ mod pageserver_client; mod peer_client; pub mod persistence; mod reconciler; +mod safekeeper; +mod safekeeper_client; mod scheduler; mod schema; pub mod service; diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 659c088d51..ea6bc38e89 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -12,7 +12,8 @@ use storage_controller::persistence::Persistence; use storage_controller::service::chaos_injector::ChaosInjector; use storage_controller::service::{ Config, Service, HEARTBEAT_INTERVAL_DEFAULT, LONG_RECONCILE_THRESHOLD_DEFAULT, - MAX_OFFLINE_INTERVAL_DEFAULT, MAX_WARMING_UP_INTERVAL_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT, + MAX_OFFLINE_INTERVAL_DEFAULT, MAX_WARMING_UP_INTERVAL_DEFAULT, + PRIORITY_RECONCILER_CONCURRENCY_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT, }; use tokio::signal::unix::SignalKind; use tokio_util::sync::CancellationToken; @@ -75,10 +76,14 @@ struct Cli { #[arg(long)] split_threshold: Option, - /// Maximum number of reconcilers that may run in parallel + /// Maximum number of normal-priority reconcilers that may run in parallel #[arg(long)] reconciler_concurrency: Option, + /// Maximum number of high-priority reconcilers that may run in parallel + #[arg(long)] + priority_reconciler_concurrency: Option, + /// How long to wait for the initial database connection to be available. 
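/// A hedged invocation sketch covering the flags above (kebab-case flag names
/// assume clap's default renaming; the numeric values are the defaults named
/// in this diff):
///   storage_controller --reconciler-concurrency 128 \
///       --priority-reconciler-concurrency 256 --db-connect-timeout 5s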
#[arg(long, default_value = "5s")] db_connect_timeout: humantime::Duration, @@ -289,6 +294,9 @@ async fn async_main() -> anyhow::Result<()> { reconciler_concurrency: args .reconciler_concurrency .unwrap_or(RECONCILER_CONCURRENCY_DEFAULT), + priority_reconciler_concurrency: args + .priority_reconciler_concurrency + .unwrap_or(PRIORITY_RECONCILER_CONCURRENCY_DEFAULT), split_threshold: args.split_threshold, neon_local_repo_dir: args.neon_local_repo_dir, max_secondary_lag_bytes: args.max_secondary_lag_bytes, @@ -320,7 +328,7 @@ async fn async_main() -> anyhow::Result<()> { let router = make_router(service.clone(), auth, build_info) .build() .map_err(|err| anyhow!(err))?; - let router_service = utils::http::RouterService::new(router).unwrap(); + let router_service = http_utils::RouterService::new(router).unwrap(); // Start HTTP server let server_shutdown = CancellationToken::new(); diff --git a/storage_controller/src/metrics.rs b/storage_controller/src/metrics.rs index 4164e3dc2b..6d67e0d130 100644 --- a/storage_controller/src/metrics.rs +++ b/storage_controller/src/metrics.rs @@ -80,6 +80,11 @@ pub(crate) struct StorageControllerMetricGroup { pub(crate) storage_controller_pageserver_request_error: measured::CounterVec, + /// Count of HTTP requests to the safekeeper that resulted in an error, + /// broken down by the safekeeper node id, request name and method + pub(crate) storage_controller_safekeeper_request_error: + measured::CounterVec, + /// Latency of HTTP requests to the pageserver, broken down by pageserver /// node id, request name and method. This includes both successful and unsuccessful /// requests. @@ -87,6 +92,13 @@ pub(crate) struct StorageControllerMetricGroup { pub(crate) storage_controller_pageserver_request_latency: measured::HistogramVec, + /// Latency of HTTP requests to the safekeeper, broken down by safekeeper + /// node id, request name and method. This includes both successful and unsuccessful + /// requests. 
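+ /// A worked sketch of the thresholds declared just below: with
+ /// exponential_buckets(0.1, 2.0) each upper bound doubles starting at 0.1s,
+ /// i.e. 0.1, 0.2, 0.4, 0.8, 1.6, ... seconds, up to the bucket count fixed
+ /// by the histogram type.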
+ #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))] + pub(crate) storage_controller_safekeeper_request_latency: + measured::HistogramVec, + /// Count of pass-through HTTP requests to the pageserver that resulted in an error, /// broken down by the pageserver node id, request name and method pub(crate) storage_controller_passthrough_request_error: diff --git a/storage_controller/src/pageserver_client.rs b/storage_controller/src/pageserver_client.rs index 141ff6f720..645cbdfce1 100644 --- a/storage_controller/src/pageserver_client.rs +++ b/storage_controller/src/pageserver_client.rs @@ -280,6 +280,22 @@ impl PageserverClient { ) } + pub(crate) async fn timeline_download_heatmap_layers( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + concurrency: Option, + ) -> Result<()> { + measured_request!( + "download_heatmap_layers", + crate::metrics::Method::Post, + &self.node_id_label, + self.inner + .timeline_download_heatmap_layers(tenant_shard_id, timeline_id, concurrency) + .await + ) + } + pub(crate) async fn get_utilization(&self) -> Result { measured_request!( "utilization", diff --git a/storage_controller/src/peer_client.rs b/storage_controller/src/peer_client.rs index ee4eb55294..1a15bae365 100644 --- a/storage_controller/src/peer_client.rs +++ b/storage_controller/src/peer_client.rs @@ -6,9 +6,10 @@ use std::error::Error as _; use std::time::Duration; use tokio_util::sync::CancellationToken; +use http_utils::error::HttpErrorBody; use hyper::Uri; use reqwest::{StatusCode, Url}; -use utils::{backoff, http::error::HttpErrorBody}; +use utils::backoff; #[derive(Debug, Clone)] pub(crate) struct PeerClient { diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index c4e5b39589..67b60eadf3 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -1185,23 +1185,6 @@ impl Persistence { Ok(safekeepers) } - pub(crate) async fn safekeeper_get( - &self, - id: i64, - ) -> Result { - use crate::schema::safekeepers::dsl::{id as id_column, safekeepers}; - self.with_conn(move |conn| { - Box::pin(async move { - Ok(safekeepers - .filter(id_column.eq(&id)) - .select(SafekeeperPersistence::as_select()) - .get_result(conn) - .await?) 
- }) - }) - .await - } - pub(crate) async fn safekeeper_upsert( &self, record: SafekeeperUpsert, @@ -1554,6 +1537,21 @@ pub(crate) struct SafekeeperPersistence { } impl SafekeeperPersistence { + pub(crate) fn from_upsert( + upsert: SafekeeperUpsert, + scheduling_policy: SkSchedulingPolicy, + ) -> Self { + crate::persistence::SafekeeperPersistence { + id: upsert.id, + region_id: upsert.region_id, + version: upsert.version, + host: upsert.host, + port: upsert.port, + http_port: upsert.http_port, + availability_zone_id: upsert.availability_zone_id, + scheduling_policy: String::from(scheduling_policy), + } + } pub(crate) fn as_describe_response(&self) -> Result { let scheduling_policy = SkSchedulingPolicy::from_str(&self.scheduling_policy).map_err(|e| { diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index 58bc0ba1cd..48f0804926 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -1,7 +1,7 @@ use crate::pageserver_client::PageserverClient; use crate::persistence::Persistence; use crate::{compute_hook, service}; -use pageserver_api::controller_api::{AvailabilityZone, PlacementPolicy}; +use pageserver_api::controller_api::{AvailabilityZone, MigrationConfig, PlacementPolicy}; use pageserver_api::models::{ LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig, TenantWaitLsnRequest, }; @@ -91,9 +91,10 @@ pub(crate) struct ReconcilerConfigBuilder { } impl ReconcilerConfigBuilder { - pub(crate) fn new() -> Self { + /// Priority is special: you must pick one thoughtfully, do not just use 'normal' as the default + pub(crate) fn new(priority: ReconcilerPriority) -> Self { Self { - config: ReconcilerConfig::default(), + config: ReconcilerConfig::new(priority), } } @@ -129,8 +130,18 @@ impl ReconcilerConfigBuilder { } } -#[derive(Default, Debug, Copy, Clone)] +// Higher priorities are used for user-facing tasks, so that a long backlog of housekeeping work (e.g. reconciling on startup, rescheduling +// things on node changes) does not starve user-facing tasks. +#[derive(Debug, Copy, Clone)] +pub(crate) enum ReconcilerPriority { + Normal, + High, +} + +#[derive(Debug, Copy, Clone)] pub(crate) struct ReconcilerConfig { + pub(crate) priority: ReconcilerPriority, + // During live migration give up on warming-up the secondary // after this timeout. secondary_warmup_timeout: Option, @@ -145,6 +156,18 @@ pub(crate) struct ReconcilerConfig { } impl ReconcilerConfig { + /// Configs are always constructed with an explicit priority, to force callers to think about whether + /// the operation they're scheduling is high-priority or not. Normal priority is not a safe default, because + /// scheduling something user-facing at normal priority can result in it getting starved out by background work. + pub(crate) fn new(priority: ReconcilerPriority) -> Self { + Self { + priority, + secondary_warmup_timeout: None, + secondary_download_request_timeout: None, + tenant_creation_hint: false, + } + } + pub(crate) fn get_secondary_warmup_timeout(&self) -> Duration { const SECONDARY_WARMUP_TIMEOUT_DEFAULT: Duration = Duration::from_secs(300); self.secondary_warmup_timeout @@ -162,6 +185,24 @@ impl ReconcilerConfig { } } +impl From<&MigrationConfig> for ReconcilerConfig { + fn from(value: &MigrationConfig) -> Self { + // Run reconciler at high priority because MigrationConfig comes from human requests that should + // be presumed urgent. 
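+ // A hedged sketch of the builder pattern this priority feeds, mirroring
+ // the tenant-creation call-site later in this diff:
+ //   let config = ReconcilerConfigBuilder::new(ReconcilerPriority::High)
+ //       .tenant_creation_hint(true)
+ //       .build();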
+ let mut builder = ReconcilerConfigBuilder::new(ReconcilerPriority::High); + + if let Some(timeout) = value.secondary_warmup_timeout { + builder = builder.secondary_warmup_timeout(timeout) + } + + if let Some(timeout) = value.secondary_download_request_timeout { + builder = builder.secondary_download_request_timeout(timeout) + } + + builder.build() + } +} + /// RAII resource units granted to a Reconciler, which it should keep alive until it finishes doing I/O pub(crate) struct ReconcileUnits { _sem_units: tokio::sync::OwnedSemaphorePermit, diff --git a/storage_controller/src/safekeeper.rs b/storage_controller/src/safekeeper.rs new file mode 100644 index 0000000000..be073d0cb9 --- /dev/null +++ b/storage_controller/src/safekeeper.rs @@ -0,0 +1,139 @@ +use std::{str::FromStr, time::Duration}; + +use pageserver_api::controller_api::{SafekeeperDescribeResponse, SkSchedulingPolicy}; +use reqwest::StatusCode; +use safekeeper_client::mgmt_api; +use tokio_util::sync::CancellationToken; +use utils::{backoff, id::NodeId, logging::SecretString}; + +use crate::{ + heartbeater::SafekeeperState, + persistence::{DatabaseError, SafekeeperPersistence}, + safekeeper_client::SafekeeperClient, +}; + +#[derive(Clone)] +pub struct Safekeeper { + pub(crate) skp: SafekeeperPersistence, + cancel: CancellationToken, + listen_http_addr: String, + listen_http_port: u16, + id: NodeId, + availability: SafekeeperState, +} + +impl Safekeeper { + pub(crate) fn from_persistence(skp: SafekeeperPersistence, cancel: CancellationToken) -> Self { + Self { + cancel, + listen_http_addr: skp.host.clone(), + listen_http_port: skp.http_port as u16, + id: NodeId(skp.id as u64), + skp, + availability: SafekeeperState::Offline, + } + } + pub(crate) fn base_url(&self) -> String { + format!("http://{}:{}", self.listen_http_addr, self.listen_http_port) + } + + pub(crate) fn get_id(&self) -> NodeId { + self.id + } + pub(crate) fn describe_response(&self) -> Result { + self.skp.as_describe_response() + } + pub(crate) fn set_availability(&mut self, availability: SafekeeperState) { + self.availability = availability; + } + /// Perform an operation (which is given a [`SafekeeperClient`]) with retries + pub(crate) async fn with_client_retries( + &self, + mut op: O, + jwt: &Option, + warn_threshold: u32, + max_retries: u32, + timeout: Duration, + cancel: &CancellationToken, + ) -> mgmt_api::Result + where + O: FnMut(SafekeeperClient) -> F, + F: std::future::Future>, + { + fn is_fatal(e: &mgmt_api::Error) -> bool { + use mgmt_api::Error::*; + match e { + ReceiveBody(_) | ReceiveErrorBody(_) => false, + ApiError(StatusCode::SERVICE_UNAVAILABLE, _) + | ApiError(StatusCode::GATEWAY_TIMEOUT, _) + | ApiError(StatusCode::REQUEST_TIMEOUT, _) => false, + ApiError(_, _) => true, + Cancelled => true, + } + } + + backoff::retry( + || { + let http_client = reqwest::ClientBuilder::new() + .timeout(timeout) + .build() + .expect("Failed to construct HTTP client"); + + let client = SafekeeperClient::from_client( + self.get_id(), + http_client, + self.base_url(), + jwt.clone(), + ); + + let node_cancel_fut = self.cancel.cancelled(); + + let op_fut = op(client); + + async { + tokio::select! 
{ + r = op_fut=> {r}, + _ = node_cancel_fut => { + Err(mgmt_api::Error::Cancelled) + }} + } + }, + is_fatal, + warn_threshold, + max_retries, + &format!( + "Call to node {} ({}:{}) management API", + self.id, self.listen_http_addr, self.listen_http_port + ), + cancel, + ) + .await + .unwrap_or(Err(mgmt_api::Error::Cancelled)) + } + + pub(crate) fn update_from_record(&mut self, record: crate::persistence::SafekeeperUpsert) { + let crate::persistence::SafekeeperUpsert { + active: _, + availability_zone_id: _, + host, + http_port, + id, + port: _, + region_id: _, + version: _, + } = record.clone(); + if id != self.id.0 as i64 { + // The way the function is called ensures this. If we regress on that, it's a bug. + panic!( + "id can't be changed via update_from_record function: {id} != {}", + self.id.0 + ); + } + self.skp = crate::persistence::SafekeeperPersistence::from_upsert( + record, + SkSchedulingPolicy::from_str(&self.skp.scheduling_policy).unwrap(), + ); + self.listen_http_port = http_port as u16; + self.listen_http_addr = host; + } +} diff --git a/storage_controller/src/safekeeper_client.rs b/storage_controller/src/safekeeper_client.rs new file mode 100644 index 0000000000..f234ab3429 --- /dev/null +++ b/storage_controller/src/safekeeper_client.rs @@ -0,0 +1,121 @@ +use crate::metrics::PageserverRequestLabelGroup; +use safekeeper_api::models::{ + PullTimelineRequest, PullTimelineResponse, SafekeeperUtilization, TimelineCreateRequest, + TimelineStatus, +}; +use safekeeper_client::mgmt_api::{Client, Result}; +use utils::{ + id::{NodeId, TenantId, TimelineId}, + logging::SecretString, +}; + +/// Thin wrapper around [`safekeeper_client::mgmt_api::Client`]. It allows the storage +/// controller to collect metrics in a non-intrusive manner. +/// +/// Analogous to [`crate::pageserver_client::PageserverClient`]. +#[derive(Debug, Clone)] +pub(crate) struct SafekeeperClient { + inner: Client, + node_id_label: String, +} + +macro_rules! 
measured_request { + ($name:literal, $method:expr, $node_id: expr, $invoke:expr) => {{ + let labels = PageserverRequestLabelGroup { + pageserver_id: $node_id, + path: $name, + method: $method, + }; + + let latency = &crate::metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_safekeeper_request_latency; + let _timer_guard = latency.start_timer(labels.clone()); + + let res = $invoke; + + if res.is_err() { + let error_counters = &crate::metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_pageserver_request_error; + error_counters.inc(labels) + } + + res + }}; +} + +impl SafekeeperClient { + #[allow(dead_code)] + pub(crate) fn new( + node_id: NodeId, + mgmt_api_endpoint: String, + jwt: Option, + ) -> Self { + Self { + inner: Client::from_client(reqwest::Client::new(), mgmt_api_endpoint, jwt), + node_id_label: node_id.0.to_string(), + } + } + + pub(crate) fn from_client( + node_id: NodeId, + raw_client: reqwest::Client, + mgmt_api_endpoint: String, + jwt: Option, + ) -> Self { + Self { + inner: Client::from_client(raw_client, mgmt_api_endpoint, jwt), + node_id_label: node_id.0.to_string(), + } + } + + #[allow(dead_code)] + pub(crate) async fn create_timeline( + &self, + req: &TimelineCreateRequest, + ) -> Result { + measured_request!( + "create_timeline", + crate::metrics::Method::Post, + &self.node_id_label, + self.inner.create_timeline(req).await + ) + } + + #[allow(dead_code)] + pub(crate) async fn delete_timeline( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> Result { + measured_request!( + "delete_timeline", + crate::metrics::Method::Delete, + &self.node_id_label, + self.inner.delete_timeline(tenant_id, timeline_id).await + ) + } + + #[allow(dead_code)] + pub(crate) async fn pull_timeline( + &self, + req: &PullTimelineRequest, + ) -> Result { + measured_request!( + "pull_timeline", + crate::metrics::Method::Post, + &self.node_id_label, + self.inner.pull_timeline(req).await + ) + } + + pub(crate) async fn get_utilization(&self) -> Result { + measured_request!( + "utilization", + crate::metrics::Method::Get, + &self.node_id_label, + self.inner.utilization().await + ) + } +} diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs index f9e72862ae..106a7b2699 100644 --- a/storage_controller/src/scheduler.rs +++ b/storage_controller/src/scheduler.rs @@ -1,9 +1,10 @@ use crate::{metrics::NodeLabelGroup, node::Node, tenant_shard::TenantShard}; +use http_utils::error::ApiError; use itertools::Itertools; use pageserver_api::{controller_api::AvailabilityZone, models::PageserverUtilization}; use serde::Serialize; use std::{collections::HashMap, fmt::Debug}; -use utils::{http::error::ApiError, id::NodeId}; +use utils::id::NodeId; /// Scenarios in which we cannot find a suitable location for a tenant shard #[derive(thiserror::Error, Debug)] diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 4028cd7023..5aa744f076 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -2,6 +2,7 @@ pub mod chaos_injector; mod context_iterator; use hyper::Uri; +use safekeeper_api::models::SafekeeperUtilization; use std::{ borrow::Cow, cmp::Ordering, @@ -20,6 +21,7 @@ use crate::{ }, compute_hook::{self, NotifyError}, drain_utils::{self, TenantShardDrain, TenantShardIterator}, + heartbeater::SafekeeperState, id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, TracingExclusiveGuard}, leadership::Leadership, metrics, @@ -28,7 +30,11 @@ use crate::{ 
AbortShardSplitStatus, ControllerPersistence, DatabaseResult, MetadataHealthPersistence, ShardGenerationState, TenantFilter, }, - reconciler::{ReconcileError, ReconcileUnits, ReconcilerConfig, ReconcilerConfigBuilder}, + reconciler::{ + ReconcileError, ReconcileUnits, ReconcilerConfig, ReconcilerConfigBuilder, + ReconcilerPriority, + }, + safekeeper::Safekeeper, scheduler::{MaySchedule, ScheduleContext, ScheduleError, ScheduleMode}, tenant_shard::{ MigrateAttachment, ObservedStateDelta, ReconcileNeeded, ReconcilerStatus, @@ -61,6 +67,7 @@ use reqwest::StatusCode; use tracing::{instrument, Instrument}; use crate::pageserver_client::PageserverClient; +use http_utils::error::ApiError; use pageserver_api::{ models::{ self, LocationConfig, LocationConfigListResponse, LocationConfigMode, @@ -75,13 +82,12 @@ use pageserver_api::{ }, }; use pageserver_client::{mgmt_api, BlockUnblock}; -use tokio::sync::mpsc::error::TrySendError; +use tokio::sync::{mpsc::error::TrySendError, TryAcquireError}; use tokio_util::sync::CancellationToken; use utils::{ completion::Barrier, failpoint_support, generation::Generation, - http::error::ApiError, id::{NodeId, TenantId, TimelineId}, pausable_failpoint, sync::gate::Gate, @@ -156,6 +162,7 @@ enum TenantOperations { TimelineDetachAncestor, TimelineGcBlockUnblock, DropDetached, + DownloadHeatmapLayers, } #[derive(Clone, strum_macros::Display)] @@ -192,6 +199,7 @@ pub(crate) enum LeadershipStatus { } pub const RECONCILER_CONCURRENCY_DEFAULT: usize = 128; +pub const PRIORITY_RECONCILER_CONCURRENCY_DEFAULT: usize = 256; // Depth of the channel used to enqueue shards for reconciliation when they can't do it immediately. // This channel is finite-size to avoid using excessive memory if we get into a state where reconciles are finishing more slowly @@ -206,6 +214,8 @@ struct ServiceState { nodes: Arc>, + safekeepers: Arc>, + scheduler: Scheduler, /// Ongoing background operation on the cluster if any is running. @@ -272,6 +282,7 @@ fn passthrough_api_error(node: &Node, e: mgmt_api::Error) -> ApiError { impl ServiceState { fn new( nodes: HashMap, + safekeepers: HashMap, tenants: BTreeMap, scheduler: Scheduler, delayed_reconcile_rx: tokio::sync::mpsc::Receiver, @@ -283,6 +294,7 @@ impl ServiceState { leadership_status: initial_leadership_status, tenants, nodes: Arc::new(nodes), + safekeepers: Arc::new(safekeepers), scheduler, ongoing_operation: None, delayed_reconcile_rx, @@ -299,6 +311,23 @@ impl ServiceState { (&mut self.nodes, &mut self.tenants, &mut self.scheduler) } + #[allow(clippy::type_complexity)] + fn parts_mut_sk( + &mut self, + ) -> ( + &mut Arc>, + &mut Arc>, + &mut BTreeMap, + &mut Scheduler, + ) { + ( + &mut self.nodes, + &mut self.safekeepers, + &mut self.tenants, + &mut self.scheduler, + ) + } + fn get_leadership_status(&self) -> LeadershipStatus { self.leadership_status } @@ -342,9 +371,12 @@ pub struct Config { /// and/or upon handling the re-attach request from a node. pub max_warming_up_interval: Duration, - /// How many Reconcilers may be spawned concurrently + /// How many normal-priority Reconcilers may be spawned concurrently pub reconciler_concurrency: usize, + /// How many high-priority Reconcilers may be spawned concurrently + pub priority_reconciler_concurrency: usize, + /// How large must a shard grow in bytes before we split it? /// None disables auto-splitting. 
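/// A hedged example: with a value of 68719476736 (64 GiB), a shard whose size
/// crosses 64 GiB becomes a candidate for an automatic split.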
pub split_threshold: Option, @@ -397,7 +429,8 @@ pub struct Service { compute_hook: Arc, result_tx: tokio::sync::mpsc::UnboundedSender, - heartbeater: Heartbeater, + heartbeater_ps: Heartbeater, + heartbeater_sk: Heartbeater, // Channel for background cleanup from failed operations that require cleanup, such as shard split abort_tx: tokio::sync::mpsc::UnboundedSender, @@ -411,9 +444,14 @@ pub struct Service { // that transition it to/from Active. node_op_locks: IdLockMap, - // Limit how many Reconcilers we will spawn concurrently + // Limit how many Reconcilers we will spawn concurrently for normal-priority tasks such as background reconciliations + // and reconciliation on startup. reconciler_concurrency: Arc, + // Limit how many Reconcilers we will spawn concurrently for high-priority tasks such as tenant/timeline CRUD, which + // a human user might be waiting for. + priority_reconciler_concurrency: Arc, + /// Queue of tenants who are waiting for concurrency limits to permit them to reconcile /// Send into this queue to promptly attempt to reconcile this shard next time units are available. /// @@ -607,7 +645,8 @@ impl Service { let locked = self.inner.read().unwrap(); locked.nodes.clone() }; - let mut nodes_online = self.initial_heartbeat_round(all_nodes.keys()).await; + let (mut nodes_online, mut sks_online) = + self.initial_heartbeat_round(all_nodes.keys()).await; // List of tenants for which we will attempt to notify compute of their location at startup let mut compute_notifications = Vec::new(); @@ -616,7 +655,7 @@ impl Service { tracing::info!("Populating tenant shards' states from initial pageserver scan..."); let shard_count = { let mut locked = self.inner.write().unwrap(); - let (nodes, tenants, scheduler) = locked.parts_mut(); + let (nodes, safekeepers, tenants, scheduler) = locked.parts_mut_sk(); // Mark nodes online if they responded to us: nodes are offline by default after a restart. let mut new_nodes = (**nodes).clone(); @@ -628,6 +667,17 @@ impl Service { } *nodes = Arc::new(new_nodes); + let mut new_sks = (**safekeepers).clone(); + for (node_id, node) in new_sks.iter_mut() { + if let Some((utilization, last_seen_at)) = sks_online.remove(node_id) { + node.set_availability(SafekeeperState::Available { + utilization, + last_seen_at, + }); + } + } + *safekeepers = Arc::new(new_sks); + for (tenant_shard_id, observed_state) in observed.0 { let Some(tenant_shard) = tenants.get_mut(&tenant_shard_id) else { for node_id in observed_state.locations.keys() { @@ -736,7 +786,10 @@ impl Service { async fn initial_heartbeat_round<'a>( &self, node_ids: impl Iterator, - ) -> HashMap { + ) -> ( + HashMap, + HashMap, + ) { assert!(!self.startup_complete.is_ready()); let all_nodes = { @@ -756,14 +809,20 @@ impl Service { } } + let all_sks = { + let locked = self.inner.read().unwrap(); + locked.safekeepers.clone() + }; + tracing::info!("Sending initial heartbeats..."); - let res = self - .heartbeater + let res_ps = self + .heartbeater_ps .heartbeat(Arc::new(nodes_to_heartbeat)) .await; + let res_sk = self.heartbeater_sk.heartbeat(all_sks).await; let mut online_nodes = HashMap::new(); - if let Ok(deltas) = res { + if let Ok(deltas) = res_ps { for (node_id, status) in deltas.0 { match status { PageserverState::Available { utilization, .. 
} => { @@ -777,7 +836,22 @@ impl Service { } } - online_nodes + let mut online_sks = HashMap::new(); + if let Ok(deltas) = res_sk { + for (node_id, status) in deltas.0 { + match status { + SafekeeperState::Available { + utilization, + last_seen_at, + } => { + online_sks.insert(node_id, (utilization, last_seen_at)); + } + SafekeeperState::Offline => {} + } + } + } + + (online_nodes, online_sks) } /// Used during [`Self::startup_reconcile`]: issue GETs to all nodes concurrently, with a deadline. @@ -984,8 +1058,14 @@ impl Service { locked.nodes.clone() }; - let res = self.heartbeater.heartbeat(nodes).await; - if let Ok(deltas) = res { + let safekeepers = { + let locked = self.inner.read().unwrap(); + locked.safekeepers.clone() + }; + + let res_ps = self.heartbeater_ps.heartbeat(nodes).await; + let res_sk = self.heartbeater_sk.heartbeat(safekeepers).await; + if let Ok(deltas) = res_ps { let mut to_handle = Vec::default(); for (node_id, state) in deltas.0 { @@ -1086,6 +1166,18 @@ impl Service { } } } + if let Ok(deltas) = res_sk { + let mut locked = self.inner.write().unwrap(); + let mut safekeepers = (*locked.safekeepers).clone(); + for (id, state) in deltas.0 { + let Some(sk) = safekeepers.get_mut(&id) else { + tracing::info!("Couldn't update safekeeper state for id {id} from heartbeat={state:?}"); + continue; + }; + sk.set_availability(state); + } + locked.safekeepers = Arc::new(safekeepers); + } } } @@ -1184,12 +1276,15 @@ impl Service { } // Maybe some other work can proceed now that this job finished. + // + // Only bother with this if we have some semaphore units available in the normal-priority semaphore (these + // reconciles are scheduled at `[ReconcilerPriority::Normal]`). if self.reconciler_concurrency.available_permits() > 0 { while let Ok(tenant_shard_id) = locked.delayed_reconcile_rx.try_recv() { let (nodes, tenants, _scheduler) = locked.parts_mut(); if let Some(shard) = tenants.get_mut(&tenant_shard_id) { shard.delayed_reconcile = false; - self.maybe_reconcile_shard(shard, nodes); + self.maybe_reconcile_shard(shard, nodes, ReconcilerPriority::Normal); } if self.reconciler_concurrency.available_permits() == 0 { @@ -1311,6 +1406,17 @@ impl Service { .storage_controller_pageserver_nodes .set(nodes.len() as i64); + tracing::info!("Loading safekeepers from database..."); + let safekeepers = persistence + .list_safekeepers() + .await?
+ .into_iter() + .map(|skp| Safekeeper::from_persistence(skp, CancellationToken::new())) + .collect::>(); + let safekeepers: HashMap = + safekeepers.into_iter().map(|n| (n.get_id(), n)).collect(); + tracing::info!("Loaded {} safekeepers from database.", safekeepers.len()); + tracing::info!("Loading shards from database..."); let mut tenant_shard_persistence = persistence.load_active_tenant_shards().await?; tracing::info!( @@ -1437,7 +1543,14 @@ impl Service { let cancel = CancellationToken::new(); let reconcilers_cancel = cancel.child_token(); - let heartbeater = Heartbeater::new( + let heartbeater_ps = Heartbeater::new( + config.jwt_token.clone(), + config.max_offline_interval, + config.max_warming_up_interval, + cancel.clone(), + ); + + let heartbeater_sk = Heartbeater::new( config.jwt_token.clone(), config.max_offline_interval, config.max_warming_up_interval, @@ -1453,6 +1566,7 @@ impl Service { let this = Arc::new(Self { inner: Arc::new(std::sync::RwLock::new(ServiceState::new( nodes, + safekeepers, tenants, scheduler, delayed_reconcile_rx, @@ -1462,10 +1576,14 @@ impl Service { persistence, compute_hook: Arc::new(ComputeHook::new(config.clone())), result_tx, - heartbeater, + heartbeater_ps, + heartbeater_sk, reconciler_concurrency: Arc::new(tokio::sync::Semaphore::new( config.reconciler_concurrency, )), + priority_reconciler_concurrency: Arc::new(tokio::sync::Semaphore::new( + config.priority_reconciler_concurrency, + )), delayed_reconcile_tx, abort_tx, startup_complete: startup_complete.clone(), @@ -2238,7 +2356,7 @@ impl Service { let waiters = { let mut locked = self.inner.write().unwrap(); let (nodes, tenants, _scheduler) = locked.parts_mut(); - let config = ReconcilerConfigBuilder::new() + let config = ReconcilerConfigBuilder::new(ReconcilerPriority::High) .tenant_creation_hint(true) .build(); tenants @@ -2713,7 +2831,8 @@ impl Service { shard.schedule(scheduler, &mut schedule_context)?; - let maybe_waiter = self.maybe_reconcile_shard(shard, nodes); + let maybe_waiter = + self.maybe_reconcile_shard(shard, nodes, ReconcilerPriority::High); if let Some(waiter) = maybe_waiter { waiters.push(waiter); } @@ -2834,7 +2953,9 @@ impl Service { let (nodes, tenants, _scheduler) = locked.parts_mut(); for (_shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) { shard.config = config.clone(); - if let Some(waiter) = self.maybe_reconcile_shard(shard, nodes) { + if let Some(waiter) = + self.maybe_reconcile_shard(shard, nodes, ReconcilerPriority::High) + { waiters.push(waiter); } } @@ -3116,7 +3237,9 @@ impl Service { debug_assert!(shard.intent.get_attached().is_none()); debug_assert!(shard.intent.get_secondary().is_empty()); - if let Some(waiter) = self.maybe_reconcile_shard(shard, nodes) { + if let Some(waiter) = + self.maybe_reconcile_shard(shard, nodes, ReconcilerPriority::High) + { detach_waiters.push(waiter); } } @@ -3268,7 +3391,7 @@ impl Service { // In case scheduling is being switched back on, try it now. 
shard.schedule(scheduler, &mut schedule_context).ok(); - self.maybe_reconcile_shard(shard, nodes); + self.maybe_reconcile_shard(shard, nodes, ReconcilerPriority::High); } Ok(()) @@ -3635,6 +3758,61 @@ impl Service { Ok(()) } + pub(crate) async fn tenant_timeline_download_heatmap_layers( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + concurrency: Option, + ) -> Result<(), ApiError> { + let _tenant_lock = trace_shared_lock( + &self.tenant_op_locks, + tenant_shard_id.tenant_id, + TenantOperations::DownloadHeatmapLayers, + ) + .await; + + let targets = { + let locked = self.inner.read().unwrap(); + let mut targets = Vec::new(); + + // If the request got an unsharded tenant id, then apply + // the operation to all shards. Otherwise, apply it to a specific shard. + let shards_range = if tenant_shard_id.is_unsharded() { + TenantShardId::tenant_range(tenant_shard_id.tenant_id) + } else { + tenant_shard_id.range() + }; + + for (tenant_shard_id, shard) in locked.tenants.range(shards_range) { + if let Some(node_id) = shard.intent.get_attached() { + let node = locked + .nodes + .get(node_id) + .expect("Pageservers may not be deleted while referenced"); + + targets.push((*tenant_shard_id, node.clone())); + } + } + targets + }; + + self.tenant_for_shards_api( + targets, + |tenant_shard_id, client| async move { + client + .timeline_download_heatmap_layers(tenant_shard_id, timeline_id, concurrency) + .await + }, + 1, + 1, + SHORT_RECONCILE_TIMEOUT, + &self.cancel, + ) + .await; + + Ok(()) + } + /// Helper for concurrently calling a pageserver API on a number of shards, such as timeline creation. /// /// On success, the returned vector contains exactly the same number of elements as the input `locations`. @@ -4317,7 +4495,7 @@ impl Service { tracing::warn!("Failed to schedule {tenant_shard_id} during shard abort: {e}") } - self.maybe_reconcile_shard(shard, nodes); + self.maybe_reconcile_shard(shard, nodes, ReconcilerPriority::High); } // We don't expect any new_shard_count shards to exist here, but drop them just in case @@ -4483,7 +4661,11 @@ impl Service { tracing::warn!("Failed to schedule child shard {child}: {e}"); } // In the background, attach secondary locations for the new shards - if let Some(waiter) = self.maybe_reconcile_shard(&mut child_state, nodes) { + if let Some(waiter) = self.maybe_reconcile_shard( + &mut child_state, + nodes, + ReconcilerPriority::High, + ) { waiters.push(waiter); } @@ -4848,7 +5030,9 @@ impl Service { shard.intent.clear_secondary(scheduler); // Run Reconciler to execute detach fo secondary locations. 
- if let Some(waiter) = self.maybe_reconcile_shard(shard, nodes) { + if let Some(waiter) = + self.maybe_reconcile_shard(shard, nodes, ReconcilerPriority::High) + { waiters.push(waiter); } } @@ -5114,7 +5298,12 @@ impl Service { shard.sequence = shard.sequence.next(); } - self.maybe_reconcile_shard(shard, nodes) + let reconciler_config = match migrate_req.migration_config { + Some(cfg) => (&cfg).into(), + None => ReconcilerConfig::new(ReconcilerPriority::High), + }; + + self.maybe_configured_reconcile_shard(shard, nodes, reconciler_config) }; if let Some(waiter) = waiter { @@ -5177,7 +5366,7 @@ impl Service { ); } - self.maybe_reconcile_shard(shard, nodes) + self.maybe_reconcile_shard(shard, nodes, ReconcilerPriority::High) }; if let Some(waiter) = waiter { @@ -5589,7 +5778,7 @@ impl Service { ) } - self.maybe_reconcile_shard(shard, nodes); + self.maybe_reconcile_shard(shard, nodes, ReconcilerPriority::Normal); } // Here we remove an existing observed location for the node we're removing, and it will @@ -5958,7 +6147,14 @@ impl Service { tracing::warn!(%tenant_shard_id, "Scheduling error when marking pageserver {} offline: {e}", node_id); } Ok(()) => { - if self.maybe_reconcile_shard(tenant_shard, nodes).is_some() { + if self + .maybe_reconcile_shard( + tenant_shard, + nodes, + ReconcilerPriority::Normal, + ) + .is_some() + { tenants_affected += 1; }; } @@ -5989,7 +6185,11 @@ impl Service { if let Some(observed_loc) = tenant_shard.observed.locations.get_mut(&node_id) { if observed_loc.conf.is_none() { - self.maybe_reconcile_shard(tenant_shard, nodes); + self.maybe_reconcile_shard( + tenant_shard, + nodes, + ReconcilerPriority::Normal, + ); } } } @@ -6353,8 +6553,36 @@ impl Service { &self, shard: &mut TenantShard, nodes: &Arc>, + priority: ReconcilerPriority, ) -> Option { - self.maybe_configured_reconcile_shard(shard, nodes, ReconcilerConfig::default()) + self.maybe_configured_reconcile_shard(shard, nodes, ReconcilerConfig::new(priority)) + } + + /// Before constructing a Reconciler, acquire semaphore units from the appropriate concurrency limit (depends on priority) + fn get_reconciler_units( + &self, + priority: ReconcilerPriority, + ) -> Result { + let units = match priority { + ReconcilerPriority::Normal => self.reconciler_concurrency.clone().try_acquire_owned(), + ReconcilerPriority::High => { + match self + .priority_reconciler_concurrency + .clone() + .try_acquire_owned() + { + Ok(u) => Ok(u), + Err(TryAcquireError::NoPermits) => { + // If the high priority semaphore is exhausted, then high priority tasks may steal units from + // the normal priority semaphore. 
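+ // (A stolen permit is still returned to the normal-priority semaphore when it is dropped: an owned tokio permit always releases back to the semaphore it was acquired from.)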
+ self.reconciler_concurrency.clone().try_acquire_owned() + } + Err(e) => Err(e), + } + } + }; + + units.map(ReconcileUnits::new) } /// Wrap [`TenantShard`] reconciliation methods with acquisition of [`Gate`] and [`ReconcileUnits`], @@ -6374,8 +6602,8 @@ impl Service { } }; - let units = match self.reconciler_concurrency.clone().try_acquire_owned() { - Ok(u) => ReconcileUnits::new(u), + let units = match self.get_reconciler_units(reconciler_config.priority) { + Ok(u) => u, Err(_) => { tracing::info!(tenant_id=%shard.tenant_shard_id.tenant_id, shard_id=%shard.tenant_shard_id.shard_slug(), "Concurrency limited: enqueued for reconcile later"); @@ -6468,7 +6696,10 @@ impl Service { // Eventual consistency: if an earlier reconcile job failed, and the shard is still // dirty, spawn another rone - if self.maybe_reconcile_shard(shard, &pageservers).is_some() { + if self + .maybe_reconcile_shard(shard, &pageservers, ReconcilerPriority::Normal) + .is_some() + { reconciles_spawned += 1; } else if shard.delayed_reconcile { // Shard wanted to reconcile but for some reason couldn't. @@ -6554,7 +6785,10 @@ impl Service { tracing::info!(tenant_shard_id=%tenant_shard_id, "Applying optimization: {optimization:?}"); if shard.apply_optimization(scheduler, optimization) { optimizations_applied += 1; - if self.maybe_reconcile_shard(shard, nodes).is_some() { + if self + .maybe_reconcile_shard(shard, nodes, ReconcilerPriority::Normal) + .is_some() + { reconciles_spawned += 1; } } @@ -7104,7 +7338,7 @@ impl Service { // to not stall the operation when a cold secondary is encountered. const SECONDARY_WARMUP_TIMEOUT: Duration = Duration::from_secs(20); const SECONDARY_DOWNLOAD_REQUEST_TIMEOUT: Duration = Duration::from_secs(5); - let reconciler_config = ReconcilerConfigBuilder::new() + let reconciler_config = ReconcilerConfigBuilder::new(ReconcilerPriority::Normal) .secondary_warmup_timeout(SECONDARY_WARMUP_TIMEOUT) .secondary_download_request_timeout(SECONDARY_DOWNLOAD_REQUEST_TIMEOUT) .build(); @@ -7437,7 +7671,7 @@ impl Service { ) -> Result<(), OperationError> { const SECONDARY_WARMUP_TIMEOUT: Duration = Duration::from_secs(20); const SECONDARY_DOWNLOAD_REQUEST_TIMEOUT: Duration = Duration::from_secs(5); - let reconciler_config = ReconcilerConfigBuilder::new() + let reconciler_config = ReconcilerConfigBuilder::new(ReconcilerPriority::Normal) .secondary_warmup_timeout(SECONDARY_WARMUP_TIMEOUT) .secondary_download_request_timeout(SECONDARY_DOWNLOAD_REQUEST_TIMEOUT) .build(); @@ -7661,29 +7895,54 @@ impl Service { pub(crate) async fn safekeepers_list( &self, ) -> Result, DatabaseError> { - self.persistence - .list_safekeepers() - .await? 
- .into_iter() - .map(|v| v.as_describe_response()) - .collect::, _>>() + let locked = self.inner.read().unwrap(); + let mut list = locked + .safekeepers + .iter() + .map(|sk| sk.1.describe_response()) + .collect::, _>>()?; + list.sort_by_key(|v| v.id); + Ok(list) } pub(crate) async fn get_safekeeper( &self, id: i64, ) -> Result { - self.persistence - .safekeeper_get(id) - .await - .and_then(|v| v.as_describe_response()) + let locked = self.inner.read().unwrap(); + let sk = locked + .safekeepers + .get(&NodeId(id as u64)) + .ok_or(diesel::result::Error::NotFound)?; + sk.describe_response() } pub(crate) async fn upsert_safekeeper( &self, record: crate::persistence::SafekeeperUpsert, ) -> Result<(), DatabaseError> { - self.persistence.safekeeper_upsert(record).await + let node_id = NodeId(record.id as u64); + self.persistence.safekeeper_upsert(record.clone()).await?; + { + let mut locked = self.inner.write().unwrap(); + let mut safekeepers = (*locked.safekeepers).clone(); + match safekeepers.entry(node_id) { + std::collections::hash_map::Entry::Occupied(mut entry) => { + entry.get_mut().update_from_record(record); + } + std::collections::hash_map::Entry::Vacant(entry) => { + entry.insert(Safekeeper::from_persistence( + crate::persistence::SafekeeperPersistence::from_upsert( + record, + SkSchedulingPolicy::Pause, + ), + CancellationToken::new(), + )); + } + } + locked.safekeepers = Arc::new(safekeepers); + } + Ok(()) } pub(crate) async fn set_safekeeper_scheduling_policy( @@ -7693,7 +7952,20 @@ impl Service { ) -> Result<(), DatabaseError> { self.persistence .set_safekeeper_scheduling_policy(id, scheduling_policy) - .await + .await?; + let node_id = NodeId(id as u64); + // After the change has been persisted successfully, update the in-memory state + { + let mut locked = self.inner.write().unwrap(); + let mut safekeepers = (*locked.safekeepers).clone(); + let sk = safekeepers + .get_mut(&node_id) + .ok_or(DatabaseError::Logical("Not found".to_string()))?; + sk.skp.scheduling_policy = String::from(scheduling_policy); + + locked.safekeepers = Arc::new(safekeepers); + } + Ok(()) } pub(crate) async fn update_shards_preferred_azs( diff --git a/storage_controller/src/service/chaos_injector.rs b/storage_controller/src/service/chaos_injector.rs index 91d7183fde..aa0ee0df5a 100644 --- a/storage_controller/src/service/chaos_injector.rs +++ b/storage_controller/src/service/chaos_injector.rs @@ -88,7 +88,11 @@ impl ChaosInjector { shard.intent.demote_attached(scheduler, old_location); shard.intent.promote_attached(scheduler, new_location); - self.service.maybe_reconcile_shard(shard, nodes); + self.service.maybe_reconcile_shard( + shard, + nodes, + crate::reconciler::ReconcilerPriority::Normal, + ); } async fn inject_chaos(&mut self) { diff --git a/test_runner/fixtures/compute_reconfigure.py b/test_runner/fixtures/compute_reconfigure.py index 33f01f80fb..425abef935 100644 --- a/test_runner/fixtures/compute_reconfigure.py +++ b/test_runner/fixtures/compute_reconfigure.py @@ -69,7 +69,10 @@ def compute_reconfigure_listener(make_httpserver: HTTPServer): # This causes the endpoint to query storage controller for its location, which # is redundant since we already have it here, but this avoids extending the # neon_local CLI to take full lists of locations - reconfigure_threads.submit(lambda workload=workload: workload.reconfigure()) # type: ignore[misc] + fut = reconfigure_threads.submit(lambda workload=workload: workload.reconfigure()) # type: ignore[misc] + + # To satisfy semantics of notify-attach API, we 
must wait for the change to be applied before returning 200 + fut.result() return Response(status=200) diff --git a/test_runner/fixtures/endpoint/http.py b/test_runner/fixtures/endpoint/http.py index 6e8210e978..cdc162fca2 100644 --- a/test_runner/fixtures/endpoint/http.py +++ b/test_runner/fixtures/endpoint/http.py @@ -9,21 +9,23 @@ from requests.adapters import HTTPAdapter class EndpointHttpClient(requests.Session): def __init__( self, - port: int, + external_port: int, + internal_port: int, ): super().__init__() - self.port = port + self.external_port: int = external_port + self.internal_port: int = internal_port self.mount("http://", HTTPAdapter()) def dbs_and_roles(self): - res = self.get(f"http://localhost:{self.port}/dbs_and_roles") + res = self.get(f"http://localhost:{self.external_port}/dbs_and_roles") res.raise_for_status() return res.json() def database_schema(self, database: str): res = self.get( - f"http://localhost:{self.port}/database_schema?database={urllib.parse.quote(database, safe='')}" + f"http://localhost:{self.external_port}/database_schema?database={urllib.parse.quote(database, safe='')}" ) res.raise_for_status() return res.text @@ -34,20 +36,20 @@ class EndpointHttpClient(requests.Session): "version": version, "database": database, } - res = self.post(f"http://localhost:{self.port}/extensions", json=body) + res = self.post(f"http://localhost:{self.internal_port}/extensions", json=body) res.raise_for_status() return res.json() def set_role_grants(self, database: str, role: str, schema: str, privileges: list[str]): res = self.post( - f"http://localhost:{self.port}/grants", + f"http://localhost:{self.internal_port}/grants", json={"database": database, "schema": schema, "role": role, "privileges": privileges}, ) res.raise_for_status() return res.json() def metrics(self) -> str: - res = self.get(f"http://localhost:{self.port}/metrics") + res = self.get(f"http://localhost:{self.external_port}/metrics") res.raise_for_status() return res.text @@ -62,5 +64,5 @@ class EndpointHttpClient(requests.Session): } ) - res = self.post(f"http://localhost:{self.port}/failpoints", json=body) + res = self.post(f"http://localhost:{self.internal_port}/failpoints", json=body) res.raise_for_status() diff --git a/test_runner/fixtures/fast_import.py b/test_runner/fixtures/fast_import.py index 33248132ab..d674be99de 100644 --- a/test_runner/fixtures/fast_import.py +++ b/test_runner/fixtures/fast_import.py @@ -4,8 +4,10 @@ import subprocess import tempfile from collections.abc import Iterator from pathlib import Path +from typing import cast import pytest +from _pytest.config import Config from fixtures.log_helper import log from fixtures.neon_cli import AbstractNeonCli @@ -23,6 +25,7 @@ class FastImport(AbstractNeonCli): pg_distrib_dir: Path, pg_version: PgVersion, workdir: Path, + cleanup: bool = True, ): if extra_env is None: env_vars = {} @@ -47,12 +50,43 @@ class FastImport(AbstractNeonCli): if not workdir.exists(): raise Exception(f"Working directory '{workdir}' does not exist") self.workdir = workdir + self.cleanup = cleanup + + def run_pgdata( + self, + s3prefix: str | None = None, + pg_port: int | None = None, + source_connection_string: str | None = None, + interactive: bool = False, + ): + return self.run( + "pgdata", + s3prefix=s3prefix, + pg_port=pg_port, + source_connection_string=source_connection_string, + interactive=interactive, + ) + + def run_dump_restore( + self, + s3prefix: str | None = None, + source_connection_string: str | None = None, + destination_connection_string: 
str | None = None, + ): + return self.run( + "dump-restore", + s3prefix=s3prefix, + source_connection_string=source_connection_string, + destination_connection_string=destination_connection_string, + ) def run( self, - pg_port: int, - source_connection_string: str | None = None, + command: str, s3prefix: str | None = None, + pg_port: int | None = None, + source_connection_string: str | None = None, + destination_connection_string: str | None = None, interactive: bool = False, ) -> subprocess.CompletedProcess[str]: if self.cmd is not None: @@ -60,13 +94,17 @@ class FastImport(AbstractNeonCli): args = [ f"--pg-bin-dir={self.pg_bin}", f"--pg-lib-dir={self.pg_lib}", - f"--pg-port={pg_port}", f"--working-directory={self.workdir}", ] - if source_connection_string is not None: - args.append(f"--source-connection-string={source_connection_string}") if s3prefix is not None: args.append(f"--s3-prefix={s3prefix}") + args.append(command) + if pg_port is not None: + args.append(f"--pg-port={pg_port}") + if source_connection_string is not None: + args.append(f"--source-connection-string={source_connection_string}") + if destination_connection_string is not None: + args.append(f"--destination-connection-string={destination_connection_string}") if interactive: args.append("--interactive") @@ -77,7 +115,7 @@ class FastImport(AbstractNeonCli): return self def __exit__(self, *args): - if self.workdir.exists(): + if self.workdir.exists() and self.cleanup: shutil.rmtree(self.workdir) @@ -87,9 +125,17 @@ def fast_import( test_output_dir: Path, neon_binpath: Path, pg_distrib_dir: Path, + pytestconfig: Config, ) -> Iterator[FastImport]: - workdir = Path(tempfile.mkdtemp()) - with FastImport(None, neon_binpath, pg_distrib_dir, pg_version, workdir) as fi: + workdir = Path(tempfile.mkdtemp(dir=test_output_dir, prefix="fast_import_")) + with FastImport( + None, + neon_binpath, + pg_distrib_dir, + pg_version, + workdir, + cleanup=not cast(bool, pytestconfig.getoption("--preserve-database-files")), + ) as fi: yield fi if fi.cmd is None: diff --git a/test_runner/fixtures/neon_cli.py b/test_runner/fixtures/neon_cli.py index 33d422c590..97a5a36814 100644 --- a/test_runner/fixtures/neon_cli.py +++ b/test_runner/fixtures/neon_cli.py @@ -478,7 +478,8 @@ class NeonLocalCli(AbstractNeonCli): self, branch_name: str, pg_port: int, - http_port: int, + external_http_port: int, + internal_http_port: int, tenant_id: TenantId, pg_version: PgVersion, endpoint_id: str | None = None, @@ -486,6 +487,7 @@ class NeonLocalCli(AbstractNeonCli): lsn: Lsn | None = None, pageserver_id: int | None = None, allow_multiple=False, + update_catalog: bool = False, ) -> subprocess.CompletedProcess[str]: args = [ "endpoint", @@ -501,8 +503,10 @@ class NeonLocalCli(AbstractNeonCli): args.extend(["--lsn", str(lsn)]) if pg_port is not None: args.extend(["--pg-port", str(pg_port)]) - if http_port is not None: - args.extend(["--http-port", str(http_port)]) + if external_http_port is not None: + args.extend(["--external-http-port", str(external_http_port)]) + if internal_http_port is not None: + args.extend(["--internal-http-port", str(internal_http_port)]) if endpoint_id is not None: args.append(endpoint_id) if hot_standby: @@ -511,6 +515,8 @@ class NeonLocalCli(AbstractNeonCli): args.extend(["--pageserver-id", str(pageserver_id)]) if allow_multiple: args.extend(["--allow-multiple"]) + if update_catalog: + args.extend(["--update-catalog"]) res = self.raw_cli(args) res.check_returncode() diff --git a/test_runner/fixtures/neon_fixtures.py 
b/test_runner/fixtures/neon_fixtures.py index 7c4991ffab..12b096a2a0 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -3,6 +3,7 @@ from __future__ import annotations import abc import asyncio import concurrent.futures +import dataclasses import filecmp import json import os @@ -26,6 +27,7 @@ from urllib.parse import quote, urlparse import asyncpg import backoff +import boto3 import httpx import psycopg2 import psycopg2.sql @@ -36,6 +38,8 @@ from _pytest.config import Config from _pytest.config.argparsing import Parser from _pytest.fixtures import FixtureRequest from jwcrypto import jwk +from mypy_boto3_kms import KMSClient +from mypy_boto3_s3 import S3Client # Type-related stuff from psycopg2.extensions import connection as PgConnection @@ -198,6 +202,30 @@ def mock_s3_server(port_distributor: PortDistributor) -> Iterator[MockS3Server]: mock_s3_server.kill() +@pytest.fixture(scope="session") +def mock_kms(mock_s3_server: MockS3Server) -> Iterator[KMSClient]: + yield boto3.client( + "kms", + endpoint_url=mock_s3_server.endpoint(), + region_name=mock_s3_server.region(), + aws_access_key_id=mock_s3_server.access_key(), + aws_secret_access_key=mock_s3_server.secret_key(), + aws_session_token=mock_s3_server.session_token(), + ) + + +@pytest.fixture(scope="session") +def mock_s3_client(mock_s3_server: MockS3Server) -> Iterator[S3Client]: + yield boto3.client( + "s3", + endpoint_url=mock_s3_server.endpoint(), + region_name=mock_s3_server.region(), + aws_access_key_id=mock_s3_server.access_key(), + aws_secret_access_key=mock_s3_server.secret_key(), + aws_session_token=mock_s3_server.session_token(), + ) + + class PgProtocol: """Reusable connection logic""" @@ -463,6 +491,7 @@ class NeonEnvBuilder: self.test_may_use_compatibility_snapshot_binaries = False self.version_combination = combination self.mixdir = self.test_output_dir / "mixdir_neon" + if self.version_combination is not None: assert ( self.compatibility_neon_binpath is not None @@ -674,6 +703,11 @@ class NeonEnvBuilder: def _mix_versions(self): assert self.version_combination is not None, "version combination must be set" + + # Always use a newer version of `neon_local` + (self.mixdir / "neon_local").hardlink_to(self.neon_binpath / "neon_local") + self.neon_local_binpath = self.mixdir + for component, paths in COMPONENT_BINARIES.items(): directory = ( self.neon_binpath @@ -682,10 +716,11 @@ class NeonEnvBuilder: ) for filename in paths: destination = self.mixdir / filename - destination.symlink_to(directory / filename) + destination.hardlink_to(directory / filename) + self.neon_binpath = self.mixdir + if self.version_combination["compute"] == "old": self.pg_distrib_dir = self.compatibility_pg_distrib_dir - self.neon_binpath = self.mixdir def overlay_mount(self, ident: str, srcdir: Path, dstdir: Path): """ @@ -1675,6 +1710,12 @@ class StorageControllerLeadershipStatus(StrEnum): CANDIDATE = "candidate" +@dataclass +class StorageControllerMigrationConfig: + secondary_warmup_timeout: str | None + secondary_download_request_timeout: str | None + + class NeonStorageController(MetricsGetter, LogUtils): def __init__(self, env: NeonEnv, port: int, auth_enabled: bool): self.env = env @@ -2068,11 +2109,20 @@ class NeonStorageController(MetricsGetter, LogUtils): shards: list[TenantShardId] = body["new_shards"] return shards - def tenant_shard_migrate(self, tenant_shard_id: TenantShardId, dest_ps_id: int): + def tenant_shard_migrate( + self, + tenant_shard_id: TenantShardId, + dest_ps_id: int, + config: 
StorageControllerMigrationConfig | None = None, + ): + payload = {"tenant_shard_id": str(tenant_shard_id), "node_id": dest_ps_id} + if config is not None: + payload["migration_config"] = dataclasses.asdict(config) + self.request( "PUT", f"{self.api}/control/v1/tenant/{tenant_shard_id}/migrate", - json={"tenant_shard_id": str(tenant_shard_id), "node_id": dest_ps_id}, + json=payload, headers=self.headers(TokenScope.ADMIN), ) log.info(f"Migrated tenant {tenant_shard_id} to pageserver {dest_ps_id}") @@ -2417,6 +2467,14 @@ class NeonStorageController(MetricsGetter, LogUtils): response.raise_for_status() return [TenantShardId.parse(tid) for tid in response.json()["updated"]] + def download_heatmap_layers(self, tenant_shard_id: TenantShardId, timeline_id: TimelineId): + response = self.request( + "POST", + f"{self.api}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/download_heatmap_layers", + headers=self.headers(TokenScope.ADMIN), + ) + response.raise_for_status() + def __enter__(self) -> Self: return self @@ -3345,7 +3403,7 @@ class NeonProxy(PgProtocol): metric_collection_interval: str | None = None, ): host = "127.0.0.1" - domain = "proxy.localtest.me" # resolves to 127.0.0.1 + domain = "proxy.local.neon.build" # resolves to 127.0.0.1 super().__init__(dsn=auth_backend.default_conn_url, host=domain, port=proxy_port) self.domain = domain @@ -3368,7 +3426,7 @@ class NeonProxy(PgProtocol): # generate key of it doesn't exist crt_path = self.test_output_dir / "proxy.crt" key_path = self.test_output_dir / "proxy.key" - generate_proxy_tls_certs("*.localtest.me", key_path, crt_path) + generate_proxy_tls_certs("*.local.neon.build", key_path, crt_path) args = [ str(self.neon_binpath / "proxy"), @@ -3569,7 +3627,7 @@ class NeonAuthBroker: external_http_port: int, auth_backend: NeonAuthBroker.ProxyV1, ): - self.domain = "apiauth.localtest.me" # resolves to 127.0.0.1 + self.domain = "apiauth.local.neon.build" # resolves to 127.0.0.1 self.host = "127.0.0.1" self.http_port = http_port self.external_http_port = external_http_port @@ -3586,7 +3644,7 @@ class NeonAuthBroker: # generate key of it doesn't exist crt_path = self.test_output_dir / "proxy.crt" key_path = self.test_output_dir / "proxy.key" - generate_proxy_tls_certs("apiauth.localtest.me", key_path, crt_path) + generate_proxy_tls_certs("apiauth.local.neon.build", key_path, crt_path) args = [ str(self.neon_binpath / "proxy"), @@ -3807,7 +3865,8 @@ class Endpoint(PgProtocol, LogUtils): env: NeonEnv, tenant_id: TenantId, pg_port: int, - http_port: int, + external_http_port: int, + internal_http_port: int, check_stop_result: bool = True, ): super().__init__(host="localhost", port=pg_port, user="cloud_admin", dbname="postgres") @@ -3817,7 +3876,8 @@ class Endpoint(PgProtocol, LogUtils): self.pgdata_dir: Path | None = None # Path to computenode PGDATA self.tenant_id = tenant_id self.pg_port = pg_port - self.http_port = http_port + self.external_http_port = external_http_port + self.internal_http_port = internal_http_port self.check_stop_result = check_stop_result # passed to endpoint create and endpoint reconfigure self.active_safekeepers: list[int] = list(map(lambda sk: sk.id, env.safekeepers)) @@ -3834,7 +3894,8 @@ class Endpoint(PgProtocol, LogUtils): self, auth_token: str | None = None, retries: Retry | None = None ) -> EndpointHttpClient: return EndpointHttpClient( - port=self.http_port, + external_port=self.external_http_port, + internal_port=self.internal_http_port, ) def create( @@ -3846,6 +3907,7 @@ class Endpoint(PgProtocol, LogUtils): 
config_lines: list[str] | None = None, pageserver_id: int | None = None, allow_multiple: bool = False, + update_catalog: bool = False, ) -> Self: """ Create a new Postgres endpoint. @@ -3866,10 +3928,12 @@ class Endpoint(PgProtocol, LogUtils): lsn=lsn, hot_standby=hot_standby, pg_port=self.pg_port, - http_port=self.http_port, + external_http_port=self.external_http_port, + internal_http_port=self.internal_http_port, pg_version=self.env.pg_version, pageserver_id=pageserver_id, allow_multiple=allow_multiple, + update_catalog=update_catalog, ) path = Path("endpoints") / self.endpoint_id / "pgdata" self.pgdata_dir = self.env.repo_dir / path @@ -4258,7 +4322,8 @@ class EndpointFactory: self.env, tenant_id=tenant_id or self.env.initial_tenant, pg_port=self.env.port_distributor.get_port(), - http_port=self.env.port_distributor.get_port(), + external_http_port=self.env.port_distributor.get_port(), + internal_http_port=self.env.port_distributor.get_port(), ) self.num_instances += 1 self.endpoints.append(ep) @@ -4283,12 +4348,14 @@ class EndpointFactory: hot_standby: bool = False, config_lines: list[str] | None = None, pageserver_id: int | None = None, + update_catalog: bool = False, ) -> Endpoint: ep = Endpoint( self.env, tenant_id=tenant_id or self.env.initial_tenant, pg_port=self.env.port_distributor.get_port(), - http_port=self.env.port_distributor.get_port(), + external_http_port=self.env.port_distributor.get_port(), + internal_http_port=self.env.port_distributor.get_port(), ) endpoint_id = endpoint_id or self.env.generate_endpoint_id() @@ -4303,6 +4370,7 @@ class EndpointFactory: hot_standby=hot_standby, config_lines=config_lines, pageserver_id=pageserver_id, + update_catalog=update_catalog, ) def stop_all(self, fail_on_error=True) -> Self: @@ -4962,8 +5030,13 @@ def check_restored_datadir_content( restored_files = list_files_to_compare(restored_dir_path) + # pg_notify files are always ignored + pgdata_files = [f for f in pgdata_files if not f.startswith("pg_notify")] + restored_files = [f for f in restored_files if not f.startswith("pg_notify")] + + # pg_xact and pg_multixact files are optional in basebackup: depending on our configuration they + # may be omitted and loaded on demand. if pgdata_files != restored_files: - # filter pg_xact and multixact files which are downloaded on demand pgdata_files = [ f for f in pgdata_files @@ -5122,12 +5195,14 @@ def wait_for_last_flush_lsn( timeline: TimelineId, pageserver_id: int | None = None, auth_token: str | None = None, + last_flush_lsn: Lsn | None = None, ) -> Lsn: """Wait for pageserver to catch up the latest flush LSN, returns the last observed lsn.""" shards = tenant_get_shards(env, tenant, pageserver_id) - last_flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + if last_flush_lsn is None: + last_flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) results = [] for tenant_shard_id, pageserver in shards: diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py index d969971a35..4df2b2df2b 100644 --- a/test_runner/fixtures/remote_storage.py +++ b/test_runner/fixtures/remote_storage.py @@ -282,18 +282,35 @@ class S3Storage: def timeline_path(self, tenant_id: TenantShardId | TenantId, timeline_id: TimelineId) -> str: return f"{self.tenant_path(tenant_id)}/timelines/{timeline_id}" + def get_latest_generation_key(self, prefix: str, suffix: str, keys: list[str]) -> str: + """ + Gets the latest generation key from a list of keys. 
+ + @param keys: A list of keys of different generations, which start with `prefix` and end with `suffix` + """ + + def parse_gen(key: str) -> int: + shortname = key.split("/")[-1] + generation_str = shortname.removeprefix(prefix).removesuffix(suffix) + try: + return int(generation_str, base=16) + except ValueError: + log.info(f"Ignoring non-matching key: {key}") + return -1 + + if len(keys) == 0: + raise IndexError("No keys found") + + return max(keys, key=parse_gen) + def get_latest_index_key(self, index_keys: list[str]) -> str: """ Gets the latest index file key. @param index_keys: A list of index keys of different generations. """ - - def parse_gen(index_key: str) -> int: - parts = index_key.split("index_part.json-") - return int(parts[-1], base=16) if len(parts) == 2 else -1 - - return max(index_keys, key=parse_gen) + key = self.get_latest_generation_key(prefix="index_part.json-", suffix="", keys=index_keys) + return key def download_index_part(self, index_key: str) -> IndexPartDump: """ @@ -306,6 +323,29 @@ class S3Storage: log.info(f"index_part.json: {body}") return IndexPartDump.from_json(json.loads(body)) + def download_tenant_manifest(self, tenant_id: TenantId) -> dict[str, Any] | None: + tenant_prefix = self.tenant_path(tenant_id) + + objects = self.client.list_objects_v2(Bucket=self.bucket_name, Prefix=f"{tenant_prefix}/")[ "Contents" ] + keys = [obj["Key"] for obj in objects if obj["Key"].find("tenant-manifest") != -1] + try: + manifest_key = self.get_latest_generation_key("tenant-manifest-", ".json", keys) + except IndexError: + log.info( f"No manifest found for tenant {tenant_id}, this is normal if it didn't offload anything yet" ) + return None + + response = self.client.get_object(Bucket=self.bucket_name, Key=manifest_key) + body = response["Body"].read().decode("utf-8") + log.info(f"Downloaded manifest {manifest_key}: {body}") + + manifest = json.loads(body) + assert isinstance(manifest, dict) + return manifest + def heatmap_key(self, tenant_id: TenantId) -> str: return f"{self.tenant_path(tenant_id)}/{TENANT_HEATMAP_FILE_NAME}" diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index e160c617cd..2a59eab710 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -52,11 +52,11 @@ COMPONENT_BINARIES = { # Disable auto-formatting for better readability # fmt: off VERSIONS_COMBINATIONS = ( - {"storage_controller": "new", "storage_broker": "new", "compute": "new", "safekeeper": "new", "pageserver": "new"}, - {"storage_controller": "new", "storage_broker": "new", "compute": "old", "safekeeper": "old", "pageserver": "old"}, - {"storage_controller": "new", "storage_broker": "new", "compute": "old", "safekeeper": "old", "pageserver": "new"}, - {"storage_controller": "new", "storage_broker": "new", "compute": "old", "safekeeper": "new", "pageserver": "new"}, - {"storage_controller": "old", "storage_broker": "old", "compute": "new", "safekeeper": "new", "pageserver": "new"}, + {"storage_controller": "new", "storage_broker": "new", "compute": "new", "safekeeper": "new", "pageserver": "new"}, # combination: nnnnn + {"storage_controller": "new", "storage_broker": "new", "compute": "old", "safekeeper": "old", "pageserver": "old"}, # combination: ooonn + {"storage_controller": "new", "storage_broker": "new", "compute": "old", "safekeeper": "old", "pageserver": "new"}, # combination: ononn + {"storage_controller": "new", "storage_broker": "new", "compute": "old", "safekeeper": "new", "pageserver": "new"}, # combination: onnnn +
{"storage_controller": "old", "storage_broker": "old", "compute": "new", "safekeeper": "new", "pageserver": "new"}, # combination: nnnoo ) # fmt: on @@ -64,6 +64,8 @@ VERSIONS_COMBINATIONS = ( # If it is not set or set to a value not equal to "false", LFC is enabled by default. USE_LFC = os.environ.get("USE_LFC") != "false" +WITH_SANITIZERS = os.environ.get("SANITIZERS") == "enabled" + def subprocess_capture( capture_dir: Path, diff --git a/test_runner/fixtures/workload.py b/test_runner/fixtures/workload.py index eea0ec2b95..1947a9c3fb 100644 --- a/test_runner/fixtures/workload.py +++ b/test_runner/fixtures/workload.py @@ -53,6 +53,8 @@ class Workload: self._endpoint: Endpoint | None = None self._endpoint_opts = endpoint_opts or {} + self._configured_pageserver: int | None = None + def branch( self, timeline_id: TimelineId, @@ -92,8 +94,12 @@ class Workload: **self._endpoint_opts, ) self._endpoint.start(pageserver_id=pageserver_id) + self._configured_pageserver = pageserver_id else: - self._endpoint.reconfigure(pageserver_id=pageserver_id) + if self._configured_pageserver != pageserver_id: + self._configured_pageserver = pageserver_id + self._endpoint.reconfigure(pageserver_id=pageserver_id) + self._endpoint_config = pageserver_id connstring = self._endpoint.safe_psql( "SELECT setting FROM pg_settings WHERE name='neon.pageserver_connstring'" @@ -122,6 +128,7 @@ class Workload: def write_rows(self, n: int, pageserver_id: int | None = None, upload: bool = True): endpoint = self.endpoint(pageserver_id) + start = self.expect_rows end = start + n - 1 self.expect_rows += n diff --git a/test_runner/performance/test_ingest_logical_message.py b/test_runner/performance/test_ingest_logical_message.py index d3118eb15a..b55cb68b64 100644 --- a/test_runner/performance/test_ingest_logical_message.py +++ b/test_runner/performance/test_ingest_logical_message.py @@ -76,6 +76,9 @@ def test_ingest_logical_message( log.info("Waiting for Pageserver to catch up") wait_for_last_record_lsn(client, env.initial_tenant, env.initial_timeline, end_lsn) + recover_to_lsn = Lsn(endpoint.safe_psql("select pg_current_wal_lsn()")[0][0]) + endpoint.stop() + # Now that all data is ingested, delete and recreate the tenant in the pageserver. This will # reingest all the WAL from the safekeeper without any other constraints. This gives us a # baseline of how fast the pageserver can ingest this WAL in isolation. @@ -88,7 +91,13 @@ def test_ingest_logical_message( with zenbenchmark.record_duration("pageserver_recover_ingest"): log.info("Recovering WAL into pageserver") client.timeline_create(env.pg_version, env.initial_tenant, env.initial_timeline) - wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline) + wait_for_last_flush_lsn( + env, endpoint, env.initial_tenant, env.initial_timeline, last_flush_lsn=recover_to_lsn + ) + + # Check endpoint can start, i.e. we really recovered + endpoint.start() + wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline) # Emit metrics. 
wal_written_mb = round((end_lsn - start_lsn) / (1024 * 1024)) diff --git a/test_runner/performance/test_lazy_startup.py b/test_runner/performance/test_lazy_startup.py index 704073fe3b..3bf3ef890f 100644 --- a/test_runner/performance/test_lazy_startup.py +++ b/test_runner/performance/test_lazy_startup.py @@ -79,7 +79,9 @@ def test_lazy_startup(slru: str, neon_env_builder: NeonEnvBuilder, zenbenchmark: assert sum == 1000000 # Get metrics - metrics = requests.get(f"http://localhost:{endpoint.http_port}/metrics.json").json() + metrics = requests.get( + f"http://localhost:{endpoint.external_http_port}/metrics.json" + ).json() durations = { "wait_for_spec_ms": f"{slru}_{i}_wait_for_spec", "sync_safekeepers_ms": f"{slru}_{i}_sync_safekeepers", diff --git a/test_runner/performance/test_startup.py b/test_runner/performance/test_startup.py index d051717e92..60d8b5be30 100644 --- a/test_runner/performance/test_startup.py +++ b/test_runner/performance/test_startup.py @@ -56,7 +56,9 @@ def test_startup_simple(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenc endpoint.safe_psql("select 1;") # Get metrics - metrics = requests.get(f"http://localhost:{endpoint.http_port}/metrics.json").json() + metrics = requests.get( + f"http://localhost:{endpoint.external_http_port}/metrics.json" + ).json() durations = { "wait_for_spec_ms": f"{i}_wait_for_spec", "sync_safekeepers_ms": f"{i}_sync_safekeepers", diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index a4b9eabf8e..07600dd911 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -140,9 +140,11 @@ def test_fully_custom_config(positive_env: NeonEnv): "compaction_period": "1h", "compaction_threshold": 13, "compaction_upper_limit": 100, + "compaction_l0_first": False, + "compaction_l0_semaphore": False, "l0_flush_delay_threshold": 25, "l0_flush_stall_threshold": 42, - "l0_flush_wait_upload": True, + "l0_flush_wait_upload": False, "compaction_target_size": 1048576, "checkpoint_distance": 10000, "checkpoint_timeout": "13m", @@ -175,7 +177,7 @@ def test_fully_custom_config(positive_env: NeonEnv): "image_layer_creation_check_threshold": 1, "lsn_lease_length": "1m", "lsn_lease_length_for_ts": "5s", - "timeline_offloading": True, + "timeline_offloading": False, "wal_receiver_protocol_override": { "type": "interpreted", "args": {"format": "bincode", "compression": {"zstd": {"level": 1}}}, diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index f3347b594e..c091cd0869 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -236,9 +236,7 @@ def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder, with_b wait_until(compaction_finished, timeout=60) # ensure gc_compaction is scheduled and it's actually running (instead of skipping due to no layers picked) - env.pageserver.assert_log_contains( - "scheduled_compact_timeline.*picked .* layers for compaction" - ) + env.pageserver.assert_log_contains("gc_compact_timeline.*picked .* layers for compaction") log.info("Validating at workload end ...") workload.validate(env.pageserver.id) @@ -300,6 +298,8 @@ def test_pageserver_gc_compaction_idempotent( workload.churn_rows(row_count, env.pageserver.id) env.create_branch("child_branch") # so that we have a retain_lsn workload.churn_rows(row_count, env.pageserver.id) + env.create_branch("child_branch_2") # so that we have another retain_lsn + 
workload.churn_rows(row_count, env.pageserver.id) # compact 3 times if mode is before_restart n_compactions = 3 if compaction_mode == "before_restart" else 1 ps_http.timeline_compact( @@ -315,10 +315,6 @@ body={ "scheduled": True, "sub_compaction": True, - "compact_key_range": { - "start": "000000000000000000000000000000000000", - "end": "030000000000000000000000000000000000", - }, "sub_compaction_max_job_size_mb": 16, }, ) @@ -336,19 +332,13 @@ body={ "scheduled": True, "sub_compaction": True, - "compact_key_range": { - "start": "000000000000000000000000000000000000", - "end": "030000000000000000000000000000000000", - }, "sub_compaction_max_job_size_mb": 16, }, ) wait_until(compaction_finished, timeout=60) # ensure gc_compaction is scheduled and it's actually running (instead of skipping due to no layers picked) - env.pageserver.assert_log_contains( - "scheduled_compact_timeline.*picked .* layers for compaction" - ) + env.pageserver.assert_log_contains("gc_compact_timeline.*picked .* layers for compaction") # ensure we hit the duplicated layer key warning at least once: we did two compactions consecutively, # and the second one should have hit the duplicated layer key warning. @@ -466,9 +456,7 @@ def test_pageserver_gc_compaction_interrupt(neon_env_builder: NeonEnvBuilder): wait_until(compaction_finished, timeout=60) # ensure gc_compaction is scheduled and it's actually running (instead of skipping due to no layers picked) - env.pageserver.assert_log_contains( - "scheduled_compact_timeline.*picked .* layers for compaction" - ) + env.pageserver.assert_log_contains("gc_compact_timeline.*picked .* layers for compaction") log.info("Validating at workload end ...") workload.validate(env.pageserver.id) @@ -689,9 +677,7 @@ def test_pageserver_compaction_circuit_breaker(neon_env_builder: NeonEnvBuilder) env.pageserver.http_client().configure_failpoints((FAILPOINT, "return")) # Write some data to trigger compaction - workload.write_rows(1024, upload=False) - workload.write_rows(1024, upload=False) - workload.write_rows(1024, upload=False) + workload.write_rows(32768, upload=False) def assert_broken(): env.pageserver.assert_log_contains(BROKEN_LOG) diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index ba3078d493..823f2185e4 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -474,6 +474,14 @@ HISTORIC_DATA_SETS = [ PgVersion.V16, "https://neon-github-public-dev.s3.eu-central-1.amazonaws.com/compatibility-data-snapshots/2024-07-18-pgv16.tar.zst", ), + # This dataset was created on a pageserver running modern code at time of capture, but configured with no generation.
This + # is our regression test that we can load data written without generations in layer file names & indices + HistoricDataSet( + "2025-02-07-nogenerations", + TenantId("e1411ca6562d6ff62419f693a5695d67"), + PgVersion.V17, + "https://neon-github-public-dev.s3.eu-central-1.amazonaws.com/compatibility-data-snapshots/2025-02-07-pgv17-nogenerations.tar.zst", + ), ] diff --git a/test_runner/regress/test_compute_catalog.py b/test_runner/regress/test_compute_catalog.py index 50a922a616..3a08671bbf 100644 --- a/test_runner/regress/test_compute_catalog.py +++ b/test_runner/regress/test_compute_catalog.py @@ -82,7 +82,7 @@ def test_compute_catalog(neon_simple_env: NeonEnv): ddl = client.database_schema(database=test_db["name"]) # Check that it looks like a valid PostgreSQL dump - assert "-- PostgreSQL database dump" in ddl + assert "-- PostgreSQL database dump complete" in ddl # Check that it doesn't contain health_check and migration traces. # They are only created in system `postgres` database, so by checking diff --git a/test_runner/regress/test_compute_reconfigure.py b/test_runner/regress/test_compute_reconfigure.py new file mode 100644 index 0000000000..6619548811 --- /dev/null +++ b/test_runner/regress/test_compute_reconfigure.py @@ -0,0 +1,62 @@ +from __future__ import annotations + +from fixtures.neon_fixtures import NeonEnv +from fixtures.utils import wait_until + + +def test_compute_reconfigure(neon_simple_env: NeonEnv): + """ + Test that we can change postgresql.conf settings even if + skip_pg_catalog_updates=True is set. + """ + env = neon_simple_env + + TEST_LOG_LINE_PREFIX = "%m [%p] [test_compute_reconfigure]: " + + endpoint = env.endpoints.create_start("main") + + # Check that the log line prefix is not set + # or different from TEST_LOG_LINE_PREFIX + with endpoint.cursor() as cursor: + cursor.execute("SHOW log_line_prefix;") + row = cursor.fetchone() + assert row is not None + assert row[0] != TEST_LOG_LINE_PREFIX + + endpoint.respec_deep( + **{ + "skip_pg_catalog_updates": True, + "cluster": { + "settings": [ + { + "name": "log_line_prefix", + "vartype": "string", + "value": TEST_LOG_LINE_PREFIX, + } + ] + }, + } + ) + endpoint.reconfigure() + + # Check that in logs we see that it was actually reconfigured, + # not restarted or something else. + endpoint.log_contains("INFO request{method=POST uri=/configure") + + # In /configure we only send SIGHUP at the end, so in theory + # it doesn't necessarily mean that Postgres already reloaded + # the new config; and it may race in some envs. + # So we wait until we see the log line that the config was changed. 
+ def check_logs(): + endpoint.log_contains( + f'[test_compute_reconfigure]: LOG: parameter "log_line_prefix" changed to "{TEST_LOG_LINE_PREFIX}"' + ) + + wait_until(check_logs) + + # Check that the log line prefix is set + with endpoint.cursor() as cursor: + cursor.execute("SHOW log_line_prefix;") + row = cursor.fetchone() + assert row is not None + assert row[0] == TEST_LOG_LINE_PREFIX diff --git a/test_runner/regress/test_endpoint_crash.py b/test_runner/regress/test_endpoint_crash.py index 0217cd0d03..03bfd1cb8d 100644 --- a/test_runner/regress/test_endpoint_crash.py +++ b/test_runner/regress/test_endpoint_crash.py @@ -2,6 +2,8 @@ from __future__ import annotations import pytest from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.pg_version import PgVersion +from fixtures.utils import WITH_SANITIZERS, run_only_on_postgres @pytest.mark.parametrize( @@ -23,3 +25,20 @@ def test_endpoint_crash(neon_env_builder: NeonEnvBuilder, sql_func: str): endpoint.safe_psql("CREATE EXTENSION neon_test_utils;") with pytest.raises(Exception, match="This probably means the server terminated abnormally"): endpoint.safe_psql(f"SELECT {sql_func}();") + + +@run_only_on_postgres([PgVersion.V17], "Currently, build with sanitizers is possible with v17 only") +def test_sanitizers(neon_env_builder: NeonEnvBuilder): + """ + Test that undefined behavior leads to endpoint abort with sanitizers enabled + """ + env = neon_env_builder.init_start() + env.create_branch("test_ubsan") + endpoint = env.endpoints.create_start("test_ubsan") + + # Test case based on https://www.postgresql.org/message-id/17167-028026e4ca333817@postgresql.org + if not WITH_SANITIZERS: + endpoint.safe_psql("SELECT 1::int4 << 128") + else: + with pytest.raises(Exception, match="This probably means the server terminated abnormally"): + endpoint.safe_psql("SELECT 1::int4 << 128") diff --git a/test_runner/regress/test_import_pgdata.py b/test_runner/regress/test_import_pgdata.py index 6b35f3c6bb..71e0d16edd 100644 --- a/test_runner/regress/test_import_pgdata.py +++ b/test_runner/regress/test_import_pgdata.py @@ -1,7 +1,9 @@ +import base64 import json import re import time from enum import Enum +from pathlib import Path import psycopg2 import psycopg2.errors @@ -14,8 +16,12 @@ from fixtures.pageserver.http import ( ImportPgdataIdemptencyKey, PageserverApiException, ) +from fixtures.pg_version import PgVersion from fixtures.port_distributor import PortDistributor -from fixtures.remote_storage import RemoteStorageKind +from fixtures.remote_storage import MockS3Server, RemoteStorageKind +from mypy_boto3_kms import KMSClient +from mypy_boto3_kms.type_defs import EncryptResponseTypeDef +from mypy_boto3_s3 import S3Client from pytest_httpserver import HTTPServer from werkzeug.wrappers.request import Request from werkzeug.wrappers.response import Response @@ -103,13 +109,15 @@ def test_pgdata_import_smoke( while True: relblock_size = vanilla_pg.safe_psql_scalar("select pg_relation_size('t')") log.info( - f"relblock size: {relblock_size/8192} pages (target: {target_relblock_size//8192}) pages" + f"relblock size: {relblock_size / 8192} pages (target: {target_relblock_size // 8192} pages)" ) if relblock_size >= target_relblock_size: break addrows = int((target_relblock_size - relblock_size) // 8192) assert addrows >= 1, "forward progress" - vanilla_pg.safe_psql(f"insert into t select generate_series({nrows+1}, {nrows + addrows})") + vanilla_pg.safe_psql( + f"insert into t select generate_series({nrows + 1}, {nrows + addrows})" + ) nrows += addrows
expect_nrows = nrows expect_sum = ( @@ -231,14 +239,14 @@ def test_pgdata_import_smoke( shard_zero_http = shard_zero_ps.http_client() shard_zero_timeline_info = shard_zero_http.timeline_detail(shard_zero["shard_id"], timeline_id) initdb_lsn = Lsn(shard_zero_timeline_info["initdb_lsn"]) - latest_gc_cutoff_lsn = Lsn(shard_zero_timeline_info["latest_gc_cutoff_lsn"]) + min_readable_lsn = Lsn(shard_zero_timeline_info["min_readable_lsn"]) last_record_lsn = Lsn(shard_zero_timeline_info["last_record_lsn"]) disk_consistent_lsn = Lsn(shard_zero_timeline_info["disk_consistent_lsn"]) _remote_consistent_lsn = Lsn(shard_zero_timeline_info["remote_consistent_lsn"]) remote_consistent_lsn_visible = Lsn(shard_zero_timeline_info["remote_consistent_lsn_visible"]) # assert remote_consistent_lsn_visible == remote_consistent_lsn TODO: this fails initially and after restart, presumably because `UploadQueue::clean.1` is still `None` assert remote_consistent_lsn_visible == disk_consistent_lsn - assert initdb_lsn == latest_gc_cutoff_lsn + assert initdb_lsn == min_readable_lsn assert disk_consistent_lsn == initdb_lsn + 8 assert last_record_lsn == disk_consistent_lsn # TODO: assert these values are the same everywhere @@ -332,6 +340,224 @@ def test_pgdata_import_smoke( br_initdb_endpoint.safe_psql("select * from othertable") +def test_fast_import_with_pageserver_ingest( + test_output_dir, + vanilla_pg: VanillaPostgres, + port_distributor: PortDistributor, + fast_import: FastImport, + pg_distrib_dir: Path, + pg_version: PgVersion, + mock_s3_server: MockS3Server, + mock_kms: KMSClient, + mock_s3_client: S3Client, + neon_env_builder: NeonEnvBuilder, + make_httpserver: HTTPServer, +): + # Prepare KMS and S3 + key_response = mock_kms.create_key( + Description="Test key", + KeyUsage="ENCRYPT_DECRYPT", + Origin="AWS_KMS", + ) + key_id = key_response["KeyMetadata"]["KeyId"] + + def encrypt(x: str) -> EncryptResponseTypeDef: + return mock_kms.encrypt(KeyId=key_id, Plaintext=x) + + # Start source postgres and ingest data + vanilla_pg.start() + vanilla_pg.safe_psql("CREATE TABLE foo (a int); INSERT INTO foo SELECT generate_series(1, 10);") + + # Setup pageserver and fake cplane for import progress + def handler(request: Request) -> Response: + log.info(f"control plane request: {request.json}") + return Response(json.dumps({}), status=200) + + cplane_mgmt_api_server = make_httpserver + cplane_mgmt_api_server.expect_request(re.compile(".*")).respond_with_handler(handler) + + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3) + env = neon_env_builder.init_start() + + env.pageserver.patch_config_toml_nonrecursive( + { + "import_pgdata_upcall_api": f"http://{cplane_mgmt_api_server.host}:{cplane_mgmt_api_server.port}/path/to/mgmt/api", + # because import_pgdata code uses this endpoint, not the one in common remote storage config + # TODO: maybe use common remote_storage config in pageserver? 
+ "import_pgdata_aws_endpoint_url": env.s3_mock_server.endpoint(), + } + ) + env.pageserver.stop() + env.pageserver.start() + + # Encrypt connstrings and put spec into S3 + source_connstring_encrypted = encrypt(vanilla_pg.connstr()) + spec = { + "encryption_secret": {"KMS": {"key_id": key_id}}, + "source_connstring_ciphertext_base64": base64.b64encode( + source_connstring_encrypted["CiphertextBlob"] + ).decode("utf-8"), + "project_id": "someproject", + "branch_id": "somebranch", + } + + bucket = "test-bucket" + key_prefix = "test-prefix" + mock_s3_client.create_bucket(Bucket=bucket) + mock_s3_client.put_object(Bucket=bucket, Key=f"{key_prefix}/spec.json", Body=json.dumps(spec)) + + # Create timeline with import_pgdata + tenant_id = TenantId.generate() + env.storage_controller.tenant_create(tenant_id) + + timeline_id = TimelineId.generate() + log.info("starting import") + start = time.monotonic() + + idempotency = ImportPgdataIdemptencyKey.random() + log.info(f"idempotency key {idempotency}") + # TODO: teach neon_local CLI about the idempotency & 429 error so we can run inside the loop + # and check for 429 + + import_branch_name = "imported" + env.storage_controller.timeline_create( + tenant_id, + { + "new_timeline_id": str(timeline_id), + "import_pgdata": { + "idempotency_key": str(idempotency), + "location": { + "AwsS3": { + "region": env.s3_mock_server.region(), + "bucket": bucket, + "key": key_prefix, + } + }, + }, + }, + ) + env.neon_cli.mappings_map_branch(import_branch_name, tenant_id, timeline_id) + + # Run fast_import + if fast_import.extra_env is None: + fast_import.extra_env = {} + fast_import.extra_env["AWS_ACCESS_KEY_ID"] = mock_s3_server.access_key() + fast_import.extra_env["AWS_SECRET_ACCESS_KEY"] = mock_s3_server.secret_key() + fast_import.extra_env["AWS_SESSION_TOKEN"] = mock_s3_server.session_token() + fast_import.extra_env["AWS_REGION"] = mock_s3_server.region() + fast_import.extra_env["AWS_ENDPOINT_URL"] = mock_s3_server.endpoint() + fast_import.extra_env["RUST_LOG"] = "aws_config=debug,aws_sdk_kms=debug" + pg_port = port_distributor.get_port() + fast_import.run_pgdata(pg_port=pg_port, s3prefix=f"s3://{bucket}/{key_prefix}") + vanilla_pg.stop() + + def validate_vanilla_equivalence(ep): + res = ep.safe_psql("SELECT count(*), sum(a) FROM foo;", dbname="neondb") + assert res[0] == (10, 55), f"got result: {res}" + + # Sanity check that data in pgdata is expected: + pgbin = PgBin(test_output_dir, fast_import.pg_distrib_dir, fast_import.pg_version) + with VanillaPostgres( + fast_import.workdir / "pgdata", pgbin, pg_port, False + ) as new_pgdata_vanilla_pg: + new_pgdata_vanilla_pg.start() + + # database name and user are hardcoded in fast_import binary, and they are different from normal vanilla postgres + conn = PgProtocol(dsn=f"postgresql://cloud_admin@localhost:{pg_port}/neondb") + validate_vanilla_equivalence(conn) + + # Poll pageserver statuses in s3 + while True: + locations = env.storage_controller.locate(tenant_id) + active_count = 0 + for location in locations: + shard_id = TenantShardId.parse(location["shard_id"]) + ps = env.get_pageserver(location["node_id"]) + try: + detail = ps.http_client().timeline_detail(shard_id, timeline_id) + log.info(f"timeline {tenant_id}/{timeline_id} detail: {detail}") + state = detail["state"] + log.info(f"shard {shard_id} state: {state}") + if state == "Active": + active_count += 1 + except PageserverApiException as e: + if e.status_code == 404: + log.info("not found, import is in progress") + continue + elif e.status_code == 429: + 
log.info("import is in progress") + continue + else: + raise + + if state == "Active": + key = f"{key_prefix}/status/shard-{shard_id.shard_index}" + shard_status_file_contents = ( + mock_s3_client.get_object(Bucket=bucket, Key=key)["Body"].read().decode("utf-8") + ) + shard_status = json.loads(shard_status_file_contents) + assert shard_status["done"] is True + + if active_count == len(locations): + log.info("all shards are active") + break + time.sleep(0.5) + + import_duration = time.monotonic() - start + log.info(f"import complete; duration={import_duration:.2f}s") + + ep = env.endpoints.create_start(branch_name=import_branch_name, tenant_id=tenant_id) + + # check that data is there + validate_vanilla_equivalence(ep) + + # check that we can do basic ops + + ep.safe_psql("create table othertable(values text)", dbname="neondb") + rw_lsn = Lsn(ep.safe_psql_scalar("select pg_current_wal_flush_lsn()")) + ep.stop() + + # ... at the tip + _ = env.create_branch( + new_branch_name="br-tip", + ancestor_branch_name=import_branch_name, + tenant_id=tenant_id, + ancestor_start_lsn=rw_lsn, + ) + br_tip_endpoint = env.endpoints.create_start( + branch_name="br-tip", endpoint_id="br-tip-ro", tenant_id=tenant_id + ) + validate_vanilla_equivalence(br_tip_endpoint) + br_tip_endpoint.safe_psql("select * from othertable", dbname="neondb") + br_tip_endpoint.stop() + + # ... at the initdb lsn + locations = env.storage_controller.locate(tenant_id) + [shard_zero] = [ + loc for loc in locations if TenantShardId.parse(loc["shard_id"]).shard_number == 0 + ] + shard_zero_ps = env.get_pageserver(shard_zero["node_id"]) + shard_zero_timeline_info = shard_zero_ps.http_client().timeline_detail( + shard_zero["shard_id"], timeline_id + ) + initdb_lsn = Lsn(shard_zero_timeline_info["initdb_lsn"]) + _ = env.create_branch( + new_branch_name="br-initdb", + ancestor_branch_name=import_branch_name, + tenant_id=tenant_id, + ancestor_start_lsn=initdb_lsn, + ) + br_initdb_endpoint = env.endpoints.create_start( + branch_name="br-initdb", endpoint_id="br-initdb-ro", tenant_id=tenant_id + ) + validate_vanilla_equivalence(br_initdb_endpoint) + with pytest.raises(psycopg2.errors.UndefinedTable): + br_initdb_endpoint.safe_psql("select * from othertable", dbname="neondb") + br_initdb_endpoint.stop() + + env.pageserver.stop(immediate=True) + + def test_fast_import_binary( test_output_dir, vanilla_pg: VanillaPostgres, @@ -342,7 +568,7 @@ def test_fast_import_binary( vanilla_pg.safe_psql("CREATE TABLE foo (a int); INSERT INTO foo SELECT generate_series(1, 10);") pg_port = port_distributor.get_port() - fast_import.run(pg_port, vanilla_pg.connstr()) + fast_import.run_pgdata(pg_port=pg_port, source_connection_string=vanilla_pg.connstr()) vanilla_pg.stop() pgbin = PgBin(test_output_dir, fast_import.pg_distrib_dir, fast_import.pg_version) @@ -358,6 +584,118 @@ def test_fast_import_binary( assert res[0][0] == 10 +def test_fast_import_restore_to_connstring( + test_output_dir, + vanilla_pg: VanillaPostgres, + port_distributor: PortDistributor, + fast_import: FastImport, + pg_distrib_dir: Path, + pg_version: PgVersion, +): + vanilla_pg.start() + vanilla_pg.safe_psql("CREATE TABLE foo (a int); INSERT INTO foo SELECT generate_series(1, 10);") + + pgdatadir = test_output_dir / "destination-pgdata" + pg_bin = PgBin(test_output_dir, pg_distrib_dir, pg_version) + port = port_distributor.get_port() + with VanillaPostgres(pgdatadir, pg_bin, port) as destination_vanilla_pg: + destination_vanilla_pg.configure(["shared_preload_libraries='neon_rmgr'"]) + 
destination_vanilla_pg.start() + + # create another database & role and try to restore there + destination_vanilla_pg.safe_psql(""" + CREATE ROLE testrole WITH + LOGIN + PASSWORD 'testpassword' + NOSUPERUSER + NOCREATEDB + NOCREATEROLE; + """) + destination_vanilla_pg.safe_psql("CREATE DATABASE testdb OWNER testrole;") + + destination_connstring = destination_vanilla_pg.connstr( + dbname="testdb", user="testrole", password="testpassword" + ) + fast_import.run_dump_restore( + source_connection_string=vanilla_pg.connstr(), + destination_connection_string=destination_connstring, + ) + vanilla_pg.stop() + conn = PgProtocol(dsn=destination_connstring) + res = conn.safe_psql("SELECT count(*) FROM foo;") + log.info(f"Result: {res}") + assert res[0][0] == 10 + + +def test_fast_import_restore_to_connstring_from_s3_spec( + test_output_dir, + vanilla_pg: VanillaPostgres, + port_distributor: PortDistributor, + fast_import: FastImport, + pg_distrib_dir: Path, + pg_version: PgVersion, + mock_s3_server: MockS3Server, + mock_kms: KMSClient, + mock_s3_client: S3Client, +): + # Prepare KMS and S3 + key_response = mock_kms.create_key( + Description="Test key", + KeyUsage="ENCRYPT_DECRYPT", + Origin="AWS_KMS", + ) + key_id = key_response["KeyMetadata"]["KeyId"] + + def encrypt(x: str) -> EncryptResponseTypeDef: + return mock_kms.encrypt(KeyId=key_id, Plaintext=x) + + # Start source postgres and ingest data + vanilla_pg.start() + vanilla_pg.safe_psql("CREATE TABLE foo (a int); INSERT INTO foo SELECT generate_series(1, 10);") + + # Start target postgres + pgdatadir = test_output_dir / "destination-pgdata" + pg_bin = PgBin(test_output_dir, pg_distrib_dir, pg_version) + port = port_distributor.get_port() + with VanillaPostgres(pgdatadir, pg_bin, port) as destination_vanilla_pg: + destination_vanilla_pg.configure(["shared_preload_libraries='neon_rmgr'"]) + destination_vanilla_pg.start() + + # Encrypt connstrings and put spec into S3 + source_connstring_encrypted = encrypt(vanilla_pg.connstr()) + destination_connstring_encrypted = encrypt(destination_vanilla_pg.connstr()) + spec = { + "encryption_secret": {"KMS": {"key_id": key_id}}, + "source_connstring_ciphertext_base64": base64.b64encode( + source_connstring_encrypted["CiphertextBlob"] + ).decode("utf-8"), + "destination_connstring_ciphertext_base64": base64.b64encode( + destination_connstring_encrypted["CiphertextBlob"] + ).decode("utf-8"), + } + + mock_s3_client.create_bucket(Bucket="test-bucket") + mock_s3_client.put_object( + Bucket="test-bucket", Key="test-prefix/spec.json", Body=json.dumps(spec) + ) + + # Run fast_import + if fast_import.extra_env is None: + fast_import.extra_env = {} + fast_import.extra_env["AWS_ACCESS_KEY_ID"] = mock_s3_server.access_key() + fast_import.extra_env["AWS_SECRET_ACCESS_KEY"] = mock_s3_server.secret_key() + fast_import.extra_env["AWS_SESSION_TOKEN"] = mock_s3_server.session_token() + fast_import.extra_env["AWS_REGION"] = mock_s3_server.region() + fast_import.extra_env["AWS_ENDPOINT_URL"] = mock_s3_server.endpoint() + fast_import.extra_env["RUST_LOG"] = "aws_config=debug,aws_sdk_kms=debug" + fast_import.run_dump_restore(s3prefix="s3://test-bucket/test-prefix") + vanilla_pg.stop() + + res = destination_vanilla_pg.safe_psql("SELECT count(*) FROM foo;") + log.info(f"Result: {res}") + assert res[0][0] == 10 + + # TODO: Maybe test with pageserver? # 1. run whole neon env # 2. create timeline with some s3 path??? 
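All three fast_import tests above share the same spec.json contract: connection strings are encrypted with a KMS key, base64-encoded, and stored alongside an `encryption_secret` entry that names the key. Below is a minimal sketch of that round-trip (illustrative only, not part of the patch), assuming a boto3 KMS client and S3 client such as the moto-backed `mock_kms`/`mock_s3_client` fixtures; the helper names are invented here:

import base64
import json


def build_spec(kms, key_id: str, source_connstr: str, destination_connstr: str) -> dict:
    # Encrypt a connection string with KMS and base64-encode the ciphertext,
    # as the tests do before uploading spec.json.
    def encrypt_b64(plaintext: str) -> str:
        blob = kms.encrypt(KeyId=key_id, Plaintext=plaintext)["CiphertextBlob"]
        return base64.b64encode(blob).decode("utf-8")

    return {
        "encryption_secret": {"KMS": {"key_id": key_id}},
        "source_connstring_ciphertext_base64": encrypt_b64(source_connstr),
        "destination_connstring_ciphertext_base64": encrypt_b64(destination_connstr),
    }


def read_source_connstr(s3, kms, bucket: str, key_prefix: str) -> str:
    # Roughly what the consuming side has to do: fetch spec.json and decrypt
    # the ciphertext (symmetric KMS decryption infers the key from the blob).
    body = s3.get_object(Bucket=bucket, Key=f"{key_prefix}/spec.json")["Body"].read()
    spec = json.loads(body)
    blob = base64.b64decode(spec["source_connstring_ciphertext_base64"])
    return kms.decrypt(CiphertextBlob=blob)["Plaintext"].decode("utf-8")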
diff --git a/test_runner/regress/test_lfc_resize.py b/test_runner/regress/test_lfc_resize.py index 8762e6525b..ea7d38a3d9 100644 --- a/test_runner/regress/test_lfc_resize.py +++ b/test_runner/regress/test_lfc_resize.py @@ -72,6 +72,11 @@ def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin): thread.join() + # Fill the LFC: a seqscan should fetch the whole table into the cache. + # This is needed so that the LFC file size can be evaluated correctly below + # (a sparse chunk of the LFC takes less than 1 MB on disk). + cur.execute("select sum(abalance) from pgbench_accounts") + # Before shrinking the cache, check that it really is large now (lfc_file_size, lfc_file_blocks) = get_lfc_size() assert int(lfc_file_blocks) > 128 * 1024 diff --git a/test_runner/regress/test_neon_local_cli.py b/test_runner/regress/test_neon_local_cli.py index 80e26d9432..8d9aab6848 100644 --- a/test_runner/regress/test_neon_local_cli.py +++ b/test_runner/regress/test_neon_local_cli.py @@ -17,11 +17,13 @@ def test_neon_cli_basics(neon_env_builder: NeonEnvBuilder, port_distributor: Por main_branch_name = "main" pg_port = port_distributor.get_port() - http_port = port_distributor.get_port() + external_http_port = port_distributor.get_port() + internal_http_port = port_distributor.get_port() env.neon_cli.endpoint_create( main_branch_name, pg_port, - http_port, + external_http_port, + internal_http_port, endpoint_id="ep-basic-main", tenant_id=env.initial_tenant, pg_version=env.pg_version, @@ -35,11 +37,13 @@ def test_neon_cli_basics(neon_env_builder: NeonEnvBuilder, port_distributor: Por new_branch_name=branch_name, ) pg_port = port_distributor.get_port() - http_port = port_distributor.get_port() + external_http_port = port_distributor.get_port() + internal_http_port = port_distributor.get_port() env.neon_cli.endpoint_create( branch_name, pg_port, - http_port, + external_http_port, + internal_http_port, endpoint_id=f"ep-{branch_name}", tenant_id=env.initial_tenant, pg_version=env.pg_version, @@ -59,23 +63,27 @@ def test_neon_two_primary_endpoints_fail( branch_name = "main" pg_port = port_distributor.get_port() - http_port = port_distributor.get_port() + external_http_port = port_distributor.get_port() + internal_http_port = port_distributor.get_port() env.neon_cli.endpoint_create( branch_name, pg_port, - http_port, + external_http_port, + internal_http_port, endpoint_id="ep1", tenant_id=env.initial_tenant, pg_version=env.pg_version, ) pg_port = port_distributor.get_port() - http_port = port_distributor.get_port() + external_http_port = port_distributor.get_port() + internal_http_port = port_distributor.get_port() # ep1 is not running so create will succeed env.neon_cli.endpoint_create( branch_name, pg_port, - http_port, + external_http_port, + internal_http_port, endpoint_id="ep2", tenant_id=env.initial_tenant, pg_version=env.pg_version, diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 7e5bb45242..fa1cd61206 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -12,7 +12,6 @@ of the pageserver are: from __future__ import annotations import os -import re import time from enum import StrEnum @@ -29,7 +28,6 @@ from fixtures.pageserver.common_types import parse_layer_file_name from fixtures.pageserver.http import PageserverApiException from fixtures.pageserver.utils import ( assert_tenant_state, - list_prefix, wait_for_last_record_lsn, wait_for_upload, ) @@ -124,109 +122,6 @@ def
assert_deletion_queue(ps_http, size_fn) -> None: assert size_fn(v) is True -def test_generations_upgrade(neon_env_builder: NeonEnvBuilder): - """ - Validate behavior when a pageserver is run without generation support enabled, - then started again after activating it: - - Before upgrade, no objects should have generation suffixes - - After upgrade, the bucket should contain a mixture. - - In both cases, postgres I/O should work. - """ - neon_env_builder.enable_pageserver_remote_storage( - RemoteStorageKind.MOCK_S3, - ) - - env = neon_env_builder.init_configs() - env.broker.start() - for sk in env.safekeepers: - sk.start() - env.storage_controller.start() - - # We will start a pageserver with no control_plane_api set, so it won't be able to self-register - env.storage_controller.node_register(env.pageserver) - - def remove_control_plane_api_field(config): - return config.pop("control_plane_api") - - control_plane_api = env.pageserver.edit_config_toml(remove_control_plane_api_field) - env.pageserver.start() - env.storage_controller.node_configure(env.pageserver.id, {"availability": "Active"}) - - env.create_tenant( - tenant_id=env.initial_tenant, conf=TENANT_CONF, timeline_id=env.initial_timeline - ) - - generate_uploads_and_deletions(env, pageserver=env.pageserver) - - def parse_generation_suffix(key): - m = re.match(".+-([0-9a-zA-Z]{8})$", key) - if m is None: - return None - else: - log.info(f"match: {m}") - log.info(f"group: {m.group(1)}") - return int(m.group(1), 16) - - assert neon_env_builder.pageserver_remote_storage is not None - pre_upgrade_keys = list( - [ - o["Key"] - for o in list_prefix(neon_env_builder.pageserver_remote_storage, delimiter="")[ - "Contents" - ] - ] - ) - for key in pre_upgrade_keys: - assert parse_generation_suffix(key) is None - - env.pageserver.stop() - # Starting without the override that disabled control_plane_api - env.pageserver.patch_config_toml_nonrecursive( - { - "control_plane_api": control_plane_api, - } - ) - env.pageserver.start() - - generate_uploads_and_deletions(env, pageserver=env.pageserver, init=False) - - legacy_objects: list[str] = [] - suffixed_objects = [] - post_upgrade_keys = list( - [ - o["Key"] - for o in list_prefix(neon_env_builder.pageserver_remote_storage, delimiter="")[ - "Contents" - ] - ] - ) - for key in post_upgrade_keys: - log.info(f"post-upgrade key: {key}") - if parse_generation_suffix(key) is not None: - suffixed_objects.append(key) - else: - legacy_objects.append(key) - - # Bucket now contains a mixture of suffixed and non-suffixed objects - assert len(suffixed_objects) > 0 - assert len(legacy_objects) > 0 - - # Flush through deletions to get a clean state for scrub: we are implicitly validating - # that our generations-enabled pageserver was able to do deletions of layers - # from earlier which don't have a generation. - env.pageserver.http_client().deletion_queue_flush(execute=True) - - assert get_deletion_queue_unexpected_errors(env.pageserver.http_client()) == 0 - - # Having written a mixture of generation-aware and legacy index_part.json, - # ensure the scrubber handles the situation as expected. 
- healthy, metadata_summary = env.storage_scrubber.scan_metadata() - assert metadata_summary["tenant_count"] == 1 # Scrubber should have seen our timeline - assert metadata_summary["timeline_count"] == 1 - assert metadata_summary["timeline_shard_count"] == 1 - assert healthy - - def test_deferred_deletion(neon_env_builder: NeonEnvBuilder): neon_env_builder.enable_pageserver_remote_storage( RemoteStorageKind.MOCK_S3, ) diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py index 835ccbd5d4..21cb780c06 100644 --- a/test_runner/regress/test_pageserver_restart.py +++ b/test_runner/regress/test_pageserver_restart.py @@ -3,6 +3,7 @@ from __future__ import annotations import random from contextlib import closing +import psycopg2.errors as pgerr import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder @@ -226,3 +227,43 @@ def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder, shard_count: int | N # so instead, do a fast shutdown for this one test. # See https://github.com/neondatabase/neon/issues/8709 env.stop(immediate=True) + + +def test_pageserver_lost_and_transaction_aborted(neon_env_builder: NeonEnvBuilder): + """ + If the pageserver is unavailable during a transaction abort and the target relation is + not present in the cache, the transaction is aborted while already in the ABORT state, which triggers a SIGABRT. + This is _expected_ behaviour. + """ + env = neon_env_builder.init_start() + endpoint = env.endpoints.create_start("main", config_lines=["neon.relsize_hash_size=0"]) + with closing(endpoint.connect()) as conn, conn.cursor() as cur: + cur.execute("CREATE DATABASE test") + with ( + pytest.raises((pgerr.InterfaceError, pgerr.InternalError)), + endpoint.connect(dbname="test") as conn, + conn.cursor() as cur, + ): + cur.execute("create table t(b box)") + env.pageserver.stop() + cur.execute("create index ti on t using gist(b)") + + +def test_pageserver_lost_and_transaction_committed(neon_env_builder: NeonEnvBuilder): + """ + If the pageserver is unavailable during a transaction commit and the target relation is + not present in the cache, the transaction is aborted while already in the COMMIT state, which triggers a SIGABRT.
+ This is _expected_ behaviour. + """ + env = neon_env_builder.init_start() + endpoint = env.endpoints.create_start("main", config_lines=["neon.relsize_hash_size=0"]) + with closing(endpoint.connect()) as conn, conn.cursor() as cur: + cur.execute("CREATE DATABASE test") + with ( + pytest.raises((pgerr.InterfaceError, pgerr.InternalError)), + endpoint.connect(dbname="test") as conn, + conn.cursor() as cur, + ): + cur.execute("create table t(t boolean)") + env.pageserver.stop() + cur.execute("drop table t") diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 590093d23c..aa375604f4 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -10,14 +10,18 @@ from typing import TYPE_CHECKING import pytest from fixtures.common_types import TenantId, TenantShardId, TimelineId from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + NeonPageserver, + StorageControllerMigrationConfig, +) from fixtures.pageserver.common_types import parse_layer_file_name from fixtures.pageserver.utils import ( assert_prefix_empty, wait_for_upload_queue_empty, ) from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind, S3Storage, s3_storage -from fixtures.utils import skip_in_debug_build, wait_until +from fixtures.utils import run_only_on_default_postgres, skip_in_debug_build, wait_until from fixtures.workload import Workload from werkzeug.wrappers.request import Request from werkzeug.wrappers.response import Response @@ -889,3 +893,103 @@ def test_slow_secondary_downloads(neon_env_builder: NeonEnvBuilder, via_controll assert progress_3["heatmap_mtime"] is not None assert progress_3["layers_total"] == progress_3["layers_downloaded"] assert progress_3["bytes_total"] == progress_3["bytes_downloaded"] + + +@skip_in_debug_build("only run with release build") +@run_only_on_default_postgres("PG version is not interesting here") +def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_pageservers = 2 + neon_env_builder.enable_pageserver_remote_storage( + remote_storage_kind=RemoteStorageKind.MOCK_S3, + ) + + env = neon_env_builder.init_configs() + env.start() + + assert isinstance(env.pageserver_remote_storage, S3Storage) # Satisfy linter + + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + env.create_tenant(tenant_id, timeline_id, conf=TENANT_CONF, placement_policy='{"Attached":1}') + + env.storage_controller.reconcile_until_idle() + + attached_to_id = env.storage_controller.locate(tenant_id)[0]["node_id"] + ps_attached = env.get_pageserver(attached_to_id) + ps_secondary = next(p for p in env.pageservers if p != ps_attached) + + # Generate a bunch of small layers (we will apply a slowdown failpoint that works on a per-layer basis) + workload = Workload(env, tenant_id, timeline_id) + workload.init() + workload.write_rows(128, upload=True) + workload.write_rows(128, upload=True) + workload.write_rows(128, upload=True) + workload.write_rows(128, upload=True) + workload.stop() + + # Expect lots of layers + assert len(ps_attached.list_layers(tenant_id, timeline_id)) > 10 + + # Simulate large data by making layer downloads artificially slow + for ps in env.pageservers: + ps.http_client().configure_failpoints([("secondary-layer-download-sleep", "return(1000)")]) + + # Upload a heatmap, so that secondaries have something to download
+ ps_attached.http_client().tenant_heatmap_upload(tenant_id) + heatmap_before_migration = env.pageserver_remote_storage.heatmap_content(tenant_id) + + # This has no chance to succeed: we have lots of layers and each one takes at least 1000ms. + # However, it pulls the heatmap, which will be important later. + http_client = env.storage_controller.pageserver_api() + (status, progress) = http_client.tenant_secondary_download(tenant_id, wait_ms=4000) + assert status == 202 + assert progress["heatmap_mtime"] is not None + assert progress["layers_downloaded"] > 0 + assert progress["bytes_downloaded"] > 0 + assert progress["layers_total"] > progress["layers_downloaded"] + assert progress["bytes_total"] > progress["bytes_downloaded"] + + env.storage_controller.allowed_errors.extend( + [ + ".*Timed out.*downloading layers.*", + ] + ) + + # Use a custom configuration that gives up earlier than usual. + # We can't hydrate everything anyway because of the failpoints. + config = StorageControllerMigrationConfig( + secondary_warmup_timeout="5s", secondary_download_request_timeout="2s" + ) + env.storage_controller.tenant_shard_migrate( + TenantShardId(tenant_id, shard_number=0, shard_count=0), ps_secondary.id, config + ) + + env.storage_controller.reconcile_until_idle() + assert env.storage_controller.locate(tenant_id)[0]["node_id"] == ps_secondary.id + + ps_secondary.http_client().tenant_heatmap_upload(tenant_id) + heatmap_after_migration = env.pageserver_remote_storage.heatmap_content(tenant_id) + + assert len(heatmap_before_migration["timelines"][0]["layers"]) > 0 + + # The new layer map should contain all the layers in the pre-migration one + # and a new in-memory layer + after_migration_heatmap_layers_count = len(heatmap_after_migration["timelines"][0]["layers"]) + assert ( + len(heatmap_before_migration["timelines"][0]["layers"]) + 1 + == after_migration_heatmap_layers_count + ) + + log.info(f"Heatmap size after cold migration is {after_migration_heatmap_layers_count}") + + env.storage_controller.download_heatmap_layers( + TenantShardId(tenant_id, shard_number=0, shard_count=0), timeline_id + ) + + def all_layers_downloaded(): + local_layers_count = len(ps_secondary.list_layers(tenant_id, timeline_id)) + + log.info(f"{local_layers_count=} {after_migration_heatmap_layers_count=}") + assert local_layers_count == after_migration_heatmap_layers_count + + wait_until(all_layers_downloaded) diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index c5ae669dce..411888efbc 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -261,7 +261,6 @@ def test_isolation( pg_bin.run(pg_isolation_regress_command, env=env_vars, cwd=runpath) - # This fails with a mismatch on `pg_multixact/offsets/0000` - # post_checks(env, test_output_dir, DBNAME, endpoint) + post_checks(env, test_output_dir, DBNAME, endpoint) # Run extra Neon-specific pg_regress-based tests.
The tests and their diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py index d8df2efc78..3c7fd0b897 100644 --- a/test_runner/regress/test_proxy.py +++ b/test_runner/regress/test_proxy.py @@ -57,7 +57,7 @@ def test_proxy_select_1(static_proxy: NeonProxy): assert out[0][0] == 1 # with SNI - out = static_proxy.safe_psql("select 42", host="generic-project-name.localtest.me") + out = static_proxy.safe_psql("select 42", host="generic-project-name.local.neon.build") assert out[0][0] == 42 @@ -234,7 +234,7 @@ def test_sql_over_http_serverless_driver(static_proxy: NeonProxy): connstr = f"postgresql://http:http@{static_proxy.domain}:{static_proxy.proxy_port}/postgres" response = requests.post( - f"https://api.localtest.me:{static_proxy.external_http_port}/sql", + f"https://api.local.neon.build:{static_proxy.external_http_port}/sql", data=json.dumps({"query": "select 42 as answer", "params": []}), headers={"Content-Type": "application/sql", "Neon-Connection-String": connstr}, verify=str(static_proxy.test_output_dir / "proxy.crt"), diff --git a/test_runner/regress/test_proxy_allowed_ips.py b/test_runner/regress/test_proxy_allowed_ips.py index 902da1942e..c59da8c6b0 100644 --- a/test_runner/regress/test_proxy_allowed_ips.py +++ b/test_runner/regress/test_proxy_allowed_ips.py @@ -35,7 +35,7 @@ async def test_proxy_psql_allowed_ips(static_proxy: NeonProxy, vanilla_pg: Vanil check_cannot_connect(query="select 1", sslsni=0, options="endpoint=private-project") # with SNI - check_cannot_connect(query="select 1", host="private-project.localtest.me") + check_cannot_connect(query="select 1", host="private-project.local.neon.build") # no SNI, deprecated `options=project` syntax (before we had several endpoint in project) out = static_proxy.safe_psql(query="select 1", sslsni=0, options="project=generic-project") @@ -46,7 +46,7 @@ async def test_proxy_psql_allowed_ips(static_proxy: NeonProxy, vanilla_pg: Vanil assert out[0][0] == 1 # with SNI - out = static_proxy.safe_psql(query="select 1", host="generic-project.localtest.me") + out = static_proxy.safe_psql(query="select 1", host="generic-project.local.neon.build") assert out[0][0] == 1 diff --git a/test_runner/regress/test_readonly_node.py b/test_runner/regress/test_readonly_node.py index c13bea7ee1..fe970a868c 100644 --- a/test_runner/regress/test_readonly_node.py +++ b/test_runner/regress/test_readonly_node.py @@ -287,7 +287,7 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder): offset=offset, ) - # Do some update so we can increment latest_gc_cutoff + # Do some update so we can increment gc_cutoff generate_updates_on_main(env, ep_main, i, end=100) # Wait for the existing lease to expire. 
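The `localtest.me` to `local.neon.build` renames in the proxy tests above (and in the SNI router test further down) only swap the wildcard base domain that resolves to 127.0.0.1; the routing mechanism is unchanged: the project/endpoint name travels in the first DNS label of the TLS SNI hostname. A rough sketch of that lookup, using invented names rather than the proxy's real API:

from typing import Optional


def project_from_sni(server_name: str, base_domain: str = "local.neon.build") -> Optional[str]:
    # "generic-project-name.local.neon.build" -> "generic-project-name"
    suffix = "." + base_domain
    if not server_name.endswith(suffix):
        return None  # no usable SNI; clients fall back to options=endpoint=.../options=project=...
    label = server_name[: -len(suffix)]
    # Expect exactly one label in front of the wildcard base domain.
    return label if label and "." not in label else None


assert project_from_sni("generic-project-name.local.neon.build") == "generic-project-name"
assert project_from_sni("example.com") is None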
diff --git a/test_runner/regress/test_relations.py b/test_runner/regress/test_relations.py new file mode 100644 index 0000000000..3e29c92a96 --- /dev/null +++ b/test_runner/regress/test_relations.py @@ -0,0 +1,68 @@ +from __future__ import annotations + +from fixtures.neon_fixtures import ( + NeonEnvBuilder, +) + + +def test_pageserver_reldir_v2( + neon_env_builder: NeonEnvBuilder, +): + env = neon_env_builder.init_start( + initial_tenant_conf={ + "rel_size_v2_enabled": "false", + } + ) + + endpoint = env.endpoints.create_start("main") + # Create a relation in v1 + endpoint.safe_psql("CREATE TABLE foo1 (id INTEGER PRIMARY KEY, val text)") + endpoint.safe_psql("CREATE TABLE foo2 (id INTEGER PRIMARY KEY, val text)") + + # Switch to v2 + env.pageserver.http_client().update_tenant_config( + env.initial_tenant, + { + "rel_size_v2_enabled": True, + }, + ) + + # Check if both relations are still accessible + endpoint.safe_psql("SELECT * FROM foo1") + endpoint.safe_psql("SELECT * FROM foo2") + + # Restart the endpoint + endpoint.stop() + endpoint.start() + + # Check if both relations are still accessible again after restart + endpoint.safe_psql("SELECT * FROM foo1") + endpoint.safe_psql("SELECT * FROM foo2") + + # Create a relation in v2 + endpoint.safe_psql("CREATE TABLE foo3 (id INTEGER PRIMARY KEY, val text)") + # Delete a relation in v1 + endpoint.safe_psql("DROP TABLE foo1") + + # Check if both relations are still accessible + endpoint.safe_psql("SELECT * FROM foo2") + endpoint.safe_psql("SELECT * FROM foo3") + + # Restart the endpoint + endpoint.stop() + # This will acquire a basebackup, which lists all relations. + endpoint.start() + + # Check if both relations are still accessible + endpoint.safe_psql("DROP TABLE IF EXISTS foo1") + endpoint.safe_psql("SELECT * FROM foo2") + endpoint.safe_psql("SELECT * FROM foo3") + + endpoint.safe_psql("DROP TABLE foo3") + endpoint.stop() + endpoint.start() + + # Check if relations are still accessible + endpoint.safe_psql("DROP TABLE IF EXISTS foo1") + endpoint.safe_psql("SELECT * FROM foo2") + endpoint.safe_psql("DROP TABLE IF EXISTS foo3") diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 86a6b7428b..f58bbcd3c0 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -91,7 +91,7 @@ def test_sharding_smoke( workload.init() sizes_before = get_sizes() - workload.write_rows(256) + workload.write_rows(65536) # Test that we can read data back from a sharded tenant workload.validate() @@ -1368,6 +1368,7 @@ def test_sharding_split_failures( workload = Workload(env, tenant_id, timeline_id) workload.init() workload.write_rows(100) + compute_reconfigure_listener.register_workload(workload) # Put the environment into a failing state (exact meaning depends on `failure`) failure.apply(env) @@ -1546,6 +1547,9 @@ def test_sharding_backpressure(neon_env_builder: NeonEnvBuilder): # Tip: set to 100MB to make the test fail "max_replication_write_lag=1MB", ], + # We need `neon` extension for calling backpressure functions, + # this flag instructs `compute_ctl` to pre-install it. 
+ "update_catalog": True, }, ) workload.init() @@ -1810,3 +1814,14 @@ def test_sharding_gc( shard_gc_cutoff_lsn = Lsn(shard_index["metadata_bytes"]["latest_gc_cutoff_lsn"]) log.info(f"Shard {shard_number} cutoff LSN: {shard_gc_cutoff_lsn}") assert shard_gc_cutoff_lsn == shard_0_gc_cutoff_lsn + + for ps in env.pageservers: + # This is not okay, but it's not a scrubber bug: it's a pageserver issue that is exposed by + # the specific pattern of aggressive checkpointing+image layer generation + GC that this test does. + # TODO: remove when https://github.com/neondatabase/neon/issues/10720 is fixed + ps.allowed_errors.extend( + [ + ".*could not find data for key.*", + ".*could not ingest record.*", + ] + ) diff --git a/test_runner/regress/test_sni_router.py b/test_runner/regress/test_sni_router.py index 2a26fef59a..3487542d6e 100644 --- a/test_runner/regress/test_sni_router.py +++ b/test_runner/regress/test_sni_router.py @@ -116,7 +116,7 @@ def test_pg_sni_router( test_output_dir: Path, ): generate_tls_cert( - "endpoint.namespace.localtest.me", + "endpoint.namespace.local.neon.build", test_output_dir / "router.crt", test_output_dir / "router.key", ) @@ -130,7 +130,7 @@ def test_pg_sni_router( with PgSniRouter( neon_binpath=neon_binpath, port=router_port, - destination="localtest.me", + destination="local.neon.build", tls_cert=test_output_dir / "router.crt", tls_key=test_output_dir / "router.key", test_output_dir=test_output_dir, @@ -141,7 +141,7 @@ def test_pg_sni_router( "select 1", dbname="postgres", sslmode="require", - host=f"endpoint--namespace--{pg_port}.localtest.me", + host=f"endpoint--namespace--{pg_port}.local.neon.build", hostaddr="127.0.0.1", ) assert out[0][0] == 1 diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 11a4d09202..88d30308f7 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -2139,12 +2139,18 @@ def test_tenant_import(neon_env_builder: NeonEnvBuilder, shard_count, remote_sto workload.validate() -def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder): +@pytest.mark.parametrize("num_azs", [1, 2]) +def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder, num_azs: int): """ Graceful reststart of storage controller clusters use the drain and fill hooks in order to migrate attachments away from pageservers before restarting. In practice, Ansible will drive this process. + + Test is parametrized on the number of AZs to exercise the AZ-driven behavior + of reliably moving shards back to their home AZ, and the behavior for AZ-agnostic + tenants where we fill based on a target shard count. 
""" + neon_env_builder.num_azs = num_azs neon_env_builder.num_pageservers = 2 env = neon_env_builder.init_configs() env.start() @@ -2174,8 +2180,15 @@ def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder): min_shard_count = min(shard_counts.values()) max_shard_count = max(shard_counts.values()) - flake_factor = 5 / 100 - assert max_shard_count - min_shard_count <= int(total_shards * flake_factor) + if num_azs == 1: + # AZ-agnostic case: we expect all nodes to have the same number of shards, within some bound + flake_factor = 5 / 100 + assert max_shard_count - min_shard_count <= int(total_shards * flake_factor) + else: + # AZ-driven case: we expect tenants to have been round-robin allocated to AZs, + # and after the restart they should all be back in their home AZ, so difference + # should be at most a single shard's tenants + assert max_shard_count - min_shard_count <= shard_count_per_tenant # Perform a graceful rolling restart for ps in env.pageservers: @@ -3176,15 +3189,17 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder): assert len(target.get_safekeepers()) == 0 + sk_0 = env.safekeepers[0] + body = { "active": True, "id": fake_id, "created_at": "2023-10-25T09:11:25Z", "updated_at": "2024-08-28T11:32:43Z", "region_id": "aws-us-east-2", - "host": "safekeeper-333.us-east-2.aws.neon.build", - "port": 6401, - "http_port": 7676, + "host": "localhost", + "port": sk_0.port.pg, + "http_port": sk_0.port.http, "version": 5957, "availability_zone_id": "us-east-2b", } @@ -3230,6 +3245,13 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder): # Ensure idempotency target.safekeeper_scheduling_policy(inserted["id"], "Decomissioned") + def storcon_heartbeat(): + assert env.storage_controller.log_contains( + "Heartbeat round complete for 1 safekeepers, 0 offline" + ) + + wait_until(storcon_heartbeat) + def eq_safekeeper_records(a: dict[str, Any], b: dict[str, Any]) -> bool: compared = [dict(a), dict(b)] diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py index 0f4e5688a9..d44c176b35 100644 --- a/test_runner/regress/test_storage_scrubber.py +++ b/test_runner/regress/test_storage_scrubber.py @@ -312,6 +312,17 @@ def test_scrubber_physical_gc_ancestors(neon_env_builder: NeonEnvBuilder, shard_ drop_local_state(env, tenant_id) workload.validate() + for ps in env.pageservers: + # This is not okay, but it's not a scrubber bug: it's a pageserver issue that is exposed by + # the specific pattern of aggressive checkpointing+image layer generation + GC that this test does. 
+ # TODO: remove when https://github.com/neondatabase/neon/issues/10720 is fixed + ps.allowed_errors.extend( + [ + ".*could not find data for key.*", + ".*could not ingest record.*", + ] + ) + def test_scrubber_physical_gc_timeline_deletion(neon_env_builder: NeonEnvBuilder): """ diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index b4c968b217..afe444f227 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -481,7 +481,8 @@ def test_pageserver_metrics_many_relations(neon_env_builder: NeonEnvBuilder): counts = timeline_detail["directory_entries_counts"] assert counts log.info(f"directory counts: {counts}") - assert counts[2] > COUNT_AT_LEAST_EXPECTED + # We need to add up reldir v1 + v2 counts + assert counts[2] + counts[7] > COUNT_AT_LEAST_EXPECTED def test_timelines_parallel_endpoints(neon_simple_env: NeonEnv): diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py index 306e971657..2706ddf2f0 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -139,9 +139,9 @@ def test_timeline_archive(neon_env_builder: NeonEnvBuilder, shard_count: int): @pytest.mark.parametrize("manual_offload", [False, True]) def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: bool): - if not manual_offload: - # (automatic) timeline offloading defaults to false for now - neon_env_builder.pageserver_config_override = "timeline_offloading = true" + if manual_offload: + # automatic timeline offloading now defaults to true; disable it to test manual offloading + neon_env_builder.pageserver_config_override = "timeline_offloading = false" env = neon_env_builder.init_start() ps_http = env.pageserver.http_client() @@ -396,8 +396,6 @@ def test_timeline_archival_chaos(neon_env_builder: NeonEnvBuilder): with tenant migrations and timeline deletions. """ - # Offloading is off by default at time of writing: remove this line when it's on by default - neon_env_builder.pageserver_config_override = "timeline_offloading = true" neon_env_builder.storage_controller_config = {"heartbeat_interval": "100msec"} neon_env_builder.enable_pageserver_remote_storage(s3_storage()) @@ -554,8 +552,33 @@ def test_timeline_archival_chaos(neon_env_builder: NeonEnvBuilder): log.info(f"Timeline {state.timeline_id} is still active") shutdown.wait(0.5) elif state.timeline_id in offloaded_ids: - log.info(f"Timeline {state.timeline_id} is now offloaded") - state.offloaded = True + log.info(f"Timeline {state.timeline_id} is now offloaded in memory") + + # Hack: when we see something offloaded in the API, it doesn't guarantee that the offload + # is persistent (it is marked offloaded first, then that is persisted to the tenant manifest). + # So we wait until we see the manifest update before considering it offloaded, that way + # subsequent checks that it doesn't revert to active on a restart will pass reliably.
+ time.sleep(0.1) + assert isinstance(env.pageserver_remote_storage, S3Storage) + manifest = env.pageserver_remote_storage.download_tenant_manifest( + tenant_id + ) + if manifest is None: + log.info( + f"Timeline {state.timeline_id} is not yet offloaded persistently (no manifest)" + ) + elif str(state.timeline_id) in [ + t["timeline_id"] for t in manifest["offloaded_timelines"] + ]: + log.info( + f"Timeline {state.timeline_id} is now offloaded persistently" + ) + state.offloaded = True + else: + log.info( + f"Timeline {state.timeline_id} is not yet offloaded persistently (manifest: {manifest})" + ) + break else: # Timeline is neither offloaded nor active, this is unexpected: the pageserver @@ -969,8 +992,6 @@ def test_timeline_offload_race_unarchive( Ensure that unarchive and timeline offload don't race each other """ # Regression test for issue https://github.com/neondatabase/neon/issues/10220 - # (automatic) timeline offloading defaults to false for now - neon_env_builder.pageserver_config_override = "timeline_offloading = true" failpoint = "before-timeline-auto-offload" diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 2b6a267bdf..c5045fe4a4 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -566,10 +566,14 @@ def test_wal_backup(neon_env_builder: NeonEnvBuilder): assert_prefix_empty(neon_env_builder.safekeepers_remote_storage, prefix) -def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder): +# This test is flaky, probably because PUTs of local fs storage are not atomic. +# Let's keep both remote storage kinds for a while to see if this is the case. +# https://github.com/neondatabase/neon/issues/10761 +@pytest.mark.parametrize("remote_storage_kind", [s3_storage(), RemoteStorageKind.LOCAL_FS]) +def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind): neon_env_builder.num_safekeepers = 3 - neon_env_builder.enable_safekeeper_remote_storage(default_remote_storage()) + neon_env_builder.enable_safekeeper_remote_storage(remote_storage_kind) env = neon_env_builder.init_start() tenant_id = env.initial_tenant @@ -1441,6 +1445,7 @@ def test_peer_recovery(neon_env_builder: NeonEnvBuilder): # roughly fills one segment endpoint.safe_psql("insert into t select generate_series(1,250000), 'payload'") + lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) endpoint.stop() # stop compute @@ -1469,7 +1474,15 @@ def test_peer_recovery(neon_env_builder: NeonEnvBuilder): "flush_lsn to get aligned", ) - cmp_sk_wal([sk1, sk2], tenant_id, timeline_id) + sk1_digest = sk1.http_client().timeline_digest( + tenant_id, timeline_id, sk1.get_timeline_start_lsn(tenant_id, timeline_id), lsn + ) + + sk2_digest = sk2.http_client().timeline_digest( + tenant_id, timeline_id, sk2.get_timeline_start_lsn(tenant_id, timeline_id), lsn + ) + + assert sk1_digest == sk2_digest # stop one of safekeepers which weren't recovering and insert a bit more to check we can commit env.safekeepers[2].stop()
internal_http_port=env.port_distributor.get_port(), # In these tests compute has high probability of terminating on its own # before our stop() due to lost consensus leadership. check_stop_result=False, diff --git a/test_runner/websocket_tunnel.py b/test_runner/websocket_tunnel.py index facdb19140..069852468d 100755 --- a/test_runner/websocket_tunnel.py +++ b/test_runner/websocket_tunnel.py @@ -13,12 +13,12 @@ # postgres -D data -p3000 # # ## Launch proxy with WSS enabled: -# openssl req -new -x509 -days 365 -nodes -text -out server.crt -keyout server.key -subj '/CN=*.neon.localtest.me' +# openssl req -new -x509 -days 365 -nodes -text -out server.crt -keyout server.key -subj '/CN=*.local.neon.build' # ./target/debug/proxy --wss 127.0.0.1:40433 --http 127.0.0.1:28080 --mgmt 127.0.0.1:9099 --proxy 127.0.0.1:4433 --tls-key server.key --tls-cert server.crt --auth-backend postgres # # ## Launch the tunnel: # -# poetry run ./test_runner/websocket_tunnel.py --ws-port 40433 --ws-url "wss://ep-test.neon.localtest.me" +# poetry run ./test_runner/websocket_tunnel.py --ws-port 40433 --ws-url "wss://ep-test.local.neon.build" # # ## Now you can connect with psql: # psql "postgresql://heikki@localhost:40433/postgres" diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index c0aedfd3ca..6254ab9b44 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit c0aedfd3cac447510a2db843b561f0c52901b679 +Subproject commit 6254ab9b4496c3e481bc037ae69d859bbc2bdd7d diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 355a7c69d3..81e2eef061 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 355a7c69d3f907f3612eb406cc7b9c2f55d59b59 +Subproject commit 81e2eef0616c65c2233c75b06f25766ae4c080c4 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 86d9ea96eb..9422247c58 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 86d9ea96ebb9088eac62f57f1f5ace68e70e0d1c +Subproject commit 9422247c582e7c1a08a4855d04af0874f8df2f34 diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index 8dfd5a7030..a8fea8b4be 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit 8dfd5a7030d3e8a98b60265ebe045788892ac7f3 +Subproject commit a8fea8b4be43039f0782347c88a9b9b25f50c9d8 diff --git a/vendor/revisions.json b/vendor/revisions.json index efddaef46a..72d97d7f6a 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,18 +1,18 @@ { "v17": [ - "17.2", - "8dfd5a7030d3e8a98b60265ebe045788892ac7f3" + "17.4", + "a8fea8b4be43039f0782347c88a9b9b25f50c9d8" ], "v16": [ - "16.6", - "86d9ea96ebb9088eac62f57f1f5ace68e70e0d1c" + "16.8", + "9422247c582e7c1a08a4855d04af0874f8df2f34" ], "v15": [ - "15.10", - "355a7c69d3f907f3612eb406cc7b9c2f55d59b59" + "15.12", + "81e2eef0616c65c2233c75b06f25766ae4c080c4" ], "v14": [ - "14.15", - "c0aedfd3cac447510a2db843b561f0c52901b679" + "14.17", + "6254ab9b4496c3e481bc037ae69d859bbc2bdd7d" ] } diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 2c65401154..1b7c376560 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -42,7 +42,7 @@ half = { version = "2", default-features = false, features = ["num-traits"] } hashbrown = { version = "0.14", features = ["raw"] } hex = { version = "0.4", features = ["serde"] } hmac = { version = "0.12", default-features = false, features = ["reset"] } -hyper-582f2526e08bb6a0 = { package = "hyper", version = "0.14", features = ["full"] } +hyper-582f2526e08bb6a0 = { package = 
"hyper", version = "0.14", features = ["client", "http1", "http2", "runtime", "server", "stream"] } hyper-dff4ba8e3ae991db = { package = "hyper", version = "1", features = ["full"] } hyper-util = { version = "0.1", features = ["client-legacy", "http1", "http2", "server", "service"] } indexmap-dff4ba8e3ae991db = { package = "indexmap", version = "1", default-features = false, features = ["std"] } @@ -94,6 +94,7 @@ tracing = { version = "0.1", features = ["log"] } tracing-core = { version = "0.1" } tracing-log = { version = "0.2" } url = { version = "2", features = ["serde"] } +uuid = { version = "1", features = ["serde", "v4", "v7"] } zerocopy = { version = "0.7", features = ["derive", "simd"] } zeroize = { version = "1", features = ["derive", "serde"] } zstd = { version = "0.13" }