diff --git a/.dockerignore b/.dockerignore index 9e2d2e7108..7ead48db7c 100644 --- a/.dockerignore +++ b/.dockerignore @@ -24,3 +24,4 @@ !storage_controller/ !vendor/postgres-*/ !workspace_hack/ +!build_tools/patches diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 9a0261d430..0eddfe5da6 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -121,6 +121,8 @@ runs: export DEFAULT_PG_VERSION=${PG_VERSION#v} export LD_LIBRARY_PATH=${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/lib export BENCHMARK_CONNSTR=${BENCHMARK_CONNSTR:-} + export ASAN_OPTIONS=detect_leaks=0:detect_stack_use_after_return=0:abort_on_error=1:strict_string_checks=1:check_initialization_order=1:strict_init_order=1 + export UBSAN_OPTIONS=abort_on_error=1:print_stacktrace=1 if [ "${BUILD_TYPE}" = "remote" ]; then export REMOTE_ENV=1 diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index 1dec8106b4..86a791497c 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -23,6 +23,11 @@ on: description: 'a json object of postgres versions and lfc states to run regression tests on' required: true type: string + sanitizers: + description: 'enabled or disabled' + required: false + default: 'disabled' + type: string defaults: run: @@ -87,6 +92,7 @@ jobs: - name: Set env variables env: ARCH: ${{ inputs.arch }} + SANITIZERS: ${{ inputs.sanitizers }} run: | CARGO_FEATURES="--features testing" if [[ $BUILD_TYPE == "debug" && $ARCH == 'x64' ]]; then @@ -99,8 +105,14 @@ jobs: cov_prefix="" CARGO_FLAGS="--locked --release" fi + if [[ $SANITIZERS == 'enabled' ]]; then + make_vars="WITH_SANITIZERS=yes" + else + make_vars="" + fi { echo "cov_prefix=${cov_prefix}" + echo "make_vars=${make_vars}" echo "CARGO_FEATURES=${CARGO_FEATURES}" echo "CARGO_FLAGS=${CARGO_FLAGS}" echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo" @@ -136,35 +148,39 @@ jobs: - name: Build postgres v14 if: steps.cache_pg_14.outputs.cache-hit != 'true' - run: mold -run make postgres-v14 -j$(nproc) + run: mold -run make ${make_vars} postgres-v14 -j$(nproc) - name: Build postgres v15 if: steps.cache_pg_15.outputs.cache-hit != 'true' - run: mold -run make postgres-v15 -j$(nproc) + run: mold -run make ${make_vars} postgres-v15 -j$(nproc) - name: Build postgres v16 if: steps.cache_pg_16.outputs.cache-hit != 'true' - run: mold -run make postgres-v16 -j$(nproc) + run: mold -run make ${make_vars} postgres-v16 -j$(nproc) - name: Build postgres v17 if: steps.cache_pg_17.outputs.cache-hit != 'true' - run: mold -run make postgres-v17 -j$(nproc) + run: mold -run make ${make_vars} postgres-v17 -j$(nproc) - name: Build neon extensions - run: mold -run make neon-pg-ext -j$(nproc) + run: mold -run make ${make_vars} neon-pg-ext -j$(nproc) - name: Build walproposer-lib - run: mold -run make walproposer-lib -j$(nproc) + run: mold -run make ${make_vars} walproposer-lib -j$(nproc) - name: Run cargo build + env: + WITH_TESTS: ${{ inputs.sanitizers != 'enabled' && '--tests' || '' }} run: | - ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests + export ASAN_OPTIONS=detect_leaks=0 + ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins ${WITH_TESTS} # Do install *before* running rust tests because they might recompile the # binaries with different features/flags. 
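Note: the ASAN_OPTIONS / UBSAN_OPTIONS values exported in run-python-test-set/action.yml above (and the build-time ASAN_OPTIONS=detect_leaks=0) are plain colon-separated key=value lists. A small illustrative sketch, not part of this change, showing how those exact strings decompose and what the individual flags mean:

    # Purely illustrative helper; it just rebuilds the option strings exported above.
    ASAN_FLAGS = {
        "detect_leaks": "0",                   # silence LeakSanitizer reports
        "detect_stack_use_after_return": "0",  # skip fake-stack (use-after-return) checks
        "abort_on_error": "1",                 # abort() on the first report instead of exiting
        "strict_string_checks": "1",
        "check_initialization_order": "1",
        "strict_init_order": "1",
    }
    UBSAN_FLAGS = {
        "abort_on_error": "1",
        "print_stacktrace": "1",
    }

    def render_options(flags: dict[str, str]) -> str:
        # ASan/UBSan parse their *_OPTIONS env vars as key=value pairs joined by ':'.
        return ":".join(f"{key}={value}" for key, value in flags.items())

    assert render_options(UBSAN_FLAGS) == "abort_on_error=1:print_stacktrace=1"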
- name: Install rust binaries env: ARCH: ${{ inputs.arch }} + SANITIZERS: ${{ inputs.sanitizers }} run: | # Install target binaries mkdir -p /tmp/neon/bin/ @@ -179,7 +195,7 @@ jobs: done # Install test executables and write list of all binaries (for code coverage) - if [[ $BUILD_TYPE == "debug" && $ARCH == 'x64' ]]; then + if [[ $BUILD_TYPE == "debug" && $ARCH == 'x64' && $SANITIZERS != 'enabled' ]]; then # Keep bloated coverage data files away from the rest of the artifact mkdir -p /tmp/coverage/ @@ -212,6 +228,7 @@ jobs: role-duration-seconds: 18000 # 5 hours - name: Run rust tests + if: ${{ inputs.sanitizers != 'enabled' }} env: NEXTEST_RETRIES: 3 run: | @@ -273,6 +290,7 @@ jobs: DATABASE_URL: postgresql://localhost:1235/storage_controller POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install run: | + export ASAN_OPTIONS=detect_leaks=0 /tmp/neon/bin/neon_local init /tmp/neon/bin/neon_local storage_controller start @@ -319,7 +337,7 @@ jobs: - name: Pytest regression tests continue-on-error: ${{ matrix.lfc_state == 'with-lfc' && inputs.build-type == 'debug' }} uses: ./.github/actions/run-python-test-set - timeout-minutes: 60 + timeout-minutes: ${{ inputs.sanitizers != 'enabled' && 60 || 180 }} with: build_type: ${{ inputs.build-type }} test_selection: regress @@ -337,6 +355,7 @@ jobs: PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring PAGESERVER_GET_VECTORED_CONCURRENT_IO: sidecar-task USE_LFC: ${{ matrix.lfc_state == 'with-lfc' && 'true' || 'false' }} + SANITIZERS: ${{ inputs.sanitizers }} # Temporary disable this step until we figure out why it's so flaky # Ref https://github.com/neondatabase/neon/issues/4540 diff --git a/.github/workflows/_push-to-acr.yml b/.github/workflows/_push-to-acr.yml deleted file mode 100644 index c304172ff7..0000000000 --- a/.github/workflows/_push-to-acr.yml +++ /dev/null @@ -1,56 +0,0 @@ -name: Push images to ACR -on: - workflow_call: - inputs: - client_id: - description: Client ID of Azure managed identity or Entra app - required: true - type: string - image_tag: - description: Tag for the container image - required: true - type: string - images: - description: Images to push - required: true - type: string - registry_name: - description: Name of the container registry - required: true - type: string - subscription_id: - description: Azure subscription ID - required: true - type: string - tenant_id: - description: Azure tenant ID - required: true - type: string - -jobs: - push-to-acr: - runs-on: ubuntu-22.04 - permissions: - contents: read # This is required for actions/checkout - id-token: write # This is required for Azure Login to work. 
- - steps: - - name: Azure login - uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # @v2.1.1 - with: - client-id: ${{ inputs.client_id }} - subscription-id: ${{ inputs.subscription_id }} - tenant-id: ${{ inputs.tenant_id }} - - - name: Login to ACR - run: | - az acr login --name=${{ inputs.registry_name }} - - - name: Copy docker images to ACR ${{ inputs.registry_name }} - run: | - images='${{ inputs.images }}' - for image in ${images}; do - docker buildx imagetools create \ - -t ${{ inputs.registry_name }}.azurecr.io/neondatabase/${image}:${{ inputs.image_tag }} \ - neondatabase/${image}:${{ inputs.image_tag }} - done diff --git a/.github/workflows/_push-to-container-registry.yml b/.github/workflows/_push-to-container-registry.yml new file mode 100644 index 0000000000..3c97c8a67a --- /dev/null +++ b/.github/workflows/_push-to-container-registry.yml @@ -0,0 +1,101 @@ +name: Push images to Container Registry +on: + workflow_call: + inputs: + # Example: {"docker.io/neondatabase/neon:13196061314":["369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:13196061314","neoneastus2.azurecr.io/neondatabase/neon:13196061314"]} + image-map: + description: JSON map of images, mapping from a source image to an array of target images that should be pushed. + required: true + type: string + aws-region: + description: AWS region to log in to. Required when pushing to ECR. + required: false + type: string + aws-account-ids: + description: Comma separated AWS account IDs to log in to for pushing to ECR. Required when pushing to ECR. + required: false + type: string + azure-client-id: + description: Client ID of Azure managed identity or Entra app. Required when pushing to ACR. + required: false + type: string + azure-subscription-id: + description: Azure subscription ID. Required when pushing to ACR. + required: false + type: string + azure-tenant-id: + description: Azure tenant ID. Required when pushing to ACR. + required: false + type: string + acr-registry-name: + description: ACR registry name. Required when pushing to ACR. + required: false + type: string + secrets: + docker-hub-username: + description: Docker Hub username. Required when pushing to Docker Hub. + required: false + docker-hub-password: + description: Docker Hub password. Required when pushing to Docker Hub. + required: false + aws-role-to-assume: + description: AWS role to assume. Required when pushing to ECR. 
+ required: false + +permissions: {} + +defaults: + run: + shell: bash -euo pipefail {0} + +jobs: + push-to-container-registry: + runs-on: ubuntu-22.04 + permissions: + id-token: write # Required for aws/azure login + steps: + - uses: actions/checkout@v4 + with: + sparse-checkout: scripts/push_with_image_map.py + sparse-checkout-cone-mode: false + + - name: Print image-map + run: echo '${{ inputs.image-map }}' | jq + + - name: Configure AWS credentials + if: contains(inputs.image-map, 'amazonaws.com/') + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: "${{ inputs.aws-region }}" + role-to-assume: "${{ secrets.aws-role-to-assume }}" + role-duration-seconds: 3600 + + - name: Login to ECR + if: contains(inputs.image-map, 'amazonaws.com/') + uses: aws-actions/amazon-ecr-login@v2 + with: + registries: "${{ inputs.aws-account-ids }}" + + - name: Configure Azure credentials + if: contains(inputs.image-map, 'azurecr.io/') + uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # @v2.1.1 + with: + client-id: ${{ inputs.azure-client-id }} + subscription-id: ${{ inputs.azure-subscription-id }} + tenant-id: ${{ inputs.azure-tenant-id }} + + - name: Login to ACR + if: contains(inputs.image-map, 'azurecr.io/') + run: | + az acr login --name=${{ inputs.acr-registry-name }} + + - name: Log in to Docker Hub + uses: docker/login-action@v3 + with: + username: ${{ secrets.docker-hub-username }} + password: ${{ secrets.docker-hub-password }} + + - name: Copy docker images to target registries + run: python scripts/push_with_image_map.py + env: + IMAGE_MAP: ${{ inputs.image-map }} diff --git a/.github/workflows/approved-for-ci-run.yml b/.github/workflows/approved-for-ci-run.yml index fc2f36c74b..f4e1e2e96c 100644 --- a/.github/workflows/approved-for-ci-run.yml +++ b/.github/workflows/approved-for-ci-run.yml @@ -67,9 +67,9 @@ jobs: - uses: actions/checkout@v4 with: - ref: main + ref: ${{ github.event.pull_request.head.sha }} token: ${{ secrets.CI_ACCESS_TOKEN }} - + - name: Look for existing PR id: get-pr env: @@ -77,7 +77,7 @@ jobs: run: | ALREADY_CREATED="$(gh pr --repo ${GITHUB_REPOSITORY} list --head ${BRANCH} --base main --json number --jq '.[].number')" echo "ALREADY_CREATED=${ALREADY_CREATED}" >> ${GITHUB_OUTPUT} - + - name: Get changed labels id: get-labels if: steps.get-pr.outputs.ALREADY_CREATED != '' @@ -94,10 +94,6 @@ jobs: echo "LABELS_TO_ADD=${LABELS_TO_ADD}" >> ${GITHUB_OUTPUT} echo "LABELS_TO_REMOVE=${LABELS_TO_REMOVE}" >> ${GITHUB_OUTPUT} - - uses: actions/checkout@v4 - with: - ref: ${{ github.event.pull_request.head.sha }} - - run: git checkout -b "${BRANCH}" - run: git push --force origin "${BRANCH}" @@ -105,7 +101,7 @@ jobs: - name: Create a Pull Request for CI run (if required) if: steps.get-pr.outputs.ALREADY_CREATED == '' - env: + env: GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} run: | cat << EOF > body.md @@ -142,7 +138,7 @@ jobs: - run: git push --force origin "${BRANCH}" if: steps.get-pr.outputs.ALREADY_CREATED != '' - + cleanup: # Close PRs and delete branchs if the original PR is closed. 
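Note: the new _push-to-container-registry.yml workflow above delegates the actual copying to scripts/push_with_image_map.py, which is not part of this hunk; its input is the image-map JSON produced by scripts/generate_image_maps.py in build_and_test.yml below. A minimal sketch of what such a script plausibly does, inferred from the IMAGE_MAP example in the workflow comment and from the `docker buildx imagetools create` steps this diff removes (the real script may differ):

    import json
    import os
    import subprocess

    def main() -> None:
        # IMAGE_MAP maps one source image to a list of target images, e.g.
        # {"docker.io/neondatabase/neon:123": ["369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:123"]}
        image_map: dict[str, list[str]] = json.loads(os.environ["IMAGE_MAP"])
        for source, targets in image_map.items():
            for target in targets:
                # Same mechanism as the removed per-registry steps: copy the
                # (multi-arch) manifest server-side instead of pulling layers.
                subprocess.run(
                    ["docker", "buildx", "imagetools", "create", "-t", target, source],
                    check=True,
                )

    if __name__ == "__main__":
        main()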
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 5a4bdecb99..88cb395958 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -497,7 +497,7 @@ jobs: trigger-e2e-tests: if: ${{ !github.event.pull_request.draft || contains( github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft') || github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' }} - needs: [ check-permissions, promote-images-dev, tag ] + needs: [ check-permissions, push-neon-image-dev, push-compute-image-dev, tag ] uses: ./.github/workflows/trigger-e2e-tests.yml secrets: inherit @@ -571,21 +571,6 @@ jobs: neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm-x64 \ neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm-arm64 - - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@v4 - with: - aws-region: eu-central-1 - role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - role-duration-seconds: 3600 - - - name: Login to Amazon Dev ECR - uses: aws-actions/amazon-ecr-login@v2 - - - name: Push multi-arch image to ECR - run: | - docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{ needs.tag.outputs.build-tag }} \ - neondatabase/neon:${{ needs.tag.outputs.build-tag }} - compute-node-image-arch: needs: [ check-permissions, build-build-tools-image, tag ] permissions: @@ -632,16 +617,6 @@ jobs: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@v4 - with: - aws-region: eu-central-1 - role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - role-duration-seconds: 3600 - - - name: Login to Amazon Dev ECR - uses: aws-actions/amazon-ecr-login@v2 - - uses: docker/login-action@v3 with: registry: cache.neon.build @@ -729,21 +704,6 @@ jobs: neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \ neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64 - - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@v4 - with: - aws-region: eu-central-1 - role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - role-duration-seconds: 3600 - - - name: Login to Amazon Dev ECR - uses: aws-actions/amazon-ecr-login@v2 - - - name: Push multi-arch compute-node-${{ matrix.version.pg }} image to ECR - run: | - docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \ - neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} - vm-compute-node-image: needs: [ check-permissions, tag, compute-node-image ] runs-on: [ self-hosted, large ] @@ -876,133 +836,109 @@ jobs: docker compose --profile test-extensions -f ./docker-compose/docker-compose.yml logs || true docker compose --profile test-extensions -f ./docker-compose/docker-compose.yml down - promote-images-dev: - needs: [ check-permissions, tag, vm-compute-node-image, neon-image ] + generate-image-maps: + needs: [ tag ] runs-on: ubuntu-22.04 - - permissions: - id-token: write # aws-actions/configure-aws-credentials - statuses: write - contents: read - - env: - VERSIONS: v14 v15 v16 v17 - + outputs: + neon-dev: ${{ steps.generate.outputs.neon-dev }} + 
neon-prod: ${{ steps.generate.outputs.neon-prod }} + compute-dev: ${{ steps.generate.outputs.compute-dev }} + compute-prod: ${{ steps.generate.outputs.compute-prod }} steps: - - uses: docker/login-action@v3 + - uses: actions/checkout@v4 with: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + sparse-checkout: scripts/generate_image_maps.py + sparse-checkout-cone-mode: false - - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@v4 - with: - aws-region: eu-central-1 - role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - role-duration-seconds: 3600 + - name: Generate Image Maps + id: generate + run: python scripts/generate_image_maps.py + env: + BUILD_TAG: "${{ needs.tag.outputs.build-tag }}" + BRANCH: "${{ github.ref_name }}" + DEV_ACR: "${{ vars.AZURE_DEV_REGISTRY_NAME }}" + PROD_ACR: "${{ vars.AZURE_PROD_REGISTRY_NAME }}" - - name: Login to Amazon Dev ECR - uses: aws-actions/amazon-ecr-login@v2 - - - name: Copy vm-compute-node images to ECR - run: | - for version in ${VERSIONS}; do - docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }} \ - neondatabase/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }} - done - - promote-images-prod: - needs: [ check-permissions, tag, test-images, promote-images-dev ] - runs-on: ubuntu-22.04 - if: github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' - - permissions: - id-token: write # aws-actions/configure-aws-credentials - statuses: write - contents: read - - env: - VERSIONS: v14 v15 v16 v17 - - steps: - - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@v4 - with: - aws-region: eu-central-1 - role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - role-duration-seconds: 3600 - - - name: Login to Amazon Dev ECR - uses: aws-actions/amazon-ecr-login@v2 - - - uses: docker/login-action@v3 - with: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - - - name: Add latest tag to images - if: github.ref_name == 'main' - run: | - for repo in neondatabase 369495373322.dkr.ecr.eu-central-1.amazonaws.com; do - docker buildx imagetools create -t $repo/neon:latest \ - $repo/neon:${{ needs.tag.outputs.build-tag }} - - for version in ${VERSIONS}; do - docker buildx imagetools create -t $repo/compute-node-${version}:latest \ - $repo/compute-node-${version}:${{ needs.tag.outputs.build-tag }} - - docker buildx imagetools create -t $repo/vm-compute-node-${version}:latest \ - $repo/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }} - done - done - docker buildx imagetools create -t neondatabase/neon-test-extensions-v16:latest \ - neondatabase/neon-test-extensions-v16:${{ needs.tag.outputs.build-tag }} - - - name: Configure AWS-prod credentials - if: github.ref_name == 'release'|| github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' - uses: aws-actions/configure-aws-credentials@v4 - with: - aws-region: eu-central-1 - mask-aws-account-id: true - role-to-assume: ${{ secrets.PROD_GHA_OIDC_ROLE }} - - - name: Login to prod ECR - uses: docker/login-action@v3 - if: github.ref_name == 'release'|| github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' - with: - registry: 093970136003.dkr.ecr.eu-central-1.amazonaws.com - - - name: Copy all images to prod ECR - if: github.ref_name == 'release' || 
github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' - run: | - for image in neon {vm-,}compute-node-{v14,v15,v16,v17}; do - docker buildx imagetools create -t 093970136003.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }} \ - 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }} - done - - push-to-acr-dev: - if: github.ref_name == 'main' - needs: [ tag, promote-images-dev ] - uses: ./.github/workflows/_push-to-acr.yml + push-neon-image-dev: + needs: [ generate-image-maps, neon-image ] + uses: ./.github/workflows/_push-to-container-registry.yml with: - client_id: ${{ vars.AZURE_DEV_CLIENT_ID }} - image_tag: ${{ needs.tag.outputs.build-tag }} - images: neon vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17 - registry_name: ${{ vars.AZURE_DEV_REGISTRY_NAME }} - subscription_id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }} - tenant_id: ${{ vars.AZURE_TENANT_ID }} + image-map: '${{ needs.generate-image-maps.outputs.neon-dev }}' + aws-region: eu-central-1 + aws-account-ids: "369495373322" + azure-client-id: ${{ vars.AZURE_DEV_CLIENT_ID }} + azure-subscription-id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }} + azure-tenant-id: ${{ vars.AZURE_TENANT_ID }} + acr-registry-name: ${{ vars.AZURE_DEV_REGISTRY_NAME }} + secrets: + aws-role-to-assume: "${{ vars.DEV_AWS_OIDC_ROLE_ARN }}" + docker-hub-username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - push-to-acr-prod: + push-compute-image-dev: + needs: [ generate-image-maps, vm-compute-node-image ] + uses: ./.github/workflows/_push-to-container-registry.yml + with: + image-map: '${{ needs.generate-image-maps.outputs.compute-dev }}' + aws-region: eu-central-1 + aws-account-ids: "369495373322" + azure-client-id: ${{ vars.AZURE_DEV_CLIENT_ID }} + azure-subscription-id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }} + azure-tenant-id: ${{ vars.AZURE_TENANT_ID }} + acr-registry-name: ${{ vars.AZURE_DEV_REGISTRY_NAME }} + secrets: + aws-role-to-assume: "${{ vars.DEV_AWS_OIDC_ROLE_ARN }}" + docker-hub-username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + + push-neon-image-prod: if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' - needs: [ tag, promote-images-prod ] - uses: ./.github/workflows/_push-to-acr.yml + needs: [ generate-image-maps, neon-image, test-images ] + uses: ./.github/workflows/_push-to-container-registry.yml with: - client_id: ${{ vars.AZURE_PROD_CLIENT_ID }} - image_tag: ${{ needs.tag.outputs.build-tag }} - images: neon vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17 - registry_name: ${{ vars.AZURE_PROD_REGISTRY_NAME }} - subscription_id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }} - tenant_id: ${{ vars.AZURE_TENANT_ID }} + image-map: '${{ needs.generate-image-maps.outputs.neon-prod }}' + aws-region: eu-central-1 + aws-account-ids: "093970136003" + azure-client-id: ${{ vars.AZURE_PROD_CLIENT_ID }} + azure-subscription-id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }} + azure-tenant-id: ${{ vars.AZURE_TENANT_ID }} + acr-registry-name: ${{ vars.AZURE_PROD_REGISTRY_NAME }} + secrets: + aws-role-to-assume: "${{ secrets.PROD_GHA_OIDC_ROLE }}" + docker-hub-username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + 
docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + + push-compute-image-prod: + if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' + needs: [ generate-image-maps, vm-compute-node-image, test-images ] + uses: ./.github/workflows/_push-to-container-registry.yml + with: + image-map: '${{ needs.generate-image-maps.outputs.compute-prod }}' + aws-region: eu-central-1 + aws-account-ids: "093970136003" + azure-client-id: ${{ vars.AZURE_PROD_CLIENT_ID }} + azure-subscription-id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }} + azure-tenant-id: ${{ vars.AZURE_TENANT_ID }} + acr-registry-name: ${{ vars.AZURE_PROD_REGISTRY_NAME }} + secrets: + aws-role-to-assume: "${{ secrets.PROD_GHA_OIDC_ROLE }}" + docker-hub-username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + + # This is a bit of a special case so we're not using a generated image map. + add-latest-tag-to-neon-extensions-test-image: + if: github.ref_name == 'main' + needs: [ tag, compute-node-image ] + uses: ./.github/workflows/_push-to-container-registry.yml + with: + image-map: | + { + "docker.io/neondatabase/neon-test-extensions-v16:${{ needs.tag.outputs.build-tag }}": ["docker.io/neondatabase/neon-test-extensions-v16:latest"], + "docker.io/neondatabase/neon-test-extensions-v17:${{ needs.tag.outputs.build-tag }}": ["docker.io/neondatabase/neon-test-extensions-v17:latest"] + } + secrets: + docker-hub-username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} trigger-custom-extensions-build-and-wait: needs: [ check-permissions, tag ] @@ -1084,7 +1020,7 @@ jobs: exit 1 deploy: - needs: [ check-permissions, promote-images-prod, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait, push-to-acr-dev, push-to-acr-prod ] + needs: [ check-permissions, push-neon-image-prod, push-compute-image-prod, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait ] # `!failure() && !cancelled()` is required because the workflow depends on the job that can be skipped: `push-to-acr-dev` and `push-to-acr-prod` if: (github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute') && !failure() && !cancelled() permissions: @@ -1337,7 +1273,7 @@ jobs: done pin-build-tools-image: - needs: [ build-build-tools-image, promote-images-prod, build-and-test-locally ] + needs: [ build-build-tools-image, push-compute-image-prod, push-neon-image-prod, build-and-test-locally ] if: github.ref_name == 'main' uses: ./.github/workflows/pin-build-tools-image.yml with: @@ -1362,7 +1298,8 @@ jobs: - check-codestyle-rust - check-dependencies-rust - files-changed - - promote-images-dev + - push-compute-image-dev + - push-neon-image-dev - test-images - trigger-custom-extensions-build-and-wait runs-on: ubuntu-22.04 @@ -1379,6 +1316,7 @@ jobs: || needs.check-codestyle-python.result == 'skipped' || needs.check-codestyle-rust.result == 'skipped' || needs.files-changed.result == 'skipped' - || needs.promote-images-dev.result == 'skipped' + || needs.push-compute-image-dev.result == 'skipped' + || needs.push-neon-image-dev.result == 'skipped' || needs.test-images.result == 'skipped' || needs.trigger-custom-extensions-build-and-wait.result == 'skipped' diff --git a/.github/workflows/build_and_test_with_sanitizers.yml b/.github/workflows/build_and_test_with_sanitizers.yml new file mode 100644 index 0000000000..2bc938509f --- 
/dev/null +++ b/.github/workflows/build_and_test_with_sanitizers.yml @@ -0,0 +1,134 @@ +name: Build and Test with Sanitizers + +on: + schedule: + # * is a special character in YAML so you have to quote this string + # ┌───────────── minute (0 - 59) + # │ ┌───────────── hour (0 - 23) + # │ │ ┌───────────── day of the month (1 - 31) + # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) + # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) + - cron: '0 1 * * *' # run once a day, timezone is utc + workflow_dispatch: + +defaults: + run: + shell: bash -euxo pipefail {0} + +concurrency: + # Allow only one workflow per any non-`main` branch. + group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }} + cancel-in-progress: true + +env: + RUST_BACKTRACE: 1 + COPT: '-Werror' + +jobs: + tag: + runs-on: [ self-hosted, small ] + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned + outputs: + build-tag: ${{steps.build-tag.outputs.tag}} + + steps: + # Need `fetch-depth: 0` to count the number of commits in the branch + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Get build tag + run: | + echo run:$GITHUB_RUN_ID + echo ref:$GITHUB_REF_NAME + echo rev:$(git rev-list --count HEAD) + if [[ "$GITHUB_REF_NAME" == "main" ]]; then + echo "tag=$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT + elif [[ "$GITHUB_REF_NAME" == "release" ]]; then + echo "tag=release-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT + elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then + echo "tag=release-proxy-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT + elif [[ "$GITHUB_REF_NAME" == "release-compute" ]]; then + echo "tag=release-compute-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT + else + echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release', 'release-proxy', 'release-compute'" + echo "tag=$GITHUB_RUN_ID" >> $GITHUB_OUTPUT + fi + shell: bash + id: build-tag + + build-build-tools-image: + uses: ./.github/workflows/build-build-tools-image.yml + secrets: inherit + + build-and-test-locally: + needs: [ tag, build-build-tools-image ] + strategy: + fail-fast: false + matrix: + arch: [ x64, arm64 ] + build-type: [ release ] + uses: ./.github/workflows/_build-and-test-locally.yml + with: + arch: ${{ matrix.arch }} + build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm + build-tag: ${{ needs.tag.outputs.build-tag }} + build-type: ${{ matrix.build-type }} + test-cfg: '[{"pg_version":"v17"}]' + sanitizers: enabled + secrets: inherit + + + create-test-report: + needs: [ build-and-test-locally, build-build-tools-image ] + if: ${{ !cancelled() }} + permissions: + id-token: write # aws-actions/configure-aws-credentials + statuses: write + contents: write + pull-requests: write + outputs: + report-url: ${{ steps.create-allure-report.outputs.report-url }} + + runs-on: [ self-hosted, small ] + container: + image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + options: --init + + steps: + - uses: actions/checkout@v4 + + - name: Create Allure report + if: ${{ !cancelled() }} + id: create-allure-report + uses: ./.github/actions/allure-report-generate + with: + store-test-results-into-db: true + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + env: + REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} + + - uses: 
actions/github-script@v7 + if: ${{ !cancelled() }} + with: + # Retry script for 5XX server errors: https://github.com/actions/github-script#retries + retries: 5 + script: | + const report = { + reportUrl: "${{ steps.create-allure-report.outputs.report-url }}", + reportJsonUrl: "${{ steps.create-allure-report.outputs.report-json-url }}", + } + + const coverage = {} + + const script = require("./scripts/comment-test-report.js") + await script({ + github, + context, + fetch, + report, + coverage, + }) diff --git a/.github/workflows/trigger-e2e-tests.yml b/.github/workflows/trigger-e2e-tests.yml index 31696248b0..27ed1e4cff 100644 --- a/.github/workflows/trigger-e2e-tests.yml +++ b/.github/workflows/trigger-e2e-tests.yml @@ -15,7 +15,14 @@ env: E2E_CONCURRENCY_GROUP: ${{ github.repository }}-e2e-tests-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }} jobs: + check-permissions: + if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }} + uses: ./.github/workflows/check-permissions.yml + with: + github-event-name: ${{ github.event_name }} + cancel-previous-e2e-tests: + needs: [ check-permissions ] if: github.event_name == 'pull_request' runs-on: ubuntu-22.04 @@ -29,6 +36,7 @@ jobs: --field concurrency_group="${{ env.E2E_CONCURRENCY_GROUP }}" tag: + needs: [ check-permissions ] runs-on: ubuntu-22.04 outputs: build-tag: ${{ steps.build-tag.outputs.tag }} @@ -68,7 +76,7 @@ jobs: GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} TAG: ${{ needs.tag.outputs.build-tag }} steps: - - name: Wait for `promote-images-dev` job to finish + - name: Wait for `push-{neon,compute}-image-dev` job to finish # It's important to have a timeout here, the script in the step can run infinitely timeout-minutes: 60 run: | @@ -79,20 +87,20 @@ jobs: # For PRs we use the run id as the tag BUILD_AND_TEST_RUN_ID=${TAG} while true; do - conclusion=$(gh run --repo ${GITHUB_REPOSITORY} view ${BUILD_AND_TEST_RUN_ID} --json jobs --jq '.jobs[] | select(.name == "promote-images-dev") | .conclusion') - case "$conclusion" in - success) - break - ;; - failure | cancelled | skipped) - echo "The 'promote-images-dev' job didn't succeed: '${conclusion}'. Exiting..." - exit 1 - ;; - *) - echo "The 'promote-images-dev' hasn't succeed yet. Waiting..." - sleep 60 - ;; - esac + gh run --repo ${GITHUB_REPOSITORY} view ${BUILD_AND_TEST_RUN_ID} --json jobs --jq '[.jobs[] | select((.name | startswith("push-neon-image-dev")) or (.name | startswith("push-compute-image-dev"))) | {"name": .name, "conclusion": .conclusion, "url": .url}]' > jobs.json + if [ $(jq '[.[] | select(.conclusion == "success")]' jobs.json) -eq 2 ]; then + break + fi + jq -c '.[]' jobs.json | while read -r job; do + case $(echo $job | jq .conclusion) in + failure | cancelled | skipped) + echo "The '$(echo $job | jq .name)' job didn't succeed: '$(echo $job | jq .conclusion)'. See log in '$(echo $job | jq .url)' Exiting..." + exit 1 + ;; + esac + done + echo "The 'push-{neon,compute}-image-dev' jobs haven't succeeded yet. Waiting..." 
+ sleep 60 done - name: Set e2e-platforms diff --git a/Cargo.lock b/Cargo.lock index de1b1218ca..407c8170bb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -300,9 +300,9 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" [[package]] name = "aws-config" -version = "1.5.15" +version = "1.5.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc47e70fc35d054c8fcd296d47a61711f043ac80534a10b4f741904f81e73a90" +checksum = "9b49afaa341e8dd8577e1a2200468f98956d6eda50bcf4a53246cc00174ba924" dependencies = [ "aws-credential-types", "aws-runtime", @@ -311,7 +311,7 @@ dependencies = [ "aws-sdk-sts", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-json", + "aws-smithy-json 0.60.7", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", @@ -342,9 +342,9 @@ dependencies = [ [[package]] name = "aws-runtime" -version = "1.5.4" +version = "1.4.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bee7643696e7fdd74c10f9eb42848a87fe469d35eae9c3323f80aa98f350baac" +checksum = "b5ac934720fbb46206292d2c75b57e67acfc56fe7dfd34fb9a02334af08409ea" dependencies = [ "aws-credential-types", "aws-sigv4", @@ -376,7 +376,7 @@ dependencies = [ "aws-runtime", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-json", + "aws-smithy-json 0.61.1", "aws-smithy-query", "aws-smithy-runtime", "aws-smithy-runtime-api", @@ -399,7 +399,7 @@ dependencies = [ "aws-runtime", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-json", + "aws-smithy-json 0.61.1", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", @@ -424,7 +424,7 @@ dependencies = [ "aws-smithy-checksums", "aws-smithy-eventstream", "aws-smithy-http", - "aws-smithy-json", + "aws-smithy-json 0.61.1", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", @@ -447,15 +447,15 @@ dependencies = [ [[package]] name = "aws-sdk-sso" -version = "1.57.0" +version = "1.50.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c54bab121fe1881a74c338c5f723d1592bf3b53167f80268a1274f404e1acc38" +checksum = "05ca43a4ef210894f93096039ef1d6fa4ad3edfabb3be92b80908b9f2e4b4eab" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-json", + "aws-smithy-json 0.61.1", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", @@ -469,15 +469,15 @@ dependencies = [ [[package]] name = "aws-sdk-ssooidc" -version = "1.58.0" +version = "1.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c8234fd024f7ac61c4e44ea008029bde934250f371efe7d4a39708397b1080c" +checksum = "abaf490c2e48eed0bb8e2da2fb08405647bd7f253996e0f93b981958ea0f73b0" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-json", + "aws-smithy-json 0.61.1", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", @@ -491,15 +491,15 @@ dependencies = [ [[package]] name = "aws-sdk-sts" -version = "1.58.0" +version = "1.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba60e1d519d6f23a9df712c04fdeadd7872ac911c84b2f62a8bda92e129b7962" +checksum = "b68fde0d69c8bfdc1060ea7da21df3e39f6014da316783336deff0a9ec28f4bf" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-json", + "aws-smithy-json 0.61.1", "aws-smithy-query", "aws-smithy-runtime", "aws-smithy-runtime-api", @@ -514,9 +514,9 @@ dependencies = [ [[package]] name = "aws-sigv4" -version = 
"1.2.7" +version = "1.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "690118821e46967b3c4501d67d7d52dd75106a9c54cf36cefa1985cedbe94e05" +checksum = "7d3820e0c08d0737872ff3c7c1f21ebbb6693d832312d6152bf18ef50a5471c2" dependencies = [ "aws-credential-types", "aws-smithy-eventstream", @@ -543,9 +543,9 @@ dependencies = [ [[package]] name = "aws-smithy-async" -version = "1.2.4" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa59d1327d8b5053c54bf2eaae63bf629ba9e904434d0835a28ed3c0ed0a614e" +checksum = "62220bc6e97f946ddd51b5f1361f78996e704677afc518a4ff66b7a72ea1378c" dependencies = [ "futures-util", "pin-project-lite", @@ -575,9 +575,9 @@ dependencies = [ [[package]] name = "aws-smithy-eventstream" -version = "0.60.6" +version = "0.60.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b18559a41e0c909b77625adf2b8c50de480a8041e5e4a3f5f7d177db70abc5a" +checksum = "cef7d0a272725f87e51ba2bf89f8c21e4df61b9e49ae1ac367a6d69916ef7c90" dependencies = [ "aws-smithy-types", "bytes", @@ -586,9 +586,9 @@ dependencies = [ [[package]] name = "aws-smithy-http" -version = "0.60.12" +version = "0.60.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7809c27ad8da6a6a68c454e651d4962479e81472aa19ae99e59f9aba1f9713cc" +checksum = "5c8bc3e8fdc6b8d07d976e301c02fe553f72a39b7a9fea820e023268467d7ab6" dependencies = [ "aws-smithy-eventstream", "aws-smithy-runtime-api", @@ -607,9 +607,18 @@ dependencies = [ [[package]] name = "aws-smithy-json" -version = "0.61.2" +version = "0.60.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "623a51127f24c30776c8b374295f2df78d92517386f77ba30773f15a30ce1422" +checksum = "4683df9469ef09468dad3473d129960119a0d3593617542b7d52086c8486f2d6" +dependencies = [ + "aws-smithy-types", +] + +[[package]] +name = "aws-smithy-json" +version = "0.61.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee4e69cc50921eb913c6b662f8d909131bb3e6ad6cb6090d3a39b66fc5c52095" dependencies = [ "aws-smithy-types", ] @@ -626,9 +635,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime" -version = "1.7.7" +version = "1.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "865f7050bbc7107a6c98a397a9fcd9413690c27fa718446967cf03b2d3ac517e" +checksum = "9f20685047ca9d6f17b994a07f629c813f08b5bce65523e47124879e60103d45" dependencies = [ "aws-smithy-async", "aws-smithy-http", @@ -670,9 +679,9 @@ dependencies = [ [[package]] name = "aws-smithy-types" -version = "1.2.12" +version = "1.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a28f6feb647fb5e0d5b50f0472c19a7db9462b74e2fec01bb0b44eedcc834e97" +checksum = "4fbd94a32b3a7d55d3806fe27d98d3ad393050439dd05eb53ece36ec5e3d3510" dependencies = [ "base64-simd", "bytes", @@ -705,9 +714,9 @@ dependencies = [ [[package]] name = "aws-types" -version = "1.3.4" +version = "1.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0df5a18c4f951c645300d365fec53a61418bcf4650f604f85fe2a665bfaa0c2" +checksum = "5221b91b3e441e6675310829fd8984801b772cb1546ef6c0e54dec9f1ac13fef" dependencies = [ "aws-credential-types", "aws-smithy-async", @@ -777,7 +786,7 @@ dependencies = [ [[package]] name = "azure_core" version = "0.21.0" -source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#66e77bdd87bf87e773acf3b0c84b532c1124367d" +source = 
"git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#c36ed4c039bb3d59b5a1705f2cc337636c73b541" dependencies = [ "async-trait", "base64 0.22.1", @@ -806,7 +815,7 @@ dependencies = [ [[package]] name = "azure_identity" version = "0.21.0" -source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#66e77bdd87bf87e773acf3b0c84b532c1124367d" +source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#c36ed4c039bb3d59b5a1705f2cc337636c73b541" dependencies = [ "async-lock", "async-trait", @@ -825,7 +834,7 @@ dependencies = [ [[package]] name = "azure_storage" version = "0.21.0" -source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#66e77bdd87bf87e773acf3b0c84b532c1124367d" +source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#c36ed4c039bb3d59b5a1705f2cc337636c73b541" dependencies = [ "RustyXML", "async-lock", @@ -843,7 +852,7 @@ dependencies = [ [[package]] name = "azure_storage_blobs" version = "0.21.0" -source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#66e77bdd87bf87e773acf3b0c84b532c1124367d" +source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#c36ed4c039bb3d59b5a1705f2cc337636c73b541" dependencies = [ "RustyXML", "azure_core", @@ -863,7 +872,7 @@ dependencies = [ [[package]] name = "azure_svc_blobstorage" version = "0.21.0" -source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#66e77bdd87bf87e773acf3b0c84b532c1124367d" +source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#c36ed4c039bb3d59b5a1705f2cc337636c73b541" dependencies = [ "azure_core", "bytes", @@ -1424,6 +1433,7 @@ dependencies = [ "comfy-table", "compute_api", "futures", + "http-utils", "humantime", "humantime-serde", "hyper 0.14.30", @@ -2748,6 +2758,38 @@ dependencies = [ "url", ] +[[package]] +name = "http-utils" +version = "0.1.0" +dependencies = [ + "anyhow", + "backtrace", + "bytes", + "fail", + "flate2", + "hyper 0.14.30", + "inferno 0.12.0", + "itertools 0.10.5", + "jemalloc_pprof", + "metrics", + "once_cell", + "pprof", + "regex", + "routerify", + "serde", + "serde_json", + "serde_path_to_error", + "thiserror 1.0.69", + "tokio", + "tokio-stream", + "tokio-util", + "tracing", + "url", + "utils", + "uuid", + "workspace_hack", +] + [[package]] name = "httparse" version = "1.8.0" @@ -4102,6 +4144,7 @@ dependencies = [ "futures", "hex", "hex-literal", + "http-utils", "humantime", "humantime-serde", "hyper 0.14.30", @@ -4202,6 +4245,7 @@ dependencies = [ "anyhow", "bytes", "futures", + "http-utils", "pageserver_api", "postgres", "reqwest", @@ -4908,6 +4952,7 @@ dependencies = [ "hostname", "http 1.1.0", "http-body-util", + "http-utils", "humantime", "humantime-serde", "hyper 0.14.30", @@ -5755,6 +5800,7 @@ dependencies = [ "futures", "hex", "http 1.1.0", + "http-utils", "humantime", "hyper 0.14.30", "itertools 0.10.5", @@ -5819,6 +5865,7 @@ dependencies = [ name = "safekeeper_client" version = "0.1.0" dependencies = [ + "http-utils", "reqwest", "safekeeper_api", "serde", @@ -6401,6 +6448,7 @@ dependencies = [ "fail", "futures", "hex", + "http-utils", "humantime", "hyper 0.14.30", "itertools 0.10.5", @@ -7565,48 +7613,38 @@ dependencies = [ "criterion", "diatomic-waker", "fail", - "flate2", "futures", "git-version", "hex", "hex-literal", "humantime", - "hyper 0.14.30", "inferno 0.12.0", - "itertools 0.10.5", - "jemalloc_pprof", "jsonwebtoken", "metrics", "nix 0.27.1", "once_cell", "pin-project-lite", 
"postgres_connection", - "pprof", "pq_proto", "rand 0.8.5", "regex", - "routerify", "scopeguard", "sentry", "serde", "serde_assert", "serde_json", - "serde_path_to_error", "serde_with", "signal-hook", "strum", "strum_macros", "thiserror 1.0.69", "tokio", - "tokio-stream", "tokio-tar", "tokio-util", "toml_edit", "tracing", "tracing-error", "tracing-subscriber", - "url", - "uuid", "walkdir", ] @@ -8201,6 +8239,7 @@ dependencies = [ "tracing-core", "tracing-log", "url", + "uuid", "zerocopy", "zeroize", "zstd", diff --git a/Cargo.toml b/Cargo.toml index 76b54ae1d8..7228623c6b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,6 +18,7 @@ members = [ "storage_scrubber", "workspace_hack", "libs/compute_api", + "libs/http-utils", "libs/pageserver_api", "libs/postgres_ffi", "libs/safekeeper_api", @@ -229,6 +230,7 @@ azure_storage_blobs = { git = "https://github.com/neondatabase/azure-sdk-for-rus ## Local libraries compute_api = { version = "0.1", path = "./libs/compute_api/" } consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" } +http-utils = { version = "0.1", path = "./libs/http-utils/" } metrics = { version = "0.1", path = "./libs/metrics/" } pageserver = { path = "./pageserver" } pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" } diff --git a/Dockerfile b/Dockerfile index 7ba54c8ca5..b399bcf7e4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -10,6 +10,28 @@ ARG STABLE_PG_VERSION=16 ARG DEBIAN_VERSION=bookworm ARG DEBIAN_FLAVOR=${DEBIAN_VERSION}-slim +# Here are the INDEX DIGESTS for the images we use. +# You can get them following next steps for now: +# 1. Get an authentication token from DockerHub: +# TOKEN=$(curl -s "https://auth.docker.io/token?service=registry.docker.io&scope=repository:library/debian:pull" | jq -r .token) +# 2. Using that token, query index for the given tag: +# curl -s -H "Authorization: Bearer $TOKEN" \ +# -H "Accept: application/vnd.docker.distribution.manifest.list.v2+json" \ +# "https://registry.hub.docker.com/v2/library/debian/manifests/bullseye-slim" \ +# -I | grep -i docker-content-digest +# 3. As a next step, TODO(fedordikarev): create script and schedule workflow to run these checks +# and updates on regular bases and in automated way. +ARG BOOKWORM_SLIM_SHA=sha256:40b107342c492725bc7aacbe93a49945445191ae364184a6d24fedb28172f6f7 +ARG BULLSEYE_SLIM_SHA=sha256:e831d9a884d63734fe3dd9c491ed9a5a3d4c6a6d32c5b14f2067357c49b0b7e1 + +# Here we use ${var/search/replace} syntax, to check +# if base image is one of the images, we pin image index for. +# If var will match one the known images, we will replace it with the known sha. +# If no match, than value will be unaffected, and will process with no-pinned image. +ARG BASE_IMAGE_SHA=debian:${DEBIAN_FLAVOR} +ARG BASE_IMAGE_SHA=${BASE_IMAGE_SHA/debian:bookworm-slim/debian@$BOOKWORM_SLIM_SHA} +ARG BASE_IMAGE_SHA=${BASE_IMAGE_SHA/debian:bullseye-slim/debian@$BULLSEYE_SLIM_SHA} + # Build Postgres FROM $REPOSITORY/$IMAGE:$TAG AS pg-build WORKDIR /home/nonroot @@ -59,7 +81,7 @@ RUN set -e \ # Build final image # -FROM debian:${DEBIAN_FLAVOR} +FROM $BASE_IMAGE_SHA ARG DEFAULT_PG_VERSION WORKDIR /data @@ -112,4 +134,3 @@ EXPOSE 6400 EXPOSE 9898 CMD ["/usr/local/bin/pageserver", "-D", "/data/.neon"] - diff --git a/Makefile b/Makefile index d1238caebf..42ee643bb5 100644 --- a/Makefile +++ b/Makefile @@ -10,18 +10,29 @@ ICU_PREFIX_DIR := /usr/local/icu # environment variable. 
# BUILD_TYPE ?= debug +WITH_SANITIZERS ?= no ifeq ($(BUILD_TYPE),release) PG_CONFIGURE_OPTS = --enable-debug --with-openssl PG_CFLAGS = -O2 -g3 $(CFLAGS) + PG_LDFLAGS = $(LDFLAGS) # Unfortunately, `--profile=...` is a nightly feature CARGO_BUILD_FLAGS += --release else ifeq ($(BUILD_TYPE),debug) PG_CONFIGURE_OPTS = --enable-debug --with-openssl --enable-cassert --enable-depend PG_CFLAGS = -O0 -g3 $(CFLAGS) + PG_LDFLAGS = $(LDFLAGS) else $(error Bad build type '$(BUILD_TYPE)', see Makefile for options) endif +ifeq ($(WITH_SANITIZERS),yes) + PG_CFLAGS += -fsanitize=address -fsanitize=undefined -fno-sanitize-recover + COPT += -Wno-error # to avoid failing on warnings induced by sanitizers + PG_LDFLAGS = -fsanitize=address -fsanitize=undefined -static-libasan -static-libubsan $(LDFLAGS) + export CC := gcc + export ASAN_OPTIONS := detect_leaks=0 +endif + ifeq ($(shell test -e /home/nonroot/.docker_build && echo -n yes),yes) # Exclude static build openssl, icu for local build (MacOS, Linux) # Only keep for build type release and debug @@ -33,7 +44,9 @@ endif UNAME_S := $(shell uname -s) ifeq ($(UNAME_S),Linux) # Seccomp BPF is only available for Linux - PG_CONFIGURE_OPTS += --with-libseccomp + ifneq ($(WITH_SANITIZERS),yes) + PG_CONFIGURE_OPTS += --with-libseccomp + endif else ifeq ($(UNAME_S),Darwin) PG_CFLAGS += -DUSE_PREFETCH ifndef DISABLE_HOMEBREW @@ -106,7 +119,7 @@ $(POSTGRES_INSTALL_DIR)/build/%/config.status: EXTRA_VERSION=$$(cd $(ROOT_PROJECT_DIR)/vendor/postgres-$$VERSION && git rev-parse HEAD); \ (cd $(POSTGRES_INSTALL_DIR)/build/$$VERSION && \ env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$$VERSION/configure \ - CFLAGS='$(PG_CFLAGS)' \ + CFLAGS='$(PG_CFLAGS)' LDFLAGS='$(PG_LDFLAGS)' \ $(PG_CONFIGURE_OPTS) --with-extra-version=" ($$EXTRA_VERSION)" \ --prefix=$(abspath $(POSTGRES_INSTALL_DIR))/$$VERSION > configure.log) diff --git a/build-tools.Dockerfile b/build-tools.Dockerfile index 3ade57b175..fa72ca1bc2 100644 --- a/build-tools.Dockerfile +++ b/build-tools.Dockerfile @@ -1,6 +1,29 @@ ARG DEBIAN_VERSION=bookworm +ARG DEBIAN_FLAVOR=${DEBIAN_VERSION}-slim -FROM debian:bookworm-slim AS pgcopydb_builder +# Here are the INDEX DIGESTS for the images we use. +# You can get them following next steps for now: +# 1. Get an authentication token from DockerHub: +# TOKEN=$(curl -s "https://auth.docker.io/token?service=registry.docker.io&scope=repository:library/debian:pull" | jq -r .token) +# 2. Using that token, query index for the given tag: +# curl -s -H "Authorization: Bearer $TOKEN" \ +# -H "Accept: application/vnd.docker.distribution.manifest.list.v2+json" \ +# "https://registry.hub.docker.com/v2/library/debian/manifests/bullseye-slim" \ +# -I | grep -i docker-content-digest +# 3. As a next step, TODO(fedordikarev): create script and schedule workflow to run these checks +# and updates on regular bases and in automated way. +ARG BOOKWORM_SLIM_SHA=sha256:40b107342c492725bc7aacbe93a49945445191ae364184a6d24fedb28172f6f7 +ARG BULLSEYE_SLIM_SHA=sha256:e831d9a884d63734fe3dd9c491ed9a5a3d4c6a6d32c5b14f2067357c49b0b7e1 + +# Here we use ${var/search/replace} syntax, to check +# if base image is one of the images, we pin image index for. +# If var will match one the known images, we will replace it with the known sha. +# If no match, than value will be unaffected, and will process with no-pinned image. 
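Note: the comment block above (repeated across Dockerfile, build-tools.Dockerfile and compute-node.Dockerfile in this diff) documents the manual lookup behind the pinned BOOKWORM_SLIM_SHA / BULLSEYE_SLIM_SHA index digests, and its TODO proposes automating it. A rough sketch of that lookup, assuming the same endpoints and headers as the curl commands in the comment (the eventual script may look different):

    import json
    import urllib.request

    def debian_index_digest(tag: str) -> str:
        # Hypothetical helper mirroring the comment's two curl steps.
        token_url = (
            "https://auth.docker.io/token"
            "?service=registry.docker.io&scope=repository:library/debian:pull"
        )
        with urllib.request.urlopen(token_url) as resp:
            token = json.load(resp)["token"]

        req = urllib.request.Request(
            f"https://registry.hub.docker.com/v2/library/debian/manifests/{tag}",
            method="HEAD",  # like `curl -I`, we only need the response headers
            headers={
                "Authorization": f"Bearer {token}",
                "Accept": "application/vnd.docker.distribution.manifest.list.v2+json",
            },
        )
        with urllib.request.urlopen(req) as resp:
            # e.g. "sha256:40b10734..." -> candidate value for BOOKWORM_SLIM_SHA
            return resp.headers["docker-content-digest"]

    if __name__ == "__main__":
        print(debian_index_digest("bookworm-slim"))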
+ARG BASE_IMAGE_SHA=debian:${DEBIAN_FLAVOR} +ARG BASE_IMAGE_SHA=${BASE_IMAGE_SHA/debian:bookworm-slim/debian@$BOOKWORM_SLIM_SHA} +ARG BASE_IMAGE_SHA=${BASE_IMAGE_SHA/debian:bullseye-slim/debian@$BULLSEYE_SLIM_SHA} + +FROM $BASE_IMAGE_SHA AS pgcopydb_builder ARG DEBIAN_VERSION # Use strict mode for bash to catch errors early @@ -9,9 +32,11 @@ SHELL ["/bin/bash", "-euo", "pipefail", "-c"] # By default, /bin/sh used in debian images will treat '\n' as eol, # but as we use bash as SHELL, and built-in echo in bash requires '-e' flag for that. RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \ - echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc && \ + echo -e "retry_connrefused=on\ntimeout=15\ntries=5\nretry-on-host-error=on\n" > /root/.wgetrc && \ echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc +COPY build_tools/patches/pgcopydbv017.patch /pgcopydbv017.patch + RUN if [ "${DEBIAN_VERSION}" = "bookworm" ]; then \ set -e && \ apt update && \ @@ -44,6 +69,7 @@ RUN if [ "${DEBIAN_VERSION}" = "bookworm" ]; then \ mkdir /tmp/pgcopydb && \ tar -xzf /tmp/pgcopydb.tar.gz -C /tmp/pgcopydb --strip-components=1 && \ cd /tmp/pgcopydb && \ + patch -p1 < /pgcopydbv017.patch && \ make -s clean && \ make -s -j12 install && \ libpq_path=$(find /lib /usr/lib -name "libpq.so.5" | head -n 1) && \ @@ -55,7 +81,7 @@ RUN if [ "${DEBIAN_VERSION}" = "bookworm" ]; then \ mkdir -p mkdir -p /pgcopydb/lib && touch /pgcopydb/lib/libpq.so.5; \ fi -FROM debian:${DEBIAN_VERSION}-slim AS build_tools +FROM $BASE_IMAGE_SHA AS build_tools ARG DEBIAN_VERSION # Add nonroot user @@ -72,7 +98,7 @@ COPY --from=pgcopydb_builder /usr/lib/postgresql/16/bin/pgcopydb /pgcopydb/bin/p COPY --from=pgcopydb_builder /pgcopydb/lib/libpq.so.5 /pgcopydb/lib/libpq.so.5 RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \ - echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc && \ + echo -e "retry_connrefused=on\ntimeout=15\ntries=5\nretry-on-host-error=on\n" > /root/.wgetrc && \ echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc # System deps @@ -135,7 +161,8 @@ RUN curl -fsSL \ --output sql_exporter.tar.gz \ && mkdir /tmp/sql_exporter \ && tar xzvf sql_exporter.tar.gz -C /tmp/sql_exporter --strip-components=1 \ - && mv /tmp/sql_exporter/sql_exporter /usr/local/bin/sql_exporter + && mv /tmp/sql_exporter/sql_exporter /usr/local/bin/sql_exporter \ + && rm sql_exporter.tar.gz # protobuf-compiler (protoc) ENV PROTOC_VERSION=25.1 diff --git a/build_tools/patches/pgcopydbv017.patch b/build_tools/patches/pgcopydbv017.patch new file mode 100644 index 0000000000..4e68793afc --- /dev/null +++ b/build_tools/patches/pgcopydbv017.patch @@ -0,0 +1,57 @@ +diff --git a/src/bin/pgcopydb/copydb.c b/src/bin/pgcopydb/copydb.c +index d730b03..69a9be9 100644 +--- a/src/bin/pgcopydb/copydb.c ++++ b/src/bin/pgcopydb/copydb.c +@@ -44,6 +44,7 @@ GUC dstSettings[] = { + { "synchronous_commit", "'off'" }, + { "statement_timeout", "0" }, + { "lock_timeout", "0" }, ++ { "idle_in_transaction_session_timeout", "0" }, + { NULL, NULL }, + }; + +diff --git a/src/bin/pgcopydb/pgsql.c b/src/bin/pgcopydb/pgsql.c +index 94f2f46..e051ba8 100644 +--- a/src/bin/pgcopydb/pgsql.c ++++ b/src/bin/pgcopydb/pgsql.c +@@ -2319,6 +2319,11 @@ pgsql_execute_log_error(PGSQL *pgsql, + + LinesBuffer lbuf = { 0 }; + ++ if (message != NULL){ ++ // make sure message is writable by splitLines ++ message = strdup(message); ++ } ++ + if 
(!splitLines(&lbuf, message)) + { + /* errors have already been logged */ +@@ -2332,6 +2337,7 @@ pgsql_execute_log_error(PGSQL *pgsql, + PQbackendPID(pgsql->connection), + lbuf.lines[lineNumber]); + } ++ free(message); // free copy of message we created above + + if (pgsql->logSQL) + { +@@ -3174,11 +3180,18 @@ pgcopy_log_error(PGSQL *pgsql, PGresult *res, const char *context) + /* errors have already been logged */ + return; + } +- + if (res != NULL) + { + char *sqlstate = PQresultErrorField(res, PG_DIAG_SQLSTATE); +- strlcpy(pgsql->sqlstate, sqlstate, sizeof(pgsql->sqlstate)); ++ if (sqlstate == NULL) ++ { ++ // PQresultErrorField returned NULL! ++ pgsql->sqlstate[0] = '\0'; // Set to an empty string to avoid segfault ++ } ++ else ++ { ++ strlcpy(pgsql->sqlstate, sqlstate, sizeof(pgsql->sqlstate)); ++ } + } + + char *endpoint = diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 43910f2622..6814aadcb9 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -83,7 +83,28 @@ ARG TAG=pinned ARG BUILD_TAG ARG DEBIAN_VERSION=bookworm ARG DEBIAN_FLAVOR=${DEBIAN_VERSION}-slim -ARG ALPINE_CURL_VERSION=8.11.1 + +# Here are the INDEX DIGESTS for the images we use. +# You can get them following next steps for now: +# 1. Get an authentication token from DockerHub: +# TOKEN=$(curl -s "https://auth.docker.io/token?service=registry.docker.io&scope=repository:library/debian:pull" | jq -r .token) +# 2. Using that token, query index for the given tag: +# curl -s -H "Authorization: Bearer $TOKEN" \ +# -H "Accept: application/vnd.docker.distribution.manifest.list.v2+json" \ +# "https://registry.hub.docker.com/v2/library/debian/manifests/bullseye-slim" \ +# -I | grep -i docker-content-digest +# 3. As a next step, TODO(fedordikarev): create script and schedule workflow to run these checks +# and updates on regular bases and in automated way. +ARG BOOKWORM_SLIM_SHA=sha256:40b107342c492725bc7aacbe93a49945445191ae364184a6d24fedb28172f6f7 +ARG BULLSEYE_SLIM_SHA=sha256:e831d9a884d63734fe3dd9c491ed9a5a3d4c6a6d32c5b14f2067357c49b0b7e1 + +# Here we use ${var/search/replace} syntax, to check +# if base image is one of the images, we pin image index for. +# If var will match one the known images, we will replace it with the known sha. +# If no match, than value will be unaffected, and will process with no-pinned image. +ARG BASE_IMAGE_SHA=debian:${DEBIAN_FLAVOR} +ARG BASE_IMAGE_SHA=${BASE_IMAGE_SHA/debian:bookworm-slim/debian@$BOOKWORM_SLIM_SHA} +ARG BASE_IMAGE_SHA=${BASE_IMAGE_SHA/debian:bullseye-slim/debian@$BULLSEYE_SLIM_SHA} # By default, build all PostgreSQL extensions. For quick local testing when you don't # care about the extensions, pass EXTENSIONS=none or EXTENSIONS=minimal @@ -94,7 +115,7 @@ ARG EXTENSIONS=all # Layer "build-deps" # ######################################################################################### -FROM debian:$DEBIAN_FLAVOR AS build-deps +FROM $BASE_IMAGE_SHA AS build-deps ARG DEBIAN_VERSION # Use strict mode for bash to catch errors early @@ -103,7 +124,7 @@ SHELL ["/bin/bash", "-euo", "pipefail", "-c"] # By default, /bin/sh used in debian images will treat '\n' as eol, # but as we use bash as SHELL, and built-in echo in bash requires '-e' flag for that. 
RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \ - echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc && \ + echo -e "retry_connrefused = on\ntimeout=15\ntries=5\nretry-on-host-error=on\n" > /root/.wgetrc && \ echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc RUN case $DEBIAN_VERSION in \ @@ -127,7 +148,7 @@ RUN case $DEBIAN_VERSION in \ apt install --no-install-recommends --no-install-suggests -y \ ninja-build git autoconf automake libtool build-essential bison flex libreadline-dev \ zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget ca-certificates pkg-config libssl-dev \ - libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd \ + libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd curl unzip \ $VERSION_INSTALLS \ && apt clean && rm -rf /var/lib/apt/lists/* @@ -139,11 +160,11 @@ RUN case $DEBIAN_VERSION in \ ######################################################################################### FROM build-deps AS pg-build ARG PG_VERSION -COPY vendor/postgres-${PG_VERSION} postgres +COPY vendor/postgres-${PG_VERSION:?} postgres RUN cd postgres && \ export CONFIGURE_CMD="./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp \ --with-icu --with-libxml --with-libxslt --with-lz4" && \ - if [ "${PG_VERSION}" != "v14" ]; then \ + if [ "${PG_VERSION:?}" != "v14" ]; then \ # zstd is available only from PG15 export CONFIGURE_CMD="${CONFIGURE_CMD} --with-zstd"; \ fi && \ @@ -237,7 +258,7 @@ RUN case "${DEBIAN_VERSION}" in \ # Postgis 3.5.0 supports v17 WORKDIR /ext-src -RUN case "${PG_VERSION}" in \ +RUN case "${PG_VERSION:?}" in \ "v17") \ export POSTGIS_VERSION=3.5.0 \ export POSTGIS_CHECKSUM=ca698a22cc2b2b3467ac4e063b43a28413f3004ddd505bdccdd74c56a647f510 \ @@ -312,7 +333,7 @@ FROM build-deps AS pgrouting-src ARG DEBIAN_VERSION ARG PG_VERSION WORKDIR /ext-src -RUN case "${PG_VERSION}" in \ +RUN case "${PG_VERSION:?}" in \ "v17") \ export PGROUTING_VERSION=3.6.2 \ export PGROUTING_CHECKSUM=f4a1ed79d6f714e52548eca3bb8e5593c6745f1bde92eb5fb858efd8984dffa2 \ @@ -358,7 +379,7 @@ COPY compute/patches/plv8-3.1.10.patch . 
# # Use new version only for v17 # because since v3.2, plv8 doesn't include plcoffee and plls extensions -RUN case "${PG_VERSION}" in \ +RUN case "${PG_VERSION:?}" in \ "v17") \ export PLV8_TAG=v3.2.3 \ ;; \ @@ -372,7 +393,7 @@ RUN case "${PG_VERSION}" in \ git clone --recurse-submodules --depth 1 --branch ${PLV8_TAG} https://github.com/plv8/plv8.git plv8-src && \ tar -czf plv8.tar.gz --exclude .git plv8-src && \ cd plv8-src && \ - if [[ "${PG_VERSION}" < "v17" ]]; then patch -p1 < /ext-src/plv8-3.1.10.patch; fi + if [[ "${PG_VERSION:?}" < "v17" ]]; then patch -p1 < /ext-src/plv8-3.1.10.patch; fi FROM pg-build AS plv8-build ARG PG_VERSION @@ -392,7 +413,7 @@ RUN \ find /usr/local/pgsql/ -name "plv8-*.so" | xargs strip && \ # don't break computes with installed old version of plv8 cd /usr/local/pgsql/lib/ && \ - case "${PG_VERSION}" in \ + case "${PG_VERSION:?}" in \ "v17") \ ln -s plv8-3.2.3.so plv8-3.1.8.so && \ ln -s plv8-3.2.3.so plv8-3.1.5.so && \ @@ -729,7 +750,7 @@ FROM build-deps AS timescaledb-src ARG PG_VERSION WORKDIR /ext-src -RUN case "${PG_VERSION}" in \ +RUN case "${PG_VERSION:?}" in \ "v14" | "v15") \ export TIMESCALEDB_VERSION=2.10.1 \ export TIMESCALEDB_CHECKSUM=6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 \ @@ -767,7 +788,7 @@ ARG PG_VERSION # version-specific, has separate releases for each version WORKDIR /ext-src -RUN case "${PG_VERSION}" in \ +RUN case "${PG_VERSION:?}" in \ "v14") \ export PG_HINT_PLAN_VERSION=14_1_4_1 \ export PG_HINT_PLAN_CHECKSUM=c3501becf70ead27f70626bce80ea401ceac6a77e2083ee5f3ff1f1444ec1ad1 \ @@ -843,7 +864,7 @@ ARG PG_VERSION # https://github.com/rdkit/rdkit/releases/tag/Release_2024_09_1 WORKDIR /ext-src -RUN case "${PG_VERSION}" in \ +RUN case "${PG_VERSION:?}" in \ "v17") \ export RDKIT_VERSION=Release_2024_09_1 \ export RDKIT_CHECKSUM=034c00d6e9de323506834da03400761ed8c3721095114369d06805409747a60f \ @@ -970,7 +991,7 @@ ARG PG_VERSION # # last release v0.40.0 - Jul 22, 2024 WORKDIR /ext-src -RUN case "${PG_VERSION}" in \ +RUN case "${PG_VERSION:?}" in \ "v17") \ export SEMVER_VERSION=0.40.0 \ export SEMVER_CHECKSUM=3e50bcc29a0e2e481e7b6d2bc937cadc5f5869f55d983b5a1aafeb49f5425cfc \ @@ -1006,7 +1027,7 @@ ARG PG_VERSION # This is our extension, support stopped in favor of pgvector # TODO: deprecate it WORKDIR /ext-src -RUN case "${PG_VERSION}" in \ +RUN case "${PG_VERSION:?}" in \ "v14" | "v15") \ export PG_EMBEDDING_VERSION=0.3.5 \ export PG_EMBEDDING_CHECKSUM=0e95b27b8b6196e2cf0a0c9ec143fe2219b82e54c5bb4ee064e76398cbe69ae9 \ @@ -1039,7 +1060,7 @@ ARG PG_VERSION # This is an experimental extension, never got to real production. # !Do not remove! It can be present in shared_preload_libraries and compute will fail to start if library is not found. WORKDIR /ext-src -RUN case "${PG_VERSION}" in "v17") \ +RUN case "${PG_VERSION:?}" in "v17") \ echo "postgresql_anonymizer does not yet support PG17" && exit 0;; \ esac && \ wget https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \ @@ -1091,7 +1112,7 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux FROM pg-build-nonroot-with-cargo AS rust-extensions-build ARG PG_VERSION -RUN case "${PG_VERSION}" in \ +RUN case "${PG_VERSION:?}" in \ 'v17') \ echo 'v17 is not supported yet by pgrx. 
Quit' && exit 0;; \ esac && \ @@ -1270,7 +1291,7 @@ FROM build-deps AS pgx_ulid-src ARG PG_VERSION WORKDIR /ext-src -RUN case "${PG_VERSION}" in \ +RUN case "${PG_VERSION:?}" in \ "v14" | "v15" | "v16") \ ;; \ *) \ @@ -1302,7 +1323,7 @@ FROM build-deps AS pgx_ulid-pgrx12-src ARG PG_VERSION WORKDIR /ext-src -RUN case "${PG_VERSION}" in \ +RUN case "${PG_VERSION:?}" in \ "v17") \ ;; \ *) \ @@ -1430,8 +1451,8 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) && \ FROM build-deps AS pg_mooncake-src ARG PG_VERSION WORKDIR /ext-src -RUN wget https://github.com/Mooncake-Labs/pg_mooncake/releases/download/v0.1.1/pg_mooncake-0.1.1.tar.gz -O pg_mooncake.tar.gz && \ - echo "a2d16eff7948dde64f072609ca5d2962d6b4d07cb89d45952add473529c55f55 pg_mooncake.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/Mooncake-Labs/pg_mooncake/releases/download/v0.1.2/pg_mooncake-0.1.2.tar.gz -O pg_mooncake.tar.gz && \ + echo "4550473784fcdd2e1e18062bc01eb9c286abd27cdf5e11a4399be6c0a426ba90 pg_mooncake.tar.gz" | sha256sum --check && \ mkdir pg_mooncake-src && cd pg_mooncake-src && tar xzf ../pg_mooncake.tar.gz --strip-components=1 -C . && \ echo "make -f pg_mooncake-src/Makefile.build installcheck TEST_DIR=./test SQL_DIR=./sql SRC_DIR=./src" > neon-test.sh && \ chmod a+x neon-test.sh @@ -1578,7 +1599,15 @@ ENV BUILD_TAG=$BUILD_TAG USER nonroot # Copy entire project to get Cargo.* files with proper dependencies for the whole project COPY --chown=nonroot . . -RUN mold -run cargo build --locked --profile release-line-debug-size-lto --bin compute_ctl --bin fast_import --bin local_proxy +RUN --mount=type=cache,uid=1000,target=/home/nonroot/.cargo/registry \ + --mount=type=cache,uid=1000,target=/home/nonroot/.cargo/git \ + --mount=type=cache,uid=1000,target=/home/nonroot/target \ + mold -run cargo build --locked --profile release-line-debug-size-lto --bin compute_ctl --bin fast_import --bin local_proxy && \ + mkdir target-bin && \ + cp target/release-line-debug-size-lto/compute_ctl \ + target/release-line-debug-size-lto/fast_import \ + target/release-line-debug-size-lto/local_proxy \ + target-bin ######################################################################################### # @@ -1586,7 +1615,7 @@ RUN mold -run cargo build --locked --profile release-line-debug-size-lto --bin c # ######################################################################################### -FROM debian:$DEBIAN_FLAVOR AS pgbouncer +FROM $BASE_IMAGE_SHA AS pgbouncer RUN set -e \ && echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries \ && apt update \ @@ -1607,7 +1636,7 @@ RUN set -e \ && git clone --recurse-submodules --depth 1 --branch ${PGBOUNCER_TAG} https://github.com/pgbouncer/pgbouncer.git pgbouncer \ && cd pgbouncer \ && ./autogen.sh \ - && LDFLAGS=-static ./configure --prefix=/usr/local/pgbouncer --without-openssl \ + && ./configure --prefix=/usr/local/pgbouncer --without-openssl \ && make -j $(nproc) dist_man_MANS= \ && make install dist_man_MANS= @@ -1616,13 +1645,12 @@ RUN set -e \ # Layer "exporters" # ######################################################################################### -FROM alpine/curl:${ALPINE_CURL_VERSION} AS exporters +FROM build-deps AS exporters ARG TARGETARCH # Keep sql_exporter version same as in build-tools.Dockerfile and # test_runner/regress/test_compute_metrics.py # See comment on the top of the file regading `echo`, `-e` and `\n` -RUN echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc; \ - if [ "$TARGETARCH" = "amd64" ]; then\ +RUN 
if [ "$TARGETARCH" = "amd64" ]; then\ postgres_exporter_sha256='027e75dda7af621237ff8f5ac66b78a40b0093595f06768612b92b1374bd3105';\ pgbouncer_exporter_sha256='c9f7cf8dcff44f0472057e9bf52613d93f3ffbc381ad7547a959daa63c5e84ac';\ sql_exporter_sha256='38e439732bbf6e28ca4a94d7bc3686d3fa1abdb0050773d5617a9efdb9e64d08';\ @@ -1641,6 +1669,29 @@ RUN echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 30 && echo "${pgbouncer_exporter_sha256} pgbouncer_exporter" | sha256sum -c -\ && echo "${sql_exporter_sha256} sql_exporter" | sha256sum -c - +######################################################################################### +# +# Layer "awscli" +# +######################################################################################### +FROM build-deps AS awscli +ARG TARGETARCH +RUN set -ex; \ + if [ "${TARGETARCH}" = "amd64" ]; then \ + TARGETARCH_ALT="x86_64"; \ + CHECKSUM="c9a9df3770a3ff9259cb469b6179e02829687a464e0824d5c32d378820b53a00"; \ + elif [ "${TARGETARCH}" = "arm64" ]; then \ + TARGETARCH_ALT="aarch64"; \ + CHECKSUM="8181730be7891582b38b028112e81b4899ca817e8c616aad807c9e9d1289223a"; \ + else \ + echo "Unsupported architecture: ${TARGETARCH}"; exit 1; \ + fi; \ + curl --retry 5 -L "https://awscli.amazonaws.com/awscli-exe-linux-${TARGETARCH_ALT}-2.17.5.zip" -o /tmp/awscliv2.zip; \ + echo "${CHECKSUM} /tmp/awscliv2.zip" | sha256sum -c -; \ + unzip /tmp/awscliv2.zip -d /tmp/awscliv2; \ + /tmp/awscliv2/aws/install; \ + rm -rf /tmp/awscliv2.zip /tmp/awscliv2 + ######################################################################################### # # Clean up postgres folder before inclusion @@ -1673,7 +1724,7 @@ USER nonroot COPY --chown=nonroot compute compute -RUN make PG_VERSION="${PG_VERSION}" -C compute +RUN make PG_VERSION="${PG_VERSION:?}" -C compute ######################################################################################### # @@ -1706,8 +1757,8 @@ COPY --from=hll-src /ext-src/ /ext-src/ COPY --from=plpgsql_check-src /ext-src/ /ext-src/ #COPY --from=timescaledb-src /ext-src/ /ext-src/ COPY --from=pg_hint_plan-src /ext-src/ /ext-src/ -COPY compute/patches/pg_hint_plan_${PG_VERSION}.patch /ext-src -RUN cd /ext-src/pg_hint_plan-src && patch -p1 < /ext-src/pg_hint_plan_${PG_VERSION}.patch +COPY compute/patches/pg_hint_plan_${PG_VERSION:?}.patch /ext-src +RUN cd /ext-src/pg_hint_plan-src && patch -p1 < /ext-src/pg_hint_plan_${PG_VERSION:?}.patch COPY --from=pg_cron-src /ext-src/ /ext-src/ #COPY --from=pgx_ulid-src /ext-src/ /ext-src/ #COPY --from=pgx_ulid-pgrx12-src /ext-src/ /ext-src/ @@ -1736,51 +1787,12 @@ ENV PGDATABASE=postgres # Put it all together into the final image # ######################################################################################### -FROM debian:$DEBIAN_FLAVOR +FROM $BASE_IMAGE_SHA ARG DEBIAN_VERSION # Use strict mode for bash to catch errors early SHELL ["/bin/bash", "-euo", "pipefail", "-c"] -# Add user postgres -RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ - echo "postgres:test_console_pass" | chpasswd && \ - mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \ - mkdir /var/db/postgres/pgbouncer && \ - chown -R postgres:postgres /var/db/postgres && \ - chmod 0750 /var/db/postgres/compute && \ - chmod 0750 /var/db/postgres/pgbouncer && \ - echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig && \ - # create folder for file cache - mkdir -p -m 777 /neon/cache - -COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local -COPY --from=compute-tools 
--chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl -COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/fast_import /usr/local/bin/fast_import - -# pgbouncer and its config -COPY --from=pgbouncer /usr/local/pgbouncer/bin/pgbouncer /usr/local/bin/pgbouncer -COPY --chmod=0666 --chown=postgres compute/etc/pgbouncer.ini /etc/pgbouncer.ini - -# local_proxy and its config -COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/local_proxy /usr/local/bin/local_proxy -RUN mkdir -p /etc/local_proxy && chown postgres:postgres /etc/local_proxy - -# Metrics exporter binaries and configuration files -COPY --from=exporters ./postgres_exporter /bin/postgres_exporter -COPY --from=exporters ./pgbouncer_exporter /bin/pgbouncer_exporter -COPY --from=exporters ./sql_exporter /bin/sql_exporter - -COPY --chown=postgres compute/etc/postgres_exporter.yml /etc/postgres_exporter.yml - -COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/sql_exporter.yml /etc/sql_exporter.yml -COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neon_collector.yml /etc/neon_collector.yml -COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/sql_exporter_autoscaling.yml /etc/sql_exporter_autoscaling.yml -COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neon_collector_autoscaling.yml /etc/neon_collector_autoscaling.yml - -# Create remote extension download directory -RUN mkdir /usr/local/download_extensions && chown -R postgres:postgres /usr/local/download_extensions - # Install: # libreadline8 for psql # liblz4-1 for lz4 @@ -1790,10 +1802,9 @@ RUN mkdir /usr/local/download_extensions && chown -R postgres:postgres /usr/loca # libzstd1 for zstd # libboost* for rdkit # ca-certificates for communicating with s3 by compute_ctl - +# libevent for pgbouncer RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \ echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc - RUN apt update && \ case $DEBIAN_VERSION in \ # Version-specific installs for Bullseye (PG14-PG16): @@ -1828,33 +1839,57 @@ RUN apt update && \ libxslt1.1 \ libzstd1 \ libcurl4 \ + libevent-2.1-7 \ locales \ procps \ ca-certificates \ - curl \ - unzip \ $VERSION_INSTALLS && \ apt clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8 -# aws cli is used by fast_import (curl and unzip above are at this time only used for this installation step) -ARG TARGETARCH -RUN set -ex; \ - if [ "${TARGETARCH}" = "amd64" ]; then \ - TARGETARCH_ALT="x86_64"; \ - CHECKSUM="c9a9df3770a3ff9259cb469b6179e02829687a464e0824d5c32d378820b53a00"; \ - elif [ "${TARGETARCH}" = "arm64" ]; then \ - TARGETARCH_ALT="aarch64"; \ - CHECKSUM="8181730be7891582b38b028112e81b4899ca817e8c616aad807c9e9d1289223a"; \ - else \ - echo "Unsupported architecture: ${TARGETARCH}"; exit 1; \ - fi; \ - curl --retry 5 -L "https://awscli.amazonaws.com/awscli-exe-linux-${TARGETARCH_ALT}-2.17.5.zip" -o /tmp/awscliv2.zip; \ - echo "${CHECKSUM} /tmp/awscliv2.zip" | sha256sum -c -; \ - unzip /tmp/awscliv2.zip -d /tmp/awscliv2; \ - /tmp/awscliv2/aws/install; \ - rm -rf /tmp/awscliv2.zip /tmp/awscliv2; \ - true +# Add user postgres +RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ + echo "postgres:test_console_pass" | chpasswd && \ + mkdir /var/db/postgres/compute && mkdir 
/var/db/postgres/specs && \ + mkdir /var/db/postgres/pgbouncer && \ + chown -R postgres:postgres /var/db/postgres && \ + chmod 0750 /var/db/postgres/compute && \ + chmod 0750 /var/db/postgres/pgbouncer && \ + # create folder for file cache + mkdir -p -m 777 /neon/cache && \ + # Create remote extension download directory + mkdir /usr/local/download_extensions && \ + chown -R postgres:postgres /usr/local/download_extensions + +# aws cli is used by fast_import +COPY --from=awscli /usr/local/aws-cli /usr/local/aws-cli + +# pgbouncer and its config +COPY --from=pgbouncer /usr/local/pgbouncer/bin/pgbouncer /usr/local/bin/pgbouncer +COPY --chmod=0666 --chown=postgres compute/etc/pgbouncer.ini /etc/pgbouncer.ini + +COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local +COPY --from=compute-tools --chown=postgres /home/nonroot/target-bin/compute_ctl /usr/local/bin/compute_ctl +COPY --from=compute-tools --chown=postgres /home/nonroot/target-bin/fast_import /usr/local/bin/fast_import + +# local_proxy and its config +COPY --from=compute-tools --chown=postgres /home/nonroot/target-bin/local_proxy /usr/local/bin/local_proxy +RUN mkdir -p /etc/local_proxy && chown postgres:postgres /etc/local_proxy + +# Metrics exporter binaries and configuration files +COPY --from=exporters ./postgres_exporter /bin/postgres_exporter +COPY --from=exporters ./pgbouncer_exporter /bin/pgbouncer_exporter +COPY --from=exporters ./sql_exporter /bin/sql_exporter + +COPY --chown=postgres compute/etc/postgres_exporter.yml /etc/postgres_exporter.yml + +COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/sql_exporter.yml /etc/sql_exporter.yml +COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neon_collector.yml /etc/neon_collector.yml +COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/sql_exporter_autoscaling.yml /etc/sql_exporter_autoscaling.yml +COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neon_collector_autoscaling.yml /etc/neon_collector_autoscaling.yml + +# Make the libraries we built available +RUN echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig ENV LANG=en_US.utf8 USER postgres diff --git a/compute/patches/pg_hint_plan_v16.patch b/compute/patches/pg_hint_plan_v16.patch index 4039a036df..1fc3ffa609 100644 --- a/compute/patches/pg_hint_plan_v16.patch +++ b/compute/patches/pg_hint_plan_v16.patch @@ -6,16 +6,16 @@ index da723b8..5328114 100644 ---- -- No.A-1-1-3 CREATE EXTENSION pg_hint_plan; -+LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan ++LOG: Sending request to compute_ctl: http://localhost:3081/extension_server/pg_hint_plan -- No.A-1-2-3 DROP EXTENSION pg_hint_plan; -- No.A-1-1-4 CREATE SCHEMA other_schema; CREATE EXTENSION pg_hint_plan SCHEMA other_schema; -+LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan ++LOG: Sending request to compute_ctl: http://localhost:3081/extension_server/pg_hint_plan ERROR: extension "pg_hint_plan" must be installed in schema "hint_plan" CREATE EXTENSION pg_hint_plan; -+LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan ++LOG: Sending request to compute_ctl: http://localhost:3081/extension_server/pg_hint_plan DROP SCHEMA other_schema; ---- ---- No. 
A-5-1 comment pattern @@ -35,7 +35,7 @@ index d372459..6282afe 100644 SET client_min_messages TO LOG; SET pg_hint_plan.enable_hint TO on; CREATE EXTENSION file_fdw; -+LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/file_fdw ++LOG: Sending request to compute_ctl: http://localhost:3081/extension_server/file_fdw CREATE SERVER file_server FOREIGN DATA WRAPPER file_fdw; CREATE USER MAPPING FOR PUBLIC SERVER file_server; CREATE FOREIGN TABLE ft1 (id int, val int) SERVER file_server OPTIONS (format 'csv', filename :'filename'); diff --git a/compute/patches/pg_hint_plan_v17.patch b/compute/patches/pg_hint_plan_v17.patch index dbf4e470ea..3442a094eb 100644 --- a/compute/patches/pg_hint_plan_v17.patch +++ b/compute/patches/pg_hint_plan_v17.patch @@ -6,16 +6,16 @@ index e7d68a1..65a056c 100644 ---- -- No.A-1-1-3 CREATE EXTENSION pg_hint_plan; -+LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan ++LOG: Sending request to compute_ctl: http://localhost:3081/extension_server/pg_hint_plan -- No.A-1-2-3 DROP EXTENSION pg_hint_plan; -- No.A-1-1-4 CREATE SCHEMA other_schema; CREATE EXTENSION pg_hint_plan SCHEMA other_schema; -+LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan ++LOG: Sending request to compute_ctl: http://localhost:3081/extension_server/pg_hint_plan ERROR: extension "pg_hint_plan" must be installed in schema "hint_plan" CREATE EXTENSION pg_hint_plan; -+LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan ++LOG: Sending request to compute_ctl: http://localhost:3081/extension_server/pg_hint_plan DROP SCHEMA other_schema; ---- ---- No. A-5-1 comment pattern @@ -168,7 +168,7 @@ index 017fa4b..98d989b 100644 SET client_min_messages TO LOG; SET pg_hint_plan.enable_hint TO on; CREATE EXTENSION file_fdw; -+LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/file_fdw ++LOG: Sending request to compute_ctl: http://localhost:3081/extension_server/file_fdw CREATE SERVER file_server FOREIGN DATA WRAPPER file_fdw; CREATE USER MAPPING FOR PUBLIC SERVER file_server; CREATE FOREIGN TABLE ft1 (id int, val int) SERVER file_server OPTIONS (format 'csv', filename :'filename'); diff --git a/compute/vm-image-spec-bookworm.yaml b/compute/vm-image-spec-bookworm.yaml index 005143fff3..568f0b0444 100644 --- a/compute/vm-image-spec-bookworm.yaml +++ b/compute/vm-image-spec-bookworm.yaml @@ -47,7 +47,9 @@ files: # Allow postgres user (which is what compute_ctl runs as) to run /neonvm/bin/resize-swap # and /neonvm/bin/set-disk-quota as root without requiring entering a password (NOPASSWD), # regardless of hostname (ALL) - postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota + # + # Also allow it to shut down the VM. The fast_import job does that when it's finished. + postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota, /neonvm/bin/poweroff - filename: cgconfig.conf content: | # Configuration for cgroups in VM compute nodes @@ -72,8 +74,8 @@ build: | # At time of migration to bookworm (2024-10-09), debian has a version of libcgroup/cgroup-tools 2.0.2, # and it _probably_ can be used as-is. However, we'll build it ourselves to minimise the changeset # for debian version migration. 
- # - FROM debian:bookworm-slim as libcgroup-builder + ARG BOOKWORM_SLIM_SHA=sha256:40b107342c492725bc7aacbe93a49945445191ae364184a6d24fedb28172f6f7 + FROM debian@$BOOKWORM_SLIM_SHA as libcgroup-builder ENV LIBCGROUP_VERSION=v2.0.3 RUN set -exu \ diff --git a/compute/vm-image-spec-bullseye.yaml b/compute/vm-image-spec-bullseye.yaml index 2fe50c3a45..124c40cf5d 100644 --- a/compute/vm-image-spec-bullseye.yaml +++ b/compute/vm-image-spec-bullseye.yaml @@ -68,7 +68,8 @@ build: | # At time of writing (2023-03-14), debian bullseye has a version of cgroup-tools (technically # libcgroup) that doesn't support cgroup v2 (version 0.41-11). Unfortunately, the vm-monitor # requires cgroup v2, so we'll build cgroup-tools ourselves. - FROM debian:bullseye-slim as libcgroup-builder + ARG BULLSEYE_SLIM_SHA=sha256:e831d9a884d63734fe3dd9c491ed9a5a3d4c6a6d32c5b14f2067357c49b0b7e1 + FROM debian@$BULLSEYE_SLIM_SHA as libcgroup-builder ENV LIBCGROUP_VERSION=v2.0.3 RUN set -exu \ diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 47fc9cb7fe..df47adda6c 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -41,12 +41,14 @@ use std::process::exit; use std::str::FromStr; use std::sync::atomic::Ordering; use std::sync::{mpsc, Arc, Condvar, Mutex, RwLock}; +use std::time::SystemTime; use std::{thread, time::Duration}; use anyhow::{Context, Result}; use chrono::Utc; use clap::Parser; use compute_tools::disk_quota::set_disk_quota; +use compute_tools::http::server::Server; use compute_tools::lsn_lease::launch_lsn_lease_bg_task_for_static; use signal_hook::consts::{SIGQUIT, SIGTERM}; use signal_hook::{consts::SIGINT, iterator::Signals}; @@ -61,7 +63,6 @@ use compute_tools::compute::{ }; use compute_tools::configurator::launch_configurator; use compute_tools::extension_server::get_pg_version_string; -use compute_tools::http::launch_http_server; use compute_tools::logger::*; use compute_tools::monitor::launch_monitor; use compute_tools::params::*; @@ -85,6 +86,19 @@ fn parse_remote_ext_config(arg: &str) -> Result { } } +/// Generate a compute ID if one is not supplied. This exists to keep forward +/// compatibility tests working, but will be removed in a future iteration. +fn generate_compute_id() -> String { + let now = SystemTime::now(); + + format!( + "compute-{}", + now.duration_since(SystemTime::UNIX_EPOCH) + .unwrap() + .as_secs() + ) +} + #[derive(Parser)] #[command(rename_all = "kebab-case")] struct Cli { @@ -94,8 +108,20 @@ struct Cli { #[arg(short = 'r', long, value_parser = parse_remote_ext_config)] pub remote_ext_config: Option, - #[arg(long, default_value_t = 3080)] - pub http_port: u16, + /// The port to bind the external listening HTTP server to. Clients running + /// outside the compute will talk to the compute through this port. Keep + /// the previous name for this argument around for a smoother release + /// with the control plane. + /// + /// TODO: Remove the alias after the control plane release which teaches the + /// control plane about the renamed argument. + #[arg(long, alias = "http-port", default_value_t = 3080)] + pub external_http_port: u16, + + /// The port to bind the internal listening HTTP server to. Clients like + /// the neon extension (for installing remote extensions) and local_proxy. 
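+    /// These clients talk to compute_ctl over this port. When not specified, it
+    /// defaults to the external HTTP port plus one.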
+ #[arg(long)] + pub internal_http_port: Option, #[arg(short = 'D', long, value_name = "DATADIR")] pub pgdata: String, @@ -130,17 +156,26 @@ struct Cli { #[arg(short = 'S', long, group = "spec-path")] pub spec_path: Option, - #[arg(short = 'i', long, group = "compute-id", conflicts_with_all = ["spec", "spec-path"])] - pub compute_id: Option, + #[arg(short = 'i', long, group = "compute-id", default_value = generate_compute_id())] + pub compute_id: String, - #[arg(short = 'p', long, conflicts_with_all = ["spec", "spec-path"], requires = "compute-id", value_name = "CONTROL_PLANE_API_BASE_URL")] + #[arg(short = 'p', long, conflicts_with_all = ["spec", "spec-path"], value_name = "CONTROL_PLANE_API_BASE_URL")] pub control_plane_uri: Option, } fn main() -> Result<()> { let cli = Cli::parse(); - let build_tag = init()?; + // For historical reasons, the main thread that processes the spec and launches postgres + // is synchronous, but we always have this tokio runtime available and we "enter" it so + // that you can use tokio::spawn() and tokio::runtime::Handle::current().block_on(...) + // from all parts of compute_ctl. + let runtime = tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build()?; + let _rt_guard = runtime.enter(); + + let build_tag = runtime.block_on(init())?; let scenario = failpoint_support::init(); @@ -172,8 +207,8 @@ fn main() -> Result<()> { deinit_and_exit(wait_pg_result); } -fn init() -> Result { - init_tracing_and_logging(DEFAULT_LOG_LEVEL)?; +async fn init() -> Result { + init_tracing_and_logging(DEFAULT_LOG_LEVEL).await?; let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?; thread::spawn(move || { @@ -259,20 +294,11 @@ fn try_spec_from_cli(cli: &Cli) -> Result { }); } - if cli.compute_id.is_none() { - panic!( - "compute spec should be provided by one of the following ways: \ - --spec OR --spec-path OR --control-plane-uri and --compute-id" - ); - }; if cli.control_plane_uri.is_none() { - panic!("must specify both --control-plane-uri and --compute-id or none"); + panic!("must specify --control-plane-uri"); }; - match get_spec_from_control_plane( - cli.control_plane_uri.as_ref().unwrap(), - cli.compute_id.as_ref().unwrap(), - ) { + match get_spec_from_control_plane(cli.control_plane_uri.as_ref().unwrap(), &cli.compute_id) { Ok(spec) => Ok(CliSpecParams { spec, live_config_allowed: true, @@ -319,13 +345,15 @@ fn wait_spec( let tokio_conn_conf = tokio_postgres::config::Config::from_str(connstr.as_str()) .context("cannot build tokio postgres config from connstr")?; let compute_node = ComputeNode { + compute_id: cli.compute_id.clone(), connstr, conn_conf, tokio_conn_conf, pgdata: cli.pgdata.clone(), pgbin: cli.pgbin.clone(), pgversion: get_pg_version_string(&cli.pgbin), - http_port: cli.http_port, + external_http_port: cli.external_http_port, + internal_http_port: cli.internal_http_port.unwrap_or(cli.external_http_port + 1), live_config_allowed, state: Mutex::new(new_state), state_changed: Condvar::new(), @@ -343,10 +371,13 @@ fn wait_spec( compute.prewarm_postgres()?; } - // Launch http service first, so that we can serve control-plane requests - // while configuration is still in progress. - let _http_handle = - launch_http_server(cli.http_port, &compute).expect("cannot launch http endpoint thread"); + // Launch the external HTTP server first, so that we can serve control plane + // requests while configuration is still in progress. 
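As an aside, the runtime arrangement described in the comment at the top of main() can be sketched standalone roughly like this (a minimal, hypothetical example assuming only the tokio crate; it is not part of the patch):

use tokio::runtime::{Builder, Handle};

fn main() {
    // Build a single multi-thread runtime up front...
    let runtime = Builder::new_multi_thread()
        .enable_all()
        .build()
        .expect("failed to build tokio runtime");
    // ...and enter it, so this otherwise synchronous thread has a runtime
    // context: tokio::spawn() and Handle::current() now work from here.
    let _guard = runtime.enter();

    // Fire-and-forget async work spawned from synchronous code.
    let _task = tokio::spawn(async {
        // background work would go here
    });

    // Block the synchronous thread on an async result when one is needed.
    let value = Handle::current().block_on(async { 21 * 2 });
    assert_eq!(value, 42);
}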
+ Server::External(cli.external_http_port).launch(&compute); + + // The internal HTTP server could be launched later, but there isn't much + // sense in waiting. + Server::Internal(cli.internal_http_port.unwrap_or(cli.external_http_port + 1)).launch(&compute); if !spec_set { // No spec provided, hang waiting for it. @@ -484,21 +515,6 @@ fn start_postgres( use std::env; use tokio_util::sync::CancellationToken; - // Note: it seems like you can make a runtime in an inner scope and - // if you start a task in it it won't be dropped. However, make it - // in the outermost scope just to be safe. - let rt = if env::var_os("AUTOSCALING").is_some() { - Some( - tokio::runtime::Builder::new_multi_thread() - .worker_threads(4) - .enable_all() - .build() - .expect("failed to create tokio runtime for monitor") - ) - } else { - None - }; - // This token is used internally by the monitor to clean up all threads let token = CancellationToken::new(); @@ -509,16 +525,19 @@ fn start_postgres( Some(cli.filecache_connstr.clone()) }; - let vm_monitor = rt.as_ref().map(|rt| { - rt.spawn(vm_monitor::start( + let vm_monitor = if env::var_os("AUTOSCALING").is_some() { + let vm_monitor = tokio::spawn(vm_monitor::start( Box::leak(Box::new(vm_monitor::Args { cgroup: Some(cli.cgroup.clone()), pgconnstr, addr: cli.vm_monitor_addr.clone(), })), token.clone(), - )) - }); + )); + Some(vm_monitor) + } else { + None + }; } } @@ -528,8 +547,6 @@ fn start_postgres( delay_exit, compute, #[cfg(target_os = "linux")] - rt, - #[cfg(target_os = "linux")] token, #[cfg(target_os = "linux")] vm_monitor, @@ -537,15 +554,13 @@ fn start_postgres( )) } -type PostgresHandle = (std::process::Child, std::thread::JoinHandle<()>); +type PostgresHandle = (std::process::Child, tokio::task::JoinHandle>); struct StartPostgresResult { delay_exit: bool, // passed through from WaitSpecResult compute: Arc, - #[cfg(target_os = "linux")] - rt: Option, #[cfg(target_os = "linux")] token: tokio_util::sync::CancellationToken, #[cfg(target_os = "linux")] @@ -564,10 +579,10 @@ fn wait_postgres(pg: Option) -> Result { .expect("failed to start waiting on Postgres process"); PG_PID.store(0, Ordering::SeqCst); - // Process has exited, so we can join the logs thread. - let _ = logs_handle - .join() - .map_err(|e| tracing::error!("log thread panicked: {:?}", e)); + // Process has exited. Wait for the log collecting task to finish. + let _ = tokio::runtime::Handle::current() + .block_on(logs_handle) + .map_err(|e| tracing::error!("log task panicked: {:?}", e)); info!("Postgres exited with code {}, shutting down", ecode); exit_code = ecode.code() @@ -588,8 +603,6 @@ fn cleanup_after_postgres_exit( vm_monitor, #[cfg(target_os = "linux")] token, - #[cfg(target_os = "linux")] - rt, }: StartPostgresResult, ) -> Result { // Terminate the vm_monitor so it releases the file watcher on @@ -602,10 +615,6 @@ fn cleanup_after_postgres_exit( token.cancel(); // Kills the actual task running the monitor handle.abort(); - - // If handle is some, rt must have been used to produce it, and - // hence is also some - rt.unwrap().shutdown_timeout(Duration::from_secs(2)); } } } diff --git a/compute_tools/src/bin/fast_import.rs b/compute_tools/src/bin/fast_import.rs index c8440afb64..27cf1c2317 100644 --- a/compute_tools/src/bin/fast_import.rs +++ b/compute_tools/src/bin/fast_import.rs @@ -60,6 +60,16 @@ struct Args { pg_lib_dir: Utf8PathBuf, #[clap(long)] pg_port: Option, // port to run postgres on, 5432 is default + + /// Number of CPUs in the system. 
This is used to configure # of + /// parallel worker processes, for index creation. + #[clap(long, env = "NEON_IMPORTER_NUM_CPUS")] + num_cpus: Option, + + /// Amount of RAM in the system. This is used to configure shared_buffers + /// and maintenance_work_mem. + #[clap(long, env = "NEON_IMPORTER_MEMORY_MB")] + memory_mb: Option, } #[serde_with::serde_as] @@ -202,7 +212,16 @@ pub(crate) async fn main() -> anyhow::Result<()> { .await .context("initdb")?; - let nproc = num_cpus::get(); + // If the caller didn't specify CPU / RAM to use for sizing, default to + // number of CPUs in the system, and pretty arbitrarily, 256 MB of RAM. + let nproc = args.num_cpus.unwrap_or_else(num_cpus::get); + let memory_mb = args.memory_mb.unwrap_or(256); + + // Somewhat arbitrarily, use 10 % of memory for shared buffer cache, 70% for + // maintenance_work_mem (i.e. for sorting during index creation), and leave the rest + // available for misc other stuff that PostgreSQL uses memory for. + let shared_buffers_mb = ((memory_mb as f32) * 0.10) as usize; + let maintenance_work_mem_mb = ((memory_mb as f32) * 0.70) as usize; // // Launch postgres process @@ -212,12 +231,15 @@ pub(crate) async fn main() -> anyhow::Result<()> { .arg(&pgdata_dir) .args(["-p", &format!("{pg_port}")]) .args(["-c", "wal_level=minimal"]) - .args(["-c", "shared_buffers=10GB"]) + .args(["-c", &format!("shared_buffers={shared_buffers_mb}MB")]) .args(["-c", "max_wal_senders=0"]) .args(["-c", "fsync=off"]) .args(["-c", "full_page_writes=off"]) .args(["-c", "synchronous_commit=off"]) - .args(["-c", "maintenance_work_mem=8388608"]) + .args([ + "-c", + &format!("maintenance_work_mem={maintenance_work_mem_mb}MB"), + ]) .args(["-c", &format!("max_parallel_maintenance_workers={nproc}")]) .args(["-c", &format!("max_parallel_workers={nproc}")]) .args(["-c", &format!("max_parallel_workers_per_gather={nproc}")]) @@ -231,6 +253,14 @@ pub(crate) async fn main() -> anyhow::Result<()> { ]) .env_clear() .env("LD_LIBRARY_PATH", &pg_lib_dir) + .env( + "ASAN_OPTIONS", + std::env::var("ASAN_OPTIONS").unwrap_or_default(), + ) + .env( + "UBSAN_OPTIONS", + std::env::var("UBSAN_OPTIONS").unwrap_or_default(), + ) .stdout(std::process::Stdio::piped()) .stderr(std::process::Stdio::piped()) .spawn() diff --git a/compute_tools/src/catalog.rs b/compute_tools/src/catalog.rs index 4a297cfacf..28b10ce21c 100644 --- a/compute_tools/src/catalog.rs +++ b/compute_tools/src/catalog.rs @@ -140,5 +140,34 @@ pub async fn get_database_schema( warn!("pg_dump stderr: {}", line) } }); - Ok(initial_stream.chain(stdout_reader.map(|res| res.map(|b| b.freeze())))) + + #[allow(dead_code)] + struct SchemaStream { + // We keep a reference to the child process to ensure it stays alive + // while the stream is being consumed. 
When SchemaStream is dropped, + // cmd will be dropped, which triggers kill_on_drop and terminates pg_dump + cmd: tokio::process::Child, + stream: S, + } + + impl Stream for SchemaStream + where + S: Stream> + Unpin, + { + type Item = Result; + + fn poll_next( + mut self: std::pin::Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> std::task::Poll> { + Stream::poll_next(std::pin::Pin::new(&mut self.stream), cx) + } + } + + let schema_stream = SchemaStream { + cmd, + stream: initial_stream.chain(stdout_reader.map(|res| res.map(|b| b.freeze()))), + }; + + Ok(schema_stream) } diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index fd76e404c6..d323ea3dcd 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -9,7 +9,6 @@ use std::str::FromStr; use std::sync::atomic::AtomicU32; use std::sync::atomic::Ordering; use std::sync::{Arc, Condvar, Mutex, RwLock}; -use std::thread; use std::time::Duration; use std::time::Instant; @@ -59,6 +58,8 @@ pub static PG_PID: AtomicU32 = AtomicU32::new(0); /// Compute node info shared across several `compute_ctl` threads. pub struct ComputeNode { + /// The ID of the compute + pub compute_id: String, // Url type maintains proper escaping pub connstr: url::Url, // We connect to Postgres from many different places, so build configs once @@ -81,8 +82,10 @@ pub struct ComputeNode { /// - we push spec and it does configuration /// - but then it is restarted without any spec again pub live_config_allowed: bool, - /// The port that the compute's HTTP server listens on - pub http_port: u16, + /// The port that the compute's external HTTP server listens on + pub external_http_port: u16, + /// The port that the compute's internal HTTP server listens on + pub internal_http_port: u16, /// Volatile part of the `ComputeNode`, which should be used under `Mutex`. /// To allow HTTP API server to serving status requests, while configuration /// is in progress, lock should be held only for short periods of time to do @@ -546,11 +549,7 @@ impl ComputeNode { pub fn check_safekeepers_synced(&self, compute_state: &ComputeState) -> Result> { let start_time = Utc::now(); - // Run actual work with new tokio runtime - let rt = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build() - .expect("failed to create rt"); + let rt = tokio::runtime::Handle::current(); let result = rt.block_on(self.check_safekeepers_synced_async(compute_state)); // Record runtime @@ -597,9 +596,9 @@ impl ComputeNode { SYNC_SAFEKEEPERS_PID.store(0, Ordering::SeqCst); // Process has exited, so we can join the logs thread. - let _ = logs_handle - .join() - .map_err(|e| tracing::error!("log thread panicked: {:?}", e)); + let _ = tokio::runtime::Handle::current() + .block_on(logs_handle) + .map_err(|e| tracing::error!("log task panicked: {:?}", e)); if !sync_output.status.success() { anyhow::bail!( @@ -634,7 +633,7 @@ impl ComputeNode { config::write_postgres_conf( &pgdata_path.join("postgresql.conf"), &pspec.spec, - self.http_port, + self.internal_http_port, )?; // Syncing safekeepers is only safe with primary nodes: if a primary @@ -784,7 +783,7 @@ impl ComputeNode { pub fn start_postgres( &self, storage_auth_token: Option, - ) -> Result<(std::process::Child, std::thread::JoinHandle<()>)> { + ) -> Result<(std::process::Child, tokio::task::JoinHandle>)> { let pgdata_path = Path::new(&self.pgdata); // Run postgres as a child process. 
@@ -800,7 +799,7 @@ impl ComputeNode { .expect("cannot start postgres process"); PG_PID.store(pg.id(), Ordering::SeqCst); - // Start a thread to collect logs from stderr. + // Start a task to collect logs from stderr. let stderr = pg.stderr.take().expect("stderr should be captured"); let logs_handle = handle_postgres_logs(stderr); @@ -809,20 +808,28 @@ impl ComputeNode { Ok((pg, logs_handle)) } - /// Do post configuration of the already started Postgres. This function spawns a background thread to + /// Do post configuration of the already started Postgres. This function spawns a background task to /// configure the database after applying the compute spec. Currently, it upgrades the neon extension /// version. In the future, it may upgrade all 3rd-party extensions. #[instrument(skip_all)] pub fn post_apply_config(&self) -> Result<()> { - let conf = self.get_conn_conf(Some("compute_ctl:post_apply_config")); - thread::spawn(move || { - let func = || { - let mut client = conf.connect(NoTls)?; + let conf = self.get_tokio_conn_conf(Some("compute_ctl:post_apply_config")); + tokio::spawn(async move { + let res = async { + let (mut client, connection) = conf.connect(NoTls).await?; + tokio::spawn(async move { + if let Err(e) = connection.await { + eprintln!("connection error: {}", e); + } + }); + handle_neon_extension_upgrade(&mut client) + .await .context("handle_neon_extension_upgrade")?; Ok::<_, anyhow::Error>(()) - }; - if let Err(err) = func() { + } + .await; + if let Err(err) = res { error!("error while post_apply_config: {err:#}"); } }); @@ -919,13 +926,10 @@ impl ComputeNode { conf: Arc, concurrency: usize, ) -> Result<()> { - let rt = tokio::runtime::Builder::new_multi_thread() - .enable_all() - .build()?; - info!("Applying config with max {} concurrency", concurrency); debug!("Config: {:?}", spec); + let rt = tokio::runtime::Handle::current(); rt.block_on(async { // Proceed with post-startup configuration. Note, that order of operations is important. let client = Self::get_maintenance_client(&conf).await?; @@ -1319,14 +1323,18 @@ impl ComputeNode { } // Run migrations separately to not hold up cold starts - thread::spawn(move || { - let conf = conf.as_ref().clone(); - let mut conf = postgres::config::Config::from(conf); + tokio::spawn(async move { + let mut conf = conf.as_ref().clone(); conf.application_name("compute_ctl:migrations"); - match conf.connect(NoTls) { - Ok(mut client) => { - if let Err(e) = handle_migrations(&mut client) { + match conf.connect(NoTls).await { + Ok((mut client, connection)) => { + tokio::spawn(async move { + if let Err(e) = connection.await { + eprintln!("connection error: {}", e); + } + }); + if let Err(e) = handle_migrations(&mut client).await { error!("Failed to run migrations: {}", e); } } @@ -1363,16 +1371,11 @@ impl ComputeNode { if let Some(ref pgbouncer_settings) = spec.pgbouncer_settings { info!("tuning pgbouncer"); - let rt = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build() - .expect("failed to create rt"); - - // Spawn a thread to do the tuning, + // Spawn a background task to do the tuning, // so that we don't block the main thread that starts Postgres. 
let pgbouncer_settings = pgbouncer_settings.clone(); - let _handle = thread::spawn(move || { - let res = rt.block_on(tune_pgbouncer(pgbouncer_settings)); + tokio::spawn(async move { + let res = tune_pgbouncer(pgbouncer_settings).await; if let Err(err) = res { error!("error while tuning pgbouncer: {err:?}"); } @@ -1382,41 +1385,42 @@ impl ComputeNode { if let Some(ref local_proxy) = spec.local_proxy_config { info!("configuring local_proxy"); - // Spawn a thread to do the configuration, + // Spawn a background task to do the configuration, // so that we don't block the main thread that starts Postgres. let local_proxy = local_proxy.clone(); - let _handle = Some(thread::spawn(move || { + tokio::spawn(async move { if let Err(err) = local_proxy::configure(&local_proxy) { error!("error while configuring local_proxy: {err:?}"); } - })); + }); } // Write new config let pgdata_path = Path::new(&self.pgdata); let postgresql_conf_path = pgdata_path.join("postgresql.conf"); - config::write_postgres_conf(&postgresql_conf_path, &spec, self.http_port)?; + config::write_postgres_conf(&postgresql_conf_path, &spec, self.internal_http_port)?; - let max_concurrent_connections = spec.reconfigure_concurrency; + if !spec.skip_pg_catalog_updates { + let max_concurrent_connections = spec.reconfigure_concurrency; + // Temporarily reset max_cluster_size in config + // to avoid the possibility of hitting the limit, while we are reconfiguring: + // creating new extensions, roles, etc. + config::with_compute_ctl_tmp_override(pgdata_path, "neon.max_cluster_size=-1", || { + self.pg_reload_conf()?; - // Temporarily reset max_cluster_size in config - // to avoid the possibility of hitting the limit, while we are reconfiguring: - // creating new extensions, roles, etc. - config::with_compute_ctl_tmp_override(pgdata_path, "neon.max_cluster_size=-1", || { - self.pg_reload_conf()?; + if spec.mode == ComputeMode::Primary { + let mut conf = tokio_postgres::Config::from_str(self.connstr.as_str()).unwrap(); + conf.application_name("apply_config"); + let conf = Arc::new(conf); - if spec.mode == ComputeMode::Primary { - let mut conf = tokio_postgres::Config::from_str(self.connstr.as_str()).unwrap(); - conf.application_name("apply_config"); - let conf = Arc::new(conf); + let spec = Arc::new(spec.clone()); - let spec = Arc::new(spec.clone()); + self.apply_spec_sql(spec, conf, max_concurrent_connections)?; + } - self.apply_spec_sql(spec, conf, max_concurrent_connections)?; - } - - Ok(()) - })?; + Ok(()) + })?; + } self.pg_reload_conf()?; @@ -1431,7 +1435,9 @@ impl ComputeNode { } #[instrument(skip_all)] - pub fn start_compute(&self) -> Result<(std::process::Child, std::thread::JoinHandle<()>)> { + pub fn start_compute( + &self, + ) -> Result<(std::process::Child, tokio::task::JoinHandle>)> { let compute_state = self.state.lock().unwrap().clone(); let pspec = compute_state.pspec.as_ref().expect("spec must be set"); info!( @@ -1446,16 +1452,11 @@ impl ComputeNode { if let Some(pgbouncer_settings) = &pspec.spec.pgbouncer_settings { info!("tuning pgbouncer"); - let rt = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build() - .expect("failed to create rt"); - - // Spawn a thread to do the tuning, + // Spawn a background task to do the tuning, // so that we don't block the main thread that starts Postgres. 
let pgbouncer_settings = pgbouncer_settings.clone(); - let _handle = thread::spawn(move || { - let res = rt.block_on(tune_pgbouncer(pgbouncer_settings)); + let _handle = tokio::spawn(async move { + let res = tune_pgbouncer(pgbouncer_settings).await; if let Err(err) = res { error!("error while tuning pgbouncer: {err:?}"); } @@ -1465,10 +1466,10 @@ impl ComputeNode { if let Some(local_proxy) = &pspec.spec.local_proxy_config { info!("configuring local_proxy"); - // Spawn a thread to do the configuration, + // Spawn a background task to do the configuration, // so that we don't block the main thread that starts Postgres. let local_proxy = local_proxy.clone(); - let _handle = thread::spawn(move || { + let _handle = tokio::spawn(async move { if let Err(err) = local_proxy::configure(&local_proxy) { error!("error while configuring local_proxy: {err:?}"); } @@ -1487,7 +1488,8 @@ impl ComputeNode { extension_server::create_control_files(remote_extensions, &self.pgbin); let library_load_start_time = Utc::now(); - let remote_ext_metrics = self.prepare_preload_libraries(&pspec.spec)?; + let rt = tokio::runtime::Handle::current(); + let remote_ext_metrics = rt.block_on(self.prepare_preload_libraries(&pspec.spec))?; let library_load_time = Utc::now() .signed_duration_since(library_load_start_time) @@ -1542,7 +1544,7 @@ impl ComputeNode { self.post_apply_config()?; let conf = self.get_conn_conf(None); - thread::spawn(move || { + tokio::task::spawn_blocking(|| { let res = get_installed_extensions(conf); match res { Ok(extensions) => { @@ -1891,7 +1893,6 @@ LIMIT 100", Ok(ext_version) } - #[tokio::main] pub async fn prepare_preload_libraries( &self, spec: &ComputeSpec, diff --git a/compute_tools/src/configurator.rs b/compute_tools/src/configurator.rs index a2043529a1..d88f26ca20 100644 --- a/compute_tools/src/configurator.rs +++ b/compute_tools/src/configurator.rs @@ -51,9 +51,12 @@ fn configurator_main_loop(compute: &Arc) { pub fn launch_configurator(compute: &Arc) -> thread::JoinHandle<()> { let compute = Arc::clone(compute); + let runtime = tokio::runtime::Handle::current(); + thread::Builder::new() .name("compute-configurator".into()) .spawn(move || { + let _rt_guard = runtime.enter(); configurator_main_loop(&compute); info!("configurator thread is exited"); }) diff --git a/compute_tools/src/http/mod.rs b/compute_tools/src/http/mod.rs index a596bea504..93eb6ef5b7 100644 --- a/compute_tools/src/http/mod.rs +++ b/compute_tools/src/http/mod.rs @@ -4,11 +4,9 @@ use http::{header::CONTENT_TYPE, StatusCode}; use serde::Serialize; use tracing::error; -pub use server::launch_http_server; - mod extract; mod routes; -mod server; +pub mod server; /// Convenience response builder for JSON responses struct JsonResponse; diff --git a/compute_tools/src/http/routes/failpoints.rs b/compute_tools/src/http/routes/failpoints.rs index 2ec4511676..836417d784 100644 --- a/compute_tools/src/http/routes/failpoints.rs +++ b/compute_tools/src/http/routes/failpoints.rs @@ -1,7 +1,21 @@ use axum::response::{IntoResponse, Response}; use http::StatusCode; +use serde::{Deserialize, Serialize}; use tracing::info; -use utils::failpoint_support::{apply_failpoint, ConfigureFailpointsRequest}; +use utils::failpoint_support::apply_failpoint; + +pub type ConfigureFailpointsRequest = Vec; + +/// Information for configuring a single fail point +#[derive(Debug, Serialize, Deserialize)] +pub struct FailpointConfig { + /// Name of the fail point + pub name: String, + /// List of actions to take, using the format described in `fail::cfg` + /// + 
/// We also support `actions = "exit"` to cause the fail point to immediately exit. + pub actions: String, +} use crate::http::{extract::Json, JsonResponse}; diff --git a/compute_tools/src/http/server.rs b/compute_tools/src/http/server.rs index e41ed9df2d..a523ecd96f 100644 --- a/compute_tools/src/http/server.rs +++ b/compute_tools/src/http/server.rs @@ -1,7 +1,7 @@ use std::{ + fmt::Display, net::{IpAddr, Ipv6Addr, SocketAddr}, sync::Arc, - thread, time::Duration, }; @@ -26,46 +26,65 @@ use super::routes::{ }; use crate::compute::ComputeNode; -async fn handle_404() -> Response { - StatusCode::NOT_FOUND.into_response() -} - const X_REQUEST_ID: &str = "x-request-id"; -/// This middleware function allows compute_ctl to generate its own request ID -/// if one isn't supplied. The control plane will always send one as a UUID. The -/// neon Postgres extension on the other hand does not send one. -async fn maybe_add_request_id_header(mut request: Request, next: Next) -> Response { - let headers = request.headers_mut(); - - if headers.get(X_REQUEST_ID).is_none() { - headers.append(X_REQUEST_ID, Uuid::new_v4().to_string().parse().unwrap()); - } - - next.run(request).await +/// `compute_ctl` has two servers: internal and external. The internal server +/// binds to the loopback interface and handles communication from clients on +/// the compute. The external server is what receives communication from the +/// control plane, the metrics scraper, etc. We make the distinction because +/// certain routes in `compute_ctl` only need to be exposed to local processes +/// like Postgres via the neon extension and local_proxy. +#[derive(Clone, Copy, Debug)] +pub enum Server { + Internal(u16), + External(u16), } -/// Run the HTTP server and wait on it forever. -#[tokio::main] -async fn serve(port: u16, compute: Arc) { - let mut app = Router::new() - .route("/check_writability", post(check_writability::is_writable)) - .route("/configure", post(configure::configure)) - .route("/database_schema", get(database_schema::get_schema_dump)) - .route("/dbs_and_roles", get(dbs_and_roles::get_catalog_objects)) - .route( - "/extension_server/{*filename}", - post(extension_server::download_extension), - ) - .route("/extensions", post(extensions::install_extension)) - .route("/grants", post(grants::add_grant)) - .route("/insights", get(insights::get_insights)) - .route("/metrics", get(metrics::get_metrics)) - .route("/metrics.json", get(metrics_json::get_metrics)) - .route("/status", get(status::get_status)) - .route("/terminate", post(terminate::terminate)) - .fallback(handle_404) - .layer( +impl Display for Server { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Server::Internal(_) => f.write_str("internal"), + Server::External(_) => f.write_str("external"), + } + } +} + +impl From for Router> { + fn from(server: Server) -> Self { + let mut router = Router::>::new(); + + router = match server { + Server::Internal(_) => { + router = router + .route( + "/extension_server/{*filename}", + post(extension_server::download_extension), + ) + .route("/extensions", post(extensions::install_extension)) + .route("/grants", post(grants::add_grant)); + + // Add in any testing support + if cfg!(feature = "testing") { + use super::routes::failpoints; + + router = router.route("/failpoints", post(failpoints::configure_failpoints)); + } + + router + } + Server::External(_) => router + .route("/check_writability", post(check_writability::is_writable)) + .route("/configure", post(configure::configure)) 
+ .route("/database_schema", get(database_schema::get_schema_dump)) + .route("/dbs_and_roles", get(dbs_and_roles::get_catalog_objects)) + .route("/insights", get(insights::get_insights)) + .route("/metrics", get(metrics::get_metrics)) + .route("/metrics.json", get(metrics_json::get_metrics)) + .route("/status", get(status::get_status)) + .route("/terminate", post(terminate::terminate)), + }; + + router.fallback(Server::handle_404).method_not_allowed_fallback(Server::handle_405).layer( ServiceBuilder::new() // Add this middleware since we assume the request ID exists .layer(middleware::from_fn(maybe_add_request_id_header)) @@ -105,45 +124,88 @@ async fn serve(port: u16, compute: Arc) { ) .layer(PropagateRequestIdLayer::x_request_id()), ) - .with_state(compute); + } +} - // Add in any testing support - if cfg!(feature = "testing") { - use super::routes::failpoints; - - app = app.route("/failpoints", post(failpoints::configure_failpoints)) +impl Server { + async fn handle_404() -> impl IntoResponse { + StatusCode::NOT_FOUND } - // This usually binds to both IPv4 and IPv6 on Linux, see - // https://github.com/rust-lang/rust/pull/34440 for more information - let addr = SocketAddr::new(IpAddr::from(Ipv6Addr::UNSPECIFIED), port); - let listener = match TcpListener::bind(&addr).await { - Ok(listener) => listener, - Err(e) => { - error!( - "failed to bind the compute_ctl HTTP server to port {}: {}", - port, e - ); - return; + async fn handle_405() -> impl IntoResponse { + StatusCode::METHOD_NOT_ALLOWED + } + + async fn listener(&self) -> Result { + let addr = SocketAddr::new(self.ip(), self.port()); + let listener = TcpListener::bind(&addr).await?; + + Ok(listener) + } + + fn ip(&self) -> IpAddr { + match self { + // TODO: Change this to Ipv6Addr::LOCALHOST when the GitHub runners + // allow binding to localhost + Server::Internal(_) => IpAddr::from(Ipv6Addr::UNSPECIFIED), + Server::External(_) => IpAddr::from(Ipv6Addr::UNSPECIFIED), } - }; - - if let Ok(local_addr) = listener.local_addr() { - info!("compute_ctl HTTP server listening on {}", local_addr); - } else { - info!("compute_ctl HTTP server listening on port {}", port); } - if let Err(e) = axum::serve(listener, app).await { - error!("compute_ctl HTTP server error: {}", e); + fn port(self) -> u16 { + match self { + Server::Internal(port) => port, + Server::External(port) => port, + } + } + + async fn serve(self, compute: Arc) { + let listener = self.listener().await.unwrap_or_else(|e| { + // If we can't bind, the compute cannot operate correctly + panic!( + "failed to bind the compute_ctl {} HTTP server to {}: {}", + self, + SocketAddr::new(self.ip(), self.port()), + e + ); + }); + + if tracing::enabled!(tracing::Level::INFO) { + let local_addr = match listener.local_addr() { + Ok(local_addr) => local_addr, + Err(_) => SocketAddr::new(self.ip(), self.port()), + }; + + info!( + "compute_ctl {} HTTP server listening at {}", + self, local_addr + ); + } + + let router = Router::from(self).with_state(compute); + + if let Err(e) = axum::serve(listener, router).await { + error!("compute_ctl {} HTTP server error: {}", self, e); + } + } + + pub fn launch(self, compute: &Arc) { + let state = Arc::clone(compute); + + info!("Launching the {} server", self); + + tokio::spawn(self.serve(state)); } } -/// Launch a separate HTTP server thread and return its `JoinHandle`. 
-pub fn launch_http_server(port: u16, state: &Arc) -> Result> { - let state = Arc::clone(state); +/// This middleware function allows compute_ctl to generate its own request ID +/// if one isn't supplied. The control plane will always send one as a UUID. The +/// neon Postgres extension on the other hand does not send one. +async fn maybe_add_request_id_header(mut request: Request, next: Next) -> Response { + let headers = request.headers_mut(); + if headers.get(X_REQUEST_ID).is_none() { + headers.append(X_REQUEST_ID, Uuid::new_v4().to_string().parse().unwrap()); + } - Ok(thread::Builder::new() - .name("http-server".into()) - .spawn(move || serve(port, state))?) + next.run(request).await } diff --git a/compute_tools/src/logger.rs b/compute_tools/src/logger.rs index 00be5c13f9..3749dfc844 100644 --- a/compute_tools/src/logger.rs +++ b/compute_tools/src/logger.rs @@ -11,7 +11,7 @@ use tracing_subscriber::prelude::*; /// set `OTEL_EXPORTER_OTLP_ENDPOINT=http://jaeger:4318`. See /// `tracing-utils` package description. /// -pub fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result<()> { +pub async fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result<()> { // Initialize Logging let env_filter = tracing_subscriber::EnvFilter::try_from_default_env() .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new(default_log_level)); @@ -22,7 +22,7 @@ pub fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result<()> { .with_writer(std::io::stderr); // Initialize OpenTelemetry - let otlp_layer = tracing_utils::init_tracing_without_runtime("compute_ctl"); + let otlp_layer = tracing_utils::init_tracing("compute_ctl").await; // Put it all together tracing_subscriber::registry() diff --git a/compute_tools/src/migration.rs b/compute_tools/src/migration.rs index 7b7b042d84..c5e05822c0 100644 --- a/compute_tools/src/migration.rs +++ b/compute_tools/src/migration.rs @@ -1,6 +1,6 @@ use anyhow::{Context, Result}; use fail::fail_point; -use postgres::{Client, Transaction}; +use tokio_postgres::{Client, Transaction}; use tracing::{error, info}; use crate::metrics::DB_MIGRATION_FAILED; @@ -21,10 +21,11 @@ impl<'m> MigrationRunner<'m> { } /// Get the current value neon_migration.migration_id - fn get_migration_id(&mut self) -> Result { + async fn get_migration_id(&mut self) -> Result { let row = self .client - .query_one("SELECT id FROM neon_migration.migration_id", &[])?; + .query_one("SELECT id FROM neon_migration.migration_id", &[]) + .await?; Ok(row.get::<&str, i64>("id")) } @@ -34,7 +35,7 @@ impl<'m> MigrationRunner<'m> { /// This function has a fail point called compute-migration, which can be /// used if you would like to fail the application of a series of migrations /// at some point. 
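For illustration, in a build with the `testing` feature this failpoint can be driven through the internal server's `/failpoints` route shown earlier. Assuming the internal HTTP server listens on its default port (the external port plus one, i.e. 3081), POSTing a JSON body such as

[{"name": "compute-migration", "actions": "exit"}]

to http://localhost:3081/failpoints configures it; the `actions` string follows the `fail::cfg` format, and "exit" is the extra action mentioned in the request type above.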
- fn update_migration_id(txn: &mut Transaction, migration_id: i64) -> Result<()> { + async fn update_migration_id(txn: &mut Transaction<'_>, migration_id: i64) -> Result<()> { // We use this fail point in order to check that failing in the // middle of applying a series of migrations fails in an expected // manner @@ -59,31 +60,38 @@ impl<'m> MigrationRunner<'m> { "UPDATE neon_migration.migration_id SET id = $1", &[&migration_id], ) + .await .with_context(|| format!("update neon_migration.migration_id to {migration_id}"))?; Ok(()) } /// Prepare the migrations the target database for handling migrations - fn prepare_database(&mut self) -> Result<()> { + async fn prepare_database(&mut self) -> Result<()> { self.client - .simple_query("CREATE SCHEMA IF NOT EXISTS neon_migration")?; - self.client.simple_query("CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)")?; - self.client.simple_query( - "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING", - )?; + .simple_query("CREATE SCHEMA IF NOT EXISTS neon_migration") + .await?; + self.client.simple_query("CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)").await?; self.client - .simple_query("ALTER SCHEMA neon_migration OWNER TO cloud_admin")?; + .simple_query( + "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING", + ) + .await?; self.client - .simple_query("REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC")?; + .simple_query("ALTER SCHEMA neon_migration OWNER TO cloud_admin") + .await?; + self.client + .simple_query("REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC") + .await?; Ok(()) } /// Run an individual migration in a separate transaction block. - fn run_migration(client: &mut Client, migration_id: i64, migration: &str) -> Result<()> { + async fn run_migration(client: &mut Client, migration_id: i64, migration: &str) -> Result<()> { let mut txn = client .transaction() + .await .with_context(|| format!("begin transaction for migration {migration_id}"))?; if migration.starts_with("-- SKIP") { @@ -92,35 +100,38 @@ impl<'m> MigrationRunner<'m> { // Even though we are skipping the migration, updating the // migration ID should help keep logic easy to understand when // trying to understand the state of a cluster. - Self::update_migration_id(&mut txn, migration_id)?; + Self::update_migration_id(&mut txn, migration_id).await?; } else { info!("Running migration id={}:\n{}\n", migration_id, migration); txn.simple_query(migration) + .await .with_context(|| format!("apply migration {migration_id}"))?; - Self::update_migration_id(&mut txn, migration_id)?; + Self::update_migration_id(&mut txn, migration_id).await?; } txn.commit() + .await .with_context(|| format!("commit transaction for migration {migration_id}"))?; Ok(()) } /// Run the configured set of migrations - pub fn run_migrations(mut self) -> Result<()> { + pub async fn run_migrations(mut self) -> Result<()> { self.prepare_database() + .await .context("prepare database to handle migrations")?; - let mut current_migration = self.get_migration_id()? as usize; + let mut current_migration = self.get_migration_id().await? 
as usize; while current_migration < self.migrations.len() { // The index lags the migration ID by 1, so the current migration // ID is also the next index let migration_id = (current_migration + 1) as i64; let migration = self.migrations[current_migration]; - match Self::run_migration(self.client, migration_id, migration) { + match Self::run_migration(self.client, migration_id, migration).await { Ok(_) => { info!("Finished migration id={}", migration_id); } diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index e03b410699..86fcf99085 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -7,7 +7,6 @@ use std::os::unix::fs::PermissionsExt; use std::path::Path; use std::process::Child; use std::str::FromStr; -use std::thread::JoinHandle; use std::time::{Duration, Instant}; use anyhow::{bail, Result}; @@ -16,6 +15,7 @@ use ini::Ini; use notify::{RecursiveMode, Watcher}; use postgres::config::Config; use tokio::io::AsyncBufReadExt; +use tokio::task::JoinHandle; use tokio::time::timeout; use tokio_postgres; use tokio_postgres::NoTls; @@ -477,23 +477,13 @@ pub async fn tune_pgbouncer(pgbouncer_config: HashMap) -> Result Ok(()) } -/// Spawn a thread that will read Postgres logs from `stderr`, join multiline logs +/// Spawn a task that will read Postgres logs from `stderr`, join multiline logs /// and send them to the logger. In the future we may also want to add context to /// these logs. -pub fn handle_postgres_logs(stderr: std::process::ChildStderr) -> JoinHandle<()> { - std::thread::spawn(move || { - let runtime = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build() - .expect("failed to build tokio runtime"); - - let res = runtime.block_on(async move { - let stderr = tokio::process::ChildStderr::from_std(stderr)?; - handle_postgres_logs_async(stderr).await - }); - if let Err(e) = res { - tracing::error!("error while processing postgres logs: {}", e); - } +pub fn handle_postgres_logs(stderr: std::process::ChildStderr) -> JoinHandle> { + tokio::spawn(async move { + let stderr = tokio::process::ChildStderr::from_std(stderr)?; + handle_postgres_logs_async(stderr).await }) } diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 37d5d3a1a6..73950cd95a 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -1,8 +1,8 @@ use anyhow::{anyhow, bail, Result}; -use postgres::Client; use reqwest::StatusCode; use std::fs::File; use std::path::Path; +use tokio_postgres::Client; use tracing::{error, info, instrument, warn}; use crate::config; @@ -166,17 +166,17 @@ pub fn add_standby_signal(pgdata_path: &Path) -> Result<()> { } #[instrument(skip_all)] -pub fn handle_neon_extension_upgrade(client: &mut Client) -> Result<()> { +pub async fn handle_neon_extension_upgrade(client: &mut Client) -> Result<()> { info!("handle neon extension upgrade"); let query = "ALTER EXTENSION neon UPDATE"; info!("update neon extension version with query: {}", query); - client.simple_query(query)?; + client.simple_query(query).await?; Ok(()) } #[instrument(skip_all)] -pub fn handle_migrations(client: &mut Client) -> Result<()> { +pub async fn handle_migrations(client: &mut Client) -> Result<()> { info!("handle migrations"); // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
@@ -206,7 +206,9 @@ pub fn handle_migrations(client: &mut Client) -> Result<()> { ), ]; - MigrationRunner::new(client, &migrations).run_migrations()?; + MigrationRunner::new(client, &migrations) + .run_migrations() + .await?; Ok(()) } @@ -214,7 +216,7 @@ pub fn handle_migrations(client: &mut Client) -> Result<()> { /// Connect to the database as superuser and pre-create anon extension /// if it is present in shared_preload_libraries #[instrument(skip_all)] -pub fn handle_extension_anon( +pub async fn handle_extension_anon( spec: &ComputeSpec, db_owner: &str, db_client: &mut Client, @@ -227,7 +229,7 @@ pub fn handle_extension_anon( if !grants_only { // check if extension is already initialized using anon.is_initialized() let query = "SELECT anon.is_initialized()"; - match db_client.query(query, &[]) { + match db_client.query(query, &[]).await { Ok(rows) => { if !rows.is_empty() { let is_initialized: bool = rows[0].get(0); @@ -249,7 +251,7 @@ pub fn handle_extension_anon( // Users cannot create it themselves, because superuser is required. let mut query = "CREATE EXTENSION IF NOT EXISTS anon CASCADE"; info!("creating anon extension with query: {}", query); - match db_client.query(query, &[]) { + match db_client.query(query, &[]).await { Ok(_) => {} Err(e) => { error!("anon extension creation failed with error: {}", e); @@ -259,7 +261,7 @@ pub fn handle_extension_anon( // check that extension is installed query = "SELECT extname FROM pg_extension WHERE extname = 'anon'"; - let rows = db_client.query(query, &[])?; + let rows = db_client.query(query, &[]).await?; if rows.is_empty() { error!("anon extension is not installed"); return Ok(()); @@ -268,7 +270,7 @@ pub fn handle_extension_anon( // Initialize anon extension // This also requires superuser privileges, so users cannot do it themselves. query = "SELECT anon.init()"; - match db_client.query(query, &[]) { + match db_client.query(query, &[]).await { Ok(_) => {} Err(e) => { error!("anon.init() failed with error: {}", e); @@ -279,7 +281,7 @@ pub fn handle_extension_anon( // check that extension is installed, if not bail early let query = "SELECT extname FROM pg_extension WHERE extname = 'anon'"; - match db_client.query(query, &[]) { + match db_client.query(query, &[]).await { Ok(rows) => { if rows.is_empty() { error!("anon extension is not installed"); @@ -294,12 +296,12 @@ pub fn handle_extension_anon( let query = format!("GRANT ALL ON SCHEMA anon TO {}", db_owner); info!("granting anon extension permissions with query: {}", query); - db_client.simple_query(&query)?; + db_client.simple_query(&query).await?; // Grant permissions to db_owner to use anon extension functions let query = format!("GRANT ALL ON ALL FUNCTIONS IN SCHEMA anon TO {}", db_owner); info!("granting anon extension permissions with query: {}", query); - db_client.simple_query(&query)?; + db_client.simple_query(&query).await?; // This is needed, because some functions are defined as SECURITY DEFINER. 
// In Postgres SECURITY DEFINER functions are executed with the privileges @@ -314,16 +316,16 @@ pub fn handle_extension_anon( where nsp.nspname = 'anon';", db_owner); info!("change anon extension functions owner to db owner"); - db_client.simple_query(&query)?; + db_client.simple_query(&query).await?; // affects views as well let query = format!("GRANT ALL ON ALL TABLES IN SCHEMA anon TO {}", db_owner); info!("granting anon extension permissions with query: {}", query); - db_client.simple_query(&query)?; + db_client.simple_query(&query).await?; let query = format!("GRANT ALL ON ALL SEQUENCES IN SCHEMA anon TO {}", db_owner); info!("granting anon extension permissions with query: {}", query); - db_client.simple_query(&query)?; + db_client.simple_query(&query).await?; } } diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index f718102847..162c49ec7c 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -33,6 +33,7 @@ postgres_backend.workspace = true safekeeper_api.workspace = true postgres_connection.workspace = true storage_broker.workspace = true +http-utils.workspace = true utils.workspace = true whoami.workspace = true diff --git a/control_plane/src/background_process.rs b/control_plane/src/background_process.rs index af312d73a7..c668e68402 100644 --- a/control_plane/src/background_process.rs +++ b/control_plane/src/background_process.rs @@ -261,7 +261,13 @@ fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command { let mut filled_cmd = cmd.env_clear().env("RUST_BACKTRACE", backtrace_setting); // Pass through these environment variables to the command - for var in ["LLVM_PROFILE_FILE", "FAILPOINTS", "RUST_LOG"] { + for var in [ + "LLVM_PROFILE_FILE", + "FAILPOINTS", + "RUST_LOG", + "ASAN_OPTIONS", + "UBSAN_OPTIONS", + ] { if let Some(val) = std::env::var_os(var) { filled_cmd = filled_cmd.env(var, val); } diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index ba67ffa2dd..02d793400a 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -552,8 +552,10 @@ struct EndpointCreateCmdArgs { lsn: Option, #[clap(long)] pg_port: Option, + #[clap(long, alias = "http-port")] + external_http_port: Option, #[clap(long)] - http_port: Option, + internal_http_port: Option, #[clap(long = "pageserver-id")] endpoint_pageserver_id: Option, @@ -1353,7 +1355,8 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res tenant_id, timeline_id, args.pg_port, - args.http_port, + args.external_http_port, + args.internal_http_port, args.pg_version, mode, !args.update_catalog, diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index bc86d09103..3b2634204c 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -37,6 +37,8 @@ //! ``` //! 
use std::collections::BTreeMap; +use std::net::IpAddr; +use std::net::Ipv4Addr; use std::net::SocketAddr; use std::net::TcpStream; use std::path::PathBuf; @@ -73,7 +75,8 @@ pub struct EndpointConf { timeline_id: TimelineId, mode: ComputeMode, pg_port: u16, - http_port: u16, + external_http_port: u16, + internal_http_port: u16, pg_version: u32, skip_pg_catalog_updates: bool, drop_subscriptions_before_start: bool, @@ -128,7 +131,7 @@ impl ComputeControlPlane { 1 + self .endpoints .values() - .map(|ep| std::cmp::max(ep.pg_address.port(), ep.http_address.port())) + .map(|ep| std::cmp::max(ep.pg_address.port(), ep.external_http_address.port())) .max() .unwrap_or(self.base_port) } @@ -140,18 +143,27 @@ impl ComputeControlPlane { tenant_id: TenantId, timeline_id: TimelineId, pg_port: Option, - http_port: Option, + external_http_port: Option, + internal_http_port: Option, pg_version: u32, mode: ComputeMode, skip_pg_catalog_updates: bool, drop_subscriptions_before_start: bool, ) -> Result> { let pg_port = pg_port.unwrap_or_else(|| self.get_port()); - let http_port = http_port.unwrap_or_else(|| self.get_port() + 1); + let external_http_port = external_http_port.unwrap_or_else(|| self.get_port() + 1); + let internal_http_port = internal_http_port.unwrap_or_else(|| external_http_port + 1); let ep = Arc::new(Endpoint { endpoint_id: endpoint_id.to_owned(), - pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), pg_port), - http_address: SocketAddr::new("127.0.0.1".parse().unwrap(), http_port), + pg_address: SocketAddr::new(IpAddr::from(Ipv4Addr::LOCALHOST), pg_port), + external_http_address: SocketAddr::new( + IpAddr::from(Ipv4Addr::UNSPECIFIED), + external_http_port, + ), + internal_http_address: SocketAddr::new( + IpAddr::from(Ipv4Addr::LOCALHOST), + internal_http_port, + ), env: self.env.clone(), timeline_id, mode, @@ -176,7 +188,8 @@ impl ComputeControlPlane { tenant_id, timeline_id, mode, - http_port, + external_http_port, + internal_http_port, pg_port, pg_version, skip_pg_catalog_updates, @@ -230,9 +243,10 @@ pub struct Endpoint { pub timeline_id: TimelineId, pub mode: ComputeMode, - // port and address of the Postgres server and `compute_ctl`'s HTTP API + // port and address of the Postgres server and `compute_ctl`'s HTTP APIs pub pg_address: SocketAddr, - pub http_address: SocketAddr, + pub external_http_address: SocketAddr, + pub internal_http_address: SocketAddr, // postgres major version in the format: 14, 15, etc. 
pg_version: u32, @@ -287,8 +301,15 @@ impl Endpoint { serde_json::from_slice(&std::fs::read(entry.path().join("endpoint.json"))?)?; Ok(Endpoint { - pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.pg_port), - http_address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.http_port), + pg_address: SocketAddr::new(IpAddr::from(Ipv4Addr::LOCALHOST), conf.pg_port), + external_http_address: SocketAddr::new( + IpAddr::from(Ipv4Addr::UNSPECIFIED), + conf.external_http_port, + ), + internal_http_address: SocketAddr::new( + IpAddr::from(Ipv4Addr::LOCALHOST), + conf.internal_http_port, + ), endpoint_id, env: env.clone(), timeline_id: conf.timeline_id, @@ -650,24 +671,51 @@ impl Endpoint { println!("Also at '{}'", conn_str); } let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl")); - cmd.args(["--http-port", &self.http_address.port().to_string()]) - .args(["--pgdata", self.pgdata().to_str().unwrap()]) - .args(["--connstr", &conn_str]) - .args([ - "--spec-path", - self.endpoint_path().join("spec.json").to_str().unwrap(), - ]) - .args([ - "--pgbin", - self.env - .pg_bin_dir(self.pg_version)? - .join("postgres") - .to_str() - .unwrap(), - ]) - .stdin(std::process::Stdio::null()) - .stderr(logfile.try_clone()?) - .stdout(logfile); + //cmd.args([ + // "--external-http-port", + // &self.external_http_address.port().to_string(), + //]) + //.args([ + // "--internal-http-port", + // &self.internal_http_address.port().to_string(), + //]) + cmd.args([ + "--http-port", + &self.external_http_address.port().to_string(), + ]) + .args(["--pgdata", self.pgdata().to_str().unwrap()]) + .args(["--connstr", &conn_str]) + .args([ + "--spec-path", + self.endpoint_path().join("spec.json").to_str().unwrap(), + ]) + .args([ + "--pgbin", + self.env + .pg_bin_dir(self.pg_version)? + .join("postgres") + .to_str() + .unwrap(), + ]) + // TODO: It would be nice if we generated compute IDs with the same + // algorithm as the real control plane. + // + // TODO: Add this back when + // https://github.com/neondatabase/neon/pull/10747 is merged. + // + //.args([ + // "--compute-id", + // &format!( + // "compute-{}", + // SystemTime::now() + // .duration_since(UNIX_EPOCH) + // .unwrap() + // .as_secs() + // ), + //]) + .stdin(std::process::Stdio::null()) + .stderr(logfile.try_clone()?) 
+ .stdout(logfile); if let Some(remote_ext_config) = remote_ext_config { cmd.args(["--remote-ext-config", remote_ext_config]); @@ -754,8 +802,8 @@ impl Endpoint { reqwest::Method::GET, format!( "http://{}:{}/status", - self.http_address.ip(), - self.http_address.port() + self.external_http_address.ip(), + self.external_http_address.port() ), ) .send() @@ -828,8 +876,8 @@ impl Endpoint { let response = client .post(format!( "http://{}:{}/configure", - self.http_address.ip(), - self.http_address.port() + self.external_http_address.ip(), + self.external_http_address.port() )) .header(CONTENT_TYPE.as_str(), "application/json") .body(format!( diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index dd37bfc407..28d130d9e0 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -357,6 +357,16 @@ impl PageServerNode { .map(serde_json::from_str) .transpose() .context("Failed to parse 'compaction_algorithm' json")?, + compaction_l0_first: settings + .remove("compaction_l0_first") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'compaction_l0_first' as a bool")?, + compaction_l0_semaphore: settings + .remove("compaction_l0_semaphore") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'compaction_l0_semaphore' as a bool")?, l0_flush_delay_threshold: settings .remove("l0_flush_delay_threshold") .map(|x| x.parse::()) diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index f0c3722925..ce7751fb14 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -17,8 +17,10 @@ use camino::Utf8PathBuf; use postgres_connection::PgConnectionConfig; use reqwest::{IntoUrl, Method}; use thiserror::Error; + +use http_utils::error::HttpErrorBody; use utils::auth::{Claims, Scope}; -use utils::{http::error::HttpErrorBody, id::NodeId}; +use utils::id::NodeId; use crate::{ background_process, diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index c41ff22d15..9a2d30c861 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -221,7 +221,17 @@ impl StorageController { "-p", &format!("{}", postgres_port), ]; - let exitcode = Command::new(bin_path).args(args).spawn()?.wait().await?; + let pg_lib_dir = self.get_pg_lib_dir().await.unwrap(); + let envs = [ + ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), + ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), + ]; + let exitcode = Command::new(bin_path) + .args(args) + .envs(envs) + .spawn()? 
+        .wait()
+        .await?;

         Ok(exitcode.success())
     }
@@ -242,6 +252,11 @@ impl StorageController {
         let pg_bin_dir = self.get_pg_bin_dir().await?;
         let createdb_path = pg_bin_dir.join("createdb");
+        let pg_lib_dir = self.get_pg_lib_dir().await.unwrap();
+        let envs = [
+            ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
+            ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
+        ];
         let output = Command::new(&createdb_path)
             .args([
                 "-h",
@@ -254,6 +269,7 @@ impl StorageController {
                 &username(),
                 DB_NAME,
             ])
+            .envs(envs)
             .output()
             .await
             .expect("Failed to spawn createdb");
diff --git a/docker-compose/ext-src/pgjwt-src/test-upgrade.sh b/docker-compose/ext-src/pgjwt-src/test-upgrade.sh
index b7158d2340..efb8bfc184 100755
--- a/docker-compose/ext-src/pgjwt-src/test-upgrade.sh
+++ b/docker-compose/ext-src/pgjwt-src/test-upgrade.sh
@@ -2,4 +2,4 @@
 set -ex
 cd "$(dirname ${0})"
patch -p1 ),
@@ -313,9 +313,9 @@ similarly, in the first version it is ok to trigger it manually).

#### Schema

`safekeepers` table mirroring current `nodes` should be added, except that for
-`scheduling_policy` field (seems like `status` is a better name for it): it is enough
-to have at least in the beginning only 3 fields: 1) `active` 2) `offline` 3)
-`decomissioned`.
+`scheduling_policy`: at least in the beginning it is enough to have only 3
+values: 1) `active` 2) `paused` (initially only means that new timelines are
+not assigned there) 3) `decomissioned` (node is removed).

`timelines` table:
```
table! {
    timelines (tenant_id, timeline_id) {
        timeline_id -> Varchar,
        tenant_id -> Varchar,
+       start_lsn -> pg_lsn,
        generation -> Int4,
        sk_set -> Array, // list of safekeeper ids
-       new_sk_set -> Nullable>, // list of safekeeper ids, null if not joint conf
+       new_sk_set -> Nullable>, // list of safekeeper ids, null if not joint conf
        cplane_notified_generation -> Int4,
+       deleted_at -> Nullable,
    }
}
```

+`start_lsn` is needed to create the timeline on safekeepers properly, see
+below. We might also want to add ancestor_timeline_id to preserve the
+hierarchy, but for this RFC it is not needed.
+
#### API

Node management is similar to pageserver:
-1) POST `/control/v1/safekeepers` upserts safekeeper.
+1) POST `/control/v1/safekeepers` inserts safekeeper.
2) GET `/control/v1/safekeepers` lists safekeepers.
3) GET `/control/v1/safekeepers/:node_id` gets safekeeper.
4) PUT `/control/v1/safekepers/:node_id/status` changes status to e.g.
@@ -345,25 +351,15 @@ Node management is similar to pageserver:
Safekeeper deploy scripts should register safekeeper at storage_contorller as
they currently do with cplane, under the same id.

-Timeline creation/deletion: already existing POST `tenant/:tenant_id/timeline`
-would 1) choose initial set of safekeepers; 2) write to the db initial
-`Configuration` with `INSERT ON CONFLICT DO NOTHING` returning existing row in
-case of conflict; 3) create timeline on the majority of safekeepers (already
-created is ok).
+Timeline creation/deletion will work through already existing POST and DELETE
+`tenant/:tenant_id/timeline`. Cplane is expected to retry both until they
+succeed. See the next section for the implementation details.

-We don't want to block timeline creation when one safekeeper is down. Currently
-this is solved by compute implicitly creating timeline on any safekeeper it is
-connected to. This creates ugly timeline state on safekeeper when timeline is
-created, but start LSN is not defined yet.
It would be nice to remove this; to
-do that, controller can in the background retry to create timeline on
-safekeeper(s) which missed that during initial creation call. It can do that
-through `pull_timeline` from majority so it doesn't need to remember
-`parent_lsn` in its db.
-
-Timeline deletion removes the row from the db and forwards deletion to the
-current configuration members. Without additional actions deletions might leak,
-see below on this; initially let's ignore these, reporting to cplane success if
-at least one safekeeper deleted the timeline (this will remove s3 data).
+We don't want to block timeline creation/deletion when one safekeeper is down.
+Currently this is worked around by compute implicitly creating the timeline on
+any safekeeper it is connected to. This creates an ugly timeline state on the
+safekeeper when the timeline is created but its start LSN is not defined yet.
+The next section describes dealing with this.

Tenant deletion repeats timeline deletion for all timelines.

@@ -395,26 +391,6 @@ Similar call should be added for the tenant.
It would be great to have some way of subscribing to the results (apart from
looking at logs/metrics).

-Migration is executed as described above. One subtlety is that (local) deletion on
-source safekeeper might fail, which is not a problem if we are going to
-decomission the node but leaves garbage otherwise. I'd propose in the first version
-1) Don't attempt deletion at all if node status is `offline`.
-2) If it failed, just issue warning.
-And add PUT `/control/v1/safekeepers/:node_id/scrub` endpoint which would find and
-remove garbage timelines for manual use. It will 1) list all timelines on the
-safekeeper 2) compare each one against configuration storage: if timeline
-doesn't exist at all (had been deleted), it can be deleted. Otherwise, it can
-be deleted under generation number if node is not member of current generation.
-
-Automating this is untrivial; we'd need to register all potential missing
-deletions in the same transaction
-which switches configurations. Similarly when timeline is fully deleted to
-prevent cplane operation from blocking when some safekeeper is not available
-deletion should be also registered.
-
-One more task pool should infinitely retry notifying control plane about changed
-safekeeper sets.
-
3) GET `/control/v1/tenant/:tenant_id/timeline/:timeline_id/` should return
current in memory state of the timeline and pending `MigrationRequest`, if any.

@@ -423,12 +399,153 @@ safekeeper sets.
migration by switching configuration from the joint to the one with (previous)
`sk_set` under CAS (incrementing generation as always).

+#### API implementation and reconciliation
+
+For timeline creation/deletion we want to preserve the basic assumption that an
+unreachable minority (1 sk of 3) doesn't block their completion, but eventually
+we want to finish creation/deletion on nodes which missed it (unless they are
+removed). Similarly for migration; it may and should finish even though excluded
+members missed their exclusion. For example, such a pending exclusion on node C
+after migration ABC -> ABD must not prevent the next migration ABD -> ABE. As
+another example, if some node missed timeline creation it clearly must not block
+migration from it. Hence it is natural to have a per-safekeeper background
+reconciler which retries these ops until they succeed.
+There are 3 possible operation types, and the type is defined by the timeline
+state (membership configuration and whether it is deleted) and the safekeeper
+id: we may need to create the timeline on the sk (node added), locally delete
+it (node excluded, somewhat similar to detach) or globally delete it (timeline
+is deleted).
+
+Next, on storage controller restart these pending operations can in principle
+be figured out by comparing safekeepers' state against storcon state. But it
+seems better to me to materialize them in the database; it is not expensive,
+avoids these startup scans which themselves can fail, etc., and makes it very
+easy to see outstanding work directly at the source of truth -- the db. So we
+can add a table `safekeeper_timeline_pending_ops`:
+```
+table! {
+    // timeline_id, sk_id is primary key
+    safekeeper_timeline_pending_ops (sk_id, tenant_id, timeline_id) {
+        sk_id -> int8,
+        tenant_id -> Varchar,
+        timeline_id -> Varchar,
+        generation -> Int4,
+        op_type -> Varchar,
+    }
+}
+```
+
+`op_type` can be `include` (seed from peers and ensure generation is up to
+date), `exclude` (remove locally) and `delete`. The field is actually not
+strictly needed, as it can be computed from the current configuration, but it
+gives more explicit observability.
+
+`generation` is necessary there because after an op is done the reconciler must
+remove it without removing another row with a higher gen which in theory might
+appear.
+
+Any insert of a row should overwrite (remove) all rows with the same sk and
+timeline id but lower `generation`, as the next op makes the previous one
+obsolete. Insertion of `op_type` `delete` overwrites all rows.
+
+About `exclude`: rather than adding an explicit safekeeper http endpoint, it is
+reasonable to reuse the membership switch endpoint: if the safekeeper is not a
+member of the configuration it locally removes the timeline on the switch. In
+this case 404 should also be considered an 'ok' answer by the caller.
+
+So, the main loop of the per-sk reconciler reads
+`safekeeper_timeline_pending_ops` joined with the timeline configuration to get
+the current conf (with generation `n`) for the safekeeper and does the jobs,
+infinitely retrying failures:
+1) If node is member (`include`):
+   - Check if timeline exists on it, if not, call pull_timeline on it from
+     other members
+   - Call switch configuration to the current
+2) If node is not member (`exclude`):
+   - Call switch configuration to the current, 404 is ok.
+3) If timeline is deleted (`delete`), call delete.
+
+In cases 1 and 2 remove the `safekeeper_timeline_pending_ops` entry for the sk
+and timeline with generation <= `n` if `op_type` is not `delete`. In case 3
+also remove the `safekeeper_timeline_pending_ops` entry, plus remove the
+`timelines` entry if there is nothing left in `safekeeper_timeline_pending_ops`
+for the timeline.
+
+Let's consider in detail how the APIs can be implemented from this angle.
+
+Timeline creation. It is assumed that cplane retries it until success, so all
+actions must be idempotent. Now, a tricky point here is the timeline start LSN.
+For the initial (tenant creation) call cplane doesn't know it. However, setting
+start_lsn on safekeepers during creation is a good thing -- it provides a
+guarantee that walproposer can always find a common point in the WAL histories
+of the safekeeper and its own, and so absence of it would be a clear sign of
+corruption. The following sequence works:
+1) Create timeline (or observe that it exists) on pageserver,
+   figuring out last_record_lsn in response.
+2) Choose safekeepers and insert (ON CONFLICT DO NOTHING) timeline row into the
+   db.
+   Note that the last_record_lsn returned on the previous step is movable, as
+   it changes once ingestion starts; the insert must not overwrite it (as well
+   as other fields like the membership conf). On the contrary, start_lsn used
+   in the next step must be set to the value in the db.
+   cplane_notified_generation can be set to 1 (initial generation) in the
+   insert to avoid notifying cplane about the initial conf, as cplane will
+   receive it in the timeline creation request anyway.
+3) Issue timeline creation calls to at least a majority of safekeepers. Using
+   a majority here is not necessary but handy because it guarantees that any
+   live majority will have at least one sk with the created timeline, and so
+   the reconciliation task can use pull_timeline shared with migration instead
+   of a special create-timeline init case. Of course, if the timeline already
+   exists the call is ignored.
+4) For the minority of safekeepers which could have missed creation, insert
+   entries into `safekeeper_timeline_pending_ops`. We won't miss this insertion
+   because the response to cplane is sent only after it has happened, and
+   cplane retries the call until a 200 response.
+
+   There is a small question of how the request handler (timeline creation in
+   this case) would interact with the per-sk reconciler. As always I prefer to
+   do the simplest possible thing, and here it seems to be just waking it up so
+   it re-reads the db for work to do. Passing work in memory is faster, but
+   that shouldn't matter, and the path to scan the db for work will exist
+   anyway, so it is simpler to reuse it.
+
+For pg version / wal segment size: while we may persist them in the `timelines`
+table, it is not necessary, as the initial creation at step 3 can take them
+from the pageserver or the cplane creation call, and later pull_timeline will
+carry them around.
+
+Timeline migration.
+1) CAS to the db to create the joint conf, and in the same transaction create
+   `safekeeper_timeline_pending_ops` `include` entries to initialize new
+   members as well as deliver this conf to the current ones; poke per-sk
+   reconcilers to work on it. Any conf change should also poke the cplane
+   notifier task(s).
+2) Once it becomes possible per the algorithm description above, get out of the
+   joint conf with another CAS. The task should get wakeups from per-sk
+   reconcilers because a conf switch is required for advancement; however,
+   retries should be sleep based as well, since LSN advancement might be
+   needed, though in the happy path it isn't. To see whether a further
+   transition is possible, on wakeup the migration executor polls safekeepers
+   per the algorithm. The CAS creating the new conf with only new members
+   should again insert entries into `safekeeper_timeline_pending_ops` to switch
+   them there, as well as `exclude` rows to remove the timeline from the old
+   members.
+
+Timeline deletion: just set `deleted_at` on the timeline row and insert
+`safekeeper_timeline_pending_ops` entries in the same xact; the rest is done by
+per-sk reconcilers.
+
+When a node is removed (set to `decomissioned`),
+`safekeeper_timeline_pending_ops` for it must be cleared in the same
+transaction.
+
+One more task pool should infinitely retry notifying control plane about
+changed safekeeper sets (trying to make `cplane_notified_generation` equal
+`generation`).
+
#### Dealing with multiple instances of storage_controller

Operations described above executed concurrently might create some errors but
do not prevent progress, so while we normally don't want to run multiple
instances of storage_controller it is fine to have it temporarily, e.g. during
redeploy.
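The concurrency story above leans on every configuration change being a generation-guarded compare-and-swap. As a purely illustrative aid (not part of the change set), the sketch below shows what such a guarded switch plus the `safekeeper_timeline_pending_ops` bookkeeping could look like in plain SQL: table and column names follow the schema sketched earlier, the `$n` parameters are placeholders, and a real storage_controller would express this through its Diesel layer rather than raw SQL.

```
-- Illustrative sketch only: generation-guarded ("CAS") switch out of a joint
-- conf, plus pending-op bookkeeping, in one transaction.
-- Placeholders: $1 tenant_id, $2 timeline_id, $3 expected current generation,
--               $4 new sk_set, $5 id of a safekeeper being excluded.
BEGIN;

-- Advance the configuration only if nobody else advanced it first.
UPDATE timelines
SET generation = $3 + 1,
    sk_set = $4,
    new_sk_set = NULL
WHERE tenant_id = $1
  AND timeline_id = $2
  AND generation = $3;
-- If this updated 0 rows, another controller instance won the race:
-- roll back and re-read the current configuration instead of proceeding.

-- A new op supersedes pending rows with lower generations for this timeline.
DELETE FROM safekeeper_timeline_pending_ops
WHERE tenant_id = $1
  AND timeline_id = $2
  AND generation <= $3;

-- Ask the per-safekeeper reconciler to remove the timeline from the excluded node.
INSERT INTO safekeeper_timeline_pending_ops (sk_id, tenant_id, timeline_id, generation, op_type)
VALUES ($5, $1, $2, $3 + 1, 'exclude');

COMMIT;
```

Because each mutation only applies while the expected generation still matches, two controller instances racing on the same timeline can at worst have one of them fail the CAS and re-read the db, which is exactly the "errors but no lost progress" behaviour described above.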
+To harden against some controller instance creating work in
+`safekeeper_timeline_pending_ops` and then disappearing without anyone picking
+up the job, per-sk reconcilers should, apart from explicit wakeups, scan for
+work periodically. It is possible to remove that, though, if all db updates are
+protected with a leadership token/term -- then such scans are needed only after
+leadership is acquired.
+
Any interactions with db update in-memory controller state, e.g. if migration
request failed because different one is in progress, controller remembers that
and tries to finish it.

@@ -545,7 +662,7 @@ Aurora does this but similarly I don't think this is needed.

We should use Compute <-> safekeeper protocol change to include other (long
yearned) modifications:
-- send data in network order to make arm work.
+- send data in network order without putting whole structs on the wire, to be arch independent
- remove term_start_lsn from AppendRequest
- add horizon to TermHistory
- add to ProposerGreeting number of connection from this wp to sk
diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs
index 2fc95c47c6..767a34bcbc 100644
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -207,11 +207,11 @@ impl RemoteExtSpec {
            if !self
                .public_extensions
                .as_ref()
-                .is_some_and(|exts| exts.iter().any(|e| e == ext_name))
+                .is_some_and(|exts| exts.iter().any(|e| e == real_ext_name))
                && !self
                    .custom_extensions
                    .as_ref()
-                    .is_some_and(|exts| exts.iter().any(|e| e == ext_name))
+                    .is_some_and(|exts| exts.iter().any(|e| e == real_ext_name))
            {
                return Err(anyhow::anyhow!("extension {} is not found", real_ext_name));
            }
@@ -414,7 +414,7 @@ mod tests {
            "public_extensions": ["ext"],
            "custom_extensions": [],
            "library_index": {
-                "ext": "ext"
+                "extlib": "ext",
            },
            "extension_data": {
                "ext": {
@@ -430,6 +430,12 @@ mod tests {
        rspec
            .get_ext("ext", false, "latest", "v17")
            .expect("Extension should be found");
+
+        // test library index for the case when library name
+        // doesn't match the extension name
+        rspec
+            .get_ext("extlib", true, "latest", "v17")
+            .expect("Library should be found");
    }

    #[test]
diff --git a/libs/http-utils/Cargo.toml b/libs/http-utils/Cargo.toml
new file mode 100644
index 0000000000..d72e4bd012
--- /dev/null
+++ b/libs/http-utils/Cargo.toml
@@ -0,0 +1,37 @@
+[package]
+name = "http-utils"
+version = "0.1.0"
+edition.workspace = true
+license.workspace = true
+
+[dependencies]
+anyhow.workspace = true
+backtrace.workspace = true
+bytes.workspace = true
+inferno.workspace = true
+fail.workspace = true
+flate2.workspace = true
+hyper0.workspace = true
+itertools.workspace = true
+jemalloc_pprof.workspace = true
+once_cell.workspace = true
+pprof.workspace = true
+regex.workspace = true
+routerify.workspace = true
+serde.workspace = true
+serde_json.workspace = true
+serde_path_to_error.workspace = true
+thiserror.workspace = true
+tracing.workspace = true
+tokio.workspace = true
+tokio-util.workspace = true
+url.workspace = true
+uuid.workspace = true
+
+# to use tokio channels as streams, this is faster to compile than async_stream
+# why is it only here? no other crate should use it, streams are rarely needed.
+tokio-stream = { version = "0.1.14" } + +metrics.workspace = true +utils.workspace = true +workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/libs/utils/src/http/endpoint.rs b/libs/http-utils/src/endpoint.rs similarity index 99% rename from libs/utils/src/http/endpoint.rs rename to libs/http-utils/src/endpoint.rs index 9f38373ca0..be97b341d1 100644 --- a/libs/utils/src/http/endpoint.rs +++ b/libs/http-utils/src/endpoint.rs @@ -1,7 +1,6 @@ -use crate::auth::{AuthError, Claims, SwappableJwtAuth}; -use crate::http::error::{api_error_handler, route_error_handler, ApiError}; -use crate::http::request::{get_query_param, parse_query_param}; +use crate::error::{api_error_handler, route_error_handler, ApiError}; use crate::pprof; +use crate::request::{get_query_param, parse_query_param}; use ::pprof::protos::Message as _; use ::pprof::ProfilerGuardBuilder; use anyhow::{anyhow, Context}; @@ -19,6 +18,7 @@ use tokio::sync::{mpsc, Mutex, Notify}; use tokio_stream::wrappers::ReceiverStream; use tokio_util::io::ReaderStream; use tracing::{debug, info, info_span, warn, Instrument}; +use utils::auth::{AuthError, Claims, SwappableJwtAuth}; use std::future::Future; use std::io::Write as _; @@ -718,9 +718,9 @@ pub fn check_permission_with( #[cfg(test)] mod tests { use super::*; - use futures::future::poll_fn; use hyper::service::Service; use routerify::RequestServiceBuilder; + use std::future::poll_fn; use std::net::{IpAddr, SocketAddr}; #[tokio::test] diff --git a/libs/utils/src/http/error.rs b/libs/http-utils/src/error.rs similarity index 93% rename from libs/utils/src/http/error.rs rename to libs/http-utils/src/error.rs index 02fc9e3b99..746305caec 100644 --- a/libs/utils/src/http/error.rs +++ b/libs/http-utils/src/error.rs @@ -5,6 +5,8 @@ use std::error::Error as StdError; use thiserror::Error; use tracing::{error, info, warn}; +use utils::auth::AuthError; + #[derive(Debug, Error)] pub enum ApiError { #[error("Bad request: {0:#?}")] @@ -96,6 +98,15 @@ impl ApiError { } } +impl From for ApiError { + fn from(_value: AuthError) -> Self { + // Don't pass on the value of the AuthError as a precautionary measure. + // Being intentionally vague in public error communication hurts debugability + // but it is more secure. + ApiError::Forbidden("JWT authentication error".to_string()) + } +} + #[derive(Serialize, Deserialize)] pub struct HttpErrorBody { pub msg: String, diff --git a/libs/http-utils/src/failpoints.rs b/libs/http-utils/src/failpoints.rs new file mode 100644 index 0000000000..8a1e0c8cf0 --- /dev/null +++ b/libs/http-utils/src/failpoints.rs @@ -0,0 +1,50 @@ +use crate::error::ApiError; +use crate::json::{json_request, json_response}; + +use hyper::{Body, Request, Response, StatusCode}; +use serde::{Deserialize, Serialize}; +use tokio_util::sync::CancellationToken; + +use utils::failpoint_support::apply_failpoint; + +pub type ConfigureFailpointsRequest = Vec; + +/// Information for configuring a single fail point +#[derive(Debug, Serialize, Deserialize)] +pub struct FailpointConfig { + /// Name of the fail point + pub name: String, + /// List of actions to take, using the format described in `fail::cfg` + /// + /// We also support `actions = "exit"` to cause the fail point to immediately exit. + pub actions: String, +} + +/// Configure failpoints through http. 
+pub async fn failpoints_handler( + mut request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + if !fail::has_failpoints() { + return Err(ApiError::BadRequest(anyhow::anyhow!( + "Cannot manage failpoints because neon was compiled without failpoints support" + ))); + } + + let failpoints: ConfigureFailpointsRequest = json_request(&mut request).await?; + for fp in failpoints { + tracing::info!("cfg failpoint: {} {}", fp.name, fp.actions); + + // We recognize one extra "action" that's not natively recognized + // by the failpoints crate: exit, to immediately kill the process + let cfg_result = apply_failpoint(&fp.name, &fp.actions); + + if let Err(err_msg) = cfg_result { + return Err(ApiError::BadRequest(anyhow::anyhow!( + "Failed to configure failpoints: {err_msg}" + ))); + } + } + + json_response(StatusCode::OK, ()) +} diff --git a/libs/utils/src/http/json.rs b/libs/http-utils/src/json.rs similarity index 100% rename from libs/utils/src/http/json.rs rename to libs/http-utils/src/json.rs diff --git a/libs/utils/src/http/mod.rs b/libs/http-utils/src/lib.rs similarity index 82% rename from libs/utils/src/http/mod.rs rename to libs/http-utils/src/lib.rs index 74ed6bb5b2..ae6a27aaa8 100644 --- a/libs/utils/src/http/mod.rs +++ b/libs/http-utils/src/lib.rs @@ -1,8 +1,12 @@ pub mod endpoint; pub mod error; +pub mod failpoints; pub mod json; +pub mod pprof; pub mod request; +extern crate hyper0 as hyper; + /// Current fast way to apply simple http routing in various Neon binaries. /// Re-exported for sake of uniform approach, that could be later replaced with better alternatives, if needed. pub use routerify::{ext::RequestExt, RouterBuilder, RouterService}; diff --git a/libs/utils/src/pprof.rs b/libs/http-utils/src/pprof.rs similarity index 100% rename from libs/utils/src/pprof.rs rename to libs/http-utils/src/pprof.rs diff --git a/libs/utils/src/http/request.rs b/libs/http-utils/src/request.rs similarity index 100% rename from libs/utils/src/http/request.rs rename to libs/http-utils/src/request.rs diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index a0b5feea94..79f068a47b 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -121,6 +121,7 @@ pub struct ConfigToml { pub wal_receiver_protocol: PostgresClientProtocol, pub page_service_pipelining: PageServicePipeliningConfig, pub get_vectored_concurrent_io: GetVectoredConcurrentIo, + pub enable_read_path_debugging: Option, } #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] @@ -262,6 +263,11 @@ pub struct TenantConfigToml { /// size exceeds `compaction_upper_limit * checkpoint_distance`. pub compaction_upper_limit: usize, pub compaction_algorithm: crate::models::CompactionAlgorithmSettings, + /// If true, compact down L0 across all tenant timelines before doing regular compaction. + pub compaction_l0_first: bool, + /// If true, use a separate semaphore (i.e. concurrency limit) for the L0 compaction pass. Only + /// has an effect if `compaction_l0_first` is `true`. + pub compaction_l0_semaphore: bool, /// Level0 delta layer threshold at which to delay layer flushes for compaction backpressure, /// such that they take 2x as long, and start waiting for layer flushes during ephemeral layer /// rolls. 
This helps compaction keep up with WAL ingestion, and avoids read amplification @@ -490,7 +496,7 @@ impl Default for ConfigToml { NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(), )), image_compression: (DEFAULT_IMAGE_COMPRESSION), - timeline_offloading: false, + timeline_offloading: true, ephemeral_bytes_per_memory_kb: (DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB), l0_flush: None, virtual_file_io_mode: None, @@ -510,6 +516,11 @@ impl Default for ConfigToml { } else { GetVectoredConcurrentIo::SidecarTask }, + enable_read_path_debugging: if cfg!(test) || cfg!(feature = "testing") { + Some(true) + } else { + None + }, } } } @@ -537,6 +548,8 @@ pub mod tenant_conf_defaults { // most of our pageservers. Compaction ~50 layers requires about 2GB memory (could be reduced later by optimizing L0 hole // calculation to avoid loading all keys into the memory). So with this config, we can get a maximum peak compaction usage of 18GB. pub const DEFAULT_COMPACTION_UPPER_LIMIT: usize = 50; + pub const DEFAULT_COMPACTION_L0_FIRST: bool = false; + pub const DEFAULT_COMPACTION_L0_SEMAPHORE: bool = true; pub const DEFAULT_COMPACTION_ALGORITHM: crate::models::CompactionAlgorithm = crate::models::CompactionAlgorithm::Legacy; @@ -586,6 +599,8 @@ impl Default for TenantConfigToml { compaction_algorithm: crate::models::CompactionAlgorithmSettings { kind: DEFAULT_COMPACTION_ALGORITHM, }, + compaction_l0_first: DEFAULT_COMPACTION_L0_FIRST, + compaction_l0_semaphore: DEFAULT_COMPACTION_L0_SEMAPHORE, l0_flush_delay_threshold: None, l0_flush_stall_threshold: None, l0_flush_wait_upload: DEFAULT_L0_FLUSH_WAIT_UPLOAD, @@ -616,7 +631,7 @@ impl Default for TenantConfigToml { image_creation_preempt_threshold: DEFAULT_IMAGE_CREATION_PREEMPT_THRESHOLD, lsn_lease_length: LsnLease::DEFAULT_LENGTH, lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS, - timeline_offloading: false, + timeline_offloading: true, wal_receiver_protocol_override: None, rel_size_v2_enabled: None, gc_compaction_enabled: DEFAULT_GC_COMPACTION_ENABLED, diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 19beb37ab3..6dbfbec345 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -464,6 +464,10 @@ pub struct TenantConfigPatch { #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub compaction_algorithm: FieldPatch, #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub compaction_l0_first: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub compaction_l0_semaphore: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub l0_flush_delay_threshold: FieldPatch, #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub l0_flush_stall_threshold: FieldPatch, @@ -529,6 +533,8 @@ pub struct TenantConfig { pub compaction_upper_limit: Option, // defer parsing compaction_algorithm, like eviction_policy pub compaction_algorithm: Option, + pub compaction_l0_first: Option, + pub compaction_l0_semaphore: Option, pub l0_flush_delay_threshold: Option, pub l0_flush_stall_threshold: Option, pub l0_flush_wait_upload: Option, @@ -567,6 +573,8 @@ impl TenantConfig { mut compaction_threshold, mut compaction_upper_limit, mut compaction_algorithm, + mut compaction_l0_first, + mut compaction_l0_semaphore, mut l0_flush_delay_threshold, mut l0_flush_stall_threshold, mut l0_flush_wait_upload, @@ -606,6 +614,10 @@ impl TenantConfig { .compaction_upper_limit .apply(&mut compaction_upper_limit); patch.compaction_algorithm.apply(&mut compaction_algorithm); + 
patch.compaction_l0_first.apply(&mut compaction_l0_first); + patch + .compaction_l0_semaphore + .apply(&mut compaction_l0_semaphore); patch .l0_flush_delay_threshold .apply(&mut l0_flush_delay_threshold); @@ -669,6 +681,8 @@ impl TenantConfig { compaction_threshold, compaction_upper_limit, compaction_algorithm, + compaction_l0_first, + compaction_l0_semaphore, l0_flush_delay_threshold, l0_flush_stall_threshold, l0_flush_wait_upload, diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs index 9524a5149b..77dff4ac99 100644 --- a/libs/postgres_ffi/wal_craft/src/lib.rs +++ b/libs/postgres_ffi/wal_craft/src/lib.rs @@ -76,7 +76,15 @@ impl Conf { let mut cmd = Command::new(path); cmd.env_clear() .env("LD_LIBRARY_PATH", self.pg_lib_dir()?) - .env("DYLD_LIBRARY_PATH", self.pg_lib_dir()?); + .env("DYLD_LIBRARY_PATH", self.pg_lib_dir()?) + .env( + "ASAN_OPTIONS", + std::env::var("ASAN_OPTIONS").unwrap_or_default(), + ) + .env( + "UBSAN_OPTIONS", + std::env::var("UBSAN_OPTIONS").unwrap_or_default(), + ); Ok(cmd) } diff --git a/libs/postgres_initdb/src/lib.rs b/libs/postgres_initdb/src/lib.rs index 2f072354fb..ed54696861 100644 --- a/libs/postgres_initdb/src/lib.rs +++ b/libs/postgres_initdb/src/lib.rs @@ -64,6 +64,14 @@ pub async fn do_run_initdb(args: RunInitdbArgs<'_>) -> Result<(), Error> { .env_clear() .env("LD_LIBRARY_PATH", library_search_path) .env("DYLD_LIBRARY_PATH", library_search_path) + .env( + "ASAN_OPTIONS", + std::env::var("ASAN_OPTIONS").unwrap_or_default(), + ) + .env( + "UBSAN_OPTIONS", + std::env::var("UBSAN_OPTIONS").unwrap_or_default(), + ) .stdin(std::process::Stdio::null()) // stdout invocation produces the same output every time, we don't need it .stdout(std::process::Stdio::null()) diff --git a/libs/remote_storage/src/config.rs b/libs/remote_storage/src/config.rs index dae141bf77..ff34158c9c 100644 --- a/libs/remote_storage/src/config.rs +++ b/libs/remote_storage/src/config.rs @@ -7,7 +7,7 @@ use serde::{Deserialize, Serialize}; use crate::{ DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT, - DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT, + DEFAULT_REMOTE_STORAGE_LOCALFS_CONCURRENCY_LIMIT, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT, }; /// External backup storage configuration, enough for creating a client for that storage. @@ -45,11 +45,11 @@ impl RemoteStorageKind { impl RemoteStorageConfig { /// Helper to fetch the configured concurrency limit. - pub fn concurrency_limit(&self) -> Option { + pub fn concurrency_limit(&self) -> usize { match &self.storage { - RemoteStorageKind::LocalFs { .. } => None, - RemoteStorageKind::AwsS3(c) => Some(c.concurrency_limit.into()), - RemoteStorageKind::AzureContainer(c) => Some(c.concurrency_limit.into()), + RemoteStorageKind::LocalFs { .. } => DEFAULT_REMOTE_STORAGE_LOCALFS_CONCURRENCY_LIMIT, + RemoteStorageKind::AwsS3(c) => c.concurrency_limit.into(), + RemoteStorageKind::AzureContainer(c) => c.concurrency_limit.into(), } } } diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 7a864151ec..69b522d63e 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -65,6 +65,12 @@ pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100; /// Here, a limit of max 20k concurrent connections was noted. /// pub const DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT: usize = 100; +/// Set this limit analogously to the S3 limit. 
+/// +/// The local filesystem backend doesn't enforce a concurrency limit itself, but this also bounds +/// the upload queue concurrency. Some tests create thousands of uploads, which slows down the +/// quadratic scheduling of the upload queue, and there is no point spawning so many Tokio tasks. +pub const DEFAULT_REMOTE_STORAGE_LOCALFS_CONCURRENCY_LIMIT: usize = 100; /// No limits on the client side, which currenltly means 1000 for AWS S3. /// pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option = None; diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index edb451a02c..0f10300959 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -21,23 +21,17 @@ bytes.workspace = true camino.workspace = true chrono.workspace = true diatomic-waker.workspace = true -flate2.workspace = true git-version.workspace = true hex = { workspace = true, features = ["serde"] } humantime.workspace = true -hyper0 = { workspace = true, features = ["full"] } inferno.workspace = true -itertools.workspace = true fail.workspace = true futures = { workspace = true } -jemalloc_pprof.workspace = true jsonwebtoken.workspace = true nix.workspace = true once_cell.workspace = true pin-project-lite.workspace = true -pprof.workspace = true regex.workspace = true -routerify.workspace = true serde.workspace = true serde_with.workspace = true serde_json.workspace = true @@ -54,8 +48,6 @@ rand.workspace = true scopeguard.workspace = true strum.workspace = true strum_macros.workspace = true -url.workspace = true -uuid.workspace = true walkdir.workspace = true pq_proto.workspace = true @@ -64,12 +56,6 @@ metrics.workspace = true const_format.workspace = true -# to use tokio channels as streams, this is faster to compile than async_stream -# why is it only here? no other crate should use it, streams are rarely needed. -tokio-stream = { version = "0.1.14" } - -serde_path_to_error.workspace = true - [dev-dependencies] byteorder.workspace = true bytes.workspace = true diff --git a/libs/utils/scripts/restore_from_wal.sh b/libs/utils/scripts/restore_from_wal.sh index a8615c2337..f394d4c58d 100755 --- a/libs/utils/scripts/restore_from_wal.sh +++ b/libs/utils/scripts/restore_from_wal.sh @@ -39,7 +39,7 @@ function initdb_with_args { ;; esac - eval env -i LD_LIBRARY_PATH="$PG_BIN"/../lib "${cmd[*]}" + eval env -i LD_LIBRARY_PATH="$PG_BIN"/../lib ASAN_OPTIONS="${ASAN_OPTIONS-}" UBSAN_OPTIONS="${UBSAN_OPTIONS-}" "${cmd[*]}" } rm -fr "$DATA_DIR" diff --git a/libs/utils/src/auth.rs b/libs/utils/src/auth.rs index f7acc61ac1..4bfd0ab055 100644 --- a/libs/utils/src/auth.rs +++ b/libs/utils/src/auth.rs @@ -10,7 +10,7 @@ use jsonwebtoken::{ }; use serde::{Deserialize, Serialize}; -use crate::{http::error::ApiError, id::TenantId}; +use crate::id::TenantId; /// Algorithm to use. We require EdDSA. const STORAGE_TOKEN_ALGORITHM: Algorithm = Algorithm::EdDSA; @@ -90,15 +90,6 @@ impl Display for AuthError { } } -impl From for ApiError { - fn from(_value: AuthError) -> Self { - // Don't pass on the value of the AuthError as a precautionary measure. - // Being intentionally vague in public error communication hurts debugability - // but it is more secure. 
- ApiError::Forbidden("JWT authentication error".to_string()) - } -} - pub struct JwtAuth { decoding_keys: Vec, validation: Validation, diff --git a/libs/utils/src/backoff.rs b/libs/utils/src/backoff.rs index 096c7e5854..e6503fe377 100644 --- a/libs/utils/src/backoff.rs +++ b/libs/utils/src/backoff.rs @@ -1,4 +1,5 @@ use std::fmt::{Debug, Display}; +use std::time::Duration; use futures::Future; use tokio_util::sync::CancellationToken; @@ -29,6 +30,11 @@ pub async fn exponential_backoff( } } +pub fn exponential_backoff_duration(n: u32, base_increment: f64, max_seconds: f64) -> Duration { + let seconds = exponential_backoff_duration_seconds(n, base_increment, max_seconds); + Duration::from_secs_f64(seconds) +} + pub fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 { if n == 0 { 0.0 diff --git a/libs/utils/src/failpoint_support.rs b/libs/utils/src/failpoint_support.rs index 272c6ebb26..fc998ad9a9 100644 --- a/libs/utils/src/failpoint_support.rs +++ b/libs/utils/src/failpoint_support.rs @@ -1,13 +1,6 @@ //! Failpoint support code shared between pageserver and safekeepers. -use crate::http::{ - error::ApiError, - json::{json_request, json_response}, -}; -use hyper::{Body, Request, Response, StatusCode}; -use serde::{Deserialize, Serialize}; use tokio_util::sync::CancellationToken; -use tracing::*; /// Declare a failpoint that can use to `pause` failpoint action. /// We don't want to block the executor thread, hence, spawn_blocking + await. @@ -184,45 +177,3 @@ fn exit_failpoint() { tracing::info!("Exit requested by failpoint"); std::process::exit(1); } - -pub type ConfigureFailpointsRequest = Vec; - -/// Information for configuring a single fail point -#[derive(Debug, Serialize, Deserialize)] -pub struct FailpointConfig { - /// Name of the fail point - pub name: String, - /// List of actions to take, using the format described in `fail::cfg` - /// - /// We also support `actions = "exit"` to cause the fail point to immediately exit. - pub actions: String, -} - -/// Configure failpoints through http. -pub async fn failpoints_handler( - mut request: Request, - _cancel: CancellationToken, -) -> Result, ApiError> { - if !fail::has_failpoints() { - return Err(ApiError::BadRequest(anyhow::anyhow!( - "Cannot manage failpoints because neon was compiled without failpoints support" - ))); - } - - let failpoints: ConfigureFailpointsRequest = json_request(&mut request).await?; - for fp in failpoints { - info!("cfg failpoint: {} {}", fp.name, fp.actions); - - // We recognize one extra "action" that's not natively recognized - // by the failpoints crate: exit, to immediately kill the process - let cfg_result = apply_failpoint(&fp.name, &fp.actions); - - if let Err(err_msg) = cfg_result { - return Err(ApiError::BadRequest(anyhow::anyhow!( - "Failed to configure failpoints: {err_msg}" - ))); - } - } - - json_response(StatusCode::OK, ()) -} diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 1fb18e9e9a..820ff2d5ea 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -2,8 +2,6 @@ //! between other crates in this repository. 
#![deny(clippy::undocumented_unsafe_blocks)] -extern crate hyper0 as hyper; - pub mod backoff; /// `Lsn` type implements common tasks on Log Sequence Numbers @@ -33,9 +31,6 @@ pub mod shard; mod hex; pub use hex::Hex; -// http endpoint utils -pub mod http; - // definition of the Generation type for pageserver attachment APIs pub mod generation; @@ -96,8 +91,6 @@ pub mod circuit_breaker; pub mod try_rcu; -pub mod pprof; - pub mod guard_arc_swap; // Re-export used in macro. Avoids adding git-version as dep in target crates. diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs index 753f05b6fd..4a6069294d 100644 --- a/libs/utils/src/logging.rs +++ b/libs/utils/src/logging.rs @@ -8,19 +8,22 @@ use strum_macros::{EnumString, VariantNames}; /// Logs a critical error, similarly to `tracing::error!`. This will: /// /// * Emit an ERROR log message with prefix "CRITICAL:" and a backtrace. +/// * Trigger a pageable alert (via the metric below). /// * Increment libmetrics_tracing_event_count{level="critical"}, and indirectly level="error". -/// * Trigger a pageable alert (via the metric above). /// * In debug builds, panic the process. +/// +/// When including errors in the message, please use {err:?} to include the error cause and original +/// backtrace. #[macro_export] macro_rules! critical { - ($($arg:tt)*) => { + ($($arg:tt)*) => {{ if cfg!(debug_assertions) { panic!($($arg)*); } $crate::logging::TRACING_EVENT_COUNT_METRIC.inc_critical(); let backtrace = std::backtrace::Backtrace::capture(); tracing::error!("CRITICAL: {}\n{backtrace}", format!($($arg)*)); - }; + }}; } #[derive(EnumString, strum_macros::Display, VariantNames, Eq, PartialEq, Debug, Clone, Copy)] diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 6e4eaa0efd..41ac3b69b8 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -79,6 +79,7 @@ pq_proto.workspace = true remote_storage.workspace = true storage_broker.workspace = true tenant_size_model.workspace = true +http-utils.workspace = true utils.workspace = true workspace_hack.workspace = true reqwest.workspace = true diff --git a/pageserver/client/Cargo.toml b/pageserver/client/Cargo.toml index f582d307a7..db77a395e0 100644 --- a/pageserver/client/Cargo.toml +++ b/pageserver/client/Cargo.toml @@ -11,6 +11,7 @@ testing = [ "pageserver_api/testing" ] pageserver_api.workspace = true thiserror.workspace = true reqwest = { workspace = true, features = [ "stream" ] } +http-utils.workspace = true utils.workspace = true serde.workspace = true workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 0359bfcd0b..da7ec5abce 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -1,11 +1,12 @@ use std::{collections::HashMap, error::Error as _}; use bytes::Bytes; -use detach_ancestor::AncestorDetached; -use pageserver_api::{models::*, shard::TenantShardId}; use reqwest::{IntoUrl, Method, StatusCode}; + +use detach_ancestor::AncestorDetached; +use http_utils::error::HttpErrorBody; +use pageserver_api::{models::*, shard::TenantShardId}; use utils::{ - http::error::HttpErrorBody, id::{TenantId, TimelineId}, lsn::Lsn, }; diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 5764728505..fa098e9364 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -592,7 +592,7 @@ fn start_pageserver( let router = http::make_router(router_state, launch_ts, http_auth.clone())? 
.build() .map_err(|err| anyhow!(err))?; - let service = utils::http::RouterService::new(router).unwrap(); + let service = http_utils::RouterService::new(router).unwrap(); let server = hyper0::Server::from_tcp(http_listener)? .serve(service) .with_graceful_shutdown({ diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index ce480c70a0..c5368f6806 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -193,6 +193,10 @@ pub struct PageServerConf { pub page_service_pipelining: pageserver_api::config::PageServicePipeliningConfig, pub get_vectored_concurrent_io: pageserver_api::config::GetVectoredConcurrentIo, + + /// Enable read path debugging. If enabled, read key errors will print a backtrace of the layer + /// files read. + pub enable_read_path_debugging: bool, } /// Token for authentication to safekeepers @@ -355,6 +359,7 @@ impl PageServerConf { wal_receiver_protocol, page_service_pipelining, get_vectored_concurrent_io, + enable_read_path_debugging, } = config_toml; let mut conf = PageServerConf { @@ -440,6 +445,7 @@ impl PageServerConf { .unwrap_or_default(), virtual_file_io_mode: virtual_file_io_mode.unwrap_or(virtual_file::IoMode::preferred()), no_sync: no_sync.unwrap_or(false), + enable_read_path_debugging: enable_read_path_debugging.unwrap_or(false), }; // ------------------------------------------------------------ diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index 1d508f5fe9..a2395b0dca 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -8,7 +8,6 @@ use std::time::Duration; use crate::controller_upcall_client::ControlPlaneGenerationsApi; use crate::metrics; -use crate::tenant::remote_timeline_client::remote_layer_path; use crate::tenant::remote_timeline_client::remote_timeline_path; use crate::tenant::remote_timeline_client::LayerFileMetadata; use crate::virtual_file::MaybeFatalIo; @@ -463,45 +462,18 @@ impl DeletionQueueClient { /// /// The `current_generation` is the generation of this pageserver's current attachment. The /// generations in `layers` are the generations in which those layers were written. - pub(crate) async fn push_layers( + pub(crate) fn push_layers( &self, tenant_shard_id: TenantShardId, timeline_id: TimelineId, current_generation: Generation, layers: Vec<(LayerName, LayerFileMetadata)>, ) -> Result<(), DeletionQueueError> { - if current_generation.is_none() { - debug!("Enqueuing deletions in legacy mode, skipping queue"); + // None generations are not valid for attached tenants: they must always be attached in + // a known generation. None generations are still permitted for layers in the index because + // they may be historical. + assert!(!current_generation.is_none()); - let mut layer_paths = Vec::new(); - for (layer, meta) in layers { - layer_paths.push(remote_layer_path( - &tenant_shard_id.tenant_id, - &timeline_id, - meta.shard, - &layer, - meta.generation, - )); - } - self.push_immediate(layer_paths).await?; - return self.flush_immediate().await; - } - - self.push_layers_sync(tenant_shard_id, timeline_id, current_generation, layers) - } - - /// When a Tenant has a generation, push_layers is always synchronous because - /// the ListValidator channel is an unbounded channel. 
- /// - /// This can be merged into push_layers when we remove the Generation-less mode - /// support (``) - pub(crate) fn push_layers_sync( - &self, - tenant_shard_id: TenantShardId, - timeline_id: TimelineId, - current_generation: Generation, - layers: Vec<(LayerName, LayerFileMetadata)>, - ) -> Result<(), DeletionQueueError> { metrics::DELETION_QUEUE .keys_submitted .inc_by(layers.len() as u64); @@ -957,14 +929,12 @@ mod test { // File should still be there after we push it to the queue (we haven't pushed enough to flush anything) info!("Pushing"); - client - .push_layers( - tenant_shard_id, - TIMELINE_ID, - now_generation, - [(layer_file_name_1.clone(), layer_metadata)].to_vec(), - ) - .await?; + client.push_layers( + tenant_shard_id, + TIMELINE_ID, + now_generation, + [(layer_file_name_1.clone(), layer_metadata)].to_vec(), + )?; assert_remote_files(&[&remote_layer_file_name_1], &remote_timeline_path); assert_local_files(&[], &deletion_prefix); @@ -1017,14 +987,12 @@ mod test { assert_remote_files(&[&remote_layer_name], &remote_timeline_path); tracing::debug!("Pushing..."); - client - .push_layers( - tenant_shard_id, - TIMELINE_ID, - stale_generation, - [(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(), - ) - .await?; + client.push_layers( + tenant_shard_id, + TIMELINE_ID, + stale_generation, + [(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(), + )?; // We enqueued the operation in a stale generation: it should have failed validation tracing::debug!("Flushing..."); @@ -1032,14 +1000,12 @@ mod test { assert_remote_files(&[&remote_layer_name], &remote_timeline_path); tracing::debug!("Pushing..."); - client - .push_layers( - tenant_shard_id, - TIMELINE_ID, - latest_generation, - [(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(), - ) - .await?; + client.push_layers( + tenant_shard_id, + TIMELINE_ID, + latest_generation, + [(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(), + )?; // We enqueued the operation in a fresh generation: it should have passed validation tracing::debug!("Flushing..."); @@ -1074,28 +1040,24 @@ mod test { // generation gets that treatment) let remote_layer_file_name_historical = ctx.write_remote_layer(EXAMPLE_LAYER_NAME, layer_generation)?; - client - .push_layers( - tenant_shard_id, - TIMELINE_ID, - now_generation.previous(), - [(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(), - ) - .await?; + client.push_layers( + tenant_shard_id, + TIMELINE_ID, + now_generation.previous(), + [(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(), + )?; // Inject a deletion in the generation before generation_now: after restart, // this deletion should get executed, because we execute deletions in the // immediately previous generation on the same node. 
let remote_layer_file_name_previous = ctx.write_remote_layer(EXAMPLE_LAYER_NAME_ALT, layer_generation)?; - client - .push_layers( - tenant_shard_id, - TIMELINE_ID, - now_generation, - [(EXAMPLE_LAYER_NAME_ALT.clone(), layer_metadata.clone())].to_vec(), - ) - .await?; + client.push_layers( + tenant_shard_id, + TIMELINE_ID, + now_generation, + [(EXAMPLE_LAYER_NAME_ALT.clone(), layer_metadata.clone())].to_vec(), + )?; client.flush().await?; assert_remote_files( @@ -1139,6 +1101,7 @@ pub(crate) mod mock { use tracing::info; use super::*; + use crate::tenant::remote_timeline_client::remote_layer_path; use std::sync::atomic::{AtomicUsize, Ordering}; pub struct ConsumerState { diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index ca44fbe6ae..738a783813 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -61,6 +61,7 @@ use crate::{ remote_timeline_client::LayerFileMetadata, secondary::SecondaryTenant, storage_layer::{AsLayerDesc, EvictionError, Layer, LayerName, LayerVisibilityHint}, + tasks::sleep_random, }, CancellableTask, DiskUsageEvictionTask, }; @@ -210,14 +211,8 @@ async fn disk_usage_eviction_task( info!("disk usage based eviction task finishing"); }; - use crate::tenant::tasks::random_init_delay; - { - if random_init_delay(task_config.period, &cancel) - .await - .is_err() - { - return; - } + if sleep_random(task_config.period, &cancel).await.is_err() { + return; } let mut iteration_no = 0; diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 94f7510a4a..bd196621c1 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -13,6 +13,12 @@ use enumset::EnumSet; use futures::future::join_all; use futures::StreamExt; use futures::TryFutureExt; +use http_utils::endpoint::{ + profile_cpu_handler, profile_heap_handler, prometheus_metrics_handler, request_span, +}; +use http_utils::failpoints::failpoints_handler; +use http_utils::request::must_parse_query_param; +use http_utils::request::{get_request_param, must_get_query_param, parse_query_param}; use humantime::format_rfc3339; use hyper::header; use hyper::StatusCode; @@ -60,13 +66,6 @@ use tokio::time::Instant; use tokio_util::io::StreamReader; use tokio_util::sync::CancellationToken; use tracing::*; -use utils::auth::JwtAuth; -use utils::failpoint_support::failpoints_handler; -use utils::http::endpoint::{ - profile_cpu_handler, profile_heap_handler, prometheus_metrics_handler, request_span, -}; -use utils::http::request::must_parse_query_param; -use utils::http::request::{get_request_param, must_get_query_param, parse_query_param}; use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; @@ -104,6 +103,13 @@ use crate::tenant::OffloadedTimeline; use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError}; use crate::DEFAULT_PG_VERSION; use crate::{disk_usage_eviction_task, tenant}; +use http_utils::{ + endpoint::{self, attach_openapi_ui, auth_middleware, check_permission_with}, + error::{ApiError, HttpErrorBody}, + json::{json_request, json_request_maybe, json_response}, + request::parse_request_param, + RequestExt, RouterBuilder, +}; use pageserver_api::models::{ StatusResponse, TenantConfigRequest, TenantInfo, TimelineCreateRequest, TimelineGcRequest, TimelineInfo, @@ -111,13 +117,6 @@ use pageserver_api::models::{ use utils::{ auth::SwappableJwtAuth, generation::Generation, - http::{ - endpoint::{self, attach_openapi_ui, auth_middleware, 
check_permission_with}, - error::{ApiError, HttpErrorBody}, - json::{json_request, json_request_maybe, json_response}, - request::parse_request_param, - RequestExt, RouterBuilder, - }, id::{TenantId, TimelineId}, lsn::Lsn, }; @@ -561,7 +560,7 @@ async fn reload_auth_validation_keys_handler( let key_path = config.auth_validation_public_key_path.as_ref().unwrap(); info!("Reloading public key(s) for verifying JWT tokens from {key_path:?}"); - match JwtAuth::from_key_path(key_path) { + match utils::auth::JwtAuth::from_key_path(key_path) { Ok(new_auth) => { shared_auth.swap(new_auth); json_response(StatusCode::OK, ()) @@ -2152,6 +2151,7 @@ async fn timeline_compact_handler( let state = get_state(&request); let mut flags = EnumSet::empty(); + flags |= CompactFlags::NoYield; // run compaction to completion if Some(true) == parse_query_param::<_, bool>(&request, "force_l0_compaction")? { flags |= CompactFlags::ForceL0Compaction; diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 6ab1178a7b..983a3079e4 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -6,7 +6,7 @@ use std::sync::{Arc, Mutex}; use std::task::{Context, Poll}; use std::time::{Duration, Instant}; -use enum_map::EnumMap; +use enum_map::{Enum as _, EnumMap}; use futures::Future; use metrics::{ register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec, @@ -104,7 +104,7 @@ pub(crate) static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy = Lazy::n .expect("failed to define a metric") }); -// Buckets for background operations like compaction, GC, size calculation +// Buckets for background operation duration in seconds, like compaction, GC, size calculation. const STORAGE_OP_BUCKETS: &[f64] = &[0.010, 0.100, 1.0, 10.0, 100.0, 1000.0]; pub(crate) static STORAGE_TIME_GLOBAL: Lazy = Lazy::new(|| { @@ -236,7 +236,7 @@ pub(crate) static GET_VECTORED_LATENCY: Lazy = Lazy::new(|| GetVectoredLatency { map: EnumMap::from_array(std::array::from_fn(|task_kind_idx| { - let task_kind = ::from_usize(task_kind_idx); + let task_kind = TaskKind::from_usize(task_kind_idx); if GetVectoredLatency::TRACKED_TASK_KINDS.contains(&task_kind) { let task_kind = task_kind.into(); @@ -259,7 +259,7 @@ pub(crate) static SCAN_LATENCY: Lazy = Lazy::new(|| { ScanLatency { map: EnumMap::from_array(std::array::from_fn(|task_kind_idx| { - let task_kind = ::from_usize(task_kind_idx); + let task_kind = TaskKind::from_usize(task_kind_idx); if ScanLatency::TRACKED_TASK_KINDS.contains(&task_kind) { let task_kind = task_kind.into(); @@ -300,10 +300,10 @@ static PAGE_CACHE_READ_ACCESSES: Lazy = Lazy::new(|| { pub(crate) static PAGE_CACHE: Lazy = Lazy::new(|| PageCacheMetrics { map: EnumMap::from_array(std::array::from_fn(|task_kind| { - let task_kind = ::from_usize(task_kind); + let task_kind = TaskKind::from_usize(task_kind); let task_kind: &'static str = task_kind.into(); EnumMap::from_array(std::array::from_fn(|content_kind| { - let content_kind = ::from_usize(content_kind); + let content_kind = PageContentKind::from_usize(content_kind); let content_kind: &'static str = content_kind.into(); PageCacheMetricsForTaskKind { read_accesses_immutable: { @@ -1366,10 +1366,7 @@ impl SmgrOpTimer { /// The first callers receives Some, subsequent ones None. /// /// See [`SmgrOpTimerState`] for more context. 
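Editor's note: the metrics changes above replace the UFCS-style `<TaskKind as enum_map::Enum<...>>::from_usize` calls with plain `TaskKind::from_usize`, building each map via `EnumMap::from_array(std::array::from_fn(..))`. A standalone sketch of that pattern with a toy enum (the enum and labels are illustrative, not pageserver types):

```rust
use enum_map::{Enum, EnumMap};

#[derive(Enum, Clone, Copy, Debug)]
enum Kind {
    Compaction,
    Gc,
    Eviction,
}

fn main() {
    // Build one entry per variant, indexed by the variant's usize position,
    // mirroring the `from_array(std::array::from_fn(..))` pattern in metrics.rs.
    let labels: EnumMap<Kind, String> = EnumMap::from_array(std::array::from_fn(|i| {
        let kind = Kind::from_usize(i);
        format!("{kind:?}")
    }));
    assert_eq!(labels[Kind::Gc], "Gc");
}
```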
- pub(crate) fn observe_execution_end_flush_start( - &mut self, - at: Instant, - ) -> Option { + pub(crate) fn observe_execution_end(&mut self, at: Instant) -> Option { // NB: unlike the other observe_* methods, this one take()s. #[allow(clippy::question_mark)] // maintain similar code pattern. let Some(mut inner) = self.0.take() else { @@ -1403,7 +1400,6 @@ impl SmgrOpTimer { .. } = inner; Some(SmgrOpFlushInProgress { - flush_started_at: at, global_micros: global_flush_in_progress_micros, per_timeline_micros: per_timeline_flush_in_progress_micros, }) @@ -1419,7 +1415,6 @@ impl SmgrOpTimer { /// add another `observe_*` method to [`SmgrOpTimer`], follow the existing pattern there, /// and remove this struct from the code base. pub(crate) struct SmgrOpFlushInProgress { - flush_started_at: Instant, global_micros: IntCounter, per_timeline_micros: IntCounter, } @@ -1438,12 +1433,13 @@ impl Drop for SmgrOpTimer { self.observe_throttle_start(now); self.observe_throttle_done(ThrottleResult::NotThrottled { end: now }); self.observe_execution_start(now); - self.observe_execution_end_flush_start(now); + let maybe_flush_timer = self.observe_execution_end(now); + drop(maybe_flush_timer); } } impl SmgrOpFlushInProgress { - pub(crate) async fn measure(mut self, mut fut: Fut) -> O + pub(crate) async fn measure(self, mut started_at: Instant, mut fut: Fut) -> O where Fut: std::future::Future, { @@ -1455,12 +1451,12 @@ impl SmgrOpFlushInProgress { let mut observe_guard = scopeguard::guard( || { let now = Instant::now(); - let elapsed = now - self.flush_started_at; + let elapsed = now - started_at; self.global_micros .inc_by(u64::try_from(elapsed.as_micros()).unwrap()); self.per_timeline_micros .inc_by(u64::try_from(elapsed.as_micros()).unwrap()); - self.flush_started_at = now; + started_at = now; }, |mut observe| { observe(); @@ -1913,7 +1909,7 @@ pub(crate) static COMPUTE_COMMANDS_COUNTERS: Lazy = Lazy ComputeCommandCounters { map: EnumMap::from_array(std::array::from_fn(|i| { - let command = ::from_usize(i); + let command = ComputeCommandKind::from_usize(i); let command_str: &'static str = command.into(); inner.with_label_values(&[command_str]) })), @@ -2213,11 +2209,13 @@ pub(crate) static TENANT_TASK_EVENTS: Lazy = Lazy::new(|| { pub struct BackgroundLoopSemaphoreMetrics { counters: EnumMap, - durations: EnumMap, + durations: EnumMap, + waiting_tasks: EnumMap, + running_tasks: EnumMap, } -pub(crate) static BACKGROUND_LOOP_SEMAPHORE: Lazy = Lazy::new( - || { +pub(crate) static BACKGROUND_LOOP_SEMAPHORE: Lazy = + Lazy::new(|| { let counters = register_int_counter_pair_vec!( "pageserver_background_loop_semaphore_wait_start_count", "Counter for background loop concurrency-limiting semaphore acquire calls started", @@ -2227,45 +2225,101 @@ pub(crate) static BACKGROUND_LOOP_SEMAPHORE: Lazy::from_usize(i); + counters: EnumMap::from_array(std::array::from_fn(|i| { + let kind = BackgroundLoopKind::from_usize(i); counters.with_label_values(&[kind.into()]) })), - durations: enum_map::EnumMap::from_array(std::array::from_fn(|i| { - let kind = ::from_usize(i); + durations: EnumMap::from_array(std::array::from_fn(|i| { + let kind = BackgroundLoopKind::from_usize(i); durations.with_label_values(&[kind.into()]) })), + waiting_tasks: EnumMap::from_array(std::array::from_fn(|i| { + let kind = BackgroundLoopKind::from_usize(i); + waiting_tasks.with_label_values(&[kind.into()]) + })), + running_tasks: EnumMap::from_array(std::array::from_fn(|i| { + let kind = BackgroundLoopKind::from_usize(i); + 
running_tasks.with_label_values(&[kind.into()]) + })), } - }, -); + }); impl BackgroundLoopSemaphoreMetrics { - pub(crate) fn measure_acquisition(&self, task: BackgroundLoopKind) -> impl Drop + '_ { - struct Record<'a> { - metrics: &'a BackgroundLoopSemaphoreMetrics, - task: BackgroundLoopKind, - _counter_guard: metrics::IntCounterPairGuard, - start: Instant, - } - impl Drop for Record<'_> { - fn drop(&mut self) { - let elapsed = self.start.elapsed().as_secs_f64(); - self.metrics.durations[self.task].inc_by(elapsed); - } - } - Record { - metrics: self, + /// Starts recording semaphore metrics. Call `acquired()` on the returned recorder when the + /// semaphore is acquired, and drop it when the task completes or is cancelled. + pub(crate) fn record( + &self, + task: BackgroundLoopKind, + ) -> BackgroundLoopSemaphoreMetricsRecorder { + BackgroundLoopSemaphoreMetricsRecorder::start(self, task) + } +} + +/// Records metrics for a background task. +pub struct BackgroundLoopSemaphoreMetricsRecorder<'a> { + metrics: &'a BackgroundLoopSemaphoreMetrics, + task: BackgroundLoopKind, + start: Instant, + wait_counter_guard: Option, +} + +impl<'a> BackgroundLoopSemaphoreMetricsRecorder<'a> { + /// Starts recording semaphore metrics, by recording wait time and incrementing + /// `wait_start_count` and `waiting_tasks`. + fn start(metrics: &'a BackgroundLoopSemaphoreMetrics, task: BackgroundLoopKind) -> Self { + metrics.waiting_tasks[task].inc(); + Self { + metrics, task, - _counter_guard: self.counters[task].guard(), start: Instant::now(), + wait_counter_guard: Some(metrics.counters[task].guard()), + } + } + + /// Signals that the semaphore has been acquired, and updates relevant metrics. + pub fn acquired(&mut self) -> Duration { + let waited = self.start.elapsed(); + self.wait_counter_guard.take().expect("already acquired"); + self.metrics.durations[self.task].observe(waited.as_secs_f64()); + self.metrics.waiting_tasks[self.task].dec(); + self.metrics.running_tasks[self.task].inc(); + waited + } +} + +impl Drop for BackgroundLoopSemaphoreMetricsRecorder<'_> { + /// The task either completed or was cancelled. + fn drop(&mut self) { + if self.wait_counter_guard.take().is_some() { + // Waiting. + self.metrics.durations[self.task].observe(self.start.elapsed().as_secs_f64()); + self.metrics.waiting_tasks[self.task].dec(); + } else { + // Running. 
+ self.metrics.running_tasks[self.task].dec(); } } } @@ -2514,7 +2568,7 @@ pub(crate) static WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM: Lazy = pub(crate) struct WalRedoProcessCounters { pub(crate) started: IntCounter, - pub(crate) killed_by_cause: enum_map::EnumMap, + pub(crate) killed_by_cause: EnumMap, pub(crate) active_stderr_logger_tasks_started: IntCounter, pub(crate) active_stderr_logger_tasks_finished: IntCounter, } @@ -2556,7 +2610,7 @@ impl Default for WalRedoProcessCounters { Self { started, killed_by_cause: EnumMap::from_array(std::array::from_fn(|i| { - let cause = ::from_usize(i); + let cause = WalRedoKillCause::from_usize(i); let cause_str: &'static str = cause.into(); killed.with_label_values(&[cause_str]) })), diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index e103338c7c..972dad34d4 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -489,7 +489,6 @@ impl timeline::handle::TenantManager for TenantManagerWrappe let timeline = tenant_shard .get_timeline(timeline_id, true) .map_err(GetActiveTimelineError::Timeline)?; - set_tracing_field_shard_id(&timeline); Ok(timeline) } } @@ -774,11 +773,11 @@ impl PageServerHandler { let batched_msg = match neon_fe_msg { PagestreamFeMessage::Exists(req) => { - let span = tracing::info_span!(parent: parent_span, "handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.hdr.request_lsn); let shard = timeline_handles .get(tenant_id, timeline_id, ShardSelector::Zero) - .instrument(span.clone()) // sets `shard_id` field .await?; + debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id(); + let span = tracing::info_span!(parent: &parent_span, "handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.hdr.request_lsn, shard_id = %shard.tenant_shard_id.shard_slug()); let timer = record_op_start_and_throttle( &shard, metrics::SmgrQueryType::GetRelExists, @@ -793,11 +792,10 @@ impl PageServerHandler { } } PagestreamFeMessage::Nblocks(req) => { - let span = tracing::info_span!(parent: parent_span, "handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.hdr.request_lsn); let shard = timeline_handles .get(tenant_id, timeline_id, ShardSelector::Zero) - .instrument(span.clone()) // sets `shard_id` field .await?; + let span = tracing::info_span!(parent: &parent_span, "handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.hdr.request_lsn, shard_id = %shard.tenant_shard_id.shard_slug()); let timer = record_op_start_and_throttle( &shard, metrics::SmgrQueryType::GetRelSize, @@ -812,11 +810,10 @@ impl PageServerHandler { } } PagestreamFeMessage::DbSize(req) => { - let span = tracing::info_span!(parent: parent_span, "handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.hdr.request_lsn); let shard = timeline_handles .get(tenant_id, timeline_id, ShardSelector::Zero) - .instrument(span.clone()) // sets `shard_id` field .await?; + let span = tracing::info_span!(parent: &parent_span, "handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.hdr.request_lsn, shard_id = %shard.tenant_shard_id.shard_slug()); let timer = record_op_start_and_throttle( &shard, metrics::SmgrQueryType::GetDbSize, @@ -831,11 +828,10 @@ impl PageServerHandler { } } PagestreamFeMessage::GetSlruSegment(req) => { - let span = tracing::info_span!(parent: parent_span, "handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.hdr.request_lsn); let shard = timeline_handles .get(tenant_id, timeline_id, ShardSelector::Zero) - .instrument(span.clone()) // sets 
`shard_id` field .await?; + let span = tracing::info_span!(parent: &parent_span, "handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.hdr.request_lsn, shard_id = %shard.tenant_shard_id.shard_slug()); let timer = record_op_start_and_throttle( &shard, metrics::SmgrQueryType::GetSlruSegment, @@ -850,12 +846,20 @@ impl PageServerHandler { } } PagestreamFeMessage::GetPage(req) => { - let span = tracing::info_span!(parent: parent_span, "handle_get_page_at_lsn_request_batched", req_lsn = %req.hdr.request_lsn); + // avoid a somewhat costly Span::record() by constructing the entire span in one go. + macro_rules! mkspan { + (before shard routing) => {{ + tracing::info_span!(parent: &parent_span, "handle_get_page_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.hdr.request_lsn) + }}; + ($shard_id:expr) => {{ + tracing::info_span!(parent: &parent_span, "handle_get_page_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.hdr.request_lsn, shard_id = %$shard_id) + }}; + } macro_rules! respond_error { - ($error:expr) => {{ + ($span:expr, $error:expr) => {{ let error = BatchedFeMessage::RespondError { - span, + span: $span, error: BatchedPageStreamError { req: req.hdr, err: $error, @@ -868,27 +872,35 @@ impl PageServerHandler { let key = rel_block_to_key(req.rel, req.blkno); let shard = match timeline_handles .get(tenant_id, timeline_id, ShardSelector::Page(key)) - .instrument(span.clone()) // sets `shard_id` field .await { Ok(tl) => tl, - Err(GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_))) => { - // We already know this tenant exists in general, because we resolved it at - // start of connection. Getting a NotFound here indicates that the shard containing - // the requested page is not present on this node: the client's knowledge of shard->pageserver - // mapping is out of date. - // - // Closing the connection by returning ``::Reconnect` has the side effect of rate-limiting above message, via - // client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration - // and talk to a different pageserver. - return respond_error!(PageStreamError::Reconnect( - "getpage@lsn request routed to wrong shard".into() - )); - } Err(e) => { - return respond_error!(e.into()); + let span = mkspan!(before shard routing); + match e { + GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_)) => { + // We already know this tenant exists in general, because we resolved it at + // start of connection. Getting a NotFound here indicates that the shard containing + // the requested page is not present on this node: the client's knowledge of shard->pageserver + // mapping is out of date. + // + // Closing the connection by returning ``::Reconnect` has the side effect of rate-limiting above message, via + // client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration + // and talk to a different pageserver. 
+ return respond_error!( + span, + PageStreamError::Reconnect( + "getpage@lsn request routed to wrong shard".into() + ) + ); + } + e => { + return respond_error!(span, e.into()); + } + } } }; + let span = mkspan!(shard.tenant_shard_id.shard_slug()); let timer = record_op_start_and_throttle( &shard, @@ -910,7 +922,7 @@ impl PageServerHandler { { Ok(lsn) => lsn, Err(e) => { - return respond_error!(e); + return respond_error!(span, e); } }; BatchedFeMessage::GetPage { @@ -922,11 +934,10 @@ impl PageServerHandler { } #[cfg(feature = "testing")] PagestreamFeMessage::Test(req) => { - let span = tracing::info_span!(parent: parent_span, "handle_test_request"); let shard = timeline_handles .get(tenant_id, timeline_id, ShardSelector::Zero) - .instrument(span.clone()) // sets `shard_id` field .await?; + let span = tracing::info_span!(parent: &parent_span, "handle_test_request", shard_id = %shard.tenant_shard_id.shard_slug()); let timer = record_op_start_and_throttle(&shard, metrics::SmgrQueryType::Test, received_at) .await?; @@ -1063,7 +1074,7 @@ impl PageServerHandler { }; // invoke handler function - let (handler_results, span): ( + let (mut handler_results, span): ( Vec>, _, ) = match batch { @@ -1190,11 +1201,49 @@ impl PageServerHandler { } }; + // We purposefully don't count flush time into the smgr operation timer. + // + // The reason is that current compute client will not perform protocol processing + // if the postgres backend process is doing things other than `->smgr_read()`. + // This is especially the case for prefetch. + // + // If the compute doesn't read from the connection, eventually TCP will backpressure + // all the way into our flush call below. + // + // The timer's underlying metric is used for a storage-internal latency SLO and + // we don't want to include latency in it that we can't control. + // And as pointed out above, in this case, we don't control the time that flush will take. + // + // We put each response in the batch onto the wire in a separate pgb_writer.flush() + // call, which (all unmeasured) adds syscall overhead but reduces time to first byte + // and avoids building up a "giant" contiguous userspace buffer to hold the entire response. + // TODO: vectored socket IO would be great, but pgb_writer doesn't support that. + let flush_timers = { + let flushing_start_time = Instant::now(); + let mut flush_timers = Vec::with_capacity(handler_results.len()); + for handler_result in &mut handler_results { + let flush_timer = match handler_result { + Ok((_, timer)) => Some( + timer + .observe_execution_end(flushing_start_time) + .expect("we are the first caller"), + ), + Err(_) => { + // TODO: measure errors + None + } + }; + flush_timers.push(flush_timer); + } + assert_eq!(flush_timers.len(), handler_results.len()); + flush_timers + }; + // Map handler result to protocol behavior. // Some handler errors cause exit from pagestream protocol. // Other handler errors are sent back as an error message and we stay in pagestream protocol. 
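Editor's note: the comment block above explains why flush time is excluded from the smgr operation timer and tracked via a separate flush-in-progress counter. A simplified, standalone sketch of the underlying "record elapsed time even if the future is dropped" pattern, using `scopeguard` and a plain Prometheus counter; the function name and counter are illustrative, and the real `SmgrOpFlushInProgress::measure` observes incrementally rather than once at the end:

```rust
use std::future::Future;
use std::time::Instant;

use prometheus::IntCounter;

/// Measures how long `flush_fut` takes and adds the elapsed microseconds to
/// `flush_micros`, even if the future is cancelled (dropped) mid-flush.
async fn measure_flush<F, O>(flush_micros: &IntCounter, started_at: Instant, flush_fut: F) -> O
where
    F: Future<Output = O>,
{
    // The guard fires on drop: either on normal completion below, or when the
    // whole future is dropped early (e.g. client disconnect).
    let guard = scopeguard::guard(started_at, |started_at| {
        let elapsed = started_at.elapsed();
        flush_micros.inc_by(u64::try_from(elapsed.as_micros()).unwrap());
    });
    let out = flush_fut.await;
    drop(guard); // record exactly once
    out
}
```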
- for handler_result in handler_results { - let (response_msg, timer) = match handler_result { + for (handler_result, flushing_timer) in handler_results.into_iter().zip(flush_timers) { + let response_msg = match handler_result { Err(e) => match &e.err { PageStreamError::Shutdown => { // If we fail to fulfil a request during shutdown, which may be _because_ of @@ -1218,16 +1267,14 @@ impl PageServerHandler { span.in_scope(|| { error!("error reading relation or page version: {full:#}") }); - ( - PagestreamBeMessage::Error(PagestreamErrorResponse { - req: e.req, - message: e.err.to_string(), - }), - None, // TODO: measure errors - ) + + PagestreamBeMessage::Error(PagestreamErrorResponse { + req: e.req, + message: e.err.to_string(), + }) } }, - Ok((response_msg, timer)) => (response_msg, Some(timer)), + Ok((response_msg, _op_timer_already_observed)) => response_msg, }; // @@ -1238,30 +1285,12 @@ impl PageServerHandler { &response_msg.serialize(protocol_version), ))?; - // We purposefully don't count flush time into the timer. - // - // The reason is that current compute client will not perform protocol processing - // if the postgres backend process is doing things other than `->smgr_read()`. - // This is especially the case for prefetch. - // - // If the compute doesn't read from the connection, eventually TCP will backpressure - // all the way into our flush call below. - // - // The timer's underlying metric is used for a storage-internal latency SLO and - // we don't want to include latency in it that we can't control. - // And as pointed out above, in this case, we don't control the time that flush will take. - let flushing_timer = timer.map(|mut timer| { - timer - .observe_execution_end_flush_start(Instant::now()) - .expect("we are the first caller") - }); - // what we want to do let flush_fut = pgb_writer.flush(); // metric for how long flushing takes let flush_fut = match flushing_timer { Some(flushing_timer) => { - futures::future::Either::Left(flushing_timer.measure(flush_fut)) + futures::future::Either::Left(flushing_timer.measure(Instant::now(), flush_fut)) } None => futures::future::Either::Right(flush_fut), }; @@ -1280,8 +1309,6 @@ impl PageServerHandler { } Ok(()) } - // and log the info! line inside the request span - .instrument(span.clone()) .await?; } Ok(()) @@ -1342,7 +1369,7 @@ impl PageServerHandler { .take() .expect("implementation error: timeline_handles should not be locked"); - let request_span = info_span!("request", shard_id = tracing::field::Empty); + let request_span = info_span!("request"); let ((pgb_reader, timeline_handles), result) = match self.pipelining_config.clone() { PageServicePipeliningConfig::Pipelined(pipelining_config) => { self.handle_pagerequests_pipelined( @@ -1692,7 +1719,7 @@ impl PageServerHandler { // to distinguish a misbehaving client (asking for old LSN) from a storage issue (data missing at a legitimate LSN). if request_lsn < **latest_gc_cutoff_lsn && !timeline.is_gc_blocked_by_lsn_lease_deadline() { let gc_info = &timeline.gc_info.read().unwrap(); - if !gc_info.leases.contains_key(&request_lsn) { + if !gc_info.lsn_covered_by_lease(request_lsn) { return Err( PageStreamError::BadRequest(format!( "tried to request a page version that was garbage collected. 
requested at {} gc cutoff {}", @@ -2036,6 +2063,13 @@ impl PageServerHandler { .unwrap() .get(tenant_id, timeline_id, ShardSelector::Zero) .await?; + set_tracing_field_shard_id(&timeline); + + if timeline.is_archived() == Some(true) { + // TODO after a grace period, turn this log line into a hard error + tracing::warn!("timeline {tenant_id}/{timeline_id} is archived, but got basebackup request for it."); + //return Err(QueryError::NotFound("timeline is archived".into())) + } let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); if let Some(lsn) = lsn { diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index dcbf62b56c..00f332d797 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -612,11 +612,18 @@ impl Timeline { pausable_failpoint!("find-lsn-for-timestamp-pausable"); let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn(); + let gc_cutoff_planned = { + let gc_info = self.gc_info.read().unwrap(); + gc_info.min_cutoff() + }; + // Usually the planned cutoff is newer than the cutoff of the last gc run, + // but let's be defensive. + let gc_cutoff = gc_cutoff_planned.max(*gc_cutoff_lsn_guard); // We use this method to figure out the branching LSN for the new branch, but the // GC cutoff could be before the branching point and we cannot create a new branch // with LSN < `ancestor_lsn`. Thus, pick the maximum of these two to be // on the safe side. - let min_lsn = std::cmp::max(*gc_cutoff_lsn_guard, self.get_ancestor_lsn()); + let min_lsn = std::cmp::max(gc_cutoff, self.get_ancestor_lsn()); let max_lsn = self.get_last_record_lsn(); // LSNs are always 8-byte aligned. low/mid/high represent the diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 622738022a..cc93a06ccd 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -328,8 +328,8 @@ pub enum TaskKind { // Eviction. One per timeline. Eviction, - // Ingest housekeeping (flushing ephemeral layers on time threshold or disk pressure) - IngestHousekeeping, + // Tenant housekeeping (flush idle ephemeral layers, shut down idle walredo, etc.). + TenantHousekeeping, /// See [`crate::disk_usage_eviction_task`]. DiskUsageEviction, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index c1b408ed72..4c65991e45 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -20,6 +20,7 @@ use chrono::NaiveDateTime; use enumset::EnumSet; use futures::stream::FuturesUnordered; use futures::StreamExt; +use itertools::Itertools as _; use pageserver_api::models; use pageserver_api::models::CompactInfoResponse; use pageserver_api::models::LsnLease; @@ -51,10 +52,13 @@ use timeline::compaction::GcCompactionQueue; use timeline::import_pgdata; use timeline::offload::offload_timeline; use timeline::offload::OffloadError; +use timeline::CompactFlags; use timeline::CompactOptions; +use timeline::CompactionError; use timeline::ShutdownMode; use tokio::io::BufReader; use tokio::sync::watch; +use tokio::sync::Notify; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use tracing::*; @@ -349,6 +353,9 @@ pub struct Tenant { /// Overhead of mutex is acceptable because compaction is done with a multi-second period. compaction_circuit_breaker: std::sync::Mutex, + /// Signals the tenant compaction loop that there is L0 compaction work to be done. + pub(crate) l0_compaction_trigger: Arc, + /// Scheduled gc-compaction tasks. 
scheduled_compaction_tasks: std::sync::Mutex>>, @@ -1690,12 +1697,7 @@ impl Tenant { timeline_id, index_part, remote_metadata, - TimelineResources { - remote_client, - pagestream_throttle: self.pagestream_throttle.clone(), - pagestream_throttle_metrics: self.pagestream_throttle_metrics.clone(), - l0_flush_global_state: self.l0_flush_global_state.clone(), - }, + self.get_timeline_resources_for(remote_client), LoadTimelineCause::Attach, ctx, ) @@ -2898,150 +2900,194 @@ impl Tenant { .await } - /// Perform one compaction iteration. - /// This function is periodically called by compactor task. - /// Also it can be explicitly requested per timeline through page server - /// api's 'compact' command. + /// Performs one compaction iteration. Called periodically from the compaction loop. Returns + /// whether another compaction is needed, if we still have pending work or if we yield for + /// immediate L0 compaction. /// - /// Returns whether we have pending compaction task. + /// Compaction can also be explicitly requested for a timeline via the HTTP API. async fn compaction_iteration( self: &Arc, cancel: &CancellationToken, ctx: &RequestContext, - ) -> Result { - // Don't start doing work during shutdown, or when broken, we do not need those in the logs + ) -> Result { + // Don't compact inactive tenants. if !self.is_active() { - return Ok(CompactionOutcome::Done); + return Ok(CompactionOutcome::Skipped); } - { - let conf = self.tenant_conf.load(); - - // Note that compaction usually requires deletions, but we don't respect - // may_delete_layers_hint here: that is because tenants in AttachedMulti - // should proceed with compaction even if they can't do deletion, to avoid - // accumulating dangerously deep stacks of L0 layers. Deletions will be - // enqueued inside RemoteTimelineClient, and executed layer if/when we transition - // to AttachedSingle state. - if !conf.location.may_upload_layers_hint() { - info!("Skipping compaction in location state {:?}", conf.location); - return Ok(CompactionOutcome::Done); - } + // Don't compact tenants that can't upload layers. We don't check `may_delete_layers_hint`, + // since we need to compact L0 even in AttachedMulti to bound read amplification. + let location = self.tenant_conf.load().location; + if !location.may_upload_layers_hint() { + info!("skipping compaction in location state {location:?}"); + return Ok(CompactionOutcome::Skipped); } - // Scan through the hashmap and collect a list of all the timelines, - // while holding the lock. Then drop the lock and actually perform the - // compactions. We don't want to block everything else while the - // compaction runs. 
- let timelines_to_compact_or_offload; - { - let timelines = self.timelines.lock().unwrap(); - timelines_to_compact_or_offload = timelines - .iter() - .filter_map(|(timeline_id, timeline)| { - let (is_active, (can_offload, _)) = - (timeline.is_active(), timeline.can_offload()); - let has_no_unoffloaded_children = { - !timelines - .iter() - .any(|(_id, tl)| tl.get_ancestor_timeline_id() == Some(*timeline_id)) - }; - let config_allows_offload = self.conf.timeline_offloading - || self - .tenant_conf - .load() - .tenant_conf - .timeline_offloading - .unwrap_or_default(); - let can_offload = - can_offload && has_no_unoffloaded_children && config_allows_offload; - if (is_active, can_offload) == (false, false) { - None - } else { - Some((*timeline_id, timeline.clone(), (is_active, can_offload))) - } - }) - .collect::>(); - drop(timelines); - } - - // Before doing any I/O work, check our circuit breaker + // Don't compact if the circuit breaker is tripped. if self.compaction_circuit_breaker.lock().unwrap().is_broken() { - info!("Skipping compaction due to previous failures"); - return Ok(CompactionOutcome::Done); + info!("skipping compaction due to previous failures"); + return Ok(CompactionOutcome::Skipped); } - let mut has_pending_task = false; + // Collect all timelines to compact, along with offload instructions and L0 counts. + let mut compact: Vec> = Vec::new(); + let mut offload: HashSet = HashSet::new(); + let mut l0_counts: HashMap = HashMap::new(); - for (timeline_id, timeline, (can_compact, can_offload)) in &timelines_to_compact_or_offload { - // pending_task_left == None: cannot compact, maybe still pending tasks - // pending_task_left == Some(Pending): compaction task left - // pending_task_left == Some(Done): no compaction task left - let pending_task_left = if *can_compact { - let compaction_outcome = timeline - .compact(cancel, EnumSet::empty(), ctx) - .instrument(info_span!("compact_timeline", %timeline_id)) - .await - .inspect_err(|e| match e { - timeline::CompactionError::ShuttingDown => (), - timeline::CompactionError::Offload(_) => { - // Failures to offload timelines do not trip the circuit breaker, because - // they do not do lots of writes the way compaction itself does: it is cheap - // to retry, and it would be bad to stop all compaction because of an issue with offloading. - } - timeline::CompactionError::Other(e) => { - self.compaction_circuit_breaker - .lock() - .unwrap() - .fail(&CIRCUIT_BREAKERS_BROKEN, e); - } - })?; - if let CompactionOutcome::Pending = compaction_outcome { - Some(CompactionOutcome::Pending) - } else { - let queue = { - let guard = self.scheduled_compaction_tasks.lock().unwrap(); - guard.get(timeline_id).cloned() - }; - if let Some(queue) = queue { - let outcome = queue - .iteration(cancel, ctx, &self.gc_block, timeline) - .await?; - Some(outcome) - } else { - Some(CompactionOutcome::Done) - } + let offload_enabled = self.get_timeline_offloading_enabled(); + let timelines = self.timelines.lock().unwrap(); + for (&timeline_id, timeline) in timelines.iter() { + // Skip inactive timelines. 
+ if !timeline.is_active() { + continue; } - } else { - None - }; - has_pending_task |= pending_task_left == Some(CompactionOutcome::Pending); - if pending_task_left == Some(CompactionOutcome::Done) && *can_offload { - pausable_failpoint!("before-timeline-auto-offload"); - match offload_timeline(self, timeline) - .instrument(info_span!("offload_timeline", %timeline_id)) - .await - { - Err(OffloadError::NotArchived) => { - // Ignore this, we likely raced with unarchival - Ok(()) - } - other => other, - }?; + + // Schedule the timeline for compaction. + compact.push(timeline.clone()); + + // Schedule the timeline for offloading if eligible. + let can_offload = offload_enabled + && timeline.can_offload().0 + && !timelines + .iter() + .any(|(_, tli)| tli.get_ancestor_timeline_id() == Some(timeline_id)); + if can_offload { + offload.insert(timeline_id); + } + } + } // release timelines lock + + for timeline in &compact { + // Collect L0 counts. Can't await while holding lock above. + if let Ok(lm) = timeline.layers.read().await.layer_map() { + l0_counts.insert(timeline.timeline_id, lm.level0_deltas().len()); } } + // Pass 1: L0 compaction across all timelines, in order of L0 count. We prioritize this to + // bound read amplification. + // + // TODO: this may spin on one or more ingest-heavy timelines, starving out image/GC + // compaction and offloading. We leave that as a potential problem to solve later. Consider + // splitting L0 and image/GC compaction to separate background jobs. + if self.get_compaction_l0_first() { + let compaction_threshold = self.get_compaction_threshold(); + let compact_l0 = compact + .iter() + .map(|tli| (tli, l0_counts.get(&tli.timeline_id).copied().unwrap_or(0))) + .filter(|&(_, l0)| l0 >= compaction_threshold) + .sorted_by_key(|&(_, l0)| l0) + .rev() + .map(|(tli, _)| tli.clone()) + .collect_vec(); + + let mut has_pending_l0 = false; + for timeline in compact_l0 { + let outcome = timeline + .compact(cancel, CompactFlags::OnlyL0Compaction.into(), ctx) + .instrument(info_span!("compact_timeline", timeline_id = %timeline.timeline_id)) + .await + .inspect_err(|err| self.maybe_trip_compaction_breaker(err))?; + match outcome { + CompactionOutcome::Done => {} + CompactionOutcome::Skipped => {} + CompactionOutcome::Pending => has_pending_l0 = true, + CompactionOutcome::YieldForL0 => has_pending_l0 = true, + } + } + if has_pending_l0 { + return Ok(CompactionOutcome::YieldForL0); // do another pass + } + } + + // Pass 2: image compaction and timeline offloading. If any timelines have accumulated + // more L0 layers, they may also be compacted here. + // + // NB: image compaction may yield if there is pending L0 compaction. + // + // TODO: it will only yield if there is pending L0 compaction on the same timeline. If a + // different timeline needs compaction, it won't. It should check `l0_compaction_trigger`. + // We leave this for a later PR. + // + // TODO: consider ordering timelines by some priority, e.g. time since last full compaction, + // amount of L1 delta debt or garbage, offload-eligible timelines first, etc. + let mut has_pending = false; + for timeline in compact { + if !timeline.is_active() { + continue; + } + + let mut outcome = timeline + .compact(cancel, EnumSet::default(), ctx) + .instrument(info_span!("compact_timeline", timeline_id = %timeline.timeline_id)) + .await + .inspect_err(|err| self.maybe_trip_compaction_breaker(err))?; + + // If we're done compacting, check the scheduled GC compaction queue for more work. 
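Editor's note: pass 1 above selects timelines at or above the compaction threshold and visits them in descending L0-count order via `sorted_by_key(..).rev()`. A standalone sketch of that ordering using plain tuples instead of timeline handles (the ids, counts, and threshold in the test are illustrative):

```rust
use itertools::Itertools as _;

/// Returns timeline ids ordered for L0-first compaction: only timelines with
/// `l0 >= threshold`, highest L0 count first.
fn order_for_l0_compaction(l0_counts: &[(u32, usize)], threshold: usize) -> Vec<u32> {
    l0_counts
        .iter()
        .filter(|(_, l0)| *l0 >= threshold)
        .sorted_by_key(|(_, l0)| *l0)
        .rev()
        .map(|(id, _)| *id)
        .collect()
}

#[test]
fn highest_l0_first() {
    let counts = [(1, 3), (2, 25), (3, 12)];
    // Timeline 1 is below the threshold; 2 has the most L0 layers, so it goes first.
    assert_eq!(order_for_l0_compaction(&counts, 10), vec![2, 3]);
}
```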
+ if outcome == CompactionOutcome::Done { + let queue = self + .scheduled_compaction_tasks + .lock() + .unwrap() + .get(&timeline.timeline_id) + .cloned(); + if let Some(queue) = queue { + outcome = queue + .iteration(cancel, ctx, &self.gc_block, &timeline) + .await?; + } + } + + // If we're done compacting, offload the timeline if requested. + if outcome == CompactionOutcome::Done && offload.contains(&timeline.timeline_id) { + pausable_failpoint!("before-timeline-auto-offload"); + offload_timeline(self, &timeline) + .instrument(info_span!("offload_timeline", timeline_id = %timeline.timeline_id)) + .await + .or_else(|err| match err { + // Ignore this, we likely raced with unarchival. + OffloadError::NotArchived => Ok(()), + err => Err(err), + })?; + } + + match outcome { + CompactionOutcome::Done => {} + CompactionOutcome::Skipped => {} + CompactionOutcome::Pending => has_pending = true, + // This mostly makes sense when the L0-only pass above is enabled, since there's + // otherwise no guarantee that we'll start with the timeline that has high L0. + CompactionOutcome::YieldForL0 => return Ok(CompactionOutcome::YieldForL0), + } + } + + // Success! Untrip the breaker if necessary. self.compaction_circuit_breaker .lock() .unwrap() .success(&CIRCUIT_BREAKERS_UNBROKEN); - Ok(if has_pending_task { - CompactionOutcome::Pending - } else { - CompactionOutcome::Done - }) + match has_pending { + true => Ok(CompactionOutcome::Pending), + false => Ok(CompactionOutcome::Done), + } + } + + /// Trips the compaction circuit breaker if appropriate. + pub(crate) fn maybe_trip_compaction_breaker(&self, err: &CompactionError) { + match err { + CompactionError::ShuttingDown => (), + // Offload failures don't trip the circuit breaker, since they're cheap to retry and + // shouldn't block compaction. + CompactionError::Offload(_) => {} + CompactionError::Other(err) => { + self.compaction_circuit_breaker + .lock() + .unwrap() + .fail(&CIRCUIT_BREAKERS_BROKEN, err); + } + } } /// Cancel scheduled compaction tasks @@ -3088,32 +3134,28 @@ impl Tenant { Ok(rx) } - // Call through to all timelines to freeze ephemeral layers if needed. Usually - // this happens during ingest: this background housekeeping is for freezing layers - // that are open but haven't been written to for some time. - async fn ingest_housekeeping(&self) { - // Scan through the hashmap and collect a list of all the timelines, - // while holding the lock. Then drop the lock and actually perform the - // compactions. We don't want to block everything else while the - // compaction runs. - let timelines = { - self.timelines - .lock() - .unwrap() - .values() - .filter_map(|timeline| { - if timeline.is_active() { - Some(timeline.clone()) - } else { - None - } - }) - .collect::>() - }; + /// Performs periodic housekeeping, via the tenant housekeeping background task. + async fn housekeeping(&self) { + // Call through to all timelines to freeze ephemeral layers as needed. This usually happens + // during ingest, but we don't want idle timelines to hold open layers for too long. + let timelines = self + .timelines + .lock() + .unwrap() + .values() + .filter(|tli| tli.is_active()) + .cloned() + .collect_vec(); - for timeline in &timelines { + for timeline in timelines { timeline.maybe_freeze_ephemeral_layer().await; } + + // Shut down walredo if idle. 
+ const WALREDO_IDLE_TIMEOUT: Duration = Duration::from_secs(180); + if let Some(ref walredo_mgr) = self.walredo_mgr { + walredo_mgr.maybe_quiesce(WALREDO_IDLE_TIMEOUT); + } } pub fn timeline_has_no_attached_children(&self, timeline_id: TimelineId) -> bool { @@ -3823,6 +3865,13 @@ impl Tenant { .unwrap_or(self.conf.default_tenant_conf.compaction_upper_limit) } + pub fn get_compaction_l0_first(&self) -> bool { + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); + tenant_conf + .compaction_l0_first + .unwrap_or(self.conf.default_tenant_conf.compaction_l0_first) + } + pub fn get_gc_horizon(&self) -> u64 { let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf @@ -3877,6 +3926,16 @@ impl Tenant { .unwrap_or(self.conf.default_tenant_conf.lsn_lease_length) } + pub fn get_timeline_offloading_enabled(&self) -> bool { + if self.conf.timeline_offloading { + return true; + } + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); + tenant_conf + .timeline_offloading + .unwrap_or(self.conf.default_tenant_conf.timeline_offloading) + } + /// Generate an up-to-date TenantManifest based on the state of this Tenant. fn build_tenant_manifest(&self) -> TenantManifest { let timelines_offloaded = self.timelines_offloaded.lock().unwrap(); @@ -4115,6 +4174,7 @@ impl Tenant { // use an extremely long backoff. Some(Duration::from_secs(3600 * 24)), )), + l0_compaction_trigger: Arc::new(Notify::new()), scheduled_compaction_tasks: Mutex::new(Default::default()), activate_now_sem: tokio::sync::Semaphore::new(0), attach_wal_lag_cooldown: Arc::new(std::sync::OnceLock::new()), @@ -4642,22 +4702,26 @@ impl Tenant { // check against last actual 'latest_gc_cutoff' first let latest_gc_cutoff_lsn = src_timeline.get_latest_gc_cutoff_lsn(); - src_timeline - .check_lsn_is_in_scope(start_lsn, &latest_gc_cutoff_lsn) - .context(format!( - "invalid branch start lsn: less than latest GC cutoff {}", - *latest_gc_cutoff_lsn, - )) - .map_err(CreateTimelineError::AncestorLsn)?; - - // and then the planned GC cutoff { let gc_info = src_timeline.gc_info.read().unwrap(); - let cutoff = gc_info.min_cutoff(); - if start_lsn < cutoff { - return Err(CreateTimelineError::AncestorLsn(anyhow::anyhow!( - "invalid branch start lsn: less than planned GC cutoff {cutoff}" - ))); + let planned_cutoff = gc_info.min_cutoff(); + if gc_info.lsn_covered_by_lease(start_lsn) { + tracing::info!("skipping comparison of {start_lsn} with gc cutoff {} and planned gc cutoff {planned_cutoff} due to lsn lease", *latest_gc_cutoff_lsn); + } else { + src_timeline + .check_lsn_is_in_scope(start_lsn, &latest_gc_cutoff_lsn) + .context(format!( + "invalid branch start lsn: less than latest GC cutoff {}", + *latest_gc_cutoff_lsn, + )) + .map_err(CreateTimelineError::AncestorLsn)?; + + // and then the planned GC cutoff + if start_lsn < planned_cutoff { + return Err(CreateTimelineError::AncestorLsn(anyhow::anyhow!( + "invalid branch start lsn: less than planned GC cutoff {planned_cutoff}" + ))); + } } } @@ -5019,12 +5083,19 @@ impl Tenant { ) } - /// Call this before constructing a timeline, to build its required structures + /// Builds required resources for a new timeline. fn build_timeline_resources(&self, timeline_id: TimelineId) -> TimelineResources { + let remote_client = self.build_timeline_remote_client(timeline_id); + self.get_timeline_resources_for(remote_client) + } + + /// Builds timeline resources for the given remote client. 
+ fn get_timeline_resources_for(&self, remote_client: RemoteTimelineClient) -> TimelineResources { TimelineResources { - remote_client: self.build_timeline_remote_client(timeline_id), + remote_client, pagestream_throttle: self.pagestream_throttle.clone(), pagestream_throttle_metrics: self.pagestream_throttle_metrics.clone(), + l0_compaction_trigger: self.l0_compaction_trigger.clone(), l0_flush_global_state: self.l0_flush_global_state.clone(), } } @@ -5470,6 +5541,8 @@ pub(crate) mod harness { compaction_threshold: Some(tenant_conf.compaction_threshold), compaction_upper_limit: Some(tenant_conf.compaction_upper_limit), compaction_algorithm: Some(tenant_conf.compaction_algorithm), + compaction_l0_first: Some(tenant_conf.compaction_l0_first), + compaction_l0_semaphore: Some(tenant_conf.compaction_l0_semaphore), l0_flush_delay_threshold: tenant_conf.l0_flush_delay_threshold, l0_flush_stall_threshold: tenant_conf.l0_flush_stall_threshold, l0_flush_wait_upload: Some(tenant_conf.l0_flush_wait_upload), @@ -7697,6 +7770,18 @@ mod tests { } tline.freeze_and_flush().await?; + // Force layers to L1 + tline + .compact( + &cancel, + { + let mut flags = EnumSet::new(); + flags.insert(CompactFlags::ForceL0Compaction); + flags + }, + &ctx, + ) + .await?; if iter % 5 == 0 { let (_, before_delta_file_accessed) = @@ -7709,6 +7794,7 @@ mod tests { let mut flags = EnumSet::new(); flags.insert(CompactFlags::ForceImageLayerCreation); flags.insert(CompactFlags::ForceRepartition); + flags.insert(CompactFlags::ForceL0Compaction); flags }, &ctx, @@ -8155,6 +8241,8 @@ mod tests { let cancel = CancellationToken::new(); + // Image layer creation happens on the disk_consistent_lsn so we need to force set it now. + tline.force_set_disk_consistent_lsn(Lsn(0x40)); tline .compact( &cancel, @@ -8168,8 +8256,7 @@ mod tests { ) .await .unwrap(); - - // Image layers are created at last_record_lsn + // Image layers are created at repartition LSN let images = tline .inspect_image_layers(Lsn(0x40), &ctx, io_concurrency.clone()) .await diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 972837dc44..7fdfd736ad 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -285,6 +285,14 @@ pub struct TenantConfOpt { #[serde(default)] pub compaction_algorithm: Option, + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default)] + pub compaction_l0_first: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default)] + pub compaction_l0_semaphore: Option, + #[serde(skip_serializing_if = "Option::is_none")] #[serde(default)] pub l0_flush_delay_threshold: Option, @@ -416,6 +424,12 @@ impl TenantConfOpt { .as_ref() .unwrap_or(&global_conf.compaction_algorithm) .clone(), + compaction_l0_first: self + .compaction_l0_first + .unwrap_or(global_conf.compaction_l0_first), + compaction_l0_semaphore: self + .compaction_l0_semaphore + .unwrap_or(global_conf.compaction_l0_semaphore), l0_flush_delay_threshold: self .l0_flush_delay_threshold .or(global_conf.l0_flush_delay_threshold), @@ -466,7 +480,7 @@ impl TenantConfOpt { .lsn_lease_length_for_ts .unwrap_or(global_conf.lsn_lease_length_for_ts), timeline_offloading: self - .lazy_slru_download + .timeline_offloading .unwrap_or(global_conf.timeline_offloading), wal_receiver_protocol_override: self .wal_receiver_protocol_override @@ -493,6 +507,8 @@ impl TenantConfOpt { mut compaction_threshold, mut compaction_upper_limit, mut compaction_algorithm, + mut compaction_l0_first, + mut compaction_l0_semaphore, mut 
l0_flush_delay_threshold, mut l0_flush_stall_threshold, mut l0_flush_wait_upload, @@ -538,6 +554,10 @@ impl TenantConfOpt { .compaction_upper_limit .apply(&mut compaction_upper_limit); patch.compaction_algorithm.apply(&mut compaction_algorithm); + patch.compaction_l0_first.apply(&mut compaction_l0_first); + patch + .compaction_l0_semaphore + .apply(&mut compaction_l0_semaphore); patch .l0_flush_delay_threshold .apply(&mut l0_flush_delay_threshold); @@ -619,6 +639,8 @@ impl TenantConfOpt { compaction_threshold, compaction_upper_limit, compaction_algorithm, + compaction_l0_first, + compaction_l0_semaphore, l0_flush_delay_threshold, l0_flush_stall_threshold, l0_flush_wait_upload, @@ -681,6 +703,8 @@ impl From for models::TenantConfig { compaction_period: value.compaction_period.map(humantime), compaction_threshold: value.compaction_threshold, compaction_upper_limit: value.compaction_upper_limit, + compaction_l0_first: value.compaction_l0_first, + compaction_l0_semaphore: value.compaction_l0_semaphore, l0_flush_delay_threshold: value.l0_flush_delay_threshold, l0_flush_stall_threshold: value.l0_flush_stall_threshold, l0_flush_wait_upload: value.l0_flush_wait_upload, diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index dfa89a765c..22ee560dbf 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -2816,8 +2816,8 @@ where } use { - crate::tenant::gc_result::GcResult, pageserver_api::models::TimelineGcRequest, - utils::http::error::ApiError, + crate::tenant::gc_result::GcResult, http_utils::error::ApiError, + pageserver_api::models::TimelineGcRequest, }; #[cfg(test)] diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index bcba6d1f62..713efbb9a4 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -437,8 +437,7 @@ impl RemoteTimelineClient { .conf .remote_storage_config .as_ref() - .and_then(|r| r.concurrency_limit()) - .unwrap_or(0); + .map_or(0, |r| r.concurrency_limit()); let mut upload_queue = self.upload_queue.lock().unwrap(); upload_queue.initialize_with_current_remote_index_part(index_part, inprogress_limit)?; self.update_remote_physical_size_gauge(Some(index_part)); @@ -461,8 +460,7 @@ impl RemoteTimelineClient { .conf .remote_storage_config .as_ref() - .and_then(|r| r.concurrency_limit()) - .unwrap_or(0); + .map_or(0, |r| r.concurrency_limit()); let mut upload_queue = self.upload_queue.lock().unwrap(); upload_queue.initialize_empty_remote(local_metadata, inprogress_limit)?; self.update_remote_physical_size_gauge(None); @@ -484,8 +482,7 @@ impl RemoteTimelineClient { .conf .remote_storage_config .as_ref() - .and_then(|r| r.concurrency_limit()) - .unwrap_or(0); + .map_or(0, |r| r.concurrency_limit()); let mut upload_queue = self.upload_queue.lock().unwrap(); upload_queue.initialize_with_current_remote_index_part(index_part, inprogress_limit)?; @@ -520,7 +517,7 @@ impl RemoteTimelineClient { if let Ok(queue) = queue_locked.initialized_mut() { let blocked_deletions = std::mem::take(&mut queue.blocked_deletions); for d in blocked_deletions { - if let Err(e) = self.deletion_queue_client.push_layers_sync( + if let Err(e) = self.deletion_queue_client.push_layers( self.tenant_shard_id, self.timeline_id, self.generation, @@ -2154,7 +2151,6 @@ impl RemoteTimelineClient { self.generation, delete.layers.clone(), ) - .await .map_err(|e| anyhow::anyhow!(e)) } } diff --git a/pageserver/src/tenant/storage_layer.rs 
b/pageserver/src/tenant/storage_layer.rs index 3800852ccc..f9f843ef6b 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -44,7 +44,7 @@ pub(crate) use layer::{EvictionError, Layer, ResidentLayer}; use self::inmemory_layer::InMemoryLayerFileId; -use super::timeline::GetVectoredError; +use super::timeline::{GetVectoredError, ReadPath}; use super::PageReconstructError; pub fn range_overlaps(a: &Range, b: &Range) -> bool @@ -262,6 +262,8 @@ pub(crate) struct ValuesReconstructState { pub(crate) io_concurrency: IoConcurrency, num_active_ios: Arc, + + pub(crate) read_path: Option, } /// The level of IO concurrency to be used on the read path @@ -609,6 +611,7 @@ impl ValuesReconstructState { delta_layers_visited: 0, io_concurrency, num_active_ios: Arc::new(AtomicUsize::new(0)), + read_path: None, } } diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 92313afba7..40282defd4 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -353,7 +353,6 @@ impl Layer { /// while the guard exists. /// /// Returns None if the layer is currently evicted or becoming evicted. - #[cfg(test)] pub(crate) async fn keep_resident(&self) -> Option { let downloaded = self.0.inner.get().and_then(|rowe| rowe.get())?; @@ -530,7 +529,6 @@ impl ResidentOrWantedEvicted { /// This is not used on the read path (anything that calls /// [`LayerInner::get_or_maybe_download`]) because it was decided that reads always win /// evictions, and part of that winning is using [`ResidentOrWantedEvicted::get_and_upgrade`]. - #[cfg(test)] fn get(&self) -> Option> { match self { ResidentOrWantedEvicted::Resident(strong) => Some(strong.clone()), diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index d65f099182..029444e973 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -1,53 +1,83 @@ -//! This module contains functions to serve per-tenant background processes, -//! such as compaction and GC +//! This module contains per-tenant background processes, e.g. compaction and GC. 
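// Illustrative sketch (not part of this patch): the general pattern the module below uses to
// cap concurrent background work at roughly 3/4 of the Tokio worker threads, so a few workers
// stay free for short-lived tasks. The worker-thread count of 8 is an assumed value for the
// example only.
use std::cmp::max;
use std::future::Future;

use once_cell::sync::Lazy;
use tokio::sync::Semaphore;

static BACKGROUND_PERMITS: Lazy<Semaphore> = Lazy::new(|| {
    let total_threads = 8; // hypothetical worker-thread count, for illustration only
    let permits = max(1, total_threads * 3 / 4);
    assert!(permits < total_threads, "leave threads free for other work");
    Semaphore::new(permits)
});

async fn run_with_permit<F: Future<Output = ()>>(job: F) {
    // The permit is released when `_permit` is dropped at the end of the scope.
    let _permit = BACKGROUND_PERMITS
        .acquire()
        .await
        .expect("semaphore is never closed");
    job.await;
}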
-use std::ops::ControlFlow; -use std::str::FromStr; +use std::cmp::max; +use std::future::Future; +use std::ops::{ControlFlow, RangeInclusive}; +use std::pin::pin; use std::sync::Arc; use std::time::{Duration, Instant}; +use once_cell::sync::Lazy; +use rand::Rng; +use scopeguard::defer; +use tokio::sync::{Semaphore, SemaphorePermit}; +use tokio_util::sync::CancellationToken; +use tracing::*; + use crate::context::{DownloadBehavior, RequestContext}; -use crate::metrics::TENANT_TASK_EVENTS; -use crate::task_mgr; -use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME}; +use crate::metrics::{self, BackgroundLoopSemaphoreMetricsRecorder, TENANT_TASK_EVENTS}; +use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME, TOKIO_WORKER_THREADS}; use crate::tenant::throttle::Stats; use crate::tenant::timeline::compaction::CompactionOutcome; use crate::tenant::timeline::CompactionError; use crate::tenant::{Tenant, TenantState}; -use rand::Rng; -use tokio_util::sync::CancellationToken; -use tracing::*; -use utils::{backoff, completion, pausable_failpoint}; +use pageserver_api::config::tenant_conf_defaults::DEFAULT_COMPACTION_PERIOD; +use utils::backoff::exponential_backoff_duration; +use utils::completion::Barrier; +use utils::pausable_failpoint; -static CONCURRENT_BACKGROUND_TASKS: once_cell::sync::Lazy = - once_cell::sync::Lazy::new(|| { - let total_threads = task_mgr::TOKIO_WORKER_THREADS.get(); - let permits = usize::max( - 1, - // while a lot of the work is done on spawn_blocking, we still do - // repartitioning in the async context. this should give leave us some workers - // unblocked to be blocked on other work, hopefully easing any outside visible - // effects of restarts. - // - // 6/8 is a guess; previously we ran with unlimited 8 and more from - // spawn_blocking. - (total_threads * 3).checked_div(4).unwrap_or(0), - ); - assert_ne!(permits, 0, "we will not be adding in permits later"); - assert!( - permits < total_threads, - "need threads avail for shorter work" - ); - tokio::sync::Semaphore::new(permits) - }); +/// Semaphore limiting concurrent background tasks (across all tenants). +/// +/// We use 3/4 Tokio threads, to avoid blocking all threads in case we do any CPU-heavy work. +static CONCURRENT_BACKGROUND_TASKS: Lazy = Lazy::new(|| { + let total_threads = TOKIO_WORKER_THREADS.get(); + let permits = max(1, (total_threads * 3).checked_div(4).unwrap_or(0)); + assert_ne!(permits, 0, "we will not be adding in permits later"); + assert!(permits < total_threads, "need threads for other work"); + Semaphore::new(permits) +}); -#[derive(Debug, PartialEq, Eq, Clone, Copy, strum_macros::IntoStaticStr, enum_map::Enum)] +/// Semaphore limiting concurrent L0 compaction tasks (across all tenants). This is only used if +/// both `compaction_l0_semaphore` and `compaction_l0_first` are enabled. +/// +/// This is a separate semaphore from background tasks, because L0 compaction needs to be responsive +/// to avoid high read amp during heavy write workloads. Regular image/GC compaction is less +/// important (e.g. due to page images in delta layers) and can wait for other background tasks. +/// +/// We use 3/4 Tokio threads, to avoid blocking all threads in case we do any CPU-heavy work. Note +/// that this runs on the same Tokio runtime as `CONCURRENT_BACKGROUND_TASKS`, and shares the same +/// thread pool. 
+static CONCURRENT_L0_COMPACTION_TASKS: Lazy = Lazy::new(|| { + let total_threads = TOKIO_WORKER_THREADS.get(); + let permits = max(1, (total_threads * 3).checked_div(4).unwrap_or(0)); + assert_ne!(permits, 0, "we will not be adding in permits later"); + assert!(permits < total_threads, "need threads for other work"); + Semaphore::new(permits) +}); + +/// Background jobs. +/// +/// NB: not all of these acquire a CONCURRENT_BACKGROUND_TASKS semaphore permit, only the ones that +/// do any significant IO or CPU work. +#[derive( + Debug, + PartialEq, + Eq, + Clone, + Copy, + strum_macros::IntoStaticStr, + strum_macros::Display, + enum_map::Enum, +)] #[strum(serialize_all = "snake_case")] pub(crate) enum BackgroundLoopKind { + /// L0Compaction runs as a separate pass within the Compaction loop, not a separate loop. It is + /// used to request the `CONCURRENT_L0_COMPACTION_TASKS` semaphore and associated metrics. + L0Compaction, Compaction, Gc, Eviction, - IngestHouseKeeping, + TenantHouseKeeping, ConsumptionMetricsCollectMetrics, ConsumptionMetricsSyntheticSizeWorker, InitialLogicalSizeCalculation, @@ -55,36 +85,41 @@ pub(crate) enum BackgroundLoopKind { SecondaryDownload, } -impl BackgroundLoopKind { - fn as_static_str(&self) -> &'static str { - self.into() - } +pub struct BackgroundLoopSemaphorePermit<'a> { + _permit: SemaphorePermit<'static>, + _recorder: BackgroundLoopSemaphoreMetricsRecorder<'a>, } -/// Cancellation safe. -pub(crate) async fn concurrent_background_tasks_rate_limit_permit( +/// Acquires a semaphore permit, to limit concurrent background jobs. +pub(crate) async fn acquire_concurrency_permit( loop_kind: BackgroundLoopKind, _ctx: &RequestContext, -) -> tokio::sync::SemaphorePermit<'static> { - let _guard = crate::metrics::BACKGROUND_LOOP_SEMAPHORE.measure_acquisition(loop_kind); +) -> BackgroundLoopSemaphorePermit<'static> { + let mut recorder = metrics::BACKGROUND_LOOP_SEMAPHORE.record(loop_kind); if loop_kind == BackgroundLoopKind::InitialLogicalSizeCalculation { pausable_failpoint!("initial-size-calculation-permit-pause"); } // TODO: assert that we run on BACKGROUND_RUNTIME; requires tokio_unstable Handle::id(); - match CONCURRENT_BACKGROUND_TASKS.acquire().await { - Ok(permit) => permit, - Err(_closed) => unreachable!("we never close the semaphore"), + let semaphore = match loop_kind { + BackgroundLoopKind::L0Compaction => &CONCURRENT_L0_COMPACTION_TASKS, + _ => &CONCURRENT_BACKGROUND_TASKS, + }; + let permit = semaphore.acquire().await.expect("should never close"); + + recorder.acquired(); + + BackgroundLoopSemaphorePermit { + _permit: permit, + _recorder: recorder, } } -/// Start per tenant background loops: compaction and gc. -pub fn start_background_loops( - tenant: &Arc, - background_jobs_can_start: Option<&completion::Barrier>, -) { +/// Start per tenant background loops: compaction, GC, and ingest housekeeping. +pub fn start_background_loops(tenant: &Arc, can_start: Option<&Barrier>) { let tenant_shard_id = tenant.tenant_shard_id; + task_mgr::spawn( BACKGROUND_RUNTIME.handle(), TaskKind::Compaction, @@ -93,13 +128,15 @@ pub fn start_background_loops( &format!("compactor for tenant {tenant_shard_id}"), { let tenant = Arc::clone(tenant); - let background_jobs_can_start = background_jobs_can_start.cloned(); + let can_start = can_start.cloned(); async move { - let cancel = task_mgr::shutdown_token(); + let cancel = task_mgr::shutdown_token(); // NB: must be in async context tokio::select! 
{ - _ = cancel.cancelled() => { return Ok(()) }, - _ = completion::Barrier::maybe_wait(background_jobs_can_start) => {} + _ = cancel.cancelled() => return Ok(()), + _ = Barrier::maybe_wait(can_start) => {} }; + TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); + defer!(TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc()); compaction_loop(tenant, cancel) // If you rename this span, change the RUST_LOG env variable in test_runner/performance/test_branch_creation.py .instrument(info_span!("compaction_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug())) @@ -108,6 +145,7 @@ pub fn start_background_loops( } }, ); + task_mgr::spawn( BACKGROUND_RUNTIME.handle(), TaskKind::GarbageCollector, @@ -116,13 +154,15 @@ pub fn start_background_loops( &format!("garbage collector for tenant {tenant_shard_id}"), { let tenant = Arc::clone(tenant); - let background_jobs_can_start = background_jobs_can_start.cloned(); + let can_start = can_start.cloned(); async move { - let cancel = task_mgr::shutdown_token(); + let cancel = task_mgr::shutdown_token(); // NB: must be in async context tokio::select! { - _ = cancel.cancelled() => { return Ok(()) }, - _ = completion::Barrier::maybe_wait(background_jobs_can_start) => {} + _ = cancel.cancelled() => return Ok(()), + _ = Barrier::maybe_wait(can_start) => {} }; + TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); + defer!(TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc()); gc_loop(tenant, cancel) .instrument(info_span!("gc_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug())) .await; @@ -133,21 +173,23 @@ pub fn start_background_loops( task_mgr::spawn( BACKGROUND_RUNTIME.handle(), - TaskKind::IngestHousekeeping, + TaskKind::TenantHousekeeping, tenant_shard_id, None, - &format!("ingest housekeeping for tenant {tenant_shard_id}"), + &format!("housekeeping for tenant {tenant_shard_id}"), { let tenant = Arc::clone(tenant); - let background_jobs_can_start = background_jobs_can_start.cloned(); + let can_start = can_start.cloned(); async move { - let cancel = task_mgr::shutdown_token(); + let cancel = task_mgr::shutdown_token(); // NB: must be in async context tokio::select! { - _ = cancel.cancelled() => { return Ok(()) }, - _ = completion::Barrier::maybe_wait(background_jobs_can_start) => {} + _ = cancel.cancelled() => return Ok(()), + _ = Barrier::maybe_wait(can_start) => {} }; - ingest_housekeeping_loop(tenant, cancel) - .instrument(info_span!("ingest_housekeeping_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug())) + TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); + defer!(TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc()); + tenant_housekeeping_loop(tenant, cancel) + .instrument(info_span!("tenant_housekeeping_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug())) .await; Ok(()) } @@ -155,372 +197,292 @@ pub fn start_background_loops( ); } -/// -/// Compaction task's main loop -/// +/// Compaction task's main loop. 
async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { + const BASE_BACKOFF_SECS: f64 = 1.0; const MAX_BACKOFF_SECS: f64 = 300.0; - // How many errors we have seen consequtively - let mut error_run_count = 0; + const RECHECK_CONFIG_INTERVAL: Duration = Duration::from_secs(10); - TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); - async { - let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download); - let mut first = true; - loop { + let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download); + let mut period = tenant.get_compaction_period(); + let mut error_run = 0; // consecutive errors + + // Stagger the compaction loop across tenants. + if wait_for_active_tenant(&tenant, &cancel).await.is_break() { + return; + } + if sleep_random(period, &cancel).await.is_err() { + return; + } + + loop { + // Recheck that we're still active. + if wait_for_active_tenant(&tenant, &cancel).await.is_break() { + return; + } + + // Refresh the period. If compaction is disabled, check again in a bit. + period = tenant.get_compaction_period(); + if period == Duration::ZERO { + #[cfg(not(feature = "testing"))] + info!("automatic compaction is disabled"); tokio::select! { - _ = cancel.cancelled() => { - return; - }, - tenant_wait_result = wait_for_active_tenant(&tenant) => match tenant_wait_result { - ControlFlow::Break(()) => return, - ControlFlow::Continue(()) => (), - }, + _ = tokio::time::sleep(RECHECK_CONFIG_INTERVAL) => {}, + _ = cancel.cancelled() => return, } + continue; + } - let period = tenant.get_compaction_period(); + // Wait for the next compaction run. + let backoff = exponential_backoff_duration(error_run, BASE_BACKOFF_SECS, MAX_BACKOFF_SECS); + tokio::select! { + _ = tokio::time::sleep(backoff), if error_run > 0 => {}, + _ = tokio::time::sleep(period), if error_run == 0 => {}, + _ = tenant.l0_compaction_trigger.notified(), if error_run == 0 => {}, + _ = cancel.cancelled() => return, + } - // TODO: we shouldn't need to await to find tenant and this could be moved outside of - // loop, #3501. There are also additional "allowed_errors" in tests. - if first { - first = false; - if random_init_delay(period, &cancel).await.is_err() { - break; + // Run compaction. + let iteration = Iteration { + started_at: Instant::now(), + period, + kind: BackgroundLoopKind::Compaction, + }; + let IterationResult { output, elapsed } = iteration + .run(tenant.compaction_iteration(&cancel, &ctx)) + .await; + + match output { + Ok(outcome) => { + error_run = 0; + // If there's more compaction work, L0 or not, schedule an immediate run. + match outcome { + CompactionOutcome::Done => {} + CompactionOutcome::Skipped => {} + CompactionOutcome::YieldForL0 => tenant.l0_compaction_trigger.notify_one(), + CompactionOutcome::Pending => tenant.l0_compaction_trigger.notify_one(), } } - let sleep_duration; - if period == Duration::ZERO { - #[cfg(not(feature = "testing"))] - info!("automatic compaction is disabled"); - // check again in 10 seconds, in case it's been enabled again. 
- sleep_duration = Duration::from_secs(10) - } else { - let iteration = Iteration { - started_at: Instant::now(), - period, - kind: BackgroundLoopKind::Compaction, - }; - - // Run compaction - let IterationResult { output, elapsed } = iteration - .run(tenant.compaction_iteration(&cancel, &ctx)) - .await; - match output { - Ok(outcome) => { - error_run_count = 0; - // schedule the next compaction immediately in case there is a pending compaction task - sleep_duration = if let CompactionOutcome::Pending = outcome { - Duration::from_secs(1) - } else { - period - }; - } - Err(e) => { - let wait_duration = backoff::exponential_backoff_duration_seconds( - error_run_count + 1, - 1.0, - MAX_BACKOFF_SECS, - ); - error_run_count += 1; - let wait_duration = Duration::from_secs_f64(wait_duration); - log_compaction_error( - &e, - error_run_count, - &wait_duration, - cancel.is_cancelled(), - ); - sleep_duration = wait_duration; - } - } - - // the duration is recorded by performance tests by enabling debug in this function - tracing::debug!( - elapsed_ms = elapsed.as_millis(), - "compaction iteration complete" - ); - }; - - // Perhaps we did no work and the walredo process has been idle for some time: - // give it a chance to shut down to avoid leaving walredo process running indefinitely. - // TODO: move this to a separate task (housekeeping loop) that isn't affected by the back-off, - // so we get some upper bound guarantee on when walredo quiesce / this throttling reporting here happens. - if let Some(walredo_mgr) = &tenant.walredo_mgr { - walredo_mgr.maybe_quiesce(period * 10); - } - - // Sleep - if tokio::time::timeout(sleep_duration, cancel.cancelled()) - .await - .is_ok() - { - break; + Err(err) => { + error_run += 1; + let backoff = + exponential_backoff_duration(error_run, BASE_BACKOFF_SECS, MAX_BACKOFF_SECS); + log_compaction_error(&err, error_run, backoff, cancel.is_cancelled()); + continue; } } + + // NB: this log entry is recorded by performance tests. 
+ debug!( + elapsed_ms = elapsed.as_millis(), + "compaction iteration complete" + ); } - .await; - TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc(); } fn log_compaction_error( - e: &CompactionError, - error_run_count: u32, - sleep_duration: &std::time::Duration, + err: &CompactionError, + error_count: u32, + sleep_duration: Duration, task_cancelled: bool, ) { use crate::tenant::upload_queue::NotInitialized; use crate::tenant::PageReconstructError; use CompactionError::*; - enum LooksLike { - Info, - Error, - } + let level = match err { + ShuttingDown => return, + Offload(_) => Level::ERROR, + _ if task_cancelled => Level::INFO, + Other(err) => { + let root_cause = err.root_cause(); - let decision = match e { - ShuttingDown => None, - Offload(_) => Some(LooksLike::Error), - _ if task_cancelled => Some(LooksLike::Info), - Other(e) => { - let root_cause = e.root_cause(); - - let is_stopping = { - let upload_queue = root_cause - .downcast_ref::() - .is_some_and(|e| e.is_stopping()); - - let timeline = root_cause - .downcast_ref::() - .is_some_and(|e| e.is_stopping()); - - upload_queue || timeline - }; + let upload_queue = root_cause + .downcast_ref::() + .is_some_and(|e| e.is_stopping()); + let timeline = root_cause + .downcast_ref::() + .is_some_and(|e| e.is_stopping()); + let is_stopping = upload_queue || timeline; if is_stopping { - Some(LooksLike::Info) + Level::INFO } else { - Some(LooksLike::Error) + Level::ERROR } } }; - match decision { - Some(LooksLike::Info) => info!( - "Compaction failed {error_run_count} times, retrying in {sleep_duration:?}: {e:#}", - ), - Some(LooksLike::Error) => error!( - "Compaction failed {error_run_count} times, retrying in {sleep_duration:?}: {e:?}", - ), - None => {} + match level { + Level::ERROR => { + error!("Compaction failed {error_count} times, retrying in {sleep_duration:?}: {err:#}") + } + Level::INFO => { + info!("Compaction failed {error_count} times, retrying in {sleep_duration:?}: {err:#}") + } + level => unimplemented!("unexpected level {level:?}"), } } -/// -/// GC task's main loop -/// +/// GC task's main loop. async fn gc_loop(tenant: Arc, cancel: CancellationToken) { const MAX_BACKOFF_SECS: f64 = 300.0; - // How many errors we have seen consequtively - let mut error_run_count = 0; + let mut error_run = 0; // consecutive errors - TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); - async { - // GC might require downloading, to find the cutoff LSN that corresponds to the - // cutoff specified as time. - let ctx = - RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download); + // GC might require downloading, to find the cutoff LSN that corresponds to the + // cutoff specified as time. + let ctx = RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download); + let mut first = true; - let mut first = true; - loop { - tokio::select! 
{ - _ = cancel.cancelled() => { - return; - }, - tenant_wait_result = wait_for_active_tenant(&tenant) => match tenant_wait_result { - ControlFlow::Break(()) => return, - ControlFlow::Continue(()) => (), - }, - } + loop { + if wait_for_active_tenant(&tenant, &cancel).await.is_break() { + return; + } - let period = tenant.get_gc_period(); + let period = tenant.get_gc_period(); - if first { - first = false; - - let delays = async { - random_init_delay(period, &cancel).await?; - Ok::<_, Cancelled>(()) - }; - - if delays.await.is_err() { - break; - } - } - - let gc_horizon = tenant.get_gc_horizon(); - let sleep_duration; - if period == Duration::ZERO || gc_horizon == 0 { - #[cfg(not(feature = "testing"))] - info!("automatic GC is disabled"); - // check again in 10 seconds, in case it's been enabled again. - sleep_duration = Duration::from_secs(10); - } else { - let iteration = Iteration { - started_at: Instant::now(), - period, - kind: BackgroundLoopKind::Gc, - }; - // Run gc - let IterationResult { output, elapsed: _ } = - iteration.run(tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &cancel, &ctx)) - .await; - match output { - Ok(_) => { - error_run_count = 0; - sleep_duration = period; - } - Err(crate::tenant::GcError::TenantCancelled) => { - return; - } - Err(e) => { - let wait_duration = backoff::exponential_backoff_duration_seconds( - error_run_count + 1, - 1.0, - MAX_BACKOFF_SECS, - ); - error_run_count += 1; - let wait_duration = Duration::from_secs_f64(wait_duration); - - if matches!(e, crate::tenant::GcError::TimelineCancelled) { - // Timeline was cancelled during gc. We might either be in an event - // that affects the entire tenant (tenant deletion, pageserver shutdown), - // or in one that affects the timeline only (timeline deletion). - // Therefore, don't exit the loop. - info!("Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}"); - } else { - error!("Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}"); - } - - sleep_duration = wait_duration; - } - } - }; - - if tokio::time::timeout(sleep_duration, cancel.cancelled()) - .await - .is_ok() - { + if first { + first = false; + if sleep_random(period, &cancel).await.is_err() { break; } } - } - .await; - TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc(); -} - -async fn ingest_housekeeping_loop(tenant: Arc, cancel: CancellationToken) { - TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); - async { - let mut last_throttle_flag_reset_at = Instant::now(); - loop { - tokio::select! { - _ = cancel.cancelled() => { - return; - }, - tenant_wait_result = wait_for_active_tenant(&tenant) => match tenant_wait_result { - ControlFlow::Break(()) => return, - ControlFlow::Continue(()) => (), - }, - } - - // We run ingest housekeeping with the same frequency as compaction: it is not worth - // having a distinct setting. But we don't run it in the same task, because compaction - // blocks on acquiring the background job semaphore. 
- let period = tenant.get_compaction_period(); - - // If compaction period is set to zero (to disable it), then we will use a reasonable default - let period = if period == Duration::ZERO { - humantime::Duration::from_str( - pageserver_api::config::tenant_conf_defaults::DEFAULT_COMPACTION_PERIOD, - ) - .unwrap() - .into() - } else { - period - }; - - // Jitter the period by +/- 5% - let period = - rand::thread_rng().gen_range((period * (95)) / 100..(period * (105)) / 100); - - // Always sleep first: we do not need to do ingest housekeeping early in the lifetime of - // a tenant, since it won't have started writing any ephemeral files yet. - if tokio::time::timeout(period, cancel.cancelled()) - .await - .is_ok() - { - break; - } + let gc_horizon = tenant.get_gc_horizon(); + let sleep_duration; + if period == Duration::ZERO || gc_horizon == 0 { + #[cfg(not(feature = "testing"))] + info!("automatic GC is disabled"); + // check again in 10 seconds, in case it's been enabled again. + sleep_duration = Duration::from_secs(10); + } else { let iteration = Iteration { started_at: Instant::now(), period, - kind: BackgroundLoopKind::IngestHouseKeeping, + kind: BackgroundLoopKind::Gc, }; - iteration.run(tenant.ingest_housekeeping()).await; - - // TODO: rename the background loop kind to something more generic, like, tenant housekeeping. - // Or just spawn another background loop for this throttle, it's not like it's super costly. - info_span!(parent: None, "pagestream_throttle", tenant_id=%tenant.tenant_shard_id, shard_id=%tenant.tenant_shard_id.shard_slug()).in_scope(|| { - let now = Instant::now(); - let prev = std::mem::replace(&mut last_throttle_flag_reset_at, now); - let Stats { count_accounted_start, count_accounted_finish, count_throttled, sum_throttled_usecs} = tenant.pagestream_throttle.reset_stats(); - if count_throttled == 0 { + // Run gc + let IterationResult { output, elapsed: _ } = iteration + .run(tenant.gc_iteration( + None, + gc_horizon, + tenant.get_pitr_interval(), + &cancel, + &ctx, + )) + .await; + match output { + Ok(_) => { + error_run = 0; + sleep_duration = period; + } + Err(crate::tenant::GcError::TenantCancelled) => { return; } - let allowed_rps = tenant.pagestream_throttle.steady_rps(); - let delta = now - prev; - info!( - n_seconds=%format_args!("{:.3}", delta.as_secs_f64()), - count_accounted = count_accounted_finish, // don't break existing log scraping - count_throttled, - sum_throttled_usecs, - count_accounted_start, // log after pre-existing fields to not break existing log scraping - allowed_rps=%format_args!("{allowed_rps:.0}"), - "shard was throttled in the last n_seconds" - ); - }); - } - } - .await; - TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc(); -} + Err(e) => { + error_run += 1; + let wait_duration = + exponential_backoff_duration(error_run, 1.0, MAX_BACKOFF_SECS); -async fn wait_for_active_tenant(tenant: &Arc) -> ControlFlow<()> { - // if the tenant has a proper status already, no need to wait for anything - if tenant.current_state() == TenantState::Active { - ControlFlow::Continue(()) - } else { - let mut tenant_state_updates = tenant.subscribe_for_state_updates(); - loop { - match tenant_state_updates.changed().await { - Ok(()) => { - let new_state = &*tenant_state_updates.borrow(); - match new_state { - TenantState::Active => { - debug!("Tenant state changed to active, continuing the task loop"); - return ControlFlow::Continue(()); - } - state => { - debug!("Not running the task loop, tenant is not active: {state:?}"); - continue; - } + if 
matches!(e, crate::tenant::GcError::TimelineCancelled) { + // Timeline was cancelled during gc. We might either be in an event + // that affects the entire tenant (tenant deletion, pageserver shutdown), + // or in one that affects the timeline only (timeline deletion). + // Therefore, don't exit the loop. + info!("Gc failed {error_run} times, retrying in {wait_duration:?}: {e:?}"); + } else { + error!("Gc failed {error_run} times, retrying in {wait_duration:?}: {e:?}"); } - } - Err(_sender_dropped_error) => { - return ControlFlow::Break(()); + + sleep_duration = wait_duration; } } + }; + + if tokio::time::timeout(sleep_duration, cancel.cancelled()) + .await + .is_ok() + { + break; + } + } +} + +/// Tenant housekeeping's main loop. +async fn tenant_housekeeping_loop(tenant: Arc, cancel: CancellationToken) { + let mut last_throttle_flag_reset_at = Instant::now(); + loop { + if wait_for_active_tenant(&tenant, &cancel).await.is_break() { + return; + } + + // Use the same period as compaction; it's not worth a separate setting. But if it's set to + // zero (to disable compaction), then use a reasonable default. Jitter it by 5%. + let period = match tenant.get_compaction_period() { + Duration::ZERO => humantime::parse_duration(DEFAULT_COMPACTION_PERIOD).unwrap(), + period => period, + }; + + let Ok(period) = sleep_jitter(period, period * 5 / 100, &cancel).await else { + break; + }; + + // Do tenant housekeeping. + let iteration = Iteration { + started_at: Instant::now(), + period, + kind: BackgroundLoopKind::TenantHouseKeeping, + }; + iteration.run(tenant.housekeeping()).await; + + // Log any getpage throttling. + info_span!(parent: None, "pagestream_throttle", tenant_id=%tenant.tenant_shard_id, shard_id=%tenant.tenant_shard_id.shard_slug()).in_scope(|| { + let now = Instant::now(); + let prev = std::mem::replace(&mut last_throttle_flag_reset_at, now); + let Stats { count_accounted_start, count_accounted_finish, count_throttled, sum_throttled_usecs} = tenant.pagestream_throttle.reset_stats(); + if count_throttled == 0 { + return; + } + let allowed_rps = tenant.pagestream_throttle.steady_rps(); + let delta = now - prev; + info!( + n_seconds=%format_args!("{:.3}", delta.as_secs_f64()), + count_accounted = count_accounted_finish, // don't break existing log scraping + count_throttled, + sum_throttled_usecs, + count_accounted_start, // log after pre-existing fields to not break existing log scraping + allowed_rps=%format_args!("{allowed_rps:.0}"), + "shard was throttled in the last n_seconds" + ); + }); + } +} + +/// Waits until the tenant becomes active, or returns `ControlFlow::Break()` to shut down. +async fn wait_for_active_tenant( + tenant: &Arc, + cancel: &CancellationToken, +) -> ControlFlow<()> { + if tenant.current_state() == TenantState::Active { + return ControlFlow::Continue(()); + } + + let mut update_rx = tenant.subscribe_for_state_updates(); + loop { + tokio::select! { + _ = cancel.cancelled() => return ControlFlow::Break(()), + result = update_rx.changed() => if result.is_err() { + return ControlFlow::Break(()); + } + } + + match &*update_rx.borrow() { + TenantState::Active => { + debug!("Tenant state changed to active, continuing the task loop"); + return ControlFlow::Continue(()); + } + state => debug!("Not running the task loop, tenant is not active: {state:?}"), } } } @@ -529,26 +491,41 @@ async fn wait_for_active_tenant(tenant: &Arc) -> ControlFlow<()> { #[error("cancelled")] pub(crate) struct Cancelled; -/// Provide a random delay for background task initialization. 
+/// Sleeps for a random interval up to the given max value. /// /// This delay prevents a thundering herd of background tasks and will likely keep them running on /// different periods for more stable load. -pub(crate) async fn random_init_delay( - period: Duration, +pub(crate) async fn sleep_random( + max: Duration, cancel: &CancellationToken, -) -> Result<(), Cancelled> { - if period == Duration::ZERO { - return Ok(()); - } +) -> Result { + sleep_random_range(Duration::ZERO..=max, cancel).await +} - let d = { - let mut rng = rand::thread_rng(); - rng.gen_range(Duration::ZERO..=period) - }; - match tokio::time::timeout(d, cancel.cancelled()).await { - Ok(_) => Err(Cancelled), - Err(_) => Ok(()), +/// Sleeps for a random interval in the given range. Returns the duration. +pub(crate) async fn sleep_random_range( + interval: RangeInclusive, + cancel: &CancellationToken, +) -> Result { + let delay = rand::thread_rng().gen_range(interval); + if delay == Duration::ZERO { + return Ok(delay); } + tokio::select! { + _ = cancel.cancelled() => Err(Cancelled), + _ = tokio::time::sleep(delay) => Ok(delay), + } +} + +/// Sleeps for an interval with a random jitter. +pub(crate) async fn sleep_jitter( + duration: Duration, + jitter: Duration, + cancel: &CancellationToken, +) -> Result { + let from = duration.saturating_sub(jitter); + let to = duration.saturating_add(jitter); + sleep_random_range(from..=to, cancel).await } struct Iteration { @@ -564,42 +541,25 @@ struct IterationResult { impl Iteration { #[instrument(skip_all)] - pub(crate) async fn run(self, fut: Fut) -> IterationResult - where - Fut: std::future::Future, - { - let Self { - started_at, - period, - kind, - } = self; - - let mut fut = std::pin::pin!(fut); + pub(crate) async fn run, O>(self, fut: F) -> IterationResult { + let mut fut = pin!(fut); // Wrap `fut` into a future that logs a message every `period` so that we get a // very obvious breadcrumb in the logs _while_ a slow iteration is happening. - let liveness_logger = async move { - loop { - match tokio::time::timeout(period, &mut fut).await { - Ok(x) => return x, - Err(_) => { - // info level as per the same rationale why warn_when_period_overrun is info - // => https://github.com/neondatabase/neon/pull/5724 - info!("still running"); - } - } + let output = loop { + match tokio::time::timeout(self.period, &mut fut).await { + Ok(r) => break r, + Err(_) => info!("still running"), } }; - - let output = liveness_logger.await; - - let elapsed = started_at.elapsed(); - warn_when_period_overrun(elapsed, period, kind); + let elapsed = self.started_at.elapsed(); + warn_when_period_overrun(elapsed, self.period, self.kind); IterationResult { output, elapsed } } } -/// Attention: the `task` and `period` beocme labels of a pageserver-wide prometheus metric. + +// NB: the `task` and `period` are used for metrics labels. 
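// Illustrative sketch (not part of this patch): the cancellable, jittered sleep pattern that
// `sleep_random`/`sleep_jitter` above implement, restated here as a self-contained function.
// A random delay staggers the per-tenant loops so they do not all wake at the same instant.
use std::time::Duration;

use rand::Rng;
use tokio_util::sync::CancellationToken;

/// Sleeps for a random duration in `[period - jitter, period + jitter]`, or returns `Err(())`
/// if cancelled first. The real code returns a dedicated `Cancelled` error type instead of `()`.
async fn staggered_sleep(
    period: Duration,
    jitter: Duration,
    cancel: &CancellationToken,
) -> Result<Duration, ()> {
    let from = period.saturating_sub(jitter);
    let to = period.saturating_add(jitter);
    let delay = rand::thread_rng().gen_range(from..=to);
    tokio::select! {
        _ = cancel.cancelled() => Err(()),
        _ = tokio::time::sleep(delay) => Ok(delay),
    }
}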
pub(crate) fn warn_when_period_overrun( elapsed: Duration, period: Duration, @@ -616,8 +576,8 @@ pub(crate) fn warn_when_period_overrun( ?task, "task iteration took longer than the configured period" ); - crate::metrics::BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT - .with_label_values(&[task.as_static_str(), &format!("{}", period.as_secs())]) + metrics::BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT + .with_label_values(&[task.into(), &format!("{}", period.as_secs())]) .inc(); } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index b6a349a209..aa71ccbbab 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -45,13 +45,12 @@ use rand::Rng; use remote_storage::DownloadError; use serde_with::serde_as; use storage_broker::BrokerClientChannel; +use tokio::runtime::Handle; use tokio::sync::mpsc::Sender; -use tokio::{ - runtime::Handle, - sync::{oneshot, watch}, -}; +use tokio::sync::{oneshot, watch, Notify}; use tokio_util::sync::CancellationToken; use tracing::*; +use utils::critical; use utils::rate_limit::RateLimit; use utils::{ fs_ext, @@ -226,6 +225,7 @@ pub struct TimelineResources { pub remote_client: RemoteTimelineClient, pub pagestream_throttle: Arc, pub pagestream_throttle_metrics: Arc, + pub l0_compaction_trigger: Arc, pub l0_flush_global_state: l0_flush::L0FlushGlobalState, } @@ -425,6 +425,9 @@ pub struct Timeline { /// If true, the last compaction failed. compaction_failed: AtomicBool, + /// Notifies the tenant compaction loop that there is pending L0 compaction work. + l0_compaction_trigger: Arc, + /// Make sure we only have one running gc at a time. /// /// Must only be taken in two places: @@ -531,6 +534,9 @@ impl GcInfo { pub(super) fn remove_child_offloaded(&mut self, child_id: TimelineId) -> bool { self.remove_child_maybe_offloaded(child_id, MaybeOffloaded::Yes) } + pub(crate) fn lsn_covered_by_lease(&self, lsn: Lsn) -> bool { + self.leases.contains_key(&lsn) + } } /// The `GcInfo` component describing which Lsns need to be retained. Functionally, this @@ -622,6 +628,71 @@ impl From for GetVectoredError { } } +/// A layer identifier when used in the [`ReadPath`] structure. This enum is for observability purposes +/// only and not used by the "real read path". 
+pub enum ReadPathLayerId { + PersistentLayer(PersistentLayerKey), + InMemoryLayer(Range), +} + +impl std::fmt::Display for ReadPathLayerId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + ReadPathLayerId::PersistentLayer(key) => write!(f, "{}", key), + ReadPathLayerId::InMemoryLayer(range) => { + write!(f, "in-mem {}..{}", range.start, range.end) + } + } + } +} +pub struct ReadPath { + keyspace: KeySpace, + lsn: Lsn, + path: Vec<(ReadPathLayerId, KeySpace, Range)>, +} + +impl ReadPath { + pub fn new(keyspace: KeySpace, lsn: Lsn) -> Self { + Self { + keyspace, + lsn, + path: Vec::new(), + } + } + + pub fn record_layer_visit( + &mut self, + layer_to_read: &ReadableLayer, + keyspace_to_read: &KeySpace, + lsn_range: &Range, + ) { + let id = match layer_to_read { + ReadableLayer::PersistentLayer(layer) => { + ReadPathLayerId::PersistentLayer(layer.layer_desc().key()) + } + ReadableLayer::InMemoryLayer(layer) => { + ReadPathLayerId::InMemoryLayer(layer.get_lsn_range()) + } + }; + self.path + .push((id, keyspace_to_read.clone(), lsn_range.clone())); + } +} + +impl std::fmt::Display for ReadPath { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + writeln!(f, "Read path for {} at lsn {}:", self.keyspace, self.lsn)?; + for (idx, (layer_id, keyspace, lsn_range)) in self.path.iter().enumerate() { + writeln!( + f, + "{}: {} {}..{} {}", + idx, layer_id, lsn_range.start, lsn_range.end, keyspace + )?; + } + Ok(()) + } +} + #[derive(thiserror::Error)] pub struct MissingKeyError { key: Key, @@ -629,6 +700,8 @@ pub struct MissingKeyError { cont_lsn: Lsn, request_lsn: Lsn, ancestor_lsn: Option, + /// Debug information about the read path if there's an error + read_path: Option, backtrace: Option, } @@ -645,10 +718,15 @@ impl std::fmt::Display for MissingKeyError { "could not find data for key {} (shard {:?}) at LSN {}, request LSN {}", self.key, self.shard, self.cont_lsn, self.request_lsn )?; + if let Some(ref ancestor_lsn) = self.ancestor_lsn { write!(f, ", ancestor {}", ancestor_lsn)?; } + if let Some(ref read_path) = self.read_path { + write!(f, "\n{}", read_path)?; + } + if let Some(ref backtrace) = self.backtrace { write!(f, "\n{}", backtrace)?; } @@ -798,8 +876,12 @@ pub(crate) enum CompactFlags { ForceRepartition, ForceImageLayerCreation, ForceL0Compaction, + OnlyL0Compaction, EnhancedGcBottomMostCompaction, DryRun, + /// Disables compaction yielding e.g. due to high L0 count. This is set e.g. when requesting + /// compaction via HTTP API. 
+ NoYield, } #[serde_with::serde_as] @@ -1065,6 +1147,7 @@ impl Timeline { request_lsn: lsn, ancestor_lsn: None, backtrace: None, + read_path: None, })), } } @@ -1191,6 +1274,13 @@ impl Timeline { reconstruct_state: &mut ValuesReconstructState, ctx: &RequestContext, ) -> Result>, GetVectoredError> { + let read_path = if self.conf.enable_read_path_debugging { + Some(ReadPath::new(keyspace.clone(), lsn)) + } else { + None + }; + reconstruct_state.read_path = read_path; + let traversal_res: Result<(), _> = self .get_vectored_reconstruct_data(keyspace.clone(), lsn, reconstruct_state, ctx) .await; @@ -1467,6 +1557,7 @@ impl Timeline { let lsn = xlog_utils::normalize_lsn(lsn, WAL_SEGMENT_SIZE); let mut gc_info = self.gc_info.write().unwrap(); + let planned_cutoff = gc_info.min_cutoff(); let valid_until = SystemTime::now() + length; @@ -1487,7 +1578,7 @@ impl Timeline { existing_lease.clone() } Entry::Vacant(vacant) => { - // Reject already GC-ed LSN (lsn < latest_gc_cutoff) if we are in AttachedSingle and + // Reject already GC-ed LSN if we are in AttachedSingle and // not blocked by the lsn lease deadline. let validate = { let conf = self.tenant_conf.load(); @@ -1498,7 +1589,10 @@ impl Timeline { if init || validate { let latest_gc_cutoff_lsn = self.get_latest_gc_cutoff_lsn(); if lsn < *latest_gc_cutoff_lsn { - bail!("tried to request a page version that was garbage collected. requested at {} gc cutoff {}", lsn, *latest_gc_cutoff_lsn); + bail!("tried to request an lsn lease for an lsn below the latest gc cutoff. requested at {} gc cutoff {}", lsn, *latest_gc_cutoff_lsn); + } + if lsn < planned_cutoff { + bail!("tried to request an lsn lease for an lsn below the planned gc cutoff. requested at {} planned gc cutoff {}", lsn, planned_cutoff); } } @@ -1700,35 +1794,48 @@ impl Timeline { .await } - /// Outermost timeline compaction operation; downloads needed layers. Returns whether we have pending - /// compaction tasks. + /// Outermost timeline compaction operation; downloads needed layers. + /// + /// NB: the cancellation token is usually from a background task, but can also come from a + /// request task. pub(crate) async fn compact_with_options( self: &Arc, cancel: &CancellationToken, options: CompactOptions, ctx: &RequestContext, ) -> Result { - // most likely the cancellation token is from background task, but in tests it could be the - // request task as well. + // Acquire the compaction lock and task semaphore. + // + // L0-only compaction uses a separate semaphore (if enabled) to make sure it isn't starved + // out by other background tasks (including image compaction). We request this via + // `BackgroundLoopKind::L0Compaction`. + // + // If this is a regular compaction pass, and L0-only compaction is enabled in the config, + // then we should yield for immediate L0 compaction if necessary while we're waiting for the + // background task semaphore. There's no point yielding otherwise, since we'd just end up + // right back here. 
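// Illustrative sketch (not part of this patch): how a compaction pass can wait for a semaphore
// permit while staying responsive to an L0 "work pending" notification, as the comment above
// describes. Whichever future completes first wins the `select!`; names here are hypothetical.
use tokio::sync::{Notify, Semaphore};

enum Acquired<'a> {
    Permit(tokio::sync::SemaphorePermit<'a>),
    YieldForL0,
}

async fn acquire_or_yield<'a>(semaphore: &'a Semaphore, l0_trigger: &Notify) -> Acquired<'a> {
    tokio::select! {
        permit = semaphore.acquire() => {
            Acquired::Permit(permit.expect("semaphore is never closed"))
        }
        // Pending L0 work: stop waiting and let the caller yield to an L0-only pass.
        _ = l0_trigger.notified() => Acquired::YieldForL0,
    }
}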
+ let is_l0_only = options.flags.contains(CompactFlags::OnlyL0Compaction); + let semaphore_kind = match is_l0_only && self.get_compaction_l0_semaphore() { + true => BackgroundLoopKind::L0Compaction, + false => BackgroundLoopKind::Compaction, + }; + let yield_for_l0 = !is_l0_only + && self.get_compaction_l0_first() + && !options.flags.contains(CompactFlags::NoYield); - let prepare = async move { + let acquire = async move { let guard = self.compaction_lock.lock().await; - - let permit = super::tasks::concurrent_background_tasks_rate_limit_permit( - BackgroundLoopKind::Compaction, - ctx, - ) - .await; - + let permit = super::tasks::acquire_concurrency_permit(semaphore_kind, ctx).await; (guard, permit) }; - // this wait probably never needs any "long time spent" logging, because we already nag if - // compaction task goes over it's period (20s) which is quite often in production. let (_guard, _permit) = tokio::select! { - tuple = prepare => { tuple }, - _ = self.cancel.cancelled() => return Ok(CompactionOutcome::Done), - _ = cancel.cancelled() => return Ok(CompactionOutcome::Done), + (guard, permit) = acquire => (guard, permit), + _ = self.l0_compaction_trigger.notified(), if yield_for_l0 => { + return Ok(CompactionOutcome::YieldForL0); + } + _ = self.cancel.cancelled() => return Ok(CompactionOutcome::Skipped), + _ = cancel.cancelled() => return Ok(CompactionOutcome::Skipped), }; let last_record_lsn = self.get_last_record_lsn(); @@ -1736,7 +1843,7 @@ impl Timeline { // Last record Lsn could be zero in case the timeline was just created if !last_record_lsn.is_valid() { warn!("Skipping compaction for potentially just initialized timeline, it has invalid last record lsn: {last_record_lsn}"); - return Ok(CompactionOutcome::Done); + return Ok(CompactionOutcome::Skipped); } let result = match self.get_compaction_algorithm_settings().kind { @@ -2238,6 +2345,20 @@ impl Timeline { .unwrap_or(self.conf.default_tenant_conf.compaction_upper_limit) } + pub fn get_compaction_l0_first(&self) -> bool { + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); + tenant_conf + .compaction_l0_first + .unwrap_or(self.conf.default_tenant_conf.compaction_l0_first) + } + + pub fn get_compaction_l0_semaphore(&self) -> bool { + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); + tenant_conf + .compaction_l0_semaphore + .unwrap_or(self.conf.default_tenant_conf.compaction_l0_semaphore) + } + fn get_l0_flush_delay_threshold(&self) -> Option { // Disable L0 flushes by default. This and compaction needs further tuning. const DEFAULT_L0_FLUSH_DELAY_FACTOR: usize = 0; // TODO: default to e.g. 
3 @@ -2579,6 +2700,7 @@ impl Timeline { compaction_lock: tokio::sync::Mutex::default(), compaction_failed: AtomicBool::default(), + l0_compaction_trigger: resources.l0_compaction_trigger, gc_lock: tokio::sync::Mutex::default(), standby_horizon: AtomicLsn::new(0), @@ -2628,7 +2750,7 @@ impl Timeline { return; } FlushLoopState::Exited => { - warn!( + info!( "ignoring attempt to restart exited flush_loop {}/{}", self.tenant_shard_id, self.timeline_id ); @@ -3052,7 +3174,7 @@ impl Timeline { let self_ref = &self; let skip_concurrency_limiter = &skip_concurrency_limiter; async move { - let wait_for_permit = super::tasks::concurrent_background_tasks_rate_limit_permit( + let wait_for_permit = super::tasks::acquire_concurrency_permit( BackgroundLoopKind::InitialLogicalSizeCalculation, background_ctx, ); @@ -3498,6 +3620,7 @@ impl Timeline { request_lsn, ancestor_lsn: Some(timeline.ancestor_lsn), backtrace: None, + read_path: std::mem::take(&mut reconstruct_state.read_path), })); } @@ -3616,6 +3739,9 @@ impl Timeline { } if let Some((layer_to_read, keyspace_to_read, lsn_range)) = fringe.next_layer() { + if let Some(ref mut read_path) = reconstruct_state.read_path { + read_path.record_layer_visit(&layer_to_read, &keyspace_to_read, &lsn_range); + } let next_cont_lsn = lsn_range.start; layer_to_read .get_values_reconstruct_data( @@ -3916,6 +4042,12 @@ impl Timeline { } let flush_duration = flush_timer.stop_and_record(); + // Notify the tenant compaction loop if L0 compaction is needed. + let l0_count = *watch_l0.borrow(); + if l0_count >= self.get_compaction_threshold() { + self.l0_compaction_trigger.notify_one(); + } + // Delay the next flush to backpressure if compaction can't keep up. We delay by the // flush duration such that the flush takes 2x as long. This is propagated up to WAL // ingestion by having ephemeral layer rolls wait for flushes. @@ -4088,6 +4220,7 @@ impl Timeline { ImageLayerCreationMode::Initial, ctx, LastImageLayerCreationStatus::Initial, + false, // don't yield for L0, we're flushing L0 ) .await?; debug_assert!( @@ -4660,6 +4793,7 @@ impl Timeline { mode: ImageLayerCreationMode, ctx: &RequestContext, last_status: LastImageLayerCreationStatus, + yield_for_l0: bool, ) -> Result<(Vec, LastImageLayerCreationStatus), CreateImageLayersError> { let timer = self.metrics.create_images_time_histo.start_timer(); @@ -4856,7 +4990,7 @@ impl Timeline { if let ImageLayerCreationMode::Try = mode { // We have at least made some progress - if batch_image_writer.pending_layer_num() >= 1 { + if yield_for_l0 && batch_image_writer.pending_layer_num() >= 1 { // The `Try` mode is currently only used on the compaction path. We want to avoid // image layer generation taking too long time and blocking L0 compaction. 
So in this // mode, we also inspect the current number of L0 layers and skip image layer generation @@ -5804,10 +5938,11 @@ impl Timeline { let img = match res { Ok(img) => img, Err(walredo::Error::Cancelled) => return Err(PageReconstructError::Cancelled), - Err(walredo::Error::Other(e)) => { + Err(walredo::Error::Other(err)) => { + critical!("walredo failure during page reconstruction: {err:?}"); return Err(PageReconstructError::WalRedo( - e.context("reconstruct a page image"), - )) + err.context("reconstruct a page image"), + )); } }; Ok(img) diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index cfde070442..5b915c50d3 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -10,8 +10,8 @@ use std::sync::Arc; use super::layer_manager::LayerManager; use super::{ - CompactFlags, CompactOptions, CreateImageLayersError, DurationRecorder, ImageLayerCreationMode, - LastImageLayerCreationStatus, RecordedDuration, Timeline, + CompactFlags, CompactOptions, CreateImageLayersError, DurationRecorder, GetVectoredError, + ImageLayerCreationMode, LastImageLayerCreationStatus, RecordedDuration, Timeline, }; use anyhow::{anyhow, bail, Context}; @@ -26,6 +26,7 @@ use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId}; use serde::Serialize; use tokio_util::sync::CancellationToken; use tracing::{debug, info, info_span, trace, warn, Instrument}; +use utils::critical; use utils::id::TimelineId; use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder}; @@ -608,6 +609,11 @@ pub enum CompactionOutcome { /// Still has pending layers to be compacted after this round. Ideally, the scheduler /// should immediately schedule another compaction. Pending, + /// A timeline needs L0 compaction. Yield and schedule an immediate L0 compaction pass (only + /// guaranteed when `compaction_l0_first` is enabled). + YieldForL0, + /// Compaction was skipped, because the timeline is ineligible for compaction. + Skipped, } impl Timeline { @@ -686,10 +692,25 @@ impl Timeline { // Define partitioning schema if needed + let l0_l1_boundary_lsn = { + // We do the repartition on the L0-L1 boundary. All data below the boundary + // are compacted by L0 with low read amplification, thus making the `repartition` + // function run fast. + let guard = self.layers.read().await; + let l0_min_lsn = guard + .layer_map()? + .level0_deltas() + .iter() + .map(|l| l.get_lsn_range().start) + .min() + .unwrap_or(self.get_disk_consistent_lsn()); + l0_min_lsn.max(self.get_ancestor_lsn()) + }; + // 1. L0 Compact - let l0_compaction_outcome = { + let l0_outcome = { let timer = self.metrics.compact_time_histo.start_timer(); - let l0_compaction_outcome = self + let l0_outcome = self .compact_level0( target_file_size, options.flags.contains(CompactFlags::ForceL0Compaction), @@ -697,83 +718,103 @@ impl Timeline { ) .await?; timer.stop_and_record(); - l0_compaction_outcome + l0_outcome }; - if let CompactionOutcome::Pending = l0_compaction_outcome { - // Yield and do not do any other kind of compaction. True means - // that we have pending L0 compaction tasks and the compaction scheduler - // will prioritize compacting this tenant/timeline again. - info!("skipping image layer generation and shard ancestor compaction due to L0 compaction did not include all layers."); - return Ok(CompactionOutcome::Pending); + if options.flags.contains(CompactFlags::OnlyL0Compaction) { + return Ok(l0_outcome); } - // 2. 
Repartition and create image layers if necessary - let partition_count = match self - .repartition( - self.get_last_record_lsn(), // TODO: use L0-L1 boundary - self.get_compaction_target_size(), - options.flags, - ctx, - ) - .await + // Yield if we have pending L0 compaction. The scheduler will do another pass. + if (l0_outcome == CompactionOutcome::Pending || l0_outcome == CompactionOutcome::YieldForL0) + && !options.flags.contains(CompactFlags::NoYield) { - Ok(((dense_partitioning, sparse_partitioning), lsn)) => { - // Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them - let image_ctx = RequestContextBuilder::extend(ctx) - .access_stats_behavior(AccessStatsBehavior::Skip) - .build(); + info!("image/ancestor compaction yielding for L0 compaction"); + return Ok(CompactionOutcome::YieldForL0); + } - let mut partitioning = dense_partitioning; - partitioning - .parts - .extend(sparse_partitioning.into_dense().parts); + if l0_l1_boundary_lsn < self.partitioning.read().1 { + // We never go backwards when repartition and create image layers. + info!("skipping image layer generation because repartition LSN is greater than L0-L1 boundary LSN."); + } else { + // 2. Repartition and create image layers if necessary + match self + .repartition( + l0_l1_boundary_lsn, + self.get_compaction_target_size(), + options.flags, + ctx, + ) + .await + { + Ok(((dense_partitioning, sparse_partitioning), lsn)) => { + // Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them + let image_ctx = RequestContextBuilder::extend(ctx) + .access_stats_behavior(AccessStatsBehavior::Skip) + .build(); - // 3. Create new image layers for partitions that have been modified "enough". - let (image_layers, outcome) = self - .create_image_layers( - &partitioning, - lsn, - if options - .flags - .contains(CompactFlags::ForceImageLayerCreation) - { - ImageLayerCreationMode::Force - } else { - ImageLayerCreationMode::Try - }, - &image_ctx, - self.last_image_layer_creation_status - .load() - .as_ref() - .clone(), - ) - .await?; + let mut partitioning = dense_partitioning; + partitioning + .parts + .extend(sparse_partitioning.into_dense().parts); - self.last_image_layer_creation_status - .store(Arc::new(outcome.clone())); + // 3. Create new image layers for partitions that have been modified "enough". + let (image_layers, outcome) = self + .create_image_layers( + &partitioning, + lsn, + if options + .flags + .contains(CompactFlags::ForceImageLayerCreation) + { + ImageLayerCreationMode::Force + } else { + ImageLayerCreationMode::Try + }, + &image_ctx, + self.last_image_layer_creation_status + .load() + .as_ref() + .clone(), + !options.flags.contains(CompactFlags::NoYield), + ) + .await + .inspect_err(|err| { + if let CreateImageLayersError::GetVectoredError( + GetVectoredError::MissingKey(_), + ) = err + { + critical!("missing key during compaction: {err:?}"); + } + })?; - self.upload_new_image_layers(image_layers)?; - if let LastImageLayerCreationStatus::Incomplete { .. } = outcome { - // Yield and do not do any other kind of compaction. - info!("skipping shard ancestor compaction due to pending image layer generation tasks (preempted by L0 compaction)."); - return Ok(CompactionOutcome::Pending); + self.last_image_layer_creation_status + .store(Arc::new(outcome.clone())); + + self.upload_new_image_layers(image_layers)?; + if let LastImageLayerCreationStatus::Incomplete { .. 
} = outcome { + // Yield and do not do any other kind of compaction. + info!("skipping shard ancestor compaction due to pending image layer generation tasks (preempted by L0 compaction)."); + return Ok(CompactionOutcome::YieldForL0); + } } - partitioning.parts.len() - } - Err(err) => { - // no partitioning? This is normal, if the timeline was just created - // as an empty timeline. Also in unit tests, when we use the timeline - // as a simple key-value store, ignoring the datadir layout. Log the - // error but continue. - // - // Suppress error when it's due to cancellation - if !self.cancel.is_cancelled() && !err.is_cancelled() { - tracing::error!("could not compact, repartitioning keyspace failed: {err:?}"); + Err(err) => { + // no partitioning? This is normal, if the timeline was just created + // as an empty timeline. Also in unit tests, when we use the timeline + // as a simple key-value store, ignoring the datadir layout. Log the + // error but continue. + // + // Suppress error when it's due to cancellation + if !self.cancel.is_cancelled() && !err.is_cancelled() { + tracing::error!( + "could not compact, repartitioning keyspace failed: {err:?}" + ); + } } - 1 - } - }; + }; + } + + let partition_count = self.partitioning.read().0 .0.parts.len(); // 4. Shard ancestor compaction @@ -2229,8 +2270,11 @@ impl Timeline { split_key_ranges.push((start, end)); } split_key_ranges.sort(); - let guard = self.layers.read().await; - let layer_map = guard.layer_map()?; + let all_layers = { + let guard = self.layers.read().await; + let layer_map = guard.layer_map()?; + layer_map.iter_historic_layers().collect_vec() + }; let mut current_start = None; let ranges_num = split_key_ranges.len(); for (idx, (start, end)) in split_key_ranges.into_iter().enumerate() { @@ -2242,14 +2286,23 @@ impl Timeline { // We have already processed this partition. continue; } - let res = layer_map.range_search(start..end, compact_below_lsn); - let total_size = res.found.keys().map(|x| x.layer.file_size()).sum::(); + let overlapping_layers = { + let mut desc = Vec::new(); + for layer in all_layers.iter() { + if overlaps_with(&layer.get_key_range(), &(start..end)) + && layer.get_lsn_range().start <= compact_below_lsn + { + desc.push(layer.clone()); + } + } + desc + }; + let total_size = overlapping_layers.iter().map(|x| x.file_size).sum::(); if total_size > sub_compaction_max_job_size_mb * 1024 * 1024 || ranges_num == idx + 1 { // Try to extend the compaction range so that we include at least one full layer file. - let extended_end = res - .found - .keys() - .map(|layer| layer.layer.key_range.end) + let extended_end = overlapping_layers + .iter() + .map(|layer| layer.key_range.end) .min(); // It is possible that the search range does not contain any layer files when we reach the end of the loop. // In this case, we simply use the specified key range end. 
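// Illustrative sketch (not part of this patch): the half-open range overlap test and size
// accumulation used above when planning sub-compaction jobs. `overlaps_with` here is a local
// restatement for the example; the real helper lives elsewhere in the crate.
use std::ops::Range;

fn overlaps_with<T: Ord>(a: &Range<T>, b: &Range<T>) -> bool {
    // Two half-open ranges overlap iff each one starts before the other ends.
    a.start < b.end && b.start < a.end
}

/// Sums the file sizes of layers whose key range overlaps the candidate job range.
fn job_size(layers: &[(Range<u64>, u64)], job_range: &Range<u64>) -> u64 {
    layers
        .iter()
        .filter(|(key_range, _size)| overlaps_with(key_range, job_range))
        .map(|(_key_range, size)| *size)
        .sum()
}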
@@ -2276,7 +2329,6 @@ impl Timeline { current_start = Some(end); } } - drop(guard); Ok(compact_jobs) } diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 3c828c8a9e..93b7efedb8 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -17,13 +17,11 @@ use crate::{ metadata::TimelineMetadata, remote_timeline_client::{PersistIndexPartWithDeletedFlagError, RemoteTimelineClient}, CreateTimelineCause, DeleteTimelineError, MaybeDeletedIndexPart, Tenant, - TenantManifestError, TimelineOrOffloaded, + TenantManifestError, Timeline, TimelineOrOffloaded, }, virtual_file::MaybeFatalIo, }; -use super::{Timeline, TimelineResources}; - /// Mark timeline as deleted in S3 so we won't pick it up next time /// during attach or pageserver restart. /// See comment in persist_index_part_with_deleted_flag. @@ -296,12 +294,7 @@ impl DeleteTimelineFlow { timeline_id, local_metadata, None, // Ancestor is not needed for deletion. - TimelineResources { - remote_client, - pagestream_throttle: tenant.pagestream_throttle.clone(), - pagestream_throttle_metrics: tenant.pagestream_throttle_metrics.clone(), - l0_flush_global_state: tenant.l0_flush_global_state.clone(), - }, + tenant.get_timeline_resources_for(remote_client), // Important. We dont pass ancestor above because it can be missing. // Thus we need to skip the validation here. CreateTimelineCause::Delete, @@ -341,6 +334,13 @@ impl DeleteTimelineFlow { let tenant_shard_id = timeline.tenant_shard_id(); let timeline_id = timeline.timeline_id(); + // Take a tenant gate guard, because timeline deletion needs access to the tenant to update its manifest. + let Ok(tenant_guard) = tenant.gate.enter() else { + // It is safe to simply skip here, because we only schedule background work once the timeline is durably marked for deletion. + info!("Tenant is shutting down, timeline deletion will be resumed when it next starts"); + return; + }; + task_mgr::spawn( task_mgr::BACKGROUND_RUNTIME.handle(), TaskKind::TimelineDeletionWorker, @@ -348,6 +348,8 @@ impl DeleteTimelineFlow { Some(timeline_id), "timeline_delete", async move { + let _guard = tenant_guard; + if let Err(err) = Self::background(guard, conf, &tenant, &timeline, remote_client).await { // Only log as an error if it's not a cancellation. 
if matches!(err, DeleteTimelineError::Cancelled) { diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index f8bc4352e2..e0084d3eef 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -6,17 +6,20 @@ use crate::{ task_mgr::TaskKind, tenant::{ remote_timeline_client::index::GcBlockingReason::DetachAncestor, - storage_layer::{AsLayerDesc as _, DeltaLayerWriter, Layer, ResidentLayer}, + storage_layer::{ + layer::local_layer_path, AsLayerDesc as _, DeltaLayerWriter, Layer, ResidentLayer, + }, Tenant, }, virtual_file::{MaybeFatalIo, VirtualFile}, }; use anyhow::Context; +use http_utils::error::ApiError; use pageserver_api::{models::detach_ancestor::AncestorDetached, shard::ShardIdentity}; use tokio::sync::Semaphore; use tokio_util::sync::CancellationToken; use tracing::Instrument; -use utils::{completion, generation::Generation, http::error::ApiError, id::TimelineId, lsn::Lsn}; +use utils::{completion, generation::Generation, id::TimelineId, lsn::Lsn}; #[derive(Debug, thiserror::Error)] pub(crate) enum Error { @@ -351,18 +354,7 @@ pub(super) async fn prepare( // FIXME: the fsync should be mandatory, after both rewrites and copies if wrote_any { - let timeline_dir = VirtualFile::open( - &detached - .conf - .timeline_path(&detached.tenant_shard_id, &detached.timeline_id), - ctx, - ) - .await - .fatal_err("VirtualFile::open for timeline dir fsync"); - timeline_dir - .sync_all() - .await - .fatal_err("VirtualFile::sync_all timeline dir"); + fsync_timeline_dir(detached, ctx).await; } } @@ -376,7 +368,7 @@ pub(super) async fn prepare( tasks.spawn( async move { let _permit = limiter.acquire().await; - let owned = remote_copy( + let (owned, did_hardlink) = remote_copy( &adopted, &timeline, timeline.generation, @@ -384,16 +376,20 @@ pub(super) async fn prepare( &timeline.cancel, ) .await?; - tracing::info!(layer=%owned, "remote copied"); - Ok(owned) + tracing::info!(layer=%owned, did_hard_link=%did_hardlink, "remote copied"); + Ok((owned, did_hardlink)) } .in_current_span(), ); } + let mut should_fsync = false; while let Some(res) = tasks.join_next().await { match res { - Ok(Ok(owned)) => { + Ok(Ok((owned, did_hardlink))) => { + if did_hardlink { + should_fsync = true; + } new_layers.push(owned); } Ok(Err(failed)) => { @@ -403,7 +399,10 @@ pub(super) async fn prepare( } } - // TODO: fsync directory again if we hardlinked something + // fsync directory again if we hardlinked something + if should_fsync { + fsync_timeline_dir(detached, ctx).await; + } let prepared = PreparedTimelineDetach { layers: new_layers }; @@ -629,35 +628,52 @@ async fn copy_lsn_prefix( } } -/// Creates a new Layer instance for the adopted layer, and ensures it is found from the remote -/// storage on successful return without the adopted layer being added to `index_part.json`. +/// Creates a new Layer instance for the adopted layer, and ensures it is found in the remote +/// storage on successful return. without the adopted layer being added to `index_part.json`. 
+/// Returns (Layer, did hardlink) async fn remote_copy( adopted: &Layer, adoptee: &Arc, generation: Generation, shard_identity: ShardIdentity, cancel: &CancellationToken, -) -> Result { - // depending if Layer::keep_resident we could hardlink - +) -> Result<(Layer, bool), Error> { let mut metadata = adopted.metadata(); debug_assert!(metadata.generation <= generation); metadata.generation = generation; metadata.shard = shard_identity.shard_index(); - let owned = crate::tenant::storage_layer::Layer::for_evicted( - adoptee.conf, - adoptee, - adopted.layer_desc().layer_name(), - metadata, - ); + let conf = adoptee.conf; + let file_name = adopted.layer_desc().layer_name(); - adoptee + // depending if Layer::keep_resident, do a hardlink + let did_hardlink; + let owned = if let Some(adopted_resident) = adopted.keep_resident().await { + let adopted_path = adopted_resident.local_path(); + let adoptee_path = local_layer_path( + conf, + &adoptee.tenant_shard_id, + &adoptee.timeline_id, + &file_name, + &metadata.generation, + ); + std::fs::hard_link(adopted_path, &adoptee_path) + .map_err(|e| Error::launder(e.into(), Error::Prepare))?; + did_hardlink = true; + Layer::for_resident(conf, adoptee, adoptee_path, file_name, metadata).drop_eviction_guard() + } else { + did_hardlink = false; + Layer::for_evicted(conf, adoptee, file_name, metadata) + }; + + let layer = adoptee .remote_client .copy_timeline_layer(adopted, &owned, cancel) .await .map(move |()| owned) - .map_err(|e| Error::launder(e, Error::Prepare)) + .map_err(|e| Error::launder(e, Error::Prepare))?; + + Ok((layer, did_hardlink)) } pub(crate) enum DetachingAndReparenting { @@ -1001,3 +1017,16 @@ fn check_no_archived_children_of_ancestor( } Ok(()) } + +async fn fsync_timeline_dir(timeline: &Timeline, ctx: &RequestContext) { + let path = &timeline + .conf + .timeline_path(&timeline.tenant_shard_id, &timeline.timeline_id); + let timeline_dir = VirtualFile::open(&path, ctx) + .await + .fatal_err("VirtualFile::open for timeline dir fsync"); + timeline_dir + .sync_all() + .await + .fatal_err("VirtualFile::sync_all timeline dir"); +} diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 26c2861b93..77c33349e0 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -30,8 +30,11 @@ use crate::{ pgdatadir_mapping::CollectKeySpaceError, task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}, tenant::{ - size::CalculateSyntheticSizeError, storage_layer::LayerVisibilityHint, - tasks::BackgroundLoopKind, timeline::EvictionError, LogicalSizeCalculationCause, Tenant, + size::CalculateSyntheticSizeError, + storage_layer::LayerVisibilityHint, + tasks::{sleep_random, BackgroundLoopKind, BackgroundLoopSemaphorePermit}, + timeline::EvictionError, + LogicalSizeCalculationCause, Tenant, }, }; @@ -80,8 +83,6 @@ impl Timeline { #[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))] async fn eviction_task(self: Arc, tenant: Arc) { - use crate::tenant::tasks::random_init_delay; - // acquire the gate guard only once within a useful span let Ok(guard) = self.gate.enter() else { return; @@ -94,7 +95,7 @@ impl Timeline { EvictionPolicy::OnlyImitiate(lat) => lat.period, EvictionPolicy::NoEviction => Duration::from_secs(10), }; - if random_init_delay(period, &self.cancel).await.is_err() { + if sleep_random(period, &self.cancel).await.is_err() { return; } } @@ 
-330,11 +331,9 @@ impl Timeline { &self, cancel: &CancellationToken, ctx: &RequestContext, - ) -> ControlFlow<(), tokio::sync::SemaphorePermit<'static>> { - let acquire_permit = crate::tenant::tasks::concurrent_background_tasks_rate_limit_permit( - BackgroundLoopKind::Eviction, - ctx, - ); + ) -> ControlFlow<(), BackgroundLoopSemaphorePermit<'static>> { + let acquire_permit = + crate::tenant::tasks::acquire_concurrency_permit(BackgroundLoopKind::Eviction, ctx); tokio::select! { permit = acquire_permit => ControlFlow::Continue(permit), @@ -374,7 +373,7 @@ impl Timeline { p: &EvictionPolicyLayerAccessThreshold, cancel: &CancellationToken, gate: &GateGuard, - permit: tokio::sync::SemaphorePermit<'static>, + permit: BackgroundLoopSemaphorePermit<'static>, ctx: &RequestContext, ) -> ControlFlow<()> { if !self.tenant_shard_id.is_shard_zero() { diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index de917377cb..23db4f88d2 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -39,7 +39,7 @@ use crate::{ use postgres_backend::is_expected_io_error; use postgres_connection::PgConnectionConfig; use postgres_ffi::waldecoder::WalStreamDecoder; -use utils::{id::NodeId, lsn::Lsn, postgres_client::PostgresClientProtocol}; +use utils::{critical, id::NodeId, lsn::Lsn, postgres_client::PostgresClientProtocol}; use utils::{pageserver_feedback::PageserverFeedback, sync::gate::GateError}; /// Status of the connection. @@ -393,6 +393,13 @@ pub(super) async fn handle_walreceiver_connection( .await .with_context(|| { format!("could not ingest record at {local_next_record_lsn}") + }) + .inspect_err(|err| { + // TODO: we can't differentiate cancellation errors with + // anyhow::Error, so just ignore it if we're cancelled. + if !cancellation.is_cancelled() { + critical!("{err:?}") + } })?; uncommitted_records += 1; @@ -520,6 +527,13 @@ pub(super) async fn handle_walreceiver_connection( .await .with_context(|| { format!("could not ingest record at {next_record_lsn}") + }) + .inspect_err(|err| { + // TODO: we can't differentiate cancellation errors with + // anyhow::Error, so just ignore it if we're cancelled. + if !cancellation.is_cancelled() { + critical!("{err:?}") + } })?; if !ingested { tracing::debug!("ingest: filtered out record @ LSN {next_record_lsn}"); diff --git a/pageserver/src/walredo/process.rs b/pageserver/src/walredo/process.rs index 7e9477cfbc..bf30b92ea5 100644 --- a/pageserver/src/walredo/process.rs +++ b/pageserver/src/walredo/process.rs @@ -79,6 +79,14 @@ impl WalRedoProcess { .env_clear() .env("LD_LIBRARY_PATH", &pg_lib_dir_path) .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path) + .env( + "ASAN_OPTIONS", + std::env::var("ASAN_OPTIONS").unwrap_or_default(), + ) + .env( + "UBSAN_OPTIONS", + std::env::var("UBSAN_OPTIONS").unwrap_or_default(), + ) // NB: The redo process is not trusted after we sent it the first // walredo work. Before that, it is trusted. 
Specifically, we trust // it to diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index 01da61f84b..a61dc9f4c6 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -509,47 +509,44 @@ lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); - tag.blockNum = (blkno + i) & ~(BLOCKS_PER_CHUNK - 1); + tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1); hash = get_hash_value(lfc_hash, &tag); - chunk_offs = (blkno + i) & (BLOCKS_PER_CHUNK - 1); + chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1); LWLockAcquire(lfc_lock, LW_SHARED); + if (!LFC_ENABLED()) + { + LWLockRelease(lfc_lock); + return 0; + } while (true) { - int this_chunk = Min(nblocks, BLOCKS_PER_CHUNK - chunk_offs); - if (LFC_ENABLED()) - { - entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL); + int this_chunk = Min(nblocks - i, BLOCKS_PER_CHUNK - chunk_offs); + entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL); - if (entry != NULL) + if (entry != NULL) + { + for (; chunk_offs < BLOCKS_PER_CHUNK && i < nblocks; chunk_offs++, i++) { - for (; chunk_offs < BLOCKS_PER_CHUNK && i < nblocks; chunk_offs++, i++) + if ((entry->bitmap[chunk_offs >> 5] & + ((uint32)1 << (chunk_offs & 31))) != 0) { - if ((entry->bitmap[chunk_offs >> 5] & - ((uint32)1 << (chunk_offs & 31))) != 0) - { - BITMAP_SET(bitmap, i); - found++; - } + BITMAP_SET(bitmap, i); + found++; } } - else - { - i += this_chunk; - } } else { - LWLockRelease(lfc_lock); - return found; + i += this_chunk; } /* * Break out of the iteration before doing expensive stuff for * a next iteration */ - if (i + 1 >= nblocks) + if (i >= nblocks) break; /* diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 012bd479bc..8051970176 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -3011,7 +3011,7 @@ neon_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber base_block start_ts = GetCurrentTimestamp(); if (RecoveryInProgress() && MyBackendType != B_STARTUP) - XLogWaitForReplayOf(reqlsns[0].request_lsn); + XLogWaitForReplayOf(reqlsns->request_lsn); /* * Try to find prefetched page in the list of received pages. diff --git a/poetry.lock b/poetry.lock index c471d3e69c..fd200159b9 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1030,52 +1030,56 @@ files = [ [[package]] name = "cryptography" -version = "43.0.1" +version = "44.0.1" description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." 
optional = false -python-versions = ">=3.7" +python-versions = "!=3.9.0,!=3.9.1,>=3.7" groups = ["main"] files = [ - {file = "cryptography-43.0.1-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:8385d98f6a3bf8bb2d65a73e17ed87a3ba84f6991c155691c51112075f9ffc5d"}, - {file = "cryptography-43.0.1-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:27e613d7077ac613e399270253259d9d53872aaf657471473ebfc9a52935c062"}, - {file = "cryptography-43.0.1-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:68aaecc4178e90719e95298515979814bda0cbada1256a4485414860bd7ab962"}, - {file = "cryptography-43.0.1-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:de41fd81a41e53267cb020bb3a7212861da53a7d39f863585d13ea11049cf277"}, - {file = "cryptography-43.0.1-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:f98bf604c82c416bc829e490c700ca1553eafdf2912a91e23a79d97d9801372a"}, - {file = "cryptography-43.0.1-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:61ec41068b7b74268fa86e3e9e12b9f0c21fcf65434571dbb13d954bceb08042"}, - {file = "cryptography-43.0.1-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:014f58110f53237ace6a408b5beb6c427b64e084eb451ef25a28308270086494"}, - {file = "cryptography-43.0.1-cp37-abi3-win32.whl", hash = "sha256:2bd51274dcd59f09dd952afb696bf9c61a7a49dfc764c04dd33ef7a6b502a1e2"}, - {file = "cryptography-43.0.1-cp37-abi3-win_amd64.whl", hash = "sha256:666ae11966643886c2987b3b721899d250855718d6d9ce41b521252a17985f4d"}, - {file = "cryptography-43.0.1-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:ac119bb76b9faa00f48128b7f5679e1d8d437365c5d26f1c2c3f0da4ce1b553d"}, - {file = "cryptography-43.0.1-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1bbcce1a551e262dfbafb6e6252f1ae36a248e615ca44ba302df077a846a8806"}, - {file = "cryptography-43.0.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:58d4e9129985185a06d849aa6df265bdd5a74ca6e1b736a77959b498e0505b85"}, - {file = "cryptography-43.0.1-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:d03a475165f3134f773d1388aeb19c2d25ba88b6a9733c5c590b9ff7bbfa2e0c"}, - {file = "cryptography-43.0.1-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:511f4273808ab590912a93ddb4e3914dfd8a388fed883361b02dea3791f292e1"}, - {file = "cryptography-43.0.1-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:80eda8b3e173f0f247f711eef62be51b599b5d425c429b5d4ca6a05e9e856baa"}, - {file = "cryptography-43.0.1-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:38926c50cff6f533f8a2dae3d7f19541432610d114a70808f0926d5aaa7121e4"}, - {file = "cryptography-43.0.1-cp39-abi3-win32.whl", hash = "sha256:a575913fb06e05e6b4b814d7f7468c2c660e8bb16d8d5a1faf9b33ccc569dd47"}, - {file = "cryptography-43.0.1-cp39-abi3-win_amd64.whl", hash = "sha256:d75601ad10b059ec832e78823b348bfa1a59f6b8d545db3a24fd44362a1564cb"}, - {file = "cryptography-43.0.1-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:ea25acb556320250756e53f9e20a4177515f012c9eaea17eb7587a8c4d8ae034"}, - {file = "cryptography-43.0.1-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:c1332724be35d23a854994ff0b66530119500b6053d0bd3363265f7e5e77288d"}, - {file = "cryptography-43.0.1-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:fba1007b3ef89946dbbb515aeeb41e30203b004f0b4b00e5e16078b518563289"}, - {file = "cryptography-43.0.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:5b43d1ea6b378b54a1dc99dd8a2b5be47658fe9a7ce0a58ff0b55f4b43ef2b84"}, - {file = 
"cryptography-43.0.1-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:88cce104c36870d70c49c7c8fd22885875d950d9ee6ab54df2745f83ba0dc365"}, - {file = "cryptography-43.0.1-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:9d3cdb25fa98afdd3d0892d132b8d7139e2c087da1712041f6b762e4f807cc96"}, - {file = "cryptography-43.0.1-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:e710bf40870f4db63c3d7d929aa9e09e4e7ee219e703f949ec4073b4294f6172"}, - {file = "cryptography-43.0.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:7c05650fe8023c5ed0d46793d4b7d7e6cd9c04e68eabe5b0aeea836e37bdcec2"}, - {file = "cryptography-43.0.1.tar.gz", hash = "sha256:203e92a75716d8cfb491dc47c79e17d0d9207ccffcbcb35f598fbe463ae3444d"}, + {file = "cryptography-44.0.1-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:bf688f615c29bfe9dfc44312ca470989279f0e94bb9f631f85e3459af8efc009"}, + {file = "cryptography-44.0.1-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dd7c7e2d71d908dc0f8d2027e1604102140d84b155e658c20e8ad1304317691f"}, + {file = "cryptography-44.0.1-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:887143b9ff6bad2b7570da75a7fe8bbf5f65276365ac259a5d2d5147a73775f2"}, + {file = "cryptography-44.0.1-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:322eb03ecc62784536bc173f1483e76747aafeb69c8728df48537eb431cd1911"}, + {file = "cryptography-44.0.1-cp37-abi3-manylinux_2_28_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:21377472ca4ada2906bc313168c9dc7b1d7ca417b63c1c3011d0c74b7de9ae69"}, + {file = "cryptography-44.0.1-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:df978682c1504fc93b3209de21aeabf2375cb1571d4e61907b3e7a2540e83026"}, + {file = "cryptography-44.0.1-cp37-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:eb3889330f2a4a148abead555399ec9a32b13b7c8ba969b72d8e500eb7ef84cd"}, + {file = "cryptography-44.0.1-cp37-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:8e6a85a93d0642bd774460a86513c5d9d80b5c002ca9693e63f6e540f1815ed0"}, + {file = "cryptography-44.0.1-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:6f76fdd6fd048576a04c5210d53aa04ca34d2ed63336d4abd306d0cbe298fddf"}, + {file = "cryptography-44.0.1-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:6c8acf6f3d1f47acb2248ec3ea261171a671f3d9428e34ad0357148d492c7864"}, + {file = "cryptography-44.0.1-cp37-abi3-win32.whl", hash = "sha256:24979e9f2040c953a94bf3c6782e67795a4c260734e5264dceea65c8f4bae64a"}, + {file = "cryptography-44.0.1-cp37-abi3-win_amd64.whl", hash = "sha256:fd0ee90072861e276b0ff08bd627abec29e32a53b2be44e41dbcdf87cbee2b00"}, + {file = "cryptography-44.0.1-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:a2d8a7045e1ab9b9f803f0d9531ead85f90c5f2859e653b61497228b18452008"}, + {file = "cryptography-44.0.1-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b8272f257cf1cbd3f2e120f14c68bff2b6bdfcc157fafdee84a1b795efd72862"}, + {file = "cryptography-44.0.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1e8d181e90a777b63f3f0caa836844a1182f1f265687fac2115fcf245f5fbec3"}, + {file = "cryptography-44.0.1-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:436df4f203482f41aad60ed1813811ac4ab102765ecae7a2bbb1dbb66dcff5a7"}, + {file = "cryptography-44.0.1-cp39-abi3-manylinux_2_28_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:4f422e8c6a28cf8b7f883eb790695d6d45b0c385a2583073f3cec434cc705e1a"}, + {file = "cryptography-44.0.1-cp39-abi3-manylinux_2_28_x86_64.whl", hash = 
"sha256:72198e2b5925155497a5a3e8c216c7fb3e64c16ccee11f0e7da272fa93b35c4c"}, + {file = "cryptography-44.0.1-cp39-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:2a46a89ad3e6176223b632056f321bc7de36b9f9b93b2cc1cccf935a3849dc62"}, + {file = "cryptography-44.0.1-cp39-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:53f23339864b617a3dfc2b0ac8d5c432625c80014c25caac9082314e9de56f41"}, + {file = "cryptography-44.0.1-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:888fcc3fce0c888785a4876ca55f9f43787f4c5c1cc1e2e0da71ad481ff82c5b"}, + {file = "cryptography-44.0.1-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:00918d859aa4e57db8299607086f793fa7813ae2ff5a4637e318a25ef82730f7"}, + {file = "cryptography-44.0.1-cp39-abi3-win32.whl", hash = "sha256:9b336599e2cb77b1008cb2ac264b290803ec5e8e89d618a5e978ff5eb6f715d9"}, + {file = "cryptography-44.0.1-cp39-abi3-win_amd64.whl", hash = "sha256:e403f7f766ded778ecdb790da786b418a9f2394f36e8cc8b796cc056ab05f44f"}, + {file = "cryptography-44.0.1-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:1f9a92144fa0c877117e9748c74501bea842f93d21ee00b0cf922846d9d0b183"}, + {file = "cryptography-44.0.1-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:610a83540765a8d8ce0f351ce42e26e53e1f774a6efb71eb1b41eb01d01c3d12"}, + {file = "cryptography-44.0.1-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:5fed5cd6102bb4eb843e3315d2bf25fede494509bddadb81e03a859c1bc17b83"}, + {file = "cryptography-44.0.1-pp310-pypy310_pp73-manylinux_2_34_aarch64.whl", hash = "sha256:f4daefc971c2d1f82f03097dc6f216744a6cd2ac0f04c68fb935ea2ba2a0d420"}, + {file = "cryptography-44.0.1-pp310-pypy310_pp73-manylinux_2_34_x86_64.whl", hash = "sha256:94f99f2b943b354a5b6307d7e8d19f5c423a794462bde2bf310c770ba052b1c4"}, + {file = "cryptography-44.0.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:d9c5b9f698a83c8bd71e0f4d3f9f839ef244798e5ffe96febfa9714717db7af7"}, + {file = "cryptography-44.0.1.tar.gz", hash = "sha256:f51f5705ab27898afda1aaa430f34ad90dc117421057782022edf0600bec5f14"}, ] [package.dependencies] cffi = {version = ">=1.12", markers = "platform_python_implementation != \"PyPy\""} [package.extras] -docs = ["sphinx (>=5.3.0)", "sphinx-rtd-theme (>=1.1.1)"] -docstest = ["pyenchant (>=1.6.11)", "readme-renderer", "sphinxcontrib-spelling (>=4.0.1)"] -nox = ["nox"] -pep8test = ["check-sdist", "click", "mypy", "ruff"] -sdist = ["build"] +docs = ["sphinx (>=5.3.0)", "sphinx-rtd-theme (>=3.0.0)"] +docstest = ["pyenchant (>=3)", "readme-renderer (>=30.0)", "sphinxcontrib-spelling (>=7.3.1)"] +nox = ["nox (>=2024.4.15)", "nox[uv] (>=2024.3.2)"] +pep8test = ["check-sdist", "click (>=8.0.1)", "mypy (>=1.4)", "ruff (>=0.3.6)"] +sdist = ["build (>=1.0.0)"] ssh = ["bcrypt (>=3.1.5)"] -test = ["certifi", "cryptography-vectors (==43.0.1)", "pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-xdist"] +test = ["certifi (>=2024)", "cryptography-vectors (==44.0.1)", "pretend (>=0.7)", "pytest (>=7.4.0)", "pytest-benchmark (>=4.0)", "pytest-cov (>=2.10.1)", "pytest-xdist (>=3.5.0)"] test-randomorder = ["pytest-randomly"] [[package]] diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index d7880ea7b9..3aa6ac3a76 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -37,6 +37,7 @@ hex.workspace = true hmac.workspace = true hostname.workspace = true http.workspace = true +http-utils.workspace = true humantime.workspace = true humantime-serde.workspace = true hyper0.workspace = true diff --git a/proxy/README.md b/proxy/README.md index ecd54fbbd8..1156bfd352 100644 --- 
a/proxy/README.md +++ b/proxy/README.md @@ -37,8 +37,8 @@ To play with it locally one may start proxy over a local postgres installation If both postgres and proxy are running you may send a SQL query: ```console -curl -k -X POST 'https://proxy.localtest.me:4444/sql' \ - -H 'Neon-Connection-String: postgres://stas:pass@proxy.localtest.me:4444/postgres' \ +curl -k -X POST 'https://proxy.local.neon.build:4444/sql' \ + -H 'Neon-Connection-String: postgres://stas:pass@proxy.local.neon.build:4444/postgres' \ -H 'Content-Type: application/json' \ --data '{ "query":"SELECT $1::int[] as arr, $2::jsonb as obj, 42 as num", @@ -104,7 +104,7 @@ cases where it is hard to use rows represented as objects (e.g. when several fie ## Test proxy locally -Proxy determines project name from the subdomain, request to the `round-rice-566201.somedomain.tld` will be routed to the project named `round-rice-566201`. Unfortunately, `/etc/hosts` does not support domain wildcards, so we can use *.localtest.me` which resolves to `127.0.0.1`. +Proxy determines project name from the subdomain, request to the `round-rice-566201.somedomain.tld` will be routed to the project named `round-rice-566201`. Unfortunately, `/etc/hosts` does not support domain wildcards, so we can use *.local.neon.build` which resolves to `127.0.0.1`. We will need to have a postgres instance. Assuming that we have set up docker we can set it up as follows: ```sh @@ -125,7 +125,7 @@ docker exec -it proxy-postgres psql -U postgres -c "CREATE ROLE proxy WITH SUPER Let's create self-signed certificate by running: ```sh -openssl req -new -x509 -days 365 -nodes -text -out server.crt -keyout server.key -subj "/CN=*.localtest.me" +openssl req -new -x509 -days 365 -nodes -text -out server.crt -keyout server.key -subj "/CN=*.local.neon.build" ``` Then we need to build proxy with 'testing' feature and run, e.g.: @@ -136,5 +136,5 @@ RUST_LOG=proxy cargo run -p proxy --bin proxy --features testing -- --auth-backe Now from client you can start a new session: ```sh -PGSSLROOTCERT=./server.crt psql "postgresql://proxy:password@endpoint.localtest.me:4432/postgres?sslmode=verify-full" +PGSSLROOTCERT=./server.crt psql "postgresql://proxy:password@endpoint.local.neon.build:4432/postgres?sslmode=verify-full" ``` diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs index 7ef096207a..dc595844c5 100644 --- a/proxy/src/auth/backend/mod.rs +++ b/proxy/src/auth/backend/mod.rs @@ -108,6 +108,10 @@ impl Backend<'_, T> { Self::Local(_) => panic!("Local backend has no API"), } } + + pub(crate) fn is_local_proxy(&self) -> bool { + matches!(self, Self::Local(_)) + } } impl<'a, T> Backend<'a, T> { diff --git a/proxy/src/bin/local_proxy.rs b/proxy/src/bin/local_proxy.rs index 7a855bf54b..8f225dc1e0 100644 --- a/proxy/src/bin/local_proxy.rs +++ b/proxy/src/bin/local_proxy.rs @@ -1,416 +1,7 @@ -use std::net::SocketAddr; -use std::pin::pin; -use std::str::FromStr; -use std::sync::Arc; -use std::time::Duration; - -use anyhow::{bail, ensure, Context}; -use camino::{Utf8Path, Utf8PathBuf}; -use compute_api::spec::LocalProxySpec; -use futures::future::Either; -use proxy::auth::backend::jwt::JwkCache; -use proxy::auth::backend::local::{LocalBackend, JWKS_ROLE_MAP}; -use proxy::auth::{self}; -use proxy::cancellation::CancellationHandler; -use proxy::config::{ - self, AuthenticationConfig, ComputeConfig, HttpConfig, ProxyConfig, RetryConfig, -}; -use proxy::control_plane::locks::ApiLocks; -use proxy::control_plane::messages::{EndpointJwksResponse, JwksSettings}; -use 
proxy::http::health_server::AppMetrics; -use proxy::intern::RoleNameInt; -use proxy::metrics::{Metrics, ThreadPoolMetrics}; -use proxy::rate_limiter::{ - BucketRateLimiter, EndpointRateLimiter, LeakyBucketConfig, RateBucketInfo, -}; -use proxy::scram::threadpool::ThreadPool; -use proxy::serverless::cancel_set::CancelSet; -use proxy::serverless::{self, GlobalConnPoolOptions}; -use proxy::tls::client_config::compute_client_config_with_root_certs; -use proxy::types::RoleName; -use proxy::url::ApiUrl; - -project_git_version!(GIT_VERSION); -project_build_tag!(BUILD_TAG); - -use clap::Parser; -use thiserror::Error; -use tokio::net::TcpListener; -use tokio::sync::Notify; -use tokio::task::JoinSet; -use tokio_util::sync::CancellationToken; -use tracing::{debug, error, info, warn}; -use utils::sentry_init::init_sentry; -use utils::{pid_file, project_build_tag, project_git_version}; - #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; -/// Neon proxy/router -#[derive(Parser)] -#[command(version = GIT_VERSION, about)] -struct LocalProxyCliArgs { - /// listen for incoming metrics connections on ip:port - #[clap(long, default_value = "127.0.0.1:7001")] - metrics: String, - /// listen for incoming http connections on ip:port - #[clap(long)] - http: String, - /// timeout for the TLS handshake - #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] - handshake_timeout: tokio::time::Duration, - /// lock for `connect_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable). - #[clap(long, default_value = config::ConcurrencyLockOptions::DEFAULT_OPTIONS_CONNECT_COMPUTE_LOCK)] - connect_compute_lock: String, - #[clap(flatten)] - sql_over_http: SqlOverHttpArgs, - /// User rate limiter max number of requests per second. - /// - /// Provided in the form `@`. - /// Can be given multiple times for different bucket sizes. - #[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)] - user_rps_limit: Vec, - /// Whether the auth rate limiter actually takes effect (for testing) - #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] - auth_rate_limit_enabled: bool, - /// Authentication rate limiter max number of hashes per second. - #[clap(long, default_values_t = RateBucketInfo::DEFAULT_AUTH_SET)] - auth_rate_limit: Vec, - /// The IP subnet to use when considering whether two IP addresses are considered the same. - #[clap(long, default_value_t = 64)] - auth_rate_limit_ip_subnet: u8, - /// Whether to retry the connection to the compute node - #[clap(long, default_value = config::RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES)] - connect_to_compute_retry: String, - /// Address of the postgres server - #[clap(long, default_value = "127.0.0.1:5432")] - postgres: SocketAddr, - /// Address of the compute-ctl api service - #[clap(long, default_value = "http://127.0.0.1:3080/")] - compute_ctl: ApiUrl, - /// Path of the local proxy config file - #[clap(long, default_value = "./local_proxy.json")] - config_path: Utf8PathBuf, - /// Path of the local proxy PID file - #[clap(long, default_value = "./local_proxy.pid")] - pid_path: Utf8PathBuf, -} - -#[derive(clap::Args, Clone, Copy, Debug)] -struct SqlOverHttpArgs { - /// How many connections to pool for each endpoint. 
Excess connections are discarded - #[clap(long, default_value_t = 200)] - sql_over_http_pool_max_total_conns: usize, - - /// How long pooled connections should remain idle for before closing - #[clap(long, default_value = "5m", value_parser = humantime::parse_duration)] - sql_over_http_idle_timeout: tokio::time::Duration, - - #[clap(long, default_value_t = 100)] - sql_over_http_client_conn_threshold: u64, - - #[clap(long, default_value_t = 16)] - sql_over_http_cancel_set_shards: usize, - - #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB - sql_over_http_max_request_size_bytes: usize, - - #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB - sql_over_http_max_response_size_bytes: usize, -} - #[tokio::main] async fn main() -> anyhow::Result<()> { - let _logging_guard = proxy::logging::init_local_proxy()?; - let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook(); - let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); - - Metrics::install(Arc::new(ThreadPoolMetrics::new(0))); - - // TODO: refactor these to use labels - debug!("Version: {GIT_VERSION}"); - debug!("Build_tag: {BUILD_TAG}"); - let neon_metrics = ::metrics::NeonMetrics::new(::metrics::BuildInfo { - revision: GIT_VERSION, - build_tag: BUILD_TAG, - }); - - let jemalloc = match proxy::jemalloc::MetricRecorder::new() { - Ok(t) => Some(t), - Err(e) => { - tracing::error!(error = ?e, "could not start jemalloc metrics loop"); - None - } - }; - - let args = LocalProxyCliArgs::parse(); - let config = build_config(&args)?; - let auth_backend = build_auth_backend(&args)?; - - // before we bind to any ports, write the process ID to a file - // so that compute-ctl can find our process later - // in order to trigger the appropriate SIGHUP on config change. - // - // This also claims a "lock" that makes sure only one instance - // of local_proxy runs at a time. - let _process_guard = loop { - match pid_file::claim_for_current_process(&args.pid_path) { - Ok(guard) => break guard, - Err(e) => { - // compute-ctl might have tried to read the pid-file to let us - // know about some config change. We should try again. - error!(path=?args.pid_path, "could not claim PID file guard: {e:?}"); - tokio::time::sleep(Duration::from_secs(1)).await; - } - } - }; - - let metrics_listener = TcpListener::bind(args.metrics).await?.into_std()?; - let http_listener = TcpListener::bind(args.http).await?; - let shutdown = CancellationToken::new(); - - // todo: should scale with CU - let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards( - LeakyBucketConfig { - rps: 10.0, - max: 100.0, - }, - 16, - )); - - let mut maintenance_tasks = JoinSet::new(); - - let refresh_config_notify = Arc::new(Notify::new()); - maintenance_tasks.spawn(proxy::signals::handle(shutdown.clone(), { - let refresh_config_notify = Arc::clone(&refresh_config_notify); - move || { - refresh_config_notify.notify_one(); - } - })); - - // trigger the first config load **after** setting up the signal hook - // to avoid the race condition where: - // 1. No config file registered when local_proxy starts up - // 2. The config file is written but the signal hook is not yet received - // 3. local_proxy completes startup but has no config loaded, despite there being a registerd config. 
- refresh_config_notify.notify_one(); - tokio::spawn(refresh_config_loop(args.config_path, refresh_config_notify)); - - maintenance_tasks.spawn(proxy::http::health_server::task_main( - metrics_listener, - AppMetrics { - jemalloc, - neon_metrics, - proxy: proxy::metrics::Metrics::get(), - }, - )); - - let task = serverless::task_main( - config, - auth_backend, - http_listener, - shutdown.clone(), - Arc::new(CancellationHandler::new(&config.connect_to_compute, None)), - endpoint_rate_limiter, - ); - - match futures::future::select(pin!(maintenance_tasks.join_next()), pin!(task)).await { - // exit immediately on maintenance task completion - Either::Left((Some(res), _)) => match proxy::error::flatten_err(res)? {}, - // exit with error immediately if all maintenance tasks have ceased (should be caught by branch above) - Either::Left((None, _)) => bail!("no maintenance tasks running. invalid state"), - // exit immediately on client task error - Either::Right((res, _)) => res?, - } - - Ok(()) -} - -/// ProxyConfig is created at proxy startup, and lives forever. -fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { - let config::ConcurrencyLockOptions { - shards, - limiter, - epoch, - timeout, - } = args.connect_compute_lock.parse()?; - info!( - ?limiter, - shards, - ?epoch, - "Using NodeLocks (connect_compute)" - ); - let connect_compute_locks = ApiLocks::new( - "connect_compute_lock", - limiter, - shards, - timeout, - epoch, - &Metrics::get().proxy.connect_compute_lock, - )?; - - let http_config = HttpConfig { - accept_websockets: false, - pool_options: GlobalConnPoolOptions { - gc_epoch: Duration::from_secs(60), - pool_shards: 2, - idle_timeout: args.sql_over_http.sql_over_http_idle_timeout, - opt_in: false, - - max_conns_per_endpoint: args.sql_over_http.sql_over_http_pool_max_total_conns, - max_total_conns: args.sql_over_http.sql_over_http_pool_max_total_conns, - }, - cancel_set: CancelSet::new(args.sql_over_http.sql_over_http_cancel_set_shards), - client_conn_threshold: args.sql_over_http.sql_over_http_client_conn_threshold, - max_request_size_bytes: args.sql_over_http.sql_over_http_max_request_size_bytes, - max_response_size_bytes: args.sql_over_http.sql_over_http_max_response_size_bytes, - }; - - let compute_config = ComputeConfig { - retry: RetryConfig::parse(RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES)?, - tls: Arc::new(compute_client_config_with_root_certs()?), - timeout: Duration::from_secs(2), - }; - - Ok(Box::leak(Box::new(ProxyConfig { - tls_config: None, - metric_collection: None, - http_config, - authentication_config: AuthenticationConfig { - jwks_cache: JwkCache::default(), - thread_pool: ThreadPool::new(0), - scram_protocol_timeout: Duration::from_secs(10), - rate_limiter_enabled: false, - rate_limiter: BucketRateLimiter::new(vec![]), - rate_limit_ip_subnet: 64, - ip_allowlist_check_enabled: true, - is_vpc_acccess_proxy: false, - is_auth_broker: false, - accept_jwts: true, - console_redirect_confirmation_timeout: Duration::ZERO, - }, - proxy_protocol_v2: config::ProxyProtocolV2::Rejected, - handshake_timeout: Duration::from_secs(10), - region: "local".into(), - wake_compute_retry_config: RetryConfig::parse(RetryConfig::WAKE_COMPUTE_DEFAULT_VALUES)?, - connect_compute_locks, - connect_to_compute: compute_config, - }))) -} - -/// auth::Backend is created at proxy startup, and lives forever. 
-fn build_auth_backend( - args: &LocalProxyCliArgs, -) -> anyhow::Result<&'static auth::Backend<'static, ()>> { - let auth_backend = proxy::auth::Backend::Local(proxy::auth::backend::MaybeOwned::Owned( - LocalBackend::new(args.postgres, args.compute_ctl.clone()), - )); - - Ok(Box::leak(Box::new(auth_backend))) -} - -#[derive(Error, Debug)] -enum RefreshConfigError { - #[error(transparent)] - Read(#[from] std::io::Error), - #[error(transparent)] - Parse(#[from] serde_json::Error), - #[error(transparent)] - Validate(anyhow::Error), -} - -async fn refresh_config_loop(path: Utf8PathBuf, rx: Arc) { - let mut init = true; - loop { - rx.notified().await; - - match refresh_config_inner(&path).await { - Ok(()) => {} - // don't log for file not found errors if this is the first time we are checking - // for computes that don't use local_proxy, this is not an error. - Err(RefreshConfigError::Read(e)) - if init && e.kind() == std::io::ErrorKind::NotFound => - { - debug!(error=?e, ?path, "could not read config file"); - } - Err(e) => { - error!(error=?e, ?path, "could not read config file"); - } - } - - init = false; - } -} - -async fn refresh_config_inner(path: &Utf8Path) -> Result<(), RefreshConfigError> { - let bytes = tokio::fs::read(&path).await?; - let data: LocalProxySpec = serde_json::from_slice(&bytes)?; - - let mut jwks_set = vec![]; - - fn parse_jwks_settings(jwks: compute_api::spec::JwksSettings) -> anyhow::Result { - let mut jwks_url = url::Url::from_str(&jwks.jwks_url).context("parsing JWKS url")?; - - ensure!( - jwks_url.has_authority() - && (jwks_url.scheme() == "http" || jwks_url.scheme() == "https"), - "Invalid JWKS url. Must be HTTP", - ); - - ensure!( - jwks_url.host().is_some_and(|h| h != url::Host::Domain("")), - "Invalid JWKS url. No domain listed", - ); - - // clear username, password and ports - jwks_url - .set_username("") - .expect("url can be a base and has a valid host and is not a file. should not error"); - jwks_url - .set_password(None) - .expect("url can be a base and has a valid host and is not a file. should not error"); - // local testing is hard if we need to have a specific restricted port - if cfg!(not(feature = "testing")) { - jwks_url.set_port(None).expect( - "url can be a base and has a valid host and is not a file. should not error", - ); - } - - // clear query params - jwks_url.set_fragment(None); - jwks_url.query_pairs_mut().clear().finish(); - - if jwks_url.scheme() != "https" { - // local testing is hard if we need to set up https support. 
- if cfg!(not(feature = "testing")) { - jwks_url - .set_scheme("https") - .expect("should not error to set the scheme to https if it was http"); - } else { - warn!(scheme = jwks_url.scheme(), "JWKS url is not HTTPS"); - } - } - - Ok(JwksSettings { - id: jwks.id, - jwks_url, - provider_name: jwks.provider_name, - jwt_audience: jwks.jwt_audience, - role_names: jwks - .role_names - .into_iter() - .map(RoleName::from) - .map(|s| RoleNameInt::from(&s)) - .collect(), - }) - } - - for jwks in data.jwks.into_iter().flatten() { - jwks_set.push(parse_jwks_settings(jwks).map_err(RefreshConfigError::Validate)?); - } - - info!("successfully loaded new config"); - JWKS_ROLE_MAP.store(Some(Arc::new(EndpointJwksResponse { jwks: jwks_set }))); - - Ok(()) + proxy::binary::local_proxy::run().await } diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index 97d870a83a..0c3326af85 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -1,299 +1,10 @@ -/// A stand-alone program that routes connections, e.g. from -/// `aaa--bbb--1234.external.domain` to `aaa.bbb.internal.domain:1234`. -/// -/// This allows connecting to pods/services running in the same Kubernetes cluster from -/// the outside. Similar to an ingress controller for HTTPS. -use std::{net::SocketAddr, sync::Arc}; - -use anyhow::{anyhow, bail, ensure, Context}; -use clap::Arg; -use futures::future::Either; -use futures::TryFutureExt; -use itertools::Itertools; -use proxy::context::RequestContext; -use proxy::metrics::{Metrics, ThreadPoolMetrics}; -use proxy::protocol2::ConnectionInfo; -use proxy::proxy::{copy_bidirectional_client_compute, run_until_cancelled, ErrorSource}; -use proxy::stream::{PqStream, Stream}; -use proxy::tls::TlsServerEndPoint; -use rustls::crypto::ring; -use rustls::pki_types::PrivateKeyDer; -use tokio::io::{AsyncRead, AsyncWrite}; -use tokio::net::TcpListener; -use tokio_util::sync::CancellationToken; -use tracing::{error, info, Instrument}; -use utils::project_git_version; -use utils::sentry_init::init_sentry; - -project_git_version!(GIT_VERSION); - -fn cli() -> clap::Command { - clap::Command::new("Neon proxy/router") - .version(GIT_VERSION) - .arg( - Arg::new("listen") - .short('l') - .long("listen") - .help("listen for incoming client connections on ip:port") - .default_value("127.0.0.1:4432"), - ) - .arg( - Arg::new("tls-key") - .short('k') - .long("tls-key") - .help("path to TLS key for client postgres connections") - .required(true), - ) - .arg( - Arg::new("tls-cert") - .short('c') - .long("tls-cert") - .help("path to TLS cert for client postgres connections") - .required(true), - ) - .arg( - Arg::new("dest") - .short('d') - .long("destination") - .help("append this domain zone to the SNI hostname to get the destination address") - .required(true), - ) -} +//! A stand-alone program that routes connections, e.g. from +//! `aaa--bbb--1234.external.domain` to `aaa.bbb.internal.domain:1234`. +//! +//! This allows connecting to pods/services running in the same Kubernetes cluster from +//! the outside. Similar to an ingress controller for HTTPS. 
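Aside on the routing scheme described in the module docs above (implemented by the code this commit moves into `proxy::binary::pg_sni_router`): the left-most SNI label encodes `service--namespace--port`, which is mapped to an in-cluster address `service.namespace.<dest-suffix>:port`. A minimal sketch of that mapping, assuming the same naming convention and the `anyhow` crate already used by this file; the extra length check on the label parts is an addition for the sketch, not taken from the original.

```rust
use anyhow::{anyhow, Context};

/// Map an SNI hostname like `aaa--bbb--1234.external.domain` onto the
/// in-cluster destination `aaa.bbb.<dest_suffix>:1234`, where `dest_suffix`
/// is the destination zone the router is started with.
fn sni_to_destination(sni: &str, dest_suffix: &str) -> anyhow::Result<String> {
    // Only the left-most label encodes the service, namespace and port.
    let label = sni.split_once('.').context("invalid SNI")?.0;
    let parts: Vec<&str> = label.splitn(3, "--").collect();
    if parts.len() != 3 {
        return Err(anyhow!("expected service--namespace--port, got {label}"));
    }
    let port: u16 = parts[2].parse().context("invalid port")?;
    Ok(format!("{}.{}.{}:{}", parts[0], parts[1], dest_suffix, port))
}

fn main() -> anyhow::Result<()> {
    let dest = sni_to_destination("aaa--bbb--1234.external.domain", "internal.domain")?;
    assert_eq!(dest, "aaa.bbb.internal.domain:1234");
    println!("{dest}");
    Ok(())
}
```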
#[tokio::main] async fn main() -> anyhow::Result<()> { - let _logging_guard = proxy::logging::init().await?; - let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook(); - let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); - - Metrics::install(Arc::new(ThreadPoolMetrics::new(0))); - - let args = cli().get_matches(); - let destination: String = args.get_one::("dest").unwrap().parse()?; - - // Configure TLS - let (tls_config, tls_server_end_point): (Arc, TlsServerEndPoint) = match ( - args.get_one::("tls-key"), - args.get_one::("tls-cert"), - ) { - (Some(key_path), Some(cert_path)) => { - let key = { - let key_bytes = std::fs::read(key_path).context("TLS key file")?; - - let mut keys = - rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..]).collect_vec(); - - ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len()); - PrivateKeyDer::Pkcs8( - keys.pop() - .unwrap() - .context(format!("Failed to read TLS keys at '{key_path}'"))?, - ) - }; - - let cert_chain_bytes = std::fs::read(cert_path) - .context(format!("Failed to read TLS cert file at '{cert_path}.'"))?; - - let cert_chain: Vec<_> = { - rustls_pemfile::certs(&mut &cert_chain_bytes[..]) - .try_collect() - .with_context(|| { - format!("Failed to read TLS certificate chain from bytes from file at '{cert_path}'.") - })? - }; - - // needed for channel bindings - let first_cert = cert_chain.first().context("missing certificate")?; - let tls_server_end_point = TlsServerEndPoint::new(first_cert)?; - - let tls_config = - rustls::ServerConfig::builder_with_provider(Arc::new(ring::default_provider())) - .with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12]) - .context("ring should support TLS1.2 and TLS1.3")? - .with_no_client_auth() - .with_single_cert(cert_chain, key)? - .into(); - - (tls_config, tls_server_end_point) - } - _ => bail!("tls-key and tls-cert must be specified"), - }; - - // Start listening for incoming client connections - let proxy_address: SocketAddr = args.get_one::("listen").unwrap().parse()?; - info!("Starting sni router on {proxy_address}"); - let proxy_listener = TcpListener::bind(proxy_address).await?; - - let cancellation_token = CancellationToken::new(); - - let main = tokio::spawn(task_main( - Arc::new(destination), - tls_config, - tls_server_end_point, - proxy_listener, - cancellation_token.clone(), - )); - let signals_task = tokio::spawn(proxy::signals::handle(cancellation_token, || {})); - - // the signal task cant ever succeed. - // the main task can error, or can succeed on cancellation. - // we want to immediately exit on either of these cases - let signal = match futures::future::select(signals_task, main).await { - Either::Left((res, _)) => proxy::error::flatten_err(res)?, - Either::Right((res, _)) => return proxy::error::flatten_err(res), - }; - - // maintenance tasks return `Infallible` success values, this is an impossible value - // so this match statically ensures that there are no possibilities for that value - match signal {} -} - -async fn task_main( - dest_suffix: Arc, - tls_config: Arc, - tls_server_end_point: TlsServerEndPoint, - listener: tokio::net::TcpListener, - cancellation_token: CancellationToken, -) -> anyhow::Result<()> { - // When set for the server socket, the keepalive setting - // will be inherited by all accepted client sockets. 
- socket2::SockRef::from(&listener).set_keepalive(true)?; - - let connections = tokio_util::task::task_tracker::TaskTracker::new(); - - while let Some(accept_result) = - run_until_cancelled(listener.accept(), &cancellation_token).await - { - let (socket, peer_addr) = accept_result?; - - let session_id = uuid::Uuid::new_v4(); - let tls_config = Arc::clone(&tls_config); - let dest_suffix = Arc::clone(&dest_suffix); - - connections.spawn( - async move { - socket - .set_nodelay(true) - .context("failed to set socket option")?; - - info!(%peer_addr, "serving"); - let ctx = RequestContext::new( - session_id, - ConnectionInfo { - addr: peer_addr, - extra: None, - }, - proxy::metrics::Protocol::SniRouter, - "sni", - ); - handle_client(ctx, dest_suffix, tls_config, tls_server_end_point, socket).await - } - .unwrap_or_else(|e| { - // Acknowledge that the task has finished with an error. - error!("per-client task finished with an error: {e:#}"); - }) - .instrument(tracing::info_span!("handle_client", ?session_id)), - ); - } - - connections.close(); - drop(listener); - - connections.wait().await; - - info!("all client connections have finished"); - Ok(()) -} - -const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)"; - -async fn ssl_handshake( - ctx: &RequestContext, - raw_stream: S, - tls_config: Arc, - tls_server_end_point: TlsServerEndPoint, -) -> anyhow::Result> { - let mut stream = PqStream::new(Stream::from_raw(raw_stream)); - - let msg = stream.read_startup_packet().await?; - use pq_proto::FeStartupPacket::*; - - match msg { - SslRequest { direct: false } => { - stream - .write_message(&pq_proto::BeMessage::EncryptionResponse(true)) - .await?; - - // Upgrade raw stream into a secure TLS-backed stream. - // NOTE: We've consumed `tls`; this fact will be used later. - - let (raw, read_buf) = stream.into_inner(); - // TODO: Normally, client doesn't send any data before - // server says TLS handshake is ok and read_buf is empty. - // However, you could imagine pipelining of postgres - // SSLRequest + TLS ClientHello in one hunk similar to - // pipelining in our node js driver. We should probably - // support that by chaining read_buf with the stream. - if !read_buf.is_empty() { - bail!("data is sent before server replied with EncryptionResponse"); - } - - Ok(Stream::Tls { - tls: Box::new( - raw.upgrade(tls_config, !ctx.has_private_peer_addr()) - .await?, - ), - tls_server_end_point, - }) - } - unexpected => { - info!( - ?unexpected, - "unexpected startup packet, rejecting connection" - ); - stream - .throw_error_str(ERR_INSECURE_CONNECTION, proxy::error::ErrorKind::User) - .await? - } - } -} - -async fn handle_client( - ctx: RequestContext, - dest_suffix: Arc, - tls_config: Arc, - tls_server_end_point: TlsServerEndPoint, - stream: impl AsyncRead + AsyncWrite + Unpin, -) -> anyhow::Result<()> { - let mut tls_stream = ssl_handshake(&ctx, stream, tls_config, tls_server_end_point).await?; - - // Cut off first part of the SNI domain - // We receive required destination details in the format of - // `{k8s_service_name}--{k8s_namespace}--{port}.non-sni-domain` - let sni = tls_stream.sni_hostname().ok_or(anyhow!("SNI missing"))?; - let dest: Vec<&str> = sni - .split_once('.') - .context("invalid SNI")? 
- .0 - .splitn(3, "--") - .collect(); - let port = dest[2].parse::().context("invalid port")?; - let destination = format!("{}.{}.{}:{}", dest[0], dest[1], dest_suffix, port); - - info!("destination: {}", destination); - - let mut client = tokio::net::TcpStream::connect(destination).await?; - - // doesn't yet matter as pg-sni-router doesn't report analytics logs - ctx.set_success(); - ctx.log_connect(); - - // Starting from here we only proxy the client's traffic. - info!("performing the proxy pass..."); - - match copy_bidirectional_client_compute(&mut tls_stream, &mut client).await { - Ok(_) => Ok(()), - Err(ErrorSource::Client(err)) => Err(err).context("client"), - Err(ErrorSource::Compute(err)) => Err(err).context("compute"), - } + proxy::binary::pg_sni_router::run().await } diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index de685a82c6..7d4b44841d 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -1,831 +1,7 @@ -use std::net::SocketAddr; -use std::pin::pin; -use std::sync::Arc; -use std::time::Duration; - -use anyhow::bail; -use futures::future::Either; -use proxy::auth::backend::jwt::JwkCache; -use proxy::auth::backend::{AuthRateLimiter, ConsoleRedirectBackend, MaybeOwned}; -use proxy::cancellation::{handle_cancel_messages, CancellationHandler}; -use proxy::config::{ - self, remote_storage_from_toml, AuthenticationConfig, CacheOptions, ComputeConfig, HttpConfig, - ProjectInfoCacheOptions, ProxyConfig, ProxyProtocolV2, -}; -use proxy::context::parquet::ParquetUploadArgs; -use proxy::http::health_server::AppMetrics; -use proxy::metrics::Metrics; -use proxy::rate_limiter::{ - EndpointRateLimiter, LeakyBucketConfig, RateBucketInfo, WakeComputeRateLimiter, -}; -use proxy::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider; -use proxy::redis::kv_ops::RedisKVClient; -use proxy::redis::{elasticache, notifications}; -use proxy::scram::threadpool::ThreadPool; -use proxy::serverless::cancel_set::CancelSet; -use proxy::serverless::GlobalConnPoolOptions; -use proxy::tls::client_config::compute_client_config_with_root_certs; -use proxy::{auth, control_plane, http, serverless, usage_metrics}; -use remote_storage::RemoteStorageConfig; -use tokio::net::TcpListener; -use tokio::task::JoinSet; -use tokio_util::sync::CancellationToken; -use tracing::{info, warn, Instrument}; -use utils::sentry_init::init_sentry; -use utils::{project_build_tag, project_git_version}; - -project_git_version!(GIT_VERSION); -project_build_tag!(BUILD_TAG); - -use clap::{Parser, ValueEnum}; - #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; -#[derive(Clone, Debug, ValueEnum)] -enum AuthBackendType { - #[value(name("cplane-v1"), alias("control-plane"))] - ControlPlaneV1, - - #[value(name("link"), alias("control-redirect"))] - ConsoleRedirect, - - #[cfg(feature = "testing")] - Postgres, -} - -/// Neon proxy/router -#[derive(Parser)] -#[command(version = GIT_VERSION, about)] -struct ProxyCliArgs { - /// Name of the region this proxy is deployed in - #[clap(long, default_value_t = String::new())] - region: String, - /// listen for incoming client connections on ip:port - #[clap(short, long, default_value = "127.0.0.1:4432")] - proxy: String, - #[clap(value_enum, long, default_value_t = AuthBackendType::ConsoleRedirect)] - auth_backend: AuthBackendType, - /// listen for management callback connection on ip:port - #[clap(short, long, default_value = "127.0.0.1:7000")] - mgmt: String, - /// listen for incoming http connections 
(metrics, etc) on ip:port - #[clap(long, default_value = "127.0.0.1:7001")] - http: String, - /// listen for incoming wss connections on ip:port - #[clap(long)] - wss: Option, - /// redirect unauthenticated users to the given uri in case of console redirect auth - #[clap(short, long, default_value = "http://localhost:3000/psql_session/")] - uri: String, - /// cloud API endpoint for authenticating users - #[clap( - short, - long, - default_value = "http://localhost:3000/authenticate_proxy_request/" - )] - auth_endpoint: String, - /// JWT used to connect to control plane. - #[clap( - long, - value_name = "JWT", - default_value = "", - env = "NEON_PROXY_TO_CONTROLPLANE_TOKEN" - )] - control_plane_token: Arc, - /// if this is not local proxy, this toggles whether we accept jwt or passwords for http - #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] - is_auth_broker: bool, - /// path to TLS key for client postgres connections - /// - /// tls-key and tls-cert are for backwards compatibility, we can put all certs in one dir - #[clap(short = 'k', long, alias = "ssl-key")] - tls_key: Option, - /// path to TLS cert for client postgres connections - /// - /// tls-key and tls-cert are for backwards compatibility, we can put all certs in one dir - #[clap(short = 'c', long, alias = "ssl-cert")] - tls_cert: Option, - /// Allow writing TLS session keys to the given file pointed to by the environment variable `SSLKEYLOGFILE`. - #[clap(long, alias = "allow-ssl-keylogfile")] - allow_tls_keylogfile: bool, - /// path to directory with TLS certificates for client postgres connections - #[clap(long)] - certs_dir: Option, - /// timeout for the TLS handshake - #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] - handshake_timeout: tokio::time::Duration, - /// http endpoint to receive periodic metric updates - #[clap(long)] - metric_collection_endpoint: Option, - /// how often metrics should be sent to a collection endpoint - #[clap(long)] - metric_collection_interval: Option, - /// cache for `wake_compute` api method (use `size=0` to disable) - #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] - wake_compute_cache: String, - /// lock for `wake_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable). - #[clap(long, default_value = config::ConcurrencyLockOptions::DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK)] - wake_compute_lock: String, - /// lock for `connect_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable). - #[clap(long, default_value = config::ConcurrencyLockOptions::DEFAULT_OPTIONS_CONNECT_COMPUTE_LOCK)] - connect_compute_lock: String, - #[clap(flatten)] - sql_over_http: SqlOverHttpArgs, - /// timeout for scram authentication protocol - #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] - scram_protocol_timeout: tokio::time::Duration, - /// size of the threadpool for password hashing - #[clap(long, default_value_t = 4)] - scram_thread_pool_size: u8, - /// Endpoint rate limiter max number of requests per second. - /// - /// Provided in the form `@`. - /// Can be given multiple times for different bucket sizes. - #[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)] - endpoint_rps_limit: Vec, - /// Wake compute rate limiter max number of requests per second. 
- #[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)] - wake_compute_limit: Vec, - /// Whether the auth rate limiter actually takes effect (for testing) - #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] - auth_rate_limit_enabled: bool, - /// Authentication rate limiter max number of hashes per second. - #[clap(long, default_values_t = RateBucketInfo::DEFAULT_AUTH_SET)] - auth_rate_limit: Vec, - /// The IP subnet to use when considering whether two IP addresses are considered the same. - #[clap(long, default_value_t = 64)] - auth_rate_limit_ip_subnet: u8, - /// Redis rate limiter max number of requests per second. - #[clap(long, default_values_t = RateBucketInfo::DEFAULT_REDIS_SET)] - redis_rps_limit: Vec, - /// Cancellation channel size (max queue size for redis kv client) - #[clap(long, default_value = "1024")] - cancellation_ch_size: usize, - /// cache for `allowed_ips` (use `size=0` to disable) - #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] - allowed_ips_cache: String, - /// cache for `role_secret` (use `size=0` to disable) - #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] - role_secret_cache: String, - /// redis url for notifications (if empty, redis_host:port will be used for both notifications and streaming connections) - #[clap(long)] - redis_notifications: Option, - /// what from the available authentications type to use for the regional redis we have. Supported are "irsa" and "plain". - #[clap(long, default_value = "irsa")] - redis_auth_type: String, - /// redis host for streaming connections (might be different from the notifications host) - #[clap(long)] - redis_host: Option, - /// redis port for streaming connections (might be different from the notifications host) - #[clap(long)] - redis_port: Option, - /// redis cluster name, used in aws elasticache - #[clap(long)] - redis_cluster_name: Option, - /// redis user_id, used in aws elasticache - #[clap(long)] - redis_user_id: Option, - /// aws region to retrieve credentials - #[clap(long, default_value_t = String::new())] - aws_region: String, - /// cache for `project_info` (use `size=0` to disable) - #[clap(long, default_value = config::ProjectInfoCacheOptions::CACHE_DEFAULT_OPTIONS)] - project_info_cache: String, - /// cache for all valid endpoints - #[clap(long, default_value = config::EndpointCacheConfig::CACHE_DEFAULT_OPTIONS)] - endpoint_cache_config: String, - #[clap(flatten)] - parquet_upload: ParquetUploadArgs, - - /// interval for backup metric collection - #[clap(long, default_value = "10m", value_parser = humantime::parse_duration)] - metric_backup_collection_interval: std::time::Duration, - /// remote storage configuration for backup metric collection - /// Encoded as toml (same format as pageservers), eg - /// `{bucket_name='the-bucket',bucket_region='us-east-1',prefix_in_bucket='proxy',endpoint='http://minio:9000'}` - #[clap(long, value_parser = remote_storage_from_toml)] - metric_backup_collection_remote_storage: Option, - /// chunk size for backup metric collection - /// Size of each event is no more than 400 bytes, so 2**22 is about 200MB before the compression. 
- #[clap(long, default_value = "4194304")] - metric_backup_collection_chunk_size: usize, - /// Whether to retry the connection to the compute node - #[clap(long, default_value = config::RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES)] - connect_to_compute_retry: String, - /// Whether to retry the wake_compute request - #[clap(long, default_value = config::RetryConfig::WAKE_COMPUTE_DEFAULT_VALUES)] - wake_compute_retry: String, - - /// Configure if this is a private access proxy for the POC: In that case the proxy will ignore the IP allowlist - #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] - is_private_access_proxy: bool, - - /// Configure whether all incoming requests have a Proxy Protocol V2 packet. - // TODO(conradludgate): switch default to rejected or required once we've updated all deployments - #[clap(value_enum, long, default_value_t = ProxyProtocolV2::Supported)] - proxy_protocol_v2: ProxyProtocolV2, - - /// Time the proxy waits for the webauth session to be confirmed by the control plane. - // TODO: rename to `console_redirect_confirmation_timeout`. - #[clap(long, default_value = "2m", value_parser = humantime::parse_duration)] - webauth_confirmation_timeout: std::time::Duration, -} - -#[derive(clap::Args, Clone, Copy, Debug)] -struct SqlOverHttpArgs { - /// timeout for http connection requests - #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] - sql_over_http_timeout: tokio::time::Duration, - - /// Whether the SQL over http pool is opt-in - #[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] - sql_over_http_pool_opt_in: bool, - - /// How many connections to pool for each endpoint. Excess connections are discarded - #[clap(long, default_value_t = 20)] - sql_over_http_pool_max_conns_per_endpoint: usize, - - /// How many connections to pool for each endpoint. Excess connections are discarded - #[clap(long, default_value_t = 20000)] - sql_over_http_pool_max_total_conns: usize, - - /// How long pooled connections should remain idle for before closing - #[clap(long, default_value = "5m", value_parser = humantime::parse_duration)] - sql_over_http_idle_timeout: tokio::time::Duration, - - /// Duration each shard will wait on average before a GC sweep. - /// A longer time will causes sweeps to take longer but will interfere less frequently. - #[clap(long, default_value = "10m", value_parser = humantime::parse_duration)] - sql_over_http_pool_gc_epoch: tokio::time::Duration, - - /// How many shards should the global pool have. Must be a power of two. 
- /// More shards will introduce less contention for pool operations, but can - /// increase memory used by the pool - #[clap(long, default_value_t = 128)] - sql_over_http_pool_shards: usize, - - #[clap(long, default_value_t = 10000)] - sql_over_http_client_conn_threshold: u64, - - #[clap(long, default_value_t = 64)] - sql_over_http_cancel_set_shards: usize, - - #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB - sql_over_http_max_request_size_bytes: usize, - - #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB - sql_over_http_max_response_size_bytes: usize, -} - #[tokio::main] async fn main() -> anyhow::Result<()> { - let _logging_guard = proxy::logging::init().await?; - let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook(); - let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); - - // TODO: refactor these to use labels - info!("Version: {GIT_VERSION}"); - info!("Build_tag: {BUILD_TAG}"); - let neon_metrics = ::metrics::NeonMetrics::new(::metrics::BuildInfo { - revision: GIT_VERSION, - build_tag: BUILD_TAG, - }); - - let jemalloc = match proxy::jemalloc::MetricRecorder::new() { - Ok(t) => Some(t), - Err(e) => { - tracing::error!(error = ?e, "could not start jemalloc metrics loop"); - None - } - }; - - let args = ProxyCliArgs::parse(); - let config = build_config(&args)?; - let auth_backend = build_auth_backend(&args)?; - - match auth_backend { - Either::Left(auth_backend) => info!("Authentication backend: {auth_backend}"), - Either::Right(auth_backend) => info!("Authentication backend: {auth_backend:?}"), - }; - info!("Using region: {}", args.aws_region); - - // TODO: untangle the config args - let regional_redis_client = match (args.redis_auth_type.as_str(), &args.redis_notifications) { - ("plain", redis_url) => match redis_url { - None => { - bail!("plain auth requires redis_notifications to be set"); - } - Some(url) => Some( - ConnectionWithCredentialsProvider::new_with_static_credentials(url.to_string()), - ), - }, - ("irsa", _) => match (&args.redis_host, args.redis_port) { - (Some(host), Some(port)) => Some( - ConnectionWithCredentialsProvider::new_with_credentials_provider( - host.to_string(), - port, - elasticache::CredentialsProvider::new( - args.aws_region, - args.redis_cluster_name, - args.redis_user_id, - ) - .await, - ), - ), - (None, None) => { - warn!("irsa auth requires redis-host and redis-port to be set, continuing without regional_redis_client"); - None - } - _ => { - bail!("redis-host and redis-port must be specified together"); - } - }, - _ => { - bail!("unknown auth type given"); - } - }; - - let redis_notifications_client = if let Some(url) = args.redis_notifications { - Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url.to_string())) - } else { - regional_redis_client.clone() - }; - - // Check that we can bind to address before further initialization - let http_address: SocketAddr = args.http.parse()?; - info!("Starting http on {http_address}"); - let http_listener = TcpListener::bind(http_address).await?.into_std()?; - - let mgmt_address: SocketAddr = args.mgmt.parse()?; - info!("Starting mgmt on {mgmt_address}"); - let mgmt_listener = TcpListener::bind(mgmt_address).await?; - - let proxy_listener = if !args.is_auth_broker { - let proxy_address: SocketAddr = args.proxy.parse()?; - info!("Starting proxy on {proxy_address}"); - - Some(TcpListener::bind(proxy_address).await?) - } else { - None - }; - - // TODO: rename the argument to something like serverless. 
- // It now covers more than just websockets, it also covers SQL over HTTP. - let serverless_listener = if let Some(serverless_address) = args.wss { - let serverless_address: SocketAddr = serverless_address.parse()?; - info!("Starting wss on {serverless_address}"); - Some(TcpListener::bind(serverless_address).await?) - } else if args.is_auth_broker { - bail!("wss arg must be present for auth-broker") - } else { - None - }; - - let cancellation_token = CancellationToken::new(); - - let redis_rps_limit = Vec::leak(args.redis_rps_limit.clone()); - RateBucketInfo::validate(redis_rps_limit)?; - - let redis_kv_client = regional_redis_client - .as_ref() - .map(|redis_publisher| RedisKVClient::new(redis_publisher.clone(), redis_rps_limit)); - - // channel size should be higher than redis client limit to avoid blocking - let cancel_ch_size = args.cancellation_ch_size; - let (tx_cancel, rx_cancel) = tokio::sync::mpsc::channel(cancel_ch_size); - let cancellation_handler = Arc::new(CancellationHandler::new( - &config.connect_to_compute, - Some(tx_cancel), - )); - - // bit of a hack - find the min rps and max rps supported and turn it into - // leaky bucket config instead - let max = args - .endpoint_rps_limit - .iter() - .map(|x| x.rps()) - .max_by(f64::total_cmp) - .unwrap_or(EndpointRateLimiter::DEFAULT.max); - let rps = args - .endpoint_rps_limit - .iter() - .map(|x| x.rps()) - .min_by(f64::total_cmp) - .unwrap_or(EndpointRateLimiter::DEFAULT.rps); - let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards( - LeakyBucketConfig { rps, max }, - 64, - )); - - // client facing tasks. these will exit on error or on cancellation - // cancellation returns Ok(()) - let mut client_tasks = JoinSet::new(); - match auth_backend { - Either::Left(auth_backend) => { - if let Some(proxy_listener) = proxy_listener { - client_tasks.spawn(proxy::proxy::task_main( - config, - auth_backend, - proxy_listener, - cancellation_token.clone(), - cancellation_handler.clone(), - endpoint_rate_limiter.clone(), - )); - } - - if let Some(serverless_listener) = serverless_listener { - client_tasks.spawn(serverless::task_main( - config, - auth_backend, - serverless_listener, - cancellation_token.clone(), - cancellation_handler.clone(), - endpoint_rate_limiter.clone(), - )); - } - } - Either::Right(auth_backend) => { - if let Some(proxy_listener) = proxy_listener { - client_tasks.spawn(proxy::console_redirect_proxy::task_main( - config, - auth_backend, - proxy_listener, - cancellation_token.clone(), - cancellation_handler.clone(), - )); - } - } - } - - client_tasks.spawn(proxy::context::parquet::worker( - cancellation_token.clone(), - args.parquet_upload, - )); - - // maintenance tasks. these never return unless there's an error - let mut maintenance_tasks = JoinSet::new(); - maintenance_tasks.spawn(proxy::signals::handle(cancellation_token.clone(), || {})); - maintenance_tasks.spawn(http::health_server::task_main( - http_listener, - AppMetrics { - jemalloc, - neon_metrics, - proxy: proxy::metrics::Metrics::get(), - }, - )); - maintenance_tasks.spawn(control_plane::mgmt::task_main(mgmt_listener)); - - if let Some(metrics_config) = &config.metric_collection { - // TODO: Add gc regardles of the metric collection being enabled. 
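The hunk above wires cancellation through a bounded mpsc channel whose capacity (`--cancellation-ch-size`, default 1024) is kept above the Redis rate limit so enqueueing a cancel key does not stall client-facing tasks. A rough sketch of that producer/consumer shape, with plain types standing in for `CancellationHandler` and `RedisKVClient`:

```rust
use tokio::sync::mpsc;

#[tokio::main]
async fn main() {
    // Bounded queue: producers only block if the consumer falls this far behind.
    let (tx_cancel, mut rx_cancel) = mpsc::channel::<String>(1024);

    // Consumer: stands in for handle_cancel_messages + the rate-limited Redis writer.
    let consumer = tokio::spawn(async move {
        while let Some(key) = rx_cancel.recv().await {
            println!("forwarding cancel key {key}");
        }
    });

    // Producer: stands in for a per-connection cancellation handler.
    tx_cancel.send("cancel:1234".to_string()).await.unwrap();
    drop(tx_cancel); // closing all senders lets the consumer exit

    consumer.await.unwrap();
}
```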
- maintenance_tasks.spawn(usage_metrics::task_main(metrics_config)); - } - - if let Either::Left(auth::Backend::ControlPlane(api, _)) = &auth_backend { - if let proxy::control_plane::client::ControlPlaneClient::ProxyV1(api) = &**api { - match (redis_notifications_client, regional_redis_client.clone()) { - (None, None) => {} - (client1, client2) => { - let cache = api.caches.project_info.clone(); - if let Some(client) = client1 { - maintenance_tasks.spawn(notifications::task_main( - client, - cache.clone(), - args.region.clone(), - )); - } - if let Some(client) = client2 { - maintenance_tasks.spawn(notifications::task_main( - client, - cache.clone(), - args.region.clone(), - )); - } - maintenance_tasks.spawn(async move { cache.clone().gc_worker().await }); - } - } - - if let Some(mut redis_kv_client) = redis_kv_client { - maintenance_tasks.spawn(async move { - redis_kv_client.try_connect().await?; - handle_cancel_messages(&mut redis_kv_client, rx_cancel).await - }); - } - - if let Some(regional_redis_client) = regional_redis_client { - let cache = api.caches.endpoints_cache.clone(); - let con = regional_redis_client; - let span = tracing::info_span!("endpoints_cache"); - maintenance_tasks.spawn( - async move { cache.do_read(con, cancellation_token.clone()).await } - .instrument(span), - ); - } - } - } - - let maintenance = loop { - // get one complete task - match futures::future::select( - pin!(maintenance_tasks.join_next()), - pin!(client_tasks.join_next()), - ) - .await - { - // exit immediately on maintenance task completion - Either::Left((Some(res), _)) => break proxy::error::flatten_err(res)?, - // exit with error immediately if all maintenance tasks have ceased (should be caught by branch above) - Either::Left((None, _)) => bail!("no maintenance tasks running. invalid state"), - // exit immediately on client task error - Either::Right((Some(res), _)) => proxy::error::flatten_err(res)?, - // exit if all our client tasks have shutdown gracefully - Either::Right((None, _)) => return Ok(()), - } - }; - - // maintenance tasks return Infallible success values, this is an impossible value - // so this match statically ensures that there are no possibilities for that value - match maintenance {} -} - -/// ProxyConfig is created at proxy startup, and lives forever. 
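The `match maintenance {}` at the end of the removed main relies on maintenance tasks being typed as returning `Infallible` on success, a value that can never be constructed. A compact sketch of why that compiles, using anyhow as the surrounding code does:

```rust
use std::convert::Infallible;

// A task that can only ever fail: its Ok type is uninhabited.
fn maintenance_task() -> Result<Infallible, anyhow::Error> {
    // A real task would loop forever; here it fails immediately.
    anyhow::bail!("health server exited unexpectedly")
}

fn main() -> anyhow::Result<()> {
    let never: Infallible = maintenance_task()?;
    // No arms needed: Infallible has no variants, so this match is
    // statically exhaustive and this point is provably unreachable.
    match never {}
}
```

The empty match is exhaustive by construction, so the only ways out of the loop are an error from a maintenance task or the graceful client-task shutdown handled in the branch above it.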
-fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { - let thread_pool = ThreadPool::new(args.scram_thread_pool_size); - Metrics::install(thread_pool.metrics.clone()); - - let tls_config = match (&args.tls_key, &args.tls_cert) { - (Some(key_path), Some(cert_path)) => Some(config::configure_tls( - key_path, - cert_path, - args.certs_dir.as_ref(), - args.allow_tls_keylogfile, - )?), - (None, None) => None, - _ => bail!("either both or neither tls-key and tls-cert must be specified"), - }; - - let backup_metric_collection_config = config::MetricBackupCollectionConfig { - interval: args.metric_backup_collection_interval, - remote_storage_config: args.metric_backup_collection_remote_storage.clone(), - chunk_size: args.metric_backup_collection_chunk_size, - }; - - let metric_collection = match ( - &args.metric_collection_endpoint, - &args.metric_collection_interval, - ) { - (Some(endpoint), Some(interval)) => Some(config::MetricCollectionConfig { - endpoint: endpoint.parse()?, - interval: humantime::parse_duration(interval)?, - backup_metric_collection_config, - }), - (None, None) => None, - _ => bail!( - "either both or neither metric-collection-endpoint \ - and metric-collection-interval must be specified" - ), - }; - - let config::ConcurrencyLockOptions { - shards, - limiter, - epoch, - timeout, - } = args.connect_compute_lock.parse()?; - info!( - ?limiter, - shards, - ?epoch, - "Using NodeLocks (connect_compute)" - ); - let connect_compute_locks = control_plane::locks::ApiLocks::new( - "connect_compute_lock", - limiter, - shards, - timeout, - epoch, - &Metrics::get().proxy.connect_compute_lock, - )?; - - let http_config = HttpConfig { - accept_websockets: !args.is_auth_broker, - pool_options: GlobalConnPoolOptions { - max_conns_per_endpoint: args.sql_over_http.sql_over_http_pool_max_conns_per_endpoint, - gc_epoch: args.sql_over_http.sql_over_http_pool_gc_epoch, - pool_shards: args.sql_over_http.sql_over_http_pool_shards, - idle_timeout: args.sql_over_http.sql_over_http_idle_timeout, - opt_in: args.sql_over_http.sql_over_http_pool_opt_in, - max_total_conns: args.sql_over_http.sql_over_http_pool_max_total_conns, - }, - cancel_set: CancelSet::new(args.sql_over_http.sql_over_http_cancel_set_shards), - client_conn_threshold: args.sql_over_http.sql_over_http_client_conn_threshold, - max_request_size_bytes: args.sql_over_http.sql_over_http_max_request_size_bytes, - max_response_size_bytes: args.sql_over_http.sql_over_http_max_response_size_bytes, - }; - let authentication_config = AuthenticationConfig { - jwks_cache: JwkCache::default(), - thread_pool, - scram_protocol_timeout: args.scram_protocol_timeout, - rate_limiter_enabled: args.auth_rate_limit_enabled, - rate_limiter: AuthRateLimiter::new(args.auth_rate_limit.clone()), - rate_limit_ip_subnet: args.auth_rate_limit_ip_subnet, - ip_allowlist_check_enabled: !args.is_private_access_proxy, - is_vpc_acccess_proxy: args.is_private_access_proxy, - is_auth_broker: args.is_auth_broker, - accept_jwts: args.is_auth_broker, - console_redirect_confirmation_timeout: args.webauth_confirmation_timeout, - }; - - let compute_config = ComputeConfig { - retry: config::RetryConfig::parse(&args.connect_to_compute_retry)?, - tls: Arc::new(compute_client_config_with_root_certs()?), - timeout: Duration::from_secs(2), - }; - - let config = ProxyConfig { - tls_config, - metric_collection, - http_config, - authentication_config, - proxy_protocol_v2: args.proxy_protocol_v2, - handshake_timeout: args.handshake_timeout, - region: 
args.region.clone(), - wake_compute_retry_config: config::RetryConfig::parse(&args.wake_compute_retry)?, - connect_compute_locks, - connect_to_compute: compute_config, - }; - - let config = Box::leak(Box::new(config)); - - tokio::spawn(config.connect_compute_locks.garbage_collect_worker()); - - Ok(config) -} - -/// auth::Backend is created at proxy startup, and lives forever. -fn build_auth_backend( - args: &ProxyCliArgs, -) -> anyhow::Result, &'static ConsoleRedirectBackend>> { - match &args.auth_backend { - AuthBackendType::ControlPlaneV1 => { - let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; - let project_info_cache_config: ProjectInfoCacheOptions = - args.project_info_cache.parse()?; - let endpoint_cache_config: config::EndpointCacheConfig = - args.endpoint_cache_config.parse()?; - - info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}"); - info!( - "Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}" - ); - info!("Using EndpointCacheConfig with options={endpoint_cache_config:?}"); - let caches = Box::leak(Box::new(control_plane::caches::ApiCaches::new( - wake_compute_cache_config, - project_info_cache_config, - endpoint_cache_config, - ))); - - let config::ConcurrencyLockOptions { - shards, - limiter, - epoch, - timeout, - } = args.wake_compute_lock.parse()?; - info!(?limiter, shards, ?epoch, "Using NodeLocks (wake_compute)"); - let locks = Box::leak(Box::new(control_plane::locks::ApiLocks::new( - "wake_compute_lock", - limiter, - shards, - timeout, - epoch, - &Metrics::get().wake_compute_lock, - )?)); - tokio::spawn(locks.garbage_collect_worker()); - - let url: proxy::url::ApiUrl = args.auth_endpoint.parse()?; - - let endpoint = http::Endpoint::new(url, http::new_client()); - - let mut wake_compute_rps_limit = args.wake_compute_limit.clone(); - RateBucketInfo::validate(&mut wake_compute_rps_limit)?; - let wake_compute_endpoint_rate_limiter = - Arc::new(WakeComputeRateLimiter::new(wake_compute_rps_limit)); - - let api = control_plane::client::cplane_proxy_v1::NeonControlPlaneClient::new( - endpoint, - args.control_plane_token.clone(), - caches, - locks, - wake_compute_endpoint_rate_limiter, - ); - - let api = control_plane::client::ControlPlaneClient::ProxyV1(api); - let auth_backend = auth::Backend::ControlPlane(MaybeOwned::Owned(api), ()); - let config = Box::leak(Box::new(auth_backend)); - - Ok(Either::Left(config)) - } - - #[cfg(feature = "testing")] - AuthBackendType::Postgres => { - let url = args.auth_endpoint.parse()?; - let api = control_plane::client::mock::MockControlPlane::new( - url, - !args.is_private_access_proxy, - ); - let api = control_plane::client::ControlPlaneClient::PostgresMock(api); - - let auth_backend = auth::Backend::ControlPlane(MaybeOwned::Owned(api), ()); - - let config = Box::leak(Box::new(auth_backend)); - - Ok(Either::Left(config)) - } - - AuthBackendType::ConsoleRedirect => { - let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; - let project_info_cache_config: ProjectInfoCacheOptions = - args.project_info_cache.parse()?; - let endpoint_cache_config: config::EndpointCacheConfig = - args.endpoint_cache_config.parse()?; - - info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}"); - info!( - "Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}" - ); - info!("Using EndpointCacheConfig with options={endpoint_cache_config:?}"); - let caches = 
Box::leak(Box::new(control_plane::caches::ApiCaches::new( - wake_compute_cache_config, - project_info_cache_config, - endpoint_cache_config, - ))); - - let config::ConcurrencyLockOptions { - shards, - limiter, - epoch, - timeout, - } = args.wake_compute_lock.parse()?; - info!(?limiter, shards, ?epoch, "Using NodeLocks (wake_compute)"); - let locks = Box::leak(Box::new(control_plane::locks::ApiLocks::new( - "wake_compute_lock", - limiter, - shards, - timeout, - epoch, - &Metrics::get().wake_compute_lock, - )?)); - - let url = args.uri.clone().parse()?; - let ep_url: proxy::url::ApiUrl = args.auth_endpoint.parse()?; - let endpoint = http::Endpoint::new(ep_url, http::new_client()); - let mut wake_compute_rps_limit = args.wake_compute_limit.clone(); - RateBucketInfo::validate(&mut wake_compute_rps_limit)?; - let wake_compute_endpoint_rate_limiter = - Arc::new(WakeComputeRateLimiter::new(wake_compute_rps_limit)); - - // Since we use only get_allowed_ips_and_secret() wake_compute_endpoint_rate_limiter - // and locks are not used in ConsoleRedirectBackend, - // but they are required by the NeonControlPlaneClient - let api = control_plane::client::cplane_proxy_v1::NeonControlPlaneClient::new( - endpoint, - args.control_plane_token.clone(), - caches, - locks, - wake_compute_endpoint_rate_limiter, - ); - - let backend = ConsoleRedirectBackend::new(url, api); - let config = Box::leak(Box::new(backend)); - - Ok(Either::Right(config)) - } - } -} - -#[cfg(test)] -mod tests { - use std::time::Duration; - - use clap::Parser; - use proxy::rate_limiter::RateBucketInfo; - - #[test] - fn parse_endpoint_rps_limit() { - let config = super::ProxyCliArgs::parse_from([ - "proxy", - "--endpoint-rps-limit", - "100@1s", - "--endpoint-rps-limit", - "20@30s", - ]); - - assert_eq!( - config.endpoint_rps_limit, - vec![ - RateBucketInfo::new(100, Duration::from_secs(1)), - RateBucketInfo::new(20, Duration::from_secs(30)), - ] - ); - } + proxy::binary::proxy::run().await } diff --git a/proxy/src/binary/local_proxy.rs b/proxy/src/binary/local_proxy.rs new file mode 100644 index 0000000000..e0d8515375 --- /dev/null +++ b/proxy/src/binary/local_proxy.rs @@ -0,0 +1,410 @@ +use std::net::SocketAddr; +use std::pin::pin; +use std::str::FromStr; +use std::sync::Arc; +use std::time::Duration; + +use crate::auth::backend::jwt::JwkCache; +use crate::auth::backend::local::{LocalBackend, JWKS_ROLE_MAP}; +use crate::auth::{self}; +use crate::cancellation::CancellationHandler; +use crate::config::{ + self, AuthenticationConfig, ComputeConfig, HttpConfig, ProxyConfig, RetryConfig, +}; +use crate::control_plane::locks::ApiLocks; +use crate::control_plane::messages::{EndpointJwksResponse, JwksSettings}; +use crate::http::health_server::AppMetrics; +use crate::intern::RoleNameInt; +use crate::metrics::{Metrics, ThreadPoolMetrics}; +use crate::rate_limiter::{ + BucketRateLimiter, EndpointRateLimiter, LeakyBucketConfig, RateBucketInfo, +}; +use crate::scram::threadpool::ThreadPool; +use crate::serverless::cancel_set::CancelSet; +use crate::serverless::{self, GlobalConnPoolOptions}; +use crate::tls::client_config::compute_client_config_with_root_certs; +use crate::types::RoleName; +use crate::url::ApiUrl; +use anyhow::{bail, ensure, Context}; +use camino::{Utf8Path, Utf8PathBuf}; +use compute_api::spec::LocalProxySpec; +use futures::future::Either; + +project_git_version!(GIT_VERSION); +project_build_tag!(BUILD_TAG); + +use clap::Parser; +use thiserror::Error; +use tokio::net::TcpListener; +use tokio::sync::Notify; +use tokio::task::JoinSet; 
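Both the removed `build_config`/`build_auth_backend` above and the new binaries below hand out configuration as `&'static` references by leaking a one-time allocation at startup. A minimal sketch of that pattern with an illustrative `Config` type:

```rust
// Sketch of the startup pattern: build the config once, leak it, and pass
// the resulting &'static reference into any number of spawned tasks.
// The memory is intentionally never freed; it lives as long as the process.
struct Config {
    region: String,
}

fn build_config(region: &str) -> &'static Config {
    Box::leak(Box::new(Config {
        region: region.to_string(),
    }))
}

fn main() {
    let config: &'static Config = build_config("local");
    // A &'static reference is Copy, so it moves into threads/tasks freely.
    let handle = std::thread::spawn(move || println!("region = {}", config.region));
    handle.join().unwrap();
}
```

The leak is deliberate: a plain `&'static` reference is cheaper to thread through spawned tasks than an `Arc`, and the config never needs to be dropped.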
+use tokio_util::sync::CancellationToken; +use tracing::{debug, error, info, warn}; +use utils::sentry_init::init_sentry; +use utils::{pid_file, project_build_tag, project_git_version}; + +/// Neon proxy/router +#[derive(Parser)] +#[command(version = GIT_VERSION, about)] +struct LocalProxyCliArgs { + /// listen for incoming metrics connections on ip:port + #[clap(long, default_value = "127.0.0.1:7001")] + metrics: String, + /// listen for incoming http connections on ip:port + #[clap(long)] + http: String, + /// timeout for the TLS handshake + #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] + handshake_timeout: tokio::time::Duration, + /// lock for `connect_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable). + #[clap(long, default_value = config::ConcurrencyLockOptions::DEFAULT_OPTIONS_CONNECT_COMPUTE_LOCK)] + connect_compute_lock: String, + #[clap(flatten)] + sql_over_http: SqlOverHttpArgs, + /// User rate limiter max number of requests per second. + /// + /// Provided in the form `@`. + /// Can be given multiple times for different bucket sizes. + #[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)] + user_rps_limit: Vec, + /// Whether the auth rate limiter actually takes effect (for testing) + #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] + auth_rate_limit_enabled: bool, + /// Authentication rate limiter max number of hashes per second. + #[clap(long, default_values_t = RateBucketInfo::DEFAULT_AUTH_SET)] + auth_rate_limit: Vec, + /// The IP subnet to use when considering whether two IP addresses are considered the same. + #[clap(long, default_value_t = 64)] + auth_rate_limit_ip_subnet: u8, + /// Whether to retry the connection to the compute node + #[clap(long, default_value = config::RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES)] + connect_to_compute_retry: String, + /// Address of the postgres server + #[clap(long, default_value = "127.0.0.1:5432")] + postgres: SocketAddr, + /// Address of the internal compute-ctl api service + #[clap(long, default_value = "http://127.0.0.1:3081/")] + compute_ctl: ApiUrl, + /// Path of the local proxy config file + #[clap(long, default_value = "./local_proxy.json")] + config_path: Utf8PathBuf, + /// Path of the local proxy PID file + #[clap(long, default_value = "./local_proxy.pid")] + pid_path: Utf8PathBuf, +} + +#[derive(clap::Args, Clone, Copy, Debug)] +struct SqlOverHttpArgs { + /// How many connections to pool for each endpoint. 
Excess connections are discarded + #[clap(long, default_value_t = 200)] + sql_over_http_pool_max_total_conns: usize, + + /// How long pooled connections should remain idle for before closing + #[clap(long, default_value = "5m", value_parser = humantime::parse_duration)] + sql_over_http_idle_timeout: tokio::time::Duration, + + #[clap(long, default_value_t = 100)] + sql_over_http_client_conn_threshold: u64, + + #[clap(long, default_value_t = 16)] + sql_over_http_cancel_set_shards: usize, + + #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB + sql_over_http_max_request_size_bytes: usize, + + #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB + sql_over_http_max_response_size_bytes: usize, +} + +pub async fn run() -> anyhow::Result<()> { + let _logging_guard = crate::logging::init_local_proxy()?; + let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook(); + let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); + + Metrics::install(Arc::new(ThreadPoolMetrics::new(0))); + + // TODO: refactor these to use labels + debug!("Version: {GIT_VERSION}"); + debug!("Build_tag: {BUILD_TAG}"); + let neon_metrics = ::metrics::NeonMetrics::new(::metrics::BuildInfo { + revision: GIT_VERSION, + build_tag: BUILD_TAG, + }); + + let jemalloc = match crate::jemalloc::MetricRecorder::new() { + Ok(t) => Some(t), + Err(e) => { + tracing::error!(error = ?e, "could not start jemalloc metrics loop"); + None + } + }; + + let args = LocalProxyCliArgs::parse(); + let config = build_config(&args)?; + let auth_backend = build_auth_backend(&args); + + // before we bind to any ports, write the process ID to a file + // so that compute-ctl can find our process later + // in order to trigger the appropriate SIGHUP on config change. + // + // This also claims a "lock" that makes sure only one instance + // of local_proxy runs at a time. + let _process_guard = loop { + match pid_file::claim_for_current_process(&args.pid_path) { + Ok(guard) => break guard, + Err(e) => { + // compute-ctl might have tried to read the pid-file to let us + // know about some config change. We should try again. + error!(path=?args.pid_path, "could not claim PID file guard: {e:?}"); + tokio::time::sleep(Duration::from_secs(1)).await; + } + } + }; + + let metrics_listener = TcpListener::bind(args.metrics).await?.into_std()?; + let http_listener = TcpListener::bind(args.http).await?; + let shutdown = CancellationToken::new(); + + // todo: should scale with CU + let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards( + LeakyBucketConfig { + rps: 10.0, + max: 100.0, + }, + 16, + )); + + let mut maintenance_tasks = JoinSet::new(); + + let refresh_config_notify = Arc::new(Notify::new()); + maintenance_tasks.spawn(crate::signals::handle(shutdown.clone(), { + let refresh_config_notify = Arc::clone(&refresh_config_notify); + move || { + refresh_config_notify.notify_one(); + } + })); + + // trigger the first config load **after** setting up the signal hook + // to avoid the race condition where: + // 1. No config file registered when local_proxy starts up + // 2. The config file is written but the signal hook is not yet received + // 3. local_proxy completes startup but has no config loaded, despite there being a registerd config. 
+ refresh_config_notify.notify_one(); + tokio::spawn(refresh_config_loop(args.config_path, refresh_config_notify)); + + maintenance_tasks.spawn(crate::http::health_server::task_main( + metrics_listener, + AppMetrics { + jemalloc, + neon_metrics, + proxy: crate::metrics::Metrics::get(), + }, + )); + + let task = serverless::task_main( + config, + auth_backend, + http_listener, + shutdown.clone(), + Arc::new(CancellationHandler::new(&config.connect_to_compute, None)), + endpoint_rate_limiter, + ); + + match futures::future::select(pin!(maintenance_tasks.join_next()), pin!(task)).await { + // exit immediately on maintenance task completion + Either::Left((Some(res), _)) => match crate::error::flatten_err(res)? {}, + // exit with error immediately if all maintenance tasks have ceased (should be caught by branch above) + Either::Left((None, _)) => bail!("no maintenance tasks running. invalid state"), + // exit immediately on client task error + Either::Right((res, _)) => res?, + } + + Ok(()) +} + +/// ProxyConfig is created at proxy startup, and lives forever. +fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { + let config::ConcurrencyLockOptions { + shards, + limiter, + epoch, + timeout, + } = args.connect_compute_lock.parse()?; + info!( + ?limiter, + shards, + ?epoch, + "Using NodeLocks (connect_compute)" + ); + let connect_compute_locks = ApiLocks::new( + "connect_compute_lock", + limiter, + shards, + timeout, + epoch, + &Metrics::get().proxy.connect_compute_lock, + ); + + let http_config = HttpConfig { + accept_websockets: false, + pool_options: GlobalConnPoolOptions { + gc_epoch: Duration::from_secs(60), + pool_shards: 2, + idle_timeout: args.sql_over_http.sql_over_http_idle_timeout, + opt_in: false, + + max_conns_per_endpoint: args.sql_over_http.sql_over_http_pool_max_total_conns, + max_total_conns: args.sql_over_http.sql_over_http_pool_max_total_conns, + }, + cancel_set: CancelSet::new(args.sql_over_http.sql_over_http_cancel_set_shards), + client_conn_threshold: args.sql_over_http.sql_over_http_client_conn_threshold, + max_request_size_bytes: args.sql_over_http.sql_over_http_max_request_size_bytes, + max_response_size_bytes: args.sql_over_http.sql_over_http_max_response_size_bytes, + }; + + let compute_config = ComputeConfig { + retry: RetryConfig::parse(RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES)?, + tls: Arc::new(compute_client_config_with_root_certs()?), + timeout: Duration::from_secs(2), + }; + + Ok(Box::leak(Box::new(ProxyConfig { + tls_config: None, + metric_collection: None, + http_config, + authentication_config: AuthenticationConfig { + jwks_cache: JwkCache::default(), + thread_pool: ThreadPool::new(0), + scram_protocol_timeout: Duration::from_secs(10), + rate_limiter_enabled: false, + rate_limiter: BucketRateLimiter::new(vec![]), + rate_limit_ip_subnet: 64, + ip_allowlist_check_enabled: true, + is_vpc_acccess_proxy: false, + is_auth_broker: false, + accept_jwts: true, + console_redirect_confirmation_timeout: Duration::ZERO, + }, + proxy_protocol_v2: config::ProxyProtocolV2::Rejected, + handshake_timeout: Duration::from_secs(10), + region: "local".into(), + wake_compute_retry_config: RetryConfig::parse(RetryConfig::WAKE_COMPUTE_DEFAULT_VALUES)?, + connect_compute_locks, + connect_to_compute: compute_config, + }))) +} + +/// auth::Backend is created at proxy startup, and lives forever. 
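The reload logic above installs the SIGHUP hook first and only then calls `notify_one()` to arm the initial load, closing the race described in the comment. A small sketch of why a `tokio::sync::Notify` permit sent before the loop starts waiting is not lost:

```rust
use std::sync::Arc;
use tokio::sync::Notify;

#[tokio::main]
async fn main() {
    let refresh = Arc::new(Notify::new());

    // Stands in for the SIGHUP handler registered in maintenance_tasks.
    let on_sighup = Arc::clone(&refresh);

    // Arm the first load *after* the hook exists but *before* the loop runs.
    // Notify stores a permit when nobody is waiting yet, so this is not lost.
    refresh.notify_one();

    let worker = tokio::spawn({
        let refresh = Arc::clone(&refresh);
        async move {
            // Stands in for refresh_config_loop / refresh_config_inner.
            refresh.notified().await;
            println!("initial config load");
        }
    });
    worker.await.unwrap();

    // A later SIGHUP would simply trigger the next iteration.
    on_sighup.notify_one();
}
```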
+fn build_auth_backend(args: &LocalProxyCliArgs) -> &'static auth::Backend<'static, ()> { + let auth_backend = crate::auth::Backend::Local(crate::auth::backend::MaybeOwned::Owned( + LocalBackend::new(args.postgres, args.compute_ctl.clone()), + )); + + Box::leak(Box::new(auth_backend)) +} + +#[derive(Error, Debug)] +enum RefreshConfigError { + #[error(transparent)] + Read(#[from] std::io::Error), + #[error(transparent)] + Parse(#[from] serde_json::Error), + #[error(transparent)] + Validate(anyhow::Error), +} + +async fn refresh_config_loop(path: Utf8PathBuf, rx: Arc) { + let mut init = true; + loop { + rx.notified().await; + + match refresh_config_inner(&path).await { + Ok(()) => {} + // don't log for file not found errors if this is the first time we are checking + // for computes that don't use local_proxy, this is not an error. + Err(RefreshConfigError::Read(e)) + if init && e.kind() == std::io::ErrorKind::NotFound => + { + debug!(error=?e, ?path, "could not read config file"); + } + Err(e) => { + error!(error=?e, ?path, "could not read config file"); + } + } + + init = false; + } +} + +async fn refresh_config_inner(path: &Utf8Path) -> Result<(), RefreshConfigError> { + let bytes = tokio::fs::read(&path).await?; + let data: LocalProxySpec = serde_json::from_slice(&bytes)?; + + let mut jwks_set = vec![]; + + fn parse_jwks_settings(jwks: compute_api::spec::JwksSettings) -> anyhow::Result { + let mut jwks_url = url::Url::from_str(&jwks.jwks_url).context("parsing JWKS url")?; + + ensure!( + jwks_url.has_authority() + && (jwks_url.scheme() == "http" || jwks_url.scheme() == "https"), + "Invalid JWKS url. Must be HTTP", + ); + + ensure!( + jwks_url.host().is_some_and(|h| h != url::Host::Domain("")), + "Invalid JWKS url. No domain listed", + ); + + // clear username, password and ports + jwks_url + .set_username("") + .expect("url can be a base and has a valid host and is not a file. should not error"); + jwks_url + .set_password(None) + .expect("url can be a base and has a valid host and is not a file. should not error"); + // local testing is hard if we need to have a specific restricted port + if cfg!(not(feature = "testing")) { + jwks_url.set_port(None).expect( + "url can be a base and has a valid host and is not a file. should not error", + ); + } + + // clear query params + jwks_url.set_fragment(None); + jwks_url.query_pairs_mut().clear().finish(); + + if jwks_url.scheme() != "https" { + // local testing is hard if we need to set up https support. + if cfg!(not(feature = "testing")) { + jwks_url + .set_scheme("https") + .expect("should not error to set the scheme to https if it was http"); + } else { + warn!(scheme = jwks_url.scheme(), "JWKS url is not HTTPS"); + } + } + + Ok(JwksSettings { + id: jwks.id, + jwks_url, + _provider_name: jwks.provider_name, + jwt_audience: jwks.jwt_audience, + role_names: jwks + .role_names + .into_iter() + .map(RoleName::from) + .map(|s| RoleNameInt::from(&s)) + .collect(), + }) + } + + for jwks in data.jwks.into_iter().flatten() { + jwks_set.push(parse_jwks_settings(jwks).map_err(RefreshConfigError::Validate)?); + } + + info!("successfully loaded new config"); + JWKS_ROLE_MAP.store(Some(Arc::new(EndpointJwksResponse { jwks: jwks_set }))); + + Ok(()) +} diff --git a/proxy/src/binary/mod.rs b/proxy/src/binary/mod.rs new file mode 100644 index 0000000000..dc07d3e675 --- /dev/null +++ b/proxy/src/binary/mod.rs @@ -0,0 +1,7 @@ +//! All binaries have the body of their main() defined here, so that the code +//! 
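`parse_jwks_settings` above normalises whatever JWKS URL the control plane hands over. A stand-alone sketch of the same url-crate calls, simplified by omitting the `cfg(feature = "testing")` escape hatches and asserting properties rather than an exact string:

```rust
use anyhow::{ensure, Context};
use url::Url;

// Sketch: require an http(s) URL with a host, strip credentials, port,
// query and fragment, and force the scheme to https.
fn sanitize_jwks_url(raw: &str) -> anyhow::Result<Url> {
    let mut url = Url::parse(raw).context("parsing JWKS url")?;

    ensure!(
        url.has_authority() && (url.scheme() == "http" || url.scheme() == "https"),
        "Invalid JWKS url. Must be HTTP",
    );
    ensure!(
        url.host().is_some_and(|h| h != url::Host::Domain("")),
        "Invalid JWKS url. No domain listed",
    );

    url.set_username("").expect("http(s) URLs accept a username");
    url.set_password(None).expect("http(s) URLs accept a password");
    url.set_port(None).expect("http(s) URLs accept a port");
    url.set_fragment(None);
    url.query_pairs_mut().clear().finish();
    url.set_scheme("https").expect("http to https is a valid scheme change");

    Ok(url)
}

fn main() -> anyhow::Result<()> {
    let url = sanitize_jwks_url("http://user:pw@auth.example.com:8443/jwks?x=1#frag")?;
    assert_eq!(url.scheme(), "https");
    assert_eq!(url.host_str(), Some("auth.example.com"));
    assert_eq!(url.port(), None);
    assert_eq!(url.username(), "");
    assert!(url.query().map_or(true, |q| q.is_empty()));
    Ok(())
}
```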
is also covered by code style configs in lib.rs and the unused-code check is +//! more effective when practically all modules are private to the lib. + +pub mod local_proxy; +pub mod pg_sni_router; +pub mod proxy; diff --git a/proxy/src/binary/pg_sni_router.rs b/proxy/src/binary/pg_sni_router.rs new file mode 100644 index 0000000000..235e9674c6 --- /dev/null +++ b/proxy/src/binary/pg_sni_router.rs @@ -0,0 +1,304 @@ +/// A stand-alone program that routes connections, e.g. from +/// `aaa--bbb--1234.external.domain` to `aaa.bbb.internal.domain:1234`. +/// +/// This allows connecting to pods/services running in the same Kubernetes cluster from +/// the outside. Similar to an ingress controller for HTTPS. +use std::{net::SocketAddr, sync::Arc}; + +use crate::context::RequestContext; +use crate::metrics::{Metrics, ThreadPoolMetrics}; +use crate::protocol2::ConnectionInfo; +use crate::proxy::{copy_bidirectional_client_compute, run_until_cancelled, ErrorSource}; +use crate::stream::{PqStream, Stream}; +use crate::tls::TlsServerEndPoint; +use anyhow::{anyhow, bail, ensure, Context}; +use clap::Arg; +use futures::future::Either; +use futures::TryFutureExt; +use itertools::Itertools; +use rustls::crypto::ring; +use rustls::pki_types::PrivateKeyDer; +use tokio::io::{AsyncRead, AsyncWrite}; +use tokio::net::TcpListener; +use tokio_util::sync::CancellationToken; +use tracing::{error, info, Instrument}; +use utils::project_git_version; +use utils::sentry_init::init_sentry; + +project_git_version!(GIT_VERSION); + +fn cli() -> clap::Command { + clap::Command::new("Neon proxy/router") + .version(GIT_VERSION) + .arg( + Arg::new("listen") + .short('l') + .long("listen") + .help("listen for incoming client connections on ip:port") + .default_value("127.0.0.1:4432"), + ) + .arg( + Arg::new("tls-key") + .short('k') + .long("tls-key") + .help("path to TLS key for client postgres connections") + .required(true), + ) + .arg( + Arg::new("tls-cert") + .short('c') + .long("tls-cert") + .help("path to TLS cert for client postgres connections") + .required(true), + ) + .arg( + Arg::new("dest") + .short('d') + .long("destination") + .help("append this domain zone to the SNI hostname to get the destination address") + .required(true), + ) +} + +pub async fn run() -> anyhow::Result<()> { + let _logging_guard = crate::logging::init().await?; + let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook(); + let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); + + Metrics::install(Arc::new(ThreadPoolMetrics::new(0))); + + let args = cli().get_matches(); + let destination: String = args + .get_one::("dest") + .expect("string argument defined") + .parse()?; + + // Configure TLS + let (tls_config, tls_server_end_point): (Arc, TlsServerEndPoint) = match ( + args.get_one::("tls-key"), + args.get_one::("tls-cert"), + ) { + (Some(key_path), Some(cert_path)) => { + let key = { + let key_bytes = std::fs::read(key_path).context("TLS key file")?; + + let mut keys = + rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..]).collect_vec(); + + ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len()); + PrivateKeyDer::Pkcs8( + keys.pop() + .expect("keys should not be empty") + .context(format!("Failed to read TLS keys at '{key_path}'"))?, + ) + }; + + let cert_chain_bytes = std::fs::read(cert_path) + .context(format!("Failed to read TLS cert file at '{cert_path}.'"))?; + + let cert_chain: Vec<_> = { + rustls_pemfile::certs(&mut &cert_chain_bytes[..]) + .try_collect() + .with_context(|| { + 
format!("Failed to read TLS certificate chain from bytes from file at '{cert_path}'.") + })? + }; + + // needed for channel bindings + let first_cert = cert_chain.first().context("missing certificate")?; + let tls_server_end_point = TlsServerEndPoint::new(first_cert)?; + + let tls_config = + rustls::ServerConfig::builder_with_provider(Arc::new(ring::default_provider())) + .with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12]) + .context("ring should support TLS1.2 and TLS1.3")? + .with_no_client_auth() + .with_single_cert(cert_chain, key)? + .into(); + + (tls_config, tls_server_end_point) + } + _ => bail!("tls-key and tls-cert must be specified"), + }; + + // Start listening for incoming client connections + let proxy_address: SocketAddr = args + .get_one::("listen") + .expect("string argument defined") + .parse()?; + info!("Starting sni router on {proxy_address}"); + let proxy_listener = TcpListener::bind(proxy_address).await?; + + let cancellation_token = CancellationToken::new(); + + let main = tokio::spawn(task_main( + Arc::new(destination), + tls_config, + tls_server_end_point, + proxy_listener, + cancellation_token.clone(), + )); + let signals_task = tokio::spawn(crate::signals::handle(cancellation_token, || {})); + + // the signal task cant ever succeed. + // the main task can error, or can succeed on cancellation. + // we want to immediately exit on either of these cases + let signal = match futures::future::select(signals_task, main).await { + Either::Left((res, _)) => crate::error::flatten_err(res)?, + Either::Right((res, _)) => return crate::error::flatten_err(res), + }; + + // maintenance tasks return `Infallible` success values, this is an impossible value + // so this match statically ensures that there are no possibilities for that value + match signal {} +} + +async fn task_main( + dest_suffix: Arc, + tls_config: Arc, + tls_server_end_point: TlsServerEndPoint, + listener: tokio::net::TcpListener, + cancellation_token: CancellationToken, +) -> anyhow::Result<()> { + // When set for the server socket, the keepalive setting + // will be inherited by all accepted client sockets. + socket2::SockRef::from(&listener).set_keepalive(true)?; + + let connections = tokio_util::task::task_tracker::TaskTracker::new(); + + while let Some(accept_result) = + run_until_cancelled(listener.accept(), &cancellation_token).await + { + let (socket, peer_addr) = accept_result?; + + let session_id = uuid::Uuid::new_v4(); + let tls_config = Arc::clone(&tls_config); + let dest_suffix = Arc::clone(&dest_suffix); + + connections.spawn( + async move { + socket + .set_nodelay(true) + .context("failed to set socket option")?; + + info!(%peer_addr, "serving"); + let ctx = RequestContext::new( + session_id, + ConnectionInfo { + addr: peer_addr, + extra: None, + }, + crate::metrics::Protocol::SniRouter, + "sni", + ); + handle_client(ctx, dest_suffix, tls_config, tls_server_end_point, socket).await + } + .unwrap_or_else(|e| { + // Acknowledge that the task has finished with an error. 
+ error!("per-client task finished with an error: {e:#}"); + }) + .instrument(tracing::info_span!("handle_client", ?session_id)), + ); + } + + connections.close(); + drop(listener); + + connections.wait().await; + + info!("all client connections have finished"); + Ok(()) +} + +const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)"; + +async fn ssl_handshake( + ctx: &RequestContext, + raw_stream: S, + tls_config: Arc, + tls_server_end_point: TlsServerEndPoint, +) -> anyhow::Result> { + let mut stream = PqStream::new(Stream::from_raw(raw_stream)); + + let msg = stream.read_startup_packet().await?; + use pq_proto::FeStartupPacket::SslRequest; + + match msg { + SslRequest { direct: false } => { + stream + .write_message(&pq_proto::BeMessage::EncryptionResponse(true)) + .await?; + + // Upgrade raw stream into a secure TLS-backed stream. + // NOTE: We've consumed `tls`; this fact will be used later. + + let (raw, read_buf) = stream.into_inner(); + // TODO: Normally, client doesn't send any data before + // server says TLS handshake is ok and read_buf is empty. + // However, you could imagine pipelining of postgres + // SSLRequest + TLS ClientHello in one hunk similar to + // pipelining in our node js driver. We should probably + // support that by chaining read_buf with the stream. + if !read_buf.is_empty() { + bail!("data is sent before server replied with EncryptionResponse"); + } + + Ok(Stream::Tls { + tls: Box::new( + raw.upgrade(tls_config, !ctx.has_private_peer_addr()) + .await?, + ), + tls_server_end_point, + }) + } + unexpected => { + info!( + ?unexpected, + "unexpected startup packet, rejecting connection" + ); + stream + .throw_error_str(ERR_INSECURE_CONNECTION, crate::error::ErrorKind::User) + .await? + } + } +} + +async fn handle_client( + ctx: RequestContext, + dest_suffix: Arc, + tls_config: Arc, + tls_server_end_point: TlsServerEndPoint, + stream: impl AsyncRead + AsyncWrite + Unpin, +) -> anyhow::Result<()> { + let mut tls_stream = ssl_handshake(&ctx, stream, tls_config, tls_server_end_point).await?; + + // Cut off first part of the SNI domain + // We receive required destination details in the format of + // `{k8s_service_name}--{k8s_namespace}--{port}.non-sni-domain` + let sni = tls_stream.sni_hostname().ok_or(anyhow!("SNI missing"))?; + let dest: Vec<&str> = sni + .split_once('.') + .context("invalid SNI")? + .0 + .splitn(3, "--") + .collect(); + let port = dest[2].parse::().context("invalid port")?; + let destination = format!("{}.{}.{}:{}", dest[0], dest[1], dest_suffix, port); + + info!("destination: {}", destination); + + let mut client = tokio::net::TcpStream::connect(destination).await?; + + // doesn't yet matter as pg-sni-router doesn't report analytics logs + ctx.set_success(); + ctx.log_connect(); + + // Starting from here we only proxy the client's traffic. 
+ info!("performing the proxy pass..."); + + match copy_bidirectional_client_compute(&mut tls_stream, &mut client).await { + Ok(_) => Ok(()), + Err(ErrorSource::Client(err)) => Err(err).context("client"), + Err(ErrorSource::Compute(err)) => Err(err).context("compute"), + } +} diff --git a/proxy/src/binary/proxy.rs b/proxy/src/binary/proxy.rs new file mode 100644 index 0000000000..e38c49ca10 --- /dev/null +++ b/proxy/src/binary/proxy.rs @@ -0,0 +1,827 @@ +use std::net::SocketAddr; +use std::pin::pin; +use std::sync::Arc; +use std::time::Duration; + +use crate::auth::backend::jwt::JwkCache; +use crate::auth::backend::{AuthRateLimiter, ConsoleRedirectBackend, MaybeOwned}; +use crate::cancellation::{handle_cancel_messages, CancellationHandler}; +use crate::config::{ + self, remote_storage_from_toml, AuthenticationConfig, CacheOptions, ComputeConfig, HttpConfig, + ProjectInfoCacheOptions, ProxyConfig, ProxyProtocolV2, +}; +use crate::context::parquet::ParquetUploadArgs; +use crate::http::health_server::AppMetrics; +use crate::metrics::Metrics; +use crate::rate_limiter::{ + EndpointRateLimiter, LeakyBucketConfig, RateBucketInfo, WakeComputeRateLimiter, +}; +use crate::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider; +use crate::redis::kv_ops::RedisKVClient; +use crate::redis::{elasticache, notifications}; +use crate::scram::threadpool::ThreadPool; +use crate::serverless::cancel_set::CancelSet; +use crate::serverless::GlobalConnPoolOptions; +use crate::tls::client_config::compute_client_config_with_root_certs; +use crate::{auth, control_plane, http, serverless, usage_metrics}; +use anyhow::bail; +use futures::future::Either; +use remote_storage::RemoteStorageConfig; +use tokio::net::TcpListener; +use tokio::task::JoinSet; +use tokio_util::sync::CancellationToken; +use tracing::{info, warn, Instrument}; +use utils::sentry_init::init_sentry; +use utils::{project_build_tag, project_git_version}; + +project_git_version!(GIT_VERSION); +project_build_tag!(BUILD_TAG); + +use clap::{Parser, ValueEnum}; + +#[derive(Clone, Debug, ValueEnum)] +enum AuthBackendType { + #[value(name("cplane-v1"), alias("control-plane"))] + ControlPlaneV1, + + #[value(name("link"), alias("control-redirect"))] + ConsoleRedirect, + + #[cfg(any(test, feature = "testing"))] + Postgres, +} + +/// Neon proxy/router +#[derive(Parser)] +#[command(version = GIT_VERSION, about)] +struct ProxyCliArgs { + /// Name of the region this proxy is deployed in + #[clap(long, default_value_t = String::new())] + region: String, + /// listen for incoming client connections on ip:port + #[clap(short, long, default_value = "127.0.0.1:4432")] + proxy: String, + #[clap(value_enum, long, default_value_t = AuthBackendType::ConsoleRedirect)] + auth_backend: AuthBackendType, + /// listen for management callback connection on ip:port + #[clap(short, long, default_value = "127.0.0.1:7000")] + mgmt: String, + /// listen for incoming http connections (metrics, etc) on ip:port + #[clap(long, default_value = "127.0.0.1:7001")] + http: String, + /// listen for incoming wss connections on ip:port + #[clap(long)] + wss: Option, + /// redirect unauthenticated users to the given uri in case of console redirect auth + #[clap(short, long, default_value = "http://localhost:3000/psql_session/")] + uri: String, + /// cloud API endpoint for authenticating users + #[clap( + short, + long, + default_value = "http://localhost:3000/authenticate_proxy_request/" + )] + auth_endpoint: String, + /// JWT used to connect to control plane. 
+ #[clap( + long, + value_name = "JWT", + default_value = "", + env = "NEON_PROXY_TO_CONTROLPLANE_TOKEN" + )] + control_plane_token: Arc, + /// if this is not local proxy, this toggles whether we accept jwt or passwords for http + #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] + is_auth_broker: bool, + /// path to TLS key for client postgres connections + /// + /// tls-key and tls-cert are for backwards compatibility, we can put all certs in one dir + #[clap(short = 'k', long, alias = "ssl-key")] + tls_key: Option, + /// path to TLS cert for client postgres connections + /// + /// tls-key and tls-cert are for backwards compatibility, we can put all certs in one dir + #[clap(short = 'c', long, alias = "ssl-cert")] + tls_cert: Option, + /// Allow writing TLS session keys to the given file pointed to by the environment variable `SSLKEYLOGFILE`. + #[clap(long, alias = "allow-ssl-keylogfile")] + allow_tls_keylogfile: bool, + /// path to directory with TLS certificates for client postgres connections + #[clap(long)] + certs_dir: Option, + /// timeout for the TLS handshake + #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] + handshake_timeout: tokio::time::Duration, + /// http endpoint to receive periodic metric updates + #[clap(long)] + metric_collection_endpoint: Option, + /// how often metrics should be sent to a collection endpoint + #[clap(long)] + metric_collection_interval: Option, + /// cache for `wake_compute` api method (use `size=0` to disable) + #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] + wake_compute_cache: String, + /// lock for `wake_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable). + #[clap(long, default_value = config::ConcurrencyLockOptions::DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK)] + wake_compute_lock: String, + /// lock for `connect_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable). + #[clap(long, default_value = config::ConcurrencyLockOptions::DEFAULT_OPTIONS_CONNECT_COMPUTE_LOCK)] + connect_compute_lock: String, + #[clap(flatten)] + sql_over_http: SqlOverHttpArgs, + /// timeout for scram authentication protocol + #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] + scram_protocol_timeout: tokio::time::Duration, + /// size of the threadpool for password hashing + #[clap(long, default_value_t = 4)] + scram_thread_pool_size: u8, + /// Endpoint rate limiter max number of requests per second. + /// + /// Provided in the form `@`. + /// Can be given multiple times for different bucket sizes. + #[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)] + endpoint_rps_limit: Vec, + /// Wake compute rate limiter max number of requests per second. + #[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)] + wake_compute_limit: Vec, + /// Whether the auth rate limiter actually takes effect (for testing) + #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] + auth_rate_limit_enabled: bool, + /// Authentication rate limiter max number of hashes per second. + #[clap(long, default_values_t = RateBucketInfo::DEFAULT_AUTH_SET)] + auth_rate_limit: Vec, + /// The IP subnet to use when considering whether two IP addresses are considered the same. 
+ #[clap(long, default_value_t = 64)] + auth_rate_limit_ip_subnet: u8, + /// Redis rate limiter max number of requests per second. + #[clap(long, default_values_t = RateBucketInfo::DEFAULT_REDIS_SET)] + redis_rps_limit: Vec, + /// Cancellation channel size (max queue size for redis kv client) + #[clap(long, default_value = "1024")] + cancellation_ch_size: usize, + /// cache for `allowed_ips` (use `size=0` to disable) + #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] + allowed_ips_cache: String, + /// cache for `role_secret` (use `size=0` to disable) + #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] + role_secret_cache: String, + /// redis url for notifications (if empty, redis_host:port will be used for both notifications and streaming connections) + #[clap(long)] + redis_notifications: Option, + /// what from the available authentications type to use for the regional redis we have. Supported are "irsa" and "plain". + #[clap(long, default_value = "irsa")] + redis_auth_type: String, + /// redis host for streaming connections (might be different from the notifications host) + #[clap(long)] + redis_host: Option, + /// redis port for streaming connections (might be different from the notifications host) + #[clap(long)] + redis_port: Option, + /// redis cluster name, used in aws elasticache + #[clap(long)] + redis_cluster_name: Option, + /// redis user_id, used in aws elasticache + #[clap(long)] + redis_user_id: Option, + /// aws region to retrieve credentials + #[clap(long, default_value_t = String::new())] + aws_region: String, + /// cache for `project_info` (use `size=0` to disable) + #[clap(long, default_value = config::ProjectInfoCacheOptions::CACHE_DEFAULT_OPTIONS)] + project_info_cache: String, + /// cache for all valid endpoints + #[clap(long, default_value = config::EndpointCacheConfig::CACHE_DEFAULT_OPTIONS)] + endpoint_cache_config: String, + #[clap(flatten)] + parquet_upload: ParquetUploadArgs, + + /// interval for backup metric collection + #[clap(long, default_value = "10m", value_parser = humantime::parse_duration)] + metric_backup_collection_interval: std::time::Duration, + /// remote storage configuration for backup metric collection + /// Encoded as toml (same format as pageservers), eg + /// `{bucket_name='the-bucket',bucket_region='us-east-1',prefix_in_bucket='proxy',endpoint='http://minio:9000'}` + #[clap(long, value_parser = remote_storage_from_toml)] + metric_backup_collection_remote_storage: Option, + /// chunk size for backup metric collection + /// Size of each event is no more than 400 bytes, so 2**22 is about 200MB before the compression. + #[clap(long, default_value = "4194304")] + metric_backup_collection_chunk_size: usize, + /// Whether to retry the connection to the compute node + #[clap(long, default_value = config::RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES)] + connect_to_compute_retry: String, + /// Whether to retry the wake_compute request + #[clap(long, default_value = config::RetryConfig::WAKE_COMPUTE_DEFAULT_VALUES)] + wake_compute_retry: String, + + /// Configure if this is a private access proxy for the POC: In that case the proxy will ignore the IP allowlist + #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] + is_private_access_proxy: bool, + + /// Configure whether all incoming requests have a Proxy Protocol V2 packet. 
+ // TODO(conradludgate): switch default to rejected or required once we've updated all deployments + #[clap(value_enum, long, default_value_t = ProxyProtocolV2::Supported)] + proxy_protocol_v2: ProxyProtocolV2, + + /// Time the proxy waits for the webauth session to be confirmed by the control plane. + // TODO: rename to `console_redirect_confirmation_timeout`. + #[clap(long, default_value = "2m", value_parser = humantime::parse_duration)] + webauth_confirmation_timeout: std::time::Duration, +} + +#[derive(clap::Args, Clone, Copy, Debug)] +struct SqlOverHttpArgs { + /// timeout for http connection requests + #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] + sql_over_http_timeout: tokio::time::Duration, + + /// Whether the SQL over http pool is opt-in + #[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] + sql_over_http_pool_opt_in: bool, + + /// How many connections to pool for each endpoint. Excess connections are discarded + #[clap(long, default_value_t = 20)] + sql_over_http_pool_max_conns_per_endpoint: usize, + + /// How many connections to pool for each endpoint. Excess connections are discarded + #[clap(long, default_value_t = 20000)] + sql_over_http_pool_max_total_conns: usize, + + /// How long pooled connections should remain idle for before closing + #[clap(long, default_value = "5m", value_parser = humantime::parse_duration)] + sql_over_http_idle_timeout: tokio::time::Duration, + + /// Duration each shard will wait on average before a GC sweep. + /// A longer time will causes sweeps to take longer but will interfere less frequently. + #[clap(long, default_value = "10m", value_parser = humantime::parse_duration)] + sql_over_http_pool_gc_epoch: tokio::time::Duration, + + /// How many shards should the global pool have. Must be a power of two. 
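Several boolean flags in the structs above (`is_auth_broker`, `auth_rate_limit_enabled`, `is_private_access_proxy`, `sql_over_http_pool_opt_in`) share the same clap pattern: `BoolishValueParser` plus `ArgAction::Set`, so the flag always takes an explicit value instead of acting as a presence switch. A minimal sketch of how that parses:

```rust
use clap::Parser;

// Sketch of the boolean-flag pattern used above: the flag takes a value
// ("true"/"false", "yes"/"no", "1"/"0", "on"/"off") and defaults to false
// when omitted.
#[derive(Parser, Debug)]
struct Args {
    #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
    is_auth_broker: bool,
}

fn main() {
    let args = Args::parse_from(["demo", "--is-auth-broker", "yes"]);
    assert!(args.is_auth_broker);

    // Omitting the flag falls back to the declared default.
    let args = Args::parse_from(["demo"]);
    assert!(!args.is_auth_broker);
}
```

Requiring an explicit value keeps deployments unambiguous: the flag can be pinned to `false` in one environment and `true` in another without relying on presence/absence semantics.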
+ /// More shards will introduce less contention for pool operations, but can + /// increase memory used by the pool + #[clap(long, default_value_t = 128)] + sql_over_http_pool_shards: usize, + + #[clap(long, default_value_t = 10000)] + sql_over_http_client_conn_threshold: u64, + + #[clap(long, default_value_t = 64)] + sql_over_http_cancel_set_shards: usize, + + #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB + sql_over_http_max_request_size_bytes: usize, + + #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB + sql_over_http_max_response_size_bytes: usize, +} + +pub async fn run() -> anyhow::Result<()> { + let _logging_guard = crate::logging::init().await?; + let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook(); + let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); + + // TODO: refactor these to use labels + info!("Version: {GIT_VERSION}"); + info!("Build_tag: {BUILD_TAG}"); + let neon_metrics = ::metrics::NeonMetrics::new(::metrics::BuildInfo { + revision: GIT_VERSION, + build_tag: BUILD_TAG, + }); + + let jemalloc = match crate::jemalloc::MetricRecorder::new() { + Ok(t) => Some(t), + Err(e) => { + tracing::error!(error = ?e, "could not start jemalloc metrics loop"); + None + } + }; + + let args = ProxyCliArgs::parse(); + let config = build_config(&args)?; + let auth_backend = build_auth_backend(&args)?; + + match auth_backend { + Either::Left(auth_backend) => info!("Authentication backend: {auth_backend}"), + Either::Right(auth_backend) => info!("Authentication backend: {auth_backend:?}"), + }; + info!("Using region: {}", args.aws_region); + + // TODO: untangle the config args + let regional_redis_client = match (args.redis_auth_type.as_str(), &args.redis_notifications) { + ("plain", redis_url) => match redis_url { + None => { + bail!("plain auth requires redis_notifications to be set"); + } + Some(url) => Some( + ConnectionWithCredentialsProvider::new_with_static_credentials(url.to_string()), + ), + }, + ("irsa", _) => match (&args.redis_host, args.redis_port) { + (Some(host), Some(port)) => Some( + ConnectionWithCredentialsProvider::new_with_credentials_provider( + host.to_string(), + port, + elasticache::CredentialsProvider::new( + args.aws_region, + args.redis_cluster_name, + args.redis_user_id, + ) + .await, + ), + ), + (None, None) => { + warn!("irsa auth requires redis-host and redis-port to be set, continuing without regional_redis_client"); + None + } + _ => { + bail!("redis-host and redis-port must be specified together"); + } + }, + _ => { + bail!("unknown auth type given"); + } + }; + + let redis_notifications_client = if let Some(url) = args.redis_notifications { + Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url)) + } else { + regional_redis_client.clone() + }; + + // Check that we can bind to address before further initialization + let http_address: SocketAddr = args.http.parse()?; + info!("Starting http on {http_address}"); + let http_listener = TcpListener::bind(http_address).await?.into_std()?; + + let mgmt_address: SocketAddr = args.mgmt.parse()?; + info!("Starting mgmt on {mgmt_address}"); + let mgmt_listener = TcpListener::bind(mgmt_address).await?; + + let proxy_listener = if args.is_auth_broker { + None + } else { + let proxy_address: SocketAddr = args.proxy.parse()?; + info!("Starting proxy on {proxy_address}"); + + Some(TcpListener::bind(proxy_address).await?) + }; + + // TODO: rename the argument to something like serverless. 
+ // It now covers more than just websockets, it also covers SQL over HTTP. + let serverless_listener = if let Some(serverless_address) = args.wss { + let serverless_address: SocketAddr = serverless_address.parse()?; + info!("Starting wss on {serverless_address}"); + Some(TcpListener::bind(serverless_address).await?) + } else if args.is_auth_broker { + bail!("wss arg must be present for auth-broker") + } else { + None + }; + + let cancellation_token = CancellationToken::new(); + + let redis_rps_limit = Vec::leak(args.redis_rps_limit.clone()); + RateBucketInfo::validate(redis_rps_limit)?; + + let redis_kv_client = regional_redis_client + .as_ref() + .map(|redis_publisher| RedisKVClient::new(redis_publisher.clone(), redis_rps_limit)); + + // channel size should be higher than redis client limit to avoid blocking + let cancel_ch_size = args.cancellation_ch_size; + let (tx_cancel, rx_cancel) = tokio::sync::mpsc::channel(cancel_ch_size); + let cancellation_handler = Arc::new(CancellationHandler::new( + &config.connect_to_compute, + Some(tx_cancel), + )); + + // bit of a hack - find the min rps and max rps supported and turn it into + // leaky bucket config instead + let max = args + .endpoint_rps_limit + .iter() + .map(|x| x.rps()) + .max_by(f64::total_cmp) + .unwrap_or(EndpointRateLimiter::DEFAULT.max); + let rps = args + .endpoint_rps_limit + .iter() + .map(|x| x.rps()) + .min_by(f64::total_cmp) + .unwrap_or(EndpointRateLimiter::DEFAULT.rps); + let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards( + LeakyBucketConfig { rps, max }, + 64, + )); + + // client facing tasks. these will exit on error or on cancellation + // cancellation returns Ok(()) + let mut client_tasks = JoinSet::new(); + match auth_backend { + Either::Left(auth_backend) => { + if let Some(proxy_listener) = proxy_listener { + client_tasks.spawn(crate::proxy::task_main( + config, + auth_backend, + proxy_listener, + cancellation_token.clone(), + cancellation_handler.clone(), + endpoint_rate_limiter.clone(), + )); + } + + if let Some(serverless_listener) = serverless_listener { + client_tasks.spawn(serverless::task_main( + config, + auth_backend, + serverless_listener, + cancellation_token.clone(), + cancellation_handler.clone(), + endpoint_rate_limiter.clone(), + )); + } + } + Either::Right(auth_backend) => { + if let Some(proxy_listener) = proxy_listener { + client_tasks.spawn(crate::console_redirect_proxy::task_main( + config, + auth_backend, + proxy_listener, + cancellation_token.clone(), + cancellation_handler.clone(), + )); + } + } + } + + client_tasks.spawn(crate::context::parquet::worker( + cancellation_token.clone(), + args.parquet_upload, + )); + + // maintenance tasks. these never return unless there's an error + let mut maintenance_tasks = JoinSet::new(); + maintenance_tasks.spawn(crate::signals::handle(cancellation_token.clone(), || {})); + maintenance_tasks.spawn(http::health_server::task_main( + http_listener, + AppMetrics { + jemalloc, + neon_metrics, + proxy: crate::metrics::Metrics::get(), + }, + )); + maintenance_tasks.spawn(control_plane::mgmt::task_main(mgmt_listener)); + + if let Some(metrics_config) = &config.metric_collection { + // TODO: Add gc regardles of the metric collection being enabled. 
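Earlier in this hunk, the smallest and largest `rps()` values from the configured `--endpoint-rps-limit` buckets are folded into a single `LeakyBucketConfig { rps, max }`. As a rough illustration of what those two numbers control, here is a hypothetical, self-contained leaky-bucket admission check (not the proxy's actual `EndpointRateLimiter`): `rps` is the steady drain rate and `max` is the burst capacity.

```rust
use std::time::Instant;

/// Hypothetical leaky-bucket meter.
struct LeakyBucket {
    rps: f64,
    max: f64,
    level: f64,
    last: Instant,
}

impl LeakyBucket {
    fn new(rps: f64, max: f64) -> Self {
        Self { rps, max, level: 0.0, last: Instant::now() }
    }

    /// Returns true if a request of the given cost is admitted.
    fn check(&mut self, cost: f64) -> bool {
        let now = Instant::now();
        // Drain the bucket at `rps` units per second since the last check.
        let drained = self.rps * now.duration_since(self.last).as_secs_f64();
        self.level = (self.level - drained).max(0.0);
        self.last = now;
        if self.level + cost > self.max {
            return false; // burst capacity exceeded: reject
        }
        self.level += cost;
        true
    }
}

fn main() {
    // Admit bursts up to `max`; sustain roughly `rps` requests per second.
    let mut bucket = LeakyBucket::new(100.0, 200.0);
    assert!(bucket.check(1.0));
}
```

Requests are admitted while the bucket has room; sustained load above `rps` eventually fills it to `max`, after which further requests are rejected until it drains.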
+ maintenance_tasks.spawn(usage_metrics::task_main(metrics_config)); + } + + #[cfg_attr(not(any(test, feature = "testing")), expect(irrefutable_let_patterns))] + if let Either::Left(auth::Backend::ControlPlane(api, ())) = &auth_backend { + if let crate::control_plane::client::ControlPlaneClient::ProxyV1(api) = &**api { + match (redis_notifications_client, regional_redis_client.clone()) { + (None, None) => {} + (client1, client2) => { + let cache = api.caches.project_info.clone(); + if let Some(client) = client1 { + maintenance_tasks.spawn(notifications::task_main( + client, + cache.clone(), + args.region.clone(), + )); + } + if let Some(client) = client2 { + maintenance_tasks.spawn(notifications::task_main( + client, + cache.clone(), + args.region.clone(), + )); + } + maintenance_tasks.spawn(async move { cache.clone().gc_worker().await }); + } + } + + if let Some(mut redis_kv_client) = redis_kv_client { + maintenance_tasks.spawn(async move { + redis_kv_client.try_connect().await?; + handle_cancel_messages(&mut redis_kv_client, rx_cancel).await + }); + } + + if let Some(regional_redis_client) = regional_redis_client { + let cache = api.caches.endpoints_cache.clone(); + let con = regional_redis_client; + let span = tracing::info_span!("endpoints_cache"); + maintenance_tasks.spawn( + async move { cache.do_read(con, cancellation_token.clone()).await } + .instrument(span), + ); + } + } + } + + let maintenance = loop { + // get one complete task + match futures::future::select( + pin!(maintenance_tasks.join_next()), + pin!(client_tasks.join_next()), + ) + .await + { + // exit immediately on maintenance task completion + Either::Left((Some(res), _)) => break crate::error::flatten_err(res)?, + // exit with error immediately if all maintenance tasks have ceased (should be caught by branch above) + Either::Left((None, _)) => bail!("no maintenance tasks running. invalid state"), + // exit immediately on client task error + Either::Right((Some(res), _)) => crate::error::flatten_err(res)?, + // exit if all our client tasks have shutdown gracefully + Either::Right((None, _)) => return Ok(()), + } + }; + + // maintenance tasks return Infallible success values, this is an impossible value + // so this match statically ensures that there are no possibilities for that value + match maintenance {} +} + +/// ProxyConfig is created at proxy startup, and lives forever. 
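The doc comment above ("created at proxy startup, and lives forever") is realized with `Box::leak` inside `build_config` below: the config is built once and intentionally leaked, so every task can hold a plain `&'static` reference without reference counting. A tiny sketch of the idiom with a hypothetical `Config` type:

```rust
/// Hypothetical stand-in for the long-lived proxy configuration.
struct Config {
    region: String,
}

fn build_static_config(region: &str) -> &'static Config {
    let config = Config { region: region.to_string() };
    // Deliberate leak: the allocation is never freed, which is acceptable for
    // a value that must outlive the whole process anyway.
    Box::leak(Box::new(config))
}

fn main() {
    let config: &'static Config = build_static_config("us-east-1");
    println!("region = {}", config.region);
}
```

The leak is harmless here because the process exits before the memory would ever need to be reclaimed.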
+fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { + let thread_pool = ThreadPool::new(args.scram_thread_pool_size); + Metrics::install(thread_pool.metrics.clone()); + + let tls_config = match (&args.tls_key, &args.tls_cert) { + (Some(key_path), Some(cert_path)) => Some(config::configure_tls( + key_path, + cert_path, + args.certs_dir.as_ref(), + args.allow_tls_keylogfile, + )?), + (None, None) => None, + _ => bail!("either both or neither tls-key and tls-cert must be specified"), + }; + + let backup_metric_collection_config = config::MetricBackupCollectionConfig { + remote_storage_config: args.metric_backup_collection_remote_storage.clone(), + chunk_size: args.metric_backup_collection_chunk_size, + }; + + let metric_collection = match ( + &args.metric_collection_endpoint, + &args.metric_collection_interval, + ) { + (Some(endpoint), Some(interval)) => Some(config::MetricCollectionConfig { + endpoint: endpoint.parse()?, + interval: humantime::parse_duration(interval)?, + backup_metric_collection_config, + }), + (None, None) => None, + _ => bail!( + "either both or neither metric-collection-endpoint \ + and metric-collection-interval must be specified" + ), + }; + + let config::ConcurrencyLockOptions { + shards, + limiter, + epoch, + timeout, + } = args.connect_compute_lock.parse()?; + info!( + ?limiter, + shards, + ?epoch, + "Using NodeLocks (connect_compute)" + ); + let connect_compute_locks = control_plane::locks::ApiLocks::new( + "connect_compute_lock", + limiter, + shards, + timeout, + epoch, + &Metrics::get().proxy.connect_compute_lock, + ); + + let http_config = HttpConfig { + accept_websockets: !args.is_auth_broker, + pool_options: GlobalConnPoolOptions { + max_conns_per_endpoint: args.sql_over_http.sql_over_http_pool_max_conns_per_endpoint, + gc_epoch: args.sql_over_http.sql_over_http_pool_gc_epoch, + pool_shards: args.sql_over_http.sql_over_http_pool_shards, + idle_timeout: args.sql_over_http.sql_over_http_idle_timeout, + opt_in: args.sql_over_http.sql_over_http_pool_opt_in, + max_total_conns: args.sql_over_http.sql_over_http_pool_max_total_conns, + }, + cancel_set: CancelSet::new(args.sql_over_http.sql_over_http_cancel_set_shards), + client_conn_threshold: args.sql_over_http.sql_over_http_client_conn_threshold, + max_request_size_bytes: args.sql_over_http.sql_over_http_max_request_size_bytes, + max_response_size_bytes: args.sql_over_http.sql_over_http_max_response_size_bytes, + }; + let authentication_config = AuthenticationConfig { + jwks_cache: JwkCache::default(), + thread_pool, + scram_protocol_timeout: args.scram_protocol_timeout, + rate_limiter_enabled: args.auth_rate_limit_enabled, + rate_limiter: AuthRateLimiter::new(args.auth_rate_limit.clone()), + rate_limit_ip_subnet: args.auth_rate_limit_ip_subnet, + ip_allowlist_check_enabled: !args.is_private_access_proxy, + is_vpc_acccess_proxy: args.is_private_access_proxy, + is_auth_broker: args.is_auth_broker, + accept_jwts: args.is_auth_broker, + console_redirect_confirmation_timeout: args.webauth_confirmation_timeout, + }; + + let compute_config = ComputeConfig { + retry: config::RetryConfig::parse(&args.connect_to_compute_retry)?, + tls: Arc::new(compute_client_config_with_root_certs()?), + timeout: Duration::from_secs(2), + }; + + let config = ProxyConfig { + tls_config, + metric_collection, + http_config, + authentication_config, + proxy_protocol_v2: args.proxy_protocol_v2, + handshake_timeout: args.handshake_timeout, + region: args.region.clone(), + wake_compute_retry_config: 
config::RetryConfig::parse(&args.wake_compute_retry)?, + connect_compute_locks, + connect_to_compute: compute_config, + }; + + let config = Box::leak(Box::new(config)); + + tokio::spawn(config.connect_compute_locks.garbage_collect_worker()); + + Ok(config) +} + +/// auth::Backend is created at proxy startup, and lives forever. +fn build_auth_backend( + args: &ProxyCliArgs, +) -> anyhow::Result, &'static ConsoleRedirectBackend>> { + match &args.auth_backend { + AuthBackendType::ControlPlaneV1 => { + let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; + let project_info_cache_config: ProjectInfoCacheOptions = + args.project_info_cache.parse()?; + let endpoint_cache_config: config::EndpointCacheConfig = + args.endpoint_cache_config.parse()?; + + info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}"); + info!( + "Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}" + ); + info!("Using EndpointCacheConfig with options={endpoint_cache_config:?}"); + let caches = Box::leak(Box::new(control_plane::caches::ApiCaches::new( + wake_compute_cache_config, + project_info_cache_config, + endpoint_cache_config, + ))); + + let config::ConcurrencyLockOptions { + shards, + limiter, + epoch, + timeout, + } = args.wake_compute_lock.parse()?; + info!(?limiter, shards, ?epoch, "Using NodeLocks (wake_compute)"); + let locks = Box::leak(Box::new(control_plane::locks::ApiLocks::new( + "wake_compute_lock", + limiter, + shards, + timeout, + epoch, + &Metrics::get().wake_compute_lock, + ))); + tokio::spawn(locks.garbage_collect_worker()); + + let url: crate::url::ApiUrl = args.auth_endpoint.parse()?; + + let endpoint = http::Endpoint::new(url, http::new_client()); + + let mut wake_compute_rps_limit = args.wake_compute_limit.clone(); + RateBucketInfo::validate(&mut wake_compute_rps_limit)?; + let wake_compute_endpoint_rate_limiter = + Arc::new(WakeComputeRateLimiter::new(wake_compute_rps_limit)); + + let api = control_plane::client::cplane_proxy_v1::NeonControlPlaneClient::new( + endpoint, + args.control_plane_token.clone(), + caches, + locks, + wake_compute_endpoint_rate_limiter, + ); + + let api = control_plane::client::ControlPlaneClient::ProxyV1(api); + let auth_backend = auth::Backend::ControlPlane(MaybeOwned::Owned(api), ()); + let config = Box::leak(Box::new(auth_backend)); + + Ok(Either::Left(config)) + } + + #[cfg(any(test, feature = "testing"))] + AuthBackendType::Postgres => { + let url = args.auth_endpoint.parse()?; + let api = control_plane::client::mock::MockControlPlane::new( + url, + !args.is_private_access_proxy, + ); + let api = control_plane::client::ControlPlaneClient::PostgresMock(api); + + let auth_backend = auth::Backend::ControlPlane(MaybeOwned::Owned(api), ()); + + let config = Box::leak(Box::new(auth_backend)); + + Ok(Either::Left(config)) + } + + AuthBackendType::ConsoleRedirect => { + let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; + let project_info_cache_config: ProjectInfoCacheOptions = + args.project_info_cache.parse()?; + let endpoint_cache_config: config::EndpointCacheConfig = + args.endpoint_cache_config.parse()?; + + info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}"); + info!( + "Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}" + ); + info!("Using EndpointCacheConfig with options={endpoint_cache_config:?}"); + let caches = Box::leak(Box::new(control_plane::caches::ApiCaches::new( + 
wake_compute_cache_config, + project_info_cache_config, + endpoint_cache_config, + ))); + + let config::ConcurrencyLockOptions { + shards, + limiter, + epoch, + timeout, + } = args.wake_compute_lock.parse()?; + info!(?limiter, shards, ?epoch, "Using NodeLocks (wake_compute)"); + let locks = Box::leak(Box::new(control_plane::locks::ApiLocks::new( + "wake_compute_lock", + limiter, + shards, + timeout, + epoch, + &Metrics::get().wake_compute_lock, + ))); + + let url = args.uri.clone().parse()?; + let ep_url: crate::url::ApiUrl = args.auth_endpoint.parse()?; + let endpoint = http::Endpoint::new(ep_url, http::new_client()); + let mut wake_compute_rps_limit = args.wake_compute_limit.clone(); + RateBucketInfo::validate(&mut wake_compute_rps_limit)?; + let wake_compute_endpoint_rate_limiter = + Arc::new(WakeComputeRateLimiter::new(wake_compute_rps_limit)); + + // Since we use only get_allowed_ips_and_secret() wake_compute_endpoint_rate_limiter + // and locks are not used in ConsoleRedirectBackend, + // but they are required by the NeonControlPlaneClient + let api = control_plane::client::cplane_proxy_v1::NeonControlPlaneClient::new( + endpoint, + args.control_plane_token.clone(), + caches, + locks, + wake_compute_endpoint_rate_limiter, + ); + + let backend = ConsoleRedirectBackend::new(url, api); + let config = Box::leak(Box::new(backend)); + + Ok(Either::Right(config)) + } + } +} + +#[cfg(test)] +mod tests { + use std::time::Duration; + + use crate::rate_limiter::RateBucketInfo; + use clap::Parser; + + #[test] + fn parse_endpoint_rps_limit() { + let config = super::ProxyCliArgs::parse_from([ + "proxy", + "--endpoint-rps-limit", + "100@1s", + "--endpoint-rps-limit", + "20@30s", + ]); + + assert_eq!( + config.endpoint_rps_limit, + vec![ + RateBucketInfo::new(100, Duration::from_secs(1)), + RateBucketInfo::new(20, Duration::from_secs(30)), + ] + ); + } +} diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index 4d919f374a..e84f1676e2 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -69,17 +69,35 @@ pub async fn handle_cancel_messages( value, resp_tx, _guard, - expire: _, + expire, } => { + let res = client.hset(&key, field, value).await; if let Some(resp_tx) = resp_tx { - resp_tx - .send(client.hset(key, field, value).await) - .inspect_err(|e| { - tracing::debug!("failed to send StoreCancelKey response: {:?}", e); - }) - .ok(); + if res.is_ok() { + resp_tx + .send(client.expire(key, expire).await) + .inspect_err(|e| { + tracing::debug!( + "failed to send StoreCancelKey response: {:?}", + e + ); + }) + .ok(); + } else { + resp_tx + .send(res) + .inspect_err(|e| { + tracing::debug!( + "failed to send StoreCancelKey response: {:?}", + e + ); + }) + .ok(); + } + } else if res.is_ok() { + drop(client.expire(key, expire).await); } else { - drop(client.hset(key, field, value).await); + tracing::warn!("failed to store cancel key: {:?}", res); } } CancelKeyOp::GetCancelData { @@ -436,7 +454,7 @@ impl Session { &self.key } - // Send the store key op to the cancellation handler + // Send the store key op to the cancellation handler and set TTL for the key pub(crate) async fn write_cancel_key( &self, cancel_closure: CancelClosure, diff --git a/proxy/src/compute_ctl/mod.rs b/proxy/src/compute_ctl/mod.rs index 60fdf107d4..ab3179afb2 100644 --- a/proxy/src/compute_ctl/mod.rs +++ b/proxy/src/compute_ctl/mod.rs @@ -42,14 +42,14 @@ pub enum Privilege { #[derive(Error, Debug)] pub enum ComputeCtlError { #[error("connection error: {0}")] - ConnectionError(#[source] 
reqwest_middleware::Error), + Connection(#[source] reqwest_middleware::Error), #[error("request error [{status}]: {body:?}")] - RequestError { + Request { status: StatusCode, body: Option, }, #[error("response parsing error: {0}")] - ResponseError(#[source] reqwest::Error), + Response(#[source] reqwest::Error), } impl ComputeCtlApi { @@ -89,14 +89,14 @@ impl ComputeCtlApi { .json(req) .send() .await - .map_err(ComputeCtlError::ConnectionError)?; + .map_err(ComputeCtlError::Connection)?; let status = resp.status(); if status.is_client_error() || status.is_server_error() { let body = resp.json().await.ok(); - return Err(ComputeCtlError::RequestError { status, body }); + return Err(ComputeCtlError::Request { status, body }); } - resp.json().await.map_err(ComputeCtlError::ResponseError) + resp.json().await.map_err(ComputeCtlError::Response) } } diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 1dcd37712e..460e0cff54 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -151,7 +151,6 @@ impl FromStr for EndpointCacheConfig { } #[derive(Debug)] pub struct MetricBackupCollectionConfig { - pub interval: Duration, pub remote_storage_config: Option, pub chunk_size: usize, } diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index 4f1dd39d92..0537ae6a62 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -187,6 +187,10 @@ pub async fn worker( let rx = futures::stream::poll_fn(move |cx| rx.poll_recv(cx)); let rx = rx.map(RequestData::from); + let storage = GenericRemoteStorage::from_config(&remote_storage_config) + .await + .context("remote storage init")?; + let properties = WriterProperties::builder() .set_data_page_size_limit(config.parquet_upload_page_size) .set_compression(config.parquet_upload_compression); @@ -220,18 +224,18 @@ pub async fn worker( let rx_disconnect = futures::stream::poll_fn(move |cx| rx_disconnect.poll_recv(cx)); let rx_disconnect = rx_disconnect.map(RequestData::from); + let storage_disconnect = + GenericRemoteStorage::from_config(&disconnect_events_storage_config) + .await + .context("remote storage for disconnect events init")?; let parquet_config_disconnect = parquet_config.clone(); tokio::try_join!( - worker_inner(remote_storage_config, rx, parquet_config), - worker_inner( - disconnect_events_storage_config, - rx_disconnect, - parquet_config_disconnect - ) + worker_inner(storage, rx, parquet_config), + worker_inner(storage_disconnect, rx_disconnect, parquet_config_disconnect) ) .map(|_| ()) } else { - worker_inner(remote_storage_config, rx, parquet_config).await + worker_inner(storage, rx, parquet_config).await } } @@ -247,32 +251,18 @@ struct ParquetConfig { test_remote_failures: u64, } -impl ParquetConfig { - async fn storage( - &self, - storage_config: &RemoteStorageConfig, - ) -> anyhow::Result { - let storage = GenericRemoteStorage::from_config(storage_config) - .await - .context("remote storage init")?; - - #[cfg(any(test, feature = "testing"))] - if self.test_remote_failures > 0 { - return Ok(GenericRemoteStorage::unreliable_wrapper( - storage, - self.test_remote_failures, - )); - } - - Ok(storage) - } -} - async fn worker_inner( - storage_config: RemoteStorageConfig, + storage: GenericRemoteStorage, rx: impl Stream, config: ParquetConfig, ) -> anyhow::Result<()> { + #[cfg(any(test, feature = "testing"))] + let storage = if config.test_remote_failures > 0 { + GenericRemoteStorage::unreliable_wrapper(storage, config.test_remote_failures) + } else { + storage + }; + let mut rx = std::pin::pin!(rx); 
let mut rows = Vec::with_capacity(config.rows_per_group); @@ -295,7 +285,7 @@ async fn worker_inner( } if len > config.file_size || force { last_upload = time::Instant::now(); - let file = upload_parquet(w, len, &storage_config, &config).await?; + let file = upload_parquet(w, len, &storage).await?; w = SerializedFileWriter::new(file, schema.clone(), config.propeties.clone())?; len = 0; } @@ -308,7 +298,7 @@ async fn worker_inner( } if !w.flushed_row_groups().is_empty() { - let _rtchk: Writer = upload_parquet(w, len, &storage_config, &config).await?; + let _rtchk: Writer = upload_parquet(w, len, &storage).await?; } Ok(()) @@ -350,8 +340,7 @@ where async fn upload_parquet( mut w: SerializedFileWriter>, len: i64, - storage_config: &RemoteStorageConfig, - config: &ParquetConfig, + storage: &GenericRemoteStorage, ) -> anyhow::Result> { let len_uncompressed = w .flushed_row_groups() @@ -388,15 +377,6 @@ async fn upload_parquet( size, compression, "uploading request parquet file" ); - // A bug in azure-sdk means that the identity-token-file that expires after - // 1 hour is not refreshed. This identity-token is used to fetch the actual azure storage - // tokens that last for 24 hours. After this 24 hour period, azure-sdk tries to refresh - // the storage token, but the identity token has now expired. - // - // - // To work around this, we recreate the storage every time. - let storage = config.storage(storage_config).await?; - let year = now.year(); let month = now.month(); let day = now.day(); @@ -451,8 +431,8 @@ mod tests { use rand::rngs::StdRng; use rand::{Rng, SeedableRng}; use remote_storage::{ - RemoteStorageConfig, RemoteStorageKind, S3Config, DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, - DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT, + GenericRemoteStorage, RemoteStorageConfig, RemoteStorageKind, S3Config, + DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT, }; use tokio::sync::mpsc; use tokio::time; @@ -579,11 +559,12 @@ mod tests { timeout: std::time::Duration::from_secs(120), small_timeout: std::time::Duration::from_secs(30), }; - - worker_inner(remote_storage_config, rx, config) + let storage = GenericRemoteStorage::from_config(&remote_storage_config) .await .unwrap(); + worker_inner(storage, rx, config).await.unwrap(); + let mut files = WalkDir::new(tmpdir.as_std_path()) .into_iter() .filter_map(|entry| entry.ok()) diff --git a/proxy/src/control_plane/client/mod.rs b/proxy/src/control_plane/client/mod.rs index a06943726e..c28ff4789d 100644 --- a/proxy/src/control_plane/client/mod.rs +++ b/proxy/src/control_plane/client/mod.rs @@ -212,15 +212,15 @@ impl ApiLocks { timeout: Duration, epoch: std::time::Duration, metrics: &'static ApiLockMetrics, - ) -> prometheus::Result { - Ok(Self { + ) -> Self { + Self { name, node_locks: ClashMap::with_shard_amount(shards), config, timeout, epoch, metrics, - }) + } } pub(crate) async fn get_permit(&self, key: &K) -> Result { diff --git a/proxy/src/control_plane/messages.rs b/proxy/src/control_plane/messages.rs index 5883d02b92..8d6b2e96f5 100644 --- a/proxy/src/control_plane/messages.rs +++ b/proxy/src/control_plane/messages.rs @@ -361,7 +361,8 @@ pub struct EndpointJwksResponse { pub struct JwksSettings { pub id: String, pub jwks_url: url::Url, - pub provider_name: String, + #[serde(rename = "provider_name")] + pub _provider_name: String, pub jwt_audience: Option, pub role_names: Vec, } diff --git a/proxy/src/http/health_server.rs b/proxy/src/http/health_server.rs index 6ca091feb7..141f319567 100644 --- 
a/proxy/src/http/health_server.rs +++ b/proxy/src/http/health_server.rs @@ -3,16 +3,16 @@ use std::net::TcpListener; use std::sync::{Arc, Mutex}; use anyhow::{anyhow, bail}; +use http_utils::endpoint::{self, request_span}; +use http_utils::error::ApiError; +use http_utils::json::json_response; +use http_utils::{RouterBuilder, RouterService}; use hyper0::header::CONTENT_TYPE; use hyper0::{Body, Request, Response, StatusCode}; use measured::text::BufferedTextEncoder; use measured::MetricGroup; use metrics::NeonMetrics; use tracing::{info, info_span}; -use utils::http::endpoint::{self, request_span}; -use utils::http::error::ApiError; -use utils::http::json::json_response; -use utils::http::{RouterBuilder, RouterService}; use crate::ext::{LockExt, TaskExt}; use crate::jemalloc; diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index c56474edd7..a9e5fbc85b 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -72,34 +72,36 @@ // List of temporarily allowed lints to unblock beta/nightly. #![allow(unknown_lints)] -pub mod auth; -pub mod cache; -pub mod cancellation; -pub mod compute; -pub mod compute_ctl; -pub mod config; -pub mod console_redirect_proxy; -pub mod context; -pub mod control_plane; -pub mod error; +pub mod binary; + +mod auth; +mod cache; +mod cancellation; +mod compute; +mod compute_ctl; +mod config; +mod console_redirect_proxy; +mod context; +mod control_plane; +mod error; mod ext; -pub mod http; -pub mod intern; -pub mod jemalloc; -pub mod logging; -pub mod metrics; -pub mod parse; -pub mod protocol2; -pub mod proxy; -pub mod rate_limiter; -pub mod redis; -pub mod sasl; -pub mod scram; -pub mod serverless; -pub mod signals; -pub mod stream; -pub mod tls; -pub mod types; -pub mod url; -pub mod usage_metrics; -pub mod waiters; +mod http; +mod intern; +mod jemalloc; +mod logging; +mod metrics; +mod parse; +mod protocol2; +mod proxy; +mod rate_limiter; +mod redis; +mod sasl; +mod scram; +mod serverless; +mod signals; +mod stream; +mod tls; +mod types; +mod url; +mod usage_metrics; +mod waiters; diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 25bcc81108..f3447e063e 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -205,7 +205,7 @@ pub enum Protocol { } impl Protocol { - pub fn as_str(&self) -> &'static str { + pub fn as_str(self) -> &'static str { match self { Protocol::Http => "http", Protocol::Ws => "ws", @@ -385,6 +385,7 @@ pub enum Waiting { #[derive(FixedCardinalityLabel, Copy, Clone)] #[label(singleton = "kind")] +#[allow(clippy::enum_variant_names)] pub enum RedisMsgKind { HSet, HSetMultiple, diff --git a/proxy/src/redis/cancellation_publisher.rs b/proxy/src/redis/cancellation_publisher.rs index 30d8b83e60..186fece4b2 100644 --- a/proxy/src/redis/cancellation_publisher.rs +++ b/proxy/src/redis/cancellation_publisher.rs @@ -5,9 +5,6 @@ use pq_proto::CancelKeyData; use tokio::sync::Mutex; use uuid::Uuid; -use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider; -use crate::rate_limiter::{GlobalRateLimiter, RateBucketInfo}; - pub trait CancellationPublisherMut: Send + Sync + 'static { #[allow(async_fn_in_trait)] async fn try_publish( @@ -79,36 +76,3 @@ impl CancellationPublisher for Arc> { .await } } - -pub struct RedisPublisherClient { - #[allow(dead_code)] - client: ConnectionWithCredentialsProvider, - _region_id: String, - _limiter: GlobalRateLimiter, -} - -impl RedisPublisherClient { - pub fn new( - client: ConnectionWithCredentialsProvider, - region_id: String, - info: &'static [RateBucketInfo], - ) -> anyhow::Result { 
- Ok(Self { - client, - _region_id: region_id, - _limiter: GlobalRateLimiter::new(info.into()), - }) - } - - #[allow(dead_code)] - pub(crate) async fn try_connect(&mut self) -> anyhow::Result<()> { - match self.client.connect().await { - Ok(()) => {} - Err(e) => { - tracing::error!("failed to connect to redis: {e}"); - return Err(e); - } - } - Ok(()) - } -} diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 0fb4a8a6cc..edc2935618 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -400,9 +400,9 @@ fn create_random_jwk() -> (SigningKey, jose_jwk::Key) { pub(crate) enum HttpConnError { #[error("pooled connection closed at inconsistent state")] ConnectionClosedAbruptly(#[from] tokio::sync::watch::error::SendError), - #[error("could not connection to postgres in compute")] + #[error("could not connect to postgres in compute")] PostgresConnectionError(#[from] postgres_client::Error), - #[error("could not connection to local-proxy in compute")] + #[error("could not connect to local-proxy in compute")] LocalProxyConnectionError(#[from] LocalProxyConnError), #[error("could not parse JWT payload")] JwtPayloadError(serde_json::Error), diff --git a/proxy/src/serverless/http_util.rs b/proxy/src/serverless/http_util.rs index d5c948777c..95a28663a5 100644 --- a/proxy/src/serverless/http_util.rs +++ b/proxy/src/serverless/http_util.rs @@ -6,8 +6,8 @@ use bytes::Bytes; use http::{Response, StatusCode}; use http_body_util::combinators::BoxBody; use http_body_util::{BodyExt, Full}; +use http_utils::error::ApiError; use serde::Serialize; -use utils::http::error::ApiError; /// Like [`ApiError::into_response`] pub(crate) fn api_error_into_response(this: ApiError) -> Response> { @@ -59,14 +59,14 @@ pub(crate) fn api_error_into_response(this: ApiError) -> Response Response> { Response::builder() .status(status) @@ -92,7 +92,7 @@ impl HttpErrorBody { } } -/// Same as [`utils::http::json::json_response`] +/// Same as [`http_utils::json::json_response`] pub(crate) fn json_response( status: StatusCode, data: T, diff --git a/proxy/src/serverless/mod.rs b/proxy/src/serverless/mod.rs index 6888772362..8289500159 100644 --- a/proxy/src/serverless/mod.rs +++ b/proxy/src/serverless/mod.rs @@ -28,6 +28,7 @@ use futures::TryFutureExt; use http::{Method, Response, StatusCode}; use http_body_util::combinators::BoxBody; use http_body_util::{BodyExt, Empty}; +use http_utils::error::ApiError; use hyper::body::Incoming; use hyper_util::rt::TokioExecutor; use hyper_util::server::conn::auto::Builder; @@ -41,7 +42,6 @@ use tokio_rustls::TlsAcceptor; use tokio_util::sync::CancellationToken; use tokio_util::task::TaskTracker; use tracing::{info, warn, Instrument}; -use utils::http::error::ApiError; use crate::cancellation::CancellationHandler; use crate::config::{ProxyConfig, ProxyProtocolV2}; diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 3e42787a09..5982fe225d 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -8,21 +8,22 @@ use http::header::AUTHORIZATION; use http::Method; use http_body_util::combinators::BoxBody; use http_body_util::{BodyExt, Full}; +use http_utils::error::ApiError; use hyper::body::Incoming; use hyper::http::{HeaderName, HeaderValue}; use hyper::{header, HeaderMap, Request, Response, StatusCode}; +use indexmap::IndexMap; use postgres_client::error::{DbError, ErrorPosition, SqlState}; use postgres_client::{GenericClient, IsolationLevel, NoTls, 
ReadyForQueryStatus, Transaction}; use pq_proto::StartupMessageParamsBuilder; use serde::Serialize; +use serde_json::value::RawValue; use serde_json::Value; use tokio::time::{self, Instant}; use tokio_util::sync::CancellationToken; use tracing::{debug, error, info}; use typed_json::json; use url::Url; -use urlencoding; -use utils::http::error::ApiError; use uuid::Uuid; use super::backend::{LocalProxyConnError, PoolingBackend}; @@ -249,6 +250,50 @@ pub(crate) async fn handle( let mut response = match result { Ok(r) => { ctx.set_success(); + + // Handling the error response from local proxy here + if config.authentication_config.is_auth_broker && r.status().is_server_error() { + let status = r.status(); + + let body_bytes = r + .collect() + .await + .map_err(|e| { + ApiError::InternalServerError(anyhow::Error::msg(format!( + "could not collect http body: {e}" + ))) + })? + .to_bytes(); + + if let Ok(mut json_map) = + serde_json::from_slice::>(&body_bytes) + { + let message = json_map.get("message"); + if let Some(message) = message { + let msg: String = match serde_json::from_str(message.get()) { + Ok(msg) => msg, + Err(_) => { + "Unable to parse the response message from server".to_string() + } + }; + + error!("Error response from local_proxy: {status} {msg}"); + + json_map.retain(|key, _| !key.starts_with("neon:")); // remove all the neon-related keys + + let resp_json = serde_json::to_string(&json_map) + .unwrap_or("failed to serialize the response message".to_string()); + + return json_response(status, resp_json); + } + } + + error!("Unable to parse the response message from local_proxy"); + return json_response( + status, + json!({ "message": "Unable to parse the response message from server".to_string() }), + ); + } r } Err(e @ SqlOverHttpError::Cancelled(_)) => { @@ -618,8 +663,6 @@ async fn handle_db_inner( let authenticate_and_connect = Box::pin( async { - let is_local_proxy = matches!(backend.auth_backend, crate::auth::Backend::Local(_)); - let keys = match auth { AuthData::Password(pw) => { backend @@ -634,7 +677,9 @@ async fn handle_db_inner( }; let client = match keys.keys { - ComputeCredentialKeys::JwtPayload(payload) if is_local_proxy => { + ComputeCredentialKeys::JwtPayload(payload) + if backend.auth_backend.is_local_proxy() => + { let mut client = backend.connect_to_local_postgres(ctx, conn_info).await?; let (cli_inner, _dsc) = client.client_inner(); cli_inner.set_jwt_session(&payload).await?; diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index 0eb511f1cc..d12ebc1030 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -63,6 +63,7 @@ sha2.workspace = true sd-notify.workspace = true storage_broker.workspace = true tokio-stream.workspace = true +http-utils.workspace = true utils.workspace = true wal_decoder.workspace = true env_logger.workspace = true diff --git a/safekeeper/client/Cargo.toml b/safekeeper/client/Cargo.toml index 6c5a52de3a..0b660aaf32 100644 --- a/safekeeper/client/Cargo.toml +++ b/safekeeper/client/Cargo.toml @@ -5,6 +5,7 @@ edition.workspace = true license.workspace = true [dependencies] +http-utils.workspace = true safekeeper_api.workspace = true thiserror.workspace = true reqwest = { workspace = true, features = [ "stream" ] } diff --git a/safekeeper/client/src/mgmt_api.rs b/safekeeper/client/src/mgmt_api.rs index f65bfaa6d5..df049f3eba 100644 --- a/safekeeper/client/src/mgmt_api.rs +++ b/safekeeper/client/src/mgmt_api.rs @@ -3,11 +3,11 @@ //! Partially copied from pageserver client; some parts might be better to be //! united. 
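Stepping back to the `proxy/src/serverless/sql_over_http.rs` hunk above: on the auth-broker path the proxy parses the error body returned by local_proxy, drops the internal `neon:`-prefixed keys, and re-serializes the remainder for the client. A simplified sketch of that filtering step using `serde_json::Map` and `Value` (the hunk itself uses `IndexMap` with `RawValue`; this is only an illustration of the idea):

```rust
use serde_json::{Map, Value};

/// Drop internal `neon:`-prefixed keys from a JSON object and re-serialize it.
fn strip_internal_keys(body: &[u8]) -> serde_json::Result<String> {
    let mut map: Map<String, Value> = serde_json::from_slice(body)?;
    // Remove all the neon-related keys before forwarding the error downstream.
    map.retain(|key, _| !key.starts_with("neon:"));
    serde_json::to_string(&map)
}

fn main() -> serde_json::Result<()> {
    let body = br#"{"message":"boom","neon:request_id":"abc"}"#;
    assert_eq!(strip_internal_keys(body)?, r#"{"message":"boom"}"#);
    Ok(())
}
```

Only the non-internal fields, such as `message`, survive and are returned to the caller.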
+use http_utils::error::HttpErrorBody; use reqwest::{IntoUrl, Method, StatusCode}; use safekeeper_api::models::{TimelineCreateRequest, TimelineStatus}; use std::error::Error as _; use utils::{ - http::error::HttpErrorBody, id::{NodeId, TenantId, TimelineId}, logging::SecretString, }; diff --git a/safekeeper/src/http/mod.rs b/safekeeper/src/http/mod.rs index d82a713f8a..6e160b7a5e 100644 --- a/safekeeper/src/http/mod.rs +++ b/safekeeper/src/http/mod.rs @@ -14,7 +14,7 @@ pub async fn task_main( let router = make_router(conf, global_timelines) .build() .map_err(|err| anyhow::anyhow!(err))?; - let service = utils::http::RouterService::new(router).unwrap(); + let service = http_utils::RouterService::new(router).unwrap(); let server = hyper::Server::from_tcp(http_listener)?; server.serve(service).await?; Ok(()) // unreachable diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 7ec08ecf9a..a64bf1ddd8 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -1,3 +1,4 @@ +use http_utils::failpoints::failpoints_handler; use hyper::{Body, Request, Response, StatusCode}; use safekeeper_api::models; use safekeeper_api::models::AcceptorStateStatus; @@ -17,25 +18,23 @@ use tokio::task; use tokio_stream::wrappers::ReceiverStream; use tokio_util::sync::CancellationToken; use tracing::{info_span, Instrument}; -use utils::failpoint_support::failpoints_handler; -use utils::http::endpoint::{ + +use http_utils::endpoint::{ profile_cpu_handler, profile_heap_handler, prometheus_metrics_handler, request_span, - ChannelWriter, }; -use utils::http::request::parse_query_param; +use http_utils::{ + endpoint::{self, auth_middleware, check_permission_with, ChannelWriter}, + error::ApiError, + json::{json_request, json_response}, + request::{ensure_no_body, parse_query_param, parse_request_param}, + RequestExt, RouterBuilder, +}; use postgres_ffi::WAL_SEGMENT_SIZE; use safekeeper_api::models::{SkTimelineInfo, TimelineCopyRequest}; use safekeeper_api::models::{TimelineCreateRequest, TimelineTermBumpRequest}; use utils::{ auth::SwappableJwtAuth, - http::{ - endpoint::{self, auth_middleware, check_permission_with}, - error::ApiError, - json::{json_request, json_response}, - request::{ensure_no_body, parse_request_param}, - RequestExt, RouterBuilder, - }, id::{TenantId, TenantTimelineId, TimelineId}, lsn::Lsn, }; diff --git a/safekeeper/src/send_interpreted_wal.rs b/safekeeper/src/send_interpreted_wal.rs index ea09ce364d..5916675c3f 100644 --- a/safekeeper/src/send_interpreted_wal.rs +++ b/safekeeper/src/send_interpreted_wal.rs @@ -15,7 +15,8 @@ use tokio::io::{AsyncRead, AsyncWrite}; use tokio::sync::mpsc::error::SendError; use tokio::task::JoinHandle; use tokio::time::MissedTickBehavior; -use tracing::{info_span, Instrument}; +use tracing::{error, info, info_span, Instrument}; +use utils::critical; use utils::lsn::Lsn; use utils::postgres_client::Compression; use utils::postgres_client::InterpretedFormat; @@ -120,6 +121,20 @@ pub enum InterpretedWalReaderError { WalStreamClosed, } +enum CurrentPositionUpdate { + Reset(Lsn), + NotReset(Lsn), +} + +impl CurrentPositionUpdate { + fn current_position(&self) -> Lsn { + match self { + CurrentPositionUpdate::Reset(lsn) => *lsn, + CurrentPositionUpdate::NotReset(lsn) => *lsn, + } + } +} + impl InterpretedWalReaderState { fn current_position(&self) -> Option { match self { @@ -129,6 +144,26 @@ impl InterpretedWalReaderState { InterpretedWalReaderState::Done => None, } } + + // Reset the current position of the WAL reader if the 
requested starting position + // of the new shard is smaller than the current value. + fn maybe_reset(&mut self, new_shard_start_pos: Lsn) -> CurrentPositionUpdate { + match self { + InterpretedWalReaderState::Running { + current_position, .. + } => { + if new_shard_start_pos < *current_position { + *current_position = new_shard_start_pos; + CurrentPositionUpdate::Reset(*current_position) + } else { + CurrentPositionUpdate::NotReset(*current_position) + } + } + InterpretedWalReaderState::Done => { + panic!("maybe_reset called on finished reader") + } + } + } } pub(crate) struct AttachShardNotification { @@ -179,11 +214,10 @@ impl InterpretedWalReader { metric.dec(); } - let res = reader.run_impl(start_pos).await; - if let Err(ref err) = res { - tracing::error!("Task finished with error: {err}"); - } - res + reader + .run_impl(start_pos) + .await + .inspect_err(|err| critical!("failed to read WAL record: {err:?}")) } .instrument(info_span!("interpreted wal reader")), ); @@ -239,11 +273,10 @@ impl InterpretedWalReader { metric.dec(); } - let res = self.run_impl(start_pos).await; - if let Err(err) = res { - tracing::error!("Interpreted wal reader encountered error: {err}"); + if let Err(err) = self.run_impl(start_pos).await { + critical!("failed to read WAL record: {err:?}"); } else { - tracing::info!("Interpreted wal reader exiting"); + info!("interpreted wal reader exiting"); } Err(CopyStreamHandlerEnd::Other(anyhow!( @@ -410,15 +443,24 @@ impl InterpretedWalReader { }; senders.push(ShardSenderState { sender_id: new_sender_id, tx: sender, next_record_lsn: start_pos}); - let current_pos = self.state.read().unwrap().current_position().unwrap(); - if start_pos < current_pos { - self.wal_stream.reset(start_pos).await; - wal_decoder = WalStreamDecoder::new(start_pos, self.pg_version); - } + + // If the shard is subscribing below the current position the we need + // to update the cursor that tracks where we are at in the WAL + // ([`Self::state`]) and reset the WAL stream itself + // (`[Self::wal_stream`]). This must be done atomically from the POV of + // anything outside the select statement. 
+ let position_reset = self.state.write().unwrap().maybe_reset(start_pos); + match position_reset { + CurrentPositionUpdate::Reset(to) => { + self.wal_stream.reset(to).await; + wal_decoder = WalStreamDecoder::new(to, self.pg_version); + }, + CurrentPositionUpdate::NotReset(_) => {} + }; tracing::info!( "Added shard sender {} with start_pos={} current_pos={}", - ShardSenderId::new(shard_id, new_sender_id), start_pos, current_pos + ShardSenderId::new(shard_id, new_sender_id), start_pos, position_reset.current_position() ); } } @@ -584,7 +626,7 @@ mod tests { .unwrap(); let resident_tli = tli.wal_residence_guard().await.unwrap(); - let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT) + let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT, None) .await .unwrap(); let end_pos = end_watch.get(); @@ -715,7 +757,6 @@ mod tests { const MSG_COUNT: usize = 200; const PG_VERSION: u32 = 17; const SHARD_COUNT: u8 = 2; - const ATTACHED_SHARDS: u8 = 4; let start_lsn = Lsn::from_str("0/149FD18").unwrap(); let env = Env::new(true).unwrap(); @@ -725,9 +766,11 @@ mod tests { .unwrap(); let resident_tli = tli.wal_residence_guard().await.unwrap(); - let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT) - .await - .unwrap(); + let mut next_record_lsns = Vec::default(); + let end_watch = + Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT, Some(&mut next_record_lsns)) + .await + .unwrap(); let end_pos = end_watch.get(); let streaming_wal_reader = StreamingWalReader::new( @@ -746,38 +789,71 @@ mod tests { ) .unwrap(); - let (tx, rx) = tokio::sync::mpsc::channel::(MSG_COUNT * 2); - let mut batch_receivers = vec![rx]; + struct Sender { + tx: Option>, + rx: tokio::sync::mpsc::Receiver, + shard: ShardIdentity, + start_lsn: Lsn, + received_next_record_lsns: Vec, + } + impl Sender { + fn new(start_lsn: Lsn, shard: ShardIdentity) -> Self { + let (tx, rx) = tokio::sync::mpsc::channel::(MSG_COUNT * 2); + Self { + tx: Some(tx), + rx, + shard, + start_lsn, + received_next_record_lsns: Vec::default(), + } + } + } + + assert!(next_record_lsns.len() > 7); + let start_lsns = vec![ + next_record_lsns[5], + next_record_lsns[1], + next_record_lsns[3], + ]; + let mut senders = start_lsns + .into_iter() + .map(|lsn| Sender::new(lsn, shard_0)) + .collect::>(); + + let first_sender = senders.first_mut().unwrap(); let handle = InterpretedWalReader::spawn( streaming_wal_reader, - start_lsn, - tx, - shard_0, + first_sender.start_lsn, + first_sender.tx.take().unwrap(), + first_sender.shard, PG_VERSION, &Some("pageserver".to_string()), ); - for _ in 0..(ATTACHED_SHARDS - 1) { - let (tx, rx) = tokio::sync::mpsc::channel::(MSG_COUNT * 2); - handle.fanout(shard_0, tx, start_lsn).unwrap(); - batch_receivers.push(rx); + for sender in senders.iter_mut().skip(1) { + handle + .fanout(sender.shard, sender.tx.take().unwrap(), sender.start_lsn) + .unwrap(); } - loop { - let batch = batch_receivers.first_mut().unwrap().recv().await.unwrap(); - for rx in batch_receivers.iter_mut().skip(1) { - let other_batch = rx.recv().await.unwrap(); - - assert_eq!(batch.wal_end_lsn, other_batch.wal_end_lsn); - assert_eq!( - batch.available_wal_end_lsn, - other_batch.available_wal_end_lsn + for sender in senders.iter_mut() { + loop { + let batch = sender.rx.recv().await.unwrap(); + tracing::info!( + "Sender with start_lsn={} received batch ending at {} with {} records", + sender.start_lsn, + batch.wal_end_lsn, + batch.records.records.len() ); - } - if batch.wal_end_lsn == batch.available_wal_end_lsn { - break; + for rec in 
batch.records.records { + sender.received_next_record_lsns.push(rec.next_record_lsn); + } + + if batch.wal_end_lsn == batch.available_wal_end_lsn { + break; + } } } @@ -792,5 +868,20 @@ mod tests { } assert!(done); + + for sender in senders { + tracing::info!( + "Validating records received by sender with start_lsn={}", + sender.start_lsn + ); + + assert!(sender.received_next_record_lsns.is_sorted()); + let expected = next_record_lsns + .iter() + .filter(|lsn| **lsn > sender.start_lsn) + .copied() + .collect::>(); + assert_eq!(sender.received_next_record_lsns, expected); + } } } diff --git a/safekeeper/src/test_utils.rs b/safekeeper/src/test_utils.rs index 4e851c5b3d..79ceddd366 100644 --- a/safekeeper/src/test_utils.rs +++ b/safekeeper/src/test_utils.rs @@ -122,6 +122,7 @@ impl Env { start_lsn: Lsn, msg_size: usize, msg_count: usize, + mut next_record_lsns: Option<&mut Vec>, ) -> anyhow::Result { let (msg_tx, msg_rx) = tokio::sync::mpsc::channel(receive_wal::MSG_QUEUE_SIZE); let (reply_tx, mut reply_rx) = tokio::sync::mpsc::channel(receive_wal::REPLY_QUEUE_SIZE); @@ -130,7 +131,7 @@ impl Env { WalAcceptor::spawn(tli.wal_residence_guard().await?, msg_rx, reply_tx, Some(0)); - let prefix = c"p"; + let prefix = c"neon-file:"; let prefixlen = prefix.to_bytes_with_nul().len(); assert!(msg_size >= prefixlen); let message = vec![0; msg_size - prefixlen]; @@ -139,6 +140,9 @@ impl Env { &mut WalGenerator::new(LogicalMessageGenerator::new(prefix, &message), start_lsn); for _ in 0..msg_count { let (lsn, record) = walgen.next().unwrap(); + if let Some(ref mut lsns) = next_record_lsns { + lsns.push(lsn); + } let req = AppendRequest { h: AppendRequestHeader { diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 5eb0bd7146..4341f13824 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -14,6 +14,7 @@ use tokio_util::sync::CancellationToken; use utils::id::TenantId; use utils::sync::gate::Gate; +use http_utils::error::ApiError; use std::cmp::max; use std::ops::{Deref, DerefMut}; use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; @@ -22,7 +23,6 @@ use std::time::Duration; use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard}; use tokio::{sync::watch, time::Instant}; use tracing::*; -use utils::http::error::ApiError; use utils::{ id::{NodeId, TenantTimelineId}, lsn::Lsn, @@ -592,6 +592,8 @@ impl Timeline { assert!(self.cancel.is_cancelled()); assert!(self.gate.close_complete()); + info!("deleting timeline {} from disk", self.ttid); + // Close associated FDs. Nobody will be able to touch timeline data once // it is cancelled, so WAL storage won't be opened again. shared_state.sk.close_wal_store(); diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs index 01c6aff6c3..1ff6a72bce 100644 --- a/safekeeper/src/timelines_global_map.rs +++ b/safekeeper/src/timelines_global_map.rs @@ -475,6 +475,8 @@ impl GlobalTimelines { info!("deleting timeline {}, only_local={}", ttid, only_local); timeline.shutdown().await; + info!("timeline {ttid} shut down for deletion"); + // Take a lock and finish the deletion holding this mutex. 
let mut shared_state = timeline.write_shared_state().await; diff --git a/safekeeper/src/wal_reader_stream.rs b/safekeeper/src/wal_reader_stream.rs index adac6067da..a0dd571a34 100644 --- a/safekeeper/src/wal_reader_stream.rs +++ b/safekeeper/src/wal_reader_stream.rs @@ -246,7 +246,7 @@ mod tests { .unwrap(); let resident_tli = tli.wal_residence_guard().await.unwrap(); - let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT) + let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT, None) .await .unwrap(); let end_pos = end_watch.get(); diff --git a/scripts/generate_image_maps.py b/scripts/generate_image_maps.py new file mode 100644 index 0000000000..a2f553d290 --- /dev/null +++ b/scripts/generate_image_maps.py @@ -0,0 +1,58 @@ +import itertools +import json +import os + +build_tag = os.environ["BUILD_TAG"] +branch = os.environ["BRANCH"] +dev_acr = os.environ["DEV_ACR"] +prod_acr = os.environ["PROD_ACR"] + +components = { + "neon": ["neon"], + "compute": [ + "compute-node-v14", + "compute-node-v15", + "compute-node-v16", + "compute-node-v17", + "vm-compute-node-v14", + "vm-compute-node-v15", + "vm-compute-node-v16", + "vm-compute-node-v17", + ], +} + +registries = { + "dev": [ + "docker.io/neondatabase", + "369495373322.dkr.ecr.eu-central-1.amazonaws.com", + f"{dev_acr}.azurecr.io/neondatabase", + ], + "prod": [ + "093970136003.dkr.ecr.eu-central-1.amazonaws.com", + f"{prod_acr}.azurecr.io/neondatabase", + ], +} + +outputs: dict[str, dict[str, list[str]]] = {} + +target_tags = [build_tag, "latest"] if branch == "main" else [build_tag] +target_stages = ["dev", "prod"] if branch.startswith("release") else ["dev"] + +for component_name, component_images in components.items(): + for stage in target_stages: + outputs[f"{component_name}-{stage}"] = dict( + [ + ( + f"docker.io/neondatabase/{component_image}:{build_tag}", + [ + f"{combo[0]}/{component_image}:{combo[1]}" + for combo in itertools.product(registries[stage], target_tags) + ], + ) + for component_image in component_images + ] + ) + +with open(os.environ["GITHUB_OUTPUT"], "a") as f: + for key, value in outputs.items(): + f.write(f"{key}={json.dumps(value)}\n") diff --git a/scripts/ingest_regress_test_result-new-format.py b/scripts/ingest_regress_test_result-new-format.py index ad2baf56bb..3a5cdf013a 100644 --- a/scripts/ingest_regress_test_result-new-format.py +++ b/scripts/ingest_regress_test_result-new-format.py @@ -32,6 +32,7 @@ CREATE TABLE IF NOT EXISTS results ( flaky BOOLEAN NOT NULL, arch arch DEFAULT 'X64', lfc BOOLEAN DEFAULT false NOT NULL, + sanitizers BOOLEAN DEFAULT false NOT NULL, build_type TEXT NOT NULL, pg_version INT NOT NULL, run_id BIGINT NOT NULL, @@ -39,7 +40,7 @@ CREATE TABLE IF NOT EXISTS results ( reference TEXT NOT NULL, revision CHAR(40) NOT NULL, raw JSONB COMPRESSION lz4 NOT NULL, - UNIQUE (parent_suite, suite, name, arch, build_type, pg_version, started_at, stopped_at, run_id) + UNIQUE (parent_suite, suite, name, arch, lfc, sanitizers, build_type, pg_version, started_at, stopped_at, run_id) ); """ @@ -56,6 +57,7 @@ class Row: flaky: bool arch: str lfc: bool + sanitizers: bool build_type: str pg_version: int run_id: int @@ -135,6 +137,7 @@ def ingest_test_result( } arch = parameters.get("arch", "UNKNOWN").strip("'") lfc = parameters.get("lfc", "without-lfc").strip("'") == "with-lfc" + sanitizers = parameters.get("sanitizers", "disabled").strip("'") == "enabled" build_type, pg_version, unparametrized_name = parse_test_name(test["name"]) labels = {label["name"]: label["value"] for label in 
test["labels"]} @@ -149,6 +152,7 @@ def ingest_test_result( flaky=test["flaky"] or test["retriesStatusChange"], arch=arch, lfc=lfc, + sanitizers=sanitizers, build_type=build_type, pg_version=pg_version, run_id=run_id, diff --git a/scripts/push_with_image_map.py b/scripts/push_with_image_map.py new file mode 100644 index 0000000000..c68f6ad407 --- /dev/null +++ b/scripts/push_with_image_map.py @@ -0,0 +1,22 @@ +import json +import os +import subprocess + +image_map = os.getenv("IMAGE_MAP") +if not image_map: + raise ValueError("IMAGE_MAP environment variable is not set") + +try: + parsed_image_map: dict[str, list[str]] = json.loads(image_map) +except json.JSONDecodeError as e: + raise ValueError("Failed to parse IMAGE_MAP as JSON") from e + +for source, targets in parsed_image_map.items(): + for target in targets: + cmd = ["docker", "buildx", "imagetools", "create", "-t", target, source] + print(f"Running: {' '.join(cmd)}") + result = subprocess.run(cmd, text=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + + if result.returncode != 0: + print(f"Error: {result.stdout}") + raise RuntimeError(f"Command failed: {' '.join(cmd)}") diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml index 63f43cdf62..91d8098cb9 100644 --- a/storage_controller/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -55,6 +55,7 @@ diesel-async = { version = "0.5.2", features = ["postgres", "bb8", "async-connec diesel_migrations = { version = "2.2.0" } scoped-futures = "0.1.4" +http-utils = { path = "../libs/http-utils/" } utils = { path = "../libs/utils/" } metrics = { path = "../libs/metrics/" } control_plane = { path = "../control_plane" } diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index ac890b008f..1a56116cad 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -8,6 +8,14 @@ use crate::reconciler::ReconcileError; use crate::service::{LeadershipStatus, Service, RECONCILE_TIMEOUT, STARTUP_RECONCILE_TIMEOUT}; use anyhow::Context; use futures::Future; +use http_utils::{ + endpoint::{self, auth_middleware, check_permission_with, request_span}, + error::ApiError, + failpoints::failpoints_handler, + json::{json_request, json_response}, + request::{must_get_query_param, parse_query_param, parse_request_param}, + RequestExt, RouterBuilder, +}; use hyper::header::CONTENT_TYPE; use hyper::{Body, Request, Response}; use hyper::{StatusCode, Uri}; @@ -29,20 +37,7 @@ use std::sync::Arc; use std::time::{Duration, Instant}; use tokio_util::sync::CancellationToken; use utils::auth::{Scope, SwappableJwtAuth}; -use utils::failpoint_support::failpoints_handler; -use utils::http::endpoint::{auth_middleware, check_permission_with, request_span}; -use utils::http::request::{must_get_query_param, parse_query_param, parse_request_param}; -use utils::id::{TenantId, TimelineId}; - -use utils::{ - http::{ - endpoint::{self}, - error::ApiError, - json::{json_request, json_response}, - RequestExt, RouterBuilder, - }, - id::NodeId, -}; +use utils::id::{NodeId, TenantId, TimelineId}; use pageserver_api::controller_api::{ NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, TenantPolicyRequest, diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 659c088d51..07279a67ff 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -320,7 +320,7 @@ async fn async_main() -> anyhow::Result<()> { let router = make_router(service.clone(), auth, build_info) .build() .map_err(|err| anyhow!(err))?; - let 
router_service = utils::http::RouterService::new(router).unwrap(); + let router_service = http_utils::RouterService::new(router).unwrap(); // Start HTTP server let server_shutdown = CancellationToken::new(); diff --git a/storage_controller/src/peer_client.rs b/storage_controller/src/peer_client.rs index ee4eb55294..1a15bae365 100644 --- a/storage_controller/src/peer_client.rs +++ b/storage_controller/src/peer_client.rs @@ -6,9 +6,10 @@ use std::error::Error as _; use std::time::Duration; use tokio_util::sync::CancellationToken; +use http_utils::error::HttpErrorBody; use hyper::Uri; use reqwest::{StatusCode, Url}; -use utils::{backoff, http::error::HttpErrorBody}; +use utils::backoff; #[derive(Debug, Clone)] pub(crate) struct PeerClient { diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs index f9e72862ae..106a7b2699 100644 --- a/storage_controller/src/scheduler.rs +++ b/storage_controller/src/scheduler.rs @@ -1,9 +1,10 @@ use crate::{metrics::NodeLabelGroup, node::Node, tenant_shard::TenantShard}; +use http_utils::error::ApiError; use itertools::Itertools; use pageserver_api::{controller_api::AvailabilityZone, models::PageserverUtilization}; use serde::Serialize; use std::{collections::HashMap, fmt::Debug}; -use utils::{http::error::ApiError, id::NodeId}; +use utils::id::NodeId; /// Scenarios in which we cannot find a suitable location for a tenant shard #[derive(thiserror::Error, Debug)] diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 4028cd7023..6829663a4c 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -61,6 +61,7 @@ use reqwest::StatusCode; use tracing::{instrument, Instrument}; use crate::pageserver_client::PageserverClient; +use http_utils::error::ApiError; use pageserver_api::{ models::{ self, LocationConfig, LocationConfigListResponse, LocationConfigMode, @@ -81,7 +82,6 @@ use utils::{ completion::Barrier, failpoint_support, generation::Generation, - http::error::ApiError, id::{NodeId, TenantId, TimelineId}, pausable_failpoint, sync::gate::Gate, diff --git a/test_runner/fixtures/compute_reconfigure.py b/test_runner/fixtures/compute_reconfigure.py index 33f01f80fb..425abef935 100644 --- a/test_runner/fixtures/compute_reconfigure.py +++ b/test_runner/fixtures/compute_reconfigure.py @@ -69,7 +69,10 @@ def compute_reconfigure_listener(make_httpserver: HTTPServer): # This causes the endpoint to query storage controller for its location, which # is redundant since we already have it here, but this avoids extending the # neon_local CLI to take full lists of locations - reconfigure_threads.submit(lambda workload=workload: workload.reconfigure()) # type: ignore[misc] + fut = reconfigure_threads.submit(lambda workload=workload: workload.reconfigure()) # type: ignore[misc] + + # To satisfy semantics of notify-attach API, we must wait for the change to be applied before returning 200 + fut.result() return Response(status=200) diff --git a/test_runner/fixtures/endpoint/http.py b/test_runner/fixtures/endpoint/http.py index 6e8210e978..cdc162fca2 100644 --- a/test_runner/fixtures/endpoint/http.py +++ b/test_runner/fixtures/endpoint/http.py @@ -9,21 +9,23 @@ from requests.adapters import HTTPAdapter class EndpointHttpClient(requests.Session): def __init__( self, - port: int, + external_port: int, + internal_port: int, ): super().__init__() - self.port = port + self.external_port: int = external_port + self.internal_port: int = internal_port self.mount("http://", HTTPAdapter()) def 
dbs_and_roles(self): - res = self.get(f"http://localhost:{self.port}/dbs_and_roles") + res = self.get(f"http://localhost:{self.external_port}/dbs_and_roles") res.raise_for_status() return res.json() def database_schema(self, database: str): res = self.get( - f"http://localhost:{self.port}/database_schema?database={urllib.parse.quote(database, safe='')}" + f"http://localhost:{self.external_port}/database_schema?database={urllib.parse.quote(database, safe='')}" ) res.raise_for_status() return res.text @@ -34,20 +36,20 @@ class EndpointHttpClient(requests.Session): "version": version, "database": database, } - res = self.post(f"http://localhost:{self.port}/extensions", json=body) + res = self.post(f"http://localhost:{self.internal_port}/extensions", json=body) res.raise_for_status() return res.json() def set_role_grants(self, database: str, role: str, schema: str, privileges: list[str]): res = self.post( - f"http://localhost:{self.port}/grants", + f"http://localhost:{self.internal_port}/grants", json={"database": database, "schema": schema, "role": role, "privileges": privileges}, ) res.raise_for_status() return res.json() def metrics(self) -> str: - res = self.get(f"http://localhost:{self.port}/metrics") + res = self.get(f"http://localhost:{self.external_port}/metrics") res.raise_for_status() return res.text @@ -62,5 +64,5 @@ class EndpointHttpClient(requests.Session): } ) - res = self.post(f"http://localhost:{self.port}/failpoints", json=body) + res = self.post(f"http://localhost:{self.internal_port}/failpoints", json=body) res.raise_for_status() diff --git a/test_runner/fixtures/neon_cli.py b/test_runner/fixtures/neon_cli.py index 33d422c590..97a5a36814 100644 --- a/test_runner/fixtures/neon_cli.py +++ b/test_runner/fixtures/neon_cli.py @@ -478,7 +478,8 @@ class NeonLocalCli(AbstractNeonCli): self, branch_name: str, pg_port: int, - http_port: int, + external_http_port: int, + internal_http_port: int, tenant_id: TenantId, pg_version: PgVersion, endpoint_id: str | None = None, @@ -486,6 +487,7 @@ class NeonLocalCli(AbstractNeonCli): lsn: Lsn | None = None, pageserver_id: int | None = None, allow_multiple=False, + update_catalog: bool = False, ) -> subprocess.CompletedProcess[str]: args = [ "endpoint", @@ -501,8 +503,10 @@ class NeonLocalCli(AbstractNeonCli): args.extend(["--lsn", str(lsn)]) if pg_port is not None: args.extend(["--pg-port", str(pg_port)]) - if http_port is not None: - args.extend(["--http-port", str(http_port)]) + if external_http_port is not None: + args.extend(["--external-http-port", str(external_http_port)]) + if internal_http_port is not None: + args.extend(["--internal-http-port", str(internal_http_port)]) if endpoint_id is not None: args.append(endpoint_id) if hot_standby: @@ -511,6 +515,8 @@ class NeonLocalCli(AbstractNeonCli): args.extend(["--pageserver-id", str(pageserver_id)]) if allow_multiple: args.extend(["--allow-multiple"]) + if update_catalog: + args.extend(["--update-catalog"]) res = self.raw_cli(args) res.check_returncode() diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 7c4991ffab..2fa82754ef 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -3345,7 +3345,7 @@ class NeonProxy(PgProtocol): metric_collection_interval: str | None = None, ): host = "127.0.0.1" - domain = "proxy.localtest.me" # resolves to 127.0.0.1 + domain = "proxy.local.neon.build" # resolves to 127.0.0.1 super().__init__(dsn=auth_backend.default_conn_url, host=domain, port=proxy_port) 
self.domain = domain @@ -3368,7 +3368,7 @@ class NeonProxy(PgProtocol): # generate key of it doesn't exist crt_path = self.test_output_dir / "proxy.crt" key_path = self.test_output_dir / "proxy.key" - generate_proxy_tls_certs("*.localtest.me", key_path, crt_path) + generate_proxy_tls_certs("*.local.neon.build", key_path, crt_path) args = [ str(self.neon_binpath / "proxy"), @@ -3569,7 +3569,7 @@ class NeonAuthBroker: external_http_port: int, auth_backend: NeonAuthBroker.ProxyV1, ): - self.domain = "apiauth.localtest.me" # resolves to 127.0.0.1 + self.domain = "apiauth.local.neon.build" # resolves to 127.0.0.1 self.host = "127.0.0.1" self.http_port = http_port self.external_http_port = external_http_port @@ -3586,7 +3586,7 @@ class NeonAuthBroker: # generate key of it doesn't exist crt_path = self.test_output_dir / "proxy.crt" key_path = self.test_output_dir / "proxy.key" - generate_proxy_tls_certs("apiauth.localtest.me", key_path, crt_path) + generate_proxy_tls_certs("apiauth.local.neon.build", key_path, crt_path) args = [ str(self.neon_binpath / "proxy"), @@ -3807,7 +3807,8 @@ class Endpoint(PgProtocol, LogUtils): env: NeonEnv, tenant_id: TenantId, pg_port: int, - http_port: int, + external_http_port: int, + internal_http_port: int, check_stop_result: bool = True, ): super().__init__(host="localhost", port=pg_port, user="cloud_admin", dbname="postgres") @@ -3817,7 +3818,8 @@ class Endpoint(PgProtocol, LogUtils): self.pgdata_dir: Path | None = None # Path to computenode PGDATA self.tenant_id = tenant_id self.pg_port = pg_port - self.http_port = http_port + self.external_http_port = external_http_port + self.internal_http_port = internal_http_port self.check_stop_result = check_stop_result # passed to endpoint create and endpoint reconfigure self.active_safekeepers: list[int] = list(map(lambda sk: sk.id, env.safekeepers)) @@ -3834,7 +3836,8 @@ class Endpoint(PgProtocol, LogUtils): self, auth_token: str | None = None, retries: Retry | None = None ) -> EndpointHttpClient: return EndpointHttpClient( - port=self.http_port, + external_port=self.external_http_port, + internal_port=self.internal_http_port, ) def create( @@ -3846,6 +3849,7 @@ class Endpoint(PgProtocol, LogUtils): config_lines: list[str] | None = None, pageserver_id: int | None = None, allow_multiple: bool = False, + update_catalog: bool = False, ) -> Self: """ Create a new Postgres endpoint. 
@@ -3866,10 +3870,12 @@ class Endpoint(PgProtocol, LogUtils): lsn=lsn, hot_standby=hot_standby, pg_port=self.pg_port, - http_port=self.http_port, + external_http_port=self.external_http_port, + internal_http_port=self.internal_http_port, pg_version=self.env.pg_version, pageserver_id=pageserver_id, allow_multiple=allow_multiple, + update_catalog=update_catalog, ) path = Path("endpoints") / self.endpoint_id / "pgdata" self.pgdata_dir = self.env.repo_dir / path @@ -4258,7 +4264,8 @@ class EndpointFactory: self.env, tenant_id=tenant_id or self.env.initial_tenant, pg_port=self.env.port_distributor.get_port(), - http_port=self.env.port_distributor.get_port(), + external_http_port=self.env.port_distributor.get_port(), + internal_http_port=self.env.port_distributor.get_port(), ) self.num_instances += 1 self.endpoints.append(ep) @@ -4283,12 +4290,14 @@ class EndpointFactory: hot_standby: bool = False, config_lines: list[str] | None = None, pageserver_id: int | None = None, + update_catalog: bool = False, ) -> Endpoint: ep = Endpoint( self.env, tenant_id=tenant_id or self.env.initial_tenant, pg_port=self.env.port_distributor.get_port(), - http_port=self.env.port_distributor.get_port(), + external_http_port=self.env.port_distributor.get_port(), + internal_http_port=self.env.port_distributor.get_port(), ) endpoint_id = endpoint_id or self.env.generate_endpoint_id() @@ -4303,6 +4312,7 @@ class EndpointFactory: hot_standby=hot_standby, config_lines=config_lines, pageserver_id=pageserver_id, + update_catalog=update_catalog, ) def stop_all(self, fail_on_error=True) -> Self: @@ -5122,12 +5132,14 @@ def wait_for_last_flush_lsn( timeline: TimelineId, pageserver_id: int | None = None, auth_token: str | None = None, + last_flush_lsn: Lsn | None = None, ) -> Lsn: """Wait for pageserver to catch up the latest flush LSN, returns the last observed lsn.""" shards = tenant_get_shards(env, tenant, pageserver_id) - last_flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + if last_flush_lsn is None: + last_flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) results = [] for tenant_shard_id, pageserver in shards: diff --git a/test_runner/fixtures/parametrize.py b/test_runner/fixtures/parametrize.py index 1acb1af23b..c33342c89e 100644 --- a/test_runner/fixtures/parametrize.py +++ b/test_runner/fixtures/parametrize.py @@ -124,5 +124,8 @@ def pytest_runtest_makereport(*args, **kwargs): allure.dynamic.parameter( "__lfc", "with-lfc" if os.getenv("USE_LFC") != "false" else "without-lfc" ) + allure.dynamic.parameter( + "__sanitizers", "enabled" if os.getenv("SANITIZERS") == "enabled" else "disabled" + ) yield diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py index d969971a35..4df2b2df2b 100644 --- a/test_runner/fixtures/remote_storage.py +++ b/test_runner/fixtures/remote_storage.py @@ -282,18 +282,35 @@ class S3Storage: def timeline_path(self, tenant_id: TenantShardId | TenantId, timeline_id: TimelineId) -> str: return f"{self.tenant_path(tenant_id)}/timelines/{timeline_id}" + def get_latest_generation_key(self, prefix: str, suffix: str, keys: list[str]) -> str: + """ + Gets the latest generation key from a list of keys. 
+ + + @param keys: A list of keys of different generations, which start with `prefix` + """ + + def parse_gen(key: str) -> int: + shortname = key.split("/")[-1] + generation_str = shortname.removeprefix(prefix).removesuffix(suffix) + try: + return int(generation_str, base=16) + except ValueError: + log.info(f"Ignoring non-matching key: {key}") + return -1 + + if len(keys) == 0: + raise IndexError("No keys found") + + return max(keys, key=parse_gen) + def get_latest_index_key(self, index_keys: list[str]) -> str: """ Gets the latest index file key. @param index_keys: A list of index keys of different generations. """ - - def parse_gen(index_key: str) -> int: - parts = index_key.split("index_part.json-") - return int(parts[-1], base=16) if len(parts) == 2 else -1 - - return max(index_keys, key=parse_gen) + key = self.get_latest_generation_key(prefix="index_part.json-", suffix="", keys=index_keys) + return key def download_index_part(self, index_key: str) -> IndexPartDump: """ @@ -306,6 +323,29 @@ class S3Storage: log.info(f"index_part.json: {body}") return IndexPartDump.from_json(json.loads(body)) + def download_tenant_manifest(self, tenant_id: TenantId) -> dict[str, Any] | None: + tenant_prefix = self.tenant_path(tenant_id) + + objects = self.client.list_objects_v2(Bucket=self.bucket_name, Prefix=f"{tenant_prefix}/")[ + "Contents" + ] + keys = [obj["Key"] for obj in objects if obj["Key"].find("tenant-manifest") != -1] + try: + manifest_key = self.get_latest_generation_key("tenant-manifest-", ".json", keys) + except IndexError: + log.info( + f"No manifest found for tenant {tenant_id}, this is normal if it didn't offload anything yet" + ) + return None + + response = self.client.get_object(Bucket=self.bucket_name, Key=manifest_key) + body = response["Body"].read().decode("utf-8") + log.info(f"Downloaded manifest {manifest_key}: {body}") + + manifest = json.loads(body) + assert isinstance(manifest, dict) + return manifest + def heatmap_key(self, tenant_id: TenantId) -> str: return f"{self.tenant_path(tenant_id)}/{TENANT_HEATMAP_FILE_NAME}" diff --git a/test_runner/fixtures/workload.py b/test_runner/fixtures/workload.py index eea0ec2b95..1947a9c3fb 100644 --- a/test_runner/fixtures/workload.py +++ b/test_runner/fixtures/workload.py @@ -53,6 +53,8 @@ class Workload: self._endpoint: Endpoint | None = None self._endpoint_opts = endpoint_opts or {} + self._configured_pageserver: int | None = None + def branch( self, timeline_id: TimelineId, @@ -92,8 +94,12 @@ class Workload: **self._endpoint_opts, ) self._endpoint.start(pageserver_id=pageserver_id) + self._configured_pageserver = pageserver_id else: - self._endpoint.reconfigure(pageserver_id=pageserver_id) + if self._configured_pageserver != pageserver_id: + self._configured_pageserver = pageserver_id + self._endpoint.reconfigure(pageserver_id=pageserver_id) + self._endpoint_config = pageserver_id connstring = self._endpoint.safe_psql( "SELECT setting FROM pg_settings WHERE name='neon.pageserver_connstring'" @@ -122,6 +128,7 @@ class Workload: def write_rows(self, n: int, pageserver_id: int | None = None, upload: bool = True): endpoint = self.endpoint(pageserver_id) + start = self.expect_rows end = start + n - 1 self.expect_rows += n diff --git a/test_runner/performance/test_ingest_logical_message.py b/test_runner/performance/test_ingest_logical_message.py index d3118eb15a..b55cb68b64 100644 --- a/test_runner/performance/test_ingest_logical_message.py +++ b/test_runner/performance/test_ingest_logical_message.py @@ -76,6 +76,9 @@ def 
test_ingest_logical_message( log.info("Waiting for Pageserver to catch up") wait_for_last_record_lsn(client, env.initial_tenant, env.initial_timeline, end_lsn) + recover_to_lsn = Lsn(endpoint.safe_psql("select pg_current_wal_lsn()")[0][0]) + endpoint.stop() + # Now that all data is ingested, delete and recreate the tenant in the pageserver. This will # reingest all the WAL from the safekeeper without any other constraints. This gives us a # baseline of how fast the pageserver can ingest this WAL in isolation. @@ -88,7 +91,13 @@ def test_ingest_logical_message( with zenbenchmark.record_duration("pageserver_recover_ingest"): log.info("Recovering WAL into pageserver") client.timeline_create(env.pg_version, env.initial_tenant, env.initial_timeline) - wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline) + wait_for_last_flush_lsn( + env, endpoint, env.initial_tenant, env.initial_timeline, last_flush_lsn=recover_to_lsn + ) + + # Check endpoint can start, i.e. we really recovered + endpoint.start() + wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline) # Emit metrics. wal_written_mb = round((end_lsn - start_lsn) / (1024 * 1024)) diff --git a/test_runner/performance/test_lazy_startup.py b/test_runner/performance/test_lazy_startup.py index 704073fe3b..3bf3ef890f 100644 --- a/test_runner/performance/test_lazy_startup.py +++ b/test_runner/performance/test_lazy_startup.py @@ -79,7 +79,9 @@ def test_lazy_startup(slru: str, neon_env_builder: NeonEnvBuilder, zenbenchmark: assert sum == 1000000 # Get metrics - metrics = requests.get(f"http://localhost:{endpoint.http_port}/metrics.json").json() + metrics = requests.get( + f"http://localhost:{endpoint.external_http_port}/metrics.json" + ).json() durations = { "wait_for_spec_ms": f"{slru}_{i}_wait_for_spec", "sync_safekeepers_ms": f"{slru}_{i}_sync_safekeepers", diff --git a/test_runner/performance/test_perf_ingest_using_pgcopydb.py b/test_runner/performance/test_perf_ingest_using_pgcopydb.py index f0a0c1f5a2..da62422fca 100644 --- a/test_runner/performance/test_perf_ingest_using_pgcopydb.py +++ b/test_runner/performance/test_perf_ingest_using_pgcopydb.py @@ -136,7 +136,7 @@ def run_command_and_log_output(command, log_file_path: Path): "LD_LIBRARY_PATH": f"{os.getenv('PGCOPYDB_LIB_PATH')}:{os.getenv('PG_16_LIB_PATH')}", "PGCOPYDB_SOURCE_PGURI": cast(str, os.getenv("BENCHMARK_INGEST_SOURCE_CONNSTR")), "PGCOPYDB_TARGET_PGURI": cast(str, os.getenv("BENCHMARK_INGEST_TARGET_CONNSTR")), - "PGOPTIONS": "-c maintenance_work_mem=8388608 -c max_parallel_maintenance_workers=7", + "PGOPTIONS": "-c idle_in_transaction_session_timeout=0 -c maintenance_work_mem=8388608 -c max_parallel_maintenance_workers=7", } # Combine the current environment with custom variables env = os.environ.copy() diff --git a/test_runner/performance/test_startup.py b/test_runner/performance/test_startup.py index d051717e92..60d8b5be30 100644 --- a/test_runner/performance/test_startup.py +++ b/test_runner/performance/test_startup.py @@ -56,7 +56,9 @@ def test_startup_simple(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenc endpoint.safe_psql("select 1;") # Get metrics - metrics = requests.get(f"http://localhost:{endpoint.http_port}/metrics.json").json() + metrics = requests.get( + f"http://localhost:{endpoint.external_http_port}/metrics.json" + ).json() durations = { "wait_for_spec_ms": f"{i}_wait_for_spec", "sync_safekeepers_ms": f"{i}_sync_safekeepers", diff --git a/test_runner/regress/test_attach_tenant_config.py 
b/test_runner/regress/test_attach_tenant_config.py index a4b9eabf8e..07600dd911 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -140,9 +140,11 @@ def test_fully_custom_config(positive_env: NeonEnv): "compaction_period": "1h", "compaction_threshold": 13, "compaction_upper_limit": 100, + "compaction_l0_first": False, + "compaction_l0_semaphore": False, "l0_flush_delay_threshold": 25, "l0_flush_stall_threshold": 42, - "l0_flush_wait_upload": True, + "l0_flush_wait_upload": False, "compaction_target_size": 1048576, "checkpoint_distance": 10000, "checkpoint_timeout": "13m", @@ -175,7 +177,7 @@ def test_fully_custom_config(positive_env: NeonEnv): "image_layer_creation_check_threshold": 1, "lsn_lease_length": "1m", "lsn_lease_length_for_ts": "5s", - "timeline_offloading": True, + "timeline_offloading": False, "wal_receiver_protocol_override": { "type": "interpreted", "args": {"format": "bincode", "compression": {"zstd": {"level": 1}}}, diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index f3347b594e..f10872590c 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -689,9 +689,7 @@ def test_pageserver_compaction_circuit_breaker(neon_env_builder: NeonEnvBuilder) env.pageserver.http_client().configure_failpoints((FAILPOINT, "return")) # Write some data to trigger compaction - workload.write_rows(1024, upload=False) - workload.write_rows(1024, upload=False) - workload.write_rows(1024, upload=False) + workload.write_rows(32768, upload=False) def assert_broken(): env.pageserver.assert_log_contains(BROKEN_LOG) diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index cdc6c0053d..823f2185e4 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -314,7 +314,10 @@ def test_forward_compatibility( def check_neon_works(env: NeonEnv, test_output_dir: Path, sql_dump_path: Path, repo_dir: Path): - ep = env.endpoints.create_start("main") + ep = env.endpoints.create("main") + ep_env = {"LD_LIBRARY_PATH": str(env.pg_distrib_dir / f"v{env.pg_version}/lib")} + ep.start(env=ep_env) + connstr = ep.connstr() pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version) @@ -363,7 +366,7 @@ def check_neon_works(env: NeonEnv, test_output_dir: Path, sql_dump_path: Path, r ) # Timeline exists again: restart the endpoint - ep.start() + ep.start(env=ep_env) pg_bin.run_capture( ["pg_dumpall", f"--dbname={connstr}", f"--file={test_output_dir / 'dump-from-wal.sql'}"] @@ -471,6 +474,14 @@ HISTORIC_DATA_SETS = [ PgVersion.V16, "https://neon-github-public-dev.s3.eu-central-1.amazonaws.com/compatibility-data-snapshots/2024-07-18-pgv16.tar.zst", ), + # This dataset was created on a pageserver running modern code at the time of capture, but configured with no generation. 
This + # is our regression test that we can load data written without generations in layer file names & indices + HistoricDataSet( + "2025-02-07-nogenerations", + TenantId("e1411ca6562d6ff62419f693a5695d67"), + PgVersion.V17, + "https://neon-github-public-dev.s3.eu-central-1.amazonaws.com/compatibility-data-snapshots/2025-02-07-pgv17-nogenerations.tar.zst", + ), ] diff --git a/test_runner/regress/test_compute_catalog.py b/test_runner/regress/test_compute_catalog.py index 50a922a616..3a08671bbf 100644 --- a/test_runner/regress/test_compute_catalog.py +++ b/test_runner/regress/test_compute_catalog.py @@ -82,7 +82,7 @@ def test_compute_catalog(neon_simple_env: NeonEnv): ddl = client.database_schema(database=test_db["name"]) # Check that it looks like a valid PostgreSQL dump - assert "-- PostgreSQL database dump" in ddl + assert "-- PostgreSQL database dump complete" in ddl # Check that it doesn't contain health_check and migration traces. # They are only created in system `postgres` database, so by checking diff --git a/test_runner/regress/test_compute_reconfigure.py b/test_runner/regress/test_compute_reconfigure.py new file mode 100644 index 0000000000..6619548811 --- /dev/null +++ b/test_runner/regress/test_compute_reconfigure.py @@ -0,0 +1,62 @@ +from __future__ import annotations + +from fixtures.neon_fixtures import NeonEnv +from fixtures.utils import wait_until + + +def test_compute_reconfigure(neon_simple_env: NeonEnv): + """ + Test that we can change postgresql.conf settings even if + skip_pg_catalog_updates=True is set. + """ + env = neon_simple_env + + TEST_LOG_LINE_PREFIX = "%m [%p] [test_compute_reconfigure]: " + + endpoint = env.endpoints.create_start("main") + + # Check that the log line prefix is not set + # or different from TEST_LOG_LINE_PREFIX + with endpoint.cursor() as cursor: + cursor.execute("SHOW log_line_prefix;") + row = cursor.fetchone() + assert row is not None + assert row[0] != TEST_LOG_LINE_PREFIX + + endpoint.respec_deep( + **{ + "skip_pg_catalog_updates": True, + "cluster": { + "settings": [ + { + "name": "log_line_prefix", + "vartype": "string", + "value": TEST_LOG_LINE_PREFIX, + } + ] + }, + } + ) + endpoint.reconfigure() + + # Check that in logs we see that it was actually reconfigured, + # not restarted or something else. + endpoint.log_contains("INFO request{method=POST uri=/configure") + + # In /configure we only send SIGHUP at the end, so in theory + # it doesn't necessarily mean that Postgres already reloaded + # the new config; and it may race in some envs. + # So we wait until we see the log line that the config was changed. 
+ def check_logs(): + endpoint.log_contains( + f'[test_compute_reconfigure]: LOG: parameter "log_line_prefix" changed to "{TEST_LOG_LINE_PREFIX}"' + ) + + wait_until(check_logs) + + # Check that the log line prefix is set + with endpoint.cursor() as cursor: + cursor.execute("SHOW log_line_prefix;") + row = cursor.fetchone() + assert row is not None + assert row[0] == TEST_LOG_LINE_PREFIX diff --git a/test_runner/regress/test_layers_from_future.py b/test_runner/regress/test_layers_from_future.py index 5e06a1d47f..3ac4ed1a3e 100644 --- a/test_runner/regress/test_layers_from_future.py +++ b/test_runner/regress/test_layers_from_future.py @@ -20,6 +20,9 @@ from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind from fixtures.utils import query_scalar, wait_until +@pytest.mark.skip( + reason="We won't create future layers any more after https://github.com/neondatabase/neon/pull/10548" +) @pytest.mark.parametrize( "attach_mode", ["default_generation", "same_generation"], @@ -172,7 +175,7 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder, attach_mode: str): # force removal of layers from the future tenant_conf = ps_http.tenant_config(tenant_id) generation_before_detach = get_generation_number() - env.pageserver.tenant_detach(tenant_id) + env.pageserver.http_client().tenant_detach(tenant_id) failpoint_deletion_queue = "deletion-queue-before-execute-pause" ps_http.configure_failpoints((failpoint_deletion_queue, "pause")) diff --git a/test_runner/regress/test_neon_local_cli.py b/test_runner/regress/test_neon_local_cli.py index 80e26d9432..8d9aab6848 100644 --- a/test_runner/regress/test_neon_local_cli.py +++ b/test_runner/regress/test_neon_local_cli.py @@ -17,11 +17,13 @@ def test_neon_cli_basics(neon_env_builder: NeonEnvBuilder, port_distributor: Por main_branch_name = "main" pg_port = port_distributor.get_port() - http_port = port_distributor.get_port() + external_http_port = port_distributor.get_port() + internal_http_port = port_distributor.get_port() env.neon_cli.endpoint_create( main_branch_name, pg_port, - http_port, + external_http_port, + internal_http_port, endpoint_id="ep-basic-main", tenant_id=env.initial_tenant, pg_version=env.pg_version, @@ -35,11 +37,13 @@ def test_neon_cli_basics(neon_env_builder: NeonEnvBuilder, port_distributor: Por new_branch_name=branch_name, ) pg_port = port_distributor.get_port() - http_port = port_distributor.get_port() + external_http_port = port_distributor.get_port() + internal_http_port = port_distributor.get_port() env.neon_cli.endpoint_create( branch_name, pg_port, - http_port, + external_http_port, + internal_http_port, endpoint_id=f"ep-{branch_name}", tenant_id=env.initial_tenant, pg_version=env.pg_version, @@ -59,23 +63,27 @@ def test_neon_two_primary_endpoints_fail( branch_name = "main" pg_port = port_distributor.get_port() - http_port = port_distributor.get_port() + external_http_port = port_distributor.get_port() + internal_http_port = port_distributor.get_port() env.neon_cli.endpoint_create( branch_name, pg_port, - http_port, + external_http_port, + internal_http_port, endpoint_id="ep1", tenant_id=env.initial_tenant, pg_version=env.pg_version, ) pg_port = port_distributor.get_port() - http_port = port_distributor.get_port() + external_http_port = port_distributor.get_port() + internal_http_port = port_distributor.get_port() # ep1 is not running so create will succeed env.neon_cli.endpoint_create( branch_name, pg_port, - http_port, + external_http_port, + internal_http_port, endpoint_id="ep2", tenant_id=env.initial_tenant, 
pg_version=env.pg_version, diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 7e5bb45242..fa1cd61206 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -12,7 +12,6 @@ of the pageserver are: from __future__ import annotations import os -import re import time from enum import StrEnum @@ -29,7 +28,6 @@ from fixtures.pageserver.common_types import parse_layer_file_name from fixtures.pageserver.http import PageserverApiException from fixtures.pageserver.utils import ( assert_tenant_state, - list_prefix, wait_for_last_record_lsn, wait_for_upload, ) @@ -124,109 +122,6 @@ def assert_deletion_queue(ps_http, size_fn) -> None: assert size_fn(v) is True -def test_generations_upgrade(neon_env_builder: NeonEnvBuilder): - """ - Validate behavior when a pageserver is run without generation support enabled, - then started again after activating it: - - Before upgrade, no objects should have generation suffixes - - After upgrade, the bucket should contain a mixture. - - In both cases, postgres I/O should work. - """ - neon_env_builder.enable_pageserver_remote_storage( - RemoteStorageKind.MOCK_S3, - ) - - env = neon_env_builder.init_configs() - env.broker.start() - for sk in env.safekeepers: - sk.start() - env.storage_controller.start() - - # We will start a pageserver with no control_plane_api set, so it won't be able to self-register - env.storage_controller.node_register(env.pageserver) - - def remove_control_plane_api_field(config): - return config.pop("control_plane_api") - - control_plane_api = env.pageserver.edit_config_toml(remove_control_plane_api_field) - env.pageserver.start() - env.storage_controller.node_configure(env.pageserver.id, {"availability": "Active"}) - - env.create_tenant( - tenant_id=env.initial_tenant, conf=TENANT_CONF, timeline_id=env.initial_timeline - ) - - generate_uploads_and_deletions(env, pageserver=env.pageserver) - - def parse_generation_suffix(key): - m = re.match(".+-([0-9a-zA-Z]{8})$", key) - if m is None: - return None - else: - log.info(f"match: {m}") - log.info(f"group: {m.group(1)}") - return int(m.group(1), 16) - - assert neon_env_builder.pageserver_remote_storage is not None - pre_upgrade_keys = list( - [ - o["Key"] - for o in list_prefix(neon_env_builder.pageserver_remote_storage, delimiter="")[ - "Contents" - ] - ] - ) - for key in pre_upgrade_keys: - assert parse_generation_suffix(key) is None - - env.pageserver.stop() - # Starting without the override that disabled control_plane_api - env.pageserver.patch_config_toml_nonrecursive( - { - "control_plane_api": control_plane_api, - } - ) - env.pageserver.start() - - generate_uploads_and_deletions(env, pageserver=env.pageserver, init=False) - - legacy_objects: list[str] = [] - suffixed_objects = [] - post_upgrade_keys = list( - [ - o["Key"] - for o in list_prefix(neon_env_builder.pageserver_remote_storage, delimiter="")[ - "Contents" - ] - ] - ) - for key in post_upgrade_keys: - log.info(f"post-upgrade key: {key}") - if parse_generation_suffix(key) is not None: - suffixed_objects.append(key) - else: - legacy_objects.append(key) - - # Bucket now contains a mixture of suffixed and non-suffixed objects - assert len(suffixed_objects) > 0 - assert len(legacy_objects) > 0 - - # Flush through deletions to get a clean state for scrub: we are implicitly validating - # that our generations-enabled pageserver was able to do deletions of layers - # from earlier which don't have a 
generation. - env.pageserver.http_client().deletion_queue_flush(execute=True) - - assert get_deletion_queue_unexpected_errors(env.pageserver.http_client()) == 0 - - # Having written a mixture of generation-aware and legacy index_part.json, - # ensure the scrubber handles the situation as expected. - healthy, metadata_summary = env.storage_scrubber.scan_metadata() - assert metadata_summary["tenant_count"] == 1 # Scrubber should have seen our timeline - assert metadata_summary["timeline_count"] == 1 - assert metadata_summary["timeline_shard_count"] == 1 - assert healthy - - def test_deferred_deletion(neon_env_builder: NeonEnvBuilder): neon_env_builder.enable_pageserver_remote_storage( RemoteStorageKind.MOCK_S3, diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py index 835ccbd5d4..21cb780c06 100644 --- a/test_runner/regress/test_pageserver_restart.py +++ b/test_runner/regress/test_pageserver_restart.py @@ -3,6 +3,7 @@ from __future__ import annotations import random from contextlib import closing +import psycopg2.errors as pgerr import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder @@ -226,3 +227,43 @@ def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder, shard_count: int | N # so instead, do a fast shutdown for this one test. # See https://github.com/neondatabase/neon/issues/8709 env.stop(immediate=True) + + +def test_pageserver_lost_and_transaction_aborted(neon_env_builder: NeonEnvBuilder): + """ + If the pageserver is unavailable during a transaction abort and the target relation is + not present in the cache, we abort the transaction in the ABORT state, which triggers a SIGABRT. + This is _expected_ behaviour. + """ + env = neon_env_builder.init_start() + endpoint = env.endpoints.create_start("main", config_lines=["neon.relsize_hash_size=0"]) + with closing(endpoint.connect()) as conn, conn.cursor() as cur: + cur.execute("CREATE DATABASE test") + with ( + pytest.raises((pgerr.InterfaceError, pgerr.InternalError)), + endpoint.connect(dbname="test") as conn, + conn.cursor() as cur, + ): + cur.execute("create table t(b box)") + env.pageserver.stop() + cur.execute("create index ti on t using gist(b)") + + +def test_pageserver_lost_and_transaction_committed(neon_env_builder: NeonEnvBuilder): + """ + If the pageserver is unavailable during a transaction commit and the target relation is + not present in the cache, we abort the transaction in the COMMIT state, which triggers a SIGABRT. + This is _expected_ behaviour. + """ + env = neon_env_builder.init_start() + endpoint = env.endpoints.create_start("main", config_lines=["neon.relsize_hash_size=0"]) + with closing(endpoint.connect()) as conn, conn.cursor() as cur: + cur.execute("CREATE DATABASE test") + with ( + pytest.raises((pgerr.InterfaceError, pgerr.InternalError)), + endpoint.connect(dbname="test") as conn, + conn.cursor() as cur, + ): + cur.execute("create table t(t boolean)") + env.pageserver.stop() + cur.execute("drop table t") diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index 2877f14e0e..c5ae669dce 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -120,7 +120,7 @@ def post_checks(env: NeonEnv, test_output_dir: Path, db_name: str, endpoint: End # Run the main PostgreSQL regression tests, in src/test/regress. 
# -@pytest.mark.timeout(900) # Contains many sub-tests, is slow in debug builds +@pytest.mark.timeout(3000) # Contains many sub-tests, is slow in debug builds @pytest.mark.parametrize("shard_count", [None, 4]) def test_pg_regress( neon_env_builder: NeonEnvBuilder, @@ -194,7 +194,7 @@ def test_pg_regress( # Run the PostgreSQL "isolation" tests, in src/test/isolation. # -@pytest.mark.timeout(600) # Contains many sub-tests, is slow in debug builds +@pytest.mark.timeout(1500) # Contains many sub-tests, is slow in debug builds @pytest.mark.parametrize("shard_count", [None, 4]) def test_isolation( neon_env_builder: NeonEnvBuilder, @@ -222,6 +222,8 @@ def test_isolation( "max_prepared_transactions=100", # Enable the test mode, so that we don't need to patch the test cases. "neon.regress_test_mode = true", + # Stack size should be increased for tests to pass with asan. + "max_stack_depth = 4MB", ], ) endpoint.safe_psql(f"CREATE DATABASE {DBNAME}") @@ -417,7 +419,7 @@ def test_tx_abort_with_many_relations( try: # Rollback phase should be fast: this is one WAL record that we should process efficiently fut = exec.submit(rollback_and_wait) - fut.result(timeout=5) + fut.result(timeout=15) except: exec.shutdown(wait=False, cancel_futures=True) raise diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py index d8df2efc78..3c7fd0b897 100644 --- a/test_runner/regress/test_proxy.py +++ b/test_runner/regress/test_proxy.py @@ -57,7 +57,7 @@ def test_proxy_select_1(static_proxy: NeonProxy): assert out[0][0] == 1 # with SNI - out = static_proxy.safe_psql("select 42", host="generic-project-name.localtest.me") + out = static_proxy.safe_psql("select 42", host="generic-project-name.local.neon.build") assert out[0][0] == 42 @@ -234,7 +234,7 @@ def test_sql_over_http_serverless_driver(static_proxy: NeonProxy): connstr = f"postgresql://http:http@{static_proxy.domain}:{static_proxy.proxy_port}/postgres" response = requests.post( - f"https://api.localtest.me:{static_proxy.external_http_port}/sql", + f"https://api.local.neon.build:{static_proxy.external_http_port}/sql", data=json.dumps({"query": "select 42 as answer", "params": []}), headers={"Content-Type": "application/sql", "Neon-Connection-String": connstr}, verify=str(static_proxy.test_output_dir / "proxy.crt"), diff --git a/test_runner/regress/test_proxy_allowed_ips.py b/test_runner/regress/test_proxy_allowed_ips.py index 902da1942e..c59da8c6b0 100644 --- a/test_runner/regress/test_proxy_allowed_ips.py +++ b/test_runner/regress/test_proxy_allowed_ips.py @@ -35,7 +35,7 @@ async def test_proxy_psql_allowed_ips(static_proxy: NeonProxy, vanilla_pg: Vanil check_cannot_connect(query="select 1", sslsni=0, options="endpoint=private-project") # with SNI - check_cannot_connect(query="select 1", host="private-project.localtest.me") + check_cannot_connect(query="select 1", host="private-project.local.neon.build") # no SNI, deprecated `options=project` syntax (before we had several endpoint in project) out = static_proxy.safe_psql(query="select 1", sslsni=0, options="project=generic-project") @@ -46,7 +46,7 @@ async def test_proxy_psql_allowed_ips(static_proxy: NeonProxy, vanilla_pg: Vanil assert out[0][0] == 1 # with SNI - out = static_proxy.safe_psql(query="select 1", host="generic-project.localtest.me") + out = static_proxy.safe_psql(query="select 1", host="generic-project.local.neon.build") assert out[0][0] == 1 diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 86a6b7428b..8910873690 100644 --- 
a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -91,7 +91,7 @@ def test_sharding_smoke( workload.init() sizes_before = get_sizes() - workload.write_rows(256) + workload.write_rows(65536) # Test that we can read data back from a sharded tenant workload.validate() @@ -1368,6 +1368,7 @@ def test_sharding_split_failures( workload = Workload(env, tenant_id, timeline_id) workload.init() workload.write_rows(100) + compute_reconfigure_listener.register_workload(workload) # Put the environment into a failing state (exact meaning depends on `failure`) failure.apply(env) @@ -1546,6 +1547,9 @@ def test_sharding_backpressure(neon_env_builder: NeonEnvBuilder): # Tip: set to 100MB to make the test fail "max_replication_write_lag=1MB", ], + # We need `neon` extension for calling backpressure functions, + # this flag instructs `compute_ctl` to pre-install it. + "update_catalog": True, }, ) workload.init() @@ -1810,3 +1814,14 @@ def test_sharding_gc( shard_gc_cutoff_lsn = Lsn(shard_index["metadata_bytes"]["latest_gc_cutoff_lsn"]) log.info(f"Shard {shard_number} cutoff LSN: {shard_gc_cutoff_lsn}") assert shard_gc_cutoff_lsn == shard_0_gc_cutoff_lsn + + for ps in env.pageservers: + # This is not okay, but it's not a scrubber bug: it's a pageserver issue that is exposed by + # the specific pattern of aggressive checkpointing+image layer generation + GC that this test does. + # TODO: remove when https://github.com/neondatabase/neon/issues/10720 is fixed + ps.allowed_errors.extend( + [ + ".*could not find data for key 020000000000000000000000000000000000.*", + ".*could not ingest record.*", + ] + ) diff --git a/test_runner/regress/test_sni_router.py b/test_runner/regress/test_sni_router.py index 2a26fef59a..3487542d6e 100644 --- a/test_runner/regress/test_sni_router.py +++ b/test_runner/regress/test_sni_router.py @@ -116,7 +116,7 @@ def test_pg_sni_router( test_output_dir: Path, ): generate_tls_cert( - "endpoint.namespace.localtest.me", + "endpoint.namespace.local.neon.build", test_output_dir / "router.crt", test_output_dir / "router.key", ) @@ -130,7 +130,7 @@ def test_pg_sni_router( with PgSniRouter( neon_binpath=neon_binpath, port=router_port, - destination="localtest.me", + destination="local.neon.build", tls_cert=test_output_dir / "router.crt", tls_key=test_output_dir / "router.key", test_output_dir=test_output_dir, @@ -141,7 +141,7 @@ def test_pg_sni_router( "select 1", dbname="postgres", sslmode="require", - host=f"endpoint--namespace--{pg_port}.localtest.me", + host=f"endpoint--namespace--{pg_port}.local.neon.build", hostaddr="127.0.0.1", ) assert out[0][0] == 1 diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 11a4d09202..2750826aec 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -2139,12 +2139,18 @@ def test_tenant_import(neon_env_builder: NeonEnvBuilder, shard_count, remote_sto workload.validate() -def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder): +@pytest.mark.parametrize("num_azs", [1, 2]) +def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder, num_azs: int): """ Graceful reststart of storage controller clusters use the drain and fill hooks in order to migrate attachments away from pageservers before restarting. In practice, Ansible will drive this process. 
+ + Test is parametrized on the number of AZs to exercise the AZ-driven behavior + of reliably moving shards back to their home AZ, and the behavior for AZ-agnostic + tenants where we fill based on a target shard count. """ + neon_env_builder.num_azs = num_azs neon_env_builder.num_pageservers = 2 env = neon_env_builder.init_configs() env.start() @@ -2174,8 +2180,15 @@ def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder): min_shard_count = min(shard_counts.values()) max_shard_count = max(shard_counts.values()) - flake_factor = 5 / 100 - assert max_shard_count - min_shard_count <= int(total_shards * flake_factor) + if num_azs == 1: + # AZ-agnostic case: we expect all nodes to have the same number of shards, within some bound + flake_factor = 5 / 100 + assert max_shard_count - min_shard_count <= int(total_shards * flake_factor) + else: + # AZ-driven case: we expect tenants to have been round-robin allocated to AZs, + # and after the restart they should all be back in their home AZ, so difference + # should be at most a single shard's tenants + assert max_shard_count - min_shard_count <= shard_count_per_tenant # Perform a graceful rolling restart for ps in env.pageservers: diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py index 0f4e5688a9..b8253fb125 100644 --- a/test_runner/regress/test_storage_scrubber.py +++ b/test_runner/regress/test_storage_scrubber.py @@ -312,6 +312,17 @@ def test_scrubber_physical_gc_ancestors(neon_env_builder: NeonEnvBuilder, shard_ drop_local_state(env, tenant_id) workload.validate() + for ps in env.pageservers: + # This is not okay, but it's not a scrubber bug: it's a pageserver issue that is exposed by + # the specific pattern of aggressive checkpointing+image layer generation + GC that this test does. + # TODO: remove when https://github.com/neondatabase/neon/issues/10720 is fixed + ps.allowed_errors.extend( + [ + ".*could not find data for key 020000000000000000000000000000000000.*", + ".*could not ingest record.*", + ] + ) + def test_scrubber_physical_gc_timeline_deletion(neon_env_builder: NeonEnvBuilder): """ diff --git a/test_runner/regress/test_subscriber_restart.py b/test_runner/regress/test_subscriber_restart.py index 7d4f66d044..8ad7282ea2 100644 --- a/test_runner/regress/test_subscriber_restart.py +++ b/test_runner/regress/test_subscriber_restart.py @@ -3,12 +3,14 @@ from __future__ import annotations import threading import time +import pytest from fixtures.neon_fixtures import NeonEnv from fixtures.utils import wait_until # This test checks of logical replication subscriber is able to correctly restart replication without receiving duplicates. 
# It requires tracking information about replication origins at page server side +@pytest.mark.timeout(900) # This test is slow with sanitizers enabled, especially on ARM def test_subscriber_restart(neon_simple_env: NeonEnv): env = neon_simple_env env.create_branch("publisher") diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py index 306e971657..2706ddf2f0 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -139,9 +139,9 @@ def test_timeline_archive(neon_env_builder: NeonEnvBuilder, shard_count: int): @pytest.mark.parametrize("manual_offload", [False, True]) def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: bool): - if not manual_offload: - # (automatic) timeline offloading defaults to false for now - neon_env_builder.pageserver_config_override = "timeline_offloading = true" + if manual_offload: + # (automatic) timeline offloading defaults to true + neon_env_builder.pageserver_config_override = "timeline_offloading = false" env = neon_env_builder.init_start() ps_http = env.pageserver.http_client() @@ -396,8 +396,6 @@ def test_timeline_archival_chaos(neon_env_builder: NeonEnvBuilder): with tenant migrations and timeline deletions. """ - # Offloading is off by default at time of writing: remove this line when it's on by default - neon_env_builder.pageserver_config_override = "timeline_offloading = true" neon_env_builder.storage_controller_config = {"heartbeat_interval": "100msec"} neon_env_builder.enable_pageserver_remote_storage(s3_storage()) @@ -554,8 +552,33 @@ def test_timeline_archival_chaos(neon_env_builder: NeonEnvBuilder): log.info(f"Timeline {state.timeline_id} is still active") shutdown.wait(0.5) elif state.timeline_id in offloaded_ids: - log.info(f"Timeline {state.timeline_id} is now offloaded") - state.offloaded = True + log.info(f"Timeline {state.timeline_id} is now offloaded in memory") + + # Hack: when we see something offloaded in the API, it doesn't guarantee that the offload + # is persistent (it is marked offloaded first, then that is persisted to the tenant manifest). + # So we wait until we see the manifest update before considering it offloaded, that way + # subsequent checks that it doesn't revert to active on a restart will pass reliably. 
+ time.sleep(0.1) + assert isinstance(env.pageserver_remote_storage, S3Storage) + manifest = env.pageserver_remote_storage.download_tenant_manifest( + tenant_id + ) + if manifest is None: + log.info( + f"Timeline {state.timeline_id} is not yet offloaded persistently (no manifest)" + ) + elif str(state.timeline_id) in [ + t["timeline_id"] for t in manifest["offloaded_timelines"] + ]: + log.info( + f"Timeline {state.timeline_id} is now offloaded persistently" + ) + state.offloaded = True + else: + log.info( + f"Timeline {state.timeline_id} is not yet offloaded persistently (manifest: {manifest})" + ) + break else: # Timeline is neither offloaded nor active, this is unexpected: the pageserver @@ -969,8 +992,6 @@ def test_timeline_offload_race_unarchive( Ensure that unarchive and timeline offload don't race each other """ # Regression test for issue https://github.com/neondatabase/neon/issues/10220 - # (automatic) timeline offloading defaults to false for now - neon_env_builder.pageserver_config_override = "timeline_offloading = true" failpoint = "before-timeline-auto-offload" diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py index b32b028fa1..936c774657 100644 --- a/test_runner/regress/test_wal_acceptor_async.py +++ b/test_runner/regress/test_wal_acceptor_async.py @@ -268,7 +268,8 @@ def endpoint_create_start( env, tenant_id=env.initial_tenant, pg_port=env.port_distributor.get_port(), - http_port=env.port_distributor.get_port(), + external_http_port=env.port_distributor.get_port(), + internal_http_port=env.port_distributor.get_port(), # In these tests compute has high probability of terminating on its own # before our stop() due to lost consensus leadership. check_stop_result=False, diff --git a/test_runner/websocket_tunnel.py b/test_runner/websocket_tunnel.py index facdb19140..069852468d 100755 --- a/test_runner/websocket_tunnel.py +++ b/test_runner/websocket_tunnel.py @@ -13,12 +13,12 @@ # postgres -D data -p3000 # # ## Launch proxy with WSS enabled: -# openssl req -new -x509 -days 365 -nodes -text -out server.crt -keyout server.key -subj '/CN=*.neon.localtest.me' +# openssl req -new -x509 -days 365 -nodes -text -out server.crt -keyout server.key -subj '/CN=*.local.neon.build' # ./target/debug/proxy --wss 127.0.0.1:40433 --http 127.0.0.1:28080 --mgmt 127.0.0.1:9099 --proxy 127.0.0.1:4433 --tls-key server.key --tls-cert server.crt --auth-backend postgres # # ## Launch the tunnel: # -# poetry run ./test_runner/websocket_tunnel.py --ws-port 40433 --ws-url "wss://ep-test.neon.localtest.me" +# poetry run ./test_runner/websocket_tunnel.py --ws-port 40433 --ws-url "wss://ep-test.local.neon.build" # # ## Now you can connect with psql: # psql "postgresql://heikki@localhost:40433/postgres" diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 86d9ea96eb..13cf5d06c9 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 86d9ea96ebb9088eac62f57f1f5ace68e70e0d1c +Subproject commit 13cf5d06c98a8e9b0590ce6cdfd193a08d0a7792 diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index 8dfd5a7030..4c45d78ad5 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit 8dfd5a7030d3e8a98b60265ebe045788892ac7f3 +Subproject commit 4c45d78ad587e4bcb4a5a7ef6931b88c6a3d575d diff --git a/vendor/revisions.json b/vendor/revisions.json index efddaef46a..5f60e1d690 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,11 +1,11 @@ { "v17": [ "17.2", - 
"8dfd5a7030d3e8a98b60265ebe045788892ac7f3" + "4c45d78ad587e4bcb4a5a7ef6931b88c6a3d575d" ], "v16": [ "16.6", - "86d9ea96ebb9088eac62f57f1f5ace68e70e0d1c" + "13cf5d06c98a8e9b0590ce6cdfd193a08d0a7792" ], "v15": [ "15.10", diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 2c65401154..1b7c376560 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -42,7 +42,7 @@ half = { version = "2", default-features = false, features = ["num-traits"] } hashbrown = { version = "0.14", features = ["raw"] } hex = { version = "0.4", features = ["serde"] } hmac = { version = "0.12", default-features = false, features = ["reset"] } -hyper-582f2526e08bb6a0 = { package = "hyper", version = "0.14", features = ["full"] } +hyper-582f2526e08bb6a0 = { package = "hyper", version = "0.14", features = ["client", "http1", "http2", "runtime", "server", "stream"] } hyper-dff4ba8e3ae991db = { package = "hyper", version = "1", features = ["full"] } hyper-util = { version = "0.1", features = ["client-legacy", "http1", "http2", "server", "service"] } indexmap-dff4ba8e3ae991db = { package = "indexmap", version = "1", default-features = false, features = ["std"] } @@ -94,6 +94,7 @@ tracing = { version = "0.1", features = ["log"] } tracing-core = { version = "0.1" } tracing-log = { version = "0.2" } url = { version = "2", features = ["serde"] } +uuid = { version = "1", features = ["serde", "v4", "v7"] } zerocopy = { version = "0.7", features = ["derive", "simd"] } zeroize = { version = "1", features = ["derive", "serde"] } zstd = { version = "0.13" }